#' Prepare committee candidates for record linkage
#'
#' Opens the officers, report list, committees and addresses datasets with
#' `out_open_dataset_db()`, keeps officers of type `"Candidate"` that appear
#' on a listed report, patches missing addresses from the committees table,
#' normalises names, resolves addresses and splits them into components.
#' Candidates whose name contains a quoted nickname (for example
#' `"ROLLANDE \"ROLIE\" SAMPSON"`) get an extra row using the nickname.
#'
#' @param path_data_prep_officers Location of the officers dataset.
#' @param path_out_report_list Location of the report list dataset.
#' @param path_out_committees Location of the committees dataset.
#' @param path_addresses Location of the address lookup dataset.
#' @return A local data frame with columns `sboe_id`, `name_clean`,
#'   `address_raw`, `street`, `city`, `state` and `postal_code`.
prepare_candidates_for_matching <- function(
  path_data_prep_officers,
  path_out_report_list,
  path_out_committees,
  path_addresses = "data-out/addresses"
) {
  lg_info_target(lg_get_logger())

  report_list <- out_open_dataset_db(path_out_report_list)
  addresses <- out_open_dataset_db(path_addresses)
  committees <- out_open_dataset_db(path_out_committees)
  candidates_db <- out_open_dataset_db(path_data_prep_officers)

  candidates <- candidates_db |>
    semi_join(report_list, by = "report_id") |>
    filter(type == "Candidate") |>
    # Fill in missing officer addresses with the committee's address.
    rows_patch(
      committees |> select(sboe_id, address = address_lookup),
      by = "sboe_id",
      unmatched = "ignore"
    ) |>
    mutate(
      name_clean = toupper(name),
      # Drop campaign suffixes such as "... FOR SENATE".
      name_clean = REGEXP_REPLACE(name_clean, " FOR .+$", "", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g")
    ) |>
    distinct(sboe_id, name_clean, address) |>
    rename(address_raw = address) |>
    left_join(
      # Select by name rather than position: these are the two columns the
      # join key and the rename below depend on.
      addresses |> select(address_lookup, address_resolved),
      by = join_by(address_raw == address_lookup)
    ) |>
    rename(address = address_resolved)

  # Add new rows with aliases for people like `"ROLLANDE \"ROLIE\" SAMPSON"`
  candidates_aliases <- candidates |>
    filter(grepl('"[A-Z]+"', name_clean)) |>
    mutate(
      # Replace everything up to the quoted nickname with the nickname itself.
      name_clean = REGEXP_REPLACE(
        name_clean, '.+? "([A-Z]+)"(.+)$', "\\1 \\2", "g"
      ),
      name_clean = REGEXP_REPLACE(name_clean, "[^A-Z ]", "", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "^\\s+|\\s+$", "", "g")
    ) |>
    collect()

  candidates <- candidates |>
    mutate(
      name_clean = REGEXP_REPLACE(name_clean, "[^A-Z ]", "", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "^\\s+|\\s+$", "", "g")
    ) |>
    collect()

  candidates |>
    bind_rows(candidates_aliases) |>
    filter(!is.na(name_clean)) |>
    tidyr::extract(
      address,
      c("street", "city", "state", "postal_code"),
      "(.+), (.+), ([A-Z]{2}), (\\d{5})"
    ) |>
    mutate(
      # A street without any digit is treated as unusable for matching.
      street = if_else(!grepl("\\d", street), NA_character_, street)
    )
}

#' Prepare the candidate listing for record linkage
#'
#' Builds one row per candidate and name variant (full name, name with middle
#' initial, first and last name, name on ballot, and the parenthesised
#' "also known as" form of the name on ballot), then attaches contact columns.
#'
#' @param candidate_listing A list with elements `cl_candidates`,
#'   `cl_name_on_ballot` and `cl_contact`.
#' @return A data frame of distinct rows with `candidate_id`, `name_clean`
#'   and the contact columns `street` through `zip_code`.
prepare_candidate_listing_for_matching <- function(candidate_listing) {
  lg_info_target(lg_get_logger())

  # NOTE(review): paste() renders NA components as the literal text "NA";
  # confirm the name columns use "" rather than NA for missing parts.
  candidate_name <- candidate_listing$cl_candidates |>
    mutate(
      name_full = paste(first_name, middle_name, last_name, name_suffix_lbl),
      name_mi = paste(
        first_name, substr(middle_name, 1, 1), last_name, name_suffix_lbl
      ),
      name_first_last = paste(first_name, last_name, name_suffix_lbl)
    ) |>
    select(candidate_id, name_full, name_mi, name_first_last) |>
    tidyr::pivot_longer(-candidate_id, values_to = "name_clean") |>
    select(candidate_id, name_clean)

  candidate_alias <- candidate_listing$cl_name_on_ballot |>
    distinct(candidate_id, name_clean = name_on_ballot) |>
    mutate(
      name_clean = toupper(name_clean),
      name_clean = gsub("[,.]", "", name_clean)
    )

  # Names such as "FIRST (NICK) LAST" also yield "NICK LAST".
  candidate_aka <- candidate_alias |>
    filter(grepl("[(].+[)]", name_clean)) |>
    mutate(name_clean = sub(".+? \\((.+?)\\) (.+)$", "\\1 \\2", name_clean))

  bind_rows(candidate_name, candidate_alias, candidate_aka) |>
    mutate(name_clean = stringr::str_squish(name_clean)) |>
    left_join(
      # NOTE(review): column 1 of cl_contact is presumably candidate_id, since
      # the join below keys on it; confirm and select it by name.
      candidate_listing$cl_contact |> select(1, street:zip_code),
      by = "candidate_id",
      relationship = "many-to-many"
    ) |>
    distinct()
}

#' Link candidates to the candidate listing with fastLink
#'
#' Calls `fastLink::fastLink()` on `name_clean`, `street` and `city`, using
#' string-distance and partial matching (method `"dl"`) for `name_clean` and
#' `street`, with a match threshold of 0.9.
#'
#' @param candidates_for_matching Candidates table to link.
#' @param candidate_listing_for_matching Candidate listing table to link to.
#' @return The object returned by `fastLink::fastLink()`.
fastlink_candidates <- function(
  candidates_for_matching,
  candidate_listing_for_matching
) {
  lg_info_target(lg_get_logger())

  fastLink::fastLink(
    candidates_for_matching,
    candidate_listing_for_matching,
    varnames = c("name_clean", "street", "city"),
    stringdist.match = c("name_clean", "street"),
    partial.match = c("name_clean", "street"),
    stringdist.method = "dl",
    threshold.match = 0.9
  )
}

#' Extract matched id pairs from a fastLink result
#'
#' @param candidates_for_matching Candidates table used for linking.
#' @param candidate_listing_for_matching Listing table used for linking.
#' @param candidates_linked Linkage result passed to `fastLink::getMatches()`.
#' @return A tibble of distinct `sboe_id` / `candidate_id` pairs.
fastlink_match_candidates <- function(
  candidates_for_matching,
  candidate_listing_for_matching,
  candidates_linked
) {
  lg_info_target(lg_get_logger())

  matches <- fastLink::getMatches(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked,
    threshold.match = 0.9
  )

  matches <- as_tibble(matches)
  distinct(matches, sboe_id, candidate_id)
}

#' Rule-based candidate matching with a fuzzy-name ambiguity report
#'
#' Matches candidates to the listing in three deterministic passes (exact
#' name + street + city, unambiguous street + city, unambiguous name), then
#' fuzzy-joins the still-unmatched candidates on `name_clean`.
#'
#' @param candidates_for_matching Candidates table with `sboe_id`,
#'   `name_clean`, `street` and `city`.
#' @param candidate_listing_for_matching Listing table with `candidate_id`,
#'   `name_clean`, `street` and `city`.
#' @param candidates_linked Not used by this function.
#' @return The fuzzy matches whose `sboe_id` did not resolve to exactly one
#'   `name_on_ballot`, with columns `sboe_id`, `name_on_ballot` and `n_names`,
#'   ordered by descending `n_names` and then `sboe_id`.
candidates_match <- function(
  candidates_for_matching,
  candidate_listing_for_matching,
  candidates_linked
) {
  # First, direct matches
  candidates_matched_1 <- candidates_for_matching |>
    inner_join(
      candidate_listing_for_matching |>
        select(candidate_id, name_clean, street, city),
      by = join_by(
        name_clean == name_clean,
        street == street,
        city == city
      ),
      relationship = "many-to-many"
    ) |>
    distinct(sboe_id, candidate_id)

  # Then unambiguous matches on street + city
  matches_street_city <- candidates_for_matching |>
    anti_join(candidates_matched_1, by = "sboe_id") |>
    inner_join(
      candidate_listing_for_matching |> select(candidate_id, street, city),
      by = join_by(
        street == street,
        city == city
      ),
      # With no name to corroborate the match, a missing street or city must
      # not pair with another missing value (the join default treats NA as
      # equal to NA).
      na_matches = "never",
      relationship = "many-to-many"
    ) |>
    distinct(sboe_id, candidate_id, street, city) |>
    group_by(street, city) |>
    mutate(n_names = n_distinct(candidate_id)) |>
    ungroup() |>
    filter(n_names == 1) |>
    select(sboe_id, candidate_id)

  # And unambiguous names
  matches_name_obvious <- inner_join(
    candidates_for_matching,
    candidate_listing_for_matching |>
      distinct(candidate_id, name_clean) |>
      add_count(name_clean) |>
      filter(n == 1) |>
      select(-n),
    by = "name_clean",
    relationship = "many-to-many"
  ) |>
    distinct(sboe_id, candidate_id)

  candidates_matched_2 <- candidates_matched_1 |>
    union(matches_street_city) |>
    union(matches_name_obvious)

  candidates_for_matching_left <- candidates_for_matching |>
    anti_join(candidates_matched_2, by = "sboe_id")

  # Now fuzzyjoin...
  # NOTE(review): this selects name_on_ballot, but
  # prepare_candidate_listing_for_matching() renames that column to
  # name_clean; confirm the listing passed here still carries name_on_ballot.
  candidates_fuzzy_name <- zoomerjoin::jaccard_inner_join(
    candidates_for_matching_left,
    candidate_listing_for_matching |>
      select(name_on_ballot, name_clean, street, city),
    by = "name_clean",
    threshold = 0.85
  )

  candidates_fuzzy_name |>
    distinct(sboe_id, name_on_ballot) |>
    group_by(sboe_id) |>
    mutate(n_names = n_distinct(name_on_ballot)) |>
    ungroup() |>
    filter(n_names != 1) |>
    arrange(desc(n_names), sboe_id)
}