|
#' Prepare committee-side candidate records for matching
#'
#' Opens the four datasets, keeps officers of type "Candidate" whose
#' `report_id` appears in the report list, derives an upper-cased, cleaned
#' `name_clean`, resolves each address through the address lookup table and
#' splits the resolved address into its components. Names that contain a
#' quoted nickname (e.g. `ROLLANDE "ROLIE" SAMPSON`) contribute an additional
#' alias row (`ROLIE SAMPSON`).
#'
#' @param path_data_prep_officers Passed to `out_open_dataset_db()`; source of
#'   the officer rows (`report_id`, `type`, `sboe_id`, `name`, `address`).
#' @param path_out_report_list Passed to `out_open_dataset_db()`; only
#'   officers whose `report_id` is present here are kept.
#' @param path_out_committees Passed to `out_open_dataset_db()`; supplies
#'   `address_lookup` per `sboe_id` to patch missing officer addresses.
#' @param path_addresses Passed to `out_open_dataset_db()`; its first two
#'   columns are used to map a raw address to a resolved one.
#' @return The collected rows (original names plus alias rows) with columns
#'   `sboe_id`, `name_clean`, `address_raw`, `street`, `city`, `state` and
#'   `postal_code`. `street` is `NA` when it contains no digit.
prepare_candidates_for_matching <- function(
  path_data_prep_officers,
  path_out_report_list,
  path_out_committees,
  path_addresses = "data-out/addresses"
) {
  lg_info_target(lg_get_logger())

  # Final name scrub shared by the main rows and the alias rows: drop
  # everything except A-Z and spaces, collapse whitespace runs, trim the ends.
  # REGEXP_REPLACE is not an R function; presumably it is passed through to
  # the backend behind out_open_dataset_db() -- TODO confirm.
  normalise_name <- function(tbl) {
    mutate(
      tbl,
      name_clean = REGEXP_REPLACE(name_clean, "[^A-Z ]", "", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "^\\s+|\\s+$", "", "g")
    )
  }

  report_list <- out_open_dataset_db(path_out_report_list)
  addresses <- out_open_dataset_db(path_addresses)
  committees <- out_open_dataset_db(path_out_committees)
  officers <- out_open_dataset_db(path_data_prep_officers)

  committee_addresses <- select(committees, sboe_id, address = address_lookup)

  # The join and rename below require these two columns to be
  # `address_lookup` and `address_resolved`.
  address_map <- select(addresses, 1:2)

  # Not collected yet: quotes and punctuation are deliberately still present
  # in name_clean so the alias step can find quoted nicknames.
  candidates_raw <-
    officers |>
    semi_join(report_list, by = "report_id") |>
    filter(type == "Candidate") |>
    rows_patch(committee_addresses, by = "sboe_id", unmatched = "ignore") |>
    mutate(
      name_clean = toupper(name),
      # Drop a trailing " FOR <office>" style suffix.
      name_clean = REGEXP_REPLACE(name_clean, " FOR .+$", "", "g"),
      name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g")
    ) |>
    distinct(sboe_id, name_clean, address) |>
    rename(address_raw = address) |>
    left_join(address_map, by = join_by(address_raw == address_lookup)) |>
    rename(address = address_resolved)

  # Add new rows with aliases for people like `"ROLLANDE \"ROLIE\" SAMPSON"`:
  # everything up to and including the quoted nickname is replaced by the
  # nickname itself, keeping the rest of the name.
  alias_rows <-
    candidates_raw |>
    filter(grepl('"[A-Z]+"', name_clean)) |>
    mutate(
      name_clean = REGEXP_REPLACE(name_clean, '.+? "([A-Z]+)"(.+)$', "\\1 \\2", "g")
    ) |>
    normalise_name() |>
    collect()

  primary_rows <-
    candidates_raw |>
    normalise_name() |>
    collect()

  bind_rows(primary_rows, alias_rows) |>
    filter(!is.na(name_clean)) |>
    tidyr::extract(
      address,
      into = c("street", "city", "state", "postal_code"),
      regex = "(.+), (.+), ([A-Z]{2}), (\\d{5})"
    ) |>
    # A street without any digit has no house number; treat it as missing.
    mutate(street = if_else(grepl("\\d", street), street, NA_character_))
}
-
#' Prepare the candidate-listing side of the matching table
#'
#' Produces several name spellings per candidate (full name, middle initial,
#' first + last, the upper-cased name on ballot, and a nickname variant for
#' ballot names such as `FIRST (NICK) LAST`) and attaches the contact columns.
#'
#' NOTE(review): unlike the ballot names, the names composed from
#' `cl_candidates` are not upper-cased or stripped of punctuation here --
#' confirm the source columns are already normalised.
#'
#' @param candidate_listing A list-like object; this function reads its
#'   `cl_candidates`, `cl_name_on_ballot` and `cl_contact` elements.
#' @return The distinct combinations of `candidate_id`, `name_clean` and the
#'   columns selected from `cl_contact` (its first column plus
#'   `street:zip_code`).
prepare_candidate_listing_for_matching <- function(candidate_listing) {
  lg_info_target(lg_get_logger())

  # paste() renders a missing value as the literal text "NA", which would leak
  # into the names being matched (e.g. "JOHN NA SMITH NA"). Turn missing parts
  # into "" instead; the surplus spaces are removed by str_squish() below.
  blank_if_na <- function(x) {
    x <- as.character(x)
    x[is.na(x)] <- ""
    x
  }

  candidate_name <-
    candidate_listing$cl_candidates |>
    mutate(
      across(
        c(first_name, middle_name, last_name, name_suffix_lbl),
        blank_if_na
      ),
      name_full = paste(first_name, middle_name, last_name, name_suffix_lbl),
      name_mi = paste(
        first_name,
        substr(middle_name, 1, 1),
        last_name,
        name_suffix_lbl
      ),
      name_first_last = paste(first_name, last_name, name_suffix_lbl)
    ) |>
    select(candidate_id, name_full, name_mi, name_first_last) |>
    tidyr::pivot_longer(-candidate_id, values_to = "name_clean") |>
    select(candidate_id, name_clean)

  candidate_alias <-
    candidate_listing$cl_name_on_ballot |>
    distinct(candidate_id, name_clean = name_on_ballot) |>
    mutate(
      name_clean = toupper(name_clean),
      name_clean = gsub("[,.]", "", name_clean)
    )

  # For ballot names with a parenthesised nickname, add a variant that uses
  # the nickname in place of everything before it.
  candidate_aka <-
    candidate_alias |>
    filter(grepl("[(].+[)]", name_clean)) |>
    mutate(name_clean = sub(".+? \\((.+?)\\) (.+)$", "\\1 \\2", name_clean))

  bind_rows(candidate_name, candidate_alias, candidate_aka) |>
    mutate(name_clean = stringr::str_squish(name_clean)) |>
    left_join(
      candidate_listing$cl_contact |> select(1, street:zip_code),
      by = "candidate_id",
      relationship = "many-to-many"
    ) |>
    # Different spellings can collapse to the same name after squishing.
    distinct()
}
-
#' Link committee candidates to the candidate listing with fastLink
#'
#' Compares the two tables on `name_clean`, `street` and `city`. Name and
#' street are compared by string distance (method `"dl"`) with partial-match
#' levels; `city` gets neither setting.
#'
#' @param candidates_for_matching First table handed to `fastLink::fastLink()`.
#' @param candidate_listing_for_matching Second table handed to
#'   `fastLink::fastLink()`.
#' @return The object returned by `fastLink::fastLink()`.
fastlink_candidates <- function(
  candidates_for_matching,
  candidate_listing_for_matching
) {
  lg_info_target(lg_get_logger())

  link_fields <- c("name_clean", "street", "city")
  fuzzy_fields <- c("name_clean", "street")

  fastLink::fastLink(
    candidates_for_matching,
    candidate_listing_for_matching,
    varnames = link_fields,
    stringdist.match = fuzzy_fields,
    partial.match = fuzzy_fields,
    stringdist.method = "dl",
    threshold.match = 0.9
  )
}
-
#' Extract the matched id pairs from a fastLink result
#'
#' @param candidates_for_matching First table passed to
#'   `fastLink::getMatches()`.
#' @param candidate_listing_for_matching Second table passed to
#'   `fastLink::getMatches()`.
#' @param candidates_linked Third argument passed to `fastLink::getMatches()`
#'   (presumably the output of `fastlink_candidates()` -- confirm with caller).
#' @return A tibble with the distinct `sboe_id` / `candidate_id` pairs found
#'   in the matches at `threshold.match = 0.9`.
fastlink_match_candidates <- function(
  candidates_for_matching,
  candidate_listing_for_matching,
  candidates_linked
) {
  lg_info_target(lg_get_logger())

  fastLink::getMatches(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked,
    threshold.match = 0.9
  ) |>
    as_tibble() |>
    distinct(sboe_id, candidate_id)
}
-
#' Rule-based matching of committee candidates to the candidate listing
#'
#' Builds three sets of (`sboe_id`, `candidate_id`) pairs -- exact match on
#' name + street + city, address-only match where the address points at a
#' single listing candidate, and name-only match where the name belongs to a
#' single listing candidate -- then fuzzy-joins the rows that are still
#' unmatched on `name_clean`.
#'
#' NOTE(review): this looks unfinished. The combined exact matches are only
#' used to decide which rows go to the fuzzy join and are not returned, and
#' `candidates_linked` is never read. Confirm the intended return value.
#'
#' @param candidates_for_matching Data frame with `sboe_id`, `name_clean`,
#'   `street` and `city`.
#' @param candidate_listing_for_matching Data frame with `candidate_id`,
#'   `name_on_ballot`, `name_clean`, `street` and `city`.
#'   NOTE(review): `prepare_candidate_listing_for_matching()` renames
#'   `name_on_ballot` to `name_clean`, so its output does not appear to carry
#'   a `name_on_ballot` column -- confirm which input is expected here.
#' @param candidates_linked Currently unused.
#' @return Rows of `sboe_id`, `name_on_ballot` and `n_names` for every
#'   `sboe_id` whose fuzzy name match hit more than one distinct
#'   `name_on_ballot`, sorted by `n_names` (descending) and then `sboe_id`.
candidates_match <- function(
  candidates_for_matching,
  candidate_listing_for_matching,
  candidates_linked
) {
  # First, direct matches
  candidates_matched_1 <-
    candidates_for_matching |>
    inner_join(
      candidate_listing_for_matching |>
        select(candidate_id, name_clean, street, city),
      by = join_by(name_clean, street, city),
      relationship = "many-to-many"
    ) |>
    distinct(sboe_id, candidate_id)

  # Then unambiguous matches on street + city
  matches_street_city <-
    candidates_for_matching |>
    anti_join(candidates_matched_1, by = "sboe_id") |>
    inner_join(
      candidate_listing_for_matching |>
        select(candidate_id, street, city),
      by = join_by(street, city),
      # dplyr joins treat NA keys as equal by default. Without this, rows
      # whose address is missing on both sides would be joined to each other,
      # and a single listing candidate with no address would pass the
      # "unambiguous" filter below and absorb every address-less committee.
      na_matches = "never",
      relationship = "many-to-many"
    ) |>
    distinct(sboe_id, candidate_id, street, city) |>
    group_by(street, city) |>
    mutate(n_names = n_distinct(candidate_id)) |>
    ungroup() |>
    # Keep only addresses that belong to exactly one listing candidate.
    filter(n_names == 1) |>
    select(sboe_id, candidate_id)

  # And unambiguous names
  matches_name_obvious <-
    inner_join(
      candidates_for_matching,
      candidate_listing_for_matching |>
        distinct(candidate_id, name_clean) |>
        add_count(name_clean) |>
        # A name is "obvious" when exactly one candidate_id carries it.
        filter(n == 1) |>
        select(-n),
      by = "name_clean",
      relationship = "many-to-many"
    ) |>
    distinct(sboe_id, candidate_id)

  candidates_matched_2 <-
    candidates_matched_1 |>
    union(matches_street_city) |>
    union(matches_name_obvious)

  candidates_for_matching_left <-
    candidates_for_matching |>
    anti_join(candidates_matched_2, by = "sboe_id")

  # Now fuzzyjoin...
  candidates_fuzzy_name <-
    zoomerjoin::jaccard_inner_join(
      candidates_for_matching_left,
      candidate_listing_for_matching |>
        select(name_on_ballot, name_clean, street, city),
      by = "name_clean",
      threshold = 0.85
    )

  # Report the sboe_ids whose fuzzy match is ambiguous (several ballot names).
  candidates_fuzzy_name |>
    distinct(sboe_id, name_on_ballot) |>
    group_by(sboe_id) |>
    mutate(n_names = n_distinct(name_on_ballot)) |>
    ungroup() |>
    filter(n_names != 1) |>
    arrange(desc(n_names), sboe_id)
}
|