prep_candidates_dedupe_mapping <- function( candidate_listing_raw, candidate_listing_dedupe ) { # candidate names ---- # Extract candidate names, these will be primary keys for the candidates table candidate_names <- candidate_listing_raw |> arrange(last_name, first_name, middle_name) |> distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |> mutate(candidate_id = row_number(), .before = 1) # Find last election ---- candidates_last_contest <- candidate_listing_raw |> group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |> slice_max(election_dt, n = 1) |> distinct(contest_last = election_dt) deduped_ids <- candidate_listing_dedupe$matches |> left_join( candidate_names, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) ) |> distinct(dupe_id = dedupe.ids, candidate_id) |> add_count(dupe_id) |> filter(n > 1) |> select(-n) |> mutate(dupe_id = fct_infreq(paste(dupe_id))) |> arrange(dupe_id, candidate_id) |> group_split(dupe_id) mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id) for (dupes in deduped_ids) { map_group <- left_join(dupes[-1], mapping, by = "candidate_id") all_ids <- union(map_group$candidate_id, map_group$candidate_group) map_others <- mapping |> filter(candidate_group %in% all_ids) browser(expr = nrow(map_group) < nrow(map_others)) # recompute current grouping to min of all ids update <- dplyr::union(map_group, map_others) |> mutate(candidate_group = min(candidate_id, candidate_group)) mapping <- rows_update(mapping, update, by = "candidate_id") } mapping |> left_join(candidate_names, by = "candidate_id") |> left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)) } prep_dedupe_candidates <- function(candidate_listing_raw) { candidate_names <- candidate_listing_raw |> arrange(last_name, first_name, middle_name) |> distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |> mutate(candidate_id = row_number(), .before = 1) # first, last, street_address ---- f_l_street <- candidate_names |> group_by(first_name, last_name, street_address) |> mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> ungroup() # first, last, cleaned contest name f_l_cleaned_contest <- candidate_names |> mutate( contest_name_clean = if_else( !is.na(city) & map2_lgl(city, contest_name, \(c, cn) grepl(paste("TOWN OF", c), cn)), map2_chr(city, contest_name, \(c, cn) sub(paste("TOWN OF", c, ""), "", cn)), contest_name ), contest_name_clean = sub("([A-Z]+ ){1,2}COUNTY ", "", contest_name_clean) ) |> group_by(first_name, last_name, contest_name_clean) |> mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> ungroup() # first, middle, last, county_name f_m_l_county <- candidate_names |> group_by(first_name, middle_name, last_name, county_name) |> mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> ungroup() # first, middle initial, last, county_name f_mi_l_county <- candidate_names |> mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |> group_by(first_name, middle_name, last_name, county_name) |> mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> ungroup() # name_on_ballot, middle_name nb_m <- candidate_names |> mutate( name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot), name_ballot_clean = gsub("[.]", "", name_ballot_clean), ) |> group_by(name_ballot_clean, middle_name) |> mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> ungroup() candidate_names |> dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |> dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |> dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |> dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |> dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name") } prep_candidate_listing <- function( candidate_listing_raw, candidate_listing_dedupe ) { # Give candidates a unique id from dedupe mapping cl_dedupe <- candidate_listing_dedupe |> group_by(candidate_group) |> mutate(candidate_id = cur_group_id()) |> ungroup() |> select(-candidate_group) # cl raw + candidate_id ---- cl_raw <- candidate_listing_raw |> left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id")) # candidate_names ---- candidate_names <- cl_raw |> distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |> arrange(candidate_id, election_dt) # candidate_name_on_ballot ---- candidate_name_on_ballot <- cl_raw |> distinct(candidate_id, election_dt, name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |> arrange(candidate_id, election_dt) # candidate_address ---- candidate_address <- cl_raw |> mutate( phone = coalesce(phone, office_phone, business_phone), street_address = toupper(street_address), street_address = stringr::str_replace_all(street_address, " +", " "), street_address = trimws(street_address), ) |> select(-office_phone, -business_phone) |> distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |> add_address_lookup_local(street = street, postal_code = zip_code) |> # remove rows that are entirely empty except for candidate_id filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |> arrange(candidate_id, election_dt) # candidate_party ---- candidate_party <- cl_raw |> distinct(candidate_id, election_dt, party_candidate) |> mutate(party_candidate = forcats::fct_inorder(party_candidate)) |> filter(!is.na(party_candidate)) |> arrange(candidate_id, election_dt) # Extract contests (remaining data in candidate_listing) ---- cols_related <- setdiff( c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)), c("election_dt", colnames(candidate_names)) ) contests <- cl_raw |> select(election_dt:name_suffix_lbl, candidate_id) |> select(-any_of(cols_related)) |> relocate(candidate_id, .before = first_name) # Get current candidate information ---- candidate_best_name <- candidate_names |> mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |> arrange(desc(n_char), desc(election_dt)) |> slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |> select(-n_char, -election_dt) current_contact <- candidate_listing_current_contact_info(candidate_address) current_name_on_ballot <- candidate_name_on_ballot |> slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |> select(candidate_id, name_on_ballot) current_party <- candidate_party |> filter(!is.na(party_candidate)) |> group_by(candidate_id) |> slice_max(election_dt, n = 1) |> arrange(party_candidate) |> slice_head(n = 1) |> ungroup() |> select(-election_dt) |> rename(party_last = party_candidate) current_most_party <- candidate_party |> filter(!is.na(party_candidate)) |> group_by(candidate_id) |> count(party_candidate) |> slice_max(n, n = 1) |> arrange(party_candidate) |> slice_head(n = 1) |> ungroup() |> select(-n) |> rename(party_most = party_candidate) current_contest_count <- contests |> group_by(candidate_id) |> distinct(candidate_id, election_dt, contest_name) |> summarize( contest_n = n(), contest_first = min(election_dt), contest_latest = max(election_dt) ) # Join candidates into one big table ---- left_join_by_candidate <- function(x, y) { left_join(x, y, join_by(candidate_id), relationship = "one-to-one") } candidates <- candidate_best_name |> left_join_by_candidate(current_name_on_ballot) |> left_join_by_candidate(current_contact) |> left_join_by_candidate(current_party) |> left_join_by_candidate(current_most_party) |> left_join_by_candidate(current_contest_count) |> relocate(name_on_ballot, .before = first_name) |> relocate(starts_with("party"), .before = street) |> relocate(starts_with("contest"), .before = street) |> add_address_lookup_local(street = street, postal_code = zip_code) |> arrange(candidate_id) # Return list of tables list( cl_elections = contests, cl_candidates = candidates, cl_name_on_ballot = candidate_name_on_ballot, cl_contact = candidate_address, cl_party = candidate_party ) } candidate_listing_current_contact_info <- function(candidate_address) { candidate_phone_current <- candidate_address |> filter(!is.na(phone)) |> slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |> select(candidate_id, phone) candidate_email_current <- candidate_address |> filter(!is.na(email)) |> slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |> select(candidate_id, email) candidate_address_current <- candidate_address |> group_by(candidate_id) |> slice_max(election_dt, n = 1) |> select(-election_dt) |> filter( n_distinct(street) == 1 | !grepl("PO BOX", street) ) |> slice_head(n = 1) |> select(candidate_id, street:zip_code) |> ungroup() candidate_address_current |> left_join(candidate_phone_current, by = "candidate_id", relationship = "one-to-one") |> left_join(candidate_email_current, by = "candidate_id", relationship = "one-to-one") } # This function isn't used anymore -- I opted for a more manual approach of # using overlapping signals. But this general idea could work in other places, # e.g. for deduping donors. fastlink_candidate_listing <- function(candidate_listing_raw) { data <- candidate_listing_raw |> distinct( name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city ) |> mutate( name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot) ) linked <- fastLink::fastLink( data, data, varnames = c( "name_on_ballot_clean", "first_name", "middle_name", "last_name", "name_suffix_lbl", "street_address", "city" ), stringdist.match = c("middle_name"), stringdist.method = "jw", jw.weight = .25, threshold.match = 0.98 ) matches <- fastLink::getMatches( data, data, linked, threshold.match = 0.9 ) |> as_tibble() list( data = data, linked = linked, matches = matches ) }