|
- fastlink_candidate_listing <- function(candidate_listing_raw) {
- data <-
- candidate_listing_raw |>
- distinct(
- name_on_ballot,
- first_name,
- middle_name,
- last_name,
- name_suffix_lbl,
- street_address,
- city
- ) |>
- mutate(
- name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot)
- )
-
- linked <- fastLink::fastLink(
- data,
- data,
- varnames = c(
- "name_on_ballot_clean",
- "first_name", "middle_name", "last_name", "name_suffix_lbl",
- "street_address", "city"
- ),
- stringdist.match = c("middle_name"),
- stringdist.method = "jw",
- jw.weight = .25,
- threshold.match = 0.98
- )
-
- matches <-
- fastLink::getMatches(
- data,
- data,
- linked,
- threshold.match = 0.9
- ) |>
- as_tibble()
-
- list(
- data = data,
- linked = linked,
- matches = matches
- )
- }
-
- prep_candidates_dedupe_mapping <- function(
- candidate_listing_raw,
- candidate_listing_dedupe
- ) {
- # candidate names ----
- # Extract candidate names, these will be primary keys for the candidates table
- candidate_names <-
- candidate_listing_raw |>
- arrange(last_name, first_name, middle_name) |>
- distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
- mutate(candidate_id = row_number(), .before = 1)
-
- # Find last election ----
- candidates_last_contest <-
- candidate_listing_raw |>
- group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
- slice_max(election_dt, n = 1) |>
- distinct(contest_last = election_dt)
-
- deduped_ids <-
- candidate_listing_dedupe$matches |>
- left_join(
- candidate_names,
- by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
- ) |>
- distinct(dupe_id = dedupe.ids, candidate_id) |>
- add_count(dupe_id) |>
- filter(n > 1) |>
- select(-n) |>
- mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
- arrange(dupe_id, candidate_id) |>
- group_split(dupe_id)
-
- mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id)
-
- for (dupes in deduped_ids) {
-
- map_group <- left_join(dupes[-1], mapping, by = "candidate_id")
- all_ids <- union(map_group$candidate_id, map_group$candidate_group)
- map_others <- mapping |> filter(candidate_group %in% all_ids)
- browser(expr = nrow(map_group) < nrow(map_others))
-
- # recompute current grouping to min of all ids
- update <-
- dplyr::union(map_group, map_others) |>
- mutate(candidate_group = min(candidate_id, candidate_group))
-
- mapping <- rows_update(mapping, update, by = "candidate_id")
- }
-
- mapping |>
- left_join(candidate_names, by = "candidate_id") |>
- left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl))
- }
-
- prep_dedupe_candidates <- function(candidate_listing_raw) {
- candidate_names <-
- candidate_listing_raw |>
- arrange(last_name, first_name, middle_name) |>
- distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |>
- mutate(candidate_id = row_number(), .before = 1)
-
- # first, last, street_address ----
- f_l_street <-
- candidate_names |>
- group_by(first_name, last_name, street_address) |>
- mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
- ungroup()
-
- # first, last, cleaned contest name
- f_l_cleaned_contest <-
- candidate_names |>
- mutate(
- contest_name_clean = if_else(
- !is.na(city) & map2_lgl(city, contest_name, \(c, cn) grepl(paste("TOWN OF", c), cn)),
- map2_chr(city, contest_name, \(c, cn) sub(paste("TOWN OF", c, ""), "", cn)),
- contest_name
- ),
- contest_name_clean = sub("([A-Z]+ ){1,2}COUNTY ", "", contest_name_clean)
- ) |>
- group_by(first_name, last_name, contest_name_clean) |>
- mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
- ungroup()
-
- # first, middle, last, county_name
- f_m_l_county <-
- candidate_names |>
- group_by(first_name, middle_name, last_name, county_name) |>
- mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
- ungroup()
-
- # first, middle initial, last, county_name
- f_mi_l_county <-
- candidate_names |>
- mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |>
- group_by(first_name, middle_name, last_name, county_name) |>
- mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
- ungroup()
-
- # name_on_ballot, middle_name
- nb_m <-
- candidate_names |>
- mutate(
- name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot),
- name_ballot_clean = gsub("[.]", "", name_ballot_clean),
- ) |>
- group_by(name_ballot_clean, middle_name) |>
- mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
- ungroup()
-
- candidate_names |>
- dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |>
- dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |>
- dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |>
- dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |>
- dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name")
- }
-
- prep_candidate_listing <- function(
- candidate_listing_raw,
- candidate_listing_dedupe
- ) {
-
- # Give candidates a unique id from dedupe mapping
- cl_dedupe <-
- candidate_listing_dedupe |>
- group_by(candidate_group) |>
- mutate(candidate_id = cur_group_id()) |>
- ungroup() |>
- select(-candidate_group)
-
- # cl raw + candidate_id ----
- cl_raw <-
- candidate_listing_raw |>
- left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id"))
-
-
- # candidate_names ----
- candidate_names <-
- cl_raw |>
- distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |>
- arrange(candidate_id, election_dt)
-
- # candidate_name_on_ballot ----
- candidate_name_on_ballot <-
- cl_raw |>
- distinct(candidate_id, election_dt, name_on_ballot) |>
- arrange(candidate_id, election_dt)
-
- # candidate_address ----
- candidate_address <-
- cl_raw |>
- mutate(
- phone = coalesce(phone, office_phone, business_phone),
- street_address = toupper(street_address),
- street_address = stringr::str_replace_all(street_address, " +", " "),
- street_address = trimws(street_address),
- ) |>
- select(-office_phone, -business_phone) |>
- distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |>
- add_address_lookup_local(street = street, postal_code = zip_code) |>
- # remove rows that are entirely empty except for candidate_id
- filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |>
- arrange(candidate_id, election_dt)
-
- # candidate_party ----
- candidate_party <-
- cl_raw |>
- distinct(candidate_id, election_dt, party_candidate) |>
- mutate(party_candidate = forcats::fct_inorder(party_candidate)) |>
- filter(!is.na(party_candidate)) |>
- arrange(candidate_id, election_dt)
-
- # Extract contests (remaining data in candidate_listing) ----
- cols_related <- setdiff(
- c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)),
- c("election_dt", "candidate_id")
- )
-
- contests <-
- cl_raw |>
- select(election_dt:name_suffix_lbl, candidate_id) |>
- select(-any_of(cols_related)) |>
- relocate(candidate_id, .before = first_name)
-
- # Get current candidate information ----
- candidate_best_name <-
- candidate_names |>
- mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |>
- arrange(desc(n_char), desc(election_dt)) |>
- slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |>
- select(-n_char, -election_dt)
-
- current_contact <-
- candidate_listing_current_contact_info(candidate_address)
-
- current_name_on_ballot <-
- candidate_name_on_ballot |>
- slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
- select(-election_dt)
-
- current_party <-
- candidate_party |>
- filter(!is.na(party_candidate)) |>
- group_by(candidate_id) |>
- slice_max(election_dt, n = 1) |>
- arrange(party_candidate) |>
- slice_head(n = 1) |>
- ungroup() |>
- select(-election_dt) |>
- rename(party_last = party_candidate)
-
- current_most_party <-
- candidate_party |>
- filter(!is.na(party_candidate)) |>
- group_by(candidate_id) |>
- count(party_candidate) |>
- slice_max(n, n = 1) |>
- arrange(party_candidate) |>
- slice_head(n = 1) |>
- ungroup() |>
- select(-n) |>
- rename(party_most = party_candidate)
-
- current_contest_count <-
- contests |>
- group_by(candidate_id) |>
- distinct(candidate_id, election_dt, contest_name) |>
- summarize(
- contest_n = n(),
- contest_first = min(election_dt),
- contest_latest = max(election_dt)
- )
-
- # Join candidates into one big table ----
- left_join_by_candidate <- function(x, y) {
- left_join(x, y, join_by(candidate_id), relationship = "one-to-one")
- }
-
- candidates <-
- candidate_best_name |>
- left_join_by_candidate(current_name_on_ballot) |>
- left_join_by_candidate(current_contact) |>
- left_join_by_candidate(current_party) |>
- left_join_by_candidate(current_most_party) |>
- left_join_by_candidate(current_contest_count) |>
- relocate(name_on_ballot, .before = first_name) |>
- relocate(starts_with("party"), .before = street) |>
- relocate(starts_with("contest"), .before = street) |>
- add_address_lookup_local(street = street, postal_code = zip_code)
-
- # Return list of tables
- list(
- cl_elections = contests,
- cl_candidates = candidates,
- cl_name_on_ballot = candidate_name_on_ballot,
- cl_contact = candidate_address,
- cl_party = candidate_party
- )
- }
-
- extract_candidate_info <- function(
- candidate_listing_contest,
- candidate_names,
- info_vars
- ) {
- candidate_listing_contest |>
- select(first_name:name_suffix_lbl, election_dt, all_of(info_vars)) |>
- distinct() |>
- left_join(
- candidate_names,
- by = c("first_name", "middle_name", "last_name", "name_suffix_lbl")
- ) |>
- select(-first_name, -middle_name, -last_name, -name_suffix_lbl) |>
- relocate(candidate_id, .before = 1) |>
- distinct() |>
- arrange(candidate_id, election_dt, !!!rlang::syms(info_vars))
- }
-
- candidate_listing_current_contact_info <- function(candidate_address) {
- candidate_phone_current <-
- candidate_address |>
- filter(!is.na(phone)) |>
- slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
- select(candidate_id, phone)
-
- candidate_email_current <-
- candidate_address |>
- filter(!is.na(email)) |>
- slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
- select(candidate_id, email)
-
- candidate_address_current <-
- candidate_address |>
- group_by(candidate_id) |>
- slice_max(election_dt, n = 1) |>
- select(-election_dt) |>
- filter(
- n_distinct(street) == 1 | !grepl("PO BOX", street)
- ) |>
- slice_head(n = 1) |>
- select(candidate_id, street:zip_code) |>
- ungroup()
-
- candidate_address_current |>
- left_join(candidate_phone_current, by = "candidate_id", relationship = "one-to-one") |>
- left_join(candidate_email_current, by = "candidate_id", relationship = "one-to-one")
- }
|