#' Map every candidate name record to a de-duplicated candidate group
#'
#' Assigns a `candidate_id` to each distinct combination of the name columns
#' in `candidate_listing_raw`, then merges ids that share a `dedupe.ids` value
#' in `candidate_listing_dedupe$matches` into one `candidate_group`. A group is
#' identified by the smallest `candidate_id` it contains. Groups that overlap
#' through a shared candidate are merged transitively.
#'
#' @param candidate_listing_raw Data frame with the columns `name_on_ballot`,
#'   `first_name`, `middle_name`, `last_name`, `name_suffix_lbl` and
#'   `election_dt`.
#' @param candidate_listing_dedupe List-like object whose `matches` element is
#'   a data frame holding the same five name columns plus `dedupe.ids`.
#' @return A data frame with the columns `candidate_id`, `candidate_group`,
#'   the five name columns, and `contest_last` (the latest `election_dt` seen
#'   for that exact name combination).
prep_candidates_dedupe_mapping <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {
  # candidate names ----
  # Extract candidate names, these will be primary keys for the candidates table
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    mutate(candidate_id = row_number(), .before = 1)

  # Find last election ----
  # Per-verb `by =` keeps the result ungrouped; `distinct()` collapses ties
  # on the latest election date down to a single row per name combination.
  candidates_last_contest <-
    candidate_listing_raw |>
    slice_max(
      election_dt,
      n = 1,
      by = c(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
    ) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl,
      contest_last = election_dt
    )

  # Duplicate sets ----
  # One tibble per dedupe id that links more than one candidate id. The factor
  # built by `fct_infreq()` makes `group_split()` return the sets in order of
  # decreasing size.
  deduped_ids <-
    candidate_listing_dedupe$matches |>
    left_join(
      candidate_names,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    ) |>
    distinct(dupe_id = dedupe.ids, candidate_id) |>
    add_count(dupe_id) |>
    filter(n > 1) |>
    select(-n) |>
    mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
    arrange(dupe_id, candidate_id) |>
    group_split(dupe_id)

  # Merge groups ----
  # Every candidate starts out as its own group.
  mapping <-
    candidate_names |>
    select(candidate_id) |>
    mutate(candidate_group = candidate_id)

  # Each iteration depends on the mapping produced by the previous one, so
  # this stays a sequential loop.
  for (dupes in deduped_ids) {
    # Current group assignment of the candidates in this duplicate set
    map_group <- left_join(
      select(dupes, -dupe_id),
      mapping,
      by = "candidate_id"
    )
    all_ids <- union(map_group$candidate_id, map_group$candidate_group)
    # Candidates merged earlier into any group touched by this set
    map_others <- mapping |> filter(candidate_group %in% all_ids)

    # recompute current grouping to min of all ids
    update <-
      dplyr::union(map_group, map_others) |>
      mutate(candidate_group = min(candidate_id, candidate_group))

    mapping <- rows_update(mapping, update, by = "candidate_id")
  }

  mapping |>
    left_join(candidate_names, by = "candidate_id") |>
    left_join(
      candidates_last_contest,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    )
}
| prep_dedupe_candidates <- function(candidate_listing_raw) { | prep_dedupe_candidates <- function(candidate_listing_raw) { | ||||
| candidate_names <- | candidate_names <- | ||||
| matches = matches | matches = matches | ||||
| ) | ) | ||||
| } | } | ||||
# This also isn't used anymore...
#' Map every candidate name record to a de-duplicated candidate group
#'
#' Assigns a `candidate_id` to each distinct combination of the name columns
#' in `candidate_listing_raw`, then merges ids that share a `dedupe.ids` value
#' in `candidate_listing_dedupe$matches` into one `candidate_group`. A group is
#' identified by the smallest `candidate_id` it contains. Groups that overlap
#' through a shared candidate are merged transitively.
#'
#' @param candidate_listing_raw Data frame with the columns `name_on_ballot`,
#'   `first_name`, `middle_name`, `last_name`, `name_suffix_lbl` and
#'   `election_dt`.
#' @param candidate_listing_dedupe List-like object whose `matches` element is
#'   a data frame holding the same five name columns plus `dedupe.ids`.
#' @return A data frame with the columns `candidate_id`, `candidate_group`,
#'   the five name columns, and `contest_last` (the latest `election_dt` seen
#'   for that exact name combination).
prep_candidates_dedupe_mapping <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {
  # candidate names ----
  # Extract candidate names, these will be primary keys for the candidates table
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    mutate(candidate_id = row_number(), .before = 1)

  # Find last election ----
  # Per-verb `by =` keeps the result ungrouped; `distinct()` collapses ties
  # on the latest election date down to a single row per name combination.
  candidates_last_contest <-
    candidate_listing_raw |>
    slice_max(
      election_dt,
      n = 1,
      by = c(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
    ) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl,
      contest_last = election_dt
    )

  # Duplicate sets ----
  # One tibble per dedupe id that links more than one candidate id. The factor
  # built by `fct_infreq()` makes `group_split()` return the sets in order of
  # decreasing size.
  deduped_ids <-
    candidate_listing_dedupe$matches |>
    left_join(
      candidate_names,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    ) |>
    distinct(dupe_id = dedupe.ids, candidate_id) |>
    add_count(dupe_id) |>
    filter(n > 1) |>
    select(-n) |>
    mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
    arrange(dupe_id, candidate_id) |>
    group_split(dupe_id)

  # Merge groups ----
  # Every candidate starts out as its own group.
  mapping <-
    candidate_names |>
    select(candidate_id) |>
    mutate(candidate_group = candidate_id)

  # Each iteration depends on the mapping produced by the previous one, so
  # this stays a sequential loop.
  for (dupes in deduped_ids) {
    # Current group assignment of the candidates in this duplicate set
    map_group <- left_join(
      select(dupes, -dupe_id),
      mapping,
      by = "candidate_id"
    )
    all_ids <- union(map_group$candidate_id, map_group$candidate_group)
    # Candidates merged earlier into any group touched by this set
    map_others <- mapping |> filter(candidate_group %in% all_ids)

    # recompute current grouping to min of all ids
    update <-
      dplyr::union(map_group, map_others) |>
      mutate(candidate_group = min(candidate_id, candidate_group))

    mapping <- rows_update(mapping, update, by = "candidate_id")
  }

  mapping |>
    left_join(candidate_names, by = "candidate_id") |>
    left_join(
      candidates_last_contest,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    )
}