| @@ -1,57 +1,3 @@ | |||
# Map every distinct candidate name to a de-duplicated candidate group.
#
# Each distinct combination of the five name columns in
# `candidate_listing_raw` is assigned a `candidate_id`. Ids that share a
# `dedupe.ids` value in `candidate_listing_dedupe$matches` are merged into one
# `candidate_group`, labelled with the smallest `candidate_id` of the merged
# set. Merges are transitive: if a duplicate set touches a group formed by an
# earlier set, both groups are collapsed together.
#
# Arguments:
#   candidate_listing_raw    Data frame with columns name_on_ballot,
#                            first_name, middle_name, last_name,
#                            name_suffix_lbl and election_dt.
#   candidate_listing_dedupe Object with a `matches` data frame holding the
#                            same five name columns plus `dedupe.ids`.
#
# Returns the id mapping (candidate_id, candidate_group) joined to the name
# columns and `contest_last`, the latest `election_dt` seen for that name.
prep_candidates_dedupe_mapping <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {
  # candidate names ----
  # Extract candidate names, these will be primary keys for the candidates
  # table. Ids follow the sort order of last/first/middle name.
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    mutate(candidate_id = row_number(), .before = 1)

  # Find last election ----
  # Latest election date per distinct name; distinct() collapses ties.
  candidates_last_contest <-
    candidate_listing_raw |>
    group_by(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    slice_max(election_dt, n = 1) |>
    distinct(contest_last = election_dt) |>
    ungroup()

  # Duplicate sets ----
  # One tibble per dedupe id that covers more than one candidate_id, ordered
  # from the most frequent dedupe id to the least frequent.
  deduped_ids <-
    candidate_listing_dedupe$matches |>
    left_join(
      candidate_names,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    ) |>
    distinct(dupe_id = dedupe.ids, candidate_id) |>
    add_count(dupe_id) |>
    filter(n > 1) |>
    select(-n) |>
    mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
    arrange(dupe_id, candidate_id) |>
    group_split(dupe_id)

  # Merge groups ----
  # Start with every candidate in its own group.
  mapping <-
    candidate_names |>
    select(candidate_id) |>
    mutate(candidate_group = candidate_id)

  for (dupes in deduped_ids) {
    # Current group assignment of the ids in this duplicate set.
    map_group <- dupes |>
      select(-dupe_id) |>
      left_join(mapping, by = "candidate_id")

    # Pull in every candidate already attached to any group touched by this
    # set so earlier merges are carried along.
    all_ids <- union(map_group$candidate_id, map_group$candidate_group)
    map_others <- mapping |> filter(candidate_group %in% all_ids)

    # recompute current grouping to min of all ids
    update <-
      dplyr::union(map_group, map_others) |>
      mutate(candidate_group = min(candidate_id, candidate_group))

    mapping <- rows_update(mapping, update, by = "candidate_id")
  }

  mapping |>
    left_join(candidate_names, by = "candidate_id") |>
    left_join(
      candidates_last_contest,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    )
}
| prep_dedupe_candidates <- function(candidate_listing_raw) { | |||
| candidate_names <- | |||
| @@ -338,3 +284,59 @@ fastlink_candidate_listing <- function(candidate_listing_raw) { | |||
| matches = matches | |||
| ) | |||
| } | |||
| # This also isn't used anymore... | |||
# Map every distinct candidate name to a de-duplicated candidate group.
#
# Each distinct combination of the five name columns in
# `candidate_listing_raw` is assigned a `candidate_id`. Ids that share a
# `dedupe.ids` value in `candidate_listing_dedupe$matches` are merged into one
# `candidate_group`, labelled with the smallest `candidate_id` of the merged
# set. Merges are transitive: if a duplicate set touches a group formed by an
# earlier set, both groups are collapsed together.
#
# Arguments:
#   candidate_listing_raw    Data frame with columns name_on_ballot,
#                            first_name, middle_name, last_name,
#                            name_suffix_lbl and election_dt.
#   candidate_listing_dedupe Object with a `matches` data frame holding the
#                            same five name columns plus `dedupe.ids`.
#
# Returns the id mapping (candidate_id, candidate_group) joined to the name
# columns and `contest_last`, the latest `election_dt` seen for that name.
prep_candidates_dedupe_mapping <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {
  # candidate names ----
  # Extract candidate names, these will be primary keys for the candidates
  # table. Ids follow the sort order of last/first/middle name.
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    mutate(candidate_id = row_number(), .before = 1)

  # Find last election ----
  # Latest election date per distinct name; distinct() collapses ties.
  candidates_last_contest <-
    candidate_listing_raw |>
    group_by(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    slice_max(election_dt, n = 1) |>
    distinct(contest_last = election_dt) |>
    ungroup()

  # Duplicate sets ----
  # One tibble per dedupe id that covers more than one candidate_id, ordered
  # from the most frequent dedupe id to the least frequent.
  deduped_ids <-
    candidate_listing_dedupe$matches |>
    left_join(
      candidate_names,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    ) |>
    distinct(dupe_id = dedupe.ids, candidate_id) |>
    add_count(dupe_id) |>
    filter(n > 1) |>
    select(-n) |>
    mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
    arrange(dupe_id, candidate_id) |>
    group_split(dupe_id)

  # Merge groups ----
  # Start with every candidate in its own group.
  mapping <-
    candidate_names |>
    select(candidate_id) |>
    mutate(candidate_group = candidate_id)

  for (dupes in deduped_ids) {
    # Current group assignment of the ids in this duplicate set.
    map_group <- dupes |>
      select(-dupe_id) |>
      left_join(mapping, by = "candidate_id")

    # Pull in every candidate already attached to any group touched by this
    # set so earlier merges are carried along.
    all_ids <- union(map_group$candidate_id, map_group$candidate_group)
    map_others <- mapping |> filter(candidate_group %in% all_ids)

    # recompute current grouping to min of all ids
    update <-
      dplyr::union(map_group, map_others) |>
      mutate(candidate_group = min(candidate_id, candidate_group))

    mapping <- rows_update(mapping, update, by = "candidate_id")
  }

  mapping |>
    left_join(candidate_names, by = "candidate_id") |>
    left_join(
      candidates_last_contest,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    )
}