Garrick Aden-Buie 2 лет назад
Родитель
Сommit
769eb5546d
Не найден GPG ключ соответствующий данной подписи
1 измененных файлов: 56 добавлений и 54 удалений
  1. +56
    -54
      process/R/out_candidate_listing.R

+ 56
- 54
process/R/out_candidate_listing.R Просмотреть файл

@@ -1,57 +1,3 @@
#' Map every candidate name to a de-duplicated candidate group
#'
#' Assigns a `candidate_id` to each distinct name combination found in
#' `candidate_listing_raw`, then uses the duplicate clusters in
#' `candidate_listing_dedupe$matches` to collapse ids that refer to the same
#' person into one `candidate_group` (the smallest id in the merged cluster).
#'
#' @param candidate_listing_raw Data frame with the columns `name_on_ballot`,
#'   `first_name`, `middle_name`, `last_name`, `name_suffix_lbl` and
#'   `election_dt`.
#' @param candidate_listing_dedupe Object with a `matches` element: a data
#'   frame holding the same five name columns plus a `dedupe.ids` column that
#'   identifies the duplicate cluster of each row.
#' @return A data frame with `candidate_id`, `candidate_group`, the five name
#'   columns and `contest_last` (the largest `election_dt` seen for that name
#'   combination).
prep_candidates_dedupe_mapping <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {
  # candidate names ----
  # Extract candidate names, these will be primary keys for the candidates
  # table. Ids follow the last/first/middle name sort order.
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    mutate(candidate_id = row_number(), .before = 1)

  # Find last election ----
  # distinct() on the grouped frame keeps the grouping (name) columns and
  # collapses ties from slice_max() into one row per name combination.
  candidates_last_contest <-
    candidate_listing_raw |>
    group_by(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    slice_max(election_dt, n = 1) |>
    distinct(contest_last = election_dt) |>
    ungroup()

  # Duplicate clusters ----
  # Keep only clusters that contain more than one candidate id, and order
  # them so the most frequent cluster ids are processed first.
  deduped_ids <-
    candidate_listing_dedupe$matches |>
    left_join(
      candidate_names,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    ) |>
    distinct(dupe_id = dedupe.ids, candidate_id) |>
    add_count(dupe_id) |>
    filter(n > 1) |>
    select(-n) |>
    mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
    arrange(dupe_id, candidate_id) |>
    group_split(dupe_id)

  # Every candidate starts out as its own group.
  mapping <-
    candidate_names |>
    select(candidate_id) |>
    mutate(candidate_group = candidate_id)

  for (dupes in deduped_ids) {
    # Current group assignment of the ids in this cluster.
    map_group <- left_join(
      select(dupes, candidate_id),
      mapping,
      by = "candidate_id"
    )

    # Rows already assigned to any group touched by this cluster. These must
    # move together with the cluster so that merges stay transitive. No
    # interactive breakpoint is needed here: the overlap is handled below.
    all_ids <- union(map_group$candidate_id, map_group$candidate_group)
    map_others <- mapping |> filter(candidate_group %in% all_ids)

    # recompute current grouping to min of all ids
    # (min() is evaluated over every row of the combined set, so it yields a
    # single value that is assigned to all of them)
    update <-
      dplyr::union(map_group, map_others) |>
      mutate(candidate_group = min(candidate_id, candidate_group))

    mapping <- rows_update(mapping, update, by = "candidate_id")
  }

  mapping |>
    left_join(candidate_names, by = "candidate_id") |>
    left_join(
      candidates_last_contest,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    )
}

prep_dedupe_candidates <- function(candidate_listing_raw) {
candidate_names <-
@@ -338,3 +284,59 @@ fastlink_candidate_listing <- function(candidate_listing_raw) {
matches = matches
)
}

# This also isn't used anymore...
#' Map every candidate name to a de-duplicated candidate group
#'
#' Assigns a `candidate_id` to each distinct name combination found in
#' `candidate_listing_raw`, then uses the duplicate clusters in
#' `candidate_listing_dedupe$matches` to collapse ids that refer to the same
#' person into one `candidate_group` (the smallest id in the merged cluster).
#'
#' @param candidate_listing_raw Data frame with the columns `name_on_ballot`,
#'   `first_name`, `middle_name`, `last_name`, `name_suffix_lbl` and
#'   `election_dt`.
#' @param candidate_listing_dedupe Object with a `matches` element: a data
#'   frame holding the same five name columns plus a `dedupe.ids` column that
#'   identifies the duplicate cluster of each row.
#' @return A data frame with `candidate_id`, `candidate_group`, the five name
#'   columns and `contest_last` (the largest `election_dt` seen for that name
#'   combination).
prep_candidates_dedupe_mapping <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {
  # candidate names ----
  # Extract candidate names, these will be primary keys for the candidates
  # table. Ids follow the last/first/middle name sort order.
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    mutate(candidate_id = row_number(), .before = 1)

  # Find last election ----
  # distinct() on the grouped frame keeps the grouping (name) columns and
  # collapses ties from slice_max() into one row per name combination.
  candidates_last_contest <-
    candidate_listing_raw |>
    group_by(
      name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
    ) |>
    slice_max(election_dt, n = 1) |>
    distinct(contest_last = election_dt) |>
    ungroup()

  # Duplicate clusters ----
  # Keep only clusters that contain more than one candidate id, and order
  # them so the most frequent cluster ids are processed first.
  deduped_ids <-
    candidate_listing_dedupe$matches |>
    left_join(
      candidate_names,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    ) |>
    distinct(dupe_id = dedupe.ids, candidate_id) |>
    add_count(dupe_id) |>
    filter(n > 1) |>
    select(-n) |>
    mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
    arrange(dupe_id, candidate_id) |>
    group_split(dupe_id)

  # Every candidate starts out as its own group.
  mapping <-
    candidate_names |>
    select(candidate_id) |>
    mutate(candidate_group = candidate_id)

  for (dupes in deduped_ids) {
    # Current group assignment of the ids in this cluster.
    map_group <- left_join(
      select(dupes, candidate_id),
      mapping,
      by = "candidate_id"
    )

    # Rows already assigned to any group touched by this cluster. These must
    # move together with the cluster so that merges stay transitive. No
    # interactive breakpoint is needed here: the overlap is handled below.
    all_ids <- union(map_group$candidate_id, map_group$candidate_group)
    map_others <- mapping |> filter(candidate_group %in% all_ids)

    # recompute current grouping to min of all ids
    # (min() is evaluated over every row of the combined set, so it yields a
    # single value that is assigned to all of them)
    update <-
      dplyr::union(map_group, map_others) |>
      mutate(candidate_group = min(candidate_id, candidate_group))

    mapping <- rows_update(mapping, update, by = "candidate_id")
  }

  mapping |>
    left_join(candidate_names, by = "candidate_id") |>
    left_join(
      candidates_last_contest,
      by = join_by(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl
      )
    )
}

Загрузка…
Отмена
Сохранить