|
|
|
@@ -1,147 +1,307 @@ |
|
|
|
prep_candidate_listing <- function( |
|
|
|
path_candidate_listing_raw |
|
|
|
) { |
|
|
|
fastlink_candidate_listing <- function(candidate_listing_raw) { |
|
|
|
data <- |
|
|
|
candidate_listing_raw |> |
|
|
|
distinct( |
|
|
|
name_on_ballot, |
|
|
|
first_name, |
|
|
|
middle_name, |
|
|
|
last_name, |
|
|
|
name_suffix_lbl, |
|
|
|
street_address, |
|
|
|
city |
|
|
|
) |> |
|
|
|
mutate( |
|
|
|
name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot) |
|
|
|
) |
|
|
|
|
|
|
|
linked <- fastLink::fastLink( |
|
|
|
data, |
|
|
|
data, |
|
|
|
varnames = c( |
|
|
|
"name_on_ballot_clean", |
|
|
|
"first_name", "middle_name", "last_name", "name_suffix_lbl", |
|
|
|
"street_address", "city" |
|
|
|
), |
|
|
|
stringdist.match = c("middle_name"), |
|
|
|
stringdist.method = "jw", |
|
|
|
jw.weight = .25, |
|
|
|
threshold.match = 0.98 |
|
|
|
) |
|
|
|
|
|
|
|
candidate_listing_contest <- |
|
|
|
prep_open_dataset_db(fs::path_dir(path_candidate_listing_raw)) |> |
|
|
|
filter(name_on_ballot != "No Preference") |> |
|
|
|
tidyr::replace_na(list( |
|
|
|
first_name = "", |
|
|
|
middle_name = "", |
|
|
|
last_name = "", |
|
|
|
name_suffix_lbl = "" |
|
|
|
)) |
|
|
|
matches <- |
|
|
|
fastLink::getMatches( |
|
|
|
data, |
|
|
|
data, |
|
|
|
linked, |
|
|
|
threshold.match = 0.9 |
|
|
|
) |> |
|
|
|
as_tibble() |
|
|
|
|
|
|
|
list( |
|
|
|
data = data, |
|
|
|
linked = linked, |
|
|
|
matches = matches |
|
|
|
) |
|
|
|
} |
|
|
|
|
|
|
|
prep_candidates_dedupe_mapping <- function( |
|
|
|
candidate_listing_raw, |
|
|
|
candidate_listing_dedupe |
|
|
|
) { |
|
|
|
# candidate names ---- |
|
|
|
# Extract candidate names, these will be primary keys for the candidates table |
|
|
|
candidate_names <- |
|
|
|
candidate_listing_contest |> |
|
|
|
dbplyr::window_order(last_name, first_name, middle_name) |> |
|
|
|
distinct(first_name, middle_name, last_name, name_suffix_lbl) |> |
|
|
|
candidate_listing_raw |> |
|
|
|
arrange(last_name, first_name, middle_name) |> |
|
|
|
distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |> |
|
|
|
mutate(candidate_id = row_number(), .before = 1) |
|
|
|
|
|
|
|
# candidate_name_on_ballot ---- |
|
|
|
candidate_name_on_ballot <- |
|
|
|
extract_candidate_info( |
|
|
|
candidate_listing_contest, |
|
|
|
# Find last election ---- |
|
|
|
candidates_last_contest <- |
|
|
|
candidate_listing_raw |> |
|
|
|
group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |> |
|
|
|
slice_max(election_dt, n = 1) |> |
|
|
|
distinct(contest_last = election_dt) |
|
|
|
|
|
|
|
deduped_ids <- |
|
|
|
candidate_listing_dedupe$matches |> |
|
|
|
left_join( |
|
|
|
candidate_names, |
|
|
|
info_vars = c("name_on_ballot") |
|
|
|
by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |
|
|
|
) |> |
|
|
|
collect() |
|
|
|
distinct(dupe_id = dedupe.ids, candidate_id) |> |
|
|
|
add_count(dupe_id) |> |
|
|
|
filter(n > 1) |> |
|
|
|
select(-n) |> |
|
|
|
mutate(dupe_id = fct_infreq(paste(dupe_id))) |> |
|
|
|
arrange(dupe_id, candidate_id) |> |
|
|
|
group_split(dupe_id) |
|
|
|
|
|
|
|
mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id) |
|
|
|
|
|
|
|
for (dupes in deduped_ids) { |
|
|
|
|
|
|
|
map_group <- left_join(dupes[-1], mapping, by = "candidate_id") |
|
|
|
all_ids <- union(map_group$candidate_id, map_group$candidate_group) |
|
|
|
map_others <- mapping |> filter(candidate_group %in% all_ids) |
|
|
|
browser(expr = nrow(map_group) < nrow(map_others)) |
|
|
|
|
|
|
|
# recompute current grouping to min of all ids |
|
|
|
update <- |
|
|
|
dplyr::union(map_group, map_others) |> |
|
|
|
mutate(candidate_group = min(candidate_id, candidate_group)) |
|
|
|
|
|
|
|
mapping <- rows_update(mapping, update, by = "candidate_id") |
|
|
|
} |
|
|
|
|
|
|
|
mapping |> |
|
|
|
left_join(candidate_names, by = "candidate_id") |> |
|
|
|
left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)) |
|
|
|
} |
|
|
|
|
|
|
|
prep_dedupe_candidates <- function(candidate_listing_raw) { |
|
|
|
candidate_names <- |
|
|
|
candidate_listing_raw |> |
|
|
|
arrange(last_name, first_name, middle_name) |> |
|
|
|
distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |> |
|
|
|
mutate(candidate_id = row_number(), .before = 1) |
|
|
|
|
|
|
|
# first, last, street_address ---- |
|
|
|
f_l_street <- |
|
|
|
candidate_names |> |
|
|
|
group_by(first_name, last_name, street_address) |> |
|
|
|
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> |
|
|
|
ungroup() |
|
|
|
|
|
|
|
# first, last, cleaned contest name |
|
|
|
f_l_cleaned_contest <- |
|
|
|
candidate_names |> |
|
|
|
mutate( |
|
|
|
contest_name_clean = if_else( |
|
|
|
!is.na(city) & map2_lgl(city, contest_name, \(c, cn) grepl(paste("TOWN OF", c), cn)), |
|
|
|
map2_chr(city, contest_name, \(c, cn) sub(paste("TOWN OF", c, ""), "", cn)), |
|
|
|
contest_name |
|
|
|
), |
|
|
|
contest_name_clean = sub("([A-Z]+ ){1,2}COUNTY ", "", contest_name_clean) |
|
|
|
) |> |
|
|
|
group_by(first_name, last_name, contest_name_clean) |> |
|
|
|
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> |
|
|
|
ungroup() |
|
|
|
|
|
|
|
# first, middle, last, county_name |
|
|
|
f_m_l_county <- |
|
|
|
candidate_names |> |
|
|
|
group_by(first_name, middle_name, last_name, county_name) |> |
|
|
|
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> |
|
|
|
ungroup() |
|
|
|
|
|
|
|
# first, middle initial, last, county_name |
|
|
|
f_mi_l_county <- |
|
|
|
candidate_names |> |
|
|
|
mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |> |
|
|
|
group_by(first_name, middle_name, last_name, county_name) |> |
|
|
|
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> |
|
|
|
ungroup() |
|
|
|
|
|
|
|
# name_on_ballot, middle_name |
|
|
|
nb_m <- |
|
|
|
candidate_names |> |
|
|
|
mutate( |
|
|
|
name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot), |
|
|
|
name_ballot_clean = gsub("[.]", "", name_ballot_clean), |
|
|
|
) |> |
|
|
|
group_by(name_ballot_clean, middle_name) |> |
|
|
|
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |> |
|
|
|
ungroup() |
|
|
|
|
|
|
|
candidate_names |> |
|
|
|
dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |> |
|
|
|
dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |> |
|
|
|
dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |> |
|
|
|
dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |> |
|
|
|
dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name") |
|
|
|
} |
|
|
|
|
|
|
|
prep_candidate_listing <- function( |
|
|
|
candidate_listing_raw, |
|
|
|
candidate_listing_dedupe |
|
|
|
) { |
|
|
|
|
|
|
|
# Give candidates a unique id from dedupe mapping |
|
|
|
cl_dedupe <- |
|
|
|
candidate_listing_dedupe |> |
|
|
|
group_by(candidate_group) |> |
|
|
|
mutate(candidate_id = cur_group_id()) |> |
|
|
|
ungroup() |> |
|
|
|
select(-candidate_group) |
|
|
|
|
|
|
|
# cl raw + candidate_id ---- |
|
|
|
cl_raw <- |
|
|
|
candidate_listing_raw |> |
|
|
|
left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id")) |
|
|
|
|
|
|
|
|
|
|
|
# candidate_names ---- |
|
|
|
candidate_names <- |
|
|
|
cl_raw |> |
|
|
|
distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |> |
|
|
|
arrange(candidate_id, election_dt) |
|
|
|
|
|
|
|
# candidate_name_on_ballot ---- |
|
|
|
candidate_name_on_ballot <- |
|
|
|
cl_raw |> |
|
|
|
distinct(candidate_id, election_dt, name_on_ballot) |> |
|
|
|
arrange(candidate_id, election_dt) |
|
|
|
|
|
|
|
# candidate_address ---- |
|
|
|
candidate_address <- |
|
|
|
candidate_listing_contest |> |
|
|
|
cl_raw |> |
|
|
|
mutate( |
|
|
|
phone = coalesce(phone, office_phone, business_phone), |
|
|
|
street_address = toupper(street_address), |
|
|
|
street_address = REGEXP_REPLACE(street_address, " +", " ", "g"), |
|
|
|
street_address = stringr::str_replace_all(street_address, " +", " "), |
|
|
|
street_address = trimws(street_address), |
|
|
|
) |> |
|
|
|
select(-office_phone, -business_phone) |> |
|
|
|
extract_candidate_info( |
|
|
|
candidate_names, |
|
|
|
info_vars = c("street_address", "city", "state", "zip_code", "phone", "email") |
|
|
|
) |> |
|
|
|
collect() |> |
|
|
|
rename(street = street_address) |
|
|
|
distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |> |
|
|
|
add_address_lookup_local(street = street, postal_code = zip_code) |> |
|
|
|
# remove rows that are entirely empty except for candidate_id |
|
|
|
filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |> |
|
|
|
arrange(candidate_id, election_dt) |
|
|
|
|
|
|
|
# candidate_party ---- |
|
|
|
candidate_party <- |
|
|
|
candidate_listing_contest |> |
|
|
|
extract_candidate_info( |
|
|
|
candidate_names, |
|
|
|
info_vars = c("party_candidate") |
|
|
|
) |> |
|
|
|
collect() |> |
|
|
|
mutate(party_candidate = forcats::fct_inorder(party_candidate)) |
|
|
|
cl_raw |> |
|
|
|
distinct(candidate_id, election_dt, party_candidate) |> |
|
|
|
mutate(party_candidate = forcats::fct_inorder(party_candidate)) |> |
|
|
|
filter(!is.na(party_candidate)) |> |
|
|
|
arrange(candidate_id, election_dt) |
|
|
|
|
|
|
|
# Extract contests (remaining data in candidate_listing) ---- |
|
|
|
cols_candidate_id <- intersect(colnames(candidate_names), colnames(candidate_listing_contest)) |
|
|
|
cols_related <- setdiff( |
|
|
|
c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)), |
|
|
|
"election_dt" |
|
|
|
c("election_dt", "candidate_id") |
|
|
|
) |
|
|
|
|
|
|
|
contests <- |
|
|
|
candidate_listing_contest |> |
|
|
|
select(election_dt:name_suffix_lbl) |> |
|
|
|
cl_raw |> |
|
|
|
select(election_dt:name_suffix_lbl, candidate_id) |> |
|
|
|
select(-any_of(cols_related)) |> |
|
|
|
left_join(candidate_names, by = cols_candidate_id) |> |
|
|
|
relocate(candidate_id, .before = first_name) |> |
|
|
|
collect() |
|
|
|
relocate(candidate_id, .before = first_name) |
|
|
|
|
|
|
|
# Get current candidate information ---- |
|
|
|
candidate_best_name <- |
|
|
|
candidate_names |> |
|
|
|
mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |> |
|
|
|
arrange(desc(n_char), desc(election_dt)) |> |
|
|
|
slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |> |
|
|
|
select(-n_char, -election_dt) |
|
|
|
|
|
|
|
# Get current complete contact information ---- |
|
|
|
candidate_contact_current <- |
|
|
|
current_contact <- |
|
|
|
candidate_listing_current_contact_info(candidate_address) |
|
|
|
|
|
|
|
current_name_on_ballot <- |
|
|
|
candidate_name_on_ballot |> |
|
|
|
slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |> |
|
|
|
select(-election_dt) |
|
|
|
|
|
|
|
current_party <- |
|
|
|
candidate_party |> |
|
|
|
filter(!is.na(party_candidate)) |> |
|
|
|
group_by(candidate_id) |> |
|
|
|
slice_max(election_dt, n = 1) |> |
|
|
|
arrange(party_candidate) |> |
|
|
|
slice_head(n = 1) |> |
|
|
|
ungroup() |> |
|
|
|
select(-election_dt) |> |
|
|
|
rename(party_last = party_candidate) |
|
|
|
|
|
|
|
current_most_party <- |
|
|
|
candidate_party |> |
|
|
|
filter(!is.na(party_candidate)) |> |
|
|
|
group_by(candidate_id) |> |
|
|
|
count(party_candidate) |> |
|
|
|
slice_max(n, n = 1) |> |
|
|
|
arrange(party_candidate) |> |
|
|
|
slice_head(n = 1) |> |
|
|
|
ungroup() |> |
|
|
|
select(-n) |> |
|
|
|
rename(party_most = party_candidate) |
|
|
|
|
|
|
|
current_contest_count <- |
|
|
|
contests |> |
|
|
|
group_by(candidate_id) |> |
|
|
|
distinct(candidate_id, election_dt, contest_name) |> |
|
|
|
summarize( |
|
|
|
contest_n = n(), |
|
|
|
contest_first = min(election_dt), |
|
|
|
contest_latest = max(election_dt) |
|
|
|
) |
|
|
|
|
|
|
|
# Join candidates into one big table ---- |
|
|
|
left_join_by_candidate <- function(x, y) { |
|
|
|
left_join(x, y, join_by(candidate_id), relationship = "one-to-one") |
|
|
|
} |
|
|
|
|
|
|
|
candidates <- |
|
|
|
candidate_names |> |
|
|
|
collect() |> |
|
|
|
left_join( |
|
|
|
candidate_name_on_ballot |> |
|
|
|
slice_max(election_dt, by = candidate_id, n = 1) |> |
|
|
|
select(-election_dt), |
|
|
|
by = "candidate_id", |
|
|
|
relationship = "one-to-one" |
|
|
|
) |> |
|
|
|
left_join( |
|
|
|
candidate_contact_current, |
|
|
|
by = "candidate_id", |
|
|
|
relationship = "one-to-one" |
|
|
|
) |> |
|
|
|
left_join( |
|
|
|
candidate_party |> |
|
|
|
filter(!is.na(party_candidate)) |> |
|
|
|
group_by(candidate_id) |> |
|
|
|
slice_max(election_dt, n = 1) |> |
|
|
|
arrange(party_candidate) |> |
|
|
|
slice_head(n = 1) |> |
|
|
|
ungroup() |> |
|
|
|
select(-election_dt) |> |
|
|
|
rename(party_last = party_candidate), |
|
|
|
by = "candidate_id", |
|
|
|
relationship = "one-to-one" |
|
|
|
) |> |
|
|
|
left_join( |
|
|
|
candidate_party |> |
|
|
|
filter(!is.na(party_candidate)) |> |
|
|
|
group_by(candidate_id) |> |
|
|
|
count(party_candidate) |> |
|
|
|
slice_max(n, n = 1) |> |
|
|
|
arrange(party_candidate) |> |
|
|
|
slice_head(n = 1) |> |
|
|
|
ungroup() |> |
|
|
|
select(-n) |> |
|
|
|
rename(party_most = party_candidate), |
|
|
|
by = "candidate_id", |
|
|
|
relationship = "one-to-one" |
|
|
|
) |> |
|
|
|
left_join( |
|
|
|
contests |> |
|
|
|
group_by(candidate_id) |> |
|
|
|
distinct(candidate_id, election_dt, contest_name) |> |
|
|
|
summarize( |
|
|
|
contest_n = n(), |
|
|
|
contest_first = min(election_dt), |
|
|
|
contest_latest = max(election_dt) |
|
|
|
), |
|
|
|
by = "candidate_id", |
|
|
|
relationship = "one-to-one" |
|
|
|
) |> |
|
|
|
candidate_best_name |> |
|
|
|
left_join_by_candidate(current_name_on_ballot) |> |
|
|
|
left_join_by_candidate(current_contact) |> |
|
|
|
left_join_by_candidate(current_party) |> |
|
|
|
left_join_by_candidate(current_most_party) |> |
|
|
|
left_join_by_candidate(current_contest_count) |> |
|
|
|
relocate(name_on_ballot, .before = first_name) |> |
|
|
|
relocate(starts_with("party"), .before = street) |> |
|
|
|
relocate(starts_with("contest"), .before = street) |> |
|
|
|
add_address_lookup(street = street, postal_code = zip_code) |
|
|
|
add_address_lookup_local(street = street, postal_code = zip_code) |
|
|
|
|
|
|
|
# Return list of tables |
|
|
|
list( |
|
|
|
elections = contests, |
|
|
|
candidates = candidates, |
|
|
|
candidate_name_on_ballot = candidate_name_on_ballot, |
|
|
|
candidate_contact = candidate_address, |
|
|
|
candidate_party = candidate_party |
|
|
|
cl_elections = contests, |
|
|
|
cl_candidates = candidates, |
|
|
|
cl_name_on_ballot = candidate_name_on_ballot, |
|
|
|
cl_contact = candidate_address, |
|
|
|
cl_party = candidate_party |
|
|
|
) |
|
|
|
} |
|
|
|
|
|
|
|
@@ -160,7 +320,7 @@ extract_candidate_info <- function( |
|
|
|
select(-first_name, -middle_name, -last_name, -name_suffix_lbl) |> |
|
|
|
relocate(candidate_id, .before = 1) |> |
|
|
|
distinct() |> |
|
|
|
arrange(candidate_id, election_dt, !!info_vars) |
|
|
|
arrange(candidate_id, election_dt, !!!rlang::syms(info_vars)) |
|
|
|
} |
|
|
|
|
|
|
|
candidate_listing_current_contact_info <- function(candidate_address) { |