Quellcode durchsuchen

normalized candidate listing (the hard way)

main
Garrick Aden-Buie vor 2 Jahren
Ursprung
Commit
91ecf9eb62
Es konnte kein GPG-Schlüssel zu dieser Signatur gefunden werden
7 geänderte Dateien mit 486 neuen und 155 gelöschten Zeilen
  1. +8
    -1
      process/R/candidate_listing.R
  2. +265
    -105
      process/R/out_candidate_listing.R
  3. +35
    -0
      process/R/prepare_addresses.R
  4. +49
    -16
      process/R/prepare_candidates.R
  5. +80
    -0
      process/R/utils-dedupe.R
  6. +30
    -25
      process/_targets.R
  7. +19
    -8
      process/_targets/meta/meta

+ 8
- 1
process/R/candidate_listing.R Datei anzeigen

@@ -14,7 +14,14 @@ get_candidate_listing <- function(years = 2016:2023) {
\(.x) sub("(\\d{3})(\\d{3})(\\d{4})", "(\\1) \\2-\\3", .x)
),
across(street_address, fixup_po_box)
)
) |>
filter(name_on_ballot != "No Preference") |>
tidyr::replace_na(list(
first_name = "",
middle_name = "",
last_name = "",
name_suffix_lbl = ""
))
}

get_candidate_listing_year <- function(year) {

+ 265
- 105
process/R/out_candidate_listing.R Datei anzeigen

@@ -1,147 +1,307 @@
prep_candidate_listing <- function(
path_candidate_listing_raw
) {
fastlink_candidate_listing <- function(candidate_listing_raw) {
data <-
candidate_listing_raw |>
distinct(
name_on_ballot,
first_name,
middle_name,
last_name,
name_suffix_lbl,
street_address,
city
) |>
mutate(
name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot)
)

linked <- fastLink::fastLink(
data,
data,
varnames = c(
"name_on_ballot_clean",
"first_name", "middle_name", "last_name", "name_suffix_lbl",
"street_address", "city"
),
stringdist.match = c("middle_name"),
stringdist.method = "jw",
jw.weight = .25,
threshold.match = 0.98
)

candidate_listing_contest <-
prep_open_dataset_db(fs::path_dir(path_candidate_listing_raw)) |>
filter(name_on_ballot != "No Preference") |>
tidyr::replace_na(list(
first_name = "",
middle_name = "",
last_name = "",
name_suffix_lbl = ""
))
matches <-
fastLink::getMatches(
data,
data,
linked,
threshold.match = 0.9
) |>
as_tibble()

list(
data = data,
linked = linked,
matches = matches
)
}

prep_candidates_dedupe_mapping <- function(
candidate_listing_raw,
candidate_listing_dedupe
) {
# candidate names ----
# Extract candidate names, these will be primary keys for the candidates table
candidate_names <-
candidate_listing_contest |>
dbplyr::window_order(last_name, first_name, middle_name) |>
distinct(first_name, middle_name, last_name, name_suffix_lbl) |>
candidate_listing_raw |>
arrange(last_name, first_name, middle_name) |>
distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
mutate(candidate_id = row_number(), .before = 1)

# candidate_name_on_ballot ----
candidate_name_on_ballot <-
extract_candidate_info(
candidate_listing_contest,
# Find last election ----
candidates_last_contest <-
candidate_listing_raw |>
group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
slice_max(election_dt, n = 1) |>
distinct(contest_last = election_dt)

deduped_ids <-
candidate_listing_dedupe$matches |>
left_join(
candidate_names,
info_vars = c("name_on_ballot")
by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
) |>
collect()
distinct(dupe_id = dedupe.ids, candidate_id) |>
add_count(dupe_id) |>
filter(n > 1) |>
select(-n) |>
mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
arrange(dupe_id, candidate_id) |>
group_split(dupe_id)

mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id)

for (dupes in deduped_ids) {

map_group <- left_join(dupes[-1], mapping, by = "candidate_id")
all_ids <- union(map_group$candidate_id, map_group$candidate_group)
map_others <- mapping |> filter(candidate_group %in% all_ids)
browser(expr = nrow(map_group) < nrow(map_others))

# recompute current grouping to min of all ids
update <-
dplyr::union(map_group, map_others) |>
mutate(candidate_group = min(candidate_id, candidate_group))

mapping <- rows_update(mapping, update, by = "candidate_id")
}

mapping |>
left_join(candidate_names, by = "candidate_id") |>
left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl))
}

prep_dedupe_candidates <- function(candidate_listing_raw) {
candidate_names <-
candidate_listing_raw |>
arrange(last_name, first_name, middle_name) |>
distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |>
mutate(candidate_id = row_number(), .before = 1)

# first, last, street_address ----
f_l_street <-
candidate_names |>
group_by(first_name, last_name, street_address) |>
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
ungroup()

# first, last, cleaned contest name
f_l_cleaned_contest <-
candidate_names |>
mutate(
contest_name_clean = if_else(
!is.na(city) & map2_lgl(city, contest_name, \(c, cn) grepl(paste("TOWN OF", c), cn)),
map2_chr(city, contest_name, \(c, cn) sub(paste("TOWN OF", c, ""), "", cn)),
contest_name
),
contest_name_clean = sub("([A-Z]+ ){1,2}COUNTY ", "", contest_name_clean)
) |>
group_by(first_name, last_name, contest_name_clean) |>
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
ungroup()

# first, middle, last, county_name
f_m_l_county <-
candidate_names |>
group_by(first_name, middle_name, last_name, county_name) |>
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
ungroup()

# first, middle initial, last, county_name
f_mi_l_county <-
candidate_names |>
mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |>
group_by(first_name, middle_name, last_name, county_name) |>
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
ungroup()

# name_on_ballot, middle_name
nb_m <-
candidate_names |>
mutate(
name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot),
name_ballot_clean = gsub("[.]", "", name_ballot_clean),
) |>
group_by(name_ballot_clean, middle_name) |>
mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
ungroup()

candidate_names |>
dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |>
dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |>
dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |>
dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |>
dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name")
}

prep_candidate_listing <- function(
candidate_listing_raw,
candidate_listing_dedupe
) {

# Give candidates a unique id from dedupe mapping
cl_dedupe <-
candidate_listing_dedupe |>
group_by(candidate_group) |>
mutate(candidate_id = cur_group_id()) |>
ungroup() |>
select(-candidate_group)

# cl raw + candidate_id ----
cl_raw <-
candidate_listing_raw |>
left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id"))


# candidate_names ----
candidate_names <-
cl_raw |>
distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |>
arrange(candidate_id, election_dt)

# candidate_name_on_ballot ----
candidate_name_on_ballot <-
cl_raw |>
distinct(candidate_id, election_dt, name_on_ballot) |>
arrange(candidate_id, election_dt)

# candidate_address ----
candidate_address <-
candidate_listing_contest |>
cl_raw |>
mutate(
phone = coalesce(phone, office_phone, business_phone),
street_address = toupper(street_address),
street_address = REGEXP_REPLACE(street_address, " +", " ", "g"),
street_address = stringr::str_replace_all(street_address, " +", " "),
street_address = trimws(street_address),
) |>
select(-office_phone, -business_phone) |>
extract_candidate_info(
candidate_names,
info_vars = c("street_address", "city", "state", "zip_code", "phone", "email")
) |>
collect() |>
rename(street = street_address)
distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |>
add_address_lookup_local(street = street, postal_code = zip_code) |>
# remove rows that are entirely empty except for candidate_id
filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |>
arrange(candidate_id, election_dt)

# candidate_party ----
candidate_party <-
candidate_listing_contest |>
extract_candidate_info(
candidate_names,
info_vars = c("party_candidate")
) |>
collect() |>
mutate(party_candidate = forcats::fct_inorder(party_candidate))
cl_raw |>
distinct(candidate_id, election_dt, party_candidate) |>
mutate(party_candidate = forcats::fct_inorder(party_candidate)) |>
filter(!is.na(party_candidate)) |>
arrange(candidate_id, election_dt)

# Extract contests (remaining data in candidate_listing) ----
cols_candidate_id <- intersect(colnames(candidate_names), colnames(candidate_listing_contest))
cols_related <- setdiff(
c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)),
"election_dt"
c("election_dt", "candidate_id")
)

contests <-
candidate_listing_contest |>
select(election_dt:name_suffix_lbl) |>
cl_raw |>
select(election_dt:name_suffix_lbl, candidate_id) |>
select(-any_of(cols_related)) |>
left_join(candidate_names, by = cols_candidate_id) |>
relocate(candidate_id, .before = first_name) |>
collect()
relocate(candidate_id, .before = first_name)

# Get current candidate information ----
candidate_best_name <-
candidate_names |>
mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |>
arrange(desc(n_char), desc(election_dt)) |>
slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |>
select(-n_char, -election_dt)

# Get current complete contact information ----
candidate_contact_current <-
current_contact <-
candidate_listing_current_contact_info(candidate_address)

current_name_on_ballot <-
candidate_name_on_ballot |>
slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
select(-election_dt)

current_party <-
candidate_party |>
filter(!is.na(party_candidate)) |>
group_by(candidate_id) |>
slice_max(election_dt, n = 1) |>
arrange(party_candidate) |>
slice_head(n = 1) |>
ungroup() |>
select(-election_dt) |>
rename(party_last = party_candidate)

current_most_party <-
candidate_party |>
filter(!is.na(party_candidate)) |>
group_by(candidate_id) |>
count(party_candidate) |>
slice_max(n, n = 1) |>
arrange(party_candidate) |>
slice_head(n = 1) |>
ungroup() |>
select(-n) |>
rename(party_most = party_candidate)

current_contest_count <-
contests |>
group_by(candidate_id) |>
distinct(candidate_id, election_dt, contest_name) |>
summarize(
contest_n = n(),
contest_first = min(election_dt),
contest_latest = max(election_dt)
)

# Join candidates into one big table ----
left_join_by_candidate <- function(x, y) {
left_join(x, y, join_by(candidate_id), relationship = "one-to-one")
}

candidates <-
candidate_names |>
collect() |>
left_join(
candidate_name_on_ballot |>
slice_max(election_dt, by = candidate_id, n = 1) |>
select(-election_dt),
by = "candidate_id",
relationship = "one-to-one"
) |>
left_join(
candidate_contact_current,
by = "candidate_id",
relationship = "one-to-one"
) |>
left_join(
candidate_party |>
filter(!is.na(party_candidate)) |>
group_by(candidate_id) |>
slice_max(election_dt, n = 1) |>
arrange(party_candidate) |>
slice_head(n = 1) |>
ungroup() |>
select(-election_dt) |>
rename(party_last = party_candidate),
by = "candidate_id",
relationship = "one-to-one"
) |>
left_join(
candidate_party |>
filter(!is.na(party_candidate)) |>
group_by(candidate_id) |>
count(party_candidate) |>
slice_max(n, n = 1) |>
arrange(party_candidate) |>
slice_head(n = 1) |>
ungroup() |>
select(-n) |>
rename(party_most = party_candidate),
by = "candidate_id",
relationship = "one-to-one"
) |>
left_join(
contests |>
group_by(candidate_id) |>
distinct(candidate_id, election_dt, contest_name) |>
summarize(
contest_n = n(),
contest_first = min(election_dt),
contest_latest = max(election_dt)
),
by = "candidate_id",
relationship = "one-to-one"
) |>
candidate_best_name |>
left_join_by_candidate(current_name_on_ballot) |>
left_join_by_candidate(current_contact) |>
left_join_by_candidate(current_party) |>
left_join_by_candidate(current_most_party) |>
left_join_by_candidate(current_contest_count) |>
relocate(name_on_ballot, .before = first_name) |>
relocate(starts_with("party"), .before = street) |>
relocate(starts_with("contest"), .before = street) |>
add_address_lookup(street = street, postal_code = zip_code)
add_address_lookup_local(street = street, postal_code = zip_code)

# Return list of tables
list(
elections = contests,
candidates = candidates,
candidate_name_on_ballot = candidate_name_on_ballot,
candidate_contact = candidate_address,
candidate_party = candidate_party
cl_elections = contests,
cl_candidates = candidates,
cl_name_on_ballot = candidate_name_on_ballot,
cl_contact = candidate_address,
cl_party = candidate_party
)
}

@@ -160,7 +320,7 @@ extract_candidate_info <- function(
select(-first_name, -middle_name, -last_name, -name_suffix_lbl) |>
relocate(candidate_id, .before = 1) |>
distinct() |>
arrange(candidate_id, election_dt, !!info_vars)
arrange(candidate_id, election_dt, !!!rlang::syms(info_vars))
}

candidate_listing_current_contact_info <- function(candidate_address) {

+ 35
- 0
process/R/prepare_addresses.R Datei anzeigen

@@ -118,6 +118,41 @@ add_address_lookup <- function(
)
}

add_address_lookup_local <- function(
df,
street = street_1,
city = city,
state = state,
postal_code = full_zip,
name = "address_lookup"
) {
addresses <-
df |>
dplyr::filter(!is.na({{ street }})) |>
dplyr::distinct(
street = {{ street }},
city = {{ city }},
state = {{ state }},
postal_code = {{ postal_code }}
) |>
dplyr::mutate(
state = coalesce(state, "NC"),
!!name := toupper(paste(street, city, state, substr(postal_code, 1, 5), sep = ", ")),
!!name := gsub(" +", " ", !!rlang::sym(name))
)

dplyr::left_join(
df,
addresses,
by = dplyr::join_by(
{{ street }} == street,
{{ city }} == city,
{{ state }} == state,
{{ postal_code }} == postal_code
)
)
}

collect_full_addresses_from_parts <- function(
df,
street = street_1,

+ 49
- 16
process/R/prepare_candidates.R Datei anzeigen

@@ -62,18 +62,34 @@ prepare_candidates_for_matching <- function(
prepare_candidate_listing_for_matching <- function(candidate_listing) {
lg_info_target(lg_get_logger())

cl <-
candidate_listing |>
candidate_name <-
candidate_listing$candidates |>
mutate(
name_clean = toupper(name_on_ballot),
name_clean = gsub("[^A-Z ]", "", name_clean)
name_full = paste(first_name, middle_name, last_name, name_suffix_lbl),
name_mi = paste(first_name, substr(middle_name, 1, 1), last_name, name_suffix_lbl),
name_first_last = paste(first_name, last_name, name_suffix_lbl),
) |>
distinct(
name_on_ballot, name_clean,
street = street_address,
city, state,
postal_code = zip_code
select(candidate_id, name_full, name_mi, name_first_last) |>
tidyr::pivot_longer(-candidate_id, values_to = "name_clean") |>
select(1, name_clean)

candidate_alias <-
candidate_listing$candidate_name_on_ballot |>
distinct(candidate_id, name_clean = name_on_ballot) |>
mutate(
name_clean = toupper(name_clean),
name_clean = gsub("[,.]", "", name_clean)
)

bind_rows(candidate_name, candidate_alias) |>
mutate(name_clean = trimws(name_clean)) |>
left_join(
candidate_listing$candidate_contact |>
select(1, street:zip_code),
by = "candidate_id",
relationship = "many-to-many"
) |>
distinct()
}

fastlink_candidates <- function(candidates_for_matching, candidate_listing_for_matching) {
@@ -122,7 +138,7 @@ candidates_match <- function(
candidates_for_matching |>
inner_join(
candidate_listing_for_matching |>
select(name_on_ballot, name_clean, street, city),
select(candidate_id, name_clean, street, city),
by = join_by(
name_clean == name_clean,
street == street,
@@ -130,7 +146,7 @@ candidates_match <- function(
),
relationship = "many-to-many"
) |>
distinct(sboe_id, name_on_ballot)
distinct(sboe_id, candidate_id)

# Then unambiguous matches on street + city
matches_street_city <-
@@ -138,21 +154,38 @@ candidates_match <- function(
anti_join(candidates_matched_1, by = "sboe_id") |>
inner_join(
candidate_listing_for_matching |>
select(name_on_ballot, street, city),
select(candidate_id, street, city),
by = join_by(
street == street,
city == city
),
relationship = "many-to-many"
) |>
distinct(sboe_id, name_on_ballot, street, city) |>
distinct(sboe_id, candidate_id, street, city) |>
group_by(street, city) |>
mutate(n_names = n_distinct(name_on_ballot)) |>
mutate(n_names = n_distinct(candidate_id)) |>
ungroup() |>
filter(n_names == 1) |>
select(sboe_id, name_on_ballot)
select(sboe_id, candidate_id)

# And unambiguous names
matches_name_obvious <-
inner_join(
candidates_for_matching,
candidate_listing_for_matching |>
distinct(candidate_id, name_clean) |>
add_count(name_clean) |>
filter(n == 1) |>
select(-n),
by = "name_clean",
relationship = "many-to-many"
) |>
distinct(sboe_id, candidate_id)

candidates_matched_2 <- union(candidates_matched_1, matches_street_city)
candidates_matched_2 <-
candidates_matched_1 |>
union(matches_street_city) |>
union(matches_name_obvious)

candidates_for_matching_left <-
candidates_for_matching |>

+ 80
- 0
process/R/utils-dedupe.R Datei anzeigen

@@ -0,0 +1,80 @@
#' @param data Source data, must contain `data_id` column
#' @param dupes Duplicate group mapping, must contain `dupe_id` and `data_id`
#' columns. Each row maps a single `data_id` to a `dupe_id`, where the
#' `dupe_id` groups duplicate rows together.
#' @param data_id The original ID in the source data
#' @param dupe_id A duplicate group ID in from the de-duplication process
#' @param data_group_id The group ID added to the source data. This group ID
#' always the minimum ID of any grouped source IDs. The final groups created
#' with this ID may span several duplicate groups.
dedupe_update_mapping <- function(
data,
dupes,
data_id,
dupe_id,
data_group_id,
name = "duplicates"
) {
data_id <- rlang::enquo(data_id)
data_id_name <- rlang::quo_text(data_id)
dupe_id <- rlang::enquo(dupe_id)
dupe_id_name <- rlang::quo_text(dupe_id)
if (missing(data_group_id)) {
data_group_id <- dupe_id
} else {
data_group_id <- rlang::enquo(data_group_id)
}
data_group_id_name <- rlang::quo_text(data_group_id)

mapping <-
if (dupe_id_name %in% colnames(data)) {
# Extract mapping from current dataset
data |>
select(!!data_id, !!dupe_id)
} else {
# Initialize a new mapping table
data |>
select(!!data_id) |>
mutate("{data_group_id_name}" := !!data_id)
}

dupes <- group_split(dupes, !!dupe_id)
dupes <- purrr::keep(dupes, \(d) nrow(d) > 1)

cli::cli_progress_bar(
name = paste("Merging", name),
total = length(dupes),
clear = FALSE
)

for (dupe in dupes) {
# Find original ids in this dupe group
map_group <-
mapping |>
semi_join(dupe, by = join_by(!!data_id))

all_ids <- union(
map_group |> pull(!!data_id),
map_group |> pull(!!data_group_id)
)

# Find any other ids related to the ids in this dupe group
# so we can merge groups
map_others <- mapping |> filter(!!data_group_id %in% all_ids)

# recompute current grouping to min of all ids
update <-
dplyr::union(map_group, map_others) |>
mutate("{data_group_id_name}" := min(!!data_id, !!data_group_id))

mapping <- rows_update(mapping, update, by = data_id_name)
cli::cli_progress_update()
}
cli::cli_progress_done()

mapping |>
left_join(
data[setdiff(names(data), data_group_id_name)], # use new mapping
by = join_by(!!data_id)
)
}

+ 30
- 25
process/_targets.R Datei anzeigen

@@ -204,26 +204,31 @@ list(
candidate_listing_for_matching,
prepare_candidate_listing_for_matching(candidate_listing)
),
# tar_target(
# candidates_linked,
# fastlink_candidates(
# candidates_for_matching,
# candidate_listing_for_matching
# )
# ),
# tar_target(
# candidates_matched,
# fastlink_match_candidates(
# candidates_for_matching,
# candidate_listing_for_matching,
# candidates_linked
# )
# ),

# Candidate Listing -------------------------------------------------------
tar_target(
candidates_linked,
fastlink_candidates(
candidates_for_matching,
candidate_listing_for_matching
)
),
tar_target(
candidates_matched,
fastlink_match_candidates(
candidates_for_matching,
candidate_listing_for_matching,
candidates_linked
)
candidate_listing_dedupe,
prep_dedupe_candidates(candidate_listing_raw)
),

# Candidate Listing -------------------------------------------------------
tar_target(
candidate_listing,
prep_candidate_listing(path_candidate_listing_raw)
prep_candidate_listing(candidate_listing_raw, candidate_listing_dedupe)
),


@@ -290,32 +295,32 @@ list(
),

tar_target(
path_out_elections,
out_write_parquet(candidate_listing$elections, "elections"),
path_out_cl_elections,
out_write_parquet(candidate_listing$cl_elections, "cl_elections"),
format = "file"
),

tar_target(
path_out_elections_candidates,
out_write_parquet(candidate_listing$candidates, "elections_candidates"),
path_out_cl_candidates,
out_write_parquet(candidate_listing$cl_candidates, "cl_candidates"),
format = "file"
),

tar_target(
path_out_elections_candidates_alias,
out_write_parquet(candidate_listing$candidate_name_on_ballot, "elections_candidates_alias"),
path_out_cl_alias,
out_write_parquet(candidate_listing$cl_name_on_ballot, "cl_name_on_ballot"),
format = "file"
),

tar_target(
path_out_elections_candidates_contact,
out_write_parquet(candidate_listing$candidate_contact, "elections_candidates_contact"),
path_out_cl_contact,
out_write_parquet(candidate_listing$cl_contact, "cl_contact"),
format = "file"
),

tar_target(
path_out_elections_candidates_party,
out_write_parquet(candidate_listing$candidate_party, "elections_candidates_party"),
path_out_cl_party,
out_write_parquet(candidate_listing$cl_party, "cl_party"),
format = "file"
)


+ 19
- 8
process/_targets/meta/meta
Datei-Diff unterdrückt, da er zu groß ist
Datei anzeigen


Laden…
Abbrechen
Speichern