|
- prepare_donors_latest <- function(
- path_data_prep_receipts,
- path_out_report_list,
- path_addresses = "data-out/addresses"
- ) {
- report_list <- out_open_dataset_db(path_out_report_list)
-
- donations <-
- donations_from_receipts(
- receipts = path_data_prep_receipts,
- addresses_out = path_addresses
- ) |>
- semi_join(report_list, by = "report_id")
-
- donations_prep <-
- donations |>
- mutate(
- donor_name_clean = toupper(donor_name),
- donor_name_clean = if_else(
- grepl(" (INC|LLC|PAC)", donor_name_clean),
- donor_name_clean,
- # attempt to fix LAST, FIRST NAME issues
- REGEXP_REPLACE(donor_name_clean, "^([A-Z]{2, }), (.+)", "\\2 \\1", "g")
- ),
- # "O'BRIEN -> OBRIEN" plus other chars that should be non-spaced
- donor_name_clean = REGEXP_REPLACE(donor_name_clean, "['&*\".()]", "", "g"),
- # remove anything that isn't A-Z, space, period or parens
- donor_name_clean = REGEXP_REPLACE(donor_name_clean, "[^A-Z0-9 #:;?/]", " ", "g"),
- donor_name_clean = REGEXP_REPLACE(donor_name_clean, " +", " ", "g"),
- # get last name, but remove suffixes
- donor_name_last = REGEXP_REPLACE(donor_name_clean, " (II|III|IV|[JS]R[.]?)", "", "g"),
- donor_name_last = REGEXP_REPLACE(donor_name_clean, "^ +| +$", "", "g"),
- donor_name_last = RIGHT(donor_name_last, INSTR(REVERSE(donor_name_last), ' ') - 1L),
- donor_name_first_init = LEFT(donor_name_clean, 1L),
- ) |>
- relocate(donor_name_clean, donor_name_last, donor_name_first_init, .after = donor_name)
-
- donors_latest_tbl <-
- donations_prep |>
- group_by(donor_name_clean, donor_address) |>
- slice_max(occur_date, report_id, n = 1, with_ties = FALSE) |>
- ungroup() |>
- mutate(across(c(profession, employers_name), toupper)) |>
- distinct(
- donor_name,
- donor_name_clean,
- donor_name_last,
- donor_name_first_init,
- donor_address,
- donor_address_raw,
- profession,
- employers_name,
- report_id
- )
-
- donors_latest_tbl |>
- collect() |>
- mutate(
- donor_id_raw = row_number(),
- .before = 1
- )
- }
-
- prepare_donors_for_matching <- function(donors_latest) {
- donors_latest |>
- tidyr::extract(
- donor_address,
- c("street", "city", "state", "postal_code"),
- "(.+), (.+), ([A-Z]{2}), (\\d{5})"
- ) |>
- select(
- donor_id_raw,
- donor_name_clean,
- donor_name_first_init, donor_name_last,
- street, city, postal_code,
- profession, employers_name
- ) |>
- mutate(
- # A real address has to have a number in it, right?
- street = if_else(!grepl("\\d", street), NA_character_, street),
- )
- }
-
- prepare_donors_matching_sample <- function(donors_for_matching) {
- donors_in_nc <-
- donors_for_matching |>
- filter(is.na(postal_code) | grepl("^2[78]", postal_code))
-
- donors_outside_nc <- anti_join(donors_for_matching, donors_in_nc, by = "donor_id_raw")
-
- bind_rows(
- donors_in_nc |> slice_sample(n = 50000),
- donors_outside_nc |> slice_sample(n = 10000)
- )
- }
-
- prepare_donor_zip_blocks <- function(donors) {
- lg_info_target(lg_get_logger())
-
- pc_counts <-
- donors |>
- count(postal_code) |>
- filter(n > 1, !is.na(postal_code))
-
- donors |>
- semi_join(pc_counts, by = "postal_code") |>
- mutate(
- zip_pre = case_when(
- # North Carolina zip codes start with 27*** and 28***,
- # so we can group into smaller block
- grepl("^2[78]", postal_code) ~ substr(postal_code, 1, 3),
- # Outside of NC, we'll group into region codes (first two of zip)
- TRUE ~ substr(postal_code, 1, 1)
- )
- ) |>
- group_nest(zip_pre) |>
- mutate(n = map_int(data, nrow)) |>
- arrange(desc(n)) |>
- select(-n)
- }
-
- cluster_text <- function(x, column) {
- col_expr <- rlang::enexpr(column)
- col_sym <- rlang::sym(rlang::expr_text(col_expr))
-
- val <-
- x |>
- mutate(
- ..clean = stringr::str_to_lower(stringr::str_squish(!!col_sym))
- )
-
- cluster <- clustringr::cluster_strings(
- unique(val$..clean),
- max_dist = 1,
- )
- cluster <- cluster$df_clusters
-
- val |>
- left_join(cluster, by = join_by(..clean == node)) |>
- select(-..clean, -size) |>
- relocate(cluster, .after = !!col_sym) |>
- rename("{col_sym}_cluster" := cluster)
- }
-
- prepare_donor_city_blocks <- function(donors) {
- lg_info_target(lg_get_logger())
-
- donors |>
- cluster_text(city) |>
- group_nest(city_cluster) |>
- mutate(n = map_int(data, nrow)) |>
- filter(n > 1) |>
- arrange(desc(n)) |>
- select(-n)
- }
-
- prepare_donor_name_blocks <- function(donors) {
- lg_info_target(lg_get_logger())
-
- donors |>
- filter(!is.na(donor_name_last)) |>
- mutate(
- donor_name_last = case_when(
- grepl("[^A-Z]", donor_name_last) ~ "other",
- TRUE ~ substr(donor_name_last, 1, 1)
- )
- ) |>
- group_nest(donor_name_last) |>
- mutate(n = map_int(data, nrow)) |>
- filter(n > 1) |>
- arrange(desc(n)) |>
- select(-n)
- }
-
-
- fastlink_donor_blocks <- function(donor_blocks, ..., quiet = FALSE) {
- lg <- lg_get_logger()
- lg_info_target(lg)
-
- # The goal here is to train once on a subset of the complete data
- # and then apply the training to each block
-
- # fastLink(verbose = TRUE) can introduce errors (their fault)
- p_fastlink_donors <- partial(fastlink_donors, ..., verbose = TRUE)
-
- if (quiet) {
- p_fastlink_donors <- quietly(p_fastlink_donors)
- }
-
- # Now apply the model to each block
- lg$info("Applying EM model to {nrow(donor_blocks)} blocks...")
- donor_blocks$matches <- vector("list", nrow(donor_blocks))
-
- for (i in seq_len(nrow(donor_blocks))) {
- block_data <- donor_blocks$data[[i]]
- lg$info(" ┌ [{i}] start ({nrow(block_data)} rows)")
-
- res <- p_fastlink_donors(block_data)
- donor_blocks$matches[[i]] <- if (quiet) res$result else res
-
- lg$info(" └ [{i}] done")
- }
- lg$info("Found matches for all blocks")
-
- donor_blocks
- }
-
- fastlink_donors_em_model <- function(donors_em_model_sample) {
- lg <- lg_get_logger()
- lg_info_target(lg)
-
- lg$info("Training fastLink EM on donor data with {nrow(donors_em_model_sample)} rows...")
- em_obj <- fastlink_donors(donors_em_model_sample, estimate.only = TRUE)
- lg$info("Done: Training fastLink EM")
-
- em_obj
- }
-
- fastlink_donors <- function(
- donors_for_matching,
- ...,
- verbose = TRUE,
- exclude_vars = NUL
- ) {
- donors_for_matching <- janitor::remove_empty(donors_for_matching, "cols")
-
- is_city_constant <- n_distinct(donors_for_matching$city) == 1
-
- match_vars <- setdiff(
- names(donors_for_matching),
- c(
- "donor_id_raw", "postal_code",
- if (is_city_constant) "city",
- "donor_name_first_init", "donor_name_last"
- )
- )
-
- if (length(match_vars) == 0) {
- return(NULL)
- }
-
- str_vars <- c("donor_name_clean", "street", "profession", "employers_name")
- str_vars <- intersect(str_vars, match_vars)
-
- fastLink::fastLink(
- dfA = donors_for_matching,
- dfB = donors_for_matching,
- varnames = match_vars,
- stringdist.match = str_vars,
- partial.match = str_vars,
- verbose = verbose,
- ...
- )
- }
-
- prepare_donors_fastlink_matches <- function(donors_for_matching, donors_fl, ...) {
- fastLink::getMatches(
- dfA = donors_for_matching,
- dfB = donors_for_matching,
- fl.out = donors_fl
- )
- }
|