prepare_donors_latest <- function( path_data_prep_receipts, path_out_report_list, path_addresses = "data-out/addresses" ) { report_list <- out_open_dataset_db(path_out_report_list) donations <- donations_from_receipts( receipts = path_data_prep_receipts, addresses_out = path_addresses ) |> semi_join(report_list, by = "report_id") donations_prep <- donations |> mutate( donor_name_clean = toupper(donor_name), donor_name_clean = if_else( grepl(" (INC|LLC|PAC)", donor_name_clean), donor_name_clean, # attempt to fix LAST, FIRST NAME issues REGEXP_REPLACE(donor_name_clean, "^([A-Z]{2, }), (.+)", "\\2 \\1", "g") ), # "O'BRIEN -> OBRIEN" plus other chars that should be non-spaced donor_name_clean = REGEXP_REPLACE(donor_name_clean, "['&*\".()]", "", "g"), # remove anything that isn't A-Z, space, period or parens donor_name_clean = REGEXP_REPLACE(donor_name_clean, "[^A-Z0-9 #:;?/]", " ", "g"), donor_name_clean = REGEXP_REPLACE(donor_name_clean, " +", " ", "g"), # get last name, but remove suffixes donor_name_last = REGEXP_REPLACE(donor_name_clean, " (II|III|IV|[JS]R[.]?)", "", "g"), donor_name_last = REGEXP_REPLACE(donor_name_clean, "^ +| +$", "", "g"), donor_name_last = RIGHT(donor_name_last, INSTR(REVERSE(donor_name_last), ' ') - 1L), donor_name_first_init = LEFT(donor_name_clean, 1L), ) |> relocate(donor_name_clean, donor_name_last, donor_name_first_init, .after = donor_name) donors_latest_tbl <- donations_prep |> group_by(donor_name_clean, donor_address) |> slice_max(occur_date, report_id, n = 1, with_ties = FALSE) |> ungroup() |> mutate(across(c(profession, employers_name), toupper)) |> distinct( donor_name, donor_name_clean, donor_name_last, donor_name_first_init, donor_address, donor_address_raw, profession, employers_name, report_id ) donors_latest_tbl |> collect() |> mutate( donor_id_raw = row_number(), .before = 1 ) } prepare_donors_for_matching <- function(donors_latest) { donors_latest |> tidyr::extract( donor_address, c("street", "city", "state", "postal_code"), "(.+), (.+), ([A-Z]{2}), (\\d{5})" ) |> select( donor_id_raw, donor_name_clean, donor_name_first_init, donor_name_last, street, city, postal_code, profession, employers_name ) |> mutate( # A real address has to have a number in it, right? street = if_else(!grepl("\\d", street), NA_character_, street), ) } prepare_donors_matching_sample <- function(donors_for_matching) { donors_in_nc <- donors_for_matching |> filter(is.na(postal_code) | grepl("^2[78]", postal_code)) donors_outside_nc <- anti_join(donors_for_matching, donors_in_nc, by = "donor_id_raw") bind_rows( donors_in_nc |> slice_sample(n = 50000), donors_outside_nc |> slice_sample(n = 10000) ) } prepare_donor_zip_blocks <- function(donors) { lg_info_target(lg_get_logger()) pc_counts <- donors |> count(postal_code) |> filter(n > 1, !is.na(postal_code)) donors |> semi_join(pc_counts, by = "postal_code") |> mutate( zip_pre = case_when( # North Carolina zip codes start with 27*** and 28***, # so we can group into smaller block grepl("^2[78]", postal_code) ~ substr(postal_code, 1, 3), # Outside of NC, we'll group into region codes (first two of zip) TRUE ~ substr(postal_code, 1, 1) ) ) |> group_nest(zip_pre) |> mutate(n = map_int(data, nrow)) |> arrange(desc(n)) |> select(-n) } cluster_text <- function(x, column) { col_expr <- rlang::enexpr(column) col_sym <- rlang::sym(rlang::expr_text(col_expr)) val <- x |> mutate( ..clean = stringr::str_to_lower(stringr::str_squish(!!col_sym)) ) cluster <- clustringr::cluster_strings( unique(val$..clean), max_dist = 1, ) cluster <- cluster$df_clusters val |> left_join(cluster, by = join_by(..clean == node)) |> select(-..clean, -size) |> relocate(cluster, .after = !!col_sym) |> rename("{col_sym}_cluster" := cluster) } prepare_donor_city_blocks <- function(donors) { lg_info_target(lg_get_logger()) donors |> cluster_text(city) |> group_nest(city_cluster) |> mutate(n = map_int(data, nrow)) |> filter(n > 1) |> arrange(desc(n)) |> select(-n) } prepare_donor_name_blocks <- function(donors) { lg_info_target(lg_get_logger()) donors |> filter(!is.na(donor_name_last)) |> mutate( donor_name_last = case_when( grepl("[^A-Z]", donor_name_last) ~ "other", TRUE ~ substr(donor_name_last, 1, 1) ) ) |> group_nest(donor_name_last) |> mutate(n = map_int(data, nrow)) |> filter(n > 1) |> arrange(desc(n)) |> select(-n) } fastlink_donor_blocks <- function(donor_blocks, ..., quiet = FALSE) { lg <- lg_get_logger() lg_info_target(lg) # The goal here is to train once on a subset of the complete data # and then apply the training to each block # fastLink(verbose = TRUE) can introduce errors (their fault) p_fastlink_donors <- partial(fastlink_donors, ..., verbose = TRUE) if (quiet) { p_fastlink_donors <- quietly(p_fastlink_donors) } # Now apply the model to each block lg$info("Applying EM model to {nrow(donor_blocks)} blocks...") donor_blocks$matches <- vector("list", nrow(donor_blocks)) for (i in seq_len(nrow(donor_blocks))) { block_data <- donor_blocks$data[[i]] lg$info(" ┌ [{i}] start ({nrow(block_data)} rows)") res <- p_fastlink_donors(block_data) donor_blocks$matches[[i]] <- if (quiet) res$result else res lg$info(" └ [{i}] done") } lg$info("Found matches for all blocks") donor_blocks } fastlink_donors_em_model <- function(donors_em_model_sample) { lg <- lg_get_logger() lg_info_target(lg) lg$info("Training fastLink EM on donor data with {nrow(donors_em_model_sample)} rows...") em_obj <- fastlink_donors(donors_em_model_sample, estimate.only = TRUE) lg$info("Done: Training fastLink EM") em_obj } fastlink_donors <- function( donors_for_matching, ..., verbose = TRUE, exclude_vars = NUL ) { donors_for_matching <- janitor::remove_empty(donors_for_matching, "cols") is_city_constant <- n_distinct(donors_for_matching$city) == 1 match_vars <- setdiff( names(donors_for_matching), c( "donor_id_raw", "postal_code", if (is_city_constant) "city", "donor_name_first_init", "donor_name_last" ) ) if (length(match_vars) == 0) { return(NULL) } str_vars <- c("donor_name_clean", "street", "profession", "employers_name") str_vars <- intersect(str_vars, match_vars) fastLink::fastLink( dfA = donors_for_matching, dfB = donors_for_matching, varnames = match_vars, stringdist.match = str_vars, partial.match = str_vars, verbose = verbose, ... ) } prepare_donors_fastlink_matches <- function(donors_for_matching, donors_fl, ...) { fastLink::getMatches( dfA = donors_for_matching, dfB = donors_for_matching, fl.out = donors_fl ) }