Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

263 lines
7.1KB

  1. prepare_donors_latest <- function(
  2. path_data_prep_receipts,
  3. path_out_report_list,
  4. path_addresses = "data-out/addresses"
  5. ) {
  6. report_list <- out_open_dataset_db(path_out_report_list)
  7. donations <-
  8. donations_from_receipts(
  9. receipts = path_data_prep_receipts,
  10. addresses_out = path_addresses
  11. ) |>
  12. semi_join(report_list, by = "report_id")
  13. donations_prep <-
  14. donations |>
  15. mutate(
  16. donor_name_clean = toupper(donor_name),
  17. donor_name_clean = if_else(
  18. grepl(" (INC|LLC|PAC)", donor_name_clean),
  19. donor_name_clean,
  20. # attempt to fix LAST, FIRST NAME issues
  21. REGEXP_REPLACE(donor_name_clean, "^([A-Z]{2, }), (.+)", "\\2 \\1", "g")
  22. ),
  23. # "O'BRIEN -> OBRIEN" plus other chars that should be non-spaced
  24. donor_name_clean = REGEXP_REPLACE(donor_name_clean, "['&*\".()]", "", "g"),
  25. # remove anything that isn't A-Z, space, period or parens
  26. donor_name_clean = REGEXP_REPLACE(donor_name_clean, "[^A-Z0-9 #:;?/]", " ", "g"),
  27. donor_name_clean = REGEXP_REPLACE(donor_name_clean, " +", " ", "g"),
  28. # get last name, but remove suffixes
  29. donor_name_last = REGEXP_REPLACE(donor_name_clean, " (II|III|IV|[JS]R[.]?)", "", "g"),
  30. donor_name_last = REGEXP_REPLACE(donor_name_clean, "^ +| +$", "", "g"),
  31. donor_name_last = RIGHT(donor_name_last, INSTR(REVERSE(donor_name_last), ' ') - 1L),
  32. donor_name_first_init = LEFT(donor_name_clean, 1L),
  33. ) |>
  34. relocate(donor_name_clean, donor_name_last, donor_name_first_init, .after = donor_name)
  35. donors_latest_tbl <-
  36. donations_prep |>
  37. group_by(donor_name_clean, donor_address) |>
  38. slice_max(occur_date, report_id, n = 1, with_ties = FALSE) |>
  39. ungroup() |>
  40. mutate(across(c(profession, employers_name), toupper)) |>
  41. distinct(
  42. donor_name,
  43. donor_name_clean,
  44. donor_name_last,
  45. donor_name_first_init,
  46. donor_address,
  47. donor_address_raw,
  48. profession,
  49. employers_name,
  50. report_id
  51. )
  52. donors_latest_tbl |>
  53. collect() |>
  54. mutate(
  55. donor_id_raw = row_number(),
  56. .before = 1
  57. )
  58. }
  59. prepare_donors_for_matching <- function(donors_latest) {
  60. donors_latest |>
  61. tidyr::extract(
  62. donor_address,
  63. c("street", "city", "state", "postal_code"),
  64. "(.+), (.+), ([A-Z]{2}), (\\d{5})"
  65. ) |>
  66. select(
  67. donor_id_raw,
  68. donor_name_clean,
  69. donor_name_first_init, donor_name_last,
  70. street, city, postal_code,
  71. profession, employers_name
  72. ) |>
  73. mutate(
  74. # A real address has to have a number in it, right?
  75. street = if_else(!grepl("\\d", street), NA_character_, street),
  76. )
  77. }
  78. prepare_donors_matching_sample <- function(donors_for_matching) {
  79. donors_in_nc <-
  80. donors_for_matching |>
  81. filter(is.na(postal_code) | grepl("^2[78]", postal_code))
  82. donors_outside_nc <- anti_join(donors_for_matching, donors_in_nc, by = "donor_id_raw")
  83. bind_rows(
  84. donors_in_nc |> slice_sample(n = 50000),
  85. donors_outside_nc |> slice_sample(n = 10000)
  86. )
  87. }
  88. prepare_donor_zip_blocks <- function(donors) {
  89. lg_info_target(lg_get_logger())
  90. pc_counts <-
  91. donors |>
  92. count(postal_code) |>
  93. filter(n > 1, !is.na(postal_code))
  94. donors |>
  95. semi_join(pc_counts, by = "postal_code") |>
  96. mutate(
  97. zip_pre = case_when(
  98. # North Carolina zip codes start with 27*** and 28***,
  99. # so we can group into smaller block
  100. grepl("^2[78]", postal_code) ~ substr(postal_code, 1, 3),
  101. # Outside of NC, we'll group into region codes (first two of zip)
  102. TRUE ~ substr(postal_code, 1, 1)
  103. )
  104. ) |>
  105. group_nest(zip_pre) |>
  106. mutate(n = map_int(data, nrow)) |>
  107. arrange(desc(n)) |>
  108. select(-n)
  109. }
  110. cluster_text <- function(x, column) {
  111. col_expr <- rlang::enexpr(column)
  112. col_sym <- rlang::sym(rlang::expr_text(col_expr))
  113. val <-
  114. x |>
  115. mutate(
  116. ..clean = stringr::str_to_lower(stringr::str_squish(!!col_sym))
  117. )
  118. cluster <- clustringr::cluster_strings(
  119. unique(val$..clean),
  120. max_dist = 1,
  121. )
  122. cluster <- cluster$df_clusters
  123. val |>
  124. left_join(cluster, by = join_by(..clean == node)) |>
  125. select(-..clean, -size) |>
  126. relocate(cluster, .after = !!col_sym) |>
  127. rename("{col_sym}_cluster" := cluster)
  128. }
  129. prepare_donor_city_blocks <- function(donors) {
  130. lg_info_target(lg_get_logger())
  131. donors |>
  132. cluster_text(city) |>
  133. group_nest(city_cluster) |>
  134. mutate(n = map_int(data, nrow)) |>
  135. filter(n > 1) |>
  136. arrange(desc(n)) |>
  137. select(-n)
  138. }
  139. prepare_donor_name_blocks <- function(donors) {
  140. lg_info_target(lg_get_logger())
  141. donors |>
  142. filter(!is.na(donor_name_last)) |>
  143. mutate(
  144. donor_name_last = case_when(
  145. grepl("[^A-Z]", donor_name_last) ~ "other",
  146. TRUE ~ substr(donor_name_last, 1, 1)
  147. )
  148. ) |>
  149. group_nest(donor_name_last) |>
  150. mutate(n = map_int(data, nrow)) |>
  151. filter(n > 1) |>
  152. arrange(desc(n)) |>
  153. select(-n)
  154. }
  155. fastlink_donor_blocks <- function(donor_blocks, ..., quiet = FALSE) {
  156. lg <- lg_get_logger()
  157. lg_info_target(lg)
  158. # The goal here is to train once on a subset of the complete data
  159. # and then apply the training to each block
  160. # fastLink(verbose = TRUE) can introduce errors (their fault)
  161. p_fastlink_donors <- partial(fastlink_donors, ..., verbose = TRUE)
  162. if (quiet) {
  163. p_fastlink_donors <- quietly(p_fastlink_donors)
  164. }
  165. # Now apply the model to each block
  166. lg$info("Applying EM model to {nrow(donor_blocks)} blocks...")
  167. donor_blocks$matches <- vector("list", nrow(donor_blocks))
  168. for (i in seq_len(nrow(donor_blocks))) {
  169. block_data <- donor_blocks$data[[i]]
  170. lg$info(" ┌ [{i}] start ({nrow(block_data)} rows)")
  171. res <- p_fastlink_donors(block_data)
  172. donor_blocks$matches[[i]] <- if (quiet) res$result else res
  173. lg$info(" └ [{i}] done")
  174. }
  175. lg$info("Found matches for all blocks")
  176. donor_blocks
  177. }
  178. fastlink_donors_em_model <- function(donors_em_model_sample) {
  179. lg <- lg_get_logger()
  180. lg_info_target(lg)
  181. lg$info("Training fastLink EM on donor data with {nrow(donors_em_model_sample)} rows...")
  182. em_obj <- fastlink_donors(donors_em_model_sample, estimate.only = TRUE)
  183. lg$info("Done: Training fastLink EM")
  184. em_obj
  185. }
  186. fastlink_donors <- function(
  187. donors_for_matching,
  188. ...,
  189. verbose = TRUE,
  190. exclude_vars = NUL
  191. ) {
  192. donors_for_matching <- janitor::remove_empty(donors_for_matching, "cols")
  193. is_city_constant <- n_distinct(donors_for_matching$city) == 1
  194. match_vars <- setdiff(
  195. names(donors_for_matching),
  196. c(
  197. "donor_id_raw", "postal_code",
  198. if (is_city_constant) "city",
  199. "donor_name_first_init", "donor_name_last"
  200. )
  201. )
  202. if (length(match_vars) == 0) {
  203. return(NULL)
  204. }
  205. str_vars <- c("donor_name_clean", "street", "profession", "employers_name")
  206. str_vars <- intersect(str_vars, match_vars)
  207. fastLink::fastLink(
  208. dfA = donors_for_matching,
  209. dfB = donors_for_matching,
  210. varnames = match_vars,
  211. stringdist.match = str_vars,
  212. partial.match = str_vars,
  213. verbose = verbose,
  214. ...
  215. )
  216. }
  217. prepare_donors_fastlink_matches <- function(donors_for_matching, donors_fl, ...) {
  218. fastLink::getMatches(
  219. dfA = donors_for_matching,
  220. dfB = donors_for_matching,
  221. fl.out = donors_fl
  222. )
  223. }