Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

214 lines
6.0KB

  #' Build the table of committee candidates used for record linkage.
  #'
  #' Opens the officers dataset, keeps rows with `type == "Candidate"` whose
  #' `report_id` occurs in the report list, derives an upper-cased, cleaned
  #' `name_clean`, resolves each address through the address lookup and splits
  #' the resolved address into street / city / state / postal code.
  #' Names that contain a double-quoted nickname (e.g.
  #' `"ROLLANDE \"ROLIE\" SAMPSON"`) additionally produce an alias row in which
  #' the nickname replaces everything in front of it.
  #'
  #' Calls the project helper `lg_info_target()` first (presumably to log that
  #' the target is running).
  #'
  #' @param path_data_prep_officers Path handed to `out_open_dataset_db()` for
  #'   the officers dataset.
  #' @param path_out_report_list Path handed to `out_open_dataset_db()` for the
  #'   report list used to restrict officers by `report_id`.
  #' @param path_addresses Path handed to `out_open_dataset_db()` for the
  #'   address lookup. Only its first two columns are used; they must be
  #'   `address_lookup` and `address_resolved`.
  #' @return A collected data frame with `sboe_id`, `name_clean`, `address_raw`
  #'   and the extracted `street`, `city`, `state`, `postal_code` columns.
  #'   `street` is `NA` whenever it contains no digit.
  prepare_candidates_for_matching <- function(
    path_data_prep_officers,
    path_out_report_list,
    path_addresses = "data-out/addresses"
  ) {
    lg_info_target(lg_get_logger())

    reports_tbl <- out_open_dataset_db(path_out_report_list)
    address_lookup_tbl <- out_open_dataset_db(path_addresses)
    officers_tbl <- out_open_dataset_db(path_data_prep_officers)

    # Candidates that belong to a listed report, one row per
    # (sboe_id, cleaned name, address). REGEXP_REPLACE is not an R function;
    # presumably it is passed through to the database backend as SQL.
    named_candidates <-
      filter(
        semi_join(officers_tbl, reports_tbl, by = "report_id"),
        type == "Candidate"
      ) |>
      mutate(
        name_clean = toupper(name),
        # Drop a trailing " FOR ..." part, e.g. a committee-style suffix.
        name_clean = REGEXP_REPLACE(name_clean, " FOR .+$", "", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g")
      ) |>
      distinct(sboe_id, name_clean, address)

    # Swap the raw address for its resolved form, keeping the raw one around.
    resolved_lookup <- select(address_lookup_tbl, 1:2)
    candidates_db <-
      named_candidates |>
      rename(address_raw = address) |>
      left_join(
        resolved_lookup,
        by = join_by(address_raw == address_lookup)
      ) |>
      rename(address = address_resolved)

    # Add new rows with aliases for people like `"ROLLANDE \"ROLIE\" SAMPSON"`.
    # This has to run before the quotes are stripped from the primary rows.
    nickname_rows <-
      filter(candidates_db, grepl('"[A-Z]+"', name_clean)) |>
      mutate(
        name_clean = REGEXP_REPLACE(name_clean, '.+? "([A-Z]+)"(.+)$', "\\1 \\2", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "[^A-Z ]", "", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "^\\s+|\\s+$", "", "g")
      ) |>
      collect()

    # Primary rows: keep letters and spaces only, collapse and trim whitespace.
    primary_rows <-
      candidates_db |>
      mutate(
        name_clean = REGEXP_REPLACE(name_clean, "[^A-Z ]", "", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "^\\s+|\\s+$", "", "g")
      ) |>
      collect()

    all_rows <- bind_rows(primary_rows, nickname_rows)
    all_rows <- filter(all_rows, !is.na(name_clean))

    # Split "street, city, ST, 12345" into its components.
    with_address_parts <- tidyr::extract(
      all_rows,
      col = address,
      into = c("street", "city", "state", "postal_code"),
      regex = "(.+), (.+), ([A-Z]{2}), (\\d{5})"
    )

    # A street without any digit is treated as unusable for matching.
    mutate(
      with_address_parts,
      street = if_else(grepl("\\d", street), street, NA_character_)
    )
  }
  #' Build the table of candidate-listing names and addresses used for
  #' record linkage.
  #'
  #' For every candidate three structured name variants are produced (full
  #' name, name with middle initial, first + last name), plus the
  #' name-on-ballot alias and, where the alias contains a parenthesised
  #' nickname, an "aka" variant in which the nickname replaces everything in
  #' front of it. All variants are stacked into `name_clean` and joined with
  #' the contact details.
  #'
  #' Calls the project helper `lg_info_target()` first (presumably to log that
  #' the target is running).
  #'
  #' @param candidate_listing List-like object with the tables `cl_candidates`
  #'   (`candidate_id`, `first_name`, `middle_name`, `last_name`,
  #'   `name_suffix_lbl`), `cl_name_on_ballot` (`candidate_id`,
  #'   `name_on_ballot`) and `cl_contact` (first column plus `street` through
  #'   `zip_code`; the first column is presumably `candidate_id`, since the
  #'   join below is by that name).
  #' @return Distinct rows of `candidate_id`, `name_clean` and the selected
  #'   contact columns.
  prepare_candidate_listing_for_matching <- function(candidate_listing) {
    lg_info_target(lg_get_logger())

    candidate_name <-
      candidate_listing$cl_candidates |>
      mutate(
        # paste() renders a missing component as the literal text "NA"
        # (e.g. "JOHN NA SMITH NA"), which would poison name matching.
        # Blank the missing parts instead; the extra spaces are squished below.
        across(
          c(first_name, middle_name, last_name, name_suffix_lbl),
          \(part) coalesce(as.character(part), "")
        )
      ) |>
      mutate(
        name_full = paste(first_name, middle_name, last_name, name_suffix_lbl),
        name_mi = paste(first_name, substr(middle_name, 1, 1), last_name, name_suffix_lbl),
        name_first_last = paste(first_name, last_name, name_suffix_lbl)
      ) |>
      select(candidate_id, name_full, name_mi, name_first_last) |>
      tidyr::pivot_longer(-candidate_id, values_to = "name_clean") |>
      select(candidate_id, name_clean)

    candidate_alias <-
      candidate_listing$cl_name_on_ballot |>
      distinct(candidate_id, name_clean = name_on_ballot) |>
      mutate(
        name_clean = toupper(name_clean),
        name_clean = gsub("[,.]", "", name_clean)
      )

    # "FIRST (NICK) LAST" -> "NICK LAST". Must run while the parentheses are
    # still present, i.e. before the final normalisation below.
    candidate_aka <-
      candidate_alias |>
      filter(grepl("[(].+[)]", name_clean)) |>
      mutate(name_clean = sub(".+? \\((.+?)\\) (.+)$", "\\1 \\2", name_clean))

    bind_rows(candidate_name, candidate_alias, candidate_aka) |>
      mutate(
        # Normalise exactly like the committee-candidate side, which is
        # upper-cased and reduced to the characters A-Z and space. Without
        # this, quotes, parentheses, apostrophes, hyphens or a "JR." suffix
        # prevent exact joins on name_clean.
        name_clean = toupper(name_clean),
        name_clean = gsub("\\s+", " ", name_clean),
        name_clean = gsub("[^A-Z ]", "", name_clean),
        name_clean = stringr::str_squish(name_clean)
      ) |>
      left_join(
        candidate_listing$cl_contact |> select(1, street:zip_code),
        by = "candidate_id",
        relationship = "many-to-many"
      ) |>
      distinct()
  }
  #' Link committee candidates to the candidate listing with fastLink.
  #'
  #' Records are compared on `name_clean`, `street` and `city`; the name and
  #' the street are compared by string distance (method `"dl"`) with partial
  #' matches allowed, the city is compared as given.
  #'
  #' Calls the project helper `lg_info_target()` first (presumably to log that
  #' the target is running).
  #'
  #' @param candidates_for_matching Data frame of committee candidates; must
  #'   contain `name_clean`, `street` and `city`.
  #' @param candidate_listing_for_matching Data frame of listing candidates
  #'   with the same three columns.
  #' @return The object returned by `fastLink::fastLink()`.
  fastlink_candidates <- function(candidates_for_matching, candidate_listing_for_matching) {
    lg_info_target(lg_get_logger())

    linkage_fields <- c("name_clean", "street", "city")
    # Fields compared approximately rather than for strict equality.
    fuzzy_fields <- c("name_clean", "street")

    fastLink::fastLink(
      dfA = candidates_for_matching,
      dfB = candidate_listing_for_matching,
      varnames = linkage_fields,
      stringdist.match = fuzzy_fields,
      partial.match = fuzzy_fields,
      stringdist.method = "dl"
    )
  }
  #' Extract the matched id pairs from a fastLink result.
  #'
  #' Calls the project helper `lg_info_target()` first (presumably to log that
  #' the target is running).
  #'
  #' @param candidates_for_matching Data frame of committee candidates that was
  #'   passed to fastLink as the first dataset.
  #' @param candidate_listing_for_matching Data frame of listing candidates
  #'   that was passed to fastLink as the second dataset.
  #' @param candidates_linked Linkage result handed to
  #'   `fastLink::getMatches()` as its third argument.
  #' @return A tibble with the columns `sboe_id` and `candidate_id` for the
  #'   matches returned at `threshold.match = 0.8`.
  fastlink_match_candidates <- function(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked
  ) {
    lg_info_target(lg_get_logger())

    fastLink::getMatches(
      candidates_for_matching,
      candidate_listing_for_matching,
      candidates_linked,
      threshold.match = 0.8
    ) |>
      as_tibble() |>
      select(sboe_id, candidate_id)
  }
  #' Match committee candidates to the candidate listing with exact rules and
  #' report the candidates whose fuzzy name match is still ambiguous.
  #'
  #' Three exact passes are run: (1) same name, street and city, (2) same
  #' street and city where that address maps to a single `candidate_id`,
  #' (3) same name where that name belongs to a single `candidate_id`.
  #' Candidates not covered by any pass are fuzzy-joined on `name_clean`
  #' (Jaccard similarity, threshold 0.85).
  #'
  #' NOTE(review): the exact matches are only used to decide which candidates
  #' go into the fuzzy step; they are not returned. The returned table is a
  #' diagnostic listing. Confirm whether this is the intended final output.
  #'
  #' @param candidates_for_matching Data frame with `sboe_id`, `name_clean`,
  #'   `street` and `city`.
  #' @param candidate_listing_for_matching Data frame with `candidate_id`,
  #'   `name_clean`, `street`, `city` and `name_on_ballot`.
  #'   NOTE(review): `prepare_candidate_listing_for_matching()` in this file
  #'   renames `name_on_ballot` to `name_clean`, so its output does not appear
  #'   to carry a `name_on_ballot` column; verify which input is expected.
  #' @param candidates_linked Currently unused; kept so existing callers keep
  #'   working.
  #' @return A tibble with `sboe_id`, `name_on_ballot` and `n_names` for the
  #'   unmatched candidates whose fuzzy join hit more than one distinct
  #'   `name_on_ballot`, ordered by descending `n_names`, then `sboe_id`.
  candidates_match <- function(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked
  ) {
    # First, direct matches.
    # NOTE(review): NA keys still match each other here (dplyr default), so
    # two records with the same name and a missing street and city are linked.
    candidates_matched_1 <-
      candidates_for_matching |>
      inner_join(
        candidate_listing_for_matching |>
          select(candidate_id, name_clean, street, city),
        by = join_by(
          name_clean == name_clean,
          street == street,
          city == city
        ),
        relationship = "many-to-many"
      ) |>
      distinct(sboe_id, candidate_id)

    # Then unambiguous matches on street + city.
    matches_street_city <-
      candidates_for_matching |>
      anti_join(candidates_matched_1, by = "sboe_id") |>
      inner_join(
        candidate_listing_for_matching |>
          select(candidate_id, street, city),
        by = join_by(
          street == street,
          city == city
        ),
        # dplyr joins treat NA as equal to NA by default. With no name in the
        # key, a missing street or city on both sides would link unrelated
        # people, so missing values must never count as a match here.
        na_matches = "never",
        relationship = "many-to-many"
      ) |>
      distinct(sboe_id, candidate_id, street, city) |>
      group_by(street, city) |>
      mutate(n_names = n_distinct(candidate_id)) |>
      ungroup() |>
      # Keep an address only when it points to exactly one listing candidate.
      filter(n_names == 1) |>
      select(sboe_id, candidate_id)

    # And unambiguous names: names used by exactly one listing candidate.
    matches_name_obvious <-
      inner_join(
        candidates_for_matching,
        candidate_listing_for_matching |>
          distinct(candidate_id, name_clean) |>
          add_count(name_clean) |>
          filter(n == 1) |>
          select(-n),
        by = "name_clean",
        relationship = "many-to-many"
      ) |>
      distinct(sboe_id, candidate_id)

    candidates_matched_2 <-
      candidates_matched_1 |>
      union(matches_street_city) |>
      union(matches_name_obvious)

    # Everything without any exact match goes on to the fuzzy step.
    candidates_for_matching_left <-
      candidates_for_matching |>
      anti_join(candidates_matched_2, by = "sboe_id")

    # Now fuzzyjoin...
    candidates_fuzzy_name <-
      zoomerjoin::jaccard_inner_join(
        candidates_for_matching_left,
        candidate_listing_for_matching |>
          select(name_on_ballot, name_clean, street, city),
        by = "name_clean",
        threshold = 0.85
      )

    # Report candidates whose fuzzy hits disagree on the ballot name.
    candidates_fuzzy_name |>
      distinct(sboe_id, name_on_ballot) |>
      group_by(sboe_id) |>
      mutate(n_names = n_distinct(name_on_ballot)) |>
      ungroup() |>
      filter(n_names != 1) |>
      arrange(desc(n_names), sboe_id)
  }