Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

223 lines
6.3KB

  # Build the campaign-finance candidate table used as one side of candidate
  # matching.
  #
  # Steps:
  #   1. Open the report list, address lookup, committees and officers datasets
  #      with `out_open_dataset_db()`.
  #   2. Keep officers of type "Candidate" whose `report_id` is in the report
  #      list, and fill missing `address` values from the committees'
  #      `address_lookup` (matched on `sboe_id`).
  #   3. Derive `name_clean`: upper-case `name`, drop a trailing " FOR ..."
  #      part, collapse whitespace.
  #   4. Resolve the raw address through the address lookup table.
  #   5. Add one extra row per quoted nickname, e.g.
  #      `"ROLLANDE \"ROLIE\" SAMPSON"` also yields "ROLIE SAMPSON".
  #   6. Strip everything but A-Z and spaces from `name_clean`, collect, and
  #      split the resolved address into street / city / state / postal_code.
  #
  # Arguments:
  #   path_data_prep_officers  Location of the officers dataset.
  #   path_out_report_list     Location of the report list dataset.
  #   path_out_committees      Location of the committees dataset.
  #   path_addresses           Location of the address lookup dataset.
  #
  # Returns a collected data frame with `sboe_id`, `name_clean`, `address_raw`
  # and the four address components. `street` is NA when it has no digit.
  prepare_candidates_for_matching <- function(
    path_data_prep_officers,
    path_out_report_list,
    path_out_committees,
    path_addresses = "data-out/addresses"
  ) {
    lg_info_target(lg_get_logger())

    report_list <- out_open_dataset_db(path_out_report_list)
    addresses <- out_open_dataset_db(path_addresses)
    committees <- out_open_dataset_db(path_out_committees)
    candidates_db <- out_open_dataset_db(path_data_prep_officers)

    # Final name normalisation shared by the primary and alias rows: keep only
    # capital letters and spaces, collapse runs of whitespace, trim both ends.
    keep_letters_only <- function(tbl) {
      mutate(
        tbl,
        name_clean = REGEXP_REPLACE(name_clean, "[^A-Z ]", "", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "^\\s+|\\s+$", "", "g")
      )
    }

    committee_addresses <- select(committees, sboe_id, address = address_lookup)
    # The join and rename below need `address_lookup` and `address_resolved`,
    # so those are presumably the first two columns -- TODO confirm.
    resolved_addresses <- select(addresses, 1:2)

    reported_candidates <-
      candidates_db |>
      semi_join(report_list, by = "report_id") |>
      filter(type == "Candidate") |>
      rows_patch(committee_addresses, by = "sboe_id", unmatched = "ignore")

    candidates_lazy <-
      reported_candidates |>
      mutate(
        name_clean = toupper(name),
        name_clean = REGEXP_REPLACE(name_clean, " FOR .+$", "", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g")
      ) |>
      distinct(sboe_id, name_clean, address) |>
      rename(address_raw = address) |>
      left_join(
        resolved_addresses,
        by = join_by(address_raw == address_lookup)
      ) |>
      rename(address = address_resolved)

    # Add new rows with aliases for people like `"ROLLANDE \"ROLIE\" SAMPSON"`:
    # the quoted nickname replaces everything that precedes it.
    alias_rows <-
      candidates_lazy |>
      filter(grepl('"[A-Z]+"', name_clean)) |>
      mutate(
        name_clean = REGEXP_REPLACE(name_clean, '.+? "([A-Z]+)"(.+)$', "\\1 \\2", "g")
      ) |>
      keep_letters_only() |>
      collect()

    primary_rows <-
      candidates_lazy |>
      keep_letters_only() |>
      collect()

    bind_rows(primary_rows, alias_rows) |>
      filter(!is.na(name_clean)) |>
      tidyr::extract(
        col = address,
        into = c("street", "city", "state", "postal_code"),
        regex = "(.+), (.+), ([A-Z]{2}), (\\d{5})"
      ) |>
      # A street without any digit is treated as missing.
      mutate(street = if_else(grepl("\\d", street), street, NA_character_))
  }
  # Build the candidate-listing lookup table used as the other side of
  # candidate matching.
  #
  # For every candidate, several spellings of the name are generated, one row
  # each:
  #   * full name, name with middle initial, and first + last name (each with
  #     the suffix label), built from `candidate_listing$cl_candidates`;
  #   * the upper-cased name on ballot with commas and periods removed, from
  #     `candidate_listing$cl_name_on_ballot`;
  #   * for ballot names containing a parenthesised part, a variant in which
  #     that part replaces everything that precedes it.
  # Contact columns from `candidate_listing$cl_contact` are then attached by
  # `candidate_id` and duplicate rows are dropped.
  #
  # Arguments:
  #   candidate_listing  List-like object with elements `cl_candidates`,
  #                      `cl_name_on_ballot` and `cl_contact`.
  #
  # Returns a data frame with `candidate_id`, `name_clean` and the selected
  # contact columns (`street` through `zip_code`).
  prepare_candidate_listing_for_matching <- function(candidate_listing) {
    lg_info_target(lg_get_logger())

    name_parts <- c("first_name", "middle_name", "last_name", "name_suffix_lbl")

    candidate_name <-
      candidate_listing$cl_candidates |>
      mutate(
        # `paste()` turns NA into the literal string "NA", which would yield
        # names such as "JOHN NA SMITH NA". Blank out missing parts first; the
        # extra spaces are removed by `str_squish()` further down.
        across(all_of(name_parts), \(part) coalesce(as.character(part), "")),
        name_full = paste(first_name, middle_name, last_name, name_suffix_lbl),
        name_mi = paste(first_name, substr(middle_name, 1, 1), last_name, name_suffix_lbl),
        name_first_last = paste(first_name, last_name, name_suffix_lbl)
      ) |>
      select(candidate_id, name_full, name_mi, name_first_last) |>
      tidyr::pivot_longer(-candidate_id, values_to = "name_clean") |>
      select(candidate_id, name_clean)

    candidate_alias <-
      candidate_listing$cl_name_on_ballot |>
      distinct(candidate_id, name_clean = name_on_ballot) |>
      mutate(
        name_clean = toupper(name_clean),
        name_clean = gsub("[,.]", "", name_clean)
      )

    # For names with a parenthesised part, add a spelling where that part takes
    # the place of everything before it. Names that do not fit the pattern are
    # left unchanged and disappear in the final `distinct()`.
    candidate_aka <-
      candidate_alias |>
      filter(grepl("[(].+[)]", name_clean)) |>
      mutate(name_clean = sub(".+? \\((.+?)\\) (.+)$", "\\1 \\2", name_clean))

    bind_rows(candidate_name, candidate_alias, candidate_aka) |>
      mutate(name_clean = stringr::str_squish(name_clean)) |>
      left_join(
        # Column 1 is presumably `candidate_id` (the join key) -- TODO confirm.
        candidate_listing$cl_contact |> select(1, street:zip_code),
        by = "candidate_id",
        relationship = "many-to-many"
      ) |>
      distinct()
  }
  # Run probabilistic record linkage between the campaign-finance candidates
  # and the candidate listing with `fastLink::fastLink()`.
  #
  # Records are compared on `name_clean`, `street` and `city`. Name and street
  # use string-distance comparison (method "dl") with partial matches allowed.
  #
  # Arguments:
  #   candidates_for_matching         Data frame A.
  #   candidate_listing_for_matching  Data frame B.
  #
  # Returns the object produced by `fastLink::fastLink()`.
  fastlink_candidates <- function(candidates_for_matching, candidate_listing_for_matching) {
    lg_info_target(lg_get_logger())

    link_fields <- c("name_clean", "street", "city")
    fuzzy_fields <- c("name_clean", "street")

    fastLink::fastLink(
      dfA = candidates_for_matching,
      dfB = candidate_listing_for_matching,
      varnames = link_fields,
      stringdist.match = fuzzy_fields,
      partial.match = fuzzy_fields,
      stringdist.method = "dl",
      threshold.match = 0.9
    )
  }
  # Extract the matched record pairs from a fastLink result.
  #
  # Arguments:
  #   candidates_for_matching         Data frame A that was passed to fastLink.
  #   candidate_listing_for_matching  Data frame B that was passed to fastLink.
  #   candidates_linked               The fastLink output for those two frames.
  #
  # Returns a tibble with the distinct (`sboe_id`, `candidate_id`) pairs among
  # the matches at a match threshold of 0.9.
  fastlink_match_candidates <- function(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked
  ) {
    lg_info_target(lg_get_logger())

    fastLink::getMatches(
      dfA = candidates_for_matching,
      dfB = candidate_listing_for_matching,
      fl.out = candidates_linked,
      threshold.match = 0.9
    ) |>
      as_tibble() |>
      distinct(sboe_id, candidate_id)
  }
  # Match campaign-finance candidates to the candidate listing with exact
  # rules, then fuzzy-join whatever is left on `name_clean`.
  #
  # Exact rules, applied in turn:
  #   1. same `name_clean`, `street` and `city`;
  #   2. same `street` and `city`, where only one listing candidate is found
  #      at that address;
  #   3. same `name_clean`, where that name belongs to exactly one listing
  #      candidate.
  # Candidates matched by any of these rules are excluded from the fuzzy step.
  #
  # Arguments:
  #   candidates_for_matching         Data frame with `sboe_id`, `name_clean`,
  #                                   `street` and `city`.
  #   candidate_listing_for_matching  Data frame with `candidate_id`,
  #                                   `name_clean`, `street`, `city` and
  #                                   `name_on_ballot`.
  #   candidates_linked               Not used by this function.
  #
  # Returns the ambiguous fuzzy matches only: one row per (`sboe_id`,
  # `name_on_ballot`) pair for every `sboe_id` that fuzzy-matched more than one
  # ballot name, with the count in `n_names`, sorted by descending `n_names`
  # and then `sboe_id`. The exact matches themselves are not returned.
  candidates_match <- function(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked
  ) {
    # First, direct matches on name + street + city.
    # NOTE(review): with dplyr's default `na_matches = "na"`, rows whose street
    # or city is missing on both sides still count as equal here -- confirm
    # that this is intended.
    listing_name_address <-
      candidate_listing_for_matching |>
      select(candidate_id, name_clean, street, city)

    candidates_matched_1 <-
      candidates_for_matching |>
      inner_join(
        listing_name_address,
        by = join_by(name_clean, street, city),
        relationship = "many-to-many"
      ) |>
      distinct(sboe_id, candidate_id)

    # Then unambiguous matches on street + city. A missing street or city must
    # never count as "the same address", hence `na_matches = "never"`.
    listing_address <-
      candidate_listing_for_matching |>
      select(candidate_id, street, city)

    matches_street_city <-
      candidates_for_matching |>
      anti_join(candidates_matched_1, by = "sboe_id") |>
      inner_join(
        listing_address,
        by = join_by(street, city),
        relationship = "many-to-many",
        na_matches = "never"
      ) |>
      distinct(sboe_id, candidate_id, street, city) |>
      mutate(n_names = n_distinct(candidate_id), .by = c(street, city)) |>
      filter(n_names == 1) |>
      select(sboe_id, candidate_id)

    # And unambiguous names: names carried by exactly one listing candidate.
    listing_unique_names <-
      candidate_listing_for_matching |>
      distinct(candidate_id, name_clean) |>
      add_count(name_clean) |>
      filter(n == 1) |>
      select(-n)

    matches_name_obvious <-
      candidates_for_matching |>
      inner_join(
        listing_unique_names,
        by = "name_clean",
        relationship = "many-to-many"
      ) |>
      distinct(sboe_id, candidate_id)

    candidates_matched_2 <-
      candidates_matched_1 |>
      union(matches_street_city) |>
      union(matches_name_obvious)

    candidates_for_matching_left <-
      candidates_for_matching |>
      anti_join(candidates_matched_2, by = "sboe_id")

    # Now fuzzyjoin on the cleaned name.
    # NOTE(review): this needs a `name_on_ballot` column, but
    # `prepare_candidate_listing_for_matching()` renames it to `name_clean` --
    # confirm which table is passed in here.
    candidates_fuzzy_name <-
      zoomerjoin::jaccard_inner_join(
        candidates_for_matching_left,
        candidate_listing_for_matching |>
          select(name_on_ballot, name_clean, street, city),
        by = "name_clean",
        threshold = 0.85
      )

    # Keep only candidates that fuzzy-matched more than one ballot name.
    candidates_fuzzy_name |>
      distinct(sboe_id, name_on_ballot) |>
      mutate(n_names = n_distinct(name_on_ballot), .by = sboe_id) |>
      filter(n_names != 1) |>
      arrange(desc(n_names), sboe_id)
  }