You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

181 lines
4.9KB

  # Build the candidate name/address table used for record linkage.
  #
  # Opens the prepared officers dataset, keeps rows of type "Candidate" whose
  # `report_id` appears in the report list, derives an upper-cased, normalised
  # `name_clean`, resolves each raw address through the address lookup table
  # and splits the resolved address into street / city / state / postal code.
  # Names that contain a quoted nickname additionally get an alias row in
  # which the nickname replaces the text in front of it.
  #
  # Arguments:
  #   path_data_prep_officers  Passed to `out_open_dataset_db()`. The dataset
  #                            must provide `report_id`, `type`, `name`,
  #                            `sboe_id` and `address`.
  #   path_out_report_list     Passed to `out_open_dataset_db()`. The dataset
  #                            must provide `report_id`.
  #   path_addresses           Passed to `out_open_dataset_db()`. The dataset
  #                            must provide `address_lookup` and
  #                            `address_resolved`.
  #
  # Returns:
  #   A collected data frame with columns `sboe_id`, `name_clean`,
  #   `address_raw`, `street`, `city`, `state` and `postal_code`, without
  #   exact duplicate rows. `street` is NA when it contains no digit.
  #
  # NOTE(review): `REGEXP_REPLACE()` is not an R function; this presumably
  # relies on the table returned by `out_open_dataset_db()` being a lazy
  # database table so that the call is passed through to SQL - confirm.
  prepare_candidates_for_matching <- function(
    path_data_prep_officers,
    path_out_report_list,
    path_addresses = "data-out/addresses"
  ) {
    lg_info_target(lg_get_logger())

    # Final clean-up shared by real names and aliases: keep only A-Z and
    # spaces, collapse runs of whitespace, trim both ends.
    strip_name_noise <- function(tbl) {
      tbl |>
        mutate(
          name_clean = REGEXP_REPLACE(name_clean, "[^A-Z ]", "", "g"),
          name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g"),
          name_clean = REGEXP_REPLACE(name_clean, "^\\s+|\\s+$", "", "g")
        )
    }

    report_list <- out_open_dataset_db(path_out_report_list)
    addresses <- out_open_dataset_db(path_addresses)

    candidates <-
      out_open_dataset_db(path_data_prep_officers) |>
      semi_join(report_list, by = "report_id") |>
      filter(type == "Candidate") |>
      mutate(
        name_clean = toupper(name),
        # Drop everything from " FOR " onwards.
        name_clean = REGEXP_REPLACE(name_clean, " FOR .+$", "", "g"),
        name_clean = REGEXP_REPLACE(name_clean, "\\s+", " ", "g")
      ) |>
      distinct(sboe_id, name_clean, address) |>
      rename(address_raw = address) |>
      left_join(
        # Select the lookup columns by name rather than by position so a
        # change in column order cannot silently break the join.
        addresses |> select(address_lookup, address_resolved),
        by = join_by(address_raw == address_lookup)
      ) |>
      rename(address = address_resolved)

    # Add new rows with aliases for people like `"ROLLANDE \"ROLIE\" SAMPSON"`
    # (must run before the quotes are stripped from `name_clean`).
    candidates_aliases <-
      candidates |>
      filter(grepl('"[A-Z]+"', name_clean)) |>
      mutate(
        name_clean = REGEXP_REPLACE(
          name_clean, '.+? "([A-Z]+)"(.+)$', "\\1 \\2", "g"
        )
      ) |>
      strip_name_noise() |>
      collect()

    candidates <-
      candidates |>
      strip_name_noise() |>
      collect()

    candidates |>
      bind_rows(candidates_aliases) |>
      filter(!is.na(name_clean)) |>
      # The earlier distinct() ran before punctuation was stripped, so names
      # that differ only in punctuation (and aliases whose pattern did not
      # match) can collapse to identical rows here.
      distinct() |>
      tidyr::extract(
        address,
        c("street", "city", "state", "postal_code"),
        "(.+), (.+), ([A-Z]{2}), (\\d{5})"
      ) |>
      mutate(
        # A street without any digit is treated as missing.
        street = if_else(!grepl("\\d", street), NA_character_, street)
      )
  }
  # Reduce the candidate listing to the columns used for record linkage.
  #
  # Arguments:
  #   candidate_listing  Table providing `name_on_ballot`, `street_address`,
  #                      `city`, `state` and `zip_code`.
  #
  # Returns:
  #   The distinct combinations of `name_on_ballot`, `name_clean`, `street`
  #   (from `street_address`), `city`, `state` and `postal_code` (from
  #   `zip_code`). `name_clean` is the upper-cased ballot name reduced to
  #   A-Z and single spaces, with no leading or trailing whitespace.
  prepare_candidate_listing_for_matching <- function(candidate_listing) {
    lg_info_target(lg_get_logger())

    # Return the pipeline result directly; assigning it to an unused local
    # made the function return its value invisibly.
    candidate_listing |>
      mutate(
        name_clean = toupper(name_on_ballot),
        name_clean = gsub("[^A-Z ]", "", name_clean),
        # Removing punctuation can leave doubled or dangling spaces; collapse
        # and trim them so exact joins on `name_clean` are not defeated by
        # whitespace alone.
        name_clean = gsub("\\s+", " ", name_clean),
        name_clean = gsub("^\\s+|\\s+$", "", name_clean)
      ) |>
      distinct(
        name_on_ballot, name_clean,
        street = street_address,
        city, state,
        postal_code = zip_code
      )
  }
  # Run fastLink probabilistic record linkage between the two tables.
  #
  # Arguments:
  #   candidates_for_matching          First table handed to fastLink.
  #   candidate_listing_for_matching   Second table handed to fastLink.
  #
  # Both tables must provide `name_clean`, `street` and `city`. All three are
  # linkage variables; `name_clean` and `street` are compared by string
  # distance (method "dl") with partial matches allowed.
  #
  # Returns:
  #   Whatever `fastLink::fastLink()` returns for these settings.
  fastlink_candidates <- function(
    candidates_for_matching,
    candidate_listing_for_matching
  ) {
    lg_info_target(lg_get_logger())

    link_fields <- c("name_clean", "street", "city")
    fuzzy_fields <- c("name_clean", "street")

    fastLink::fastLink(
      dfA = candidates_for_matching,
      dfB = candidate_listing_for_matching,
      varnames = link_fields,
      stringdist.match = fuzzy_fields,
      partial.match = fuzzy_fields,
      stringdist.method = "dl"
    )
  }
  # Attach the ballot name found by fastLink to every candidate row.
  #
  # Arguments:
  #   candidates_for_matching          Candidate table; must provide `sboe_id`.
  #   candidate_listing_for_matching   Listing table used in the linkage.
  #   candidates_linked                Linkage result passed to
  #                                    `fastLink::getMatches()`.
  #
  # Returns:
  #   `candidates_for_matching` left-joined on `sboe_id` with the matched
  #   `name_on_ballot` (matches extracted with threshold 0.8).
  fastlink_match_candidates <- function(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked
  ) {
    lg_info_target(lg_get_logger())

    linked <- fastLink::getMatches(
      dfA = candidates_for_matching,
      dfB = candidate_listing_for_matching,
      fl.out = candidates_linked,
      threshold.match = 0.8
    )

    # The column right after the candidate columns is renamed by position.
    # NOTE(review): presumably this is the ballot name coming from the
    # listing table under an unusable name - confirm against getMatches().
    ballot_col <- ncol(candidates_for_matching) + 1
    names(linked)[ballot_col] <- "name_on_ballot"

    ballot_names <- linked |> select(sboe_id, name_on_ballot)

    left_join(
      candidates_for_matching,
      ballot_names,
      by = join_by(sboe_id)
    )
  }
  # Match candidates to the candidate listing in three passes and report the
  # fuzzy name matches that remain ambiguous.
  #
  # Passes:
  #   1. exact match on `name_clean` + `street` + `city`;
  #   2. for ids not matched in pass 1, match on `street` + `city` where that
  #      address maps to exactly one ballot name;
  #   3. for ids still unmatched, Jaccard fuzzy join on `name_clean`
  #      (threshold 0.85).
  #
  # Arguments:
  #   candidates_for_matching          Must provide `sboe_id`, `name_clean`,
  #                                    `street` and `city`.
  #   candidate_listing_for_matching   Must provide `name_on_ballot`,
  #                                    `name_clean`, `street` and `city`.
  #   candidates_linked                Not used by this function; kept so the
  #                                    signature stays unchanged.
  #
  # Returns:
  #   Distinct (`sboe_id`, `name_on_ballot`) pairs from pass 3 for ids that
  #   fuzzy-matched more than one ballot name, with `n_names`, sorted by
  #   descending `n_names` and then `sboe_id`.
  #
  # NOTE(review): the matches from passes 1 and 2 are only used to exclude
  # ids from pass 3 and are not part of the return value - this looks like a
  # diagnostic view; confirm that is intended.
  candidates_match <- function(
    candidates_for_matching,
    candidate_listing_for_matching,
    candidates_linked
  ) {
    # First, direct matches
    candidates_matched_1 <-
      candidates_for_matching |>
      inner_join(
        candidate_listing_for_matching |>
          select(name_on_ballot, name_clean, street, city),
        by = join_by(name_clean, street, city),
        relationship = "many-to-many"
      ) |>
      distinct(sboe_id, name_on_ballot)

    # Then unambiguous matches on street + city
    matches_street_city <-
      candidates_for_matching |>
      anti_join(candidates_matched_1, by = "sboe_id") |>
      inner_join(
        candidate_listing_for_matching |>
          select(name_on_ballot, street, city),
        by = join_by(street, city),
        relationship = "many-to-many",
        # The address is the only evidence in this pass, so a missing street
        # or city must not count as equal to another missing value (the
        # dplyr default would pair every such row with each other).
        na_matches = "never"
      ) |>
      distinct(sboe_id, name_on_ballot, street, city) |>
      mutate(
        n_names = n_distinct(name_on_ballot),
        .by = c(street, city)
      ) |>
      filter(n_names == 1) |>
      select(sboe_id, name_on_ballot)

    candidates_matched_2 <- union(candidates_matched_1, matches_street_city)

    candidates_for_matching_left <-
      candidates_for_matching |>
      anti_join(candidates_matched_2, by = "sboe_id")

    # Now fuzzyjoin...
    candidates_fuzzy_name <-
      zoomerjoin::jaccard_inner_join(
        candidates_for_matching_left,
        candidate_listing_for_matching |>
          select(name_on_ballot, name_clean, street, city),
        by = "name_clean",
        threshold = 0.85
      )

    # Keep only ids whose fuzzy match is ambiguous (more than one name).
    candidates_fuzzy_name |>
      distinct(sboe_id, name_on_ballot) |>
      mutate(n_names = n_distinct(name_on_ballot), .by = sboe_id) |>
      filter(n_names != 1) |>
      arrange(desc(n_names), sboe_id)
  }