Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

340 lines
11KB

  1. prep_candidates_dedupe_mapping <- function(
  2. candidate_listing_raw,
  3. candidate_listing_dedupe
  4. ) {
  5. # candidate names ----
  6. # Extract candidate names, these will be primary keys for the candidates table
  7. candidate_names <-
  8. candidate_listing_raw |>
  9. arrange(last_name, first_name, middle_name) |>
  10. distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
  11. mutate(candidate_id = row_number(), .before = 1)
  12. # Find last election ----
  13. candidates_last_contest <-
  14. candidate_listing_raw |>
  15. group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
  16. slice_max(election_dt, n = 1) |>
  17. distinct(contest_last = election_dt)
  18. deduped_ids <-
  19. candidate_listing_dedupe$matches |>
  20. left_join(
  21. candidate_names,
  22. by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
  23. ) |>
  24. distinct(dupe_id = dedupe.ids, candidate_id) |>
  25. add_count(dupe_id) |>
  26. filter(n > 1) |>
  27. select(-n) |>
  28. mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
  29. arrange(dupe_id, candidate_id) |>
  30. group_split(dupe_id)
  31. mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id)
  32. for (dupes in deduped_ids) {
  33. map_group <- left_join(dupes[-1], mapping, by = "candidate_id")
  34. all_ids <- union(map_group$candidate_id, map_group$candidate_group)
  35. map_others <- mapping |> filter(candidate_group %in% all_ids)
  36. browser(expr = nrow(map_group) < nrow(map_others))
  37. # recompute current grouping to min of all ids
  38. update <-
  39. dplyr::union(map_group, map_others) |>
  40. mutate(candidate_group = min(candidate_id, candidate_group))
  41. mapping <- rows_update(mapping, update, by = "candidate_id")
  42. }
  43. mapping |>
  44. left_join(candidate_names, by = "candidate_id") |>
  45. left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl))
  46. }
  47. prep_dedupe_candidates <- function(candidate_listing_raw) {
  48. candidate_names <-
  49. candidate_listing_raw |>
  50. arrange(last_name, first_name, middle_name) |>
  51. distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |>
  52. mutate(candidate_id = row_number(), .before = 1)
  53. # first, last, street_address ----
  54. f_l_street <-
  55. candidate_names |>
  56. group_by(first_name, last_name, street_address) |>
  57. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  58. ungroup()
  59. # first, last, cleaned contest name
  60. f_l_cleaned_contest <-
  61. candidate_names |>
  62. mutate(
  63. contest_name_clean = if_else(
  64. !is.na(city) & map2_lgl(city, contest_name, \(c, cn) grepl(paste("TOWN OF", c), cn)),
  65. map2_chr(city, contest_name, \(c, cn) sub(paste("TOWN OF", c, ""), "", cn)),
  66. contest_name
  67. ),
  68. contest_name_clean = sub("([A-Z]+ ){1,2}COUNTY ", "", contest_name_clean)
  69. ) |>
  70. group_by(first_name, last_name, contest_name_clean) |>
  71. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  72. ungroup()
  73. # first, middle, last, county_name
  74. f_m_l_county <-
  75. candidate_names |>
  76. group_by(first_name, middle_name, last_name, county_name) |>
  77. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  78. ungroup()
  79. # first, middle initial, last, county_name
  80. f_mi_l_county <-
  81. candidate_names |>
  82. mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |>
  83. group_by(first_name, middle_name, last_name, county_name) |>
  84. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  85. ungroup()
  86. # name_on_ballot, middle_name
  87. nb_m <-
  88. candidate_names |>
  89. mutate(
  90. name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot),
  91. name_ballot_clean = gsub("[.]", "", name_ballot_clean),
  92. ) |>
  93. group_by(name_ballot_clean, middle_name) |>
  94. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  95. ungroup()
  96. candidate_names |>
  97. dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |>
  98. dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |>
  99. dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |>
  100. dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |>
  101. dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name")
  102. }
  103. prep_candidate_listing <- function(
  104. candidate_listing_raw,
  105. candidate_listing_dedupe
  106. ) {
  107. # Give candidates a unique id from dedupe mapping
  108. cl_dedupe <-
  109. candidate_listing_dedupe |>
  110. group_by(candidate_group) |>
  111. mutate(candidate_id = cur_group_id()) |>
  112. ungroup() |>
  113. select(-candidate_group)
  114. # cl raw + candidate_id ----
  115. cl_raw <-
  116. candidate_listing_raw |>
  117. left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id"))
  118. # candidate_names ----
  119. candidate_names <-
  120. cl_raw |>
  121. distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |>
  122. arrange(candidate_id, election_dt)
  123. # candidate_name_on_ballot ----
  124. candidate_name_on_ballot <-
  125. cl_raw |>
  126. distinct(candidate_id, election_dt, name_on_ballot) |>
  127. arrange(candidate_id, election_dt)
  128. # candidate_address ----
  129. candidate_address <-
  130. cl_raw |>
  131. mutate(
  132. phone = coalesce(phone, office_phone, business_phone),
  133. street_address = toupper(street_address),
  134. street_address = stringr::str_replace_all(street_address, " +", " "),
  135. street_address = trimws(street_address),
  136. ) |>
  137. select(-office_phone, -business_phone) |>
  138. distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |>
  139. add_address_lookup_local(street = street, postal_code = zip_code) |>
  140. # remove rows that are entirely empty except for candidate_id
  141. filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |>
  142. arrange(candidate_id, election_dt)
  143. # candidate_party ----
  144. candidate_party <-
  145. cl_raw |>
  146. distinct(candidate_id, election_dt, party_candidate) |>
  147. mutate(party_candidate = forcats::fct_inorder(party_candidate)) |>
  148. filter(!is.na(party_candidate)) |>
  149. arrange(candidate_id, election_dt)
  150. # Extract contests (remaining data in candidate_listing) ----
  151. cols_related <- setdiff(
  152. c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)),
  153. c("election_dt", "candidate_id")
  154. )
  155. contests <-
  156. cl_raw |>
  157. select(election_dt:name_suffix_lbl, candidate_id) |>
  158. select(-any_of(cols_related)) |>
  159. relocate(candidate_id, .before = first_name)
  160. # Get current candidate information ----
  161. candidate_best_name <-
  162. candidate_names |>
  163. mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |>
  164. arrange(desc(n_char), desc(election_dt)) |>
  165. slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |>
  166. select(-n_char, -election_dt)
  167. current_contact <-
  168. candidate_listing_current_contact_info(candidate_address)
  169. current_name_on_ballot <-
  170. candidate_name_on_ballot |>
  171. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  172. select(-election_dt)
  173. current_party <-
  174. candidate_party |>
  175. filter(!is.na(party_candidate)) |>
  176. group_by(candidate_id) |>
  177. slice_max(election_dt, n = 1) |>
  178. arrange(party_candidate) |>
  179. slice_head(n = 1) |>
  180. ungroup() |>
  181. select(-election_dt) |>
  182. rename(party_last = party_candidate)
  183. current_most_party <-
  184. candidate_party |>
  185. filter(!is.na(party_candidate)) |>
  186. group_by(candidate_id) |>
  187. count(party_candidate) |>
  188. slice_max(n, n = 1) |>
  189. arrange(party_candidate) |>
  190. slice_head(n = 1) |>
  191. ungroup() |>
  192. select(-n) |>
  193. rename(party_most = party_candidate)
  194. current_contest_count <-
  195. contests |>
  196. group_by(candidate_id) |>
  197. distinct(candidate_id, election_dt, contest_name) |>
  198. summarize(
  199. contest_n = n(),
  200. contest_first = min(election_dt),
  201. contest_latest = max(election_dt)
  202. )
  203. # Join candidates into one big table ----
  204. left_join_by_candidate <- function(x, y) {
  205. left_join(x, y, join_by(candidate_id), relationship = "one-to-one")
  206. }
  207. candidates <-
  208. candidate_best_name |>
  209. left_join_by_candidate(current_name_on_ballot) |>
  210. left_join_by_candidate(current_contact) |>
  211. left_join_by_candidate(current_party) |>
  212. left_join_by_candidate(current_most_party) |>
  213. left_join_by_candidate(current_contest_count) |>
  214. relocate(name_on_ballot, .before = first_name) |>
  215. relocate(starts_with("party"), .before = street) |>
  216. relocate(starts_with("contest"), .before = street) |>
  217. add_address_lookup_local(street = street, postal_code = zip_code)
  218. # Return list of tables
  219. list(
  220. cl_elections = contests,
  221. cl_candidates = candidates,
  222. cl_name_on_ballot = candidate_name_on_ballot,
  223. cl_contact = candidate_address,
  224. cl_party = candidate_party
  225. )
  226. }
  227. candidate_listing_current_contact_info <- function(candidate_address) {
  228. candidate_phone_current <-
  229. candidate_address |>
  230. filter(!is.na(phone)) |>
  231. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  232. select(candidate_id, phone)
  233. candidate_email_current <-
  234. candidate_address |>
  235. filter(!is.na(email)) |>
  236. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  237. select(candidate_id, email)
  238. candidate_address_current <-
  239. candidate_address |>
  240. group_by(candidate_id) |>
  241. slice_max(election_dt, n = 1) |>
  242. select(-election_dt) |>
  243. filter(
  244. n_distinct(street) == 1 | !grepl("PO BOX", street)
  245. ) |>
  246. slice_head(n = 1) |>
  247. select(candidate_id, street:zip_code) |>
  248. ungroup()
  249. candidate_address_current |>
  250. left_join(candidate_phone_current, by = "candidate_id", relationship = "one-to-one") |>
  251. left_join(candidate_email_current, by = "candidate_id", relationship = "one-to-one")
  252. }
  253. # This function isn't used anymore -- I opted for a more manual approach of
  254. # using overlapping signals. But this general idea could work in other places,
  255. # e.g. for deduping donors.
  256. fastlink_candidate_listing <- function(candidate_listing_raw) {
  257. data <-
  258. candidate_listing_raw |>
  259. distinct(
  260. name_on_ballot,
  261. first_name,
  262. middle_name,
  263. last_name,
  264. name_suffix_lbl,
  265. street_address,
  266. city
  267. ) |>
  268. mutate(
  269. name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot)
  270. )
  271. linked <- fastLink::fastLink(
  272. data,
  273. data,
  274. varnames = c(
  275. "name_on_ballot_clean",
  276. "first_name", "middle_name", "last_name", "name_suffix_lbl",
  277. "street_address", "city"
  278. ),
  279. stringdist.match = c("middle_name"),
  280. stringdist.method = "jw",
  281. jw.weight = .25,
  282. threshold.match = 0.98
  283. )
  284. matches <-
  285. fastLink::getMatches(
  286. data,
  287. data,
  288. linked,
  289. threshold.match = 0.9
  290. ) |>
  291. as_tibble()
  292. list(
  293. data = data,
  294. linked = linked,
  295. matches = matches
  296. )
  297. }