Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

343 lines
11KB

  # Build a candidate dedupe mapping from several overlapping matching signals.
  #
  # The raw listing is reduced to distinct candidate records, each given a
  # row-number `candidate_id`. Those records are then grouped five different
  # ways (each grouping yields a `candidate_group` id), and every grouping is
  # handed in turn to `dedupe_update_mapping()` together with a label.
  #
  # Arguments:
  #   candidate_listing_raw  Raw candidate listing; must contain the columns
  #                          named in the `distinct()` call below.
  #
  # Returns the value of the last `dedupe_update_mapping()` call.
  # NOTE(review): `dedupe_update_mapping()` is defined outside this block; how
  # it combines the groupings is not visible here.
  prep_dedupe_candidates <- function(candidate_listing_raw) {
    # Helpers ----
    # Add `candidate_group` (id of the group formed by the columns in `...`)
    # as the second column, then drop the grouping again.
    with_group_id <- function(data, ...) {
      data |>
        group_by(...) |>
        mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
        ungroup()
    }

    # Does the contest name mention "TOWN OF <city>"? (element-wise)
    mentions_town <- function(city, contest_name) {
      map2_lgl(
        city, contest_name,
        \(town, contest) grepl(paste("TOWN OF", town), contest)
      )
    }

    # Remove the first "TOWN OF <city> " (note trailing space) from each contest
    drop_town <- function(city, contest_name) {
      map2_chr(
        city, contest_name,
        \(town, contest) sub(paste("TOWN OF", town, ""), "", contest)
      )
    }

    # Distinct candidate records ----
    candidate_names <-
      candidate_listing_raw |>
      arrange(last_name, first_name, middle_name) |>
      distinct(
        name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl,
        street_address, city, contest_name, county_name
      ) |>
      mutate(candidate_id = row_number(), .before = 1)

    # first, last, street_address ----
    by_street <-
      with_group_id(candidate_names, first_name, last_name, street_address)

    # first, last, cleaned contest name ----
    # The contest name is stripped of the candidate's own town prefix and of a
    # leading one- or two-word county prefix before grouping.
    by_contest <-
      candidate_names |>
      mutate(
        contest_name_clean = if_else(
          !is.na(city) & mentions_town(city, contest_name),
          drop_town(city, contest_name),
          contest_name
        )
      ) |>
      mutate(
        contest_name_clean = sub("([A-Z]+ ){1,2}COUNTY ", "", contest_name_clean)
      ) |>
      with_group_id(first_name, last_name, contest_name_clean)

    # first, middle, last, county_name ----
    by_full_name_county <-
      with_group_id(candidate_names, first_name, middle_name, last_name, county_name)

    # first, middle initial, last, county_name ----
    # middle_name is overwritten with its first character in this table only.
    by_initial_county <-
      candidate_names |>
      mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |>
      with_group_id(first_name, middle_name, last_name, county_name)

    # name_on_ballot, middle_name ----
    # Ballot name with the first honorific and all periods removed.
    by_ballot_name <-
      candidate_names |>
      mutate(
        name_ballot_clean = gsub(
          "[.]", "",
          sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot)
        )
      ) |>
      with_group_id(name_ballot_clean, middle_name)

    # Fold every grouping into the mapping, in the same order as before ----
    mapping <- dedupe_update_mapping(
      candidate_names, by_street, candidate_id, candidate_group,
      name = "first, last, street"
    )
    mapping <- dedupe_update_mapping(
      mapping, by_contest, candidate_id, candidate_group,
      name = "first, last, contest"
    )
    mapping <- dedupe_update_mapping(
      mapping, by_full_name_county, candidate_id, candidate_group,
      name = "first, middle, last, county"
    )
    mapping <- dedupe_update_mapping(
      mapping, by_initial_county, candidate_id, candidate_group,
      name = "first, mi, last, county"
    )
    dedupe_update_mapping(
      mapping, by_ballot_name, candidate_id, candidate_group,
      name = "ballot name, middle name"
    )
  }
  # Split the raw candidate listing into normalized tables keyed by a deduped
  # `candidate_id`.
  #
  # Arguments:
  #   candidate_listing_raw     Raw candidate listing (one row per filing).
  #   candidate_listing_dedupe  Dedupe mapping with a `candidate_group` column
  #                             plus the candidate columns it shares with the
  #                             raw listing (those shared columns are the join
  #                             keys).
  #
  # Returns a named list of tables:
  #   cl_elections       contest-level columns plus candidate_id
  #   cl_candidates      one row per candidate with current name, contact,
  #                      party and contest summary
  #   cl_name_on_ballot  distinct ballot names per candidate and election
  #   cl_contact         distinct addresses/phones/emails per candidate and
  #                      election
  #   cl_party           distinct party per candidate and election
  #
  # NOTE(review): `add_address_lookup_local()` is defined outside this block;
  # the columns it adds are not visible here.
  prep_candidate_listing <- function(
    candidate_listing_raw,
    candidate_listing_dedupe
  ) {
    # Give candidates a unique id from dedupe mapping ----
    # Any existing candidate_id is replaced by the id of its candidate_group.
    cl_dedupe <-
      candidate_listing_dedupe |>
      group_by(candidate_group) |>
      mutate(candidate_id = cur_group_id()) |>
      ungroup() |>
      select(-candidate_group)

    # cl raw + candidate_id ----
    # Join on every mapping column except the id itself.
    cl_raw <-
      candidate_listing_raw |>
      left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id"))

    # candidate_names ----
    candidate_names <-
      cl_raw |>
      distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |>
      arrange(candidate_id, election_dt)

    # candidate_name_on_ballot ----
    candidate_name_on_ballot <-
      cl_raw |>
      distinct(candidate_id, election_dt, name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
      arrange(candidate_id, election_dt)

    # candidate_address ----
    candidate_address <-
      cl_raw |>
      mutate(
        # first non-missing of the three phone columns
        phone = coalesce(phone, office_phone, business_phone),
        # normalize street: upper case, collapse runs of spaces, trim
        street_address = toupper(street_address),
        street_address = stringr::str_replace_all(street_address, " +", " "),
        street_address = trimws(street_address),
      ) |>
      select(-office_phone, -business_phone) |>
      distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |>
      add_address_lookup_local(street = street, postal_code = zip_code) |>
      # remove rows that are entirely empty except for candidate_id
      filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |>
      arrange(candidate_id, election_dt)

    # candidate_party ----
    candidate_party <-
      cl_raw |>
      distinct(candidate_id, election_dt, party_candidate) |>
      # factor levels in order of first appearance; used below to break ties
      mutate(party_candidate = forcats::fct_inorder(party_candidate)) |>
      filter(!is.na(party_candidate)) |>
      arrange(candidate_id, election_dt)

    # Extract contests (remaining data in candidate_listing) ----
    # Columns that belong to the ballot-name/address/party tables only
    cols_related <- setdiff(
      c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)),
      c("election_dt", colnames(candidate_names))
    )

    # NOTE(review): relies on the raw column order (election_dt ... name_suffix_lbl).
    contests <-
      cl_raw |>
      select(election_dt:name_suffix_lbl, candidate_id) |>
      select(-any_of(cols_related)) |>
      relocate(candidate_id, .before = first_name)

    # Get current candidate information ----
    # paste() renders NA as the literal string "NA", which would make a name
    # with missing parts look *longer* than a more complete one. Blank out
    # missing parts so the length reflects the information actually present.
    blank_if_na <- function(x) {
      x <- as.character(x)
      x[is.na(x)] <- ""
      x
    }

    # Best name = longest recorded name; ties go to the most recent election
    # (rows are pre-sorted so the first tied row is the latest one).
    candidate_best_name <-
      candidate_names |>
      mutate(
        n_char = nchar(paste(
          blank_if_na(first_name),
          blank_if_na(middle_name),
          blank_if_na(last_name),
          blank_if_na(name_suffix_lbl)
        ))
      ) |>
      arrange(desc(n_char), desc(election_dt)) |>
      slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |>
      select(-n_char, -election_dt)

    current_contact <-
      candidate_listing_current_contact_info(candidate_address)

    # Ballot name from the latest election
    current_name_on_ballot <-
      candidate_name_on_ballot |>
      slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
      select(candidate_id, name_on_ballot)

    # Party at the latest election; ties resolved by factor level order
    current_party <-
      candidate_party |>
      filter(!is.na(party_candidate)) |>
      group_by(candidate_id) |>
      slice_max(election_dt, n = 1) |>
      arrange(party_candidate) |>
      slice_head(n = 1) |>
      ungroup() |>
      select(-election_dt) |>
      rename(party_last = party_candidate)

    # Most frequently recorded party; ties resolved by factor level order
    current_most_party <-
      candidate_party |>
      filter(!is.na(party_candidate)) |>
      group_by(candidate_id) |>
      count(party_candidate) |>
      slice_max(n, n = 1) |>
      arrange(party_candidate) |>
      slice_head(n = 1) |>
      ungroup() |>
      select(-n) |>
      rename(party_most = party_candidate)

    # Number of distinct (election, contest) pairs and first/latest election
    current_contest_count <-
      contests |>
      group_by(candidate_id) |>
      distinct(candidate_id, election_dt, contest_name) |>
      summarize(
        contest_n = n(),
        contest_first = min(election_dt),
        contest_latest = max(election_dt)
      )

    # Join candidates into one big table ----
    # Every piece must have at most one row per candidate; the join errors
    # otherwise.
    left_join_by_candidate <- function(x, y) {
      left_join(x, y, join_by(candidate_id), relationship = "one-to-one")
    }

    candidates <-
      candidate_best_name |>
      left_join_by_candidate(current_name_on_ballot) |>
      left_join_by_candidate(current_contact) |>
      left_join_by_candidate(current_party) |>
      left_join_by_candidate(current_most_party) |>
      left_join_by_candidate(current_contest_count) |>
      relocate(name_on_ballot, .before = first_name) |>
      relocate(starts_with("party"), .before = street) |>
      relocate(starts_with("contest"), .before = street) |>
      add_address_lookup_local(street = street, postal_code = zip_code) |>
      arrange(candidate_id)

    # Return list of tables
    list(
      cl_elections = contests,
      cl_candidates = candidates,
      cl_name_on_ballot = candidate_name_on_ballot,
      cl_contact = candidate_address,
      cl_party = candidate_party
    )
  }
  # Pick one current address, phone and email per candidate.
  #
  # Arguments:
  #   candidate_address  Table with candidate_id, election_dt, phone, email and
  #                      a contiguous run of address columns from `street`
  #                      through `zip_code`.
  #
  # Returns one row per candidate_id: the address columns `street:zip_code`
  # from the candidate's latest election, plus the most recent non-missing
  # `phone` and `email` (which may come from earlier elections).
  candidate_listing_current_contact_info <- function(candidate_address) {
    # Latest non-missing phone per candidate
    candidate_phone_current <-
      candidate_address |>
      filter(!is.na(phone)) |>
      slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
      select(candidate_id, phone)

    # Latest non-missing email per candidate
    candidate_email_current <-
      candidate_address |>
      filter(!is.na(email)) |>
      slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
      select(candidate_id, email)

    # Address from the latest election. When that election lists several
    # distinct streets, prefer one that is not a PO box.
    candidate_address_current <-
      candidate_address |>
      group_by(candidate_id) |>
      slice_max(election_dt, n = 1) |>
      select(-election_dt) |>
      filter(
        n_distinct(street) == 1 |
          !grepl("PO BOX", street) |
          # If every street is a PO box there is nothing better to prefer;
          # keep the rows so the candidate (and the phone/email joined below)
          # is not silently dropped from the result.
          all(grepl("PO BOX", street))
      ) |>
      slice_head(n = 1) |>
      select(candidate_id, street:zip_code) |>
      ungroup()

    candidate_address_current |>
      left_join(candidate_phone_current, by = "candidate_id", relationship = "one-to-one") |>
      left_join(candidate_email_current, by = "candidate_id", relationship = "one-to-one")
  }
  # No longer used: candidate deduping now relies on a more manual approach
  # built from overlapping signals. Kept because the same probabilistic-linkage
  # idea could be reused elsewhere, e.g. for deduping donors.
  #
  # Links the distinct candidate records against themselves with fastLink.
  #
  # Arguments:
  #   candidate_listing_raw  Raw candidate listing containing the name and
  #                          address columns selected below.
  #
  # Returns a list with:
  #   data     the distinct records that were linked
  #   linked   the object returned by fastLink::fastLink()
  #   matches  fastLink::getMatches() output as a tibble
  fastlink_candidate_listing <- function(candidate_listing_raw) {
    # Fields compared by the linkage model
    link_fields <- c(
      "name_on_ballot_clean",
      "first_name", "middle_name", "last_name", "name_suffix_lbl",
      "street_address", "city"
    )

    records <- distinct(
      candidate_listing_raw,
      name_on_ballot,
      first_name,
      middle_name,
      last_name,
      name_suffix_lbl,
      street_address,
      city
    )

    # Drop the first " (...)" parenthetical from the ballot name
    records <- mutate(
      records,
      name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot)
    )

    # Link the records to themselves; only middle_name uses string distance
    linkage <- fastLink::fastLink(
      records,
      records,
      varnames = link_fields,
      stringdist.match = c("middle_name"),
      stringdist.method = "jw",
      jw.weight = .25,
      threshold.match = 0.98
    )

    matched <- as_tibble(
      fastLink::getMatches(
        records,
        records,
        linkage,
        threshold.match = 0.9
      )
    )

    list(
      data = records,
      linked = linkage,
      matches = matched
    )
  }
  # This also isn't used anymore...
  #
  # Turn fastLink dedupe matches into a candidate_id -> candidate_group mapping.
  #
  # Arguments:
  #   candidate_listing_raw     Raw candidate listing.
  #   candidate_listing_dedupe  List with a `matches` table carrying the name
  #                             columns and a `dedupe.ids` column (as produced
  #                             by fastlink_candidate_listing()).
  #
  # Returns the mapping joined with the candidate name columns and
  # `contest_last`, the latest election_dt seen for each name.
  prep_candidates_dedupe_mapping <- function(
    candidate_listing_raw,
    candidate_listing_dedupe
  ) {
    # candidate names ----
    # Extract candidate names, these will be primary keys for the candidates table
    candidate_names <-
      candidate_listing_raw |>
      arrange(last_name, first_name, middle_name) |>
      distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
      mutate(candidate_id = row_number(), .before = 1)

    # Find last election ----
    candidates_last_contest <-
      candidate_listing_raw |>
      group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
      slice_max(election_dt, n = 1) |>
      distinct(contest_last = election_dt) |>
      # grouping is no longer needed once the lookup table is built
      ungroup()

    # One table per duplicate set (sets with more than one candidate_id),
    # most frequent dupe_id first
    deduped_ids <-
      candidate_listing_dedupe$matches |>
      left_join(
        candidate_names,
        by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
      ) |>
      distinct(dupe_id = dedupe.ids, candidate_id) |>
      add_count(dupe_id) |>
      filter(n > 1) |>
      select(-n) |>
      mutate(dupe_id = forcats::fct_infreq(paste(dupe_id))) |>
      arrange(dupe_id, candidate_id) |>
      group_split(dupe_id)

    # Start with every candidate in its own group
    mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id)

    for (dupes in deduped_ids) {
      # current groups of the candidates in this duplicate set
      map_group <- left_join(select(dupes, candidate_id), mapping, by = "candidate_id")
      all_ids <- union(map_group$candidate_id, map_group$candidate_group)
      # candidates already merged into any of those groups by an earlier set
      map_others <- mapping |> filter(candidate_group %in% all_ids)

      # recompute current grouping to min of all ids
      update <-
        dplyr::union(map_group, map_others) |>
        mutate(candidate_group = min(candidate_id, candidate_group))

      mapping <- rows_update(mapping, update, by = "candidate_id")
    }

    mapping |>
      left_join(candidate_names, by = "candidate_id") |>
      left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl))
  }