Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

341 lines
11KB

  1. prep_dedupe_candidates <- function(candidate_listing_raw) {
  2. candidate_names <-
  3. candidate_listing_raw |>
  4. arrange(last_name, first_name, middle_name) |>
  5. distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |>
  6. mutate(candidate_id = row_number(), .before = 1)
  7. # first, last, street_address ----
  8. f_l_street <-
  9. candidate_names |>
  10. group_by(first_name, last_name, street_address) |>
  11. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  12. ungroup()
  13. # first, last, cleaned contest name
  14. f_l_cleaned_contest <-
  15. candidate_names |>
  16. group_by(first_name, last_name, contest_name) |>
  17. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  18. ungroup()
  19. # first, middle, last, county_name
  20. f_m_l_county <-
  21. candidate_names |>
  22. group_by(first_name, middle_name, last_name, county_name) |>
  23. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  24. ungroup()
  25. # first, middle initial, last, county_name
  26. f_mi_l_county <-
  27. candidate_names |>
  28. mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |>
  29. group_by(first_name, middle_name, last_name, county_name) |>
  30. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  31. ungroup()
  32. # name_on_ballot, middle_name
  33. nb_m <-
  34. candidate_names |>
  35. mutate(
  36. name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot),
  37. name_ballot_clean = gsub("[.]", "", name_ballot_clean),
  38. ) |>
  39. group_by(name_ballot_clean, middle_name) |>
  40. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  41. ungroup()
  42. candidate_names |>
  43. dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |>
  44. dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |>
  45. dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |>
  46. dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |>
  47. dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name")
  48. }
  49. prep_candidate_listing <- function(
  50. candidate_listing_raw,
  51. candidate_listing_dedupe
  52. ) {
  53. # Give candidates a unique id from dedupe mapping
  54. cl_dedupe <-
  55. candidate_listing_dedupe |>
  56. group_by(candidate_group) |>
  57. mutate(candidate_id = cur_group_id()) |>
  58. ungroup() |>
  59. select(-candidate_group)
  60. # cl raw + candidate_id ----
  61. cl_raw <-
  62. candidate_listing_raw |>
  63. left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id"))
  64. # candidate_names ----
  65. candidate_names <-
  66. cl_raw |>
  67. distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |>
  68. arrange(candidate_id, election_dt)
  69. # candidate_name_on_ballot ----
  70. candidate_name_on_ballot <-
  71. cl_raw |>
  72. distinct(candidate_id, election_dt, name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
  73. arrange(candidate_id, election_dt)
  74. # candidate_address ----
  75. candidate_address <-
  76. cl_raw |>
  77. mutate(
  78. phone = coalesce(phone, office_phone, business_phone),
  79. street_address = toupper(street_address),
  80. street_address = stringr::str_replace_all(street_address, " +", " "),
  81. street_address = trimws(street_address),
  82. ) |>
  83. select(-office_phone, -business_phone) |>
  84. distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |>
  85. add_address_lookup_local(street = street, postal_code = zip_code) |>
  86. # remove rows that are entirely empty except for candidate_id
  87. filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |>
  88. arrange(candidate_id, election_dt)
  89. # candidate_party ----
  90. candidate_party <-
  91. cl_raw |>
  92. distinct(candidate_id, election_dt, party_candidate) |>
  93. mutate(party_candidate = forcats::fct_inorder(party_candidate)) |>
  94. filter(!is.na(party_candidate)) |>
  95. arrange(candidate_id, election_dt)
  96. # Extract contests (remaining data in candidate_listing) ----
  97. cols_related <- setdiff(
  98. c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)),
  99. c("election_dt", colnames(candidate_names))
  100. )
  101. contests <-
  102. cl_raw |>
  103. select(election_dt:name_suffix_lbl, candidate_id) |>
  104. select(-any_of(cols_related)) |>
  105. relocate(candidate_id, .before = first_name)
  106. # Get current candidate information ----
  107. candidate_best_name <-
  108. candidate_names |>
  109. mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |>
  110. arrange(desc(n_char), desc(election_dt)) |>
  111. slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |>
  112. select(-n_char, -election_dt)
  113. current_contact <-
  114. candidate_listing_current_contact_info(candidate_address)
  115. candidate_contest_count <-
  116. cl_raw |>
  117. distinct(candidate_id, election_dt, contest_name) |>
  118. slice_max(election_dt, by = candidate_id, n = 1)
  119. current_name_on_ballot <-
  120. candidate_name_on_ballot |>
  121. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  122. select(candidate_id, name_on_ballot)
  123. current_party <-
  124. candidate_party |>
  125. filter(!is.na(party_candidate)) |>
  126. group_by(candidate_id) |>
  127. slice_max(election_dt, n = 1) |>
  128. arrange(party_candidate) |>
  129. slice_head(n = 1) |>
  130. ungroup() |>
  131. select(-election_dt) |>
  132. rename(party_last = party_candidate)
  133. current_most_party <-
  134. candidate_party |>
  135. filter(!is.na(party_candidate)) |>
  136. group_by(candidate_id) |>
  137. count(party_candidate) |>
  138. slice_max(n, n = 1) |>
  139. arrange(party_candidate) |>
  140. slice_head(n = 1) |>
  141. ungroup() |>
  142. select(-n) |>
  143. rename(party_most = party_candidate)
  144. current_contest_count <-
  145. contests |>
  146. group_by(candidate_id) |>
  147. distinct(candidate_id, election_dt, contest_name) |>
  148. summarize(
  149. contest_n = n(),
  150. contest_first = min(election_dt),
  151. contest_latest = max(election_dt)
  152. )
  153. # Join candidates into one big table ----
  154. left_join_by_candidate <- function(x, y) {
  155. left_join(x, y, join_by(candidate_id), relationship = "one-to-one")
  156. }
  157. candidates <-
  158. candidate_best_name |>
  159. left_join_by_candidate(current_name_on_ballot) |>
  160. left_join_by_candidate(current_contact) |>
  161. left_join_by_candidate(current_party) |>
  162. left_join_by_candidate(current_most_party) |>
  163. left_join_by_candidate(current_contest_count) |>
  164. relocate(name_on_ballot, .before = first_name) |>
  165. relocate(starts_with("party"), .before = street) |>
  166. relocate(starts_with("contest"), .before = street) |>
  167. add_address_lookup_local(street = street, postal_code = zip_code) |>
  168. arrange(candidate_id)
  169. # Return list of tables
  170. list(
  171. cl_elections = contests,
  172. cl_candidates = candidates,
  173. cl_name_on_ballot = candidate_name_on_ballot,
  174. cl_contact = candidate_address,
  175. cl_party = candidate_party
  176. )
  177. }
  178. candidate_listing_current_contact_info <- function(candidate_address) {
  179. candidate_phone_current <-
  180. candidate_address |>
  181. filter(!is.na(phone)) |>
  182. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  183. select(candidate_id, phone)
  184. candidate_email_current <-
  185. candidate_address |>
  186. filter(!is.na(email)) |>
  187. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  188. select(candidate_id, email)
  189. candidate_address_current <-
  190. candidate_address |>
  191. group_by(candidate_id) |>
  192. slice_max(election_dt, n = 1) |>
  193. select(-election_dt) |>
  194. filter(
  195. n_distinct(street) == 1 | !grepl("PO BOX", street)
  196. ) |>
  197. slice_head(n = 1) |>
  198. select(candidate_id, street:zip_code) |>
  199. ungroup()
  200. candidate_address_current |>
  201. left_join(candidate_phone_current, by = "candidate_id", relationship = "one-to-one") |>
  202. left_join(candidate_email_current, by = "candidate_id", relationship = "one-to-one")
  203. }
  204. # This function isn't used anymore -- I opted for a more manual approach of
  205. # using overlapping signals. But this general idea could work in other places,
  206. # e.g. for deduping donors.
  207. fastlink_candidate_listing <- function(candidate_listing_raw) {
  208. data <-
  209. candidate_listing_raw |>
  210. distinct(
  211. name_on_ballot,
  212. first_name,
  213. middle_name,
  214. last_name,
  215. name_suffix_lbl,
  216. street_address,
  217. city
  218. ) |>
  219. mutate(
  220. name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot)
  221. )
  222. linked <- fastLink::fastLink(
  223. data,
  224. data,
  225. varnames = c(
  226. "name_on_ballot_clean",
  227. "first_name", "middle_name", "last_name", "name_suffix_lbl",
  228. "street_address", "city"
  229. ),
  230. stringdist.match = c("middle_name"),
  231. stringdist.method = "jw",
  232. jw.weight = .25,
  233. threshold.match = 0.98
  234. )
  235. matches <-
  236. fastLink::getMatches(
  237. data,
  238. data,
  239. linked,
  240. threshold.match = 0.9
  241. ) |>
  242. as_tibble()
  243. list(
  244. data = data,
  245. linked = linked,
  246. matches = matches
  247. )
  248. }
  249. # This also isn't used anymore...
  250. prep_candidates_dedupe_mapping <- function(
  251. candidate_listing_raw,
  252. candidate_listing_dedupe
  253. ) {
  254. # candidate names ----
  255. # Extract candidate names, these will be primary keys for the candidates table
  256. candidate_names <-
  257. candidate_listing_raw |>
  258. arrange(last_name, first_name, middle_name) |>
  259. distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
  260. mutate(candidate_id = row_number(), .before = 1)
  261. # Find last election ----
  262. candidates_last_contest <-
  263. candidate_listing_raw |>
  264. group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
  265. slice_max(election_dt, n = 1) |>
  266. distinct(contest_last = election_dt)
  267. deduped_ids <-
  268. candidate_listing_dedupe$matches |>
  269. left_join(
  270. candidate_names,
  271. by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
  272. ) |>
  273. distinct(dupe_id = dedupe.ids, candidate_id) |>
  274. add_count(dupe_id) |>
  275. filter(n > 1) |>
  276. select(-n) |>
  277. mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
  278. arrange(dupe_id, candidate_id) |>
  279. group_split(dupe_id)
  280. mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id)
  281. for (dupes in deduped_ids) {
  282. map_group <- left_join(dupes[-1], mapping, by = "candidate_id")
  283. all_ids <- union(map_group$candidate_id, map_group$candidate_group)
  284. map_others <- mapping |> filter(candidate_group %in% all_ids)
  285. browser(expr = nrow(map_group) < nrow(map_others))
  286. # recompute current grouping to min of all ids
  287. update <-
  288. dplyr::union(map_group, map_others) |>
  289. mutate(candidate_group = min(candidate_id, candidate_group))
  290. mapping <- rows_update(mapping, update, by = "candidate_id")
  291. }
  292. mapping |>
  293. left_join(candidate_names, by = "candidate_id") |>
  294. left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl))
  295. }