選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

355 行
11KB

  1. fastlink_candidate_listing <- function(candidate_listing_raw) {
  2. data <-
  3. candidate_listing_raw |>
  4. distinct(
  5. name_on_ballot,
  6. first_name,
  7. middle_name,
  8. last_name,
  9. name_suffix_lbl,
  10. street_address,
  11. city
  12. ) |>
  13. mutate(
  14. name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot)
  15. )
  16. linked <- fastLink::fastLink(
  17. data,
  18. data,
  19. varnames = c(
  20. "name_on_ballot_clean",
  21. "first_name", "middle_name", "last_name", "name_suffix_lbl",
  22. "street_address", "city"
  23. ),
  24. stringdist.match = c("middle_name"),
  25. stringdist.method = "jw",
  26. jw.weight = .25,
  27. threshold.match = 0.98
  28. )
  29. matches <-
  30. fastLink::getMatches(
  31. data,
  32. data,
  33. linked,
  34. threshold.match = 0.9
  35. ) |>
  36. as_tibble()
  37. list(
  38. data = data,
  39. linked = linked,
  40. matches = matches
  41. )
  42. }
  43. prep_candidates_dedupe_mapping <- function(
  44. candidate_listing_raw,
  45. candidate_listing_dedupe
  46. ) {
  47. # candidate names ----
  48. # Extract candidate names, these will be primary keys for the candidates table
  49. candidate_names <-
  50. candidate_listing_raw |>
  51. arrange(last_name, first_name, middle_name) |>
  52. distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
  53. mutate(candidate_id = row_number(), .before = 1)
  54. # Find last election ----
  55. candidates_last_contest <-
  56. candidate_listing_raw |>
  57. group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
  58. slice_max(election_dt, n = 1) |>
  59. distinct(contest_last = election_dt)
  60. deduped_ids <-
  61. candidate_listing_dedupe$matches |>
  62. left_join(
  63. candidate_names,
  64. by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
  65. ) |>
  66. distinct(dupe_id = dedupe.ids, candidate_id) |>
  67. add_count(dupe_id) |>
  68. filter(n > 1) |>
  69. select(-n) |>
  70. mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
  71. arrange(dupe_id, candidate_id) |>
  72. group_split(dupe_id)
  73. mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id)
  74. for (dupes in deduped_ids) {
  75. map_group <- left_join(dupes[-1], mapping, by = "candidate_id")
  76. all_ids <- union(map_group$candidate_id, map_group$candidate_group)
  77. map_others <- mapping |> filter(candidate_group %in% all_ids)
  78. browser(expr = nrow(map_group) < nrow(map_others))
  79. # recompute current grouping to min of all ids
  80. update <-
  81. dplyr::union(map_group, map_others) |>
  82. mutate(candidate_group = min(candidate_id, candidate_group))
  83. mapping <- rows_update(mapping, update, by = "candidate_id")
  84. }
  85. mapping |>
  86. left_join(candidate_names, by = "candidate_id") |>
  87. left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl))
  88. }
  89. prep_dedupe_candidates <- function(candidate_listing_raw) {
  90. candidate_names <-
  91. candidate_listing_raw |>
  92. arrange(last_name, first_name, middle_name) |>
  93. distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |>
  94. mutate(candidate_id = row_number(), .before = 1)
  95. # first, last, street_address ----
  96. f_l_street <-
  97. candidate_names |>
  98. group_by(first_name, last_name, street_address) |>
  99. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  100. ungroup()
  101. # first, last, cleaned contest name
  102. f_l_cleaned_contest <-
  103. candidate_names |>
  104. mutate(
  105. contest_name_clean = if_else(
  106. !is.na(city) & map2_lgl(city, contest_name, \(c, cn) grepl(paste("TOWN OF", c), cn)),
  107. map2_chr(city, contest_name, \(c, cn) sub(paste("TOWN OF", c, ""), "", cn)),
  108. contest_name
  109. ),
  110. contest_name_clean = sub("([A-Z]+ ){1,2}COUNTY ", "", contest_name_clean)
  111. ) |>
  112. group_by(first_name, last_name, contest_name_clean) |>
  113. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  114. ungroup()
  115. # first, middle, last, county_name
  116. f_m_l_county <-
  117. candidate_names |>
  118. group_by(first_name, middle_name, last_name, county_name) |>
  119. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  120. ungroup()
  121. # first, middle initial, last, county_name
  122. f_mi_l_county <-
  123. candidate_names |>
  124. mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |>
  125. group_by(first_name, middle_name, last_name, county_name) |>
  126. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  127. ungroup()
  128. # name_on_ballot, middle_name
  129. nb_m <-
  130. candidate_names |>
  131. mutate(
  132. name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot),
  133. name_ballot_clean = gsub("[.]", "", name_ballot_clean),
  134. ) |>
  135. group_by(name_ballot_clean, middle_name) |>
  136. mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
  137. ungroup()
  138. candidate_names |>
  139. dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |>
  140. dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |>
  141. dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |>
  142. dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |>
  143. dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name")
  144. }
  145. prep_candidate_listing <- function(
  146. candidate_listing_raw,
  147. candidate_listing_dedupe
  148. ) {
  149. # Give candidates a unique id from dedupe mapping
  150. cl_dedupe <-
  151. candidate_listing_dedupe |>
  152. group_by(candidate_group) |>
  153. mutate(candidate_id = cur_group_id()) |>
  154. ungroup() |>
  155. select(-candidate_group)
  156. # cl raw + candidate_id ----
  157. cl_raw <-
  158. candidate_listing_raw |>
  159. left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id"))
  160. # candidate_names ----
  161. candidate_names <-
  162. cl_raw |>
  163. distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |>
  164. arrange(candidate_id, election_dt)
  165. # candidate_name_on_ballot ----
  166. candidate_name_on_ballot <-
  167. cl_raw |>
  168. distinct(candidate_id, election_dt, name_on_ballot) |>
  169. arrange(candidate_id, election_dt)
  170. # candidate_address ----
  171. candidate_address <-
  172. cl_raw |>
  173. mutate(
  174. phone = coalesce(phone, office_phone, business_phone),
  175. street_address = toupper(street_address),
  176. street_address = stringr::str_replace_all(street_address, " +", " "),
  177. street_address = trimws(street_address),
  178. ) |>
  179. select(-office_phone, -business_phone) |>
  180. distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |>
  181. add_address_lookup_local(street = street, postal_code = zip_code) |>
  182. # remove rows that are entirely empty except for candidate_id
  183. filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |>
  184. arrange(candidate_id, election_dt)
  185. # candidate_party ----
  186. candidate_party <-
  187. cl_raw |>
  188. distinct(candidate_id, election_dt, party_candidate) |>
  189. mutate(party_candidate = forcats::fct_inorder(party_candidate)) |>
  190. filter(!is.na(party_candidate)) |>
  191. arrange(candidate_id, election_dt)
  192. # Extract contests (remaining data in candidate_listing) ----
  193. cols_related <- setdiff(
  194. c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)),
  195. c("election_dt", "candidate_id")
  196. )
  197. contests <-
  198. cl_raw |>
  199. select(election_dt:name_suffix_lbl, candidate_id) |>
  200. select(-any_of(cols_related)) |>
  201. relocate(candidate_id, .before = first_name)
  202. # Get current candidate information ----
  203. candidate_best_name <-
  204. candidate_names |>
  205. mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |>
  206. arrange(desc(n_char), desc(election_dt)) |>
  207. slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |>
  208. select(-n_char, -election_dt)
  209. current_contact <-
  210. candidate_listing_current_contact_info(candidate_address)
  211. current_name_on_ballot <-
  212. candidate_name_on_ballot |>
  213. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  214. select(-election_dt)
  215. current_party <-
  216. candidate_party |>
  217. filter(!is.na(party_candidate)) |>
  218. group_by(candidate_id) |>
  219. slice_max(election_dt, n = 1) |>
  220. arrange(party_candidate) |>
  221. slice_head(n = 1) |>
  222. ungroup() |>
  223. select(-election_dt) |>
  224. rename(party_last = party_candidate)
  225. current_most_party <-
  226. candidate_party |>
  227. filter(!is.na(party_candidate)) |>
  228. group_by(candidate_id) |>
  229. count(party_candidate) |>
  230. slice_max(n, n = 1) |>
  231. arrange(party_candidate) |>
  232. slice_head(n = 1) |>
  233. ungroup() |>
  234. select(-n) |>
  235. rename(party_most = party_candidate)
  236. current_contest_count <-
  237. contests |>
  238. group_by(candidate_id) |>
  239. distinct(candidate_id, election_dt, contest_name) |>
  240. summarize(
  241. contest_n = n(),
  242. contest_first = min(election_dt),
  243. contest_latest = max(election_dt)
  244. )
  245. # Join candidates into one big table ----
  246. left_join_by_candidate <- function(x, y) {
  247. left_join(x, y, join_by(candidate_id), relationship = "one-to-one")
  248. }
  249. candidates <-
  250. candidate_best_name |>
  251. left_join_by_candidate(current_name_on_ballot) |>
  252. left_join_by_candidate(current_contact) |>
  253. left_join_by_candidate(current_party) |>
  254. left_join_by_candidate(current_most_party) |>
  255. left_join_by_candidate(current_contest_count) |>
  256. relocate(name_on_ballot, .before = first_name) |>
  257. relocate(starts_with("party"), .before = street) |>
  258. relocate(starts_with("contest"), .before = street) |>
  259. add_address_lookup_local(street = street, postal_code = zip_code)
  260. # Return list of tables
  261. list(
  262. cl_elections = contests,
  263. cl_candidates = candidates,
  264. cl_name_on_ballot = candidate_name_on_ballot,
  265. cl_contact = candidate_address,
  266. cl_party = candidate_party
  267. )
  268. }
  269. extract_candidate_info <- function(
  270. candidate_listing_contest,
  271. candidate_names,
  272. info_vars
  273. ) {
  274. candidate_listing_contest |>
  275. select(first_name:name_suffix_lbl, election_dt, all_of(info_vars)) |>
  276. distinct() |>
  277. left_join(
  278. candidate_names,
  279. by = c("first_name", "middle_name", "last_name", "name_suffix_lbl")
  280. ) |>
  281. select(-first_name, -middle_name, -last_name, -name_suffix_lbl) |>
  282. relocate(candidate_id, .before = 1) |>
  283. distinct() |>
  284. arrange(candidate_id, election_dt, !!!rlang::syms(info_vars))
  285. }
  286. candidate_listing_current_contact_info <- function(candidate_address) {
  287. candidate_phone_current <-
  288. candidate_address |>
  289. filter(!is.na(phone)) |>
  290. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  291. select(candidate_id, phone)
  292. candidate_email_current <-
  293. candidate_address |>
  294. filter(!is.na(email)) |>
  295. slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
  296. select(candidate_id, email)
  297. candidate_address_current <-
  298. candidate_address |>
  299. group_by(candidate_id) |>
  300. slice_max(election_dt, n = 1) |>
  301. select(-election_dt) |>
  302. filter(
  303. n_distinct(street) == 1 | !grepl("PO BOX", street)
  304. ) |>
  305. slice_head(n = 1) |>
  306. select(candidate_id, street:zip_code) |>
  307. ungroup()
  308. candidate_address_current |>
  309. left_join(candidate_phone_current, by = "candidate_id", relationship = "one-to-one") |>
  310. left_join(candidate_email_current, by = "candidate_id", relationship = "one-to-one")
  311. }