# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

# 195 lines
# 5.9KB

  # Split the raw candidate listing into contest and candidate tables.
  #
  # Opens the data set with `prep_open_dataset_db()` on the directory that
  # contains `path_candidate_listing_raw`, keeps rows whose `name_on_ballot` is
  # not "No Preference", and replaces missing name parts with "". Each distinct
  # (first, middle, last, suffix) name combination gets a generated
  # `candidate_id`, which is then used to key the per-candidate tables.
  #
  # Args:
  #   path_candidate_listing_raw: Path to the raw candidate listing. Only its
  #     parent directory is handed to `prep_open_dataset_db()`.
  #
  # Returns:
  #   A named list of collected tables: `elections` (the contest rows),
  #   `candidates` (one row per candidate with latest ballot name, current
  #   contact details, party and contest summaries), `candidate_name_on_ballot`,
  #   `candidate_contact` and `candidate_party`.
  prep_candidate_listing <- function(
    path_candidate_listing_raw
  ) {
    # The table stays lazy (dbplyr verbs and raw SQL functions are used on it
    # below) until each derived table is collect()ed.
    candidate_listing_contest <-
      prep_open_dataset_db(fs::path_dir(path_candidate_listing_raw)) |>
      filter(name_on_ballot != "No Preference") |>
      tidyr::replace_na(list(
        first_name = "",
        middle_name = "",
        last_name = "",
        name_suffix_lbl = ""
      ))

    # candidate names ----
    # Extract candidate names, these will be primary keys for the candidates
    # table. The window order covers all four name columns: this lazy query is
    # re-evaluated by every pipeline below, so row_number() must not have ties
    # (e.g. same name with and without a suffix) or the ids could differ
    # between the output tables.
    candidate_names <-
      candidate_listing_contest |>
      dbplyr::window_order(
        last_name, first_name, middle_name, name_suffix_lbl
      ) |>
      distinct(first_name, middle_name, last_name, name_suffix_lbl) |>
      mutate(candidate_id = row_number(), .before = 1)

    # candidate_name_on_ballot ----
    candidate_name_on_ballot <-
      extract_candidate_info(
        candidate_listing_contest,
        candidate_names,
        info_vars = c("name_on_ballot")
      ) |>
      collect()

    # candidate_address ----
    # Normalise the street (upper case, collapse runs of spaces, trim) and fold
    # the three phone columns into one, preferring `phone`.
    candidate_address <-
      candidate_listing_contest |>
      mutate(
        phone = coalesce(phone, office_phone, business_phone),
        street_address = toupper(street_address),
        street_address = REGEXP_REPLACE(street_address, " +", " ", "g"),
        street_address = trimws(street_address)
      ) |>
      select(-office_phone, -business_phone) |>
      extract_candidate_info(
        candidate_names,
        info_vars = c(
          "street_address", "city", "state", "zip_code", "phone", "email"
        )
      ) |>
      collect() |>
      rename(street = street_address)

    # candidate_party ----
    candidate_party <-
      candidate_listing_contest |>
      extract_candidate_info(
        candidate_names,
        info_vars = c("party_candidate")
      ) |>
      collect() |>
      mutate(party_candidate = forcats::fct_inorder(party_candidate))

    # Extract contests (remaining data in candidate_listing) ----
    cols_candidate_id <- intersect(
      colnames(candidate_names),
      colnames(candidate_listing_contest)
    )
    # Columns already stored in the per-candidate tables; `election_dt` is kept
    # because contests need it.
    # NOTE(review): `candidate_address` was renamed to `street`, so the original
    # `street_address` column is not listed here -- confirm it lies outside the
    # `election_dt:name_suffix_lbl` range selected below.
    cols_related <- setdiff(
      c(
        colnames(candidate_name_on_ballot),
        colnames(candidate_address),
        colnames(candidate_party)
      ),
      "election_dt"
    )
    contests <-
      candidate_listing_contest |>
      select(election_dt:name_suffix_lbl) |>
      select(-any_of(cols_related)) |>
      left_join(candidate_names, by = cols_candidate_id) |>
      relocate(candidate_id, .before = first_name) |>
      collect()

    # Get current complete contact information ----
    candidate_contact_current <-
      candidate_listing_current_contact_info(candidate_address)

    # Per-candidate lookups joined into `candidates` below ----
    # Ballot name from the latest election. `with_ties = FALSE` keeps exactly
    # one row when a candidate used several ballot names on that date;
    # otherwise the duplicates would abort the one-to-one join.
    name_on_ballot_latest <-
      candidate_name_on_ballot |>
      slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
      select(-election_dt)

    # Party at the latest election; ties resolved by factor level order.
    party_last <-
      candidate_party |>
      filter(!is.na(party_candidate)) |>
      group_by(candidate_id) |>
      slice_max(election_dt, n = 1) |>
      arrange(party_candidate) |>
      slice_head(n = 1) |>
      ungroup() |>
      select(-election_dt) |>
      rename(party_last = party_candidate)

    # Most frequently listed party; ties resolved by factor level order.
    party_most <-
      candidate_party |>
      filter(!is.na(party_candidate)) |>
      group_by(candidate_id) |>
      count(party_candidate) |>
      slice_max(n, n = 1) |>
      arrange(party_candidate) |>
      slice_head(n = 1) |>
      ungroup() |>
      select(-n) |>
      rename(party_most = party_candidate)

    # Number of distinct (election, contest) pairs plus first/latest date.
    contest_summary <-
      contests |>
      group_by(candidate_id) |>
      distinct(candidate_id, election_dt, contest_name) |>
      summarize(
        contest_n = n(),
        contest_first = min(election_dt),
        contest_latest = max(election_dt)
      )

    # Join candidates into one big table ----
    candidates <-
      candidate_names |>
      collect() |>
      left_join(
        name_on_ballot_latest,
        by = "candidate_id",
        relationship = "one-to-one"
      ) |>
      left_join(
        candidate_contact_current,
        by = "candidate_id",
        relationship = "one-to-one"
      ) |>
      left_join(
        party_last,
        by = "candidate_id",
        relationship = "one-to-one"
      ) |>
      left_join(
        party_most,
        by = "candidate_id",
        relationship = "one-to-one"
      ) |>
      left_join(
        contest_summary,
        by = "candidate_id",
        relationship = "one-to-one"
      ) |>
      relocate(name_on_ballot, .before = first_name) |>
      relocate(starts_with("party"), .before = street) |>
      relocate(starts_with("contest"), .before = street) |>
      add_address_lookup(street = street, postal_code = zip_code)

    # Return list of tables
    list(
      elections = contests,
      candidates = candidates,
      candidate_name_on_ballot = candidate_name_on_ballot,
      candidate_contact = candidate_address,
      candidate_party = candidate_party
    )
  }
  # Build a per-candidate, per-election table of the requested columns.
  #
  # Keeps the name columns, `election_dt` and `info_vars` from the listing,
  # looks up `candidate_id` by the four name columns, drops the name columns
  # and returns the distinct rows sorted by candidate, election date and then
  # the info columns.
  #
  # Args:
  #   candidate_listing_contest: Candidate listing table containing the columns
  #     `first_name:name_suffix_lbl`, `election_dt` and every `info_vars` entry.
  #   candidate_names: Table mapping `first_name`, `middle_name`, `last_name`
  #     and `name_suffix_lbl` to `candidate_id`.
  #   info_vars: Character vector of column names to extract.
  #
  # Returns:
  #   A table with `candidate_id` first, then `election_dt` and the `info_vars`
  #   columns (not collected here; callers collect the result).
  extract_candidate_info <- function(
    candidate_listing_contest,
    candidate_names,
    info_vars
  ) {
    candidate_listing_contest |>
      select(first_name:name_suffix_lbl, election_dt, all_of(info_vars)) |>
      distinct() |>
      left_join(
        candidate_names,
        by = c("first_name", "middle_name", "last_name", "name_suffix_lbl")
      ) |>
      select(-first_name, -middle_name, -last_name, -name_suffix_lbl) |>
      relocate(candidate_id, .before = 1) |>
      # Different name spellings can collapse to identical rows once the name
      # columns are gone, hence the second distinct().
      distinct() |>
      # Splice the column names in as symbols. `!!info_vars` would inject the
      # character vector itself, i.e. sort by a constant instead of by the
      # columns it names.
      arrange(candidate_id, election_dt, !!!lapply(info_vars, as.name))
  }
  # Pick one current address, phone and email per candidate.
  #
  # Phone and email are each taken from the most recent election in which the
  # value is not missing. The address is taken from the candidate's most recent
  # election overall, preferring a street that is not a "PO BOX" when that
  # election lists more than one.
  #
  # Args:
  #   candidate_address: Table with `candidate_id`, `election_dt`, `phone`,
  #     `email` and the address columns `street` through `zip_code`.
  #
  # Returns:
  #   One row per candidate with `candidate_id`, the address columns, `phone`
  #   and `email` (`NA` where no value was ever recorded).
  candidate_listing_current_contact_info <- function(candidate_address) {
    candidate_phone_current <-
      candidate_address |>
      filter(!is.na(phone)) |>
      slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
      select(candidate_id, phone)

    candidate_email_current <-
      candidate_address |>
      filter(!is.na(email)) |>
      slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
      select(candidate_id, email)

    candidate_address_current <-
      candidate_address |>
      group_by(candidate_id) |>
      slice_max(election_dt, n = 1) |>
      select(-election_dt) |>
      # Prefer non-"PO BOX" streets, but keep the PO boxes when the candidate
      # has nothing else. Testing `n_distinct(street) == 1` instead removed
      # candidates with several distinct PO boxes entirely, which also dropped
      # their phone and email from the joins below.
      filter(
        !grepl("PO BOX", street) | all(grepl("PO BOX", street))
      ) |>
      slice_head(n = 1) |>
      select(candidate_id, street:zip_code) |>
      ungroup()

    candidate_address_current |>
      left_join(
        candidate_phone_current,
        by = "candidate_id",
        relationship = "one-to-one"
      ) |>
      left_join(
        candidate_email_current,
        by = "candidate_id",
        relationship = "one-to-one"
      )
  }