選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

179 行
4.6KB

  #' Gather raw addresses from every configured source into one table
  #'
  #' Reads the officers, receipts and expenditures datasets (always) and the
  #' candidate listing and voter datasets (only when a path is supplied),
  #' stacks the resulting address tables, passes `address` through
  #' `fixup_po_box()` and keeps the first row seen for each distinct address.
  #'
  #' @param path_officers Path handed to `prep_collect_addresses_raw_officers()`.
  #' @param path_receipts Path of the receipts dataset, opened with
  #'   `arrow::open_dataset()` and partitioned by `sboe_id`.
  #' @param path_expenditures Path of the expenditures dataset, opened the
  #'   same way as the receipts.
  #' @param path_voters Optional path of a voter dataset; skipped when `NULL`.
  #'   An earlier default, now disabled, was
  #'   `"../data-raw/voters/ncvoter_statewide.parquet"`.
  #' @param path_candidate_listing Optional path of a candidate listing
  #'   dataset; skipped when `NULL`.
  #' @return The stacked address tables with one row per distinct `address`.
  prep_collect_addresses_raw <- function(
    path_officers = "../data-prep/officers",
    path_receipts = "../data-prep/receipts",
    path_expenditures = "../data-prep/expenditures",
    path_voters = NULL,
    path_candidate_listing = NULL
  ) {
    from_officers <- prep_collect_addresses_raw_officers(path_officers)

    from_receipts <- collect_full_addresses_from_parts(
      arrow::open_dataset(path_receipts, partitioning = "sboe_id")
    )
    from_expenditures <- collect_full_addresses_from_parts(
      arrow::open_dataset(path_expenditures, partitioning = "sboe_id")
    )

    # Optional sources stay NULL when no path is given; bind_rows() then
    # simply ignores them.
    from_candidates <- NULL
    if (!is.null(path_candidate_listing)) {
      candidates <- dplyr::filter(
        arrow::open_dataset(path_candidate_listing),
        !is.na(state)
      )
      from_candidates <- collect_full_addresses_from_parts(
        candidates,
        street = street_address,
        postal_code = zip_code
      )
    }

    from_voters <- NULL
    if (!is.null(path_voters)) {
      from_voters <- collect_full_addresses_from_parts(
        arrow::open_dataset(path_voters),
        street = res_street_address,
        city = res_city_desc,
        state = state_cd,
        postal_code = zip_code
      )
    }

    # Stacking order decides which source's row survives the de-duplication.
    stacked <- dplyr::bind_rows(
      from_voters,
      from_candidates,
      from_receipts,
      from_expenditures,
      from_officers
    )
    stacked <- dplyr::mutate(stacked, address = fixup_po_box(address))
    dplyr::distinct(stacked, address, .keep_all = TRUE)
  }
  #' Collect distinct officer addresses and split them into parts
  #'
  #' Opens the officers dataset, upper-cases and de-duplicates the non-missing
  #' `address` values, trims a trailing ZIP+4 suffix down to the 5-digit ZIP,
  #' and parses each address with `poster::parse_addr()` to obtain `city`,
  #' `state` and `postal_code`. The street is then recovered by removing the
  #' literal ", CITY, STATE, ZIP" tail from the full address.
  #'
  #' @param path_officers Path of the officers dataset, opened with
  #'   `arrow::open_dataset()` and partitioned by `sboe_id`.
  #' @return A table with columns `address`, `street`, `city`, `state` and
  #'   `postal_code`.
  prep_collect_addresses_raw_officers <- function(
    path_officers = "../data-prep/officers"
  ) {
    address_officers <-
      arrow::open_dataset(path_officers, partitioning = "sboe_id") |>
      dplyr::filter(!is.na(address)) |>
      dplyr::mutate(address = toupper(address)) |>
      dplyr::distinct(address) |>
      dplyr::collect() |>
      dplyr::mutate(
        # Keep only the 5-digit ZIP when the address ends in ZIP+4.
        address = stringr::str_replace(
          address,
          "(\\d{5})-\\d{4}$",
          "\\1"
        )
      )

    # across()/everything() are namespace-qualified like every other dplyr
    # call here, so the function does not depend on dplyr being attached.
    address_officers_parts <-
      poster::parse_addr(address_officers$address) |>
      dplyr::select(city, state, postal_code) |>
      dplyr::mutate(dplyr::across(dplyr::everything(), toupper))

    address_officers |>
      dplyr::bind_cols(address_officers_parts) |>
      dplyr::mutate(
        # Rebuild the ", CITY, STATE, ZIP" tail and strip it as a fixed
        # (non-regex) string. If the rebuilt tail does not occur in the
        # address, nothing is removed and `street` equals the full address.
        address_minus_street = paste("", city, state, postal_code, sep = ", "),
        street = stringr::str_remove(
          address,
          stringr::fixed(address_minus_street)
        )
      ) |>
      dplyr::select(-address_minus_street) |>
      dplyr::relocate(street, .before = city)
  }
  #' Attach a normalized lookup-address column to `df`
  #'
  #' Builds one lookup string per distinct (street, city, state, postal code)
  #' combination with a non-missing street, then left-joins it back onto `df`
  #' on those four columns.
  #'
  #' `UPPER()` and `REGEXP_REPLACE()` are not R functions; presumably `df` is
  #' a lazy table whose backend translates or passes them through (for
  #' example a SQL backend) -- TODO confirm against callers.
  #'
  #' NOTE(review): `state` is replaced by `coalesce(state, "NC")` in the
  #' lookup table before the join, so rows of `df` with a missing state are
  #' joined against the key "NC" rather than their own value -- confirm this
  #' is intended.
  #'
  #' @param df Table to which the lookup column is added.
  #' @param street,city,state,postal_code Unquoted column names in `df`.
  #' @param name Name of the lookup column to create.
  #' @return `df` left-joined with the lookup column called `name`.
  add_address_lookup <- function(
    df,
    street = street_1,
    city = city,
    state = state,
    postal_code = full_zip,
    name = "address_lookup"
  ) {
    # One row per distinct address-part combination that has a street.
    unique_parts <- dplyr::distinct(
      dplyr::filter(df, !is.na({{ street }})),
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = {{ postal_code }}
    )

    # Upper-case "street, city, state, zip5" with runs of spaces collapsed.
    lookup_tbl <- dplyr::mutate(
      unique_parts,
      state = coalesce(state, "NC"),
      !!name := REGEXP_REPLACE(
        UPPER(paste(street, city, state, substr(postal_code, 1, 5), sep = ", ")),
        " +", " "
      )
    )

    part_keys <- dplyr::join_by(
      {{ street }} == street,
      {{ city }} == city,
      {{ state }} == state,
      {{ postal_code }} == postal_code
    )
    dplyr::left_join(df, lookup_tbl, by = part_keys)
  }
  #' Attach a normalized lookup-address column to a local data frame
  #'
  #' Builds one lookup string per distinct (street, city, state, postal code)
  #' combination with a non-missing street, then left-joins it back onto `df`
  #' on those four columns. The lookup string is
  #' "STREET, CITY, STATE, ZIP5": upper-cased, with runs of spaces collapsed
  #' to one, and with "NC" substituted when the state is missing.
  #'
  #' The "NC" default is applied only inside the lookup string. The `state`
  #' join key keeps its original value, so rows of `df` with a missing state
  #' still match their own lookup row. Overwriting the key with "NC" before
  #' the join left those rows without any lookup value.
  #'
  #' Missing city or postal code values are pasted as the text "NA", because
  #' `paste()` converts `NA` to a string.
  #'
  #' @param df Local data frame to which the lookup column is added.
  #' @param street,city,state,postal_code Unquoted column names in `df`.
  #' @param name Name of the lookup column to create.
  #' @return `df` with one extra column called `name`; it is `NA` for rows
  #'   whose street is missing.
  add_address_lookup_local <- function(
    df,
    street = street_1,
    city = city,
    state = state,
    postal_code = full_zip,
    name = "address_lookup"
  ) {
    addresses <-
      df |>
      dplyr::filter(!is.na({{ street }})) |>
      dplyr::distinct(
        street = {{ street }},
        city = {{ city }},
        state = {{ state }},
        postal_code = {{ postal_code }}
      ) |>
      dplyr::mutate(
        # `state` itself is left untouched so it remains a valid join key.
        !!name := gsub(
          " +", " ",
          toupper(
            paste(
              street,
              city,
              dplyr::coalesce(state, "NC"),
              substr(postal_code, 1, 5),
              sep = ", "
            )
          )
        )
      )

    dplyr::left_join(
      df,
      addresses,
      by = dplyr::join_by(
        {{ street }} == street,
        {{ city }} == city,
        {{ state }} == state,
        {{ postal_code }} == postal_code
      )
    )
  }
  #' Build distinct full-address strings from address part columns
  #'
  #' Keeps rows with a non-missing street, reduces them to the distinct
  #' combinations of street, city, state and the first five characters of the
  #' postal code, collects the result, and adds an `address` column of the
  #' form "STREET, CITY, STATE, ZIP5". The string is upper-cased and runs of
  #' spaces are collapsed to one.
  #'
  #' A missing state is written as "NC" in `address` (the `state` column
  #' itself is not changed); other missing parts are written as empty
  #' strings via glue's `.na = ""`.
  #'
  #' @param df Table to read from; anything `dplyr::filter()`,
  #'   `dplyr::distinct()` and `dplyr::collect()` accept.
  #' @param street,city,state,postal_code Unquoted column names in `df`.
  #' @return A collected table with columns `address`, `street`, `city`,
  #'   `state` and `postal_code`, `address` first.
  collect_full_addresses_from_parts <- function(
    df,
    street = street_1,
    city = city,
    state = state,
    postal_code = full_zip
  ) {
    df |>
      dplyr::filter(!is.na({{ street }})) |>
      dplyr::distinct(
        street = {{ street }},
        city = {{ city }},
        state = {{ state }},
        postal_code = substr({{ postal_code }}, 1, 5)
      ) |>
      dplyr::collect() |>
      dplyr::mutate(
        # if_else() is namespace-qualified so the template also evaluates
        # when dplyr is installed but not attached.
        address = glue::glue(
          paste0(
            "{street}, {city}, ",
            "{dplyr::if_else(is.na(state), 'NC', state)}, {postal_code}"
          ),
          .na = ""
        ),
        address = toupper(address),
        address = gsub(" +", " ", address)
      ) |>
      dplyr::relocate(address, .before = 1)
  }