|
#' Collect the distinct raw addresses found across the prepared data sources
#'
#' Officer, receipt and expenditure data are always read; the candidate
#' listing and the voter file are read only when a path is supplied. The
#' per-source address tables are stacked, `address` is passed through
#' `fixup_po_box()` (defined elsewhere), and the first row seen for each
#' distinct `address` is kept.
#'
#' @param path_officers Dataset path handed to
#'   `prep_collect_addresses_raw_officers()`.
#' @param path_receipts,path_expenditures Dataset paths opened with
#'   `arrow::open_dataset()` using `sboe_id` partitioning.
#' @param path_voters Optional voter dataset path, e.g.
#'   "../data-raw/voters/ncvoter_statewide.parquet"; `NULL` skips the source.
#' @param path_candidate_listing Optional candidate listing dataset path;
#'   `NULL` skips the source. Rows with a missing `state` are dropped before
#'   addresses are built.
#' @return A data frame with one row per distinct `address`.
prep_collect_addresses_raw <- function(
  path_officers = "../data-prep/officers",
  path_receipts = "../data-prep/receipts",
  path_expenditures = "../data-prep/expenditures",
  path_voters = NULL,
  path_candidate_listing = NULL
) {
  # Receipts and expenditures share the same on-disk partitioning scheme.
  open_partitioned <- function(path) {
    arrow::open_dataset(path, partitioning = "sboe_id")
  }

  from_officers <- prep_collect_addresses_raw_officers(path_officers)
  from_receipts <-
    collect_full_addresses_from_parts(open_partitioned(path_receipts))
  from_expenditures <-
    collect_full_addresses_from_parts(open_partitioned(path_expenditures))

  # Optional sources stay NULL when no path is given; bind_rows() skips NULLs.
  from_candidates <- NULL
  if (!is.null(path_candidate_listing)) {
    candidates <- dplyr::filter(
      arrow::open_dataset(path_candidate_listing),
      !is.na(state)
    )
    from_candidates <- collect_full_addresses_from_parts(
      candidates,
      street = street_address,
      postal_code = zip_code
    )
  }

  from_voters <- NULL
  if (!is.null(path_voters)) {
    from_voters <- collect_full_addresses_from_parts(
      arrow::open_dataset(path_voters),
      street = res_street_address,
      city = res_city_desc,
      state = state_cd,
      postal_code = zip_code
    )
  }

  # Stacking order decides which source's row survives the final distinct().
  stacked <- dplyr::bind_rows(
    from_voters,
    from_candidates,
    from_receipts,
    from_expenditures,
    from_officers
  )
  stacked <- dplyr::mutate(stacked, address = fixup_po_box(address))
  dplyr::distinct(stacked, address, .keep_all = TRUE)
}
-
#' Collect distinct officer addresses and split them into parts
#'
#' Reads the officers dataset, upper-cases and de-duplicates the non-missing
#' `address` values, trims a trailing ZIP+4 suffix down to the 5-digit ZIP,
#' and derives `street`, `city`, `state` and `postal_code` columns.
#'
#' @param path_officers Dataset path opened with `arrow::open_dataset()`
#'   using `sboe_id` partitioning.
#' @return A data frame with columns `address`, `street`, `city`, `state`
#'   and `postal_code`.
prep_collect_addresses_raw_officers <- function(
  path_officers = "../data-prep/officers"
) {
  address_officers <-
    arrow::open_dataset(path_officers, partitioning = "sboe_id") |>
    dplyr::filter(!is.na(address)) |>
    dplyr::mutate(address = toupper(address)) |>
    dplyr::distinct(address) |>
    dplyr::collect() |>
    dplyr::mutate(
      # "12345-6789" at the end of the address becomes "12345".
      address = stringr::str_replace(
        address,
        "(\\d{5})-\\d{4}$",
        "\\1"
      )
    )

  # poster::parse_addr() is expected to return one row per input address with
  # at least city, state and postal_code columns.
  address_officers_parts <-
    poster::parse_addr(address_officers$address) |>
    dplyr::select(city, state, postal_code) |>
    dplyr::mutate(dplyr::across(dplyr::everything(), toupper))

  address_officers |>
    dplyr::bind_cols(address_officers_parts) |>
    dplyr::mutate(
      # The street is whatever remains once ", CITY, STATE, ZIP" is removed
      # from the full address.
      address_minus_street = paste("", city, state, postal_code, sep = ", "),
      street = stringr::str_remove(
        address,
        stringr::fixed(address_minus_street)
      )
    ) |>
    dplyr::select(-address_minus_street) |>
    dplyr::relocate(street, .before = city)
}
-
#' Add a normalised address lookup string to `df` (SQL-function variant)
#'
#' Builds one lookup string per distinct street/city/state/postal-code
#' combination with a non-missing street, then joins it back onto `df`.
#' The string is "STREET, CITY, STATE, ZIP5", upper-cased, with a missing
#' state rendered as "NC".
#'
#' `UPPER()` and `REGEXP_REPLACE()` are not R functions, so `df` is presumably
#' a lazy database table whose backend provides them - TODO confirm.
#' NOTE(review): some SQL dialects replace only the first match in
#' `REGEXP_REPLACE()` unless a global flag is given; verify for the backend
#' in use.
#'
#' @param df Table to annotate.
#' @param street,city,state,postal_code Bare column names in `df` holding the
#'   address parts.
#' @param name Name of the lookup column to add.
#' @return `df` with the extra column `name`; rows without a matching address
#'   get a missing value there.
add_address_lookup <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip,
  name = "address_lookup"
) {
  # Map df's column names (names) to the lookup table's column names (values).
  join_keys <- c("street", "city", "state", "postal_code")
  names(join_keys) <- c(
    rlang::as_name(rlang::enquo(street)),
    rlang::as_name(rlang::enquo(city)),
    rlang::as_name(rlang::enquo(state)),
    rlang::as_name(rlang::enquo(postal_code))
  )

  addresses <-
    df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = {{ postal_code }}
    ) |>
    dplyr::mutate(
      # Default the state only inside the lookup string. Overwriting the
      # `state` key itself would make a missing-state row and an "NC" row
      # identical after distinct() and duplicate rows of `df` in the join.
      !!name := REGEXP_REPLACE(
        UPPER(
          paste(
            street,
            city,
            coalesce(state, "NC"),
            substr(postal_code, 1, 5),
            sep = ", "
          )
        ),
        " +", " "
      )
    )

  # NOTE(review): database joins usually do not match missing keys, so rows
  # with a missing state/city/postal code may still get no lookup value;
  # `na_matches = "na"` would change that - confirm the intended behaviour.
  dplyr::left_join(df, addresses, by = join_keys)
}
-
#' Add a normalised address lookup string to a local data frame
#'
#' Builds one lookup string per distinct street/city/state/postal-code
#' combination with a non-missing street, then joins it back onto `df`.
#' The string is "STREET, CITY, STATE, ZIP5", upper-cased, with runs of
#' spaces collapsed to one and a missing state rendered as "NC".
#'
#' @param df Data frame to annotate.
#' @param street,city,state,postal_code Bare column names in `df` holding the
#'   address parts.
#' @param name Name of the lookup column to add.
#' @return `df` with the extra column `name`; the row count is unchanged and
#'   rows with a missing street get `NA` there.
add_address_lookup_local <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip,
  name = "address_lookup"
) {
  # Map df's column names (names) to the lookup table's column names (values).
  join_keys <- c("street", "city", "state", "postal_code")
  names(join_keys) <- c(
    rlang::as_name(rlang::enquo(street)),
    rlang::as_name(rlang::enquo(city)),
    rlang::as_name(rlang::enquo(state)),
    rlang::as_name(rlang::enquo(postal_code))
  )

  addresses <-
    df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = {{ postal_code }}
    ) |>
    dplyr::mutate(
      # Default the state only inside the lookup string. Overwriting the
      # `state` key itself would stop missing-state rows from matching and
      # would duplicate rows of `df` whenever the same address also appears
      # with state "NC".
      !!name := gsub(
        " +",
        " ",
        toupper(
          paste(
            street,
            city,
            dplyr::coalesce(state, "NC"),
            substr(postal_code, 1, 5),
            sep = ", "
          )
        )
      )
    )

  # The key columns are unique in `addresses`, so this join cannot add rows.
  dplyr::left_join(df, addresses, by = join_keys)
}
-
#' Build distinct full-address strings from separate address columns
#'
#' Keeps rows with a non-missing street, reduces them to distinct
#' street/city/state/5-character postal-code combinations, collects the
#' result, and adds an `address` column of the form
#' "STREET, CITY, STATE, ZIP5": upper-cased, runs of spaces collapsed to one,
#' a missing state rendered as "NC" and other missing parts left empty.
#'
#' @param df Data frame or lazy table/dataset that supports the dplyr verbs
#'   used here.
#' @param street,city,state,postal_code Bare column names in `df` holding the
#'   address parts.
#' @return A collected data frame with columns `address`, `street`, `city`,
#'   `state` and `postal_code`.
collect_full_addresses_from_parts <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip
) {
  df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = substr({{ postal_code }}, 1, 5)
    ) |>
    dplyr::collect() |>
    dplyr::mutate(
      # `if_else` is namespaced so the template also evaluates when dplyr is
      # not attached; `.na = ""` turns missing parts into empty strings.
      address = glue::glue(
        "{street}, {city}, {dplyr::if_else(is.na(state), 'NC', state)}, {postal_code}",
        .na = ""
      ),
      address = toupper(address),
      address = gsub(" +", " ", address)
    ) |>
    dplyr::relocate(address, .before = 1)
}
|