garrick
/
nc-campaign-finance


			
				
					
						
						
							
							prep_collect_addresses_raw <- function(
  path_officers = "../data-prep/officers",
  path_receipts = "../data-prep/receipts",
  path_expenditures = "../data-prep/expenditures",
  # path_voters = "../data-raw/voters/ncvoter_statewide.parquet"
  path_voters = NULL,
  path_candidate_listing = NULL
) {
  # Gather addresses from every available source into one data frame with a
  # single row per distinct `address` value.
  #
  # Arguments:
  #   path_officers, path_receipts, path_expenditures: dataset locations,
  #     opened with `sboe_id` partitioning.
  #   path_voters, path_candidate_listing: optional dataset locations; a
  #     source whose path is `NULL` is skipped.
  #
  # Returns the row-bound sources after `fixup_po_box()` (defined elsewhere)
  # has been applied to `address` and duplicates on `address` were dropped.

  # Receipts and expenditures are opened the same way.
  open_partitioned <- function(path) {
    arrow::open_dataset(path, partitioning = "sboe_id")
  }

  officers <- prep_collect_addresses_raw_officers(path_officers)
  receipts <- collect_full_addresses_from_parts(
    open_partitioned(path_receipts)
  )
  expenditures <- collect_full_addresses_from_parts(
    open_partitioned(path_expenditures)
  )

  # Optional sources stay NULL when no path is given; `bind_rows()` below
  # ignores NULL inputs.
  candidate_listing <- NULL
  if (!is.null(path_candidate_listing)) {
    candidate_listing <- collect_full_addresses_from_parts(
      dplyr::filter(
        arrow::open_dataset(path_candidate_listing),
        !is.na(state)
      ),
      street = street_address,
      postal_code = zip_code
    )
  }

  voters <- NULL
  if (!is.null(path_voters)) {
    voters <- collect_full_addresses_from_parts(
      arrow::open_dataset(path_voters),
      street = res_street_address,
      city = res_city_desc,
      state = state_cd,
      postal_code = zip_code
    )
  }

  # Source order matters: `distinct()` keeps the first row seen per address.
  combined <- dplyr::bind_rows(
    voters,
    candidate_listing,
    receipts,
    expenditures,
    officers
  )
  combined <- dplyr::mutate(combined, address = fixup_po_box(address))
  dplyr::distinct(combined, address, .keep_all = TRUE)
}

#' Collect distinct officer addresses and split them into parts
#'
#' Opens the officers dataset (partitioned by `sboe_id`), upper-cases the
#' non-missing `address` values, keeps the distinct ones and strips a trailing
#' ZIP+4 suffix down to the five-digit ZIP. The addresses are then parsed with
#' `poster::parse_addr()` to obtain `city`, `state` and `postal_code`, and the
#' street is derived by removing the literal ", CITY, STATE, ZIP" text from
#' the full address.
#'
#' @param path_officers Location of the officers dataset.
#' @return A data frame with the columns `address`, `street`, `city`, `state`
#'   and `postal_code`, in that order.
prep_collect_addresses_raw_officers <- function(
  path_officers = "../data-prep/officers"
) {
  officers_ds <- arrow::open_dataset(path_officers, partitioning = "sboe_id")

  unique_addresses <-
    officers_ds |>
    dplyr::filter(!is.na(address)) |>
    dplyr::mutate(address = toupper(address)) |>
    dplyr::distinct(address) |>
    dplyr::collect()

  # Reduce a trailing "12345-6789" to "12345".
  unique_addresses$address <- stringr::str_replace(
    unique_addresses$address,
    "(\\d{5})-\\d{4}$",
    "\\1"
  )

  parsed <- poster::parse_addr(unique_addresses$address)
  parts <- dplyr::select(parsed, city, state, postal_code)
  parts <- dplyr::mutate(parts, dplyr::across(dplyr::everything(), toupper))

  # The street is whatever remains once the ", CITY, STATE, ZIP" text
  # (matched literally, not as a regex) is removed from the full address.
  dplyr::mutate(
    dplyr::bind_cols(unique_addresses, parts),
    street = stringr::str_remove(
      address,
      stringr::fixed(paste("", city, state, postal_code, sep = ", "))
    ),
    .before = city
  )
}

#' Add a normalised address lookup column to a table
#'
#' Builds one upper-cased "STREET, CITY, STATE, ZIP5" string per distinct
#' address found in `df` (rows with a missing street are skipped) and joins it
#' back onto `df`. A missing state is treated as "NC" when building the
#' string; the state column of `df` itself is left untouched.
#'
#' NOTE(review): `REGEXP_REPLACE()` and `UPPER()` are not R functions, so this
#' presumably runs on a database-backed lazy table where they are passed
#' through as SQL -- confirm against callers. Use
#' `add_address_lookup_local()` for in-memory data frames.
#'
#' @param df Table to annotate.
#' @param street,city,state,postal_code Unquoted names of the columns in `df`
#'   holding the address parts.
#' @param name Name of the lookup column to add.
#' @return `df` with one extra column called `name`; rows with a missing
#'   street get a missing lookup value.
add_address_lookup <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip,
  name = "address_lookup"
) {
  addresses <-
    df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = {{ postal_code }}
    ) |>
    dplyr::mutate(
      # Default the state only inside the lookup text. Overwriting the `state`
      # key itself would (a) stop rows with a missing state from matching in
      # the join below and (b) create duplicate keys that multiply rows of
      # `df` whenever an address occurs both with and without a state.
      !!name := REGEXP_REPLACE(
        UPPER(
          paste(
            street,
            city,
            coalesce(state, "NC"),
            substr(postal_code, 1, 5),
            sep = ", "
          )
        ),
        " +", " "
      )
    )

  dplyr::left_join(
    df,
    addresses,
    by = dplyr::join_by(
      {{ street }} == street,
      {{ city }} == city,
      {{ state }} == state,
      {{ postal_code }} == postal_code
    ),
    # Let missing key parts match each other so that rows with a missing
    # state still receive their (NC-defaulted) lookup value; lazy database
    # tables do not match missing keys unless asked to.
    na_matches = "na"
  )
}

#' Add a normalised address lookup column to an in-memory data frame
#'
#' Builds one upper-cased "STREET, CITY, STATE, ZIP5" string (runs of spaces
#' collapsed to a single space) per distinct address found in `df` and joins
#' it back onto `df`. Rows with a missing street are skipped and therefore
#' get a missing lookup value. A missing state is treated as "NC" when
#' building the string; the state column of `df` itself is left untouched.
#'
#' @param df Data frame to annotate.
#' @param street,city,state,postal_code Unquoted names of the columns in `df`
#'   holding the address parts.
#' @param name Name of the lookup column to add.
#' @return `df` with the same rows in the same order plus one extra column
#'   called `name`.
add_address_lookup_local <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip,
  name = "address_lookup"
) {
  addresses <-
    df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = {{ postal_code }}
    ) |>
    dplyr::mutate(
      # Default the state only inside the lookup text. Overwriting the `state`
      # key itself would (a) stop rows with a missing state from matching in
      # the join below and (b) create duplicate keys that multiply rows of
      # `df` whenever an address occurs both with and without a state.
      !!name := gsub(
        " +",
        " ",
        toupper(
          paste(
            street,
            city,
            dplyr::coalesce(state, "NC"),
            substr(postal_code, 1, 5),
            sep = ", "
          )
        )
      )
    )

  # The keys in `addresses` are distinct, so this join cannot add rows.
  dplyr::left_join(
    df,
    addresses,
    by = dplyr::join_by(
      {{ street }} == street,
      {{ city }} == city,
      {{ state }} == state,
      {{ postal_code }} == postal_code
    )
  )
}

#' Collect distinct full addresses assembled from their parts
#'
#' Keeps rows with a non-missing street, reduces them to the distinct
#' combinations of street, city, state and five-character postal code,
#' collects the result and adds an upper-cased `address` column of the form
#' "STREET, CITY, STATE, ZIP" with runs of spaces collapsed. A missing state
#' is written as "NC"; other missing parts are written as empty text.
#'
#' @param df Table (or dataset/query that `dplyr::collect()` can
#'   materialise) holding the address parts.
#' @param street,city,state,postal_code Unquoted names of the columns in `df`
#'   holding the address parts.
#' @return A data frame with the columns `address`, `street`, `city`, `state`
#'   and `postal_code`. `address` values are not guaranteed to be unique,
#'   since distinct parts can normalise to the same text.
collect_full_addresses_from_parts <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip
) {
  df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = substr({{ postal_code }}, 1, 5)
    ) |>
    dplyr::collect() |>
    dplyr::mutate(
      # `if_else()` is namespace-qualified inside the template so that it
      # resolves even when dplyr is not attached, like every other dplyr call
      # in this file. `.na = ""` turns remaining missing parts into empty
      # text.
      address = glue::glue(
        "{street}, {city}, {dplyr::if_else(is.na(state), 'NC', state)}, {postal_code}",
        .na = ""
      ),
      address = toupper(address),
      address = gsub(" +", " ", address)
    ) |>
    dplyr::relocate(address, .before = 1)
}