Quellcode durchsuchen

work snapshot

main
Garrick Aden-Buie vor 2 Jahren
Ursprung
Commit
8d9ac72fc5
Es konnte kein GPG-Schlüssel zu dieser Signatur gefunden werden
12 geänderte Dateien mit 28694 neuen und 27 gelöschten Zeilen
  1. +27
    -0
      process/R/candidate_listing.R
  2. +1
    -0
      process/R/list_transpose_bind.R
  3. +67
    -0
      process/R/prep_addresses_build_db.R
  4. +3
    -0
      process/R/prep_open_dataset.R
  5. +143
    -0
      process/R/prepare_addresses.R
  6. +1
    -0
      process/R/prepare_candidates.R
  7. +1
    -0
      process/R/process_report_dates.R
  8. +15
    -0
      process/R/utils-address.R
  9. +5
    -0
      process/R/utils-parquet.R
  10. +192
    -0
      process/R/voters.R
  11. +52
    -4
      process/_targets.R
  12. +28187
    -23
      process/_targets/meta/meta

+ 27
- 0
process/R/candidate_listing.R Datei anzeigen

@@ -0,0 +1,27 @@
#' Download and tidy the candidate listings for a set of election years
#'
#' Fetches one listing per year with `get_candidate_listing_year()`, stacks
#' them, parses `election_dt` and `candidacy_dt` as `%m/%d/%Y` dates, rewrites
#' the first run of ten digits in every "phone" column as `(xxx) xxx-xxxx`,
#' and normalises PO box spellings in `street_address`.
#'
#' @param years Years to fetch; each is passed to
#'   `get_candidate_listing_year()`.
#' @return The combined, type-converted listing.
get_candidate_listing <- function(years = 2016:2023) {
  date_spec <- cols(
    election_dt = col_date(format = "%m/%d/%Y"),
    candidacy_dt = col_date(format = "%m/%d/%Y")
  )

  format_phone <- function(phone) {
    sub("(\\d{3})(\\d{3})(\\d{4})", "(\\1) \\2-\\3", phone)
  }

  yearly_listings <- map(years, get_candidate_listing_year)
  listing <- type_convert(bind_rows(yearly_listings), col_types = date_spec)

  mutate(
    listing,
    across(contains("phone"), format_phone),
    across(street_address, fixup_po_box)
  )
}

#' Read one year's candidate listing CSV from the NCSBE S3 bucket
#'
#' Every column is read as character and the file is decoded as latin1.
#'
#' @param year Year interpolated into the download URL.
#' @return The listing for `year` with all columns as character.
get_candidate_listing_year <- function(year) {
  csv_url <- glue::glue("https://s3.amazonaws.com/dl.ncsbe.gov/Elections/{year}/Candidate%20Filing/Candidate_Listing_{year}.csv")
  all_character <- cols(.default = col_character())

  read_csv(
    csv_url,
    col_types = all_character,
    locale = locale(encoding = "latin1")
  )
}

+ 1
- 0
process/R/list_transpose_bind.R Datei anzeigen

@@ -1,4 +1,5 @@
#' Fold a list together with `list_transpose_bind_impl()`
#'
#' @param x A list (or `NULL`).
#' @return `x` unchanged when it is `NULL` or empty; otherwise the result of
#'   reducing `x` with `list_transpose_bind_impl()`.
list_transpose_bind <- function(x) {
  # length(NULL) is 0, so a single length check covers NULL and empty input.
  has_items <- length(x) > 0
  if (!has_items) {
    return(x)
  }

  purrr::reduce(x, list_transpose_bind_impl)
}


+ 67
- 0
process/R/prep_addresses_build_db.R Datei anzeigen

@@ -0,0 +1,67 @@
#' Geocode unseen addresses and cache the results in a SQLite database
#'
#' Addresses already present in the `resolved` table are skipped. The
#' remaining ones are sent to `tidygeocoder::geo()` (Census method) in blocks
#' of 5000 and upserted into the table by `address`.
#'
#' @param addresses Character vector of full address strings.
#' @param path_address_db Path to the SQLite database. It is created with an
#'   empty `resolved` table when the file does not exist.
#' @return `path_address_db`.
prepare_addresses_lookup_db <- function(
  addresses,
  path_address_db = "../data-prep/address_lookup.sqlite"
) {
  if (!fs::file_exists(path_address_db)) {
    prepare_addresses_create_db(path_address_db)
  }

  con <- DBI::dbConnect(RSQLite::SQLite(), path_address_db)
  withr::defer(DBI::dbDisconnect(con))
  db <- dplyr::tbl(con, "resolved")

  seen <- dplyr::pull(db, address)

  # Trim and de-duplicate unconditionally. Doing this only when the database
  # already had rows meant the first run sent untrimmed, possibly duplicated
  # addresses to the geocoder.
  addresses <- setdiff(trimws(addresses), seen)

  if (length(addresses) == 0) {
    cli::cli_alert_success("All addresses have been geocoded")
    return(path_address_db)
  }

  cli::cli_inform("Geocoding {length(addresses)} addresses")

  # Split into consecutive blocks so every address lands in exactly one block.
  # The previous boundary arithmetic dropped the final address whenever
  # length(addresses) %% 5000 == 1 (including a single new address).
  block_size <- 5000L
  block_id <- ceiling(seq_along(addresses) / block_size)

  for (block in split(addresses, block_id)) {
    resolved <-
      tidygeocoder::geo(
        block,
        method = "census",
        full_results = TRUE
      ) |>
      dplyr::select(-input_address)

    dplyr::rows_upsert(db, resolved, by = "address", in_place = TRUE, copy = TRUE)
  }

  path_address_db
}

#' Create the SQLite address lookup database with an empty `resolved` table
#'
#' @param path_address_db Path of the SQLite file to create or open.
#' @return `path_address_db`, invisibly.
prepare_addresses_create_db <- function(path_address_db) {
  sql <- "CREATE TABLE `resolved` (
    `address` TEXT UNIQUE,
    `lat` REAL,
    `long` REAL,
    `id` INTEGER,
    `match_indicator` TEXT,
    `match_type` TEXT,
    `matched_address` TEXT,
    `tiger_line_id` INTEGER,
    `tiger_side` TEXT
  )"

  con <- DBI::dbConnect(RSQLite::SQLite(), path_address_db)
  # Close the connection even when the CREATE TABLE statement fails.
  on.exit(DBI::dbDisconnect(con), add = TRUE)

  DBI::dbExecute(con, sql)

  invisible(path_address_db)
}

+ 3
- 0
process/R/prep_open_dataset.R Datei anzeigen

@@ -2,10 +2,13 @@ prep_open_dataset <- function(path_prep, partitioning = "sboe_id", ...) {
if (!fs::file_exists(path_prep)) {
path_here <- here::here("data-prep/", path_prep)
path_up <- fs::path("..", "data-prep", path_prep)
path_up2 <- fs::path("..", "..", "data-prep", path_prep)
if (fs::file_exists(path_here)) {
path_prep <- path_here
} else if (fs::file_exists(path_up)) {
path_prep <- path_up
} else if (fs::file_exists(path_up2)) {
path_prep <- path_up2
} else {
stop("File not found: ", path_prep)
}

+ 143
- 0
process/R/prepare_addresses.R Datei anzeigen

@@ -0,0 +1,143 @@
#' Collect the distinct raw addresses found across the prepared datasets
#'
#' Officers, receipts and expenditures are always read. The candidate listing
#' and voter file are only read when their paths are supplied. The combined
#' rows get PO box spellings normalised and are de-duplicated on `address`.
#'
#' @param path_officers,path_receipts,path_expenditures Dataset directories
#'   partitioned by `sboe_id`.
#' @param path_voters Optional path to the voter dataset; skipped when `NULL`.
#' @param path_candidate_listing Optional path to the candidate listing
#'   dataset; skipped when `NULL`.
#' @return One row per distinct `address`, keeping the other columns of the
#'   first occurrence.
prep_collect_addresses_raw <- function(
  path_officers = "../data-prep/officers",
  path_receipts = "../data-prep/receipts",
  path_expenditures = "../data-prep/expenditures",
  # path_voters = "../data-raw/voters/ncvoter_statewide.parquet"
  path_voters = NULL,
  path_candidate_listing = NULL
) {
  # Receipts and expenditures use the default address column names.
  collect_partitioned <- function(path) {
    collect_full_addresses_from_parts(
      arrow::open_dataset(path, partitioning = "sboe_id")
    )
  }

  from_officers <- prep_collect_addresses_raw_officers(path_officers)
  from_receipts <- collect_partitioned(path_receipts)
  from_expenditures <- collect_partitioned(path_expenditures)

  from_candidates <- NULL
  if (!is.null(path_candidate_listing)) {
    listed <- dplyr::filter(
      arrow::open_dataset(path_candidate_listing),
      !is.na(state)
    )
    from_candidates <- collect_full_addresses_from_parts(
      listed,
      street = street_address,
      postal_code = zip_code
    )
  }

  from_voters <- NULL
  if (!is.null(path_voters)) {
    from_voters <- collect_full_addresses_from_parts(
      arrow::open_dataset(path_voters),
      street = res_street_address,
      city = res_city_desc,
      state = state_cd,
      postal_code = zip_code
    )
  }

  # NULL sources are dropped by bind_rows(); the order decides which row
  # distinct() keeps for a repeated address.
  combined <- dplyr::bind_rows(
    from_voters,
    from_candidates,
    from_receipts,
    from_expenditures,
    from_officers
  )
  combined <- dplyr::mutate(combined, address = fixup_po_box(address))

  dplyr::distinct(combined, address, .keep_all = TRUE)
}

#' Collect distinct officer addresses and split them into parts
#'
#' Addresses are upper-cased, de-duplicated, and stripped of a trailing ZIP+4
#' suffix. City, state and postal code come from `poster::parse_addr()`. The
#' street is whatever remains after removing ", CITY, STATE, ZIP" from the
#' full address.
#'
#' @param path_officers Officers dataset directory partitioned by `sboe_id`.
#' @return Columns `address`, `street`, `city`, `state`, `postal_code`.
prep_collect_addresses_raw_officers <- function(
  path_officers = "../data-prep/officers"
) {
  officers <- arrow::open_dataset(path_officers, partitioning = "sboe_id")
  officers <- dplyr::filter(officers, !is.na(address))
  officers <- dplyr::mutate(officers, address = toupper(address))

  unique_addresses <- dplyr::collect(dplyr::distinct(officers, address))
  # Reduce ZIP+4 ("12345-6789") at the end of the address to the 5-digit ZIP.
  unique_addresses <- dplyr::mutate(
    unique_addresses,
    address = stringr::str_replace(address, "(\\d{5})-\\d{4}$", "\\1")
  )

  parsed <- poster::parse_addr(unique_addresses$address)
  parsed <- dplyr::select(parsed, city, state, postal_code)
  parsed <- dplyr::mutate(parsed, across(everything(), toupper))

  with_parts <- dplyr::bind_cols(unique_addresses, parsed)
  with_parts <- dplyr::mutate(
    with_parts,
    street = stringr::str_remove(
      address,
      stringr::fixed(paste("", city, state, postal_code, sep = ", "))
    )
  )

  dplyr::relocate(with_parts, street, .before = city)
}

#' Add a normalised full-address lookup column to `df`
#'
#' Builds one upper-cased, space-collapsed "street, city, state, zip5" string
#' per distinct address and joins it back onto `df`. A missing state is
#' treated as "NC" when building the string.
#'
#' `REGEXP_REPLACE()` and `UPPER()` are not R functions, so `df` is presumably
#' a lazy SQL-backed table that passes them through to the database.
#' NOTE(review): confirm against the callers.
#'
#' @param df Table holding the address part columns.
#' @param street,city,state,postal_code Columns of `df` (unquoted) holding the
#'   address parts.
#' @param name Name of the lookup column to add.
#' @return `df` with the extra column `name`.
add_address_lookup <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip,
  name = "address_lookup"
) {
  addresses <-
    df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = {{ postal_code }}
    ) |>
    dplyr::mutate(
      # Default the state only inside the lookup string. Overwriting the
      # `state` key itself meant rows with a missing state could never match
      # in the join below and so never received a lookup value.
      # NOTE(review): whether REGEXP_REPLACE replaces every match or only the
      # first depends on the SQL backend -- verify that runs of spaces are
      # fully collapsed.
      !!name := REGEXP_REPLACE(
        UPPER(
          paste(
            street,
            city,
            coalesce(state, "NC"),
            substr(postal_code, 1, 5),
            sep = ", "
          )
        ),
        " +", " "
      )
    )

  dplyr::left_join(
    df,
    addresses,
    by = dplyr::join_by(
      {{ street }} == street,
      {{ city }} == city,
      {{ state }} == state,
      {{ postal_code }} == postal_code
    ),
    # Missing key parts must match each other; SQL backends otherwise treat
    # NULL keys as never equal.
    na_matches = "na"
  )
}

#' Collect distinct addresses from part columns and build a full address
#'
#' Rows without a street are dropped and the postal code is cut to its first
#' five characters before de-duplicating. After collecting, `address` is
#' built as "street, city, state, postal_code" (missing state becomes "NC",
#' other missing parts become empty), upper-cased, with runs of spaces
#' collapsed.
#'
#' @param df Table holding the address part columns.
#' @param street,city,state,postal_code Columns of `df` (unquoted) holding the
#'   address parts.
#' @return A collected table with `address` first, followed by `street`,
#'   `city`, `state` and `postal_code`.
collect_full_addresses_from_parts <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip
) {
  with_street <- dplyr::filter(df, !is.na({{ street }}))

  parts <- dplyr::distinct(
    with_street,
    street = {{ street }},
    city = {{ city }},
    state = {{ state }},
    postal_code = substr({{ postal_code }}, 1, 5)
  )
  parts <- dplyr::collect(parts)

  parts <- dplyr::mutate(
    parts,
    address = glue::glue("{street}, {city}, {if_else(is.na(state), 'NC', state)}, {postal_code}", .na = ""),
    address = gsub(" +", " ", toupper(address))
  )

  dplyr::relocate(parts, address, .before = 1)
}

+ 1
- 0
process/R/prepare_candidates.R Datei anzeigen

@@ -4,6 +4,7 @@ prepare_candidates <- function(path_officers = "../data-prep/officers", report_l
officers <- officers_pq |> filter(type == "Candidate") |> collect()

officers |>
filter(type == "Candidate") |>
filter(!is.na(name)) |>
mutate(
name_display = name,

+ 1
- 0
process/R/process_report_dates.R Datei anzeigen

@@ -1,6 +1,7 @@
process_report_dates <- function(report_list_raw, cover_raw) {
cover_dates <-
cover_raw |>
distinct() |>
select(
report_id,
cover_start_date = date_from,

+ 15
- 0
process/R/utils-address.R Datei anzeigen

@@ -0,0 +1,15 @@
#' Normalise PO box spellings to "PO BOX"
#'
#' Replaces every occurrence of "P", "O", "BOX" separated by optional
#' whitespace and dots, or of "POST OFFICE BOX", with "PO BOX". Matching is
#' case-sensitive.
#'
#' @param x Character vector.
#' @return `x` with the PO box variants replaced.
fixup_po_box <- function(x) {
  po_box_variants <- "P\\s*[.]*\\s*O\\s*[.]*\\s*BOX|POST OFFICE BOX"
  gsub(pattern = po_box_variants, replacement = "PO BOX", x = x)
}

#' Query-side counterpart of `fixup_po_box()`
#'
#' Calls `REGEXP_REPLACE()`, which is not an R function; it is presumably
#' resolved by a SQL backend. NOTE(review): confirm how callers get this
#' expression translated.
#'
#' @param x Column or expression holding the address text.
#' @return The `REGEXP_REPLACE()` result with PO box variants replaced by
#'   "PO BOX".
fixup_po_box_query <- function(x) {
  # The subject string comes first, matching the other REGEXP_REPLACE call in
  # this project; the arguments were previously in gsub() order.
  REGEXP_REPLACE(
    x,
    "P\\s*[.]*\\s*O\\s*[.]*\\s*BOX|POST OFFICE BOX",
    "PO BOX"
  )
}

+ 5
- 0
process/R/utils-parquet.R Datei anzeigen

@@ -0,0 +1,5 @@
#' Write a parquet file, creating its parent directory first
#'
#' @param x Object passed to `arrow::write_parquet()`.
#' @param path Destination file path.
#' @param ... Further arguments forwarded to `arrow::write_parquet()`.
#' @return `path`, so the result can be tracked as a file.
write_parquet <- function(x, path, ...) {
  parent_dir <- fs::path_dir(path)
  fs::dir_create(parent_dir)

  arrow::write_parquet(x, path, ...)

  path
}

+ 192
- 0
process/R/voters.R Datei anzeigen

@@ -0,0 +1,192 @@
#' Download and unzip the statewide NC voter registration file
#'
#' The archive is saved into `output_dir` and extracted there.
#'
#' @param output_dir Directory for the archive and its contents; created if
#'   missing.
#' @return Invisibly, the archive path with its extension changed to "txt".
voter_statewide_download <- function(output_dir = here::here("../data-raw/voters")) {
  zip_url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter_Statewide.zip"
  zip_name <- fs::path_file(zip_url)

  fs::dir_create(output_dir)
  zip_path <- fs::path(output_dir, zip_name)
  download.file(zip_url, zip_path)

  # Extract from inside output_dir so the contents land next to the archive.
  withr::with_dir(output_dir, zip::unzip(zip_name))

  invisible(fs::path_ext_set(zip_path, "txt"))
}

#' Convert the statewide voter TSV file to parquet
#'
#' The parquet file is written next to the input, using the input's file
#' name in lower case with a "parquet" extension.
#'
#' @param path Path to the tab-separated voter file.
#' @return Invisibly, the path of the parquet file written.
voter_statewide_convert_parquet <- function(path) {
  path <- fs::path_norm(path)

  # Lower-case only the file name. Lower-casing the whole path also changed
  # the directory components, which points at a different (possibly missing)
  # directory on case-sensitive file systems.
  file_out <- fs::path_ext_set(tolower(fs::path_file(path)), "parquet")
  path_out <- fs::path_norm(fs::path(fs::path_dir(path), file_out))

  x <- readr::read_tsv(path, col_types = voter_statewide_spec())
  arrow::write_parquet(x, path_out)

  invisible(path_out)
}

#' Column specification for the statewide voter registration file
#'
#' Coded columns are parsed as factors whose levels are the codes (names) or
#' the descriptions (values) of the lookup tables defined below.
#'
#' @return A `cols()` specification naming every column of the file.
voter_statewide_spec <- function() {
  # State columns share one factor: the 50 state abbreviations plus the extra
  # codes listed here.
  state_abbr_col <- col_factor(
    levels = c(
      state.abb,
      "AP", "DC", "GU", "MP", "NO", "OC", "PR", "UN", "VI"
    )
  )

  # Lookup tables: names are the codes, values are the descriptions.
  status_lookup <- c(
    "A" = "ACTIVE",
    "D" = "DENIED",
    "I" = "INACTIVE",
    "R" = "REMOVED",
    "S" = "TEMPORARY"
  )

  race_lookup <- c(
    "A" = "ASIAN",
    "B" = "BLACK or AFRICAN AMERICAN",
    "I" = "AMERICAN INDIAN or ALASKA NATIVE",
    "M" = "TWO or MORE RACES ",
    "O" = "OTHER",
    "P" = "NATIVE HAWAIIAN or PACIFIC ISLANDER",
    "U" = "UNDESIGNATED",
    "W" = "WHITE"
  )

  ethnic_lookup <- c(
    "HL" = "HISPANIC or LATINO",
    "NL" = "NOT HISPANIC or NOT LATINO",
    "UN" = "UNDESIGNATED"
  )

  gender_lookup <- c(
    "F" = "FEMALE",
    "M" = "MALE",
    "U" = "UNDESIGNATED"
  )

  reason_lookup <- c(
    "AV" = "VERIFIED",
    "IN" = "CONFIRMATION NOT RETURNED",
    "RD" = "DECEASED",
    "IU" = "CONFIRMATION RETURNED UNDELIVERABLE",
    "DU" = "VERIFICATION RETURNED UNDELIVERABLE",
    "RM" = "REMOVED AFTER 2 FED GENERAL ELECTIONS IN INACTIVE STATUS",
    "RL" = "MOVED FROM COUNTY",
    "RS" = "MOVED FROM STATE",
    "A2" = "CONFIRMATION PENDING",
    "AP" = "VERIFICATION PENDING",
    "DI" = "UNAVAILABLE ESSENTIAL INFORMATION",
    "RF" = "FELONY CONVICTION",
    "RH" = "MOVED WITHIN STATE",
    "RQ" = "REQUEST FROM VOTER",
    "SO" = "OVERSEAS CITIZEN",
    "SM" = "MILITARY",
    "RT" = "TEMPORARY REGISTRANT",
    "RA" = "ADMINISTRATIVE",
    "A1" = "UNVERIFIED"
  )

  # Factor collectors keyed on a lookup's codes or on its descriptions.
  by_code <- function(lookup, ...) col_factor(names(lookup), ...)
  by_desc <- function(lookup, ...) col_factor(unname(lookup), ...)

  cols(
    # Voter identity
    county_id = col_character(),
    county_desc = col_character(),
    voter_reg_num = col_character(),
    ncid = col_character(),
    last_name = col_character(),
    first_name = col_character(),
    middle_name = col_character(),
    name_suffix_lbl = col_character(),

    # Registration status
    status_cd = by_code(status_lookup, ordered = TRUE),
    voter_status_desc = by_desc(status_lookup, ordered = TRUE),
    reason_cd = by_code(reason_lookup),
    voter_status_reason_desc = by_desc(reason_lookup),

    # Residential and mailing addresses
    res_street_address = col_character(),
    res_city_desc = col_character(),
    state_cd = state_abbr_col,
    zip_code = col_character(),
    mail_addr1 = col_character(),
    mail_addr2 = col_character(),
    mail_addr3 = col_character(),
    mail_addr4 = col_character(),
    mail_city = col_character(),
    mail_state = state_abbr_col,
    mail_zipcode = col_character(),
    full_phone_number = col_character(),
    confidential_ind = col_character(),

    # Demographics
    registr_dt = col_date(format = "%m/%d/%Y"),
    race_code = by_code(race_lookup),
    ethnic_code = by_code(ethnic_lookup),
    party_cd = col_factor(),
    gender_code = by_code(gender_lookup),
    birth_year = col_integer(),
    age_at_year_end = col_integer(),
    birth_state = state_abbr_col,
    drivers_lic = col_character(),

    # Districts and jurisdictions
    precinct_abbrv = col_factor(),
    precinct_desc = col_character(),
    municipality_abbrv = col_factor(),
    municipality_desc = col_character(),
    ward_abbrv = col_factor(),
    ward_desc = col_character(),
    cong_dist_abbrv = col_factor(),
    super_court_abbrv = col_factor(),
    judic_dist_abbrv = col_factor(),
    nc_senate_abbrv = col_factor(),
    nc_house_abbrv = col_factor(),
    county_commiss_abbrv = col_factor(),
    county_commiss_desc = col_character(),
    township_abbrv = col_factor(),
    township_desc = col_character(),
    school_dist_abbrv = col_factor(),
    school_dist_desc = col_character(),
    fire_dist_abbrv = col_factor(),
    fire_dist_desc = col_character(),
    water_dist_abbrv = col_factor(),
    water_dist_desc = col_character(),
    sewer_dist_abbrv = col_factor(),
    sewer_dist_desc = col_character(),
    sanit_dist_abbrv = col_factor(),
    sanit_dist_desc = col_character(),
    rescue_dist_abbrv = col_factor(),
    rescue_dist_desc = col_character(),
    munic_dist_abbrv = col_factor(),
    munic_dist_desc = col_character(),
    dist_1_abbrv = col_factor(),
    dist_1_desc = col_character(),
    vtd_abbrv = col_factor(),
    vtd_desc = col_character()
  )
}

#' List the voter registration snapshots available in the NCSBE S3 bucket
#'
#' Requests the bucket listing for the `data/Snapshots/` prefix and turns
#' each `Contents` entry into one row.
#'
#' NOTE(review): only a single listing response is read; confirm whether the
#' listing can be truncated and need paging.
#'
#' @return One row per listed object with cleaned column names, `size` as
#'   bytes, and a `url` column built from the object key.
voter_snapshot_list <- function() {
  url <- "https://s3.amazonaws.com/dl.ncsbe.gov/?delimiter=/&prefix=data/Snapshots/"

  res <-
    httr2::request(url) |>
    httr2::req_perform() |>
    httr2::resp_body_xml() |>
    xml2::as_list()

  res <- res$ListBucketResult
  # Keep only the per-object entries of the listing.
  res <- res[which(names(res) == "Contents")]
  res <- map(res, unlist)
  res <- dplyr::bind_rows(res)
  res <- janitor::clean_names(res)

  # Parse sizes as doubles: as.integer() returns NA (with a warning) for
  # byte counts above .Machine$integer.max, i.e. files of roughly 2 GiB or more.
  res$size <- rlang::as_bytes(as.numeric(res$size))
  res$url <- paste0("https://s3.amazonaws.com/dl.ncsbe.gov/", res$key)
  res
}

#' Download the voter registration snapshot for a year
#'
#' Looks up snapshots whose key contains `VR_Snapshot_<year>`. When several
#' match, the one with the greatest key is downloaded.
#'
#' @param year Year (or year prefix) to look for in the snapshot keys.
#' @param output_dir Directory to download into; created if missing.
#' @return `NULL` when no snapshot matches; otherwise the value returned by
#'   `download.file()`.
voter_snapshot_download <- function(year, output_dir = here::here("data-raw/voting")) {
  # The listing helper in this file is voter_snapshot_list(); the previous
  # call to get_voter_snapshot_list() referred to a name that is not defined
  # here.
  listing <-
    voter_snapshot_list() |>
    dplyr::filter(str_detect(key, paste0("VR_Snapshot_", year)))

  if (nrow(listing) == 0) {
    return(NULL)
  }

  if (nrow(listing) > 1) {
    listing <- dplyr::slice_max(listing, key, n = 1)
  }

  fs::dir_create(output_dir)

  download.file(
    listing$url,
    fs::path(output_dir, fs::path_file(listing$key))
  )
}

+ 52
- 4
process/_targets.R Datei anzeigen

@@ -27,20 +27,30 @@ list(
tar_target(report_list_raw, arrow::read_parquet(path_report_list_raw)),

tar_target(
dirs_all,
dirs_all_src,
fs::dir_ls("../data-raw/reports", glob = "**/all", recurse = TRUE, type = "directory"),
format = "file"
),
# This comes from Will's answer in https://stackoverflow.com/a/70293576
# We're basically tricking targets into letting us branch over a file target
tar_target(dirs_all_names, dirs_all_src),
tar_target(dirs_all, {dirs_all_src; dirs_all_names}, pattern = map(dirs_all_names), format = "file"),

tar_target(
dirs_receipts,
dirs_receipts_src,
fs::dir_ls("../data-raw/reports", glob = "**/receipts", recurse = TRUE, type = "directory"),
format = "file"
),
tar_target(dirs_receipts_names, dirs_receipts_src),
tar_target(dirs_receipts, {dirs_receipts_src; dirs_receipts_names}, pattern = map(dirs_receipts_names), format = "file"),

tar_target(
dirs_expenditures,
dirs_expenditures_src,
fs::dir_ls("../data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory"),
format = "file"
),
tar_target(dirs_expenditures_names, dirs_expenditures_src),
tar_target(dirs_expenditures, {dirs_expenditures_src; dirs_expenditures_names}, pattern = map(dirs_expenditures_names), format = "file"),

tar_target(
paths_all_parquet,
@@ -65,6 +75,8 @@ list(

tar_target(path_data_prep_cover, { paths_all_parquet; "../data-prep/cover" }, format = "file"),
tar_target(path_data_prep_officers, { paths_all_parquet; "../data-prep/officers" }, format = "file"),
tar_target(path_data_prep_receipts, { paths_all_parquet; "../data-prep/receipts" }, format = "file"),
tar_target(path_data_prep_expenditures, { paths_all_parquet; "../data-prep/expenditures" }, format = "file"),


tar_target(
@@ -76,12 +88,36 @@ list(
report_dates,
process_report_dates(report_list_raw, cover_raw)
),
tar_target(
path_report_dates, {
out_path <- "../data-prep/report_dates/part-0.parquet"
fs::dir_create(fs::path_dir(out_path))
arrow::write_parquet(report_dates, out_path)
}),

tar_target(
report_amended_score,
calc_report_amended_score(report_dates)
),

tar_target(
addresses_raw,
prep_collect_addresses_raw(
path_officers = path_data_prep_officers,
path_receipts = path_data_prep_receipts,
path_expenditures = path_data_prep_expenditures,
path_candidate_listing = path_candidate_listing,
path_voters = NULL # path_voters_parquet
),
format = "parquet"
),

tar_target(
path_addresses_db,
prepare_addresses_lookup_db(addresses_raw$address)
),

# This report list uses the latest amended report -----
tar_target(
report_list,
process_report_list(report_list_raw, report_amended_score)
@@ -89,5 +125,17 @@ list(

tar_target(committees, prepare_committees(cover_raw, report_list)),

tar_target(candidates, prepare_candidates(path_data_prep_officers, report_list))
tar_target(candidates, prepare_candidates(path_data_prep_officers, report_list)),

# Outside data sources -----
tar_target(candidate_listing, get_candidate_listing(2016:2023)),
tar_target(path_candidate_listing, write_parquet(candidate_listing, "../data-prep/candidate_listing/part-0.parquet")),

## Voter registration records
tar_target(path_voters_txt, voter_statewide_download(), cue = tar_cue("never")), #<< invalidate to get latest
tar_target(
path_voters_parquet,
voter_statewide_convert_parquet(path_voters_txt),
cue = tar_cue("never")
)
)

+ 28187
- 23
process/_targets/meta/meta
Datei-Diff unterdrückt, da er zu groß ist
Datei anzeigen


Laden…
Abbrechen
Speichern