| @@ -0,0 +1,27 @@ | |||
#' Download and tidy the candidate listings for several years
#'
#' Fetches one listing per year with `get_candidate_listing_year()`, stacks
#' them, parses the two date columns, rewrites the first run of ten digits in
#' every phone column as `(xxx) xxx-xxxx`, and normalises PO box spellings in
#' `street_address`.
#'
#' @param years Years to fetch; each one is passed to
#'   `get_candidate_listing_year()`.
#' @return The combined listing as a single data frame.
get_candidate_listing <- function(years = 2016:2023) {
  # Each yearly file is read as all-character, so the years stack without
  # type conflicts; real types are assigned afterwards.
  listing <- bind_rows(map(years, get_candidate_listing_year))

  date_spec <- cols(
    election_dt = col_date(format = "%m/%d/%Y"),
    candidacy_dt = col_date(format = "%m/%d/%Y")
  )
  listing <- type_convert(listing, col_types = date_spec)

  format_phone <- function(phone) {
    sub("(\\d{3})(\\d{3})(\\d{4})", "(\\1) \\2-\\3", phone)
  }

  mutate(
    listing,
    across(contains("phone"), format_phone),
    across(street_address, fixup_po_box)
  )
}
#' Read one year's candidate listing CSV from the NCSBE S3 bucket
#'
#' @param year Year interpolated into the download URL (twice: directory and
#'   file name).
#' @return A data frame in which every column is read as character.
get_candidate_listing_year <- function(year) {
  # Read everything as text; the file is decoded as latin1.
  all_text <- cols(.default = col_character())
  latin1 <- locale(encoding = "latin1")

  listing_url <- glue::glue("https://s3.amazonaws.com/dl.ncsbe.gov/Elections/{year}/Candidate%20Filing/Candidate_Listing_{year}.csv")

  read_csv(listing_url, col_types = all_text, locale = latin1)
}
| @@ -1,4 +1,5 @@ | |||
#' Fold a list into a single value with `list_transpose_bind_impl()`
#'
#' @param x A list (or `NULL`).
#' @return `x` unchanged when it is `NULL` or has length zero; otherwise the
#'   result of reducing `x` pairwise with `list_transpose_bind_impl()`.
list_transpose_bind <- function(x) {
  has_items <- !is.null(x) && length(x) > 0
  if (!has_items) {
    return(x)
  }

  purrr::reduce(x, list_transpose_bind_impl)
}
| @@ -0,0 +1,67 @@ | |||
#' Geocode addresses that are not yet in the SQLite lookup database
#'
#' Creates the database on first use, removes addresses already present in
#' the `resolved` table, and geocodes the remainder with the Census geocoder
#' in blocks, upserting each block into `resolved` as it completes so that an
#' interrupted run keeps the work already done.
#'
#' @param addresses Character vector of full addresses.
#' @param path_address_db Path to the SQLite lookup database.
#' @param block_size Maximum number of addresses sent to the geocoder per
#'   request block.
#' @return `path_address_db`.
prepare_addresses_lookup_db <- function(
  addresses,
  path_address_db = "../data-prep/address_lookup.sqlite",
  block_size = 5000L
) {
  block_size <- as.integer(block_size)
  stopifnot(length(block_size) == 1L, !is.na(block_size), block_size >= 1L)

  if (!fs::file_exists(path_address_db)) {
    prepare_addresses_create_db(path_address_db)
  }

  con <- DBI::dbConnect(RSQLite::SQLite(), path_address_db)
  withr::defer(DBI::dbDisconnect(con))

  db <- dplyr::tbl(con, "resolved")
  seen <- db |> dplyr::pull(address)

  # Always trim and de-duplicate, even when the table is still empty.
  # Otherwise the first run stores untrimmed addresses that later (trimmed)
  # runs fail to recognise as already geocoded.
  addresses <- setdiff(trimws(addresses), seen)

  if (length(addresses) == 0L) {
    cli::cli_alert_success("All addresses have been geocoded")
    return(path_address_db)
  }

  cli::cli_inform("Geocoding {length(addresses)} addresses")

  # Explicit start/end pairs: every address falls in exactly one block,
  # including the final partial block and the single-address case.
  starts <- seq(1L, length(addresses), by = block_size)
  ends <- pmin(starts + block_size - 1L, length(addresses))

  for (i in seq_along(starts)) {
    resolved <-
      tidygeocoder::geo(
        addresses[starts[i]:ends[i]],
        method = "census",
        full_results = TRUE
      ) |>
      dplyr::select(-input_address)

    dplyr::rows_upsert(
      db,
      resolved,
      by = "address",
      in_place = TRUE,
      copy = TRUE
    )
  }

  path_address_db
}
#' Create the SQLite database that stores geocoding results
#'
#' Creates the `resolved` table, keyed by a unique `address`, with the
#' result columns written by `prepare_addresses_lookup_db()`.
#'
#' @param path_address_db Path of the SQLite file to create or open.
#' @return `path_address_db`, invisibly.
prepare_addresses_create_db <- function(path_address_db) {
  sql <- "CREATE TABLE `resolved` (
    `address` TEXT UNIQUE,
    `lat` REAL,
    `long` REAL,
    `id` INTEGER,
    `match_indicator` TEXT,
    `match_type` TEXT,
    `matched_address` TEXT,
    `tiger_line_id` INTEGER,
    `tiger_side` TEXT
  )"

  con <- DBI::dbConnect(RSQLite::SQLite(), path_address_db)
  # Close the connection even when the CREATE TABLE statement fails.
  on.exit(DBI::dbDisconnect(con), add = TRUE)

  DBI::dbExecute(con, sql)

  invisible(path_address_db)
}
| @@ -2,10 +2,13 @@ prep_open_dataset <- function(path_prep, partitioning = "sboe_id", ...) { | |||
| if (!fs::file_exists(path_prep)) { | |||
| path_here <- here::here("data-prep/", path_prep) | |||
| path_up <- fs::path("..", "data-prep", path_prep) | |||
| path_up2 <- fs::path("..", "..", "data-prep", path_prep) | |||
| if (fs::file_exists(path_here)) { | |||
| path_prep <- path_here | |||
| } else if (fs::file_exists(path_up)) { | |||
| path_prep <- path_up | |||
| } else if (fs::file_exists(path_up2)) { | |||
| path_prep <- path_up2 | |||
| } else { | |||
| stop("File not found: ", path_prep) | |||
| } | |||
| @@ -0,0 +1,143 @@ | |||
#' Gather the distinct raw addresses from every prepared data source
#'
#' Officers, receipts and expenditures are always read. The candidate listing
#' and voter file are read only when their path is supplied. The combined
#' addresses get PO box spellings normalised and are then de-duplicated on
#' `address`, keeping the first occurrence in the order voters, candidate
#' listing, receipts, expenditures, officers.
#'
#' @param path_officers,path_receipts,path_expenditures Dataset directories,
#'   opened with `sboe_id` partitioning.
#' @param path_voters Optional path to the voter dataset; `NULL` skips it.
#' @param path_candidate_listing Optional path to the candidate listing
#'   dataset; `NULL` skips it.
#' @return A data frame with one row per distinct `address`.
prep_collect_addresses_raw <- function(
  path_officers = "../data-prep/officers",
  path_receipts = "../data-prep/receipts",
  path_expenditures = "../data-prep/expenditures",
  # path_voters = "../data-raw/voters/ncvoter_statewide.parquet"
  path_voters = NULL,
  path_candidate_listing = NULL
) {
  # Receipts and expenditures are both read with the helper's default
  # column names.
  collect_partitioned <- function(path) {
    collect_full_addresses_from_parts(
      arrow::open_dataset(path, partitioning = "sboe_id")
    )
  }

  address_officers <- prep_collect_addresses_raw_officers(path_officers)
  address_receipts <- collect_partitioned(path_receipts)
  address_expenditures <- collect_partitioned(path_expenditures)

  address_candidate_listing <- NULL
  if (!is.null(path_candidate_listing)) {
    listing <- dplyr::filter(
      arrow::open_dataset(path_candidate_listing),
      !is.na(state)
    )
    address_candidate_listing <- collect_full_addresses_from_parts(
      listing,
      street = street_address,
      postal_code = zip_code
    )
  }

  address_voters <- NULL
  if (!is.null(path_voters)) {
    address_voters <- collect_full_addresses_from_parts(
      arrow::open_dataset(path_voters),
      street = res_street_address,
      city = res_city_desc,
      state = state_cd,
      postal_code = zip_code
    )
  }

  # Sources that were skipped are NULL and contribute no rows.
  combined <- dplyr::bind_rows(
    address_voters,
    address_candidate_listing,
    address_receipts,
    address_expenditures,
    address_officers
  )
  combined <- dplyr::mutate(combined, address = fixup_po_box(address))

  dplyr::distinct(combined, address, .keep_all = TRUE)
}
#' Collect distinct officer addresses and split them into parts
#'
#' Officer records carry one free-text `address`. It is upper-cased,
#' de-duplicated, stripped of a trailing ZIP+4 extension, and parsed with
#' `poster::parse_addr()` to recover city, state and postal code. The street
#' is whatever remains after removing ", CITY, STATE, ZIP" from the address.
#'
#' @param path_officers Officers dataset directory, opened with `sboe_id`
#'   partitioning.
#' @return A data frame with `address`, `street`, `city`, `state` and
#'   `postal_code`.
prep_collect_addresses_raw_officers <- function(
  path_officers = "../data-prep/officers"
) {
  officers <- arrow::open_dataset(path_officers, partitioning = "sboe_id")

  distinct_addresses <-
    officers |>
    dplyr::filter(!is.na(address)) |>
    dplyr::mutate(address = toupper(address)) |>
    dplyr::distinct(address) |>
    dplyr::collect()

  # Reduce a trailing ZIP+4 ("12345-6789") to the five-digit ZIP.
  distinct_addresses <- dplyr::mutate(
    distinct_addresses,
    address = stringr::str_replace(address, "(\\d{5})-\\d{4}$", "\\1")
  )

  # NOTE(review): presumably parse_addr() returns one row per input address,
  # in input order; bind_cols() below relies on that alignment. Confirm.
  parsed <- poster::parse_addr(distinct_addresses$address)
  parsed <- dplyr::select(parsed, city, state, postal_code)
  parsed <- dplyr::mutate(parsed, dplyr::across(dplyr::everything(), toupper))

  with_parts <- dplyr::bind_cols(distinct_addresses, parsed)
  with_parts <- dplyr::mutate(
    with_parts,
    address_minus_street = paste("", city, state, postal_code, sep = ", "),
    street = stringr::str_remove(address, stringr::fixed(address_minus_street))
  )
  with_parts <- dplyr::select(with_parts, -address_minus_street)

  dplyr::relocate(with_parts, street, .before = city)
}
#' Add a normalised full-address key column to a table
#'
#' Builds one lookup string per distinct (street, city, state, postal code)
#' combination, formatted as upper-cased "STREET, CITY, STATE, ZIP5" with
#' runs of spaces collapsed, and joins it back onto `df`.
#'
#' `REGEXP_REPLACE()` and `UPPER()` are not R functions.
#' NOTE(review): presumably `df` is a lazy database table whose backend
#' passes these through as SQL. Confirm against callers.
#'
#' @param df Table to annotate.
#' @param street,city,state,postal_code Columns of `df` holding the address
#'   parts (unquoted).
#' @param name Name of the lookup column to add.
#' @return `df` with the extra column `name`. Rows with a missing street get
#'   a missing lookup.
add_address_lookup <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip,
  name = "address_lookup"
) {
  addresses <-
    df |>
    dplyr::filter(!is.na({{ street }})) |>
    dplyr::distinct(
      street = {{ street }},
      city = {{ city }},
      state = {{ state }},
      postal_code = {{ postal_code }}
    ) |>
    dplyr::mutate(
      # Default a missing state to "NC" only inside the lookup string. The
      # `state` column is a join key and must keep its original value:
      # overwriting it stops NA-state rows from matching and duplicates
      # "NC" rows when the same address also occurs with a missing state.
      !!name := REGEXP_REPLACE(
        UPPER(
          paste(
            street,
            city,
            coalesce(state, "NC"),
            substr(postal_code, 1, 5),
            sep = ", "
          )
        ),
        " +", " "
      )
    )

  dplyr::left_join(
    df,
    addresses,
    by = dplyr::join_by(
      {{ street }} == street,
      {{ city }} == city,
      {{ state }} == state,
      {{ postal_code }} == postal_code
    ),
    # Missing parts must match themselves so rows with, for example, no
    # state still receive their lookup (database joins default to "never").
    na_matches = "na"
  )
}
#' Build distinct full-address strings from separate address columns
#'
#' Keeps rows with a non-missing street, reduces them to distinct
#' (street, city, state, five-character postal code) combinations, collects
#' the result, and adds an upper-cased `address` of the form
#' "STREET, CITY, STATE, ZIP" with runs of spaces collapsed. A missing state
#' is written as "NC" in `address`; other missing parts become empty strings.
#'
#' @param df Table or dataset to read from.
#' @param street,city,state,postal_code Columns of `df` holding the address
#'   parts (unquoted).
#' @return A data frame with `address` first, followed by `street`, `city`,
#'   `state` and `postal_code`.
collect_full_addresses_from_parts <- function(
  df,
  street = street_1,
  city = city,
  state = state,
  postal_code = full_zip
) {
  with_street <- dplyr::filter(df, !is.na({{ street }}))

  parts <- dplyr::distinct(
    with_street,
    street = {{ street }},
    city = {{ city }},
    state = {{ state }},
    postal_code = substr({{ postal_code }}, 1, 5)
  )

  # Everything above runs on the source; string assembly happens after
  # the distinct rows are collected.
  parts <- dplyr::collect(parts)

  parts <- dplyr::mutate(
    parts,
    address = glue::glue("{street}, {city}, {if_else(is.na(state), 'NC', state)}, {postal_code}", .na = ""),
    address = toupper(address),
    address = gsub(" +", " ", address)
  )

  dplyr::relocate(parts, address, .before = 1)
}
| @@ -4,6 +4,7 @@ prepare_candidates <- function(path_officers = "../data-prep/officers", report_l | |||
| officers <- officers_pq |> filter(type == "Candidate") |> collect() | |||
| officers |> | |||
| filter(type == "Candidate") |> | |||
| filter(!is.na(name)) |> | |||
| mutate( | |||
| name_display = name, | |||
| @@ -1,6 +1,7 @@ | |||
| process_report_dates <- function(report_list_raw, cover_raw) { | |||
| cover_dates <- | |||
| cover_raw |> | |||
| distinct() |> | |||
| select( | |||
| report_id, | |||
| cover_start_date = date_from, | |||
| @@ -0,0 +1,15 @@ | |||
#' Normalise PO box spellings to "PO BOX"
#'
#' Rewrites upper-case variants such as "P.O. BOX", "P O BOX", "POBOX" and
#' "POST OFFICE BOX". Matching is case-sensitive.
#'
#' @param x Character vector.
#' @return `x` with every matching variant replaced by "PO BOX".
fixup_po_box <- function(x) {
  po_box_variants <- "P\\s*[.]*\\s*O\\s*[.]*\\s*BOX|POST OFFICE BOX"
  gsub(pattern = po_box_variants, replacement = "PO BOX", x = x)
}
#' Query-side counterpart of `fixup_po_box()`
#'
#' Emits a `REGEXP_REPLACE()` call using the same pattern and replacement as
#' `fixup_po_box()`. Arguments follow the SQL order (string, pattern,
#' replacement), not the `gsub()` order.
#'
#' NOTE(review): `REGEXP_REPLACE()` is not an R function; presumably this is
#' meant to be evaluated by a database backend. Some engines replace only the
#' first match unless a global flag is supplied. Confirm for the target
#' engine.
#'
#' @param x Column or expression to normalise.
#' @return The result of the `REGEXP_REPLACE()` call.
fixup_po_box_query <- function(x) {
  REGEXP_REPLACE(
    x,
    "P\\s*[.]*\\s*O\\s*[.]*\\s*BOX|POST OFFICE BOX",
    "PO BOX"
  )
}
| @@ -0,0 +1,5 @@ | |||
#' Write a Parquet file, creating its directory first
#'
#' @param x Data to write.
#' @param path Destination file path.
#' @param ... Passed on to `arrow::write_parquet()`.
#' @return `path`, so the result can be used directly as a file target.
write_parquet <- function(x, path, ...) {
  parent_dir <- fs::path_dir(path)
  fs::dir_create(parent_dir)

  arrow::write_parquet(x, path, ...)

  path
}
| @@ -0,0 +1,192 @@ | |||
#' Download and unpack the statewide voter registration file
#'
#' Downloads `ncvoter_Statewide.zip` into `output_dir` and extracts it there.
#'
#' @param output_dir Directory that receives the archive and its contents.
#' @return Invisibly, the archive path with a `.txt` extension.
#'   NOTE(review): this assumes the archive contains a text file with the
#'   same base name; the code does not check. Confirm.
voter_statewide_download <- function(output_dir = here::here("../data-raw/voters")) {
  url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter_Statewide.zip"

  fs::dir_create(output_dir)
  path <- fs::path(output_dir, fs::path_file(url))

  # R's default 60-second timeout is too short for large downloads; raise it
  # for this call only and restore the caller's setting on exit.
  old_opts <- options(timeout = max(3600, getOption("timeout", 60)))
  on.exit(options(old_opts), add = TRUE)

  # Force binary mode so the archive is never written in text mode.
  download.file(url, path, mode = "wb")

  zip::unzip(path, exdir = output_dir)

  invisible(fs::path_ext_set(path, "txt"))
}
#' Convert the statewide voter text file to Parquet
#'
#' Reads the tab-separated file with the column types from
#' `voter_statewide_spec()` and writes a Parquet file beside it. The output
#' file name is the lower-cased input name with a `.parquet` extension.
#'
#' @param path Path to the tab-separated voter file.
#' @return Invisibly, the path of the Parquet file written.
voter_statewide_convert_parquet <- function(path) {
  path <- fs::path_norm(path)

  # Lower-case only the file name. Lower-casing the whole path would point
  # at a different (possibly missing) directory on case-sensitive
  # filesystems.
  file_out <- fs::path_ext_set(tolower(fs::path_file(path)), "parquet")
  path_out <- fs::path(fs::path_dir(path), file_out)

  x <- readr::read_tsv(path, col_types = voter_statewide_spec())
  arrow::write_parquet(x, path_out)

  invisible(path_out)
}
#' Column specification for the statewide voter registration file
#'
#' Coded columns with a known code list are read as factors with fixed
#' levels (status is ordered). State columns share one list of
#' abbreviations. Other `*_abbrv` columns and `party_cd` are factors with
#' levels taken from the data. `registr_dt` is a month/day/year date,
#' `birth_year` and `age_at_year_end` are integers, and everything else is
#' character.
#'
#' @return A column specification built with `cols()`.
voter_statewide_spec <- function() {
  # Code -> description lookups. The codes (names) define factor levels; the
  # descriptions are used as levels only for the status and reason columns.
  status_lookup <- c(
    A = "ACTIVE",
    D = "DENIED",
    I = "INACTIVE",
    R = "REMOVED",
    S = "TEMPORARY"
  )

  race_lookup <- c(
    A = "ASIAN",
    B = "BLACK or AFRICAN AMERICAN",
    I = "AMERICAN INDIAN or ALASKA NATIVE",
    M = "TWO or MORE RACES ",
    O = "OTHER",
    P = "NATIVE HAWAIIAN or PACIFIC ISLANDER",
    U = "UNDESIGNATED",
    W = "WHITE"
  )

  ethnic_lookup <- c(
    HL = "HISPANIC or LATINO",
    NL = "NOT HISPANIC or NOT LATINO",
    UN = "UNDESIGNATED"
  )

  gender_lookup <- c(
    F = "FEMALE",
    M = "MALE",
    U = "UNDESIGNATED"
  )

  reason_lookup <- c(
    AV = "VERIFIED",
    IN = "CONFIRMATION NOT RETURNED",
    RD = "DECEASED",
    IU = "CONFIRMATION RETURNED UNDELIVERABLE",
    DU = "VERIFICATION RETURNED UNDELIVERABLE",
    RM = "REMOVED AFTER 2 FED GENERAL ELECTIONS IN INACTIVE STATUS",
    RL = "MOVED FROM COUNTY",
    RS = "MOVED FROM STATE",
    A2 = "CONFIRMATION PENDING",
    AP = "VERIFICATION PENDING",
    DI = "UNAVAILABLE ESSENTIAL INFORMATION",
    RF = "FELONY CONVICTION",
    RH = "MOVED WITHIN STATE",
    RQ = "REQUEST FROM VOTER",
    SO = "OVERSEAS CITIZEN",
    SM = "MILITARY",
    RT = "TEMPORARY REGISTRANT",
    RA = "ADMINISTRATIVE",
    A1 = "UNVERIFIED"
  )

  # Collectors shared by many columns.
  text <- col_character()
  whole <- col_integer()
  open_factor <- col_factor()
  us_state <- col_factor(
    levels = c(
      state.abb,
      "AP", "DC", "GU", "MP", "NO", "OC", "PR", "UN", "VI"
    )
  )

  cols(
    county_id = text,
    county_desc = text,
    voter_reg_num = text,
    ncid = text,
    last_name = text,
    first_name = text,
    middle_name = text,
    name_suffix_lbl = text,
    status_cd = col_factor(levels = names(status_lookup), ordered = TRUE),
    voter_status_desc = col_factor(
      levels = unname(status_lookup),
      ordered = TRUE
    ),
    reason_cd = col_factor(levels = names(reason_lookup)),
    voter_status_reason_desc = col_factor(levels = unname(reason_lookup)),
    res_street_address = text,
    res_city_desc = text,
    state_cd = us_state,
    zip_code = text,
    mail_addr1 = text,
    mail_addr2 = text,
    mail_addr3 = text,
    mail_addr4 = text,
    mail_city = text,
    mail_state = us_state,
    mail_zipcode = text,
    full_phone_number = text,
    confidential_ind = text,
    registr_dt = col_date(format = "%m/%d/%Y"),
    race_code = col_factor(levels = names(race_lookup)),
    ethnic_code = col_factor(levels = names(ethnic_lookup)),
    party_cd = open_factor,
    gender_code = col_factor(levels = names(gender_lookup)),
    birth_year = whole,
    age_at_year_end = whole,
    birth_state = us_state,
    drivers_lic = text,
    precinct_abbrv = open_factor,
    precinct_desc = text,
    municipality_abbrv = open_factor,
    municipality_desc = text,
    ward_abbrv = open_factor,
    ward_desc = text,
    cong_dist_abbrv = open_factor,
    super_court_abbrv = open_factor,
    judic_dist_abbrv = open_factor,
    nc_senate_abbrv = open_factor,
    nc_house_abbrv = open_factor,
    county_commiss_abbrv = open_factor,
    county_commiss_desc = text,
    township_abbrv = open_factor,
    township_desc = text,
    school_dist_abbrv = open_factor,
    school_dist_desc = text,
    fire_dist_abbrv = open_factor,
    fire_dist_desc = text,
    water_dist_abbrv = open_factor,
    water_dist_desc = text,
    sewer_dist_abbrv = open_factor,
    sewer_dist_desc = text,
    sanit_dist_abbrv = open_factor,
    sanit_dist_desc = text,
    rescue_dist_abbrv = open_factor,
    rescue_dist_desc = text,
    munic_dist_abbrv = open_factor,
    munic_dist_desc = text,
    dist_1_abbrv = open_factor,
    dist_1_desc = text,
    vtd_abbrv = open_factor,
    vtd_desc = text
  )
}
#' List the voter registration snapshots available in the NCSBE bucket
#'
#' Requests the S3 bucket listing for the `data/Snapshots/` prefix and turns
#' each `Contents` entry into one row.
#'
#' @return A data frame with one row per listed object. Column names are
#'   cleaned with `janitor::clean_names()`, `size` is converted to a byte
#'   size, and `url` holds the full download URL built from `key`.
voter_snapshot_list <- function() {
  url <- "https://s3.amazonaws.com/dl.ncsbe.gov/?delimiter=/&prefix=data/Snapshots/"

  res <-
    httr2::request(url) |>
    httr2::req_perform() |>
    httr2::resp_body_xml() |>
    xml2::as_list()

  res <- res$ListBucketResult
  res <- res[names(res) == "Contents"]

  # Flatten each entry to a named character vector so the entries bind into
  # rows. Base lapply() avoids relying on purrr being attached.
  res <- lapply(res, unlist)
  res <- dplyr::bind_rows(res)
  res <- janitor::clean_names(res)

  # Sizes are byte counts that can exceed the 32-bit integer range, so
  # convert through double rather than integer (which would give NA).
  res$size <- rlang::as_bytes(as.numeric(res$size))
  res$url <- paste0("https://s3.amazonaws.com/dl.ncsbe.gov/", res$key)

  res
}
#' Download the voter registration snapshot for a given year
#'
#' Looks up the snapshots whose key contains `VR_Snapshot_<year>`. When more
#' than one matches, the one with the greatest key is downloaded.
#'
#' @param year Year (or any leading part of the snapshot date) to match in
#'   the snapshot key.
#' @param output_dir Directory that receives the downloaded archive.
#' @return `NULL` when no snapshot matches; otherwise the status returned by
#'   `download.file()`.
voter_snapshot_download <- function(year, output_dir = here::here("data-raw/voting")) {
  key_fragment <- paste0("VR_Snapshot_", year)

  # The listing helper in this file is `voter_snapshot_list()`.
  listing <-
    voter_snapshot_list() |>
    dplyr::filter(grepl(key_fragment, key, fixed = TRUE))

  if (!nrow(listing)) {
    return(NULL)
  } else if (nrow(listing) > 1) {
    listing <- listing |> dplyr::slice_max(key, n = 1)
  }

  fs::dir_create(output_dir)

  # R's default 60-second timeout is too short for large downloads; raise it
  # for this call only and restore the caller's setting on exit.
  old_opts <- options(timeout = max(3600, getOption("timeout", 60)))
  on.exit(options(old_opts), add = TRUE)

  download.file(
    listing$url,
    fs::path(output_dir, fs::path_file(listing$key)),
    mode = "wb"
  )
}
| @@ -27,20 +27,30 @@ list( | |||
| tar_target(report_list_raw, arrow::read_parquet(path_report_list_raw)), | |||
| tar_target( | |||
| dirs_all, | |||
| dirs_all_src, | |||
| fs::dir_ls("../data-raw/reports", glob = "**/all", recurse = TRUE, type = "directory"), | |||
| format = "file" | |||
| ), | |||
| # This comes from Will's answer in https://stackoverflow.com/a/70293576 | |||
| # We're basically tricking targets into letting us branch over a file target | |||
| tar_target(dirs_all_names, dirs_all_src), | |||
| tar_target(dirs_all, {dirs_all_src; dirs_all_names}, pattern = map(dirs_all_names), format = "file"), | |||
| tar_target( | |||
| dirs_receipts, | |||
| dirs_receipts_src, | |||
| fs::dir_ls("../data-raw/reports", glob = "**/receipts", recurse = TRUE, type = "directory"), | |||
| format = "file" | |||
| ), | |||
| tar_target(dirs_receipts_names, dirs_receipts_src), | |||
| tar_target(dirs_receipts, {dirs_receipts_src; dirs_receipts_names}, pattern = map(dirs_receipts_names), format = "file"), | |||
| tar_target( | |||
| dirs_expenditures, | |||
| dirs_expenditures_src, | |||
| fs::dir_ls("../data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory"), | |||
| format = "file" | |||
| ), | |||
| tar_target(dirs_expenditures_names, dirs_expenditures_src), | |||
| tar_target(dirs_expenditures, {dirs_expenditures_src; dirs_expenditures_names}, pattern = map(dirs_expenditures_names), format = "file"), | |||
| tar_target( | |||
| paths_all_parquet, | |||
| @@ -65,6 +75,8 @@ list( | |||
| tar_target(path_data_prep_cover, { paths_all_parquet; "../data-prep/cover" }, format = "file"), | |||
| tar_target(path_data_prep_officers, { paths_all_parquet; "../data-prep/officers" }, format = "file"), | |||
| tar_target(path_data_prep_receipts, { paths_all_parquet; "../data-prep/receipts" }, format = "file"), | |||
| tar_target(path_data_prep_expenditures, { paths_all_parquet; "../data-prep/expenditures" }, format = "file"), | |||
| tar_target( | |||
| @@ -76,12 +88,36 @@ list( | |||
| report_dates, | |||
| process_report_dates(report_list_raw, cover_raw) | |||
| ), | |||
| tar_target( | |||
| path_report_dates, { | |||
| out_path <- "../data-prep/report_dates/part-0.parquet" | |||
| fs::dir_create(fs::path_dir(out_path)) | |||
| arrow::write_parquet(report_dates, out_path) | |||
| }), | |||
| tar_target( | |||
| report_amended_score, | |||
| calc_report_amended_score(report_dates) | |||
| ), | |||
| tar_target( | |||
| addresses_raw, | |||
| prep_collect_addresses_raw( | |||
| path_officers = path_data_prep_officers, | |||
| path_receipts = path_data_prep_receipts, | |||
| path_expenditures = path_data_prep_expenditures, | |||
| path_candidate_listing = path_candidate_listing, | |||
| path_voters = NULL # path_voters_parquet | |||
| ), | |||
| format = "parquet" | |||
| ), | |||
| tar_target( | |||
| path_addresses_db, | |||
| prepare_addresses_lookup_db(addresses_raw$address) | |||
| ), | |||
| # This report list uses the latest amended report ----- | |||
| tar_target( | |||
| report_list, | |||
| process_report_list(report_list_raw, report_amended_score) | |||
| @@ -89,5 +125,17 @@ list( | |||
| tar_target(committees, prepare_committees(cover_raw, report_list)), | |||
| tar_target(candidates, prepare_candidates(path_data_prep_officers, report_list)) | |||
| tar_target(candidates, prepare_candidates(path_data_prep_officers, report_list)), | |||
| # Outside data sources ----- | |||
| tar_target(candidate_listing, get_candidate_listing(2016:2023)), | |||
| tar_target(path_candidate_listing, write_parquet(candidate_listing, "../data-prep/candidate_listing/part-0.parquet")), | |||
| ## Voter registration records | |||
| tar_target(path_voters_txt, voter_statewide_download(), cue = tar_cue("never")), #<< invalidate to get latest | |||
| tar_target( | |||
| path_voters_parquet, | |||
| voter_statewide_convert_parquet(path_voters_txt), | |||
| cue = tar_cue("never") | |||
| ) | |||
| ) | |||