| data-raw | data-raw | ||||
| collect/data-raw/ | collect/data-raw/ | ||||
| collect/data-old/ | collect/data-old/ | ||||
| data/ |
| Type: Project | |||||
| Package: nc-campaign-finance-process | |||||
| Title: Process the NC Campaign Finance Data | |||||
| Version: 0.0.0.9000 | |||||
| Authors@R: | |||||
| person("Garrick", "Aden-Buie", , "garrick@adenbuie.com", role = c("aut", "cre"), | |||||
| comment = c(ORCID = "0000-0002-7111-0077")) | |||||
| Description: Process the NC Campaign Finance Data. | |||||
| License: Proprietary | |||||
| Encoding: UTF-8 | |||||
| Roxygen: list(markdown = TRUE) | |||||
| RoxygenNote: 7.2.3 | |||||
| Depends: | |||||
| dplyr, | |||||
| fs, | |||||
| glue, | |||||
| httr2, | |||||
| purrr, | |||||
| readr, | |||||
| rlang, | |||||
| targets | |||||
| Imports: | |||||
| arrow, | |||||
| cli, | |||||
| crew, | |||||
| DBI, | |||||
| dbplyr, | |||||
| desc, | |||||
| docopt, | |||||
| duckdb, | |||||
| here, | |||||
| visNetwork |
#' Combine the expenditures CSVs of one `sboe_id` directory
#'
#' Lists the files in `dir_sboe_id`, keeps those whose `sboe_id` and
#' `report_id` (as parsed by `report_path_info()`) appear in `report_list`,
#' reads each with `read_expenditures_csv()` and row-binds the results.
#'
#' @param dir_sboe_id Directory holding the expenditures CSV files.
#' @param report_list Table with `sboe_id` and `report_id` columns naming the
#'   reports to keep. Defaults to the `report_list` target.
#' @return The combined rows with column names converted to snake case.
process_expenditures_csv <- function(
  dir_sboe_id,
  report_list = tar_read(report_list)
) {
  csv_paths <- dir_ls(dir_sboe_id)

  # One row per file: the ids parsed from the path plus the path itself.
  wanted <- report_path_info(csv_paths)
  wanted$path <- csv_paths

  # Only reports present in `report_list` go into the data.
  wanted <- semi_join(wanted, report_list, by = c("sboe_id", "report_id"))

  read_one <- function(sboe_id, report_id, path, ...) {
    read_expenditures_csv(path, sboe_id, report_id)
  }
  combined <- list_rbind(pmap(wanted, read_one))

  snake_names <- snakecase::to_snake_case(names(combined), parsing_option = 3)
  names(combined) <- snake_names
  combined
}
#' Write the expenditures of one `sboe_id` directory to a parquet file
#'
#' @param dir_sboe_id Directory holding the expenditures CSV files.
#' @param report_list Passed on to `process_expenditures_csv()`. Defaults to
#'   the `report_list` target.
#' @return The path of the `part-0.parquet` file that was written, inside a
#'   `sboe_id=<id>` directory under `../data/expenditures`.
write_expenditures_parquet <- function(
  dir_sboe_id,
  report_list = tar_read(report_list)
) {
  expenditures <- process_expenditures_csv(dir_sboe_id, report_list)

  # The partition directory is named after the id parsed from the input path.
  dir_info <- report_path_info(dir_sboe_id)
  partition <- sprintf("sboe_id=%s", dir_info$sboe_id)
  out_dir <- here::here("..", "data", "expenditures", partition)
  out_file <- path(out_dir, "part-0.parquet")

  dir_create(out_dir)
  arrow::write_parquet(expenditures, out_file)
  out_file
}
#' Read one expenditures CSV and tag its rows with the source report
#'
#' @param path Path to an expenditures CSV file.
#' @param sboe_id,report_id Identifiers added as columns. When either is
#'   `NULL`, both are taken from `report_path_info(path)`.
#' @return The parsed rows with `sboe_id` and `report_id` columns added, or
#'   `NULL` when the file is smaller than one byte.
read_expenditures_csv <- function(path, sboe_id = NULL, report_id = NULL) {
  # Nothing to parse in an empty file.
  if (file_size(path) < 1) {
    return(NULL)
  }

  ids_missing <- is.null(sboe_id) || is.null(report_id)
  if (ids_missing) {
    from_path <- report_path_info(path)
    sboe_id <- from_path$sboe_id
    report_id <- from_path$report_id
  }

  # Everything is read as text except the columns typed explicitly here.
  spec <- cols(
    .default = col_character(),
    OccurDate = col_date("%m/%d/%Y"),
    IsOrg = col_logical(),
    IsUS = col_logical(),
    Amount = col_double(),
    SumToDate = col_double(),
    IsAggregated = col_logical()
  )
  parsed <- read_csv(path, col_types = spec)

  # Parsing problems are appended to the shared "expenditures" problem log.
  record_problems(parsed, label = "expenditures")

  mutate(parsed, sboe_id = sboe_id, report_id = report_id, .before = 0)
}
#' Combine the receipts CSVs of one `sboe_id` directory
#'
#' Lists the files in `dir_sboe_id`, keeps those whose `sboe_id` and
#' `report_id` (as parsed by `report_path_info()`) appear in `report_list`,
#' reads each non-empty file with `read_receipts_csv()` and row-binds the
#' results.
#'
#' @param dir_sboe_id Directory holding the receipts CSV files.
#' @param report_list Table with `sboe_id` and `report_id` columns naming the
#'   reports to keep. Defaults to the `report_list` target.
#' @return The combined rows with column names converted to snake case.
process_receipts_csv <- function(
  dir_sboe_id,
  report_list = tar_read(report_list)
) {
  csv_paths <- dir_ls(dir_sboe_id)

  # One row per file: the ids parsed from the path plus the path itself.
  wanted <- report_path_info(csv_paths)
  wanted$path <- csv_paths

  # Only reports present in `report_list` go into the data.
  wanted <- semi_join(wanted, report_list, by = c("sboe_id", "report_id"))

  read_one <- function(sboe_id, report_id, path, ...) {
    # Empty files contribute nothing; list_rbind() drops the NULL.
    if (file_size(path) < 1) {
      return(NULL)
    }
    read_receipts_csv(path, sboe_id, report_id)
  }
  combined <- list_rbind(pmap(wanted, read_one))

  snake_names <- snakecase::to_snake_case(names(combined), parsing_option = 3)
  names(combined) <- snake_names
  combined
}
#' Write the receipts of one `sboe_id` directory to a parquet file
#'
#' @param dir_sboe_id Directory holding the receipts CSV files.
#' @param report_list Passed on to `process_receipts_csv()`. Defaults to the
#'   `report_list` target.
#' @return The path of the `part-0.parquet` file that was written, inside a
#'   `sboe_id=<id>` directory under `../data/receipts`.
write_receipts_parquet <- function(
  dir_sboe_id,
  report_list = tar_read(report_list)
) {
  receipts <- process_receipts_csv(dir_sboe_id, report_list)

  # The partition directory is named after the id parsed from the input path.
  dir_info <- report_path_info(dir_sboe_id)
  partition <- sprintf("sboe_id=%s", dir_info$sboe_id)
  out_dir <- here::here("..", "data", "receipts", partition)
  out_file <- path(out_dir, "part-0.parquet")

  dir_create(out_dir)
  arrow::write_parquet(receipts, out_file)
  out_file
}
#' Read one receipts CSV and tag its rows with the source report
#'
#' @param path Path to a receipts CSV file.
#' @param sboe_id,report_id Identifiers added as columns. When either is
#'   `NULL`, both are taken from `report_path_info(path)`.
#' @return The parsed rows with `IsPrior` converted to logical and `sboe_id`
#'   and `report_id` columns added, or `NULL` when the file is smaller than
#'   one byte.
read_receipts_csv <- function(path, sboe_id = NULL, report_id = NULL) {
  # Same guard as read_expenditures_csv(): an empty file has no `IsPrior`
  # column, so the mutate() below would fail instead of yielding no rows.
  if (file_size(path) < 1) {
    return(NULL)
  }

  if (is.null(sboe_id) || is.null(report_id)) {
    info <- report_path_info(path)
    sboe_id <- info$sboe_id
    report_id <- info$report_id
  }

  # Everything is read as text except the columns typed explicitly here.
  x <- read_csv(
    path,
    col_types = cols(
      .default = col_character(),
      GroupID = col_integer(),
      IsOrg = col_logical(),
      IsUS = col_logical(),
      Amount = col_double(),
      SumToDate = col_double(),
      IsAggregated = col_logical(),
      IsPrior = col_character()
    )
  )

  # Parsing problems are appended to the shared "receipts" problem log.
  record_problems(x, label = "receipts")

  x |>
    # `IsPrior` is flagged with a literal "X" in the raw data.
    # NOTE(review): a missing value stays NA rather than FALSE -- confirm
    # that downstream consumers expect that.
    mutate(IsPrior = IsPrior == "X") |>
    mutate(sboe_id = sboe_id, report_id = report_id, .before = 0)
}
#' Read the report list CSV and store a parquet copy
#'
#' @param path_report_list Path to the report list CSV file.
#' @return The value returned by `arrow::write_parquet()` (documented by arrow
#'   as its input, invisibly) after writing
#'   `../data/report_list/part-0.parquet`.
process_report_list <- function(path_report_list) {
  parquet_path <- path("..", "data", "report_list", "part-0.parquet")
  dir_create(path_dir(parquet_path))

  # `year` and `report_id` are forced to integer; other columns are guessed.
  spec <- cols(
    year = col_integer(),
    report_id = col_integer()
  )
  report_list <- read_csv(path_report_list, col_types = spec)

  arrow::write_parquet(report_list, parquet_path)
}
| return_path | return_path | ||||
| } | } | ||||
#' Extract `sboe_id` and `report_id` from a single report path
#'
#' Expected layout:
#' `data-raw/reports/{sboe_id}/all/{report_id}_{received_date}.txt`
#'
#' @param report_path Path to a report file. Only the first path is used.
#' @return A list with `sboe_id` (first path component below the reports
#'   directory) and `report_id` (integer parsed from the text before the
#'   first underscore of the third component).
report_path_info <- function(report_path) {
  absolute <- fs::path_abs(report_path)
  relative <- fs::path_rel(absolute, here::here("data-raw", "reports"))
  parts <- fs::path_split(relative)[[1]]

  # parts[1] is the sboe_id, parts[2] is "all", parts[3] is the file name.
  file_name <- parts[3]
  id_text <- strsplit(file_name, "_")[[1]][1]

  list(
    sboe_id = parts[1],
    report_id = as.integer(id_text)
  )
}
#' Append the readr parsing problems of `x` to a per-label CSV log
#'
#' @param x Object passed to `problems()`; callers in this project pass the
#'   result of `read_csv()`.
#' @param path Unused. Kept so the signature stays compatible with existing
#'   calls.
#' @param label Base name of the log file, written as
#'   `../problems/<label>.csv` relative to the project root.
#' @return Invisibly `NULL` when there are no problems, otherwise the value
#'   of `write_csv()`.
record_problems <- function(x, path, label) {
  found <- problems(x)
  if (nrow(found) == 0) {
    return(invisible())
  }

  problem_dir <- here::here("..", "problems")
  # Call fs::path() by its qualified name: a bare `path(...)` call resolves
  # to this function's own `path` argument first, and because callers never
  # supply it R raises 'argument "path" is missing, with no default'.
  problem_path <- fs::path(problem_dir, label, ext = "csv")
  dir_create(problem_dir)

  # Append to an existing log; otherwise create it (with a header row).
  write_csv(found, problem_path, append = file_exists(problem_path))
}
#' Extract `sboe_id` and `report_id` from one or more report paths
#'
#' Expected layout:
#' `data-raw/reports/{sboe_id}/all/{report_id}_{received_date}.txt`
#'
#' @param report_path Character vector of paths below the reports directory.
#' @return A tibble with one row per path: `sboe_id` (first path component
#'   below the reports directory) and `report_id` (integer parsed from the
#'   text before the first underscore of the third component; `NA` when there
#'   is no third component, e.g. for a directory path).
report_path_info <- function(report_path) {
  report_path <- fs::path_abs(report_path)
  x <- fs::path_rel(report_path, here::here("..", "data-raw", "reports"))

  # One character vector of path components per input path.
  x <- map(x, path_split) |> list_flatten()

  tibble(
    # map_chr() always returns a character vector. map_vec() has nothing to
    # infer a type from when there are no paths, and the resulting NULL would
    # silently drop the `sboe_id` column that callers join on.
    sboe_id = map_chr(x, \(x) x[1]),
    # x[2] is the report type directory; x[3] is the file name.
    report_id = map_int(x, \(x) as.integer(strsplit(x[3], "_")[[1]][1]))
  )
}
# Created by use_targets().
# Pipeline definition for processing the raw report CSVs into parquet files.
# See the manual for how to check and run the pipeline:
# https://books.ropensci.org/targets/walkthrough.html#inspect-the-pipeline

# Load packages required to define the pipeline:
library(targets)

# Packages loaded for every target: the `Depends` field of DESCRIPTION.
# Split on commas and trim, so the result does not depend on how the field's
# line breaks and indentation are returned; drop version constraints such as
# "(>= 1.0.0)" and the `R` entry, neither of which is a package name.
pipeline_packages <- desc::desc_get_field("Depends")
pipeline_packages <- strsplit(pipeline_packages, ",", fixed = TRUE)[[1]]
pipeline_packages <- trimws(sub("\\(.*$", "", pipeline_packages))
pipeline_packages <- setdiff(pipeline_packages[nzchar(pipeline_packages)], "R")

# Set target options:
tar_option_set(
  packages = pipeline_packages,
  # For distributed computing in tar_make(), supply a {crew} controller
  # as discussed at https://books.ropensci.org/targets/crew.html.
  controller = crew::crew_controller_local(workers = 12),
  # debug = "path_receipts_parquet_8d195f7e",
  # cue = tar_cue(mode = "never")
  error = "stop"
)

# Run the R scripts in the R/ folder with the custom functions:
tar_source()

# Targets: the report list, the per-sboe_id input directories, and one
# parquet file per receipts/expenditures directory (dynamic branching).
list(
  tar_target(path_report_list, "../data-raw/report_list.csv", format = "file"),
  tar_target(report_list, process_report_list(path_report_list)),
  tar_target(
    dirs_all,
    fs::dir_ls(
      "../data-raw/reports",
      glob = "**/all",
      recurse = TRUE,
      type = "directory"
    )
  ),
  tar_target(
    dirs_receipts,
    fs::dir_ls(
      "../data-raw/reports",
      glob = "**/receipts",
      recurse = TRUE,
      type = "directory"
    )
  ),
  tar_target(
    dirs_expenditures,
    fs::dir_ls(
      "../data-raw/reports",
      glob = "**/expenditures",
      recurse = TRUE,
      type = "directory"
    )
  ),
  tar_target(
    path_receipts_parquet,
    write_receipts_parquet(dirs_receipts, report_list),
    pattern = map(dirs_receipts),
    format = "file"
  ),
  tar_target(
    path_expenditures_parquet,
    write_expenditures_parquet(dirs_expenditures, report_list),
    pattern = map(dirs_expenditures),
    format = "file"
  )
)
| * | |||||
| !.gitignore | |||||
| !meta | |||||
| meta/* | |||||
| !meta/meta |
| Version: 1.0 | |||||
| RestoreWorkspace: No | |||||
| SaveWorkspace: No | |||||
| AlwaysSaveHistory: Default | |||||
| EnableCodeIndexing: Yes | |||||
| UseSpacesForTab: Yes | |||||
| NumSpacesForTab: 2 | |||||
| Encoding: UTF-8 | |||||
| RnwWeave: Sweave | |||||
| LaTeX: pdfLaTeX | |||||
| AutoAppendNewline: Yes | |||||
| StripTrailingWhitespace: Yes | |||||
| LineEndingConversion: Posix | |||||
| BuildType: Package | |||||
| PackageUseDevtools: Yes | |||||
| PackageInstallArgs: --no-multiarch --with-keep.source | |||||
| PackageRoxygenize: rd,collate,namespace |
#!/usr/bin/env Rscript

# Helper script to run the pipeline from the command line.
# See https://books.ropensci.org/targets/hpc.html
# to learn about the execution options.

# docopt needs a blank line to end the usage section and at least two spaces
# between an option and its description.
doc <- "usage:
  run.R all
  run.R target <targets>...
  run.R -h | --help

options:
  -h --help  Show this screen"

library(docopt)
opts <- docopt(doc)

# Environment flags set for the pipeline run.
Sys.setenv("IN_TARGETS" = "true")
Sys.setenv("ALLOW_DOWNLOADS" = "true")

if (opts$all) {
  cli::cli_alert_info("Running all targets.")
  targets::tar_make()
} else {
  # Run only the targets named on the command line.
  cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}")
  targets::tar_make(targets::any_of(!!opts$targets))
}

# targets::tar_make_clustermq(workers = 2) # nolint