| @@ -194,7 +194,7 @@ get_raw_report_all <- function(report_id) { | |||
| } | |||
| save_raw_report_all <- function(report_id, sboe_id, received) { | |||
| dir <- here::here("data-raw", "reports", sboe_id) | |||
| dir <- here::here("data-raw", "reports", sboe_id, "all") | |||
| file_name <- paste0(report_id, "_", received) | |||
| path <- path(dir, file_name, ext = "txt") | |||
| @@ -211,3 +211,117 @@ save_raw_report_all <- function(report_id, sboe_id, received) { | |||
| invisible(path) | |||
| } | |||
# Download every receipt record of a report from the paged JSON endpoint and
# return them as one data frame.
#
# Arguments:
#   report_id  Report identifier handed to `req_report_receipts()`.
#   page_size  Number of rows requested per page.
#
# Returns the records of all pages row-bound into a single data frame.
#
# Pages are requested until as many records as the response's
# `recordCountKey` have been collected. The loop also stops (with a warning)
# when a page comes back empty, so a count that overstates the available rows
# cannot make it spin forever.
get_report_receipts_json <- function(report_id, page_size = 500) {
  delay()

  # One request/parse round trip for a given page number.
  fetch_page <- function(page) {
    report_id |>
      req_report_receipts(page = page, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  page <- 0
  cli::cli_progress_step("Getting page 0 of ??")
  res <- fetch_page(page)

  data <- res$Data$results
  total_rows <- res$Data$recordCountKey
  if (is.null(total_rows)) {
    # Without the count, `length(data) < total_rows` is zero-length and `while`
    # would fail with an unhelpful message; fail clearly instead.
    cli::cli_abort(
      "Receipts response for report {report_id} has no {.field recordCountKey}."
    )
  }

  # Size of the first page, used only to estimate the page count in the
  # progress message.
  actual_page_size <- length(data)

  while (length(data) < total_rows) {
    page <- page + 1
    cli::cli_progress_step("Getting page {page} of {ceiling(total_rows / actual_page_size)} [{length(res$Data$results)}]")
    res <- fetch_page(page)

    if (length(res$Data$results) == 0) {
      cli::cli_warn(
        "Report {report_id}: page {page} was empty after {length(data)} of {total_rows} rows; stopping early."
      )
      break
    }

    data <- c(data, res$Data$results)
  }

  cli::cli_progress_step("Preparing data frame")

  # Zero-length fields are turned into NULL before the records are row-bound.
  data |>
    map_depth(2, \(x) if (length(x)) x) |>
    map_dfr(\(x) x)
}
# Save a report's receipts as
# data-raw/reports/{sboe_id}/receipts/{report_id}_{received}_receipts.csv
# and return the file path.
#
# An existing file is never downloaded again: its path is returned as is.
# After a fresh download the path is returned invisibly.
save_report_receipts_csv <- function(report_id, sboe_id, received) {
  out_dir <- here::here("data-raw", "reports", sboe_id, "receipts")
  out_file <- path(
    out_dir,
    paste0(report_id, "_", received, "_", "receipts"),
    ext = "csv"
  )

  if (!file_exists(out_file)) {
    receipts <- get_report_receipts_json(report_id)
    dir_create(out_dir)
    write_csv(receipts, out_file)
    return(invisible(out_file))
  }

  # We don't need to re-download any reports
  out_file
}
# Download every expenditure record of a report from the paged JSON endpoint
# and return them as one data frame.
#
# Arguments:
#   report_id  Report identifier handed to `req_report_expenditures()`.
#   page_size  Number of rows requested per page.
#
# Returns the records of all pages row-bound into a single data frame.
#
# Pages are requested until as many records as the response's
# `recordCountKey` have been collected. The loop also stops (with a warning)
# when a page comes back empty, so a count that overstates the available rows
# cannot make it spin forever.
get_report_expenditures_json <- function(report_id, page_size = 500) {
  delay()

  # One request/parse round trip for a given page number.
  fetch_page <- function(page) {
    report_id |>
      req_report_expenditures(page = page, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  page <- 0
  cli::cli_progress_step("Getting expenses page 0 of ??")
  res <- fetch_page(page)

  data <- res$Data$results
  total_rows <- res$Data$recordCountKey
  if (is.null(total_rows)) {
    # Without the count, `length(data) < total_rows` is zero-length and `while`
    # would fail with an unhelpful message; fail clearly instead.
    cli::cli_abort(
      "Expenditures response for report {report_id} has no {.field recordCountKey}."
    )
  }

  # Size of the first page, used only to estimate the page count in the
  # progress message.
  actual_page_size <- length(data)

  while (length(data) < total_rows) {
    # The message refers to `page + 1`, so it is emitted before the increment.
    cli::cli_progress_step("Getting expenses page {page + 1} of {ceiling(total_rows / actual_page_size)} [{length(res$Data$results)}]")
    page <- page + 1
    res <- fetch_page(page)

    if (length(res$Data$results) == 0) {
      cli::cli_warn(
        "Report {report_id}: expenses page {page} was empty after {length(data)} of {total_rows} rows; stopping early."
      )
      break
    }

    data <- c(data, res$Data$results)
  }

  cli::cli_progress_step("Preparing expenses data frame")

  # Zero-length fields are turned into NULL before the records are row-bound.
  data |>
    map_depth(2, \(x) if (length(x)) x) |>
    map_dfr(\(x) x)
}
# Save a report's expenditures as
# data-raw/reports/{sboe_id}/expenditures/{report_id}_{received}_expenditures.csv
# and return the file path.
#
# An existing file is never downloaded again: its path is returned as is.
# After a fresh download the path is returned invisibly.
save_report_expenditures_csv <- function(report_id, sboe_id, received) {
  out_dir <- here::here("data-raw", "reports", sboe_id, "expenditures")
  out_file <- path(
    out_dir,
    paste0(report_id, "_", received, "_", "expenditures"),
    ext = "csv"
  )

  if (!file_exists(out_file)) {
    expenditures <- get_report_expenditures_json(report_id)
    dir_create(out_dir)
    write_csv(expenditures, out_file)
    return(invisible(out_file))
  }

  # We don't need to re-download any reports
  out_file
}
| @@ -10,6 +10,14 @@ url_nc_cf_export_detail_results <- function() { | |||
| "https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/" | |||
| } | |||
# URL of the NC SBE campaign-finance endpoint used to fetch a report's
# receipts.
url_nc_cf_report_get_receipts <- function() {
  "https://cf.ncsbe.gov/CFOrgLkup/GetReceipts"
}
# URL of the NC SBE campaign-finance endpoint used to fetch a report's
# expenditures.
url_nc_cf_report_get_expenditures <- function() {
  "https://cf.ncsbe.gov/CFOrgLkup/GetExpenditures"
}
| match_report_type <- function(report, collapse = TRUE) { | |||
| report <- toupper(report) | |||
| @@ -73,3 +81,28 @@ req_report_detail <- function(report_id, section = "receipts") { | |||
| req | |||
| } | |||
# Build (without sending) the request for one page of a report's receipts.
#
# Arguments:
#   report_id  Sent as the `ReportID` query parameter.
#   page       Sent as the `page` query parameter.
#   page_size  Sent as the `pageSize` query parameter.
#
# Returns the request with the query parameters attached.
req_report_receipts <- function(report_id, page = 0, page_size = 300) {
  url_nc_cf_report_get_receipts() |>
    request() |>
    req_url_query(
      ReportID = report_id,
      page = page,
      pageSize = page_size
    )
}
# Build (without sending) the request for one page of a report's
# expenditures.
#
# Arguments:
#   report_id  Sent as the `ReportID` query parameter.
#   page       Sent as the `page` query parameter.
#   page_size  Sent as the `pageSize` query parameter.
#
# Returns the request with the query parameters attached.
req_report_expenditures <- function(report_id, page = 0, page_size = 300) {
  url_nc_cf_report_get_expenditures() |>
    request() |>
    req_url_query(
      ReportID = report_id,
      page = page,
      pageSize = page_size,
      ShowIEColumns = TRUE # this doesn't appear to do anything, but it's required
    )
}
| @@ -54,3 +54,22 @@ https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/?ReportID=197247&Type=REC | |||
| https://cf.ncsbe.gov/CFOrgLkup/ReportDetail/?RID=197247&TP=EXP | |||
| ``` | |||
| ## Trying yet again | |||
| It turns out that using the "Export data to CSV" link from a | |||
| [page like this one](https://cf.ncsbe.gov/CFOrgLkup/ReportDetail/?RID=205761&TP=ALL) | |||
| will return a very badly formatted CSV file, potentially full of errors. | |||
| On the other hand, we can get the pieces via: | |||
| ``` | |||
| # Receipts | |||
| # (paged, unclear page size) | |||
| https://cf.ncsbe.gov/CFOrgLkup/GetReceipts?ReportID={report_id}&page=0&pageSize=100 | |||
| # Expenses | |||
| # (paged, unclear page size) | |||
| https://cf.ncsbe.gov/CFOrgLkup/GetExpenditures?ReportID={report_id}&page=0&pageSize=100&ShowIEColumns=true | |||
| ``` | |||
| @@ -65,10 +65,54 @@ list( | |||
| ), | |||
| format = "file_fast" | |||
| ), | |||
| # Gets the JSON version of the report's receipts via an internal API call, | |||
| # that is processed into a standard table before saving as CSV. | |||
| tar_target( | |||
| parquet_report_cover_path, | |||
| write_reports_by_sboe_id(report_list_sboe_id), | |||
| pattern = map(unique(report_list_sboe_id)), | |||
| report_receipts_csv_path, | |||
| save_report_receipts_csv( | |||
| report_list_report_id, | |||
| report_list_sboe_id, | |||
| report_list_received | |||
| ), | |||
| pattern = map( | |||
| report_list_report_id, | |||
| report_list_sboe_id, | |||
| report_list_received | |||
| ), | |||
| format = "file_fast" | |||
| ), | |||
| # Gets the JSON version of the report's expenses via an internal API call, | |||
| # that is processed into a standard table before saving as CSV. | |||
| tar_target( | |||
| report_expenditures_csv_path, | |||
| save_report_expenditures_csv( | |||
| report_list_report_id, | |||
| report_list_sboe_id, | |||
| report_list_received | |||
| ), | |||
| pattern = map( | |||
| report_list_report_id, | |||
| report_list_sboe_id, | |||
| report_list_received | |||
| ), | |||
| format = "file_fast" | |||
| ), | |||
| tar_target( | |||
| dirs_all, | |||
| fs::dir_ls("data-raw/reports", glob = "**/all", recurse = TRUE, type = "directory") | |||
| ), | |||
| tar_target( | |||
| dirs_receipts, | |||
| fs::dir_ls("data-raw/reports", glob = "**/receipts", recurse = TRUE, type = "directory") | |||
| ), | |||
| tar_target( | |||
| dirs_expenditures, | |||
| fs::dir_ls("data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory") | |||
| ) | |||
| # tar_target( | |||
| # parquet_report_cover_path, | |||
| # write_reports_by_sboe_id(report_list_sboe_id), | |||
| # pattern = map(unique(report_list_sboe_id)), | |||
| # format = "file_fast" | |||
| # ) | |||
| ) | |||
| @@ -19,12 +19,11 @@ opts <- docopt(doc) | |||
| Sys.setenv("IN_TARGETS" = "true") | |||
| Sys.setenv("ALLOW_DOWNLOADS" = "true") | |||
| targets::tar_make_future(workers = 8) | |||
| if (opts$all) { | |||
| cli::cli_alert_into("Running all targets.") | |||
| cli::cli_alert_info("Running all targets.") | |||
| targets::tar_make() | |||
| } else { | |||
| cli::cli_alert_into("Running targets: {.and {.field {targets}}}") | |||
| targets::tar_make(targets::any_of(opts$targets)) | |||
| cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}") | |||
| targets::tar_make(targets::any_of(!!opts$targets)) | |||
| } | |||
| # targets::tar_make_clustermq(workers = 2) # nolint | |||
| @@ -0,0 +1,38 @@ | |||
# I originally dumped everything in data-raw/reports/{sboe_id}/
# but I realized that things will work out best with targets if I can point
# to a single directory for each type of file that needs to be processed.
#
# This script moves the files into the appropriate subdirectories.
#
# {sboe_id}/{report_id}_{received}.txt --> {sboe_id}/all/{report_id}_{received}.txt
# {sboe_id}/{report_id}_{received}_receipts.csv --> {sboe_id}/receipts/{report_id}_{received}_receipts.csv
# {sboe_id}/{report_id}_{received}_expenditures.csv --> {sboe_id}/expenditures/{report_id}_{received}_expenditures.csv
#
# Files that already sit in a folder named after their type are left alone,
# so the script can be re-run without nesting them one level deeper.
library(fs)

reports_dir <- here::here("data-raw", "reports")

# Move every file below `reports_dir` that matches `glob` into a `subdir`
# folder next to it, creating that folder where needed.
#
# Target folders are derived from the files being moved, so a folder that only
# holds CSVs (and no .txt file) still gets its subdirectory created.
#
# Returns the new paths invisibly (an empty character vector when there is
# nothing to move).
move_into_subdir <- function(glob, subdir) {
  files <- dir_ls(reports_dir, glob = glob, recurse = TRUE, type = "file")

  # Skip files whose parent folder is already `subdir` (already migrated).
  files <- files[path_file(path_dir(files)) != subdir]
  if (length(files) == 0) {
    return(invisible(character()))
  }

  destinations <- path(path_dir(files), subdir, path_file(files))
  dir_create(unique(path_dir(destinations)))
  file_move(files, destinations)

  invisible(destinations)
}

cli::cli_progress_step("Moving {.field all}")
move_into_subdir("*.txt", "all")

cli::cli_progress_step("Moving {.field receipts}")
move_into_subdir("*_receipts.csv", "receipts")

cli::cli_progress_step("Moving {.field expenditures}")
move_into_subdir("*_expenditures.csv", "expenditures")

cli::cli_progress_done()