2 лет назад · c87a804c04
--- a/R/get.R
+++ b/R/get.R
@@ -194,7 +194,7 @@ get_raw_report_all <- function(report_id) {
 }

 save_raw_report_all <- function(report_id, sboe_id, received) {
  dir <- here::here("data-raw", "reports", sboe_id)
  dir <- here::here("data-raw", "reports", sboe_id, "all")
  file_name <- paste0(report_id, "_", received)
  path <- path(dir, file_name, ext = "txt")

@@ -211,3 +211,117 @@ save_raw_report_all <- function(report_id, sboe_id, received) {

  invisible(path)
 }


 # Download every row of a report's receipts from the paged JSON endpoint.
 #
 # Page 0 is requested first; its `recordCountKey` gives the total number of
 # rows, and further pages are requested until that many rows are collected.
 #
 # Arguments:
 #   report_id  Report ID forwarded to `req_report_receipts()`.
 #   page_size  Page size to request. The size of the first page actually
 #              returned is what the progress message uses to estimate the
 #              number of pages.
 #
 # Returns the collected rows bound into one data frame by `map_dfr()`;
 # zero-length fields are replaced by `NULL` before binding.
 #
 # Aborts when the first response carries no usable `recordCountKey`, or when
 # a later page comes back empty before the expected row count is reached.
 get_report_receipts_json <- function(report_id, page_size = 500) {
  delay()

  # One request/parse round trip for a single page.
  fetch_page <- function(page) {
    report_id |>
      req_report_receipts(page = page, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  page <- 0

  cli::cli_progress_step("Getting page 0 of ??")
  res <- fetch_page(page)

  data <- res$Data$results
  total_rows <- res$Data$recordCountKey
  actual_page_size <- length(data)

  # Without a row count the loop condition below cannot be evaluated.
  if (length(total_rows) != 1) {
    cli::cli_abort(
      "Receipts response for report {report_id} has no usable {.field recordCountKey}."
    )
  }

  while (length(data) < total_rows) {
    page <- page + 1
    cli::cli_progress_step("Getting page {page} of {ceiling(total_rows / actual_page_size)} [{length(res$Data$results)}]")

    res <- fetch_page(page)
    page_rows <- res$Data$results

    # An empty page means the row count can never be reached; stop instead of
    # requesting pages forever.
    if (length(page_rows) == 0) {
      cli::cli_abort(c(
        "Page {page} of receipts for report {report_id} came back empty.",
        "i" = "Collected {length(data)} of {total_rows} expected rows."
      ))
    }

    data <- c(data, page_rows)
  }

  cli::cli_progress_step("Preparing data frame")
  data |>
    map_depth(2, \(x) if (length(x)) x) |>
    map_dfr(\(x) x)
 }

 # Save a report's receipts as a CSV file, downloading them only when the file
 # is not already on disk.
 #
 # The file lives at
 # data-raw/reports/{sboe_id}/receipts/{report_id}_{received}_receipts.csv
 # (resolved with `here::here()`). The path is returned in both cases:
 # visibly when the file already existed, invisibly after a fresh download.
 save_report_receipts_csv <- function(report_id, sboe_id, received) {
  out_dir <- here::here("data-raw", "reports", sboe_id, "receipts")
  stem <- paste0(report_id, "_", received, "_", "receipts")
  csv_path <- path(out_dir, stem, ext = "csv")

  if (!file_exists(csv_path)) {
    receipts <- get_report_receipts_json(report_id)

    # The directory is only created once the download has succeeded.
    dir_create(out_dir)
    write_csv(receipts, csv_path)

    return(invisible(csv_path))
  }

  # We don't need to re-download any reports
  csv_path
 }


 # Download every row of a report's expenditures from the paged JSON endpoint.
 #
 # Page 0 is requested first; its `recordCountKey` gives the total number of
 # rows, and further pages are requested until that many rows are collected.
 #
 # Arguments:
 #   report_id  Report ID forwarded to `req_report_expenditures()`.
 #   page_size  Page size to request. The size of the first page actually
 #              returned is what the progress message uses to estimate the
 #              number of pages.
 #
 # Returns the collected rows bound into one data frame by `map_dfr()`;
 # zero-length fields are replaced by `NULL` before binding.
 #
 # Aborts when the first response carries no usable `recordCountKey`, or when
 # a later page comes back empty before the expected row count is reached.
 get_report_expenditures_json <- function(report_id, page_size = 500) {
  delay()

  # One request/parse round trip for a single page.
  fetch_page <- function(page) {
    report_id |>
      req_report_expenditures(page = page, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  page <- 0

  cli::cli_progress_step("Getting expenses page 0 of ??")
  res <- fetch_page(page)

  data <- res$Data$results
  total_rows <- res$Data$recordCountKey
  actual_page_size <- length(data)

  # Without a row count the loop condition below cannot be evaluated.
  if (length(total_rows) != 1) {
    cli::cli_abort(
      "Expenditures response for report {report_id} has no usable {.field recordCountKey}."
    )
  }

  while (length(data) < total_rows) {
    page <- page + 1
    cli::cli_progress_step("Getting expenses page {page} of {ceiling(total_rows / actual_page_size)} [{length(res$Data$results)}]")

    res <- fetch_page(page)
    page_rows <- res$Data$results

    # An empty page means the row count can never be reached; stop instead of
    # requesting pages forever.
    if (length(page_rows) == 0) {
      cli::cli_abort(c(
        "Page {page} of expenditures for report {report_id} came back empty.",
        "i" = "Collected {length(data)} of {total_rows} expected rows."
      ))
    }

    data <- c(data, page_rows)
  }

  cli::cli_progress_step("Preparing expenses data frame")
  data |>
    map_depth(2, \(x) if (length(x)) x) |>
    map_dfr(\(x) x)
 }

 # Save a report's expenditures as a CSV file, downloading them only when the
 # file is not already on disk.
 #
 # The file lives at
 # data-raw/reports/{sboe_id}/expenditures/{report_id}_{received}_expenditures.csv
 # (resolved with `here::here()`). The path is returned in both cases:
 # visibly when the file already existed, invisibly after a fresh download.
 save_report_expenditures_csv <- function(report_id, sboe_id, received) {
  out_dir <- here::here("data-raw", "reports", sboe_id, "expenditures")
  stem <- paste0(report_id, "_", received, "_", "expenditures")
  csv_path <- path(out_dir, stem, ext = "csv")

  if (!file_exists(csv_path)) {
    expenditures <- get_report_expenditures_json(report_id)

    # The directory is only created once the download has succeeded.
    dir_create(out_dir)
    write_csv(expenditures, csv_path)

    return(invisible(csv_path))
  }

  # We don't need to re-download any reports
  csv_path
 }
--- a/R/urls.R
+++ b/R/urls.R
@@ -10,6 +10,14 @@ url_nc_cf_export_detail_results  <- function() {
  "https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/"
 }

 # URL of the GetReceipts endpoint used to fetch a report's receipts.
 url_nc_cf_report_get_receipts <- function() {
  endpoint <- "https://cf.ncsbe.gov/CFOrgLkup/GetReceipts"
  endpoint
 }

 # URL of the GetExpenditures endpoint used to fetch a report's expenditures.
 url_nc_cf_report_get_expenditures <- function() {
  endpoint <- "https://cf.ncsbe.gov/CFOrgLkup/GetExpenditures"
  endpoint
 }

 match_report_type <- function(report, collapse = TRUE) {
  report <- toupper(report)

@@ -73,3 +81,28 @@ req_report_detail <- function(report_id, section = "receipts") {

  req
 }

 # Build (but do not perform) the request for one page of a report's receipts.
 #
 # `report_id`, `page` and `page_size` are sent as the `ReportID`, `page` and
 # `pageSize` query parameters of the GetReceipts URL.
 req_report_receipts <- function(report_id, page = 0, page_size = 300) {
  url_nc_cf_report_get_receipts() |>
    request() |>
    req_url_query(
      ReportID = report_id,
      page = page,
      pageSize = page_size
    )
 }

 # Build (but do not perform) the request for one page of a report's
 # expenditures.
 #
 # `report_id`, `page` and `page_size` are sent as the `ReportID`, `page` and
 # `pageSize` query parameters of the GetExpenditures URL, together with a
 # fixed `ShowIEColumns` flag.
 req_report_expenditures <- function(report_id, page = 0, page_size = 300) {
  url_nc_cf_report_get_expenditures() |>
    request() |>
    req_url_query(
      ReportID = report_id,
      page = page,
      pageSize = page_size,
      ShowIEColumns = TRUE # this doesn't appear to do anything, but it's required
    )
 }
--- a/README.Rmd
+++ b/README.Rmd
@@ -54,3 +54,22 @@ https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/?ReportID=197247&Type=REC
 https://cf.ncsbe.gov/CFOrgLkup/ReportDetail/?RID=197247&TP=EXP
 ```

 ## Trying yet again

 It turns out that using the "Export data to CSV" link from a
 [page like this one](https://cf.ncsbe.gov/CFOrgLkup/ReportDetail/?RID=205761&TP=ALL)
 will return a very badly formatted CSV file, potentially full of errors.

 On the other hand, we can get the pieces via:

 ```
 # Receipts
 # (paged, unclear page size)
 https://cf.ncsbe.gov/CFOrgLkup/GetReceipts?ReportID={report_id}&page=0&pageSize=100

 # Expenses
 # (paged, unclear page size)
 https://cf.ncsbe.gov/CFOrgLkup/GetExpenditures?ReportID={report_id}&page=0&pageSize=100&ShowIEColumns=true


 ```
--- a/_targets.R
+++ b/_targets.R
@@ -65,10 +65,54 @@ list(
    ),
    format = "file_fast"
  ),
  # Gets the JSON version of the report's receipts via an internal API call,
  # then processes it into a standard table before saving it as CSV.
  tar_target(
    parquet_report_cover_path,
    write_reports_by_sboe_id(report_list_sboe_id),
    pattern = map(unique(report_list_sboe_id)),
    report_receipts_csv_path,
    save_report_receipts_csv(
      report_list_report_id,
      report_list_sboe_id,
      report_list_received
    ),
    pattern = map(
      report_list_report_id,
      report_list_sboe_id,
      report_list_received
    ),
    format = "file_fast"
  ),
  # Gets the JSON version of the report's expenses via an internal API call,
  # then processes it into a standard table before saving it as CSV.
  tar_target(
    report_expenditures_csv_path,
    save_report_expenditures_csv(
      report_list_report_id,
      report_list_sboe_id,
      report_list_received
    ),
    pattern = map(
      report_list_report_id,
      report_list_sboe_id,
      report_list_received
    ),
    format = "file_fast"
  ),
  tar_target(
    dirs_all,
     fs::dir_ls("data-raw/reports", glob = "**/all", recurse = TRUE, type = "directory")
  ),
  tar_target(
    dirs_receipts,
     fs::dir_ls("data-raw/reports", glob = "**/receipts", recurse = TRUE, type = "directory")
  ),
  tar_target(
    dirs_expenditures,
     fs::dir_ls("data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory")
  )
  # tar_target(
  #   parquet_report_cover_path,
  #   write_reports_by_sboe_id(report_list_sboe_id),
  #   pattern = map(unique(report_list_sboe_id)),
  #   format = "file_fast"
  # )
 )
--- a/_targets/meta/meta
+++ b/_targets/meta/meta
--- a/run.R
+++ b/run.R
@@ -19,12 +19,11 @@ opts <- docopt(doc)
 Sys.setenv("IN_TARGETS" = "true")
 Sys.setenv("ALLOW_DOWNLOADS" = "true")

 targets::tar_make_future(workers = 8)
 if (opts$all) {
  cli::cli_alert_into("Running all targets.")
  cli::cli_alert_info("Running all targets.")
  targets::tar_make()
 } else {
  cli::cli_alert_into("Running targets: {.and {.field {targets}}}")
  targets::tar_make(targets::any_of(opts$targets))
  cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}")
  targets::tar_make(targets::any_of(!!opts$targets))
 }
 # targets::tar_make_clustermq(workers = 2) # nolint
--- a/scripts/2023-09-24_organize-data-raw-into-subfolders.R
+++ b/scripts/2023-09-24_organize-data-raw-into-subfolders.R
@@ -0,0 +1,38 @@
 # I originally dumped everything in data-raw/reports/{sboe_id}/
 # but I realized that things will work out best with targets if I can point
 # to a single directory for each type of file that needs to be processed.
 #
 # This script moves the files into the appropriate subdirectories.
 #
 # {sboe_id}/{report_id}_{received}.txt --> {sboe_id}/all/{report_id}_{received}.txt
 # {sboe_id}/{report_id}_{received}_receipts.csv --> {sboe_id}/receipts/{report_id}_{received}_receipts.csv
 # {sboe_id}/{report_id}_{received}_expenditures.csv --> {sboe_id}/expenditures/{report_id}_{received}_expenditures.csv

 library(fs)

 reports_dir <- here::here("data-raw", "reports")

 # Move every file under `reports_dir` that matches `glob` into a `subdir`
 # folder created next to it. Returns the new paths invisibly.
 move_into_subdir <- function(glob, subdir) {
  found <- dir_ls(reports_dir, glob = glob, recurse = TRUE)

  # Files whose parent folder is already `subdir` were moved by an earlier
  # run; skipping them keeps the script safe to re-run.
  found <- found[path_file(path_dir(found)) != subdir]
  moved <- path(path_dir(found), subdir, path_file(found))

  # Create the target folder for every file being moved, rather than only for
  # the folders that happen to contain a .txt report.
  dir_create(unique(path_dir(moved)))
  file_move(found, moved)

  invisible(moved)
 }

 cli::cli_progress_step("Moving {.field all}")
 move_into_subdir("*.txt", "all")

 cli::cli_progress_step("Moving {.field receipts}")
 move_into_subdir("*_receipts.csv", "receipts")

 cli::cli_progress_step("Moving {.field expenditures}")
 move_into_subdir("*_expenditures.csv", "expenditures")

 cli::cli_progress_done()