Просмотр исходного кода

getting receipts and expenditures worked out

main
Garrick Aden-Buie 2 лет назад
Родитель
Сommit
c87a804c04
Не найден GPG ключ соответствующий данной подписи
7 измененных файлов: 143994 добавлений и 46834 удалений
  1. +115
    -1
      R/get.R
  2. +33
    -0
      R/urls.R
  3. +19
    -0
      README.Rmd
  4. +47
    -3
      _targets.R
  5. +143739
    -46826
      _targets/meta/meta
  6. +3
    -4
      run.R
  7. +38
    -0
      scripts/2023-09-24_organize-data-raw-into-subfolders.R

+ 115
- 1
R/get.R Просмотреть файл

@@ -194,7 +194,7 @@ get_raw_report_all <- function(report_id) {
}

save_raw_report_all <- function(report_id, sboe_id, received) {
dir <- here::here("data-raw", "reports", sboe_id)
dir <- here::here("data-raw", "reports", sboe_id, "all")
file_name <- paste0(report_id, "_", received)
path <- path(dir, file_name, ext = "txt")

@@ -211,3 +211,117 @@ save_raw_report_all <- function(report_id, sboe_id, received) {

invisible(path)
}


# Download every receipt row of one report and return them as a data frame.
#
# Requests page 0 from the receipts endpoint (via `req_report_receipts()`),
# reads the total row count from `Data$recordCountKey`, and keeps requesting
# further pages until that many rows have been collected.
#
# Arguments:
#   report_id  Report identifier, forwarded to `req_report_receipts()`.
#   page_size  Number of rows requested per page.
#
# Returns the collected rows bound into a data frame by `map_dfr()`.
#
# Errors if the first response carries no row count, or if a later page comes
# back empty before all announced rows were received (otherwise the loop
# would never terminate and keep hitting the server).
get_report_receipts_json <- function(report_id, page_size = 500) {
  # NOTE(review): presumably throttles requests; `delay()` is defined elsewhere.
  delay()

  # One request/parse round trip for a single page of results.
  fetch_page <- function(page_number) {
    report_id |>
      req_report_receipts(page = page_number, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  page <- 0

  cli::cli_progress_step("Getting page 0 of ??")
  res <- fetch_page(page)

  data <- res$Data$results
  total_rows <- res$Data$recordCountKey

  if (length(total_rows) != 1 || is.na(total_rows)) {
    cli::cli_abort(c(
      "The receipts response did not include a usable row count.",
      i = "Report ID: {report_id}"
    ))
  }

  # The server may cap the page size, so use what it actually returned to
  # estimate the number of pages shown in the progress message.
  actual_page_size <- length(data)

  while (length(data) < total_rows) {
    page <- page + 1
    cli::cli_progress_step(paste0(
      "Getting page {page} of ",
      "{ceiling(total_rows / actual_page_size)} ",
      "[{length(res$Data$results)}]"
    ))

    res <- fetch_page(page)

    # An empty page means no further progress is possible; stop instead of
    # requesting ever-higher page numbers forever.
    if (length(res$Data$results) == 0) {
      cli::cli_abort(c(
        "Page {page} of the receipts for report {report_id} was empty.",
        i = "Collected {length(data)} of {total_rows} expected rows."
      ))
    }

    data <- c(data, res$Data$results)
  }

  cli::cli_progress_step("Preparing data frame")
  data |>
    # Zero-length fields become NULL (an `if` without `else` yields NULL),
    # presumably so row-binding treats them as missing values.
    map_depth(2, \(x) if (length(x)) x) |>
    map_dfr(\(x) x)
}

# Download a report's receipts and store them as a CSV file, unless that file
# already exists.
#
# The file is written to
# data-raw/reports/{sboe_id}/receipts/{report_id}_{received}_receipts.csv
# (resolved with `here::here()`).
#
# Returns the CSV path: visibly when the file was already present, invisibly
# after a fresh download.
save_report_receipts_csv <- function(report_id, sboe_id, received) {
  out_dir <- here::here("data-raw", "reports", sboe_id, "receipts")
  csv_path <- path(
    out_dir,
    paste0(report_id, "_", received, "_", "receipts"),
    ext = "csv"
  )

  if (!file_exists(csv_path)) {
    receipts <- get_report_receipts_json(report_id)

    dir_create(out_dir)
    write_csv(receipts, csv_path)

    return(invisible(csv_path))
  }

  # Already on disk: reports do not need to be downloaded again.
  csv_path
}


# Download every expenditure row of one report and return them as a data
# frame.
#
# Requests page 0 from the expenditures endpoint (via
# `req_report_expenditures()`), reads the total row count from
# `Data$recordCountKey`, and keeps requesting further pages until that many
# rows have been collected.
#
# Arguments:
#   report_id  Report identifier, forwarded to `req_report_expenditures()`.
#   page_size  Number of rows requested per page.
#
# Returns the collected rows bound into a data frame by `map_dfr()`.
#
# Errors if the first response carries no row count, or if a later page comes
# back empty before all announced rows were received (otherwise the loop
# would never terminate and keep hitting the server).
get_report_expenditures_json <- function(report_id, page_size = 500) {
  # NOTE(review): presumably throttles requests; `delay()` is defined elsewhere.
  delay()

  # One request/parse round trip for a single page of results.
  fetch_page <- function(page_number) {
    report_id |>
      req_report_expenditures(page = page_number, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  page <- 0

  cli::cli_progress_step("Getting expenses page 0 of ??")
  res <- fetch_page(page)

  data <- res$Data$results
  total_rows <- res$Data$recordCountKey

  if (length(total_rows) != 1 || is.na(total_rows)) {
    cli::cli_abort(c(
      "The expenditures response did not include a usable row count.",
      i = "Report ID: {report_id}"
    ))
  }

  # The server may cap the page size, so use what it actually returned to
  # estimate the number of pages shown in the progress message.
  actual_page_size <- length(data)

  while (length(data) < total_rows) {
    page <- page + 1
    cli::cli_progress_step(paste0(
      "Getting expenses page {page} of ",
      "{ceiling(total_rows / actual_page_size)} ",
      "[{length(res$Data$results)}]"
    ))

    res <- fetch_page(page)

    # An empty page means no further progress is possible; stop instead of
    # requesting ever-higher page numbers forever.
    if (length(res$Data$results) == 0) {
      cli::cli_abort(c(
        "Page {page} of the expenditures for report {report_id} was empty.",
        i = "Collected {length(data)} of {total_rows} expected rows."
      ))
    }

    data <- c(data, res$Data$results)
  }

  cli::cli_progress_step("Preparing expenses data frame")
  data |>
    # Zero-length fields become NULL (an `if` without `else` yields NULL),
    # presumably so row-binding treats them as missing values.
    map_depth(2, \(x) if (length(x)) x) |>
    map_dfr(\(x) x)
}

# Download a report's expenditures and store them as a CSV file, unless that
# file already exists.
#
# The file is written to
# data-raw/reports/{sboe_id}/expenditures/{report_id}_{received}_expenditures.csv
# (resolved with `here::here()`).
#
# Returns the CSV path: visibly when the file was already present, invisibly
# after a fresh download.
save_report_expenditures_csv <- function(report_id, sboe_id, received) {
  out_dir <- here::here("data-raw", "reports", sboe_id, "expenditures")
  csv_path <- path(
    out_dir,
    paste0(report_id, "_", received, "_", "expenditures"),
    ext = "csv"
  )

  if (!file_exists(csv_path)) {
    expenditures <- get_report_expenditures_json(report_id)

    dir_create(out_dir)
    write_csv(expenditures, csv_path)

    return(invisible(csv_path))
  }

  # Already on disk: reports do not need to be downloaded again.
  csv_path
}

+ 33
- 0
R/urls.R Просмотреть файл

@@ -10,6 +10,14 @@ url_nc_cf_export_detail_results <- function() {
"https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/"
}

# URL of the endpoint that serves a report's receipts.
url_nc_cf_report_get_receipts <- function() {
  endpoint <- "https://cf.ncsbe.gov/CFOrgLkup/GetReceipts"
  endpoint
}

# URL of the endpoint that serves a report's expenditures.
url_nc_cf_report_get_expenditures <- function() {
  endpoint <- "https://cf.ncsbe.gov/CFOrgLkup/GetExpenditures"
  endpoint
}

match_report_type <- function(report, collapse = TRUE) {
report <- toupper(report)

@@ -73,3 +81,28 @@ req_report_detail <- function(report_id, section = "receipts") {

req
}

# Build (but do not perform) the request for one page of a report's receipts.
#
# Arguments:
#   report_id  Sent as the `ReportID` query parameter.
#   page       Page number, sent as `page`.
#   page_size  Rows per page, sent as `pageSize`.
#
# Returns the request object produced by `req_url_query()`.
req_report_receipts <- function(report_id, page = 0, page_size = 300) {
  url_nc_cf_report_get_receipts() |>
    request() |>
    req_url_query(
      ReportID = report_id,
      page = page,
      pageSize = page_size
    )
}

# Build (but do not perform) the request for one page of a report's
# expenditures.
#
# Arguments:
#   report_id  Sent as the `ReportID` query parameter.
#   page       Page number, sent as `page`.
#   page_size  Rows per page, sent as `pageSize`.
#
# Returns the request object produced by `req_url_query()`.
req_report_expenditures <- function(report_id, page = 0, page_size = 300) {
  url_nc_cf_report_get_expenditures() |>
    request() |>
    req_url_query(
      ReportID = report_id,
      page = page,
      pageSize = page_size,
      # This doesn't appear to do anything, but the endpoint requires it.
      ShowIEColumns = TRUE
    )
}

+ 19
- 0
README.Rmd Просмотреть файл

@@ -54,3 +54,22 @@ https://cf.ncsbe.gov/CFOrgLkup/ExportDetailResults/?ReportID=197247&Type=REC
https://cf.ncsbe.gov/CFOrgLkup/ReportDetail/?RID=197247&TP=EXP
```

## Trying yet again

It turns out that using the "Export data to CSV" link from a
[page like this one](https://cf.ncsbe.gov/CFOrgLkup/ReportDetail/?RID=205761&TP=ALL)
will return a very badly formatted CSV file, potentially full of errors.

On the other hand, we can get the pieces via:

```
# Receipts
# (paged, unclear page size)
https://cf.ncsbe.gov/CFOrgLkup/GetReceipts?ReportID={report_id}&page=0&pageSize=100

# Expenses
# (paged, unclear page size)
https://cf.ncsbe.gov/CFOrgLkup/GetExpenditures?ReportID={report_id}&page=0&pageSize=100&ShowIEColumns=true


```

+ 47
- 3
_targets.R Просмотреть файл

@@ -65,10 +65,54 @@ list(
),
format = "file_fast"
),
# Gets the JSON version of the report's receipts via an internal API call,
# that is processed into a standard table before saving as CSV.
tar_target(
parquet_report_cover_path,
write_reports_by_sboe_id(report_list_sboe_id),
pattern = map(unique(report_list_sboe_id)),
report_receipts_csv_path,
save_report_receipts_csv(
report_list_report_id,
report_list_sboe_id,
report_list_received
),
pattern = map(
report_list_report_id,
report_list_sboe_id,
report_list_received
),
format = "file_fast"
),
# Gets the JSON version of the report's expenses via an internal API call,
# that is processed into a standard table before saving as CSV.
tar_target(
report_expenditures_csv_path,
save_report_expenditures_csv(
report_list_report_id,
report_list_sboe_id,
report_list_received
),
pattern = map(
report_list_report_id,
report_list_sboe_id,
report_list_received
),
format = "file_fast"
),
tar_target(
dirs_all,
fs::dir_ls("data-raw/reports", glob = "**/all", recurse = TRUE, type = "directory")
),
tar_target(
dirs_receipts,
fs::dir_ls("data-raw/reports", glob = "**/receipts", recurse = TRUE, type = "directory")
),
tar_target(
dirs_expenditures,
fs::dir_ls("data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory")
)
# tar_target(
# parquet_report_cover_path,
# write_reports_by_sboe_id(report_list_sboe_id),
# pattern = map(unique(report_list_sboe_id)),
# format = "file_fast"
# )
)

+ 143739
- 46826
_targets/meta/meta
Разница между файлами не показана из-за своего большого размера
Просмотреть файл


+ 3
- 4
run.R Просмотреть файл

@@ -19,12 +19,11 @@ opts <- docopt(doc)
Sys.setenv("IN_TARGETS" = "true")
Sys.setenv("ALLOW_DOWNLOADS" = "true")

targets::tar_make_future(workers = 8)
if (opts$all) {
cli::cli_alert_into("Running all targets.")
cli::cli_alert_info("Running all targets.")
targets::tar_make()
} else {
cli::cli_alert_into("Running targets: {.and {.field {targets}}}")
targets::tar_make(targets::any_of(opts$targets))
cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}")
targets::tar_make(targets::any_of(!!opts$targets))
}
# targets::tar_make_clustermq(workers = 2) # nolint

+ 38
- 0
scripts/2023-09-24_organize-data-raw-into-subfolders.R Просмотреть файл

@@ -0,0 +1,38 @@
# I originally dumped everything in data-raw/reports/{sboe_id}/
# but I realized that things will work out best with targets if I can point
# to a single directory for each type of file that needs to be processed.
#
# This script moves the files into the appropriate subdirectories.
#
# {sboe_id}/{report_id}_{received}.txt --> {sboe_id}/all/{report_id}_{received}.txt
# {sboe_id}/{report_id}_{received}_receipts.csv --> {sboe_id}/receipts/{report_id}_{received}_receipts.csv
# {sboe_id}/{report_id}_{received}_expenditures.csv --> {sboe_id}/expenditures/{report_id}_{received}_expenditures.csv
#
# The script is safe to re-run: files that already sit inside their target
# subfolder are left where they are.

library(fs)

reports_dir <- here::here("data-raw", "reports")

# Move every file below `reports_dir` that matches `glob` into a directory
# named `subfolder`, created next to the file.
#
# Files whose parent directory is already called `subfolder` are skipped;
# without this a second run would nest them one level deeper
# (e.g. {sboe_id}/all/all/...). Destination directories are derived from the
# files being moved, so each file's target directory exists before the move.
#
# Returns (invisibly) the new paths, or an empty character vector when there
# was nothing to move.
move_into_subfolder <- function(glob, subfolder) {
  files <- dir_ls(reports_dir, glob = glob, recurse = TRUE, type = "file")
  files <- files[path_file(path_dir(files)) != subfolder]

  if (length(files) == 0) {
    return(invisible(character()))
  }

  destinations <- path(path_dir(files), subfolder, path_file(files))
  dir_create(unique(path_dir(destinations)))

  invisible(file_move(files, destinations))
}

cli::cli_progress_step("Moving {.field all}")
move_into_subfolder("*.txt", "all")

cli::cli_progress_step("Moving {.field receipts}")
move_into_subfolder("*_receipts.csv", "receipts")

cli::cli_progress_step("Moving {.field expenditures}")
move_into_subfolder("*_expenditures.csv", "expenditures")

cli::cli_progress_done()

Загрузка…
Отмена
Сохранить