Garrick Aden-Buie 2 лет назад
Родитель
Сommit
7b988094ca
Не найден GPG ключ соответствующий данной подписи
6 измененных файлов: 14145 добавлений и 9374 удалений
  1. +72
    -24
      process/R/read_report_file.R
  2. +12
    -3
      process/R/record_problems.R
  3. +7
    -0
      process/R/report_path_info.R
  4. +5
    -0
      process/README.Rmd
  5. +9
    -2
      process/_targets.R
  6. +14040
    -9345
      process/_targets/meta/meta

+ 72
- 24
process/R/read_report_file.R Просмотреть файл

@@ -6,11 +6,24 @@ read_report_file <- function(report_path) {
sections_raw <- trimws(sections_raw)
sections_raw <- sections_raw[nzchar(sections_raw)]

sections <- lapply(sections_raw, read_report_section, info = info, report_path = report_path)
sections <-
sections_raw |>
map(read_report_section, info = info, report_path = report_path)

purrr::flatten(sections)
}

# Names (snake_case section titles) of report sections that are not read
# by read_report_section().
skip_report_sections <- function() {
  # receipts and expenditures are collected separately
  collected_separately <- c("receipts", "expenditures")

  # the debts tables have problems, so they are skipped
  debts_tables <- c(
    "debts_owed_to_the_committee",
    "debts_owed_by_the_committee"
  )

  c(collected_separately, debts_tables)
}

read_report_section <- function(section, info, report_path) {
if (!grepl("^[A-Z ]+\n", section)) {
# browser()
@@ -18,46 +31,79 @@ read_report_section <- function(section, info, report_path) {
}
title <- snakecase::to_snake_case(sub("\n.+", "", section))

if (title %in% skip_report_sections()) {
return(NULL)
}

#remove title
body <- sub("^[A-Z ]+\n", "", section)
header <- strsplit(body, "\n")[[1]][[1]]
# trailing commas should be on the previous line
body <- gsub("(\\w) ?\n,,,", "\\1,,,", body)
# # trailing commas should be on the previous line
# body <- gsub("(\\w) ?\n,,,", "\\1,,,", body)
# remove header
body <- trimws(sub(header, "", body, fixed = TRUE))
body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)
# body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)

body <- pre_process_table_body(title, header, body)

csv <- paste0(header, "\n", body)
data <- read_csv(I(csv), show_col_types = FALSE, col_types = cols(.default = "c"))

# browser(expr = title == "cover" && ncol(data) != 16)

if (nrow(problems(data))) {
browser()
problems <- problems(data)
problems$file <- report_path
problems$section <- title
path <- here::here("data-raw", "reports", "read-raw-reports-problems.csv")
problems |>
write_csv(path, append = fs::file_exists(path))
}
record_problems(data, label = title, path = report_path)

if ("SBoE ID" %in% names(data)) {
names(data)[which("SBoE ID" == names(data))] <- "sboe_id"
}
names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)

post_process_steps_for_table(data, title)

data <- mutate(data, !!!info, .before = 1)
structure(list(data), names = title)
}

read_reports_by_sboe_id <- function(sboe_id) {
raw_paths <- fs::dir_ls(
here::here("data-raw", "reports", sboe_id, "all"),
glob = "*.txt"
# Repair the body of a report table whose records were broken across lines.
#
# Only the "accounts" table is touched; the body of any other table is
# returned unchanged. A body line is considered complete when, after removing
# double-quoted fields, it contains at least as many commas as the header.
# A line with fewer commas is joined (with a single space) to the line that
# follows it, repeatedly, until it is complete.
#
# Arguments:
#   table  - section title (string); only "accounts" triggers the repair.
#   header - the CSV header line of the table (string).
#   body   - the table body, lines separated by "\n" (string).
#
# Returns: the (possibly repaired) body as a single "\n"-separated string.
pre_process_table_body <- function(table, header, body) {
  if (table != "accounts") {
    return(body)
  }

  # Count literal commas in each element of a character vector.
  count_commas <- function(x) {
    lengths(regmatches(x, gregexpr(",", x, fixed = TRUE)))
  }

  # Remove double-quoted fields so commas inside quotes are not counted.
  strip_quoted_fields <- function(x) {
    gsub('("[^"]+")', "", x)
  }

  exp_commas <- count_commas(header)

  body_lines <- strsplit(body, "\n")[[1]]
  body_lines_no_quoted_fields <- strip_quoted_fields(body_lines)

  # Nothing to repair when every line already matches the header.
  if (all(count_commas(body_lines_no_quoted_fields) == exp_commas)) {
    return(body)
  }

  i <- 1
  while (i < length(body_lines)) {
    if (count_commas(body_lines_no_quoted_fields[i]) >= exp_commas) {
      i <- i + 1
      next
    }

    # Line i is short: fold the next line into it and drop that next line.
    body_lines[i] <- paste(body_lines[i], body_lines[i + 1], sep = " ")
    body_lines <- body_lines[-(i + 1)]
    body_lines_no_quoted_fields <- body_lines_no_quoted_fields[-(i + 1)]
    # Refresh the stripped copy of the merged line; without this the stale
    # comma count would make the loop swallow every remaining line.
    body_lines_no_quoted_fields[i] <- strip_quoted_fields(body_lines[i])
  }

  paste(body_lines, collapse = "\n")
}

post_process_steps_for_table <- function(data, table) {
switch(
table,
forgiven_loans = ,
loan_proceeds = filter(data, !is.na(amount)),
data
)
names(raw_paths) <- fs::path_rel(raw_paths, here::here("data-raw", "reports"))
}

process_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) {
all_exports <- dir_ls(dir_sboe_id, glob = "*.txt")
info <- report_info_in_report_list(all_exports, report_list)

raw_paths |>
info$path |>
map(read_report_file) |>
transpose() |>
map(list_rbind) |>
@@ -77,10 +123,12 @@ report_data_set_column_type <- function(data) {
)
}

write_reports_by_sboe_id <- function(sboe_id) {
reports <- read_reports_by_sboe_id(sboe_id)
write_processed_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) {
reports <- process_report_export(dir_sboe_id, report_list)

sboe_id <- report_path_info(dir_sboe_id)$sboe_id

base_dir <- here::here("data")
base_dir <- here::here("..", "data")
sboe_id_param <- sprintf("sboe_id=%s", sboe_id)

return_path <- c()

+ 12
- 3
process/R/record_problems.R Просмотреть файл

@@ -1,11 +1,20 @@
record_problems <- function(x, path, label) {
if (nrow(problems(x)) == 0) {
record_problems <- function(x, label, path = NULL) {
probs <- problems(x)
if (nrow(probs) == 0) {
return(invisible())
}

cli::cli_inform(
"{nrow(probs)} problem{?s} in parsing the {label} table."
)

problem_dir <- here::here("..", "problems")
problem_path <- path(problem_dir, label, ext = "csv")
dir_create(problem_dir)

write_csv(problems(x), problem_path, append = file_exists(problem_path))
if (!is.null(path)) {
probs$file <- path
}

write_csv(probs, problem_path, append = file_exists(problem_path))
}

+ 7
- 0
process/R/report_path_info.R Просмотреть файл

@@ -9,3 +9,10 @@ report_path_info <- function(report_path) {
report_id = map_int(x, \(x) as.integer(strsplit(x[3], "_")[[1]][1])),
)
}

# Build the path info for `files` and keep only the rows whose
# (sboe_id, report_id) pair also appears in `report_list`.
#
# Arguments:
#   files       - report file paths, passed to report_path_info().
#   report_list - table with `sboe_id` and `report_id` columns; defaults to
#                 the `report_list` target read via tar_read().
#
# Returns: the output of report_path_info() with an added `path` column,
# filtered by a semi-join against `report_list`.
report_info_in_report_list <- function(files, report_list = tar_read(report_list)) {
  path_info <- report_path_info(files)
  # Keep the original path alongside the parsed fields.
  path_info$path <- files

  semi_join(path_info, report_list, by = c("sboe_id", "report_id"))
}

+ 5
- 0
process/README.Rmd Просмотреть файл

@@ -0,0 +1,5 @@
## Callouts

Occasionally, `occur_date` in the `expenditures` table may be missing.
If it matters, I think we could fill these in with a fixed date in the reporting period.
For example: the last day or the mid point of the reporting period.

+ 9
- 2
process/_targets.R Просмотреть файл

@@ -11,10 +11,10 @@ tar_option_set(
packages = strsplit(desc::desc_get_field("Depends"), ", ")[[1]],
# For distributed computing in tar_make(), supply a {crew} controller
# as discussed at https://books.ropensci.org/targets/crew.html.
controller = crew::crew_controller_local(workers = 12),
controller = crew::crew_controller_local(workers = 24),
# debug = "path_receipts_parquet_8d195f7e",
# cue = tar_cue(mode = "never")
error = "stop"
error = "null"
)

# Run the R scripts in the R/ folder with your custom functions:
@@ -50,5 +50,12 @@ list(
write_expenditures_parquet(dirs_expenditures, report_list),
pattern = map(dirs_expenditures),
format = "file"
),

tar_target(
paths_all_parquet,
write_processed_report_export(dirs_all, report_list),
pattern = map(dirs_all),
format = "file"
)
)

+ 14040
- 9345
process/_targets/meta/meta
Разница между файлами не показана из-за своего большого размера
Просмотреть файл


Загрузка…
Отмена
Сохранить