|
|
|
@@ -6,11 +6,24 @@ read_report_file <- function(report_path) { |
|
|
|
sections_raw <- trimws(sections_raw) |
|
|
|
sections_raw <- sections_raw[nzchar(sections_raw)] |
|
|
|
|
|
|
|
sections <- lapply(sections_raw, read_report_section, info = info, report_path = report_path) |
|
|
|
sections <- |
|
|
|
sections_raw |> |
|
|
|
map(read_report_section, info = info, report_path = report_path) |
|
|
|
|
|
|
|
purrr::flatten(sections) |
|
|
|
} |
|
|
|
|
|
|
|
# Titles (snake_case) of the report sections that the section reader skips.
#
# Returns a character vector of section titles.
skip_report_sections <- function() {
  # receipts and expenditures are collected separately
  collected_separately <- c("receipts", "expenditures")

  # the debts tables have __problems__
  problematic_debts <- c(
    "debts_owed_to_the_committee",
    "debts_owed_by_the_committee"
  )

  c(collected_separately, problematic_debts)
}
|
|
|
|
|
|
|
read_report_section <- function(section, info, report_path) { |
|
|
|
if (!grepl("^[A-Z ]+\n", section)) { |
|
|
|
# browser() |
|
|
|
@@ -18,46 +31,79 @@ read_report_section <- function(section, info, report_path) { |
|
|
|
} |
|
|
|
title <- snakecase::to_snake_case(sub("\n.+", "", section)) |
|
|
|
|
|
|
|
if (title %in% skip_report_sections()) { |
|
|
|
return(NULL) |
|
|
|
} |
|
|
|
|
|
|
|
#remove title |
|
|
|
body <- sub("^[A-Z ]+\n", "", section) |
|
|
|
header <- strsplit(body, "\n")[[1]][[1]] |
|
|
|
# trailing commas should be on the previous line |
|
|
|
body <- gsub("(\\w) ?\n,,,", "\\1,,,", body) |
|
|
|
# # trailing commas should be on the previous line |
|
|
|
# body <- gsub("(\\w) ?\n,,,", "\\1,,,", body) |
|
|
|
# remove header |
|
|
|
body <- trimws(sub(header, "", body, fixed = TRUE)) |
|
|
|
body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body) |
|
|
|
# body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body) |
|
|
|
|
|
|
|
body <- pre_process_table_body(title, header, body) |
|
|
|
|
|
|
|
csv <- paste0(header, "\n", body) |
|
|
|
data <- read_csv(I(csv), show_col_types = FALSE, col_types = cols(.default = "c")) |
|
|
|
|
|
|
|
# browser(expr = title == "cover" && ncol(data) != 16) |
|
|
|
|
|
|
|
if (nrow(problems(data))) { |
|
|
|
browser() |
|
|
|
problems <- problems(data) |
|
|
|
problems$file <- report_path |
|
|
|
problems$section <- title |
|
|
|
path <- here::here("data-raw", "reports", "read-raw-reports-problems.csv") |
|
|
|
problems |> |
|
|
|
write_csv(path, append = fs::file_exists(path)) |
|
|
|
} |
|
|
|
record_problems(data, label = title, path = report_path) |
|
|
|
|
|
|
|
if ("SBoE ID" %in% names(data)) { |
|
|
|
names(data)[which("SBoE ID" == names(data))] <- "sboe_id" |
|
|
|
} |
|
|
|
names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3) |
|
|
|
|
|
|
|
post_process_steps_for_table(data, title) |
|
|
|
|
|
|
|
data <- mutate(data, !!!info, .before = 1) |
|
|
|
structure(list(data), names = title) |
|
|
|
} |
|
|
|
|
|
|
|
read_reports_by_sboe_id <- function(sboe_id) { |
|
|
|
raw_paths <- fs::dir_ls( |
|
|
|
here::here("data-raw", "reports", sboe_id, "all"), |
|
|
|
glob = "*.txt" |
|
|
|
# Re-join rows of the "accounts" table that were split over several lines.
#
# A row is considered complete when, ignoring commas inside double-quoted
# fields, it holds at least as many commas as the header. Any shorter line is
# glued to the line that follows it (separated by one space) until it is
# complete.
#
# table  : snake_case section title; only "accounts" is processed.
# header : the CSV header line of the section (single string).
# body   : the CSV body of the section (single string, lines separated by "\n").
#
# Returns the body as a single string. It is returned untouched for every
# other table and when every line already has exactly the expected number of
# commas.
pre_process_table_body <- function(table, header, body) {
  if (table != "accounts") {
    return(body)
  }

  # Number of commas in each element of `x` (0 when there are none).
  count_commas <- function(x) {
    lengths(regmatches(x, gregexpr(",", x, fixed = TRUE)))
  }
  # Drop double-quoted fields so that commas inside them are not counted.
  strip_quoted_fields <- function(x) {
    gsub('("[^"]+")', "", x)
  }

  exp_commas <- count_commas(header)
  body_lines <- strsplit(body, "\n")[[1]]

  if (all(count_commas(strip_quoted_fields(body_lines)) == exp_commas)) {
    return(body)
  }

  i <- 1
  while (i < length(body_lines)) {
    # The count is recomputed from the current (possibly already merged) line,
    # so merging stops as soon as the row is complete and a quoted field that
    # was split over two lines is recognised once both halves are joined.
    if (count_commas(strip_quoted_fields(body_lines[i])) >= exp_commas) {
      i <- i + 1
      next
    }

    body_lines[i] <- paste(body_lines[i], body_lines[i + 1], sep = " ")
    body_lines <- body_lines[-(i + 1)]
  }

  paste(body_lines, collapse = "\n")
}
|
|
|
|
|
|
|
# Apply table-specific clean-up to a section that has already been parsed.
#
# data  : the parsed section as a data frame.
# table : snake_case section title used to pick the clean-up step.
#
# Returns the cleaned data frame. For "forgiven_loans" and "loan_proceeds"
# the rows whose `amount` is NA are dropped; every other table is returned
# unchanged. The result is only returned, not assigned, so callers must
# capture it.
post_process_steps_for_table <- function(data, table) {
  switch(
    table,
    # an empty argument makes switch() fall through to the next entry
    forgiven_loans = ,
    loan_proceeds = dplyr::filter(data, !is.na(amount)),
    data
  )
}
|
|
|
|
|
|
|
process_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) { |
|
|
|
all_exports <- dir_ls(dir_sboe_id, glob = "*.txt") |
|
|
|
info <- report_info_in_report_list(all_exports, report_list) |
|
|
|
|
|
|
|
raw_paths |> |
|
|
|
info$path |> |
|
|
|
map(read_report_file) |> |
|
|
|
transpose() |> |
|
|
|
map(list_rbind) |> |
|
|
|
@@ -77,10 +123,12 @@ report_data_set_column_type <- function(data) { |
|
|
|
) |
|
|
|
} |
|
|
|
|
|
|
|
write_reports_by_sboe_id <- function(sboe_id) { |
|
|
|
reports <- read_reports_by_sboe_id(sboe_id) |
|
|
|
write_processed_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) { |
|
|
|
reports <- process_report_export(dir_sboe_id, report_list) |
|
|
|
|
|
|
|
sboe_id <- report_path_info(dir_sboe_id)$sboe_id |
|
|
|
|
|
|
|
base_dir <- here::here("data") |
|
|
|
base_dir <- here::here("..", "data") |
|
|
|
sboe_id_param <- sprintf("sboe_id=%s", sboe_id) |
|
|
|
|
|
|
|
return_path <- c() |