|
# Read one raw report file and parse every section it contains.
#
# The file is read as a single string and split into sections on blank
# lines; each non-empty section is handed to `read_report_section()`.
#
# @param report_path Path to the raw report text file.
# @return A list with one element per section, named by the section title
#   (as produced by `read_report_section()`).
read_report_file <- function(report_path) {
  info <- report_path_info(report_path)
  # `brio::read_file()` returns the whole file as one string, not lines.
  contents <- brio::read_file(report_path)

  # Sections are separated by a blank line (LF or CRLF line endings).
  sections_raw <- strsplit(contents, "\n\r?\n")[[1]]
  sections_raw <- trimws(sections_raw)
  sections_raw <- sections_raw[nzchar(sections_raw)]

  sections <- lapply(
    sections_raw,
    read_report_section,
    info = info,
    report_path = report_path
  )

  # Each section comes back as a one-element named list; remove that extra
  # level. `list_flatten()` replaces the superseded `flatten()` and keeps the
  # inner names because the outer list is unnamed.
  purrr::list_flatten(sections)
}
-
# Parse one section of a raw report into a one-element named list.
#
# A section must start with a title line made of upper-case letters and
# spaces. The text after the title is treated as CSV: its first line is the
# header and the rest are data rows. Any parsing problems reported by readr
# are appended to `data-raw/reports/read-raw-reports-problems.csv`.
#
# @param section Character scalar with the section text (title + CSV body).
# @param info Values spliced into `mutate()` as leading columns
#   (presumably a named list describing the report file; verify against
#   `report_path_info()`).
# @param report_path Path of the source file; recorded alongside any parsing
#   problems.
# @return A list of length one, named with the snake_case section title,
#   holding the parsed data with every CSV column read as character.
read_report_section <- function(section, info, report_path) {
  title_pattern <- "^[A-Z ]+\n"

  if (!grepl(title_pattern, section)) {
    stop("Expected a title at the start of a section")
  }
  title <- snakecase::to_snake_case(sub("\n.+", "", section))

  # Remove the title; the first remaining line is the CSV header.
  body <- sub(title_pattern, "", section)
  header <- strsplit(body, "\n")[[1]][[1]]
  # Trailing commas should be on the previous line.
  body <- gsub("(\\w) ?\n,,,", "\\1,,,", body)
  # Remove the header so the remaining fix-up only touches data rows.
  body <- trimws(sub(header, "", body, fixed = TRUE))
  # Drop a line break that follows a space inside a run of non-comma
  # characters (looks like a field wrapped mid-value; confirm against the
  # raw reports).
  body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)

  csv <- paste0(header, "\n", body)
  data <- read_csv(
    I(csv),
    show_col_types = FALSE,
    col_types = cols(.default = "c")
  )

  # Log parsing problems to a shared CSV instead of stopping, so a batch run
  # can be reviewed afterwards.
  parse_problems <- problems(data)
  if (nrow(parse_problems) > 0) {
    parse_problems$file <- report_path
    parse_problems$section <- title
    path <- here::here("data-raw", "reports", "read-raw-reports-problems.csv")
    write_csv(parse_problems, path, append = fs::file_exists(path))
  }

  # Rename "SBoE ID" explicitly before the generic snake_case conversion.
  names(data)[names(data) == "SBoE ID"] <- "sboe_id"
  names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)

  data <- mutate(data, !!!info, .before = 1)
  structure(list(data), names = title)
}
-
# Read every raw report for one SBoE ID and combine them per section.
#
# All `*.txt` files under `data-raw/reports/<sboe_id>/all` are parsed with
# `read_report_file()`. The per-file section lists are then regrouped by
# section, row-bound into one data frame per section, and passed through
# `report_data_set_column_type()`.
#
# @param sboe_id Identifier used as the directory name under
#   `data-raw/reports`.
# @return A named list with one data frame per section title.
read_reports_by_sboe_id <- function(sboe_id) {
  raw_paths <- fs::dir_ls(
    here::here("data-raw", "reports", sboe_id, "all"),
    glob = "*.txt"
  )
  names(raw_paths) <- fs::path_rel(raw_paths, here::here("data-raw", "reports"))

  reports <- map(raw_paths, read_report_file)

  # `transpose()` takes its template from the first element only, so a section
  # that is missing from the first file would be dropped without warning. Use
  # the union of section names across all files instead; files lacking a
  # section contribute NULL, which `list_rbind()` ignores.
  section_names <- unique(unlist(map(reports, names), use.names = FALSE))

  reports |>
    transpose(.names = section_names) |>
    map(list_rbind) |>
    map(report_data_set_column_type)
}
-
# Convert date-like and number-like columns of a report table.
#
# Columns named in `numeric_candidates` (when present) are parsed with
# `parse_number()`. Every other column whose name contains "_date" or
# "date_" is parsed with `lubridate::mdy()`. The numeric candidates are
# excluded from the date step because "sum_to_date" also matches the date
# pattern.
#
# @param data A data frame for one report section.
# @return `data` with the selected columns converted in place.
report_data_set_column_type <- function(data) {
  numeric_candidates <- c(
    "period", "cycle", "amount", "sum_to_date", "begin_balance", "end_balance"
  )

  # Step 1: date columns, minus anything that should end up numeric.
  with_dates <- mutate(
    data,
    across(
      c(matches("_date|date_"), -any_of(numeric_candidates)),
      lubridate::mdy
    )
  )

  # Step 2: numeric columns.
  mutate(
    with_dates,
    across(any_of(numeric_candidates), parse_number)
  )
}
-
# Read all reports for one SBoE ID and write each section as a parquet file.
#
# Files are written to `data/<section>/sboe_id=<sboe_id>/part-0.parquet`,
# creating directories as needed.
#
# @param sboe_id Identifier passed to `read_reports_by_sboe_id()` and used
#   in the `sboe_id=` directory name.
# @return The path of the "cover" parquet file, or NULL when the reports
#   contain no "cover" section.
write_reports_by_sboe_id <- function(sboe_id) {
  reports <- read_reports_by_sboe_id(sboe_id)

  partition_dir <- sprintf("sboe_id=%s", sboe_id)
  data_root <- here::here("data")

  cover_path <- c()

  for (section_name in names(reports)) {
    out_path <- fs::path(
      data_root, section_name, partition_dir, "part-0",
      ext = "parquet"
    )
    dir_create(fs::path_dir(out_path))

    # Only the cover file's location is reported back to the caller.
    if (section_name == "cover") {
      cover_path <- out_path
    }

    arrow::write_parquet(reports[[section_name]], out_path)
  }

  cover_path
}
|