|
|
|
@@ -0,0 +1,115 @@ |
|
|
|
#' Read one raw report file into a named list of section tables
#'
#' The file is read as a single string, split into sections on the
#' "\n\r\n" separator, and every non-empty section is parsed with
#' `read_report_section()`.
#'
#' @param report_path Path to a raw report text file. It is also handed to
#'   `report_path_info()` to obtain the identifying values that are prepended
#'   to every section table.
#' @return A list of data frames, one per section, named by the snake_case
#'   section title.
read_report_file <- function(report_path) {
  info <- report_path_info(report_path)
  contents <- brio::read_file(report_path)

  # Split into sections, then drop surrounding whitespace and empty fragments.
  sections_raw <- strsplit(contents, "\n\r\n")[[1]]
  sections_raw <- trimws(sections_raw)
  sections_raw <- sections_raw[nzchar(sections_raw)]

  sections <- lapply(
    sections_raw,
    read_report_section,
    info = info,
    report_path = report_path
  )

  # Each parsed section is a one-element named list; remove that extra level.
  # list_flatten() is the replacement for purrr::flatten(), which is
  # deprecated as of purrr 1.0.0.
  purrr::list_flatten(sections)
}

#' Parse a single report section into a data frame
#'
#' A section is an upper-case title line followed by CSV-like text whose first
#' line is the column header. Line breaks that split a record are repaired
#' before the text is parsed with every column read as character.
#'
#' Side effect: when the CSV parser reports problems, they are appended to
#' `data-raw/reports/read-raw-reports-problems.csv` together with the source
#' file and the section title.
#'
#' @param section Character scalar holding the section text, title included.
#' @param info Named list of values added as leading columns of the result
#'   (as produced by `report_path_info()`).
#' @param report_path Path of the file the section came from. It is only used
#'   to fill the `file` column of the problems log. Defaults to `NA` so that
#'   existing two-argument calls keep working.
#' @return A one-element list containing the parsed data frame, named with the
#'   snake_case section title.
read_report_section <- function(section, info, report_path = NA_character_) {
  title_pattern <- "^[A-Z ]+\n"

  if (!grepl(title_pattern, section)) {
    stop("Expected a title at the start of a section")
  }

  # Everything before the first newline is the title.
  title <- snakecase::to_snake_case(sub("\n.+", "", section))

  # Remove the title; the first remaining line is the column header.
  body <- sub(title_pattern, "", section)
  header <- strsplit(body, "\n")[[1]][[1]]

  # A line that starts with a comma continues the previous line.
  body <- gsub("(\\w)\n,", "\\1,", body)

  # Remove the header so the remaining repair only touches the records.
  body <- trimws(sub(header, "", body, fixed = TRUE))

  # Remove a newline that directly follows a space inside a run of non-comma
  # text, i.e. a value that was wrapped in the middle of a field.
  body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)

  csv <- paste0(header, "\n", body)
  data <- read_csv(
    I(csv),
    show_col_types = FALSE,
    col_types = cols(.default = "c")
  )

  # Log parsing problems instead of failing, tagged with their origin.
  parse_problems <- problems(data)
  if (nrow(parse_problems) > 0) {
    parse_problems$file <- report_path
    parse_problems$section <- title
    write_csv(
      parse_problems,
      here::here("data-raw", "reports", "read-raw-reports-problems.csv"),
      append = TRUE
    )
  }

  # "SBoE ID" is renamed by hand before the generic conversion below
  # (presumably because that conversion would not produce "sboe_id").
  names(data)[names(data) == "SBoE ID"] <- "sboe_id"
  names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)

  # Prepend the identifying values from `info` as the first columns.
  data <- mutate(data, !!!info, .before = 1)

  structure(list(data), names = title)
}
|
|
|
|
|
|
|
#' Read every raw report of one committee and combine them per section
#'
#' All `*.txt` files in `data-raw/reports/<sboe_id>` are read with
#' `read_report_file()`. Tables that share a section name are row-bound across
#' files and then passed through `report_data_set_column_type()`.
#'
#' @param sboe_id Name of the directory below `data-raw/reports` to read.
#' @return A named list with one combined data frame per section name.
read_reports_by_sboe_id <- function(sboe_id) {
  raw_paths <- fs::dir_ls(
    here::here("data-raw", "reports", sboe_id),
    glob = "*.txt"
  )
  names(raw_paths) <- fs::path_rel(raw_paths, here::here("data-raw", "reports"))

  reports <- map(raw_paths, read_report_file)

  # transpose() takes its output structure from the first element only, so a
  # section missing from the first file would be dropped without notice.
  # Passing the union of all section names keeps every section; files lacking
  # a section contribute NULL, which list_rbind() accepts.
  section_names <- unique(unlist(lapply(reports, names), use.names = FALSE))

  # Returned as the value of the pipeline (not of an assignment) so the result
  # is visible when the function is called interactively.
  reports |>
    transpose(.names = section_names) |>
    map(list_rbind) |>
    map(report_data_set_column_type)
}
|
|
|
|
|
|
|
#' Convert date and number columns of a report table from text
#'
#' Columns whose name contains "_date" or "date_" are parsed with
#' `lubridate::mdy()`, except those listed in `maybe_numeric`. Any of the
#' `maybe_numeric` columns that are present are parsed with `parse_number()`.
#'
#' @param data A data frame of report data.
#' @return `data` with the matching columns converted.
report_data_set_column_type <- function(data) {
  maybe_numeric <- c(
    "period", "cycle", "amount", "sum_to_date", "begin_balance", "end_balance"
  )

  # Dates first; "sum_to_date" matches the date pattern but is a number, so
  # the numeric names are excluded from this selection.
  with_dates <- mutate(
    data,
    across(
      c(matches("_date|date_"), -any_of(maybe_numeric)),
      lubridate::mdy
    )
  )

  # Then the numeric columns, which are disjoint from the date selection.
  mutate(
    with_dates,
    across(any_of(maybe_numeric), parse_number)
  )
}
|
|
|
|
|
|
|
#' Write the combined report tables of one committee as parquet files
#'
#' Every table returned by `read_reports_by_sboe_id()` is written to
#' `data/<table>/sboe_id=<sboe_id>/part-0.parquet`; the directory is created
#' first.
#'
#' @param sboe_id Identifier passed on to `read_reports_by_sboe_id()` and used
#'   in the `sboe_id=<...>` directory name.
#' @return The path of the file written for the "cover" table, or `NULL` when
#'   there is no table with that name.
write_reports_by_sboe_id <- function(sboe_id) {
  reports <- read_reports_by_sboe_id(sboe_id)

  base_dir <- here::here("data")
  sboe_id_param <- sprintf("sboe_id=%s", sboe_id)

  cover_path <- NULL

  for (table_name in names(reports)) {
    path <- fs::path(
      base_dir, table_name, sboe_id_param, "part-0",
      ext = "parquet"
    )
    # Namespaced like every other fs call in this file, so the function does
    # not depend on fs being attached.
    fs::dir_create(fs::path_dir(path))

    # Only the cover table's path is reported back to the caller.
    if (table_name == "cover") {
      cover_path <- path
    }

    arrow::write_parquet(reports[[table_name]], path)
  }

  cover_path
}
|
|
|
|
|
|
|
#' Derive identifying values from the location of a raw report file
#'
#' The path is taken relative to `data-raw/reports`. Its first component
#' becomes `sboe_id`; the part of the second component before the first
#' underscore, converted to integer, becomes `report_id`.
#'
#' @param report_path Path to a raw report file.
#' @return A list with elements `sboe_id` and `report_id`.
report_path_info <- function(report_path) {
  reports_root <- here::here("data-raw", "reports")
  relative <- fs::path_rel(fs::path_abs(report_path), reports_root)
  parts <- fs::path_split(relative)[[1]]

  # Text of the second path component up to its first underscore.
  file_prefix <- strsplit(parts[2], "_", fixed = TRUE)[[1]][1]

  list(
    sboe_id = parts[1],
    report_id = as.integer(file_prefix)
  )
}