You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

101 lines
2.8KB

  # Read one raw report file and parse every section it contains.
  #
  # The file text is split into sections wherever a blank line occurs; empty
  # chunks are discarded and each remaining chunk is handed to
  # read_report_section() together with the result of
  # report_path_info(report_path).
  #
  # report_path: path to the raw report text file.
  #
  # Returns a flat list with one element per section. Each element keeps the
  # name given to it by read_report_section() (the snake_case section title).
  read_report_file <- function(report_path) {
    info <- report_path_info(report_path)
    text <- brio::read_file(report_path)

    # A blank line (LF LF, or LF CR LF) separates sections.
    chunks <- trimws(strsplit(text, "\n\r?\n")[[1]])
    chunks <- chunks[nzchar(chunks)]

    sections <- lapply(
      chunks,
      read_report_section,
      info = info,
      report_path = report_path
    )

    # Each section is a one-element named list; remove that extra level.
    # list_flatten() replaces the superseded purrr::flatten() and keeps the
    # inner names because the outer list is unnamed.
    purrr::list_flatten(sections)
  }
  # Parse a single report section into a one-element named list holding a
  # data frame.
  #
  # A section must start with a title line made only of upper-case letters and
  # spaces. The line after the title is treated as the CSV header and the rest
  # as CSV rows. Wrapped rows are repaired before parsing, and every column is
  # read as character.
  #
  # section:     text of one section (title, header line, data lines).
  # info:        values spliced in as leading columns of the result
  #              (presumably a named list from report_path_info() - confirm).
  # report_path: path of the file the section came from; only used to label
  #              parsing problems.
  #
  # Returns list(<data>) named with the snake_case section title.
  # Stops with an error when the section does not start with a title line.
  # Side effect: when readr reports parsing problems they are appended to
  # data-raw/reports/read-raw-reports-problems.csv.
  read_report_section <- function(section, info, report_path) {
    title_pattern <- "^[A-Z ]+\n"
    if (!grepl(title_pattern, section)) {
      stop("Expected a title at the start of a section")
    }

    # Keep only the first line, then snake_case it to get the table name.
    title <- snakecase::to_snake_case(sub("\n.+", "", section))

    # Drop the title line; the first remaining line is the CSV header.
    body <- sub(title_pattern, "", section)
    header <- strsplit(body, "\n")[[1]][[1]]

    # Trailing commas that wrapped onto their own line belong to the previous
    # line.
    body <- gsub("(\\w) ?\n,,,", "\\1,,,", body)
    # Remove the header so the line-joining below only touches data rows.
    body <- trimws(sub(header, "", body, fixed = TRUE))
    # Re-join a field that was broken across lines right after a space.
    body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)

    csv <- paste0(header, "\n", body)
    data <- read_csv(
      I(csv),
      show_col_types = FALSE,
      col_types = cols(.default = "c")
    )

    # Log parsing problems instead of stopping. The leftover browser() debug
    # call was removed so unattended runs are not interrupted.
    parse_problems <- problems(data)
    if (nrow(parse_problems) > 0) {
      parse_problems$file <- report_path
      parse_problems$section <- title
      path <- here::here("data-raw", "reports", "read-raw-reports-problems.csv")
      write_csv(parse_problems, path, append = fs::file_exists(path))
    }

    # Rename "SBoE ID" by hand before the generic snake_case conversion
    # (presumably to_snake_case() would split "SBoE" awkwardly - confirm).
    names(data)[names(data) == "SBoE ID"] <- "sboe_id"
    names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)

    # Put the report-level info in front of the parsed columns.
    data <- mutate(data, !!!info, .before = 1)
    structure(list(data), names = title)
  }
  # Read every raw report for one SBoE id and combine them per section.
  #
  # All "*.txt" files under data-raw/reports/<sboe_id>/all are parsed with
  # read_report_file(). The per-file results are regrouped by section name,
  # row-bound into one data frame per section and passed through
  # report_data_set_column_type().
  #
  # sboe_id: identifier used as the directory name under data-raw/reports.
  #
  # Returns a named list of data frames, one per section name found in any of
  # the files.
  read_reports_by_sboe_id <- function(sboe_id) {
    raw_paths <- fs::dir_ls(
      here::here("data-raw", "reports", sboe_id, "all"),
      glob = "*.txt"
    )
    names(raw_paths) <- fs::path_rel(
      raw_paths,
      here::here("data-raw", "reports")
    )

    reports <- map(raw_paths, read_report_file)

    # transpose() would otherwise take its template from the first file only
    # and silently drop sections that file lacks. Give it the union of all
    # section names; files missing a section contribute NULL, which
    # list_rbind() accepts.
    section_names <- unique(unlist(lapply(reports, names), use.names = FALSE))

    reports |>
      transpose(.names = section_names) |>
      map(list_rbind) |>
      map(report_data_set_column_type)
  }
  # Convert selected columns of a report table to dates and numbers.
  #
  # Columns whose name contains "_date" or "date_" are parsed with
  # lubridate::mdy(), except those listed as numeric below ("sum_to_date"
  # matches the date pattern but holds a number). The listed numeric columns
  # that are present are parsed with parse_number().
  #
  # data: a data frame of report columns.
  #
  # Returns `data` with the selected columns converted.
  report_data_set_column_type <- function(data) {
    numeric_cols <- c(
      "period",
      "cycle",
      "amount",
      "sum_to_date",
      "begin_balance",
      "end_balance"
    )

    # Dates first; the two column sets are disjoint.
    with_dates <- mutate(
      data,
      across(c(matches("_date|date_"), -any_of(numeric_cols)), lubridate::mdy)
    )

    mutate(with_dates, across(any_of(numeric_cols), parse_number))
  }
  # Read all reports for one SBoE id and write each section as a parquet file.
  #
  # Every table returned by read_reports_by_sboe_id() is written to
  # data/<table>/sboe_id=<sboe_id>/part-0.parquet, creating the directory
  # first.
  #
  # sboe_id: identifier passed to read_reports_by_sboe_id() and used in the
  #          "sboe_id=<id>" directory name.
  #
  # Returns the path of the "cover" table's parquet file, or NULL when there
  # is no table named "cover".
  write_reports_by_sboe_id <- function(sboe_id) {
    reports <- read_reports_by_sboe_id(sboe_id)
    base_dir <- here::here("data")
    partition <- sprintf("sboe_id=%s", sboe_id)

    cover_path <- NULL
    for (table_name in names(reports)) {
      out_path <- fs::path(
        base_dir,
        table_name,
        partition,
        "part-0",
        ext = "parquet"
      )
      # Namespace-qualified like every other fs call in this file, so it does
      # not depend on fs being attached.
      fs::dir_create(fs::path_dir(out_path))
      if (table_name == "cover") {
        cover_path <- out_path
      }
      arrow::write_parquet(reports[[table_name]], out_path)
    }

    cover_path
  }