Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

115 lines
3.1KB

  # Read one raw report file and parse every section it contains.
  #
  # The file is read as a single string, split into sections on blank lines,
  # and each non-empty section is passed to read_report_section().
  #
  # report_path: path to a raw report text file. It is also handed to
  #   report_path_info() to derive the identifying values added to each table.
  #
  # Returns a flat list with one element per section, named the way
  # read_report_section() names its result.
  read_report_file <- function(report_path) {
    info <- report_path_info(report_path)
    text <- brio::read_file(report_path)
    # Normalise Windows line endings: the section parser matches on "\n" only,
    # so a stray "\r" before a line break would make the title check fail.
    text <- gsub("\r\n", "\n", text, fixed = TRUE)
    sections_raw <- strsplit(text, "\n\r?\n")[[1]]
    sections_raw <- trimws(sections_raw)
    # Runs of blank lines produce empty pieces; drop them.
    sections_raw <- sections_raw[nzchar(sections_raw)]
    sections <- lapply(
      sections_raw,
      read_report_section,
      info = info,
      report_path = report_path
    )
    # Each section comes back as a one-element named list; remove that level.
    # list_flatten() replaces the deprecated purrr::flatten().
    purrr::list_flatten(sections)
  }
  # Parse a single report section into a one-element named list.
  #
  # A section is a block of text whose first line is an upper-case title and
  # whose remaining lines are CSV (a header line followed by data rows).
  #
  # section: the section text, title line included.
  # info: named list of values added as columns via mutate() (see
  #   report_path_info()).
  # report_path: path of the file the section came from; only used to label
  #   rows written to the parsing-problems log.
  #
  # Returns list(<snake_case title> = <tibble with all columns as character>).
  # Stops with an error when the section does not start with a title line.
  # Side effect: when readr reports parsing problems they are written (or
  # appended) to data-raw/reports/read-raw-reports-problems.csv.
  read_report_section <- function(section, info, report_path) {
    title_pattern <- "^[A-Z ]+\n"
    if (!grepl(title_pattern, section)) {
      stop("Expected a title at the start of a section")
    }

    # Everything before the first line break is the title.
    title <- snakecase::to_snake_case(sub("\n.+", "", section))

    # Remove the title, then remember the CSV header line.
    body <- sub(title_pattern, "", section)
    header <- strsplit(body, "\n")[[1]][[1]]

    # Trailing commas should be on the previous line.
    body <- gsub("(\\w) ?\n,,,", "\\1,,,", body)
    # Remove the header so the next repair only touches data rows.
    body <- trimws(sub(header, "", body, fixed = TRUE))
    # Re-join text that was wrapped onto the next line after a trailing space.
    body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)

    csv <- paste0(header, "\n", body)
    data <- read_csv(
      I(csv),
      show_col_types = FALSE,
      col_types = cols(.default = "c")
    )

    # Log parsing problems instead of dropping into the debugger.
    parse_problems <- problems(data)
    if (nrow(parse_problems) > 0) {
      parse_problems$file <- report_path
      parse_problems$section <- title
      path <- here::here("data-raw", "reports", "read-raw-reports-problems.csv")
      write_csv(parse_problems, path, append = fs::file_exists(path))
    }

    # "SBoE ID" is renamed explicitly before the generic snake-casing
    # (presumably because the mixed-case acronym would otherwise be split).
    if ("SBoE ID" %in% names(data)) {
      names(data)[which("SBoE ID" == names(data))] <- "sboe_id"
    }
    names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)

    data <- mutate(data, !!!info, .before = 1)
    structure(list(data), names = title)
  }
  # Read every raw report for one SBoE id and combine them per section.
  #
  # All "*.txt" files under data-raw/reports/<sboe_id> are parsed with
  # read_report_file(); tables that share a section name are row-bound across
  # files and then passed through report_data_set_column_type().
  #
  # sboe_id: name of the directory under data-raw/reports to read.
  #
  # Returns a named list with one combined table per section name.
  read_reports_by_sboe_id <- function(sboe_id) {
    raw_paths <- fs::dir_ls(
      here::here("data-raw", "reports", sboe_id),
      glob = "*.txt"
    )
    names(raw_paths) <- fs::path_rel(raw_paths, here::here("data-raw", "reports"))

    reports <- map(raw_paths, read_report_file)

    # Use the union of section names as the template. The deprecated
    # transpose() took its structure from the first file only, so a section
    # absent from that file was silently dropped from the result.
    section_names <- as.character(
      unique(unlist(map(reports, names), use.names = FALSE))
    )

    reports |>
      # simplify = FALSE keeps each entry a list of tables; files lacking a
      # section contribute NULL, which list_rbind() ignores.
      list_transpose(template = section_names, simplify = FALSE) |>
      map(list_rbind) |>
      map(report_data_set_column_type)
  }
  # Convert the text columns of a combined report table to typed columns.
  #
  # Columns whose name contains "_date" or "date_" are parsed with
  # lubridate::mdy(), except those listed as numeric below; the listed
  # numeric columns that exist are parsed with parse_number().
  #
  # data: a data frame of report rows.
  #
  # Returns `data` with the affected columns converted in place.
  report_data_set_column_type <- function(data) {
    numeric_cols <- c(
      "period",
      "cycle",
      "amount",
      "sum_to_date",
      "begin_balance",
      "end_balance"
    )

    # Date-like columns first; the numeric names are excluded because
    # "sum_to_date" would otherwise match the date pattern.
    with_dates <- mutate(
      data,
      across(
        c(matches("_date|date_"), -any_of(numeric_cols)),
        lubridate::mdy
      )
    )

    mutate(with_dates, across(any_of(numeric_cols), parse_number))
  }
  # Read all reports for one SBoE id and write each table as a parquet file.
  #
  # Every table returned by read_reports_by_sboe_id() is written to
  # data/<table>/sboe_id=<sboe_id>/part-0.parquet, creating the directory
  # when needed.
  #
  # sboe_id: id passed to read_reports_by_sboe_id() and used in the output
  #   path.
  #
  # Returns the path of the "cover" table's parquet file, or NULL when the
  # reports contain no "cover" table.
  write_reports_by_sboe_id <- function(sboe_id) {
    reports <- read_reports_by_sboe_id(sboe_id)
    base_dir <- here::here("data")
    sboe_id_param <- sprintf("sboe_id=%s", sboe_id)

    return_path <- c()
    for (table_name in names(reports)) {
      path <- fs::path(
        base_dir,
        table_name,
        sboe_id_param,
        "part-0",
        ext = "parquet"
      )
      # Namespace-qualified like every other fs call in this file, so it does
      # not depend on fs being attached.
      fs::dir_create(fs::path_dir(path))
      if (table_name == "cover") {
        return_path <- path
      }
      arrow::write_parquet(reports[[table_name]], path)
    }

    return_path
  }
  # Derive identifying values from a report file's location.
  #
  # The path is made absolute and then expressed relative to
  # data-raw/reports. The first component of that relative path is returned
  # as `sboe_id`; the part of the second component before its first
  # underscore is returned, converted with as.integer(), as `report_id`.
  #
  # report_path: path to a raw report file.
  #
  # Returns list(sboe_id = <character>, report_id = <integer>).
  report_path_info <- function(report_path) {
    reports_root <- here::here("data-raw", "reports")
    relative <- fs::path_rel(fs::path_abs(report_path), reports_root)
    parts <- fs::path_split(relative)[[1]]

    id_prefix <- strsplit(parts[2], "_")[[1]][1]

    list(
      sboe_id = parts[1],
      report_id = as.integer(id_prefix)
    )
  }