Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

116 lines
3.0KB

  1. read_report_file <- function(report_path) {
  2. info <- report_path_info(report_path)
  3. lines <- brio::read_file(report_path)
  4. sections_raw <- strsplit(lines, "\n\r\n")[[1]]
  5. sections_raw <- trimws(sections_raw)
  6. sections_raw <- sections_raw[nzchar(sections_raw)]
  7. sections <- lapply(sections_raw, read_report_section, info = info)
  8. purrr::flatten(sections)
  9. }
  10. read_report_section <- function(section, info) {
  11. if (!grepl("^[A-Z ]+\n", section)) {
  12. browser()
  13. stop("Expected a title at the start of a section")
  14. }
  15. title <- snakecase::to_snake_case(sub("\n.+", "", section))
  16. #remove title
  17. body <- sub("^[A-Z ]+\n", "", section)
  18. header <- strsplit(body, "\n")[[1]][[1]]
  19. # trailing commas should be on the previous line
  20. body <- gsub("(\\w)\n,", "\\1,", body)
  21. # remove header
  22. body <- trimws(sub(header, "", body, fixed = TRUE))
  23. body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)
  24. csv <- paste0(header, "\n", body)
  25. data <- read_csv(I(csv), show_col_types = FALSE, col_types = cols(.default = "c"))
  26. # browser(expr = title == "cover" && ncol(data) != 16)
  27. if (nrow(problems(data))) {
  28. problems <- problems(data)
  29. problems$file <- report_path
  30. problems$section <- title
  31. problems |>
  32. write_csv(
  33. here::here("data-raw", "reports", "read-raw-reports-problems.csv"),
  34. append = TRUE
  35. )
  36. }
  37. if ("SBoE ID" %in% names(data)) {
  38. names(data)[which("SBoE ID" == names(data))] <- "sboe_id"
  39. }
  40. names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)
  41. data <- mutate(data, !!!info, .before = 1)
  42. structure(list(data), names = title)
  43. }
  44. read_reports_by_sboe_id <- function(sboe_id) {
  45. raw_paths <- fs::dir_ls(
  46. here::here("data-raw", "reports", sboe_id),
  47. glob = "*.txt"
  48. )
  49. names(raw_paths) <- fs::path_rel(raw_paths, here::here("data-raw", "reports"))
  50. data <-
  51. raw_paths |>
  52. map(read_report_file) |>
  53. transpose() |>
  54. map(list_rbind) |>
  55. map(report_data_set_column_type)
  56. }
  57. report_data_set_column_type <- function(data) {
  58. maybe_numeric <- c("period", "cycle", "amount", "sum_to_date", "begin_balance", "end_balance")
  59. data |>
  60. mutate(
  61. across(
  62. c(matches("_date|date_"), -any_of(maybe_numeric)),
  63. lubridate::mdy
  64. ),
  65. across(any_of(maybe_numeric), parse_number)
  66. )
  67. }
  68. write_reports_by_sboe_id <- function(sboe_id) {
  69. reports <- read_reports_by_sboe_id(sboe_id)
  70. base_dir <- here::here("data")
  71. sboe_id_param <- sprintf("sboe_id=%s", sboe_id)
  72. return_path <- c()
  73. for (table in names(reports)) {
  74. path <- fs::path(base_dir, table, sboe_id_param, "part-0", ext = "parquet")
  75. dir_create(fs::path_dir(path))
  76. if (table == "cover") {
  77. return_path <- path
  78. }
  79. arrow::write_parquet(reports[[table]], path)
  80. }
  81. return_path
  82. }
  83. report_path_info <- function(report_path) {
  84. report_path <- fs::path_abs(report_path)
  85. x <- fs::path_rel(report_path, here::here("data-raw", "reports"))
  86. x <- fs::path_split(x)[[1]]
  87. id <- strsplit(x[2], "_")[[1]][1]
  88. list(
  89. sboe_id = x[1],
  90. report_id = as.integer(id)
  91. )
  92. }