Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

148 lines
3.9KB

  # Read one report export and parse each of its sections.
  #
  # The file is read as a single string and split into sections at blank
  # lines. Every non-empty section is handed to `read_report_section()`
  # together with the path-derived `info`. The per-section results are then
  # flattened by one level, so the return value is a list of parsed tables
  # named by section title.
  read_report_file <- function(report_path) {
    info <- report_path_info(report_path)
    contents <- brio::read_file(report_path)

    # Sections are separated by an empty line ("\n\n" or "\n\r\n").
    chunks <- trimws(strsplit(contents, "\n\r?\n")[[1]])
    chunks <- chunks[nzchar(chunks)]

    parsed <- map(
      chunks,
      read_report_section,
      info = info,
      report_path = report_path
    )
    purrr::flatten(parsed)
  }
  # Snake-case titles of the report sections that `read_report_section()`
  # does not parse.
  skip_report_sections <- function() {
    # Receipts and expenditures are collected separately.
    collected_separately <- c("receipts", "expenditures")
    # The debts tables have __problems__.
    problematic_debts <- c(
      "debts_owed_to_the_committee",
      "debts_owed_by_the_committee"
    )
    c(collected_separately, problematic_debts)
  }
  # Parse a single section of a report export into a one-element named list.
  #
  # Arguments:
  #   section      One section of the file as a single string: a title line
  #                made of capital letters and spaces, then a CSV header line,
  #                then the data rows.
  #   info         Values spliced into `mutate()` so they become the leading
  #                columns of the result.
  #   report_path  Path of the file the section came from; only forwarded to
  #                `record_problems()`.
  #
  # Returns `NULL` when the title is listed by `skip_report_sections()`,
  # otherwise `list(<snake_case title> = <data>)` where every parsed column
  # was read as character.
  #
  # Stops with an error when the section does not start with a title line.
  read_report_section <- function(section, info, report_path) {
    title_pattern <- "^[A-Z ]+\n"
    if (!grepl(title_pattern, section)) {
      stop("Expected a title at the start of a section")
    }

    title <- snakecase::to_snake_case(sub("\n.+", "", section))
    if (title %in% skip_report_sections()) {
      return(NULL)
    }

    # Drop the title line; the first remaining line is the CSV header.
    body <- sub(title_pattern, "", section)
    header <- strsplit(body, "\n")[[1]][[1]]
    # Separate the header from the rows so the rows can be repaired on their
    # own before the two are glued back together for parsing.
    body <- trimws(sub(header, "", body, fixed = TRUE))
    body <- pre_process_table_body(title, header, body)

    csv <- paste0(header, "\n", body)
    data <- read_csv(
      I(csv),
      show_col_types = FALSE,
      col_types = cols(.default = "c")
    )
    record_problems(data, label = title, path = report_path)

    # Rename this column by hand before the generic snake_case conversion
    # (presumably the converter would split the mixed-case acronym).
    names(data)[names(data) == "SBoE ID"] <- "sboe_id"
    names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)

    # The post-processing step returns a new data frame; its result must be
    # kept, otherwise the table-specific row filter is silently discarded.
    data <- post_process_steps_for_table(data, title)
    data <- mutate(data, !!!info, .before = 1)
    structure(list(data), names = title)
  }
  # Re-join rows of the "accounts" table that were broken across lines.
  #
  # Arguments:
  #   table   Snake-case section title; any table other than "accounts" is
  #           returned untouched.
  #   header  CSV header line; its comma count is the expected count per row.
  #   body    Row text, one row per line, as a single string.
  #
  # Returns the body as a single newline-separated string. A row with fewer
  # separators than the header is merged (joined by one space) with the
  # line(s) that follow it until it has at least the expected number.
  pre_process_table_body <- function(table, header, body) {
    if (table != "accounts") {
      return(body)
    }

    count_commas <- function(x) {
      lengths(regmatches(x, gregexpr(",", x, fixed = TRUE)))
    }
    # Commas inside a complete double-quoted field are not separators, so
    # those fields are removed before counting.
    count_separators <- function(x) {
      count_commas(gsub('("[^"]+")', "", x))
    }

    exp_commas <- count_commas(header)
    body_lines <- strsplit(body, "\n")[[1]]
    separators <- count_separators(body_lines)
    if (all(separators == exp_commas)) {
      return(body)
    }

    i <- 1
    # The last line has no successor to merge with, hence `<` and not `<=`.
    while (i < length(body_lines)) {
      if (separators[i] >= exp_commas) {
        i <- i + 1
        next
      }
      body_lines[i] <- paste(body_lines[i], body_lines[i + 1], sep = " ")
      body_lines <- body_lines[-(i + 1)]
      separators <- separators[-(i + 1)]
      # Recount the merged line: without this the stale count would keep the
      # loop merging every following row into this one.
      separators[i] <- count_separators(body_lines[i])
    }
    paste(body_lines, collapse = "\n")
  }
  # Apply the table-specific clean-up step, if the table has one.
  #
  # For the "forgiven_loans" and "loan_proceeds" tables the rows whose
  # `amount` is missing are dropped; every other table is returned as given.
  post_process_steps_for_table <- function(data, table) {
    drop_rows_without_amount <- function() {
      filter(data, !is.na(amount))
    }

    switch(
      EXPR = table,
      # An empty arm falls through to the next non-empty one.
      forgiven_loans = ,
      loan_proceeds = drop_rows_without_amount(),
      data
    )
  }
  # Read every `*.txt` export in a directory and combine the parsed tables.
  #
  # The export files found in `dir_sboe_id` are matched against
  # `report_list`, each resulting path is parsed with `read_report_file()`,
  # the per-file results are combined by `list_transpose_bind()`, and the
  # column types of every combined table are then set by
  # `report_data_set_column_type()`.
  process_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) {
    export_files <- dir_ls(dir_sboe_id, glob = "*.txt")
    info <- report_info_in_report_list(export_files, report_list)

    per_file <- map(info$path, read_report_file)
    combined <- list_transpose_bind(per_file)
    map(combined, report_data_set_column_type)
  }
  # Convert character columns to dates or numbers based on their names.
  #
  # Columns whose name contains "_date" or "date_" are parsed with
  # `lubridate::mdy()`, except those in the numeric list below (which
  # includes "sum_to_date"). The listed columns that are present are parsed
  # with `parse_number()`.
  report_data_set_column_type <- function(data) {
    numeric_like <- c(
      "period", "cycle", "amount", "sum_to_date", "begin_balance", "end_balance"
    )

    with_dates <- mutate(
      data,
      across(
        c(matches("_date|date_"), -any_of(numeric_like)),
        lubridate::mdy
      )
    )
    mutate(with_dates, across(any_of(numeric_like), parse_number))
  }
  # Process a directory of report exports and write each table to parquet.
  #
  # Every table returned by `process_report_export()` is written to
  # `<here>/../data/<table>/sboe_id=<id>/part-0.parquet`, creating the
  # directory first. Returns the path written for the "cover" table, or
  # `NULL` when there is no such table.
  write_processed_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) {
    reports <- process_report_export(dir_sboe_id, report_list)

    partition <- sprintf("sboe_id=%s", report_path_info(dir_sboe_id)$sboe_id)
    data_root <- here::here("..", "data")

    cover_path <- NULL
    for (table in names(reports)) {
      out_file <- fs::path(data_root, table, partition, "part-0", ext = "parquet")
      dir_create(fs::path_dir(out_file))
      arrow::write_parquet(reports[[table]], out_file)
      if (table == "cover") {
        cover_path <- out_file
      }
    }
    cover_path
  }