Просмотр исходного кода

prepping to read into parquet format

main
Garrick Aden-Buie 2 лет назад
Родитель
Сommit
df7d8d347b
Не найден GPG ключ соответствующий данной подписи
7 измененных файлов: 4475 добавлений и 60 удалений
  1. +1
    -0
      .gitignore
  2. +2
    -0
      DESCRIPTION
  3. +6
    -2
      R/get.R
  4. +115
    -0
      R/read_report_file.R
  5. +7
    -1
      _targets.R
  6. +4343
    -56
      _targets/meta/meta
  7. +1
    -1
      run.R

+ 1
- 0
.gitignore Просмотреть файл

@@ -5,3 +5,4 @@
.history
0-time-log.csv
data-raw/
data/

+ 2
- 0
DESCRIPTION Просмотреть файл

@@ -15,10 +15,12 @@ Depends:
fs,
glue,
httr2,
purrr,
readr,
rlang,
targets
Imports:
arrow,
cli,
crew,
desc,

+ 6
- 2
R/get.R Просмотреть файл

@@ -61,8 +61,12 @@ get_report_by_year_scrape <- function(year, report) {
res <- res[grepl("^\\s*var data = \\[", res)]
res <- sub("\\s*var data = ", "", res)

jsonlite::fromJSON(res) |>
as_tibble() |>
tbl <- jsonlite::fromJSON(res) |>
as_tibble()

if (nrow(tbl) == 0) return(NULL)

tbl |>
readr::type_convert(col_types = spec_report_by_year_scrape()) |>
select(
year = ReportYear,

+ 115
- 0
R/read_report_file.R Просмотреть файл

@@ -0,0 +1,115 @@
#' Read a raw report text file into a named list of section tables
#'
#' The file is read as a single string and split into sections on the
#' separator `"\n\r\n"`. Each non-empty section is parsed by
#' `read_report_section()` and the per-section results are flattened into one
#' list whose names are the snake_case section titles.
#'
#' @param report_path Path to a raw report file. It is passed to
#'   `report_path_info()` to derive the identifying columns added to every
#'   section table.
#' @return A list of tibbles, one element per section, named by section title.
read_report_file <- function(report_path) {
  info <- report_path_info(report_path)
  contents <- brio::read_file(report_path)

  sections_raw <- strsplit(contents, "\n\r\n")[[1]]
  sections_raw <- trimws(sections_raw)
  sections_raw <- sections_raw[nzchar(sections_raw)]

  sections <- lapply(
    sections_raw,
    read_report_section,
    info = info,
    report_path = report_path
  )

  purrr::flatten(sections)
}

#' Parse one section of a raw report into a tibble
#'
#' A section must start with a title line made only of capital letters and
#' spaces. The line after the title is used as the CSV header and the rest is
#' parsed as CSV with every column read as character. Any parsing problems
#' reported by readr are appended to
#' `data-raw/reports/read-raw-reports-problems.csv`.
#'
#' @param section A single string holding the title, header and rows of one
#'   section.
#' @param info A named list of values (as returned by `report_path_info()`)
#'   that are added as leading columns of the result.
#' @param report_path Path of the file the section came from. It is only used
#'   to label rows in the problems log; defaults to `NA` when unknown.
#' @return A length-one list holding the section tibble, named by the
#'   snake_case section title.
read_report_section <- function(section, info, report_path = NA_character_) {
  if (!grepl("^[A-Z ]+\n", section)) {
    stop("Expected a title at the start of a section")
  }
  title <- snakecase::to_snake_case(sub("\n.+", "", section))

  # Remove the title line; the first remaining line is the CSV header.
  body <- sub("^[A-Z ]+\n", "", section)
  header <- strsplit(body, "\n")[[1]][[1]]
  # Trailing commas should be on the previous line.
  body <- gsub("(\\w)\n,", "\\1,", body)
  # Remove the header; it is re-attached below once the rows are repaired.
  body <- trimws(sub(header, "", body, fixed = TRUE))
  # Drop a newline that directly follows a space, re-joining text that was
  # wrapped onto the next line (presumably a line break inside a field).
  body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)

  csv <- paste0(header, "\n", body)
  data <- read_csv(
    I(csv),
    show_col_types = FALSE,
    col_types = cols(.default = "c")
  )

  parse_problems <- problems(data)
  if (nrow(parse_problems) > 0) {
    # Label each problem with its origin so the shared log stays traceable.
    parse_problems$file <- as.character(report_path)
    parse_problems$section <- title
    write_csv(
      parse_problems,
      here::here("data-raw", "reports", "read-raw-reports-problems.csv"),
      append = TRUE
    )
  }

  # Rename "SBoE ID" explicitly before the generic snake_case conversion.
  if ("SBoE ID" %in% names(data)) {
    names(data)[which("SBoE ID" == names(data))] <- "sboe_id"
  }
  names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)
  data <- mutate(data, !!!info, .before = 1)
  structure(list(data), names = title)
}

#' Read every raw report of one committee and combine them by section
#'
#' All `*.txt` files in `data-raw/reports/<sboe_id>` are read with
#' `read_report_file()`. The per-file section lists are then regrouped by
#' section title, row-bound into one table per section and passed through
#' `report_data_set_column_type()`.
#'
#' @param sboe_id Name of the sub-directory of `data-raw/reports` to read.
#' @return A named list with one tibble per section title found in any of the
#'   files.
read_reports_by_sboe_id <- function(sboe_id) {
  raw_paths <- fs::dir_ls(
    here::here("data-raw", "reports", sboe_id),
    glob = "*.txt"
  )
  names(raw_paths) <- fs::path_rel(raw_paths, here::here("data-raw", "reports"))

  reports <- map(raw_paths, read_report_file)

  # transpose() takes its output names from the first element only, so a
  # section missing from the first file would be silently dropped. Passing the
  # union of all section names keeps every section; files that lack one
  # contribute NULL, which list_rbind() skips.
  section_names <- unique(unlist(map(reports, names), use.names = FALSE))

  reports |>
    transpose(.names = section_names) |>
    map(list_rbind) |>
    map(report_data_set_column_type)
}

#' Convert the character columns of a section table to dates and numbers
#'
#' Columns whose name contains `_date` or `date_` are parsed with
#' `lubridate::mdy()`, except those listed as numeric. The numeric columns that
#' are present are parsed with `parse_number()`.
#'
#' @param data A data frame of one report section.
#' @return `data` with the matching columns converted.
report_data_set_column_type <- function(data) {
  numeric_cols <- c(
    "period", "cycle", "amount", "sum_to_date", "begin_balance", "end_balance"
  )

  # Dates first; numeric columns are excluded so "sum_to_date" is not treated
  # as a date merely because of its name.
  with_dates <- mutate(
    data,
    across(
      c(matches("_date|date_"), -any_of(numeric_cols)),
      lubridate::mdy
    )
  )

  mutate(with_dates, across(any_of(numeric_cols), parse_number))
}

#' Write every section table of one committee to parquet files
#'
#' Each table returned by `read_reports_by_sboe_id()` is written to
#' `data/<table>/sboe_id=<sboe_id>/part-0.parquet`, creating the directory
#' first.
#'
#' @param sboe_id Identifier passed on to `read_reports_by_sboe_id()` and used
#'   in the `sboe_id=<sboe_id>` directory name.
#' @return The path of the parquet file written for the `cover` table, or
#'   `NULL` when there is no table named `cover`.
write_reports_by_sboe_id <- function(sboe_id) {
  tables <- read_reports_by_sboe_id(sboe_id)

  out_root <- here::here("data")
  partition <- sprintf("sboe_id=%s", sboe_id)

  cover_path <- NULL

  for (name in names(tables)) {
    parquet_file <- fs::path(
      out_root, name, partition, "part-0",
      ext = "parquet"
    )
    dir_create(fs::path_dir(parquet_file))

    # Only the cover table's path is reported back to the caller.
    if (name == "cover") {
      cover_path <- parquet_file
    }

    arrow::write_parquet(tables[[name]], parquet_file)
  }

  cover_path
}

#' Derive identifying fields from the location of a raw report file
#'
#' The path is made absolute and then expressed relative to
#' `data-raw/reports`. The first path component becomes `sboe_id`; the part of
#' the second component before the first underscore becomes `report_id`.
#'
#' @param report_path Path to a raw report file.
#' @return A list with `sboe_id` (character) and `report_id` (integer).
report_path_info <- function(report_path) {
  reports_root <- here::here("data-raw", "reports")
  relative <- fs::path_rel(fs::path_abs(report_path), reports_root)
  parts <- fs::path_split(relative)[[1]]

  # The file name starts with the report id, followed by an underscore.
  name_pieces <- unlist(strsplit(parts[2], "_"))

  list(
    sboe_id = parts[1],
    report_id = as.integer(name_pieces[1])
  )
}

+ 7
- 1
_targets.R Просмотреть файл

@@ -11,7 +11,7 @@ tar_option_set(
packages = strsplit(desc::desc_get_field("Depends"), ", ")[[1]],
# For distributed computing in tar_make(), supply a {crew} controller
# as discussed at https://books.ropensci.org/targets/crew.html.
controller = crew::crew_controller_local(workers = 2)
controller = crew::crew_controller_local(workers = 6)
)

# Run the R scripts in the R/ folder with your custom functions:
@@ -62,5 +62,11 @@ list(
report_list_received
),
format = "file_fast"
),
tar_target(
parquet_report_cover_path,
write_reports_by_sboe_id(report_list_sboe_id),
pattern = map(report_list_sboe_id),
format = "file_fast"
)
)

+ 4343
- 56
_targets/meta/meta
Разница между файлами не показана из-за своего большого размера
Просмотреть файл


+ 1
- 1
run.R Просмотреть файл

@@ -6,7 +6,7 @@
# to learn about your options.

Sys.setenv("IN_TARGETS" = "true")
Sys.setenv("ALLOW_DOWNLOADS" = "false")
Sys.setenv("ALLOW_DOWNLOADS" = "true")

targets::tar_make_future(workers = 8)
targets::tar_make()

Загрузка…
Отмена
Сохранить