Просмотр исходного кода

trying to parse the badly formatted csvs

main
Garrick Aden-Buie 2 года назад
Родитель
Сommit
23d145fb5d
Не найден GPG-ключ, соответствующий данной подписи
5 измененных файлов: 4642 добавления и 15 удалений
  1. +4
    -0
      DESCRIPTION
  2. +8
    -9
      R/read_report_file.R
  3. +4
    -2
      _targets.R
  4. +4608
    -3
      _targets/meta/meta
  5. +18
    -1
      run.R

+ 4
- 0
DESCRIPTION Просмотреть файл

@@ -23,6 +23,10 @@ Imports:
arrow,
cli,
crew,
DBI,
dbplyr,
desc,
docopt,
duckdb,
here,
visNetwork

+ 8
- 9
R/read_report_file.R Просмотреть файл

@@ -2,18 +2,18 @@ read_report_file <- function(report_path) {
info <- report_path_info(report_path)
lines <- brio::read_file(report_path)

sections_raw <- strsplit(lines, "\n\r\n")[[1]]
sections_raw <- strsplit(lines, "\n\r?\n")[[1]]
sections_raw <- trimws(sections_raw)
sections_raw <- sections_raw[nzchar(sections_raw)]

sections <- lapply(sections_raw, read_report_section, info = info)
sections <- lapply(sections_raw, read_report_section, info = info, report_path = report_path)

purrr::flatten(sections)
}

read_report_section <- function(section, info) {
read_report_section <- function(section, info, report_path) {
if (!grepl("^[A-Z ]+\n", section)) {
browser()
# browser()
stop("Expected a title at the start of a section")
}
title <- snakecase::to_snake_case(sub("\n.+", "", section))
@@ -22,7 +22,7 @@ read_report_section <- function(section, info) {
body <- sub("^[A-Z ]+\n", "", section)
header <- strsplit(body, "\n")[[1]][[1]]
# trailing commas should be on the previous line
body <- gsub("(\\w)\n,", "\\1,", body)
body <- gsub("(\\w) ?\n,,,", "\\1,,,", body)
# remove header
body <- trimws(sub(header, "", body, fixed = TRUE))
body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)
@@ -33,14 +33,13 @@ read_report_section <- function(section, info) {
# browser(expr = title == "cover" && ncol(data) != 16)

if (nrow(problems(data))) {
browser()
problems <- problems(data)
problems$file <- report_path
problems$section <- title
path <- here::here("data-raw", "reports", "read-raw-reports-problems.csv")
problems |>
write_csv(
here::here("data-raw", "reports", "read-raw-reports-problems.csv"),
append = TRUE
)
write_csv(path, append = fs::file_exists(path))
}

if ("SBoE ID" %in% names(data)) {

+ 4
- 2
_targets.R Просмотреть файл

@@ -11,7 +11,9 @@ tar_option_set(
packages = strsplit(desc::desc_get_field("Depends"), ", ")[[1]],
# For distributed computing in tar_make(), supply a {crew} controller
# as discussed at https://books.ropensci.org/targets/crew.html.
controller = crew::crew_controller_local(workers = 6)
controller = crew::crew_controller_local(workers = 12),
error = "null"
# debug = "parquet_report_cover_path_e8fc956a"
)

# Run the R scripts in the R/ folder with your custom functions:
@@ -66,7 +68,7 @@ list(
tar_target(
parquet_report_cover_path,
write_reports_by_sboe_id(report_list_sboe_id),
pattern = map(report_list_sboe_id),
pattern = map(unique(report_list_sboe_id)),
format = "file_fast"
)
)

+ 4608
- 3
_targets/meta/meta
Разница между файлами не показана из-за своего большого размера
Просмотреть файл


+ 18
- 1
run.R Просмотреть файл

@@ -5,9 +5,26 @@
# See https://books.ropensci.org/targets/hpc.html
# to learn about your options.

'usage:
run.R all
run.R target <targets>...
run.R -h | --help

options:
-h --help Show this screen' -> doc

library(docopt)
opts <- docopt(doc)

Sys.setenv("IN_TARGETS" = "true")
Sys.setenv("ALLOW_DOWNLOADS" = "true")

targets::tar_make_future(workers = 8)
targets::tar_make()
# Dispatch on the parsed docopt command: `run.R all` builds the whole
# pipeline, `run.R target <targets>...` builds only the named targets.
if (opts$all) {
  # Fixed: the function is `cli_alert_info()`; `cli_alert_into()` is not
  # a cli function, so the original call would fail before anything ran.
  cli::cli_alert_info("Running all targets.")
  targets::tar_make()
} else {
  # Fixed: the original template interpolated `{targets}`, which is not
  # defined in this script. The requested names are in `opts$targets`;
  # copy them to a local so the cli template can reference them.
  requested_targets <- opts$targets
  # cli collapses a character vector inside `{}` into a comma-separated
  # list on its own, so no extra wrapper around `.field` is needed.
  cli::cli_alert_info("Running targets: {.field {requested_targets}}")
  targets::tar_make(names = targets::any_of(requested_targets))
}
# targets::tar_make_clustermq(workers = 2) # nolint

Загрузка…
Отмена
Сохранить