瀏覽代碼

trying to parse the badly formatted csvs

main
Garrick Aden-Buie 2 年之前
父節點
當前提交
23d145fb5d
沒有發現已知的金鑰在資料庫的簽署中
共有 5 個檔案被更改,包括 4642 行新增和 15 行刪除
  1. +4
    -0
      DESCRIPTION
  2. +8
    -9
      R/read_report_file.R
  3. +4
    -2
      _targets.R
  4. +4608
    -3
      _targets/meta/meta
  5. +18
    -1
      run.R

+ 4
- 0
DESCRIPTION 查看文件

arrow, arrow,
cli, cli,
crew, crew,
DBI,
dbplyr,
desc, desc,
docopt,
duckdb,
here, here,
visNetwork visNetwork

+ 8
- 9
R/read_report_file.R 查看文件

info <- report_path_info(report_path) info <- report_path_info(report_path)
lines <- brio::read_file(report_path) lines <- brio::read_file(report_path)


sections_raw <- strsplit(lines, "\n\r\n")[[1]]
sections_raw <- strsplit(lines, "\n\r?\n")[[1]]
sections_raw <- trimws(sections_raw) sections_raw <- trimws(sections_raw)
sections_raw <- sections_raw[nzchar(sections_raw)] sections_raw <- sections_raw[nzchar(sections_raw)]


sections <- lapply(sections_raw, read_report_section, info = info)
sections <- lapply(sections_raw, read_report_section, info = info, report_path = report_path)


purrr::flatten(sections) purrr::flatten(sections)
} }


read_report_section <- function(section, info) {
read_report_section <- function(section, info, report_path) {
if (!grepl("^[A-Z ]+\n", section)) { if (!grepl("^[A-Z ]+\n", section)) {
browser()
# browser()
stop("Expected a title at the start of a section") stop("Expected a title at the start of a section")
} }
title <- snakecase::to_snake_case(sub("\n.+", "", section)) title <- snakecase::to_snake_case(sub("\n.+", "", section))
body <- sub("^[A-Z ]+\n", "", section) body <- sub("^[A-Z ]+\n", "", section)
header <- strsplit(body, "\n")[[1]][[1]] header <- strsplit(body, "\n")[[1]][[1]]
# trailing commas should be on the previous line # trailing commas should be on the previous line
body <- gsub("(\\w)\n,", "\\1,", body)
body <- gsub("(\\w) ?\n,,,", "\\1,,,", body)
# remove header # remove header
body <- trimws(sub(header, "", body, fixed = TRUE)) body <- trimws(sub(header, "", body, fixed = TRUE))
body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body) body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)
# browser(expr = title == "cover" && ncol(data) != 16) # browser(expr = title == "cover" && ncol(data) != 16)


if (nrow(problems(data))) { if (nrow(problems(data))) {
browser()
problems <- problems(data) problems <- problems(data)
problems$file <- report_path problems$file <- report_path
problems$section <- title problems$section <- title
path <- here::here("data-raw", "reports", "read-raw-reports-problems.csv")
problems |> problems |>
write_csv(
here::here("data-raw", "reports", "read-raw-reports-problems.csv"),
append = TRUE
)
write_csv(path, append = fs::file_exists(path))
} }


if ("SBoE ID" %in% names(data)) { if ("SBoE ID" %in% names(data)) {

+ 4
- 2
_targets.R 查看文件

packages = strsplit(desc::desc_get_field("Depends"), ", ")[[1]], packages = strsplit(desc::desc_get_field("Depends"), ", ")[[1]],
# For distributed computing in tar_make(), supply a {crew} controller # For distributed computing in tar_make(), supply a {crew} controller
# as discussed at https://books.ropensci.org/targets/crew.html. # as discussed at https://books.ropensci.org/targets/crew.html.
controller = crew::crew_controller_local(workers = 6)
controller = crew::crew_controller_local(workers = 12),
error = "null"
# debug = "parquet_report_cover_path_e8fc956a"
) )


# Run the R scripts in the R/ folder with your custom functions: # Run the R scripts in the R/ folder with your custom functions:
tar_target( tar_target(
parquet_report_cover_path, parquet_report_cover_path,
write_reports_by_sboe_id(report_list_sboe_id), write_reports_by_sboe_id(report_list_sboe_id),
pattern = map(report_list_sboe_id),
pattern = map(unique(report_list_sboe_id)),
format = "file_fast" format = "file_fast"
) )
) )

+ 4608
- 3
_targets/meta/meta
文件差異過大導致無法顯示
查看文件


+ 18
- 1
run.R 查看文件

# See https://books.ropensci.org/targets/hpc.html # See https://books.ropensci.org/targets/hpc.html
# to learn about your options. # to learn about your options.


# Command-line usage text; this string is handed to docopt() to parse the
# script's arguments.
doc <- "usage:
run.R all
run.R target <targets>...
run.R -h | --help

options:
-h --help Show this screen"

library(docopt)
opts <- docopt(doc)

Sys.setenv("IN_TARGETS" = "true") Sys.setenv("IN_TARGETS" = "true")
Sys.setenv("ALLOW_DOWNLOADS" = "true") Sys.setenv("ALLOW_DOWNLOADS" = "true")


targets::tar_make_future(workers = 8) targets::tar_make_future(workers = 8)
targets::tar_make()
# Dispatch on the parsed command line: `run.R all` builds every target,
# otherwise only the targets named on the command line are built.
if (opts$all) {
  # Fixed: the cli function is cli_alert_info(), not cli_alert_into().
  cli::cli_alert_info("Running all targets.")
  targets::tar_make()
} else {
  # Fixed: interpolate the parsed target names (`opts$targets`); a bare
  # `targets` variable is not defined in this script.
  cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}")
  # any_of() is the tidyselect helper re-exported by {targets}; it selects
  # the requested names that exist in the pipeline.
  targets::tar_make(targets::any_of(opts$targets))
}
# targets::tar_make_clustermq(workers = 2) # nolint # targets::tar_make_clustermq(workers = 2) # nolint

Loading…
取消
儲存