Преглед на файлове

easier method to load prepped data from parquet to duckdb tables

main
Garrick Aden-Buie преди 2 години
родител
ревизия
5478feb4c0
No known key found for this signature in database
променени са 3 файла, в които са добавени 52 реда и е изтрит 1 ред
  1. +11
    -1
      process/R/db_connect.R
  2. +27
    -0
      process/R/prep_open_dataset.R
  3. +14
    -0
      process/R/prepare_candidates.R

+ 11
- 1
process/R/db_connect.R Целия файл

@@ -1,3 +1,13 @@
# Private environment holding state that must survive between calls
# (here: the shared DuckDB connection).
.globals <- new.env(parent = emptyenv())

#' Get the shared DuckDB connection
#'
#' Opens a DuckDB connection the first time it is requested and caches it in
#' `.globals$duckdb_con`, so that every caller talks to the same database.
#' If the cached connection is no longer valid (for example because a caller
#' disconnected it), a new connection is opened and cached in its place.
#'
#' @return The DBI connection created by `DBI::dbConnect(duckdb::duckdb())`.
duckdb_global_con <- function() {
  cached <- .globals$duckdb_con

  # Returning a closed handle would make every later query fail, so a stale
  # connection is replaced rather than reused.
  if (is.null(cached) || !DBI::dbIsValid(cached)) {
    .globals$duckdb_con <- DBI::dbConnect(duckdb::duckdb())
  }

  .globals$duckdb_con
}

cf_prep_db_create <- function(data_dir = here::here("data-prep")) {
tables <- dir_ls(data_dir)
names(tables) <- path_file(tables)
@@ -17,7 +27,7 @@ cf_prep_db_create <- function(data_dir = here::here("data-prep")) {
tbls_arrow$report_list <- arrow::open_dataset(tables["report_list"])
cli::cli_progress_done()

con <- DBI::dbConnect(duckdb::duckdb())
con <- duckdb_global_con()

cli::cli_progress_step("Creating db")


+ 27
- 0
process/R/prep_open_dataset.R Целия файл

@@ -0,0 +1,27 @@
#' Open a prepped dataset with arrow
#'
#' Resolves `path_prep` to an existing location and opens it with
#' `arrow::open_dataset()`.
#'
#' @param path_prep Path to the dataset. When it does not exist as given, it
#'   is looked up under `here::here("data-prep/")` and then under
#'   `../data-prep`; the first candidate that exists is used.
#' @param partitioning Partitioning passed on to `arrow::open_dataset()`. It
#'   is reset to `NULL` when `path_prep` is not a directory or contains no
#'   subdirectories.
#' @param ... Further arguments forwarded to `arrow::open_dataset()`.
#' @return The value returned by `arrow::open_dataset()`.
prep_open_dataset <- function(path_prep, partitioning = "sboe_id", ...) {
  if (!fs::file_exists(path_prep)) {
    path_here <- here::here("data-prep/", path_prep)
    path_up <- fs::path("..", "data-prep", path_prep)
    if (fs::file_exists(path_here)) {
      path_prep <- path_here
    } else if (fs::file_exists(path_up)) {
      path_prep <- path_up
    } else {
      stop("File not found: ", path_prep)
    }
  }

  # Listing subdirectories only makes sense for a directory; a path that
  # resolves to a single file has no partition folders, so skip the listing
  # (which would fail on a non-directory) and drop the partitioning.
  has_partition_dirs <- dir.exists(path_prep) &&
    length(fs::dir_ls(path_prep, type = "dir")) > 0
  if (!has_partition_dirs) {
    partitioning <- NULL
  }

  arrow::open_dataset(path_prep, partitioning = partitioning, ...)
}

#' Open a prepped dataset as a table on the shared DuckDB connection
#'
#' Opens the dataset with `prep_open_dataset()`, registers it on the
#' connection returned by `duckdb_global_con()` under the name `table`, and
#' returns a `dplyr::tbl()` reference to that name.
#'
#' @param table Name under which the dataset is registered and queried.
#' @param ... Arguments forwarded to `prep_open_dataset()`.
#' @param path_prep Path handed to `prep_open_dataset()`; defaults to `table`.
#' @return The result of `dplyr::tbl()` for `table` on the shared connection.
prep_open_dataset_db <- function(table, ..., path_prep = table) {
  arrow_dataset <- prep_open_dataset(path_prep, ...)
  shared_con <- duckdb_global_con()

  # Make the arrow dataset visible to DuckDB under the requested name.
  duckdb::duckdb_register_arrow(shared_con, table, arrow_dataset)

  dplyr::tbl(shared_con, table)
}

+ 14
- 0
process/R/prepare_candidates.R Целия файл

@@ -0,0 +1,14 @@
#' Build the candidate table from the prepped officers dataset
#'
#' Reads the officers dataset, keeps rows whose `type` is "Candidate" and
#' whose `name` is not missing, stores the original name in `name_display`,
#' upper-cases `name`, keeps one row per `sboe_id`/`name` pair, and finally
#' keeps only rows with a matching `report_id` in `report_list`.
#'
#' @param path_officers Path passed to `prep_open_dataset()`.
#' @param report_list Table used to filter the result by `report_id`;
#'   defaults to `tar_read(report_list)`.
#' @return The filtered and de-duplicated candidate rows.
prepare_candidates <- function(
  path_officers = "../data-prep/officers",
  report_list = tar_read(report_list)
) {
  # Only the type filter runs before collect(); everything after it works on
  # the collected data.
  candidates <- prep_open_dataset(path_officers) |>
    filter(type == "Candidate") |>
    collect()

  named <- filter(candidates, !is.na(name))

  # `name_display` is assigned first so it captures the name before it is
  # upper-cased.
  normalized <- mutate(
    named,
    name_display = name,
    name = toupper(name)
  )

  deduplicated <- distinct(normalized, sboe_id, name, .keep_all = TRUE)

  semi_join(deduplicated, report_list, by = "report_id")
}

Loading…
Отказ
Запис