| @@ -1,3 +1,13 @@ | |||
# File-level cache environment; holds the shared DuckDB connection so that
# repeated calls reuse one handle instead of opening a new database each time.
.globals <- new.env(parent = emptyenv())

#' Get the shared DuckDB connection
#'
#' Returns the connection cached in `.globals$duckdb_con`, opening one with
#' `DBI::dbConnect(duckdb::duckdb())` on first use. A cached connection that
#' is no longer valid (for example because it was disconnected elsewhere) is
#' replaced by a fresh one rather than handed back to the caller.
#'
#' @return A DBI connection to DuckDB.
duckdb_global_con <- function() {
  con <- .globals$duckdb_con
  # Reconnect when nothing is cached yet or the cached handle has been closed.
  if (is.null(con) || !DBI::dbIsValid(con)) {
    con <- DBI::dbConnect(duckdb::duckdb())
    .globals$duckdb_con <- con
  }
  con
}
| cf_prep_db_create <- function(data_dir = here::here("data-prep")) { | |||
| tables <- dir_ls(data_dir) | |||
| names(tables) <- path_file(tables) | |||
| @@ -17,7 +27,7 @@ cf_prep_db_create <- function(data_dir = here::here("data-prep")) { | |||
| tbls_arrow$report_list <- arrow::open_dataset(tables["report_list"]) | |||
| cli::cli_progress_done() | |||
| con <- DBI::dbConnect(duckdb::duckdb()) | |||
| con <- duckdb_global_con() | |||
| cli::cli_progress_step("Creating db") | |||
| @@ -0,0 +1,27 @@ | |||
#' Open a prepared dataset with arrow
#'
#' Resolves `path_prep` and opens it with `arrow::open_dataset()`. When the
#' path does not exist as given, it is looked up first under `data-prep` via
#' `here::here()` and then under `../data-prep`.
#'
#' @param path_prep Path to the dataset, or a name to look up under
#'   `data-prep`.
#' @param partitioning Passed to `arrow::open_dataset()`. Replaced by `NULL`
#'   unless `path_prep` is a directory that contains at least one
#'   subdirectory.
#' @param ... Further arguments passed on to `arrow::open_dataset()`.
#' @return The value of `arrow::open_dataset()`.
prep_open_dataset <- function(path_prep, partitioning = "sboe_id", ...) {
  if (!fs::file_exists(path_prep)) {
    path_here <- here::here("data-prep", path_prep)
    path_up <- fs::path("..", "data-prep", path_prep)

    if (fs::file_exists(path_here)) {
      path_prep <- path_here
    } else if (fs::file_exists(path_up)) {
      path_prep <- path_up
    } else {
      stop("File not found: ", path_prep, call. = FALSE)
    }
  }

  # fs::dir_ls() expects a directory, so only list subdirectories when
  # `path_prep` is one; a single data file is opened without partitioning.
  has_subdirs <- dir.exists(path_prep) &&
    length(fs::dir_ls(path_prep, type = "dir")) > 0
  if (!has_subdirs) {
    partitioning <- NULL
  }

  arrow::open_dataset(path_prep, partitioning = partitioning, ...)
}
#' Open a prepared dataset as a lazy table on the shared DuckDB connection
#'
#' Opens the dataset with `prep_open_dataset()`, registers it on the
#' connection returned by `duckdb_global_con()` under the name `table`, and
#' returns a `dplyr::tbl()` that refers to it.
#'
#' @param table Name under which the dataset is registered in DuckDB.
#' @param ... Arguments passed on to `prep_open_dataset()`.
#' @param path_prep Dataset path or name; defaults to `table`.
#' @return The value of `dplyr::tbl(con, table)`.
prep_open_dataset_db <- function(table, ..., path_prep = table) {
  pq <- prep_open_dataset(path_prep, ...)
  con <- duckdb_global_con()

  # The connection is shared between calls, so `table` may still be
  # registered from an earlier call. Drop any previous registration first so
  # that registering the same name again does not fail.
  duckdb::duckdb_unregister_arrow(con, table)
  duckdb::duckdb_register_arrow(con, table, pq)

  dplyr::tbl(con, table)
}
| @@ -0,0 +1,14 @@ | |||
#' Build the candidate table from the officers dataset
#'
#' Reads the officers dataset, keeps rows whose `type` is `"Candidate"` and
#' whose `name` is not missing, stores the original name in `name_display`,
#' upper-cases `name`, keeps one row per `sboe_id`/`name` pair, and finally
#' keeps only rows whose `report_id` appears in `report_list`.
#'
#' @param path_officers Path handed to `prep_open_dataset()`.
#' @param report_list Table with a `report_id` column used to filter the
#'   result; read with `tar_read(report_list)` by default.
#' @return The filtered, de-duplicated candidate rows.
prepare_candidates <- function(
  path_officers = "../data-prep/officers",
  report_list = tar_read(report_list)
) {
  # Filter on the dataset before collect() so only candidate rows are pulled
  # into memory.
  candidates <- prep_open_dataset(path_officers) |>
    filter(type == "Candidate") |>
    collect()

  with_name <- filter(candidates, !is.na(name))
  normalized <- mutate(
    with_name,
    name_display = name,
    name = toupper(name)
  )

  # NOTE(review): distinct() keeps the first row per sboe_id/name before the
  # semi_join() on report_id, so a candidate is dropped when that first row's
  # report is not in `report_list` -- confirm this ordering is intended.
  unique_candidates <- distinct(normalized, sboe_id, name, .keep_all = TRUE)
  semi_join(unique_candidates, report_list, by = "report_id")
}