| @@ -0,0 +1,60 @@ | |||
#+ setup
library(tidyverse)
library(fs)
# `load_all()` is not exported by tidyverse or fs, so a bare call only works
# when devtools/pkgload happens to be attached already (e.g. from .Rprofile).
# Qualifying it lets the script run in a fresh, non-interactive session too.
pkgload::load_all(here::here("process"))
#+ candidates
# Identifiers of the three candidates of interest.
# NOTE(review): these look like `sboe_id` values -- confirm against the data.
john_bell <- "STA-8S285O-C-001"
dan_forest <- "STA-M4HR0Y-C-001"
mitch_setzer <- "STA-C1877N-C-001"
#+ load-data
# `cf_db_create()` is not defined in this script (presumably it comes from the
# package loaded in the setup chunk). Its result is used as a named collection
# of tables: it is spliced into `dm::dm()` and indexed as `cf$report_list` and
# `cf$cover`.
cf <- cf_db_create()

# Build the data model and declare (sboe_id, report_id) as the compound
# primary key of `report_list`.
cf_dm <-
  dm::dm(!!!cf) |>
  dm::dm_add_pk("report_list", c("sboe_id", "report_id"))
## Report List vs Cover ----
#+ basic-validation
# Raw row counts of the two tables.
count(cf$report_list)
count(cf$cover)

## Once de-duplicated on (report_id, sboe_id), "cover" should not hold more
## reports than "report_list".
count(distinct(cf$report_list, report_id, sboe_id))
count(distinct(cf$cover, report_id, sboe_id))
# Every "cover" row whose (sboe_id, report_id) pair occurs more than once.
duplicated <-
  semi_join(
    cf$cover,
    filter(count(cf$cover, sboe_id, report_id), n > 1),
    by = c("sboe_id", "report_id")
  )

# Are all of the duplicated rows full duplicates? Yes: the count of distinct
# key pairs and the count of fully distinct rows are the same.
# => I'm going to go back and call `distinct()` when adding the file
count(distinct(duplicated, sboe_id, report_id))
count(distinct(duplicated))
# Some listed reports have no filed date (`received` is NA).
# => I'm going to leave this alone
filter(cf$report_list, is.na(received))
# How many reports are missing from cover?
# Keep the `report_list` rows that have no (report_id, sboe_id) match in
# `cover`, then attach a path under ../../data-raw/reports for each of them.
# NOTE(review): `received` can be NA (see the check above); glue renders NA as
# "NA" by default, so those rows get a path ending in `_NA.txt` -- confirm
# that matches the file names on disk.
anti_join(cf$report_list, cf$cover, by = c("report_id", "sboe_id")) |>
  collect() |>
  arrange(sboe_id, year, report_id) |>
  select(1:5, received) |>
  mutate(
    path = glue::glue(
      "../../data-raw/reports/{sboe_id}/all/{report_id}_{received}.txt"
    )
  )
#> * `../../data-raw/reports/STA-C3372N-C-001/all/174237_2020-04-01.txt`: missing cover
#> * `../../data-raw/reports/STA-C4270N-C-001/all/169030_2019-08-15.txt`: missing cover
#> * `../../data-raw/reports/STA-MW53OC-C-001/all/194017_2022-01-28.txt`: missing cover
#> * `../../data-raw/reports/FED-JUU72L-C-001/all/201413_2021-07-27.txt`: no records
#> * `../../data-raw/reports/STA-H3F77S-C-002/all/201481_2021-12-29.txt`: no records
#> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records
#> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records
# TODO: Identify reports that are missing a cover entry...