您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

61 行
2.1KB

  1. #+ setup
  2. library(tidyverse)
  3. library(fs)
  4. load_all(here::here("process"))
  5. #+ candidates
  6. john_bell <- "STA-8S285O-C-001"
  7. dan_forest <- "STA-M4HR0Y-C-001"
  8. mitch_setzer <- "STA-C1877N-C-001"
  9. #+ load-data
  10. cf <- cf_db_create()
  11. cf_dm <- dm::dm(!!!cf)
  12. cf_dm <- dm::dm_add_pk(cf_dm, "report_list", c("sboe_id", "report_id"))
  13. ## Report List vs Cover ----
  14. #+ basic-validation
  15. cf$report_list |> count()
  16. cf$cover |> count()
  17. ## When de-duplicated there should be no more reports in "cover" than in "report_list"
  18. cf$report_list |> distinct(report_id, sboe_id) |> count()
  19. cf$cover |> distinct(report_id, sboe_id) |> count()
  20. # duplicated sboe/report ids
  21. duplicated <-
  22. cf$cover |>
  23. semi_join(
  24. cf$cover |> count(sboe_id, report_id) |> filter(n > 1),
  25. by = c("sboe_id", "report_id")
  26. )
  27. # are all of the duplicated rows full duplicates? yes, both are the same.
  28. # => I'm going to go back and call `distinct()` when adding the file
  29. duplicated |> distinct(sboe_id, report_id) |> count()
  30. duplicated |> distinct() |> count()
  31. # The filed date is missing from some listed reports
  32. # => I'm going to leave this alone
  33. cf$report_list |> filter(is.na(received))
  34. # How many reports are missing from cover?
  35. cf$report_list |>
  36. anti_join(cf$cover, by = c("report_id", "sboe_id")) |>
  37. collect() |>
  38. arrange(sboe_id, year, report_id) |>
  39. select(1:5, received) |>
  40. mutate(path = glue::glue("../../data-raw/reports/{sboe_id}/all/{report_id}_{received}.txt"))
  41. #> * `../../data-raw/reports/STA-C3372N-C-001/all/174237_2020-04-01.txt`: missing cover
  42. #> * `../../data-raw/reports/STA-C4270N-C-001/all/169030_2019-08-15.txt`: missing cover
  43. #> * `../../data-raw/reports/STA-MW53OC-C-001/all/194017_2022-01-28.txt`: missing cover
  44. #> * `../../data-raw/reports/FED-JUU72L-C-001/all/201413_2021-07-27.txt`: no records
  45. #> * `../../data-raw/reports/STA-H3F77S-C-002/all/201481_2021-12-29.txt`: no records
  46. #> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records
  47. #> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records
  48. # TODO: Identify reports that are missing a cover entry...