|
- #+ setup
- library(tidyverse)
- library(fs)
- pkgload::load_all(here::here("process"))
-
- #+ candidates
- john_bell <- "STA-8S285O-C-001"    # NOTE(review): presumably `sboe_id` values
- dan_forest <- "STA-M4HR0Y-C-001"   # (same shape as the IDs in the report paths
- mitch_setzer <- "STA-C1877N-C-001" # further down) — confirm.
-
- #+ load-data
- # `cf` comes from the project's `cf_db_create()`; the rest of this script
- # reads its tables as `cf$<table>`.
- cf <- cf_db_create()
-
- # Splice every element of `cf` into a dm object and declare
- # (sboe_id, report_id) as the primary key of `report_list`.
- cf_dm <-
-   dm::dm(!!!cf) |>
-   dm::dm_add_pk("report_list", c("sboe_id", "report_id"))
-
- ## Report List vs Cover ----
-
- #+ basic-validation
- # Raw row counts of the two tables.
- count(cf$report_list)
- count(cf$cover)
-
- # After de-duplicating on (report_id, sboe_id), "cover" should hold no more
- # reports than "report_list".
- cf$report_list |>
-   distinct(report_id, sboe_id) |>
-   count()
- cf$cover |>
-   distinct(report_id, sboe_id) |>
-   count()
-
- # Rows of `cover` whose (sboe_id, report_id) pair occurs more than once.
- duplicated <- semi_join(
-   cf$cover,
-   cf$cover |>
-     count(sboe_id, report_id) |>
-     filter(n > 1),
-   by = c("sboe_id", "report_id")
- )
-
- # Are the repeated rows exact duplicates of each other? Yes: the two counts
- # below (distinct keys vs. distinct full rows) are the same.
- # => I'm going to go back and call `distinct()` when adding the file
- duplicated |>
-   distinct(sboe_id, report_id) |>
-   count()
- duplicated |>
-   distinct() |>
-   count()
-
- # Some listed reports have no filed date (`received` is NA).
- # => I'm going to leave this alone
- filter(cf$report_list, is.na(received))
-
- # Which listed reports have no row in `cover`?
- # The result is collected, sorted, and given the path of each report's
- # raw text file so the file can be inspected by hand.
- cf$report_list |>
-   anti_join(cf$cover, by = c("report_id", "sboe_id")) |>
-   collect() |>
-   arrange(sboe_id, year, report_id) |>
-   select(1:5, received) |>
-   mutate(
-     path = glue::glue(
-       "../../data-raw/reports/{sboe_id}/all/{report_id}_{received}.txt"
-     )
-   )
-
- # Findings recorded per file:
- #> * `../../data-raw/reports/STA-C3372N-C-001/all/174237_2020-04-01.txt`: missing cover
- #> * `../../data-raw/reports/STA-C4270N-C-001/all/169030_2019-08-15.txt`: missing cover
- #> * `../../data-raw/reports/STA-MW53OC-C-001/all/194017_2022-01-28.txt`: missing cover
- #> * `../../data-raw/reports/FED-JUU72L-C-001/all/201413_2021-07-27.txt`: no records
- #> * `../../data-raw/reports/STA-H3F77S-C-002/all/201481_2021-12-29.txt`: no records
- #> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records
- #> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records
-
- # Identify reports that are missing a cover entry, split into two groups.
- #
- # Every join names its key explicitly. With `by` omitted dplyr joins on all
- # columns the two tables share; for the semi-joins at the end that can
- # include non-key columns (every `report_list` column in the first one), so
- # a row could be dropped over a non-key value such as a missing `received`.
- #
- # `no_records`: listed reports with no rows in `receipts`, `expenditures`
- # or `summary`.
- no_records <-
-   cf$report_list |>
-   anti_join(
-     distinct(cf$receipts, sboe_id, report_id),
-     by = c("sboe_id", "report_id")
-   ) |>
-   anti_join(
-     distinct(cf$expenditures, sboe_id, report_id),
-     by = c("sboe_id", "report_id")
-   ) |>
-   anti_join(
-     distinct(cf$summary, sboe_id, report_id),
-     by = c("sboe_id", "report_id")
-   )
-
- # `missing_cover`: listed reports that have no `cover` row and are not in
- # `no_records`, i.e. they have some records but no cover.
- missing_cover <-
-   cf$report_list |>
-   anti_join(cf$cover, by = c("sboe_id", "report_id")) |>
-   anti_join(no_records, by = c("sboe_id", "report_id"))
-
- # Look those reports up in `report_list` and in `summary`.
- cf$report_list |>
-   semi_join(missing_cover, by = c("sboe_id", "report_id"))
- cf$summary |>
-   semi_join(missing_cover, by = c("sboe_id", "report_id"))
-
-
- # Some committees have "No Id" as their SBOE ID
- # => I'll make these "NOID-{report_id}"
- unique_committees <- distinct(cf$report_list, sboe_id, committee_name)
-
- # Committees whose `sboe_id` appears with more than one committee name.
- semi_join(
-   unique_committees,
-   unique_committees |>
-     count(sboe_id) |>
-     filter(n > 1),
-   by = "sboe_id"
- )
-
- # Same comparison on `cover`: distinct (sboe_id, committee_name) pairs
- # versus distinct sboe_id values.
- cf$cover |>
-   distinct(sboe_id, committee_name) |>
-   count()
- cf$cover |>
-   distinct(sboe_id) |>
-   count()
|