Explorar el Código

fix missing sboe_id values that are "No Id" in the database

These values are filled in with `NOID-{report_id}`, which creates a unique
sboe_id for the committee for each report. Otherwise, all "No Id" committees
would be impossible to differentiate.
main
Garrick Aden-Buie hace 2 años
padre
commit
070e95a120
No se encontró ninguna clave conocida en la base de datos para esta firma
Se han modificado 4 ficheros con 45 adiciones y 6 borrados
  1. +9
    -0
      process/R/fix-sboe_id-missing.R
  2. +1
    -0
      process/R/process_report_list.R
  3. +6
    -4
      process/R/report_path_info.R
  4. +29
    -2
      reports/2023-10-07_organizing/2023-10-07_organizing.R

+ 9
- 0
process/R/fix-sboe_id-missing.R Ver fichero

@@ -0,0 +1,9 @@
#' Replace placeholder "No Id" SBOE IDs with report-specific IDs
#'
#' Every element of `sboe_id` that is exactly `"No Id"` is replaced with
#' `"NOID-{report_id}"`, using the `report_id` at the same position. All other
#' elements, including `NA`, are returned unchanged.
#'
#' @param sboe_id Vector of SBOE IDs, possibly containing `"No Id"`.
#' @param report_id Vector of report IDs, parallel to `sboe_id`.
#' @return `sboe_id` with each `"No Id"` entry replaced by
#'   `paste0("NOID-", report_id)` for that position.
fix_sboe_id_missing <- function(sboe_id, report_id) {
  # which() ignores NA comparisons, so NA ids are never treated as "No Id".
  idx_missing <- which(sboe_id == "No Id")
  if (length(idx_missing) == 0L) {
    return(sboe_id)
  }

  # Positional lookup below requires parallel vectors; a shorter `report_id`
  # would otherwise silently yield "NOID-NA" and collapse distinct committees.
  if (length(report_id) != length(sboe_id)) {
    stop(
      "`report_id` must have the same length as `sboe_id` (",
      length(sboe_id), "), not ", length(report_id), ".",
      call. = FALSE
    )
  }

  sboe_id[idx_missing] <- paste0("NOID-", report_id[idx_missing])
  sboe_id
}

+ 1
- 0
process/R/process_report_list.R Ver fichero

@@ -9,5 +9,6 @@ process_report_list <- function(path_report_list) {
report_id = col_integer()
)
) |>
mutate(sboe_id = fix_sboe_id_missing(sboe_id, report_id)) |>
arrow::write_parquet(out)
}

+ 6
- 4
process/R/report_path_info.R Ver fichero

@@ -5,10 +5,12 @@ report_path_info <- function(report_path) {

idx_reports <- map_int(x, \(x) which(x == "reports"))

tibble(
sboe_id = map2_vec(x, idx_reports, \(x, i) x[i + 1]),
report_id = map2_int(x, idx_reports, \(x, i) as.integer(strsplit(x[i + 3], "_")[[1]][1])),
)
sboe_id <- map2_vec(x, idx_reports, \(x, i) x[i + 1])
report_id <- map2_int(x, idx_reports, \(x, i) as.integer(strsplit(x[i + 3], "_")[[1]][1]))

sboe_id <- fix_sboe_id_missing(sboe_id, report_id)

tibble(sboe_id, report_id)
}

report_info_in_report_list <- function(files, report_list = tar_read(report_list)) {

+ 29
- 2
reports/2023-10-07_organizing/2023-10-07_organizing.R Ver fichero

@@ -1,7 +1,7 @@
#+ setup
library(tidyverse)
library(fs)
load_all(here::here("process"))
pkgload::load_all(here::here("process"))

#+ candidates
john_bell <- "STA-8S285O-C-001"
@@ -57,4 +57,31 @@ cf$report_list |>
#> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records
#> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records

# TODO: Identify reports that are missing a cover entry...
# Identify reports that are missing a cover entry...
no_records <-
cf$report_list |>
anti_join(distinct(cf$receipts, sboe_id, report_id)) |>
anti_join(distinct(cf$expenditures, sboe_id, report_id)) |>
anti_join(distinct(cf$summary, sboe_id, report_id))

missing_cover <-
cf$report_list |>
anti_join(cf$cover, by = c("sboe_id", "report_id")) |>
anti_join(no_records, by = c("sboe_id", "report_id"))

cf$report_list |> semi_join(missing_cover)
cf$summary |>semi_join(missing_cover)


# Some committees have "No Id" as their SBOE ID
# => I'll make these "NOID-{report_id}"
unique_committees <- cf$report_list |> distinct(sboe_id, committee_name)

unique_committees |>
semi_join(
unique_committees |> count(sboe_id) |> filter(n > 1),
by = "sboe_id"
)

cf$cover |> distinct(sboe_id, committee_name) |> count()
cf$cover |> distinct(sboe_id) |> count()

Cargando…
Cancelar
Guardar