瀏覽代碼

fix missing sboe_id values that are "No Id" in the database

These are filled in with `NOID-{report_id}`, which creates a unique sboe_id
for the committee for each report. Otherwise, all "No Id" committees would
be impossible to differentiate.
main
Garrick Aden-Buie 2 年之前
父節點
當前提交
070e95a120
沒有發現已知的金鑰在資料庫的簽署中
共有 4 個文件被更改,包括 45 次插入6 次删除
  1. +9
    -0
      process/R/fix-sboe_id-missing.R
  2. +1
    -0
      process/R/process_report_list.R
  3. +6
    -4
      process/R/report_path_info.R
  4. +29
    -2
      reports/2023-10-07_organizing/2023-10-07_organizing.R

+ 9
- 0
process/R/fix-sboe_id-missing.R 查看文件

#' Replace placeholder "No Id" SBOE ids with report-specific ids
#'
#' Entries of `sboe_id` equal to the literal string "No Id" are replaced by
#' `"NOID-<report_id>"`, using the report id at the same position. All other
#' entries, including `NA`, are returned unchanged.
#'
#' @param sboe_id Vector of SBOE ids.
#' @param report_id Vector of report ids, positionally aligned with `sboe_id`.
#'   A single value is recycled to every element of `sboe_id`.
#' @return `sboe_id` with each "No Id" entry replaced by `"NOID-<report_id>"`.
fix_sboe_id_missing <- function(sboe_id, report_id) {
  # `which()` drops NA comparisons, so NA ids are never treated as "No Id".
  idx_missing <- which(sboe_id == "No Id")
  if (length(idx_missing) == 0L) {
    return(sboe_id)
  }

  if (length(report_id) == 1L) {
    # Recycle a scalar report id so positional indexing below stays in range.
    report_id <- rep(report_id, length(sboe_id))
  }

  # Indexing past the end of `report_id` yields NA, which would silently
  # create colliding "NOID-NA" ids; fail loudly instead.
  if (max(idx_missing) > length(report_id)) {
    stop(
      "`report_id` must have length 1 or cover every \"No Id\" entry of ",
      "`sboe_id`.",
      call. = FALSE
    )
  }

  sboe_id[idx_missing] <- paste0("NOID-", report_id[idx_missing])
  sboe_id
}

+ 1
- 0
process/R/process_report_list.R 查看文件

report_id = col_integer() report_id = col_integer()
) )
) |> ) |>
mutate(sboe_id = fix_sboe_id_missing(sboe_id, report_id)) |>
arrow::write_parquet(out) arrow::write_parquet(out)
} }

+ 6
- 4
process/R/report_path_info.R 查看文件



idx_reports <- map_int(x, \(x) which(x == "reports")) idx_reports <- map_int(x, \(x) which(x == "reports"))


tibble(
sboe_id = map2_vec(x, idx_reports, \(x, i) x[i + 1]),
report_id = map2_int(x, idx_reports, \(x, i) as.integer(strsplit(x[i + 3], "_")[[1]][1])),
)
sboe_id <- map2_vec(x, idx_reports, \(x, i) x[i + 1])
report_id <- map2_int(x, idx_reports, \(x, i) as.integer(strsplit(x[i + 3], "_")[[1]][1]))

sboe_id <- fix_sboe_id_missing(sboe_id, report_id)

tibble(sboe_id, report_id)
} }


report_info_in_report_list <- function(files, report_list = tar_read(report_list)) { report_info_in_report_list <- function(files, report_list = tar_read(report_list)) {

+ 29
- 2
reports/2023-10-07_organizing/2023-10-07_organizing.R 查看文件

#+ setup #+ setup
library(tidyverse) library(tidyverse)
library(fs) library(fs)
load_all(here::here("process"))
pkgload::load_all(here::here("process"))


#+ candidates #+ candidates
john_bell <- "STA-8S285O-C-001" john_bell <- "STA-8S285O-C-001"
#> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records #> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records
#> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records #> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records


# TODO: Identify reports that are missing a cover entry...
# Identify reports that are missing a cover entry...
# Reports in the report list that have no matching rows in receipts,
# expenditures, or summary. No `by` is given, so each anti_join matches on the
# columns shared with the distinct (sboe_id, report_id) pairs.
no_records <-
cf$report_list |>
anti_join(distinct(cf$receipts, sboe_id, report_id)) |>
anti_join(distinct(cf$expenditures, sboe_id, report_id)) |>
anti_join(distinct(cf$summary, sboe_id, report_id))

# Reports with no cover row for their (sboe_id, report_id), excluding the
# reports already identified above as having no records at all.
missing_cover <-
cf$report_list |>
anti_join(cf$cover, by = c("sboe_id", "report_id")) |>
anti_join(no_records, by = c("sboe_id", "report_id"))

# Inspect the report list and summary rows that correspond to the reports
# missing a cover entry. No `by` is given, so these joins match on every
# column shared with `missing_cover`.
cf$report_list |> semi_join(missing_cover)
cf$summary |>semi_join(missing_cover)


# Some committees have "No Id" as their SBOE ID
# => I'll make these "NOID-{report_id}"
unique_committees <- cf$report_list |> distinct(sboe_id, committee_name)

# Show the (sboe_id, committee_name) pairs whose sboe_id is paired with more
# than one committee name.
unique_committees |>
semi_join(
unique_committees |> count(sboe_id) |> filter(n > 1),
by = "sboe_id"
)

# Number of distinct (sboe_id, committee_name) pairs versus distinct sboe_id
# values in the cover data; presumably compared to check whether each id maps
# to a single committee name -- confirm intent.
cf$cover |> distinct(sboe_id, committee_name) |> count()
cf$cover |> distinct(sboe_id) |> count()

Loading…
取消
儲存