преди 2 години · 7901dc4920
--- a/.gitignore
+++ b/.gitignore
 collect/data-raw/
 collect/data-old/
 data/
 data-prep/
 !collect/data-raw/report_list.csv
--- a/process/DESCRIPTION
+++ b/process/DESCRIPTION
    fs,
    glue,
    httr2,
    lubridate,
    purrr,
    readr,
    rlang,
--- a/process/R/00-nc-sboe-dates.R
+++ b/process/R/00-nc-sboe-dates.R
 # Election days for North Carolina primaries and general elections, 2016-2024.
 #
 # Returns a tibble with one row per election and three columns: `date`
 # (month/day/year text parsed with `mdy()`), `description`, and `source`
 # (the URL the date was taken from).
 election_dates <- function() {
   # Pages that are cited for more than one election.
   src_2024 <- "https://www.ncsbe.gov/campaign-finance/reporting-schedules"
   src_2022 <- "https://web.archive.org/web/20221110183405/https://www.ncsbe.gov/campaign-finance/reporting-schedules"
   src_2018 <- "https://web.archive.org/web/20181223144944/https://www.ncsbe.gov/Elections"
   src_2016 <- "https://web.archive.org/web/20160131011839/http://www.ncsbe.gov/Elections/Election-Information"

   elections <- tibble::tribble(
     ~date,        ~description,   ~source,
     "03/05/2024", "2024 Primary", src_2024,
     "11/05/2024", "2024 General", src_2024,
     "05/17/2022", "2022 Primary", src_2022,
     "11/08/2022", "2022 General", src_2022,
     "3/03/2020",  "2020 Primary", "https://web.archive.org/web/20200126233324/https://www.ncsbe.gov/index.html",
     "11/03/2020", "2020 General", "https://en.wikipedia.org/wiki/2020_United_States_presidential_election_in_North_Carolina",
     "05/08/2018", "2018 Primary", src_2018,
     "11/06/2018", "2018 General", src_2018,
     "03/15/2016", "2016 Primary", src_2016,
     "11/08/2016", "2016 General", src_2016
   )

   # The dates are written month/day/year; convert the text column in place.
   mutate(elections, date = mdy(date))
 }
 # Campaign-finance reporting schedule, transcribed from the pages cited above
 # each table.
 #
 # Returns one tibble with the columns `year`, `doc_name`, `sboe_start_date`,
 # `sboe_end_date`, `sboe_due_date` (dates parsed as month/day/year) and
 # `sboe_alt`, which is TRUE when the transcribed report name ended in "*".
 # Report names are normalised ("Mid Year"/"Year End", "Semi-Annual", no
 # " Plus", no "*") and exact duplicate rows are dropped.
 reporting_schedule <- function() {
   schedules <- list(
     # https://www.ncsbe.gov/campaign-finance/reporting-schedules
     "2023" = tibble::tribble(
       ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
       2023L, "Mid-Year Semi-Annual", "01/01/2023", "06/30/2023", "07/28/2023",
       2023L, "Year-End Semi-Annual", "07/01/2023", "12/31/2023", "01/26/2024",
       2024L, "First Quarter Plus", "01/01/2024", "02/17/2024", "02/27/2024",
       2024L, "Second Quarter", "02/18/2024", "06/30/2024", "07/10/2024",
       2024L, "Third Quarter Plus", "07/01/2024", "10/19/2024", "10/29/2024",
       2024L, "Fourth Quarter", "10/20/2024", "12/31/2024", "01/10/2025",
       2024L, "Mid-year Semi-Annual*", "01/01/2024", "06/30/2024", "07/26/2024",
       2024L, "Year-End Semi-Annual*", "07/01/2024", "12/31/2024", "01/31/2025"
     ),
     # https://web.archive.org/web/20210823183129/https://www.ncsbe.gov/campaign-finance/reporting-schedules
     "2021" = tibble::tribble(
       ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
       2021L, "Mid-Year Semiannual", "01/01/2021", "06/30/2021", "07/30/2021",
       2021L, "Year-End Semiannual", "07/01/2021", "12/31/2021", "01/28/2022"
     ),
     # https://web.archive.org/web/20221110183405/https://www.ncsbe.gov/campaign-finance/reporting-schedules
     "2022" = tibble::tribble(
       ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
       2022L, "First Quarter Plus", "01/01/2022", "04/30/2022", "05/10/2022",
       2022L, "Second Quarter", "05/01/2022", "06/30/2022", "07/12/2022",
       2022L, "Third Quarter Plus", "07/01/2022", "10/22/2022", "11/01/2022",
       2022L, "Fourth Quarter", "10/23/2022", "12/31/2022", "01/11/2023",
       2022L, "Mid-Year Semiannual*", "01/01/2022", "06/30/2022", "07/29/2022",
       2022L, "Year-End Semiannual*", "07/01/2022", "12/31/2022", "01/27/2023"
     ),
     # https://web.archive.org/web/20201228050159/https://www.ncsbe.gov/campaign-finance/reporting-schedules
     "2020" = tibble::tribble(
       ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
       2019L, "Mid-Year Semiannual", "1/1/2019", "6/30/2019", "7/26/2019",
       2019L, "Year-End Semiannual", "7/1/2019", "12/31/2019", "1/31/2020",
       2020L, "First Quarter Plus", "1/1/2020", "2/15/2020", "2/25/2020",
       2020L, "Second Quarter", "2/16/2020", "6/30/2020", "7/10/2020",
       2020L, "Third Quarter Plus", "7/1/2020", "10/17/2020", "10/27/2020",
       2020L, "Fourth Quarter", "10/18/2020", "12/31/2020", "1/12/2021",
       2020L, "Mid-Year Semi-Annual*", "1/1/2020", "6/30/2020", "7/31/2020",
       2020L, "Year-End Semi-Annual*", "7/1/2020", "12/31/2020", "1/29/2021"
     ),
     # https://web.archive.org/web/20181223145312/https://www.ncsbe.gov/campaign-finance/reporting-schedules
     "2018" = tibble::tribble(
       ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
       2018L, "First Quarter Plus", "1/1/2018", "4/21/2018", "4/30/2018",
       2018L, "Second Quarter", "4/22/2018", "6/30/2018", "7/11/2018",
       2018L, "Mid Year Semi Annual*", "1/1/2018", "6/30/2018", "7/27/2018",
       2018L, "Third Quarter Plus", "7/1/2018", "10/20/2018", "10/29/2018",
       2018L, "Fourth Quarter", "10/21/2018", "12/31/2018", "1/10/2019",
       2018L, "Year End Semi Annual*", "7/1/2018", "12/31/2018", "1/25/2019",
       2019L, "Mid Year Semi Annual", "1/1/2019", "6/30/2019", "7/26/2019",
       2019L, "Year End Semi Annual", "7/1/2019", "12/31/2019", "1/31/2020"
     ),
     # https://web.archive.org/web/20170219143056/https://www.ncsbe.gov/campaign-finance/reporting-schedules
     "2017" = tibble::tribble(
       ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
       2017L, "Mid Year Semi Annual", "1/1/2017", "6/30/2017", "7/28/2017",
       2017L, "Year End Semi Annual", "7/1/2017", "12/31/2017", "1/26/2018",
     ),
     # https://web.archive.org/web/20160201011756/https://www.ncsbe.gov/campaign-finance/reporting-schedules
     "2016" = tibble::tribble(
       ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
       2015L, "Year End Semi Annual", "7/1/2015", "12/31/2015", "1/29/2016",
       2016L, "First Quarter", "1/1/2016", "2/29/2016", "3/7/2016",
       2016L, "Second Quarter", "3/1/2016", "6/30/2016", "7/12/2016",
       2016L, "Mid Year Semi Annual*", "1/1/2016", "6/30/2016", "7/29/2016",
       2016L, "Third Quarter", "7/1/2016", "10/22/2016", "10/31/2016",
       2016L, "Fourth Quarter", "10/23/2016", "12/31/2016", "1/11/2017",
       2016L, "Year End Semi Annual*", "7/1/2016", "12/31/2016", "1/27/2017"
     )
   )

   # Bring the differently spelled report names onto one spelling.
   normalize_doc_name <- function(nm) {
     nm <- stringr::str_replace(nm, "Mid-[Yy]ear", "Mid Year")
     nm <- stringr::str_replace(nm, "Year-End", "Year End")
     nm <- stringr::str_replace(nm, "[Ss]emi ?[Aa]nnual", "Semi-Annual")
     nm <- stringr::str_remove(nm, " Plus")
     stringr::str_remove(nm, "[*]")
   }

   schedule <- list_rbind(schedules)
   schedule <- set_names(
     schedule,
     c("year", "doc_name", "sboe_start_date", "sboe_end_date", "sboe_due_date")
   )
   schedule <- type_convert(
     schedule,
     col_types = cols(
       doc_name = col_character(),
       sboe_start_date = col_date(format = "%m/%d/%Y"),
       sboe_end_date = col_date(format = "%m/%d/%Y"),
       sboe_due_date = col_date(format = "%m/%d/%Y")
     )
   )
   schedule <- mutate(
     schedule,
     # Record the trailing "*" before the name clean-up strips it.
     sboe_alt = stringr::str_detect(doc_name, "[*]$"),
     doc_name = normalize_doc_name(doc_name)
   )

   # Some year/report rows are transcribed in more than one table above.
   distinct(schedule)
 }
--- a/process/R/as_report_factor.R
+++ b/process/R/as_report_factor.R
 # Convert report names to a factor whose levels are the four quarterly
 # reports followed by the two semi-annual reports, each title-cased with
 # `stringr::str_to_title()`. Values of `x` that match no level become NA,
 # as with any call to `factor()`.
 as_report_factor <- function(x) {
   quarterly <- paste(c("first", "second", "third", "fourth"), "quarter")
   semi_annual <- paste(c("mid year", "year end"), "Semi-Annual")
   report_levels <- stringr::str_to_title(c(quarterly, semi_annual))
   factor(x, levels = report_levels)
 }
--- a/process/R/calc_report_amended_score.R
+++ b/process/R/calc_report_amended_score.R
 # Add an `amended_score` column to `report_dates`.
 #
 # The score is computed by `amended_score()` separately within each
 # (sboe_id, year, doc_name) group, so reports are only ranked against the
 # other reports of the same committee, year and report name.
 calc_report_amended_score <- function(report_dates) {
   mutate(
     report_dates,
     amended_score = amended_score(
       amended = amended,
       report_id = report_id,
       received_image = received_image,
       received_data = received_data,
       cover_date_filed = cover_date_filed
     ),
     .by = c(sboe_id, year, doc_name)
   )
 }
 # Score each report in a group; a higher score means a stronger candidate.
 #
 # All arguments are parallel vectors with one element per report.
 #
 # @param amended Logical flag; converted with `as.integer()` and weighted 7.
 # @param report_id Report identifiers; the largest id earns one point.
 # @param received_image,received_data,cover_date_filed Dates; the latest
 #   non-missing value of each earns one point.
 # @return A numeric vector the same length as the inputs. It is NA wherever
 #   `amended` is NA.
 amended_score <- function(amended, report_id, received_image, received_data, cover_date_filed) {
   x_amended <- as.integer(amended)

   # 1L where `x` equals the largest non-missing value, otherwise 0L.
   # Missing values never count as the maximum.
   has_and_is_max <- function(x) {
     if (all(is.na(x))) {
       return(integer(length(x)))
     }
     as.integer(!is.na(x) & x == max(x, na.rm = TRUE))
   }

   # Latest of the three dates for each report. `pmax()` keeps the values as
   # dates and yields NA only when all three are missing. (Using `apply()` on
   # a data frame here would coerce every date to character first.)
   row_max_date <- pmax(received_image, received_data, cover_date_filed, na.rm = TRUE)

   has_max_received_data <- has_and_is_max(received_data)
   has_max_date_filed <- has_and_is_max(cover_date_filed)

   (x_amended * 7) + # the amended check mark beats all else
     has_and_is_max(report_id) +
     has_and_is_max(received_image) +
     has_max_received_data +
     has_max_date_filed +
     # bonus point when one report is latest on both data and filing date
     (has_max_received_data & has_max_date_filed) +
     has_and_is_max(row_max_date)
 }
--- a/process/R/db_connect.R
+++ b/process/R/db_connect.R
 cf_db_create <- function(data_dir = here::here("data")) {
 cf_prep_db_create <- function(data_dir = here::here("data-prep")) {
  tables <- dir_ls(data_dir)
  names(tables) <- path_file(tables)
--- a/process/R/fix-sboe_id-missing.R
+++ b/process/R/fix-sboe_id-missing.R
    return(sboe_id)
  }
  idx_missing <- union(idx_missing, idx_has_report_id)
  idx_missing <- intersect(idx_missing, idx_has_report_id)
  sboe_id[idx_missing] <- paste0("NOID-", report_id[idx_missing])
  sboe_id
--- a/process/R/prepare_expenditures_csv.R
+++ b/process/R/prepare_expenditures_csv.R
 process_expenditures_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) {
 prepare_expenditures_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) {
  # Read the files in the directory, extract report_id from the path
  # Compare to report_list to determine which reports go into the data
  expenditures
 }
 write_expenditures_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) {
  expenditures <- process_expenditures_csv(dir_sboe_id, report_list)
 write_prepared_expenditures_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) {
  expenditures <- prepare_expenditures_csv(dir_sboe_id, report_list)
  info <- report_path_info(dir_sboe_id)
  data_dir <- here::here("..", "data", "expenditures", sprintf("sboe_id=%s", info$sboe_id))
  data_dir <- here::here("..", "data-prep", "expenditures", sprintf("sboe_id=%s", info$sboe_id))
  data_path <- path(data_dir, "part-0.parquet")
  dir_create(data_dir)
--- a/process/R/prepare_receipts_csv.R
+++ b/process/R/prepare_receipts_csv.R
 process_receipts_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) {
 prepare_receipts_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) {
  # Read the files in the directory, extract report_id from the path
  # Compare to report_list to determine which reports go into the data
  receipts
 }
 write_receipts_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) {
  receipts <- process_receipts_csv(dir_sboe_id, report_list)
 write_prepared_receipts_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) {
  receipts <- prepare_receipts_csv(dir_sboe_id, report_list)
  info <- report_path_info(dir_sboe_id)
  data_dir <- here::here("..", "data", "receipts", sprintf("sboe_id=%s", info$sboe_id))
  data_dir <- here::here("..", "data-prep", "receipts", sprintf("sboe_id=%s", info$sboe_id))
  data_path <- path(data_dir, "part-0.parquet")
  dir_create(data_dir)
--- a/process/R/prepare_report_list.R
+++ b/process/R/prepare_report_list.R
 # Read the raw report list CSV, repair it, and store it as parquet.
 #
 # `year` and `report_id` are read as integers; the remaining column types
 # are left to `read_csv()`. Missing committee ids are filled in by
 # `fix_sboe_id_missing()` and the manual corrections in
 # `spot_fix_report_list()` are applied before writing.
 #
 # Returns the path of the parquet file that was written, which is relative
 # ("../data-prep/report_list/part-0.parquet").
 prepare_report_list <- function(path_report_list) {
   parquet_path <- path("..", "data-prep", "report_list", "part-0.parquet")
   dir_create(path_dir(parquet_path))

   csv_types <- cols(
     year = col_integer(),
     report_id = col_integer()
   )
   report_list <- read_csv(path_report_list, col_types = csv_types)
   report_list <- mutate(
     report_list,
     sboe_id = fix_sboe_id_missing(sboe_id, report_id)
   )
   report_list <- spot_fix_report_list(report_list)

   arrow::write_parquet(report_list, parquet_path)
   parquet_path
 }
--- a/process/R/prepare_tables_from_report_export.R
+++ b/process/R/prepare_tables_from_report_export.R
 read_report_file <- function(report_path) {
 read_report_export <- function(report_path) {
  info <- report_path_info(report_path)
  lines <- brio::read_file(report_path)
  body <- trimws(sub(header, "", body, fixed = TRUE))
  # body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body)
  body <- pre_process_table_body(title, header, body)
  body <- pre_prepare_table_body(title, header, body)
  csv <- paste0(header, "\n", body)
  data <- read_csv(I(csv), show_col_types = FALSE, col_types = cols(.default = "c"))
  }
  names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3)
  data <- post_process_steps_for_table(data, title)
  data <- post_prepare_steps_for_table(data, title)
  data <- mutate(data, !!!info, .before = 1)
  structure(list(data), names = title)
 }
 pre_process_table_body <- function(table, header, body) {
 pre_prepare_table_body <- function(table, header, body) {
  if (table != "accounts") return(body)
  exp_commas <- stringr::str_count(header, ",")
  paste(body_lines, collapse = "\n")
 }
 post_process_steps_for_table <- function(data, table) {
 post_prepare_steps_for_table <- function(data, table) {
  switch(
    table,
    cover = ,
  all <-
    info$path |>
    map(read_report_file) |>
    map(read_report_export) |>
    list_transpose_bind() |>
    map(report_data_set_column_type)
    )
 }
 write_processed_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) {
 write_prepared_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) {
  reports <- process_report_export(dir_sboe_id, report_list)
  info <- report_path_info(dir_sboe_id)
  if (!any(info$sboe_id == "No Id") && length(unique(info$sboe_id)) == 1) {
    return(write_processed_report_export_parquet(reports, unique(info$sboe_id)))
    return(write_prepared_report_export_parquet(reports, unique(info$sboe_id)))
  }
  sboe_ids <- map_dfr(reports, select, sboe_id) |> pull(sboe_id) |> unique()
    set_names() |>
    map(\(id) map(reports, \(d) filter(d, sboe_id == id)))
  map(sboe_ids, \(id) write_processed_report_export_parquet(reports[[id]], id)) |>
  map(sboe_ids, \(id) write_prepared_report_export_parquet(reports[[id]], id)) |>
    flatten_chr() |>
    unname()
 }
 write_processed_report_export_parquet <- function(reports, sboe_id) {
  base_dir <- here::here("..", "data")
 write_prepared_report_export_parquet <- function(reports, sboe_id) {
  base_dir <- here::here("..", "data-prep")
  sboe_id_param <- sprintf("sboe_id=%s", sboe_id)
  return_path <- c()
--- a/process/R/process_report_dates.R
+++ b/process/R/process_report_dates.R
 # Gather, per report, the dates known from three places: the report list
 # (`received_*`), the official schedule (`sboe_start_date`, `sboe_end_date`)
 # and the report's cover page (`cover_*`).
 #
 # Every column whose name matches "received|date" is passed through
 # `na_if_obviously_wrong_date()`. The two `received_*` columns are then
 # blanked unless they fall on or after the scheduled or the cover start date.
 process_report_dates <- function(report_list_raw, cover_raw) {
   cover_dates <- select(
     cover_raw,
     report_id,
     cover_start_date = date_from,
     cover_end_date = date_to,
     cover_date_filed = date_filed
   )

   scheduled <- left_join(
     report_list_raw,
     reporting_schedule(),
     by = join_by(year, doc_name)
   )
   scheduled <- select(
     scheduled,
     report_id, sboe_id, year, doc_name, amended,
     contains("received_"),
     matches("sboe_(start|end)_date")
   )

   dated <- left_join(scheduled, cover_dates, by = "report_id")
   dated <- mutate(
     dated,
     across(matches("received|date"), na_if_obviously_wrong_date)
   )

   mutate(
     dated,
     # If the received date isn't after at least one of the report or cover date, don't believe it
     received_image = na_if_not_after_one_of(received_image, sboe_start_date, cover_start_date),
     received_data = na_if_not_after_one_of(received_data, sboe_start_date, cover_start_date)
   )
 }
 # Replace implausible dates with NA.
 #
 # @param x A vector of dates.
 # @param earliest Dates before this are set to NA. Defaults to 2016-01-01.
 # @param latest Dates after this are set to NA. Defaults to the current date.
 # @return `x` with out-of-window values replaced by NA. Values that were
 #   already NA are left as they are.
 na_if_obviously_wrong_date <- function(x, earliest = as.Date("2016-01-01"), latest = Sys.Date()) {
   # Build the mask NA-safely so missing dates never appear in the index.
   implausible <- !is.na(x) & (x > latest | x < earliest)
   x[implausible] <- NA
   x
 }
 # Keep a date only when it falls on or after at least one reference date.
 #
 # @param x A vector of dates.
 # @param ... Reference date vectors, compared element-wise with `x`.
 # @return `x` with NA wherever no reference date is on or before it. A
 #   missing reference date never vouches for a value. With no reference
 #   dates at all, every element becomes NA.
 na_if_not_after_one_of <- function(x, ...) {
   others <- list(...)

   # TRUE where `x` is on or after `y`; comparisons involving NA count as FALSE.
   is_on_or_after <- function(y) {
     ok <- x >= y
     ok[is.na(ok)] <- FALSE
     ok
   }

   # Fold from an all-FALSE start so an empty `...` is handled without error.
   allow <- Reduce(`|`, lapply(others, is_on_or_after), rep(FALSE, length(x)))
   x[!allow] <- NA
   x
 }
 # Midpoint of two single dates.
 #
 # When only one of `x` and `y` is missing the other is returned unchanged
 # (the mean is taken with `na.rm = TRUE`); when both are missing the result
 # is `NA_Date_`. Both arguments are expected to have length one.
 mean_date_scalar <- function(x, y) {
   has_value <- !is.na(x) || !is.na(y)
   if (has_value) {
     mean(c(x, y), na.rm = TRUE)
   } else {
     NA_Date_
   }
 }
--- a/process/R/process_report_list.R
+++ b/process/R/process_report_list.R
 process_report_list <- function(path_report_list) {
  out <- path("..", "data", "report_list", "part-0.parquet")
  dir_create(path_dir(out))
  read_csv(
    path_report_list,
    col_types = cols(
      year = col_integer(),
      report_id = col_integer()
 process_report_list <- function(report_list_raw, report_amended_score) {
  # select the correct report_id to use for each report group
  report_keep <-
    report_amended_score |>
    group_by(sboe_id, year, doc_name) |>
    slice_max(amended_score) |>
    # follow up to be certain there's only one report per group
    slice_max(report_id)
  report_keep_count <- report_keep |> count()
  if (any(report_keep_count$n > 1)) {
    stop("Have not successfully selected a single report per committee and report group.")
  }
  report_keep <- report_keep |> ungroup() |> select(report_id, matches("received|date"))
  report_list_raw |>
    select(-matches("received|date")) |>
    right_join(report_keep, by = "report_id") |>
    select(-committee_name, -doc_type, -tar_group, image_id = img_link) |>
    mutate(
      across(doc_name, as_report_factor),
      image_id = sub(".+?(\\d+)$", "\\1", image_id)
    )
  ) |>
    mutate(sboe_id = fix_sboe_id_missing(sboe_id, report_id)) |>
    arrow::write_parquet(out)
 }
--- a/process/R/spot_fix_report_list.R
+++ b/process/R/spot_fix_report_list.R
 # Manual corrections to the report list. Each entry pairs `ids` (the columns
 # that identify the row to change) with `values` (the columns to overwrite).
 spot_fixes_report_list <- list(
   # This report is actually an amended report
   list(ids = list(report_id = 195397), values = list(amended = TRUE)),
   # This report is actually the first one, not amended
   list(ids = list(report_id = 159437), values = list(amended = FALSE)),
   # Overrides the report name of this report
   list(ids = list(report_id = 161042), values = list(doc_name = "Mid Year Semi-Annual"))
 )
 # Apply every entry of `spot_fixes_report_list` to `report_list`, in order.
 #
 # Each fix becomes a one-row table holding its `ids` and `values` columns,
 # which `rows_update()` matches on the id columns to overwrite the values.
 # Returns the corrected report list.
 spot_fix_report_list <- function(report_list) {
   apply_fix <- function(tbl, fix) {
     patch <- list_cbind(unname(map(fix, as_tibble)))
     rows_update(tbl, patch, by = names(fix$ids))
   }
   Reduce(apply_fix, spot_fixes_report_list, report_list)
 }
--- a/process/R/write_committee_parquet.R
+++ b/process/R/write_committee_parquet.R
 # Write one committee record per committee to a parquet file.
 #
 # For each `sboe_id` the report(s) with the latest `end_date` in
 # `report_list` are looked up in the cover dataset, and the columns
 # `sboe_id:committee_type`, `fund_type` and `fund_name` are kept.
 #
 # @param report_list Report list table; defaults to the `report_list` target.
 # @return Whatever `arrow::write_parquet()` returns; called for its side
 #   effect of writing "data/committees/part-0.parquet".
 write_committee_parquet <- function(report_list = tar_read(report_list)) {
   cover_path <- here::here("../data/cover")
   # NOTE(review): the input is read from "../data" but the output is written
   # under "data" (both relative to the project root). Confirm this is intended.
   out_path <- here::here("data/committees/part-0.parquet")

   cover <- arrow::open_dataset(cover_path, partitioning = "sboe_id")
   latest_report_by_committee <- report_list |> slice_max(end_date, by = "sboe_id")
   committees <-
     cover |>
     semi_join(latest_report_by_committee, by = "report_id") |>
     collect() |>
     select(sboe_id:committee_type, fund_type, fund_name)

   # Create the target directory first, as the other parquet writers do;
   # otherwise the write fails when the directory does not exist yet.
   fs::dir_create(fs::path_dir(out_path))
   arrow::write_parquet(committees, out_path)
 }
--- a/process/_targets.R
+++ b/process/_targets.R
 # Replace the target list below with your own:
 list(
  tar_target(path_report_list, "../data-raw/report_list.csv", format = "file"),
  tar_target(report_list, process_report_list(path_report_list)),
  tar_target(path_report_list_csv, "../data-raw/report_list.csv", format = "file"),
  tar_target(path_report_list_raw, prepare_report_list(path_report_list_csv)),
  tar_target(report_list_raw, arrow::read_parquet(path_report_list_raw)),
  tar_target(
    dirs_all,
     fs::dir_ls("../data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory")
  ),
  tar_target(
    paths_all_parquet,
    write_prepared_report_export(dirs_all, report_list_raw),
    pattern = map(dirs_all),
    format = "file"
  ),
  tar_target(
    path_receipts_parquet,
    write_receipts_parquet(dirs_receipts, report_list),
    write_prepared_receipts_parquet(dirs_receipts, report_list_raw),
    pattern = map(dirs_receipts),
    format = "file"
  ),
  tar_target(
    path_expenditures_parquet,
    write_expenditures_parquet(dirs_expenditures, report_list),
    write_prepared_expenditures_parquet(dirs_expenditures, report_list_raw),
    pattern = map(dirs_expenditures),
    format = "file"
  ),
  tar_target(
    paths_all_parquet,
    write_processed_report_export(dirs_all, report_list),
    pattern = map(dirs_all),
    format = "file"
    cover_raw,
    {
      paths_all_parquet # depends on prepared report_exports
      arrow::open_dataset("../data/cover", partitioning = "sboe_id") |>
        dplyr::collect()
    }
  ),
  tar_target(
    report_dates,
    process_report_dates(report_list_raw, cover_raw)
  ),
  tar_target(
    report_amended_score,
    calc_report_amended_score(report_dates)
  ),
  tar_target(
    report_list,
    process_report_list(report_list_raw, report_amended_score)
  )
 )
--- a/process/_targets/meta/meta
+++ b/process/_targets/meta/meta
--- a/process/run.R
+++ b/process/run.R
 'usage:
  run.R all
  run.R target <targets>...
  run.R target <targets>... [--shortcut]
  run.R -h | --help
 options:
  targets::tar_make()
 } else {
  cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}")
  targets::tar_make(targets::any_of(!!opts$targets))
  targets::tar_make(targets::any_of(!!opts$targets), shortcut = opts$shortcut)
 }
 # targets::tar_make_clustermq(workers = 2) # nolint
--- a/reports/2023-10-07_organizing/2023-10-07_organizing.R
+++ b/reports/2023-10-07_organizing/2023-10-07_organizing.R
 #+ basic-validation
 cf$report_list |> count()
 cf$cover |> count()
 ## When de-duplicated there should be no more reports in "cover" than in "report_list"
 cf$report_list |> distinct(report_id, sboe_id) |> count()
 ## Cover should have the same number of reports as report_list
 cf$cover |> count()
 cf$cover |> distinct(report_id, sboe_id) |> count()
 # duplicated sboe/report ids
    by = c("sboe_id", "report_id")
  )
 # are all of the duplicated rows full duplicates? yes, both are the same.
 # FIXED: are all of the duplicated rows full duplicates? yes, both are the same.
 # => I'm going to go back and call `distinct()` when adding the file
 duplicated |> distinct(sboe_id, report_id) |> count()
 duplicated |> distinct() |> count()
 # => I'm going to leave this alone
 cf$report_list |> filter(is.na(received))
 # How many reports are missing from cover?
 # FIXED: How many reports are missing from cover?
 cf$report_list |>
  anti_join(cf$cover, by = c("report_id", "sboe_id")) |>
  collect() |>
 #> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records
 #> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records
 # Identify reports that are missing a cover entry...
 # FIXED: Identify reports that are missing a cover entry...
 no_records <-
  cf$report_list |>
  anti_join(distinct(cf$receipts, sboe_id, report_id)) |>
 # Some committees have "No Id" as their SBOE ID
 # => I'll make these "NOID-{report_id}"
 unique_committees <- cf$report_list |> distinct(sboe_id, committee_name)
 unique_committees_report <- cf$report_list |> distinct(sboe_id, committee_name)
 unique_committees |>
 unique_committees_report |>
  semi_join(
    unique_committees |> count(sboe_id) |> filter(n > 1),
    unique_committees_report |> count(sboe_id) |> filter(n > 1),
    by = "sboe_id"
  )
 cf$cover |> distinct(sboe_id, committee_name) |> count()
 cf$cover |> distinct(sboe_id) |> count()
 # Bad report dates
 # Pull the first three columns, the report type and every date column of the
 # cover table into memory for the checks below.
 cover <-
  cf$cover |>
  select(1:3, report_type, contains("date")) |>
  collect()
 # Covers where any of the three dates lies in the future
 cover |>
  filter(
    date_filed > today() | date_to > today() | date_from > today()
  )
 # Covers filed before the start of the period they report on
 cover |> filter(date_filed < date_from)
 # All covers of one committee, for a closer look
 cover |> filter(sboe_id == "FED-361L24-C-001")
 # Period end date vs. filing date, leaving out the covers with future dates
 cover |>
  filter(
    !(date_filed > today() | date_to > today() | date_from > today())
  ) |>
  ggplot() +
  aes(x = date_to, y = date_filed) +
  geom_point()
 # Committee names change over time?
 # One row per distinct (sboe_id, committee_name) pair found in the covers
 unique_committees <- cf$cover |> distinct(sboe_id, committee_name)
 # Covers of the committees that appear under more than one name, ordered by
 # committee and filing date
 cf$cover |>
  semi_join(
    unique_committees |> count(sboe_id) |> filter(n > 1),
    by = "sboe_id"
  ) |>
  arrange(sboe_id, date_filed) |>
  select(1, date_filed, 2:zip_code)
 # Committee registration ----
 # Committee details from the most recently filed cover, joined with the date
 # of the earliest filed cover of the same committee
 cf$cover |>
  slice_max(date_filed, by = sboe_id) |>
  select(sboe_id:committee_type, last_date_filed = date_filed) |>
  left_join(
    cf$cover |> slice_min(date_filed, by = sboe_id) |> select(sboe_id, first_date_filed = date_filed),
    by = "sboe_id"
  ) |>
  arrange(desc(last_date_filed), first_date_filed)
 # simple committee registration (best effort, not perfect) FIXME
 # Takes the committee name from the report with the latest end date, using
 # the largest report_id to break ties.
 committees <-
  report_list |>
  slice_max(end_date, by = sboe_id, n = 1) |>
  slice_max(report_id, by = sboe_id, n = 1) |>
  distinct(sboe_id, committee_name)
 # Dealing with amended reports --------------------------------------------
 ## Which reports have been amended?
 # Number of reports and of amended reports per committee, year and report
 # name, largest groups first, with the committee name attached
 reports_summary <-
  report_list |>
  summarize(
    n_total = n(),
    n_amended = sum(amended),
    .by = c(sboe_id, year, doc_name)
  ) |>
  arrange(desc(n_total)) |>
  left_join(committees, by = "sboe_id")
 # Dan Forest, 2016-Q2 Report, 6 reports, 5 amendments
 # [x] Can be resolved by looking at last received date!
 # NOTE(review): this uses a `received` column, while other parts of this
 # script use `received_image` / `received_data` - confirm the column exists.
 report_list |>
  filter(
    sboe_id == "STA-M4HR0Y-C-001",
    year == 2016,
    doc_name == "Second Quarter"
  ) |>
  slice_max(received)
 # Groups where the latest received date still leaves more than one report
 report_list |>
  group_by(sboe_id, year, doc_name) |>
  slice_max(received) |>
  count(sort = TRUE) |>
  filter(n > 1)
 # report_id is basically sequential
 # Plot report_id against the received date, restricted to dates between
 # 2016-01-01 and today
 # NOTE(review): filters on `received_data` first but then on `received` -
 # confirm which column is intended.
 report_list |>
  filter(!is.na(received_data)) |>
  filter(received < today()) |>
  filter(received > ymd("2016-01-01")) |>
  ggplot() +
  aes(received, report_id) +
  geom_point()
 # The report with the smallest report_id
 report_list |>
  filter(report_id == min(report_id))
 # Image-received date vs. data-received date, both within 2016-01-01..today
 report_list |>
  filter(
    between(received_image, ymd("2016-01-01"), today()),
    between(received_data, ymd("2016-01-01"), today())
  ) |>
  ggplot() +
  aes(received_image, received_data) +
  geom_point()
 # Report end date vs. received date, one panel per kind of received date
 report_list |>
  filter(
    between(received_image, ymd("2016-01-01"), today()),
    between(received_data, ymd("2016-01-01"), today())
  ) |>
  pivot_longer(contains("received_"), names_to = "received_item", values_to = "received_when") |>
  mutate(received_item = str_remove(received_item, "received_")) |>
  ggplot() +
  aes(end_date, received_when) +
  geom_point() +
  facet_wrap(~ received_item)
 # Reports whose end date lies in the future
 report_list |> filter(end_date > today())
 # Report names in display order: the four quarters, then the two semi-annual
 # reports
 levels_doc_name <- str_to_title(c(
  paste(c("first", "second", "third", "fourth"), "quarter"),
  paste(c("mid year", "year end"), "Semi-Annual")
 ))
 # wait, what are the correct reporting dates?
 # For each year and report name, the most frequently listed start/end date
 # pair; ties are broken by the later end date
 report_list |>
  mutate(doc_name = factor(doc_name, levels_doc_name)) |>
  group_by(year, doc_name) |>
  count(start_date, end_date, sort = TRUE) |>
  slice_max(n) |>
  slice_max(end_date) |>
  print(n = 48)
 # https://www.ncleg.gov/EnactedLegislation/Statutes/HTML/BySection/Chapter_163/GS_163-278.9.html
 # Draw the listed reporting periods per year as horizontal ranges, with the
 # number of reports using each period mapped to transparency. Only periods
 # that start and end between Jan 1 of the report year and Jan 1 of the next
 # year are kept.
 report_list |>
  mutate(doc_name = factor(doc_name, levels_doc_name)) |>
  filter(
    between(start_date, make_date(year), make_date(year + 1)),
    between(end_date, make_date(year), make_date(year + 1))
  ) |>
  group_by(year, doc_name) |>
  count(start_date, end_date, sort = TRUE) |>
  ungroup() |>
  arrange(year, doc_name) |>
  mutate(quarterly = ifelse(grepl("Quarter", doc_name), "Quarterly", "Semi-Annual")) |>
  ggplot() +
  aes(y = year, color = doc_name, group = doc_name) +
  geom_linerange(
    aes(xmin = start_date, xmax = end_date, alpha = n),
    linewidth = 20,
  ) +
  facet_wrap(~quarterly) +
  theme_minimal()
 # Reports whose listed start and end dates both fall outside the scheduled
 # reporting period
 # NOTE(review): `report_dates` is not defined in the visible part of this
 # script; presumably a schedule table with sboe_start_date / sboe_end_date
 # keyed by year and doc_name - confirm.
 report_list |>
  left_join(report_dates, by = c("year", "doc_name"), relationship = "many-to-one") |>
  filter(
    !(between(start_date, sboe_start_date, sboe_end_date) | between(end_date, sboe_start_date, sboe_end_date))
  ) |>
  View()
--- a/reports/problems/problems.qmd
+++ b/reports/problems/problems.qmd
 ---
 title: PROBLEMS
 author: Garrick Aden-Buie
 format: pdf
 execute:
  echo: true
 ---
 ## Setup 
 ```{r}
 library(tidyverse)
 library(fs)
 pkgload::load_all(here::here("process"))
 ```
 ```{r load-data}
 cf <- cf_prep_db_create()
 ```
 ```{r load-data-report_list}
 report_list <- 
  cf$report_list |> 
  collect() |>
  mutate(across(doc_name, as_report_factor))
 ```
 ### Problem scoping
 These per-report totals help determine the size of each problem.
 ```{r}
 expenditures_by_report <- 
  cf$expenditures |>
  summarize(
    n_expenses = n(),
    total_expenses = sum(amount),
    .by = report_id
  ) |> 
  collect() |>
  full_join(report_list["report_id"], by = "report_id") |>
  replace_na(list(n_expenses = 0, total_expenses = 0))
 receipts_by_report <- 
  cf$receipts |>
  summarize(
    n_receipts = n(),
    total_receipts = sum(amount),
    .by = report_id
  ) |> 
  collect() |>
  full_join(report_list["report_id"], by = "report_id") |>
  replace_na(list(n_receipts = 0, total_receipts = 0))
 ```
 ## Doc search problems
 ```{r}
 report_cover_report_type <- 
  report_list |>
  mutate(report_type_listed = paste(year, doc_name)) |>
  select(report_id, sboe_id, report_type_listed) |>
  left_join(
    cf$cover |> select(sboe_id, report_id, report_type_cover = report_type) |> collect()
  )
 report_cover_report_type |> count(report_type_listed == report_type_cover)
 report_cover_report_type |> filter(report_type_listed != report_type_cover)
 ```
 ```{r}
 report_cover_report_type |>
  filter(report_type_listed != report_type_cover) |>
  left_join(expenditures_by_report) |>
  left_join(receipts_by_report) |>
  arrange(total_receipts)
 ```
 In some of these cases, the cover is probably wrong:
 ```{r}
 report_cover_report_type |>
  filter(report_type_listed != report_type_cover) |>
  left_join(
    cf$cover |> select(report_id, date_from, date_to) |> collect()
  ) |>
  left_join(
    reporting_schedule() |> 
      mutate(report_type_sched = paste(year, doc_name)) |>
      select(report_type_sched, sboe_start_date, sboe_end_date),
    by = c(date_from = "sboe_start_date", date_to = "sboe_end_date")
  )
 ```
 ## Dates
 ```{r}
 report_dates <- tar_read(report_dates, store = here::here("process/_targets"))
 ```
 ```{r}
 report_dates |> filter(sboe_start_date != cover_start_date) # 3,422
 report_dates |> filter(sboe_end_date != cover_end_date)     #   590
 report_dates |> filter(received_image < cover_start_date)   #    60
 report_dates |> filter(received_image < cover_end_date)     #   222
 report_dates |> filter(received_data < cover_start_date)    #     2
 report_dates |> filter(received_data < cover_end_date)      #    45
 report_dates |> filter(cover_date_filed < cover_end_date)   #   950
 ```
 ## Picking amended
 Picking the correct amended report is problematic because no date in the `report_list` can really be trusted.
 ### Interestingly problematic
 ```{r}
 # STA-C3235N-C-001  2017 Year End Semi-Annual
 # WAK-56BLZN-C-001  2020 Mid Year Semi-Annual CITIZENS FOR TOMMY MATTHEWS
 # STA-Z6M8TR-C-001  2017 Year End Semi-Annual FIREFIGHTERS FOR RESPON
 ```