| collect/data-raw/ | collect/data-raw/ | ||||
| collect/data-old/ | collect/data-old/ | ||||
| data/ | data/ | ||||
| data-prep/ | |||||
| !collect/data-raw/report_list.csv | !collect/data-raw/report_list.csv |
| fs, | fs, | ||||
| glue, | glue, | ||||
| httr2, | httr2, | ||||
| lubridate, | |||||
| purrr, | purrr, | ||||
| readr, | readr, | ||||
| rlang, | rlang, |
#' Election dates with the page each date was taken from
#'
#' @return A tibble with one row per election and three columns: `date`
#'   (parsed from month/day/year text with `mdy()`), `description`, and
#'   `source` (the URL the date was copied from).
election_dates <- function() {
  # Sources that back more than one row are named once here.
  src_2024 <- "https://www.ncsbe.gov/campaign-finance/reporting-schedules"
  src_2022 <- "https://web.archive.org/web/20221110183405/https://www.ncsbe.gov/campaign-finance/reporting-schedules"
  src_2018 <- "https://web.archive.org/web/20181223144944/https://www.ncsbe.gov/Elections"
  src_2016 <- "https://web.archive.org/web/20160131011839/http://www.ncsbe.gov/Elections/Election-Information"

  elections <- tibble::tribble(
    ~date,        ~description,   ~source,
    "03/05/2024", "2024 Primary", src_2024,
    "11/05/2024", "2024 General", src_2024,
    "05/17/2022", "2022 Primary", src_2022,
    "11/08/2022", "2022 General", src_2022,
    "3/03/2020",  "2020 Primary", "https://web.archive.org/web/20200126233324/https://www.ncsbe.gov/index.html",
    "11/03/2020", "2020 General", "https://en.wikipedia.org/wiki/2020_United_States_presidential_election_in_North_Carolina",
    "05/08/2018", "2018 Primary", src_2018,
    "11/06/2018", "2018 General", src_2018,
    "03/15/2016", "2016 Primary", src_2016,
    "11/08/2016", "2016 General", src_2016
  )

  # The dates are typed as month/day/year text; convert them to Date.
  mutate(elections, date = mdy(date))
}
#' Published reporting schedules, combined into one table
#'
#' Each schedule below was transcribed from the page named in the comment
#' above it. The tables are stacked, renamed, date-parsed, and the report
#' names are normalised so that spelling variants of the same report collapse
#' to a single name.
#'
#' @return A tibble with columns `year`, `doc_name`, `sboe_start_date`,
#'   `sboe_end_date`, `sboe_due_date` (all three parsed as dates) and
#'   `sboe_alt`, which is `TRUE` for rows whose published name ended in `*`.
#'   Fully duplicated rows are dropped.
reporting_schedule <- function() {
  # All schedules share one header; build each table through this constructor.
  schedule_table <- function(...) {
    tibble::tribble(
      ~Report.Year, ~Report.Name, ~Report.Start.Date, ~Report.End.Date, ~Report.Due.Date,
      ...
    )
  }

  published <- list(
    # https://www.ncsbe.gov/campaign-finance/reporting-schedules
    "2023" = schedule_table(
      2023L, "Mid-Year Semi-Annual",  "01/01/2023", "06/30/2023", "07/28/2023",
      2023L, "Year-End Semi-Annual",  "07/01/2023", "12/31/2023", "01/26/2024",
      2024L, "First Quarter Plus",    "01/01/2024", "02/17/2024", "02/27/2024",
      2024L, "Second Quarter",        "02/18/2024", "06/30/2024", "07/10/2024",
      2024L, "Third Quarter Plus",    "07/01/2024", "10/19/2024", "10/29/2024",
      2024L, "Fourth Quarter",        "10/20/2024", "12/31/2024", "01/10/2025",
      2024L, "Mid-year Semi-Annual*", "01/01/2024", "06/30/2024", "07/26/2024",
      2024L, "Year-End Semi-Annual*", "07/01/2024", "12/31/2024", "01/31/2025"
    ),
    # https://web.archive.org/web/20210823183129/https://www.ncsbe.gov/campaign-finance/reporting-schedules
    "2021" = schedule_table(
      2021L, "Mid-Year Semiannual", "01/01/2021", "06/30/2021", "07/30/2021",
      2021L, "Year-End Semiannual", "07/01/2021", "12/31/2021", "01/28/2022"
    ),
    # https://web.archive.org/web/20221110183405/https://www.ncsbe.gov/campaign-finance/reporting-schedules
    "2022" = schedule_table(
      2022L, "First Quarter Plus",   "01/01/2022", "04/30/2022", "05/10/2022",
      2022L, "Second Quarter",       "05/01/2022", "06/30/2022", "07/12/2022",
      2022L, "Third Quarter Plus",   "07/01/2022", "10/22/2022", "11/01/2022",
      2022L, "Fourth Quarter",       "10/23/2022", "12/31/2022", "01/11/2023",
      2022L, "Mid-Year Semiannual*", "01/01/2022", "06/30/2022", "07/29/2022",
      2022L, "Year-End Semiannual*", "07/01/2022", "12/31/2022", "01/27/2023"
    ),
    # https://web.archive.org/web/20201228050159/https://www.ncsbe.gov/campaign-finance/reporting-schedules
    "2020" = schedule_table(
      2019L, "Mid-Year Semiannual",   "1/1/2019",   "6/30/2019",  "7/26/2019",
      2019L, "Year-End Semiannual",   "7/1/2019",   "12/31/2019", "1/31/2020",
      2020L, "First Quarter Plus",    "1/1/2020",   "2/15/2020",  "2/25/2020",
      2020L, "Second Quarter",        "2/16/2020",  "6/30/2020",  "7/10/2020",
      2020L, "Third Quarter Plus",    "7/1/2020",   "10/17/2020", "10/27/2020",
      2020L, "Fourth Quarter",        "10/18/2020", "12/31/2020", "1/12/2021",
      2020L, "Mid-Year Semi-Annual*", "1/1/2020",   "6/30/2020",  "7/31/2020",
      2020L, "Year-End Semi-Annual*", "7/1/2020",   "12/31/2020", "1/29/2021"
    ),
    # https://web.archive.org/web/20181223145312/https://www.ncsbe.gov/campaign-finance/reporting-schedules
    "2018" = schedule_table(
      2018L, "First Quarter Plus",    "1/1/2018",   "4/21/2018",  "4/30/2018",
      2018L, "Second Quarter",        "4/22/2018",  "6/30/2018",  "7/11/2018",
      2018L, "Mid Year Semi Annual*", "1/1/2018",   "6/30/2018",  "7/27/2018",
      2018L, "Third Quarter Plus",    "7/1/2018",   "10/20/2018", "10/29/2018",
      2018L, "Fourth Quarter",        "10/21/2018", "12/31/2018", "1/10/2019",
      2018L, "Year End Semi Annual*", "7/1/2018",   "12/31/2018", "1/25/2019",
      2019L, "Mid Year Semi Annual",  "1/1/2019",   "6/30/2019",  "7/26/2019",
      2019L, "Year End Semi Annual",  "7/1/2019",   "12/31/2019", "1/31/2020"
    ),
    # https://web.archive.org/web/20170219143056/https://www.ncsbe.gov/campaign-finance/reporting-schedules
    "2017" = schedule_table(
      2017L, "Mid Year Semi Annual", "1/1/2017", "6/30/2017",  "7/28/2017",
      2017L, "Year End Semi Annual", "7/1/2017", "12/31/2017", "1/26/2018"
    ),
    # https://web.archive.org/web/20160201011756/https://www.ncsbe.gov/campaign-finance/reporting-schedules
    "2016" = schedule_table(
      2015L, "Year End Semi Annual",  "7/1/2015",   "12/31/2015", "1/29/2016",
      2016L, "First Quarter",         "1/1/2016",   "2/29/2016",  "3/7/2016",
      2016L, "Second Quarter",        "3/1/2016",   "6/30/2016",  "7/12/2016",
      2016L, "Mid Year Semi Annual*", "1/1/2016",   "6/30/2016",  "7/29/2016",
      2016L, "Third Quarter",         "7/1/2016",   "10/22/2016", "10/31/2016",
      2016L, "Fourth Quarter",        "10/23/2016", "12/31/2016", "1/11/2017",
      2016L, "Year End Semi Annual*", "7/1/2016",   "12/31/2016", "1/27/2017"
    )
  )

  # Stack the tables in list order and switch to the pipeline's column names.
  stacked <- set_names(
    list_rbind(published),
    c("year", "doc_name", "sboe_start_date", "sboe_end_date", "sboe_due_date")
  )

  typed <- type_convert(
    stacked,
    col_types = cols(
      doc_name = col_character(),
      sboe_start_date = col_date(format = "%m/%d/%Y"),
      sboe_end_date = col_date(format = "%m/%d/%Y"),
      sboe_due_date = col_date(format = "%m/%d/%Y")
    )
  )

  # Collapse the spelling variants used across years to one name per report.
  normalise_doc_name <- function(name) {
    name <- stringr::str_replace(name, "Mid-[Yy]ear", "Mid Year")
    name <- stringr::str_replace(name, "Year-End", "Year End")
    name <- stringr::str_replace(name, "[Ss]emi ?[Aa]nnual", "Semi-Annual")
    name <- stringr::str_remove(name, " Plus")
    stringr::str_remove(name, "[*]")
  }

  flagged <- mutate(
    typed,
    # Must be computed before the asterisk is stripped from doc_name below.
    sboe_alt = stringr::str_detect(doc_name, "[*]$"),
    doc_name = normalise_doc_name(doc_name)
  )

  # The same year appears in more than one published schedule; keep one copy.
  distinct(flagged)
}
#' Convert report names to a factor with a fixed level order
#'
#' The levels are the four quarterly report names followed by the two
#' semi-annual report names, each passed through `stringr::str_to_title()`.
#'
#' @param x Vector of report names.
#' @return A factor built from `x` with those levels; values not among the
#'   levels become `NA`.
as_report_factor <- function(x) {
  quarterly <- paste(c("first", "second", "third", "fourth"), "quarter")
  semi_annual <- paste(c("mid year", "year end"), "Semi-Annual")
  factor(x, stringr::str_to_title(c(quarterly, semi_annual)))
}
#' Add an `amended_score` column to the report dates table
#'
#' The score is computed by `amended_score()` separately within each
#' combination of `sboe_id`, `year` and `doc_name`.
#'
#' @param report_dates Data frame with the columns `amended`, `report_id`,
#'   `received_image`, `received_data`, `cover_date_filed`, `sboe_id`, `year`
#'   and `doc_name`.
#' @return `report_dates` with the additional `amended_score` column.
calc_report_amended_score <- function(report_dates) {
  mutate(
    report_dates,
    amended_score = amended_score(
      amended,
      report_id,
      received_image,
      received_data,
      cover_date_filed
    ),
    .by = c(sboe_id, year, doc_name)
  )
}
#' Score each report in a group by how likely it is the one to keep
#'
#' All arguments are parallel vectors describing the reports of one group.
#' A report earns 7 points when `amended` is `TRUE` and one point for each of
#' the following that holds:
#'
#' * it has the largest `report_id` in the group;
#' * it has the latest `received_image`;
#' * it has the latest `received_data`;
#' * it has the latest `cover_date_filed`;
#' * it has both the latest `received_data` and the latest `cover_date_filed`;
#' * its own latest date (across the three date vectors) is the latest in the
#'   group.
#'
#' The non-amended criteria can contribute at most 6 points, so the amended
#' flag always outweighs them.
#'
#' @param amended Logical vector (converted with `as.integer()`).
#' @param report_id Vector of report ids; compared with `max()`.
#' @param received_image,received_data,cover_date_filed Date vectors; `NA`
#'   entries never count as the latest.
#' @return Numeric vector of scores, the same length as the inputs. An `NA` in
#'   `amended` yields an `NA` score.
amended_score <- function(amended, report_id, received_image, received_data, cover_date_filed) {
  # 1L where `x` equals the group maximum, 0L elsewhere (including for NA and
  # when the whole vector is NA).
  has_and_is_max <- function(x) {
    if (all(is.na(x))) {
      return(rep(0L, length(x)))
    }
    as.integer(!is.na(x) & x == max(x, na.rm = TRUE))
  }

  # Latest date per report. `pmax()` keeps the Date class and is vectorised,
  # unlike `apply()` over a data frame, which coerces dates to character.
  row_max_date <- pmax(received_image, received_data, cover_date_filed, na.rm = TRUE)

  has_max_received_data <- has_and_is_max(received_data)
  has_max_date_filed <- has_and_is_max(cover_date_filed)

  (as.integer(amended) * 7) + # the amended check mark beats all else
    has_and_is_max(report_id) +
    has_and_is_max(received_image) +
    has_max_received_data +
    has_max_date_filed +
    (has_max_received_data & has_max_date_filed) +
    has_and_is_max(row_max_date)
}
| cf_db_create <- function(data_dir = here::here("data")) { | |||||
| cf_prep_db_create <- function(data_dir = here::here("data-prep")) { | |||||
| tables <- dir_ls(data_dir) | tables <- dir_ls(data_dir) | ||||
| names(tables) <- path_file(tables) | names(tables) <- path_file(tables) | ||||
| return(sboe_id) | return(sboe_id) | ||||
| } | } | ||||
| idx_missing <- union(idx_missing, idx_has_report_id) | |||||
| idx_missing <- intersect(idx_missing, idx_has_report_id) | |||||
| sboe_id[idx_missing] <- paste0("NOID-", report_id[idx_missing]) | sboe_id[idx_missing] <- paste0("NOID-", report_id[idx_missing]) | ||||
| sboe_id | sboe_id |
| process_expenditures_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| prepare_expenditures_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| # Read the files in the directory, extract report_id from the path | # Read the files in the directory, extract report_id from the path | ||||
| # Compare to report_list to determine which reports go into the data | # Compare to report_list to determine which reports go into the data | ||||
| expenditures | expenditures | ||||
| } | } | ||||
| write_expenditures_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| expenditures <- process_expenditures_csv(dir_sboe_id, report_list) | |||||
| write_prepared_expenditures_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| expenditures <- prepare_expenditures_csv(dir_sboe_id, report_list) | |||||
| info <- report_path_info(dir_sboe_id) | info <- report_path_info(dir_sboe_id) | ||||
| data_dir <- here::here("..", "data", "expenditures", sprintf("sboe_id=%s", info$sboe_id)) | |||||
| data_dir <- here::here("..", "data-prep", "expenditures", sprintf("sboe_id=%s", info$sboe_id)) | |||||
| data_path <- path(data_dir, "part-0.parquet") | data_path <- path(data_dir, "part-0.parquet") | ||||
| dir_create(data_dir) | dir_create(data_dir) | ||||
| process_receipts_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| prepare_receipts_csv <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| # Read the files in the directory, extract report_id from the path | # Read the files in the directory, extract report_id from the path | ||||
| # Compare to report_list to determine which reports go into the data | # Compare to report_list to determine which reports go into the data | ||||
| receipts | receipts | ||||
| } | } | ||||
| write_receipts_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| receipts <- process_receipts_csv(dir_sboe_id, report_list) | |||||
| write_prepared_receipts_parquet <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| receipts <- prepare_receipts_csv(dir_sboe_id, report_list) | |||||
| info <- report_path_info(dir_sboe_id) | info <- report_path_info(dir_sboe_id) | ||||
| data_dir <- here::here("..", "data", "receipts", sprintf("sboe_id=%s", info$sboe_id)) | |||||
| data_dir <- here::here("..", "data-prep", "receipts", sprintf("sboe_id=%s", info$sboe_id)) | |||||
| data_path <- path(data_dir, "part-0.parquet") | data_path <- path(data_dir, "part-0.parquet") | ||||
| dir_create(data_dir) | dir_create(data_dir) | ||||
#' Read the raw report list CSV and store a prepared copy as parquet
#'
#' The CSV is read with `year` and `report_id` as integers, `sboe_id` is
#' passed through `fix_sboe_id_missing()`, the manual corrections from
#' `spot_fix_report_list()` are applied, and the result is written to
#' `../data-prep/report_list/part-0.parquet` (relative to the working
#' directory), creating the directory if needed.
#'
#' @param path_report_list Path to the report list CSV.
#' @return The path of the parquet file that was written.
prepare_report_list <- function(path_report_list) {
  parquet_path <- path("..", "data-prep", "report_list", "part-0.parquet")
  dir_create(path_dir(parquet_path))

  csv_types <- cols(
    year = col_integer(),
    report_id = col_integer()
  )
  raw_list <- read_csv(path_report_list, col_types = csv_types)

  with_ids <- mutate(raw_list, sboe_id = fix_sboe_id_missing(sboe_id, report_id))
  arrow::write_parquet(spot_fix_report_list(with_ids), parquet_path)

  parquet_path
}
| read_report_file <- function(report_path) { | |||||
| read_report_export <- function(report_path) { | |||||
| info <- report_path_info(report_path) | info <- report_path_info(report_path) | ||||
| lines <- brio::read_file(report_path) | lines <- brio::read_file(report_path) | ||||
| body <- trimws(sub(header, "", body, fixed = TRUE)) | body <- trimws(sub(header, "", body, fixed = TRUE)) | ||||
| # body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body) | # body <- gsub("([^,]+ )\n([^,]+)", "\\1\\2", body) | ||||
| body <- pre_process_table_body(title, header, body) | |||||
| body <- pre_prepare_table_body(title, header, body) | |||||
| csv <- paste0(header, "\n", body) | csv <- paste0(header, "\n", body) | ||||
| data <- read_csv(I(csv), show_col_types = FALSE, col_types = cols(.default = "c")) | data <- read_csv(I(csv), show_col_types = FALSE, col_types = cols(.default = "c")) | ||||
| } | } | ||||
| names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3) | names(data) <- snakecase::to_snake_case(names(data), parsing_option = 3) | ||||
| data <- post_process_steps_for_table(data, title) | |||||
| data <- post_prepare_steps_for_table(data, title) | |||||
| data <- mutate(data, !!!info, .before = 1) | data <- mutate(data, !!!info, .before = 1) | ||||
| structure(list(data), names = title) | structure(list(data), names = title) | ||||
| } | } | ||||
| pre_process_table_body <- function(table, header, body) { | |||||
| pre_prepare_table_body <- function(table, header, body) { | |||||
| if (table != "accounts") return(body) | if (table != "accounts") return(body) | ||||
| exp_commas <- stringr::str_count(header, ",") | exp_commas <- stringr::str_count(header, ",") | ||||
| paste(body_lines, collapse = "\n") | paste(body_lines, collapse = "\n") | ||||
| } | } | ||||
| post_process_steps_for_table <- function(data, table) { | |||||
| post_prepare_steps_for_table <- function(data, table) { | |||||
| switch( | switch( | ||||
| table, | table, | ||||
| cover = , | cover = , | ||||
| all <- | all <- | ||||
| info$path |> | info$path |> | ||||
| map(read_report_file) |> | |||||
| map(read_report_export) |> | |||||
| list_transpose_bind() |> | list_transpose_bind() |> | ||||
| map(report_data_set_column_type) | map(report_data_set_column_type) | ||||
| ) | ) | ||||
| } | } | ||||
| write_processed_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| write_prepared_report_export <- function(dir_sboe_id, report_list = tar_read(report_list)) { | |||||
| reports <- process_report_export(dir_sboe_id, report_list) | reports <- process_report_export(dir_sboe_id, report_list) | ||||
| info <- report_path_info(dir_sboe_id) | info <- report_path_info(dir_sboe_id) | ||||
| if (!any(info$sboe_id == "No Id") && length(unique(info$sboe_id)) == 1) { | if (!any(info$sboe_id == "No Id") && length(unique(info$sboe_id)) == 1) { | ||||
| return(write_processed_report_export_parquet(reports, unique(info$sboe_id))) | |||||
| return(write_prepared_report_export_parquet(reports, unique(info$sboe_id))) | |||||
| } | } | ||||
| sboe_ids <- map_dfr(reports, select, sboe_id) |> pull(sboe_id) |> unique() | sboe_ids <- map_dfr(reports, select, sboe_id) |> pull(sboe_id) |> unique() | ||||
| set_names() |> | set_names() |> | ||||
| map(\(id) map(reports, \(d) filter(d, sboe_id == id))) | map(\(id) map(reports, \(d) filter(d, sboe_id == id))) | ||||
| map(sboe_ids, \(id) write_processed_report_export_parquet(reports[[id]], id)) |> | |||||
| map(sboe_ids, \(id) write_prepared_report_export_parquet(reports[[id]], id)) |> | |||||
| flatten_chr() |> | flatten_chr() |> | ||||
| unname() | unname() | ||||
| } | } | ||||
| write_processed_report_export_parquet <- function(reports, sboe_id) { | |||||
| base_dir <- here::here("..", "data") | |||||
| write_prepared_report_export_parquet <- function(reports, sboe_id) { | |||||
| base_dir <- here::here("..", "data-prep") | |||||
| sboe_id_param <- sprintf("sboe_id=%s", sboe_id) | sboe_id_param <- sprintf("sboe_id=%s", sboe_id) | ||||
| return_path <- c() | return_path <- c() |
#' Assemble the candidate dates for every report
#'
#' Joins the report list to the published schedule (by `year` and `doc_name`)
#' and to the dates found on each report's cover page (by `report_id`), then
#' blanks out dates that cannot be trusted.
#'
#' @param report_list_raw Report list with at least `report_id`, `sboe_id`,
#'   `year`, `doc_name`, `amended` and the `received_*` columns.
#' @param cover_raw Cover table with `report_id`, `date_from`, `date_to` and
#'   `date_filed`.
#' @return One row per joined report with the id columns, the `received_*`
#'   columns, `sboe_start_date`/`sboe_end_date`, and the cover dates renamed
#'   to `cover_start_date`, `cover_end_date` and `cover_date_filed`.
process_report_dates <- function(report_list_raw, cover_raw) {
  cover_dates <- select(
    cover_raw,
    report_id,
    cover_start_date = date_from,
    cover_end_date = date_to,
    cover_date_filed = date_filed
  )

  scheduled <- left_join(
    report_list_raw,
    reporting_schedule(),
    by = join_by(year, doc_name)
  )
  trimmed <- select(
    scheduled,
    report_id, sboe_id, year, doc_name, amended,
    contains("received_"),
    matches("sboe_(start|end)_date")
  )
  with_cover <- left_join(trimmed, cover_dates, by = "report_id")

  # Every received/date column goes through the same plausibility filter.
  plausible <- mutate(
    with_cover,
    across(matches("received|date"), na_if_obviously_wrong_date)
  )

  # A received date is only believed when it is on or after at least one of
  # the scheduled start date or the cover start date.
  mutate(
    plausible,
    received_image = na_if_not_after_one_of(received_image, sboe_start_date, cover_start_date),
    received_data = na_if_not_after_one_of(received_data, sboe_start_date, cover_start_date)
  )
}
#' Replace implausible dates with `NA`
#'
#' A date is implausible when it lies after `max_date` or before `min_date`.
#' Existing `NA`s are left untouched.
#'
#' @param x A Date vector.
#' @param min_date Earliest date to keep. Defaults to 2016-01-01, the cutoff
#'   this function has always used.
#' @param max_date Latest date to keep. Defaults to the current date.
#' @return `x` with out-of-window entries set to `NA`; class and length are
#'   unchanged.
na_if_obviously_wrong_date <- function(x, min_date = as.Date("2016-01-01"), max_date = Sys.Date()) {
  # `which()` drops the NA comparisons produced by missing dates.
  implausible <- which(x > max_date | x < min_date)
  x[implausible] <- NA
  x
}
#' Keep a date only when it is on or after at least one reference date
#'
#' Each element of `x` is compared (`>=`) with the matching element of every
#' vector in `...`. A comparison involving `NA` counts as not satisfied.
#' Elements that satisfy none of the comparisons are set to `NA`.
#'
#' @param x A Date vector.
#' @param ... Reference Date vectors, compared element-wise with `x`. With no
#'   reference vectors nothing can be satisfied, so every element becomes
#'   `NA`.
#' @return `x` with unsupported entries replaced by `NA`.
na_if_not_after_one_of <- function(x, ...) {
  references <- list(...)

  on_or_after <- function(reference) {
    hit <- x >= reference
    hit[is.na(hit)] <- FALSE
    hit
  }

  # Seeding the fold with all-FALSE keeps the empty `...` case well defined.
  allowed <- Reduce(`|`, lapply(references, on_or_after), rep(FALSE, length(x)))
  x[!allowed] <- NA
  x
}
#' Midpoint of two single dates, ignoring a missing one
#'
#' @param x,y Length-one Date values.
#' @return The mean of the non-missing inputs, or a missing Date when both
#'   are `NA`.
mean_date_scalar <- function(x, y) {
  if (!is.na(x) || !is.na(y)) {
    return(mean(c(x, y), na.rm = TRUE))
  }
  as.Date(NA)
}
| process_report_list <- function(path_report_list) { | |||||
| out <- path("..", "data", "report_list", "part-0.parquet") | |||||
| dir_create(path_dir(out)) | |||||
| read_csv( | |||||
| path_report_list, | |||||
| col_types = cols( | |||||
| year = col_integer(), | |||||
| report_id = col_integer() | |||||
| process_report_list <- function(report_list_raw, report_amended_score) { | |||||
| # select the correct report_id to use for each report group | |||||
| report_keep <- | |||||
| report_amended_score |> | |||||
| group_by(sboe_id, year, doc_name) |> | |||||
| slice_max(amended_score) |> | |||||
| # follow up to be certain there's only one report per group | |||||
| slice_max(report_id) | |||||
| report_keep_count <- report_keep |> count() | |||||
| if (any(report_keep_count$n > 1)) { | |||||
| stop("Have not successfully selected a single report per committee and report group.") | |||||
| } | |||||
| report_keep <- report_keep |> ungroup() |> select(report_id, matches("received|date")) | |||||
| report_list_raw |> | |||||
| select(-matches("received|date")) |> | |||||
| right_join(report_keep, by = "report_id") |> | |||||
| select(-committee_name, -doc_type, -tar_group, image_id = img_link) |> | |||||
| mutate( | |||||
| across(doc_name, as_report_factor), | |||||
| image_id = sub(".+?(\\d+)$", "\\1", image_id) | |||||
| ) | ) | ||||
| ) |> | |||||
| mutate(sboe_id = fix_sboe_id_missing(sboe_id, report_id)) |> | |||||
| arrow::write_parquet(out) | |||||
| } | } |
# Manual corrections for individual rows of the report list. Each entry pairs
# `ids` (the columns and values identifying the row) with `values` (the
# columns to overwrite on that row).
spot_fixes_report_list <- list(
  # This report is actually an amended report
  list(ids = list(report_id = 195397), values = list(amended = TRUE)),
  # This report is actually the first one, not amended
  list(ids = list(report_id = 159437), values = list(amended = FALSE)),
  # This report's doc_name is overridden
  list(ids = list(report_id = 161042), values = list(doc_name = "Mid Year Semi-Annual"))
)
#' Apply the manual corrections in `spot_fixes_report_list`
#'
#' Each correction is turned into a one-row table (its `ids` columns followed
#' by its `values` columns) and applied with `rows_update()`, matching on the
#' names of its `ids`. Corrections are applied in list order.
#'
#' @param report_list The report list data frame.
#' @return `report_list` with every correction applied.
spot_fix_report_list <- function(report_list) {
  apply_fix <- function(current, fix) {
    patch <- list_cbind(unname(map(fix, as_tibble)))
    rows_update(current, patch, by = names(fix$ids))
  }
  Reduce(apply_fix, spot_fixes_report_list, report_list)
}
#' Write one committee record per latest report to parquet
#'
#' For every `sboe_id`, the report(s) with the largest `end_date` in
#' `report_list` are looked up in the cover dataset, and the columns from
#' `sboe_id` through `committee_type` plus `fund_type` and `fund_name` are
#' written to `data/committees/part-0.parquet` under the project root.
#'
#' @param report_list Report list with `sboe_id`, `report_id` and `end_date`.
#'   Defaults to the `report_list` target.
#' @return The value returned by `arrow::write_parquet()`.
write_committee_parquet <- function(report_list = tar_read(report_list)) {
  cover_path <- here::here("../data/cover")
  # NOTE(review): the cover data is read from "../data" but the output goes to
  # "data" under the project root -- confirm this asymmetry is intended.
  out_path <- here::here("data/committees/part-0.parquet")

  cover <- arrow::open_dataset(cover_path, partitioning = "sboe_id")

  # Ties on end_date keep every tied report for that committee.
  latest_report_by_committee <- report_list |> slice_max(end_date, by = "sboe_id")

  committees <-
    cover |>
    semi_join(latest_report_by_committee, by = "report_id") |>
    collect() |>
    select(sboe_id:committee_type, fund_type, fund_name)

  # The write fails if the target directory does not exist yet.
  fs::dir_create(fs::path_dir(out_path))
  arrow::write_parquet(committees, out_path)
}
| # Replace the target list below with your own: | # Replace the target list below with your own: | ||||
| list( | list( | ||||
| tar_target(path_report_list, "../data-raw/report_list.csv", format = "file"), | |||||
| tar_target(report_list, process_report_list(path_report_list)), | |||||
| tar_target(path_report_list_csv, "../data-raw/report_list.csv", format = "file"), | |||||
| tar_target(path_report_list_raw, prepare_report_list(path_report_list_csv)), | |||||
| tar_target(report_list_raw, arrow::read_parquet(path_report_list_raw)), | |||||
| tar_target( | tar_target( | ||||
| dirs_all, | dirs_all, | ||||
| fs::dir_ls("../data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory") | fs::dir_ls("../data-raw/reports", glob = "**/expenditures", recurse = TRUE, type = "directory") | ||||
| ), | ), | ||||
| tar_target( | |||||
| paths_all_parquet, | |||||
| write_prepared_report_export(dirs_all, report_list_raw), | |||||
| pattern = map(dirs_all), | |||||
| format = "file" | |||||
| ), | |||||
| tar_target( | tar_target( | ||||
| path_receipts_parquet, | path_receipts_parquet, | ||||
| write_receipts_parquet(dirs_receipts, report_list), | |||||
| write_prepared_receipts_parquet(dirs_receipts, report_list_raw), | |||||
| pattern = map(dirs_receipts), | pattern = map(dirs_receipts), | ||||
| format = "file" | format = "file" | ||||
| ), | ), | ||||
| tar_target( | tar_target( | ||||
| path_expenditures_parquet, | path_expenditures_parquet, | ||||
| write_expenditures_parquet(dirs_expenditures, report_list), | |||||
| write_prepared_expenditures_parquet(dirs_expenditures, report_list_raw), | |||||
| pattern = map(dirs_expenditures), | pattern = map(dirs_expenditures), | ||||
| format = "file" | format = "file" | ||||
| ), | ), | ||||
| tar_target( | tar_target( | ||||
| paths_all_parquet, | |||||
| write_processed_report_export(dirs_all, report_list), | |||||
| pattern = map(dirs_all), | |||||
| format = "file" | |||||
| cover_raw, | |||||
| { | |||||
| paths_all_parquet # depends on prepared report_exports | |||||
| arrow::open_dataset("../data/cover", partitioning = "sboe_id") |> | |||||
| dplyr::collect() | |||||
| } | |||||
| ), | |||||
| tar_target( | |||||
| report_dates, | |||||
| process_report_dates(report_list_raw, cover_raw) | |||||
| ), | |||||
| tar_target( | |||||
| report_amended_score, | |||||
| calc_report_amended_score(report_dates) | |||||
| ), | |||||
| tar_target( | |||||
| report_list, | |||||
| process_report_list(report_list_raw, report_amended_score) | |||||
| ) | ) | ||||
| ) | ) |
| 'usage: | 'usage: | ||||
| run.R all | run.R all | ||||
| run.R target <targets>... | |||||
| run.R target <targets>... [--shortcut] | |||||
| run.R -h | --help | run.R -h | --help | ||||
| options: | options: | ||||
| targets::tar_make() | targets::tar_make() | ||||
| } else { | } else { | ||||
| cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}") | cli::cli_alert_info("Running targets: {.and {.field {opts$targets}}}") | ||||
| targets::tar_make(targets::any_of(!!opts$targets)) | |||||
| targets::tar_make(targets::any_of(!!opts$targets), shortcut = opts$shortcut) | |||||
| } | } | ||||
| # targets::tar_make_clustermq(workers = 2) # nolint | # targets::tar_make_clustermq(workers = 2) # nolint |
| #+ basic-validation | #+ basic-validation | ||||
| cf$report_list |> count() | cf$report_list |> count() | ||||
| cf$cover |> count() | |||||
| ## When de-duplicated there should be no more reports in "cover" than in "report_list" | |||||
| cf$report_list |> distinct(report_id, sboe_id) |> count() | cf$report_list |> distinct(report_id, sboe_id) |> count() | ||||
| ## Cover should have the same number of reports as report_list | |||||
| cf$cover |> count() | |||||
| cf$cover |> distinct(report_id, sboe_id) |> count() | cf$cover |> distinct(report_id, sboe_id) |> count() | ||||
| # duplicated sboe/report ids | # duplicated sboe/report ids | ||||
| by = c("sboe_id", "report_id") | by = c("sboe_id", "report_id") | ||||
| ) | ) | ||||
| # are all of the duplicated rows full duplicates? yes, both are the same. | |||||
| # FIXED: are all of the duplicated rows full duplicates? yes, both are the same. | |||||
| # => I'm going to go back and call `distinct()` when adding the file | # => I'm going to go back and call `distinct()` when adding the file | ||||
| duplicated |> distinct(sboe_id, report_id) |> count() | duplicated |> distinct(sboe_id, report_id) |> count() | ||||
| duplicated |> distinct() |> count() | duplicated |> distinct() |> count() | ||||
| # => I'm going to leave this alone | # => I'm going to leave this alone | ||||
| cf$report_list |> filter(is.na(received)) | cf$report_list |> filter(is.na(received)) | ||||
| # How many reports are missing from cover? | |||||
| # FIXED: How many reports are missing from cover? | |||||
| cf$report_list |> | cf$report_list |> | ||||
| anti_join(cf$cover, by = c("report_id", "sboe_id")) |> | anti_join(cf$cover, by = c("report_id", "sboe_id")) |> | ||||
| collect() |> | collect() |> | ||||
| #> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records | #> * `../../data-raw/reports/STA-O079OC-C-001/all/200919_2021-07-17.txt`: no records | ||||
| #> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records | #> * `../../data-raw/reports/STA-XD82JF-C-001/all/210255_2022-10-24.txt`: no records | ||||
| # Identify reports that are missing a cover entry... | |||||
| # FIXED: Identify reports that are missing a cover entry... | |||||
| no_records <- | no_records <- | ||||
| cf$report_list |> | cf$report_list |> | ||||
| anti_join(distinct(cf$receipts, sboe_id, report_id)) |> | anti_join(distinct(cf$receipts, sboe_id, report_id)) |> | ||||
| # Some committees have "No Id" as their SBOE ID | # Some committees have "No Id" as their SBOE ID | ||||
| # => I'll make these "NOID-{report_id}" | # => I'll make these "NOID-{report_id}" | ||||
| unique_committees <- cf$report_list |> distinct(sboe_id, committee_name) | |||||
| unique_committees_report <- cf$report_list |> distinct(sboe_id, committee_name) | |||||
| unique_committees |> | |||||
| unique_committees_report |> | |||||
| semi_join( | semi_join( | ||||
| unique_committees |> count(sboe_id) |> filter(n > 1), | |||||
| unique_committees_report |> count(sboe_id) |> filter(n > 1), | |||||
| by = "sboe_id" | by = "sboe_id" | ||||
| ) | ) | ||||
| cf$cover |> distinct(sboe_id, committee_name) |> count() | |||||
| cf$cover |> distinct(sboe_id) |> count() | |||||
| # Bad report dates | |||||
| cover <- | |||||
| cf$cover |> | |||||
| select(1:3, report_type, contains("date")) |> | |||||
| collect() | |||||
| cover |> | |||||
| filter( | |||||
| date_filed > today() | date_to > today() | date_from > today() | |||||
| ) | |||||
| cover |> filter(date_filed < date_from) | |||||
| cover |> filter(sboe_id == "FED-361L24-C-001") | |||||
| cover |> | |||||
| filter( | |||||
| !(date_filed > today() | date_to > today() | date_from > today()) | |||||
| ) |> | |||||
| ggplot() + | |||||
| aes(x = date_to, y = date_filed) + | |||||
| geom_point() | |||||
| # Committee names change over time? | |||||
| unique_committees <- cf$cover |> distinct(sboe_id, committee_name) | |||||
| cf$cover |> | |||||
| semi_join( | |||||
| unique_committees |> count(sboe_id) |> filter(n > 1), | |||||
| by = "sboe_id" | |||||
| ) |> | |||||
| arrange(sboe_id, date_filed) |> | |||||
| select(1, date_filed, 2:zip_code) | |||||
| # Committee registration ---- | |||||
| cf$cover |> | |||||
| slice_max(date_filed, by = sboe_id) |> | |||||
| select(sboe_id:committee_type, last_date_filed = date_filed) |> | |||||
| left_join( | |||||
| cf$cover |> slice_min(date_filed, by = sboe_id) |> select(sboe_id, first_date_filed = date_filed), | |||||
| by = "sboe_id" | |||||
| ) |> | |||||
| arrange(desc(last_date_filed), first_date_filed) | |||||
| # simple committee registration (best effort, not perfect) FIXME | |||||
| committees <- | |||||
| report_list |> | |||||
| slice_max(end_date, by = sboe_id, n = 1) |> | |||||
| slice_max(report_id, by = sboe_id, n = 1) |> | |||||
| distinct(sboe_id, committee_name) | |||||
| # Dealing with appended reports ------------------------------------------- | |||||
| ## Which reports have been appended? | |||||
| reports_summary <- | |||||
| report_list |> | |||||
| summarize( | |||||
| n_total = n(), | |||||
| n_amended = sum(amended), | |||||
| .by = c(sboe_id, year, doc_name) | |||||
| ) |> | |||||
| arrange(desc(n_total)) |> | |||||
| left_join(committees, by = "sboe_id") | |||||
| # Dan Forest, 2016-Q2 Report, 6 reports, 5 amendments | |||||
| # [x] Can be resolved by looking at last received date! | |||||
| report_list |> | |||||
| filter( | |||||
| sboe_id == "STA-M4HR0Y-C-001", | |||||
| year == 2016, | |||||
| doc_name == "Second Quarter" | |||||
| ) |> | |||||
| slice_max(received) | |||||
| report_list |> | |||||
| group_by(sboe_id, year, doc_name) |> | |||||
| slice_max(received) |> | |||||
| count(sort = TRUE) |> | |||||
| filter(n > 1) | |||||
| # report_id is basically sequential | |||||
| report_list |> | |||||
| filter(!is.na(received_data)) |> | |||||
| filter(received < today()) |> | |||||
| filter(received > ymd("2016-01-01")) |> | |||||
| ggplot() + | |||||
| aes(received, report_id) + | |||||
| geom_point() | |||||
| report_list |> | |||||
| filter(report_id == min(report_id)) | |||||
| # Image vs. data receipt dates, keeping rows where both fall in 2016..today | |||||
| report_list |> | |||||
| filter( | |||||
| between(received_image, ymd("2016-01-01"), today()) & | |||||
| between(received_data, ymd("2016-01-01"), today()) | |||||
| ) |> | |||||
| ggplot(aes(received_image, received_data)) + | |||||
| geom_point() | |||||
| # Same rows in long form: one panel per received_* column, against end_date | |||||
| report_list |> | |||||
| filter( | |||||
| between(received_image, ymd("2016-01-01"), today()) & | |||||
| between(received_data, ymd("2016-01-01"), today()) | |||||
| ) |> | |||||
| pivot_longer( | |||||
| contains("received_"), | |||||
| names_to = "received_item", | |||||
| values_to = "received_when" | |||||
| ) |> | |||||
| mutate(received_item = str_remove(received_item, "received_")) |> | |||||
| ggplot(aes(end_date, received_when)) + | |||||
| geom_point() + | |||||
| facet_wrap(vars(received_item)) | |||||
| # Reports whose period ends after today | |||||
| filter(report_list, end_date > today()) | |||||
| # Factor levels for doc_name: the four quarters, then the two semi-annuals | |||||
| levels_doc_name <- c( | |||||
| str_to_title(paste(c("first", "second", "third", "fourth"), "quarter")), | |||||
| str_to_title(paste(c("mid year", "year end"), "Semi-Annual")) | |||||
| ) | |||||
| # wait, what are the correct reporting dates? | |||||
| # Per year and report: the most frequently used period, latest end_date on ties | |||||
| report_list |> | |||||
| mutate(doc_name = factor(doc_name, levels = levels_doc_name)) |> | |||||
| group_by(year, doc_name) |> | |||||
| count(start_date, end_date, sort = TRUE) |> | |||||
| slice_max(n, n = 1) |> | |||||
| slice_max(end_date, n = 1) |> | |||||
| print(n = 48) | |||||
| # https://www.ncleg.gov/EnactedLegislation/Statutes/HTML/BySection/Chapter_163/GS_163-278.9.html | |||||
| # Timeline of the periods reports actually cover, limited to periods that | |||||
| # start and end between Jan 1 of the report year and Jan 1 of the next year. | |||||
| # Bar opacity encodes how many reports used each period. | |||||
| report_list |> | |||||
| mutate(doc_name = factor(doc_name, levels = levels_doc_name)) |> | |||||
| filter( | |||||
| between(start_date, make_date(year), make_date(year + 1)) & | |||||
| between(end_date, make_date(year), make_date(year + 1)) | |||||
| ) |> | |||||
| count(year, doc_name, start_date, end_date, sort = TRUE) |> | |||||
| arrange(year, doc_name) |> | |||||
| mutate(quarterly = ifelse(grepl("Quarter", doc_name), "Quarterly", "Semi-Annual")) |> | |||||
| ggplot(aes(y = year, color = doc_name, group = doc_name)) + | |||||
| geom_linerange( | |||||
| aes(xmin = start_date, xmax = end_date, alpha = n), | |||||
| linewidth = 20 | |||||
| ) + | |||||
| facet_wrap(vars(quarterly)) + | |||||
| theme_minimal() | |||||
| # Reports whose start_date and end_date both fall outside the sboe window | |||||
| report_list |> | |||||
| left_join( | |||||
| report_dates, | |||||
| by = join_by(year, doc_name), | |||||
| relationship = "many-to-one" | |||||
| ) |> | |||||
| filter( | |||||
| !between(start_date, sboe_start_date, sboe_end_date), | |||||
| !between(end_date, sboe_start_date, sboe_end_date) | |||||
| ) |> | |||||
| View() |
| --- | |||||
| title: PROBLEMS | |||||
| author: Garrick Aden-Buie | |||||
| format: pdf | |||||
| execute: | |||||
| echo: true | |||||
| --- | |||||
| ## Setup | |||||
| ```{r} | |||||
| library(tidyverse) | |||||
| library(fs) | |||||
| pkgload::load_all(here::here("process")) | |||||
| ``` | |||||
| ```{r load-data} | |||||
| # `cf` provides the report_list, expenditures, receipts and cover tables used below | |||||
| cf <- cf_prep_db_create() | |||||
| ``` | |||||
| ```{r load-data-report_list} | |||||
| # Collect the report list locally and pass doc_name through as_report_factor() | |||||
| report_list <- cf$report_list |> | |||||
| collect() |> | |||||
| mutate(doc_name = as_report_factor(doc_name)) | |||||
| ``` | |||||
| ### Problem scoping | |||||
| These per-report totals help determine the size of each problem. | |||||
| ```{r} | |||||
| # Number and total amount of expenditures per report; reports in | |||||
| # report_list with no expenditure rows are added with zeros. | |||||
| expenditures_by_report <- cf$expenditures |> | |||||
| summarize( | |||||
| n_expenses = n(), | |||||
| total_expenses = sum(amount), | |||||
| .by = report_id | |||||
| ) |> | |||||
| collect() |> | |||||
| full_join(select(report_list, report_id), by = join_by(report_id)) |> | |||||
| replace_na(list(n_expenses = 0, total_expenses = 0)) | |||||
| # Same summary for receipts | |||||
| receipts_by_report <- cf$receipts |> | |||||
| summarize( | |||||
| n_receipts = n(), | |||||
| total_receipts = sum(amount), | |||||
| .by = report_id | |||||
| ) |> | |||||
| collect() |> | |||||
| full_join(select(report_list, report_id), by = join_by(report_id)) |> | |||||
| replace_na(list(n_receipts = 0, total_receipts = 0)) | |||||
| ``` | |||||
| ## Doc search problems | |||||
| ```{r} | |||||
| # Put the report type from the report list ("<year> <doc_name>") next to the | |||||
| # report type recorded on the cover. Join keys are spelled out so the join | |||||
| # does not depend on which columns happen to share a name and no | |||||
| # "Joining with ..." message ends up in the rendered document. | |||||
| report_cover_report_type <- | |||||
| report_list |> | |||||
| mutate(report_type_listed = paste(year, doc_name)) |> | |||||
| select(report_id, sboe_id, report_type_listed) |> | |||||
| left_join( | |||||
| cf$cover |> select(sboe_id, report_id, report_type_cover = report_type) |> collect(), | |||||
| by = c("report_id", "sboe_id") | |||||
| ) | |||||
| # How often do the two agree? (NA: no cover type was joined) | |||||
| report_cover_report_type |> count(report_type_listed == report_type_cover) | |||||
| # The disagreements | |||||
| report_cover_report_type |> filter(report_type_listed != report_type_cover) | |||||
| ``` | |||||
| ```{r} | |||||
| # Attach expense and receipt totals to each mismatching report, smallest | |||||
| # receipt totals first. Both summaries are keyed by report_id, so the key | |||||
| # is given explicitly instead of being guessed from shared column names. | |||||
| report_cover_report_type |> | |||||
| filter(report_type_listed != report_type_cover) |> | |||||
| left_join(expenditures_by_report, by = "report_id") |> | |||||
| left_join(receipts_by_report, by = "report_id") |> | |||||
| arrange(total_receipts) | |||||
| ``` | |||||
| In some of these cases, the cover is probably wrong: | |||||
| ```{r} | |||||
| # For each mismatch, fetch the period printed on the cover and look up the | |||||
| # scheduled report (if any) whose start and end dates match it exactly. | |||||
| report_cover_report_type |> | |||||
| filter(report_type_listed != report_type_cover) |> | |||||
| left_join( | |||||
| cf$cover |> select(report_id, date_from, date_to) |> collect(), | |||||
| by = "report_id" | |||||
| ) |> | |||||
| left_join( | |||||
| reporting_schedule() |> | |||||
| mutate(report_type_sched = paste(year, doc_name)) |> | |||||
| select(report_type_sched, sboe_start_date, sboe_end_date), | |||||
| by = c(date_from = "sboe_start_date", date_to = "sboe_end_date") | |||||
| ) | |||||
| ``` | |||||
| ## Dates | |||||
| ```{r} | |||||
| # Read the `report_dates` target from the targets store under process/ | |||||
| report_dates <- tar_read( | |||||
| report_dates, | |||||
| store = here::here("process", "_targets") | |||||
| ) | |||||
| ``` | |||||
| ```{r} | |||||
| # Each line lists the rows failing one date consistency check; the trailing | |||||
| # comment is the row count noted when this was written. | |||||
| filter(report_dates, sboe_start_date != cover_start_date) # 3,422 | |||||
| filter(report_dates, sboe_end_date != cover_end_date) # 590 | |||||
| filter(report_dates, received_image < cover_start_date) # 60 | |||||
| filter(report_dates, received_image < cover_end_date) # 222 | |||||
| filter(report_dates, received_data < cover_start_date) # 2 | |||||
| filter(report_dates, received_data < cover_end_date) # 45 | |||||
| filter(report_dates, cover_date_filed < cover_end_date) # 950 | |||||
| ``` | |||||
| ## Picking amended | |||||
| Picking the correct amended report is problematic because no date in the `report_list` can really be trusted. | |||||
| ### Interestingly problematic | |||||
| ```{r} | |||||
| # STA-C3235N-C-001 2017 Year End Semi-Annual | |||||
| # WAK-56BLZN-C-001 2020 Mid Year Semi-Annual CITIZENS FOR TOMMY MATTHEWS | |||||
| # STA-Z6M8TR-C-001 2017 Year End Semi-Annual FIREFIGHTERS FOR RESPON | |||||
| ``` |