# Column specifications -------------------------------------------------------

# Column specification for the CSV parsed by get_report_by_year_export().
# Every column is read as character except the four date columns, which are
# parsed with the "%m/%d/%Y" format.
spec_export_search_results <- function() {
  readr::cols(
    `Committee Name` = readr::col_character(),
    `SBoE ID` = readr::col_character(),
    Year = readr::col_character(),
    `Doc Type` = readr::col_character(),
    `Doc Name` = readr::col_character(),
    Amend = readr::col_character(),
    `Received Image` = readr::col_date(format = "%m/%d/%Y"),
    `Received Data` = readr::col_date(format = "%m/%d/%Y"),
    `Start Date` = readr::col_date(format = "%m/%d/%Y"),
    `End Date` = readr::col_date(format = "%m/%d/%Y"),
    Image = readr::col_character(),
    Data = readr::col_character()
  )
}

# Column specification applied by get_report_by_year_scrape() when converting
# the scraped JSON records. Date columns use the "%m/%d/%Y" format.
spec_report_by_year_scrape <- function() {
  readr::cols(
    CommitteeName = readr::col_character(),
    SBoEID = readr::col_character(),
    DocumentType = readr::col_character(),
    ReportYear = readr::col_character(),
    ReportType = readr::col_character(),
    IsAmendment = readr::col_character(),
    ImageReceiptDate = readr::col_date(format = "%m/%d/%Y"),
    DataImportDate = readr::col_date(format = "%m/%d/%Y"),
    PeriodStartDate = readr::col_date(format = "%m/%d/%Y"),
    PeriodEndDate = readr::col_date(format = "%m/%d/%Y"),
    ImageType = readr::col_character(),
    DataType = readr::col_character(),
    DataLink = readr::col_character(),
    ImageLink = readr::col_character()
  )
}

# Reports by year -------------------------------------------------------------

# Download the report listing for `year` / `report` as CSV and return it as a
# grouped tibble.
#
# Column names are converted to snake_case ("SBoE ID" is renamed to "sboe_id"
# first so it is not split by the case conversion), `amend` becomes a logical
# ("Y" -> TRUE), and the result is grouped by `year` and `doc_name` and passed
# through targets::tar_group().
#
# `year` and `report` are forwarded unchanged to req_report_by_year_export(),
# which is defined outside this file section.
get_report_by_year_export <- function(year, report) {
  body <- req_report_by_year_export(year, report) |>
    req_perform() |>
    resp_body_string()

  # I() marks the body as literal CSV text. A bare string without a newline
  # would otherwise be interpreted by readr as a file path.
  res <- readr::read_csv(I(body), col_types = spec_export_search_results())

  names(res)[names(res) == "SBoE ID"] <- "sboe_id"
  names(res) <- snakecase::to_snake_case(names(res))

  res |>
    mutate(amend = amend == "Y") |>
    relocate(year, doc_name) |>
    group_by(year, doc_name) |>
    targets::tar_group()
}

# Pull the JSON array assigned to `var data` out of a response body.
#
# The body is split on "\r\n"; the single line starting with
# `var data = [` (after optional whitespace) is returned with the
# `var data = ` prefix removed. Errors if there is not exactly one such line.
extract_scrape_data_json <- function(body) {
  lines <- strsplit(body, "\r\n")[[1]]
  data_lines <- lines[grepl("^\\s*var data = \\[", lines)]

  if (length(data_lines) != 1) {
    stop(
      "Expected exactly one `var data = [...]` line in the response, found ",
      length(data_lines),
      ".",
      call. = FALSE
    )
  }

  sub("\\s*var data = ", "", data_lines)
}

# Scrape the report listing for `year` / `report` from the `var data = [...]`
# line embedded in the response body.
#
# Returns NULL when the embedded array is empty. Otherwise returns a tibble
# with renamed columns (see the select() call), `amended` converted to a
# logical ("Y" -> TRUE), grouped by `year` and `doc_name` and passed through
# targets::tar_group().
get_report_by_year_scrape <- function(year, report) {
  body <- req_report_by_year(year, report) |>
    req_perform() |>
    resp_body_string()

  tbl <- extract_scrape_data_json(body) |>
    jsonlite::fromJSON() |>
    as_tibble()

  if (nrow(tbl) == 0) {
    return(NULL)
  }

  tbl |>
    readr::type_convert(col_types = spec_report_by_year_scrape()) |>
    select(
      year = ReportYear,
      doc_name = ReportType,
      sboe_id = SBoEID,
      committee_name = CommitteeName,
      report_id = DataLink,
      doc_type = DocumentType,
      amended = IsAmendment,
      received_image = ImageReceiptDate,
      received_data = DataImportDate,
      start_date = PeriodStartDate,
      end_date = PeriodEndDate,
      img_link = ImageLink
    ) |>
    mutate(amended = amended == "Y") |>
    group_by(year, doc_name) |>
    targets::tar_group()
}

# Report sections -------------------------------------------------------------

# Column specification for the "receipts" section CSV read by
# get_report_section().
spec_report_section_receipts <- function() {
  readr::cols(
    Date = readr::col_date(format = "%m/%d/%Y"),
    `Is Prior` = readr::col_character(),
    Name = readr::col_character(),
    `Street 1` = readr::col_character(),
    `Street 2` = readr::col_character(),
    City = readr::col_character(),
    State = readr::col_character(),
    `Full Zip` = readr::col_character(),
    `Country Name` = readr::col_character(),
    `Outside US Postal Code` = readr::col_character(),
    Profession = readr::col_character(),
    `Employers Name` = readr::col_character(),
    Purpose = readr::col_character(),
    `Receipt Type Desc` = readr::col_character(),
    `Account Abbr` = readr::col_character(),
    `Form Of Payment Desc` = readr::col_character(),
    Description = readr::col_character(),
    Amount = readr::col_double(),
    `Sum To Date` = readr::col_double()
  )
}

# Column specification for the "expenses" / "expenditures" section CSV read by
# get_report_section().
spec_report_section_expenses <- function() {
  readr::cols(
    Date = readr::col_date(format = "%m/%d/%Y"),
    Name = readr::col_character(),
    `Street 1` = readr::col_character(),
    `Street 2` = readr::col_character(),
    City = readr::col_character(),
    State = readr::col_character(),
    `Full Zip` = readr::col_character(),
    `Country Name` = readr::col_character(),
    `Outside US Postal Code` = readr::col_character(),
    Profession = readr::col_character(),
    `Employer Name` = readr::col_character(),
    `Purpose Type Code` = readr::col_character(),
    Purpose = readr::col_character(),
    Candidate = readr::col_character(),
    `Office Sought` = readr::col_character(),
    Declaration = readr::col_character(),
    Amount = readr::col_double(),
    `Expenditure Type Desc` = readr::col_character(),
    `Account Abbr` = readr::col_character(),
    `Form Of Payment Desc` = readr::col_character(),
    Description = readr::col_character(),
    Amount1 = readr::col_double(),
    `Sum To Date` = readr::col_double()
  )
}

# Fetch one section of a report.
#
# Arguments:
#   report_id  Identifier forwarded to req_report_detail().
#   section    Section name. "receipts", "expenses" and "expenditures" are
#              parsed as CSV (skipping the first line); any other value
#              returns the raw response body as a string.
#   sboe_id    Optional. When supplied, added as a column after `report_id`.
#
# Returns NULL when the response body is empty, a string for unparsed
# sections, otherwise a tibble with snake_case column names and `report_id`
# as the first column.
#
# delay() is defined elsewhere; presumably it throttles requests (confirm).
get_report_section <- function(
  report_id,
  section = "receipts",
  sboe_id = NULL
) {
  delay()

  resp <- req_report_detail(report_id, section) |>
    req_perform()

  if (identical(resp$body, raw(0))) {
    return(NULL)
  }

  body <- resp_body_string(resp)

  spec <- switch(
    section,
    receipts = list(skip = 1, col_types = spec_report_section_receipts()),
    expenses = ,
    expenditures = list(skip = 1, col_types = spec_report_section_expenses()),
    NULL
  )

  if (is.null(spec)) {
    return(body)
  }

  # I() marks the body as literal CSV text rather than a file path.
  res <- readr::read_csv(
    I(body),
    col_types = spec$col_types,
    skip = spec$skip
  )
  names(res) <- snakecase::to_snake_case(names(res))

  res <- mutate(res, report_id = !!report_id, .before = 1)
  if (!is.null(sboe_id)) {
    res <- mutate(res, sboe_id = !!sboe_id, .after = report_id)
  }

  res
}

# Raw report downloads --------------------------------------------------------

# Download the "all" section of a report as a single string.
#
# Errors unless the ALLOW_DOWNLOADS environment variable is exactly "true".
# Returns "" when the response body is empty.
get_raw_report_all <- function(report_id) {
  # Check the guard before delay() so a disallowed download fails immediately.
  if (!identical(Sys.getenv("ALLOW_DOWNLOADS"), "true")) {
    stop("Shouldn't be downloading reports now...")
  }

  delay()

  resp <- req_report_detail(report_id, "all") |>
    req_perform()

  if (identical(resp$body, raw(0))) {
    return("")
  }

  resp_body_string(resp)
}

# Shared cache-or-download logic for the save_*() functions.
#
# Builds `<dir>/<file_name>.<ext>`. If that file already exists its path is
# returned without calling `fetch`. Otherwise `fetch()` is called, `dir` is
# created, `writer(result, path)` is called, and the path is returned
# invisibly.
save_report_file <- function(dir, file_name, ext, fetch, writer) {
  file_path <- path(dir, file_name, ext = ext)

  if (file_exists(file_path)) {
    # We don't need to re-download any reports
    return(file_path)
  }

  res <- fetch()
  dir_create(dir)
  writer(res, file_path)

  invisible(file_path)
}

# Save the raw "all" section of a report to
# data-raw/reports/<sboe_id>/all/<report_id>_<received>.txt (relative to the
# here::here() root), unless that file already exists. Returns the file path.
save_raw_report_all <- function(report_id, sboe_id, received) {
  save_report_file(
    dir = here::here("data-raw", "reports", sboe_id, "all"),
    file_name = paste0(report_id, "_", received),
    ext = "txt",
    fetch = function() get_raw_report_all(report_id),
    writer = function(res, file_path) brio::write_lines(res, file_path)
  )
}

# Paged JSON endpoints --------------------------------------------------------

# Collect every page from a paged JSON endpoint and bind the records into a
# data frame.
#
# Arguments:
#   fetch_page  Function of one argument (the zero-based page number) that
#               returns the parsed JSON response. The response is expected to
#               carry the records in `$Data$results` and the total record
#               count in `$Data$recordCountKey`.
#   prefix      Optional word inserted into the progress messages
#               (e.g. "expenses").
#
# Pages are requested until the number of collected records reaches the
# advertised total. Errors if the total is missing, or if a page comes back
# empty before the total is reached (which would otherwise loop forever).
collect_report_pages <- function(fetch_page, prefix = "") {
  page_label <- trimws(paste(prefix, "page"))
  frame_label <- trimws(paste(prefix, "data frame"))

  page <- 0
  cli::cli_progress_step("Getting {page_label} 0 of ??")
  res <- fetch_page(page)

  data <- res$Data$results
  total_rows <- res$Data$recordCountKey
  if (is.null(total_rows)) {
    stop("Response did not include a record count.", call. = FALSE)
  }

  # The size of the first page is used to estimate the total page count shown
  # in the progress messages.
  actual_page_size <- length(data)
  n_pages <- ceiling(total_rows / actual_page_size)

  while (length(data) < total_rows) {
    page <- page + 1
    n_previous <- length(res$Data$results)
    cli::cli_progress_step(
      "Getting {page_label} {page} of {n_pages} [{n_previous}]"
    )

    res <- fetch_page(page)
    if (length(res$Data$results) == 0) {
      stop(
        "Page ", page, " returned no records, but only ", length(data),
        " of ", total_rows, " records have been retrieved.",
        call. = FALSE
      )
    }

    data <- c(data, res$Data$results)
  }

  cli::cli_progress_step("Preparing {frame_label}")

  data |>
    # Zero-length fields become NULL so they are dropped when binding rows.
    map_depth(2, \(x) if (length(x) > 0) x) |>
    map_dfr(\(x) x)
}

# Fetch all receipts for a report from the paged JSON endpoint and return them
# as a data frame. `page_size` is forwarded to req_report_receipts().
get_report_receipts_json <- function(report_id, page_size = 500) {
  delay()

  fetch_page <- function(page) {
    report_id |>
      req_report_receipts(page = page, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  collect_report_pages(fetch_page)
}

# Save the receipts of a report to
# data-raw/reports/<sboe_id>/receipts/<report_id>_<received>_receipts.csv
# (relative to the here::here() root), unless that file already exists.
# Returns the file path.
save_report_receipts_csv <- function(report_id, sboe_id, received) {
  save_report_file(
    dir = here::here("data-raw", "reports", sboe_id, "receipts"),
    file_name = paste0(report_id, "_", received, "_", "receipts"),
    ext = "csv",
    fetch = function() get_report_receipts_json(report_id),
    writer = function(res, file_path) write_csv(res, file_path)
  )
}

# Fetch all expenditures for a report from the paged JSON endpoint and return
# them as a data frame. `page_size` is forwarded to req_report_expenditures().
get_report_expenditures_json <- function(report_id, page_size = 500) {
  delay()

  fetch_page <- function(page) {
    report_id |>
      req_report_expenditures(page = page, page_size = page_size) |>
      req_perform() |>
      resp_body_json(check_type = FALSE)
  }

  collect_report_pages(fetch_page, prefix = "expenses")
}

# Save the expenditures of a report to
# data-raw/reports/<sboe_id>/expenditures/<report_id>_<received>_expenditures.csv
# (relative to the here::here() root), unless that file already exists.
# Returns the file path.
save_report_expenditures_csv <- function(report_id, sboe_id, received) {
  save_report_file(
    dir = here::here("data-raw", "reports", sboe_id, "expenditures"),
    file_name = paste0(report_id, "_", received, "_", "expenditures"),
    ext = "csv",
    fetch = function() get_report_expenditures_json(report_id),
    writer = function(res, file_path) write_csv(res, file_path)
  )
}