|
- # Created by use_targets().
- # Follow the comments below to fill in this target script.
- # Then follow the manual to check and run the pipeline:
- # https://books.ropensci.org/targets/walkthrough.html#inspect-the-pipeline
-
- # Load packages required to define the pipeline:
- library(targets)
-
- # Set target options:
- tar_option_set(
-   # Attach every package listed under `Depends` in DESCRIPTION.
-   # desc::desc_get_deps() parses the field properly, so line-wrapped entries
-   # and version constraints such as "pkg (>= 1.0)" still yield bare package
-   # names. The "R" entry is dropped because it is not a loadable package.
-   packages = local({
-     deps <- desc::desc_get_deps()
-     setdiff(deps$package[deps$type == "Depends"], "R")
-   }),
-   # For distributed computing in tar_make(), supply a {crew} controller
-   # as discussed at https://books.ropensci.org/targets/crew.html.
-   controller = crew::crew_controller_local(workers = 24),
-   # debug = "path_out_addresses",
-   # cue = tar_cue(mode = "never"),
-   # With error = "null" a target that errors returns NULL and the pipeline
-   # keeps going (see ?tar_option_set), so failures do not halt tar_make().
-   error = "null"
- )
-
- # Source the project's custom functions from the R/ directory
- # (spelled out here; "R" is also tar_source()'s default):
- tar_source(files = "R")
-
- # Pipeline definition: the list of targets that make up the workflow.
- list(
- # Track the report-list CSV as a file so edits to it invalidate downstream
- # targets.
- tar_target(
-   name = path_report_list_csv,
-   command = "../data-raw/report_list.csv",
-   format = "file"
- ),
- # NOTE(review): this target is not declared with format = "file", so the
- # value returned by prepare_report_list() is stored as-is; the next target
- # reads it as a parquet path.
- tar_target(
-   name = path_report_list_raw,
-   command = prepare_report_list(path_report_list_csv)
- ),
- tar_target(
-   name = report_list_raw,
-   command = arrow::read_parquet(path_report_list_raw)
- ),
-
- # Report directories. Each of the three groups below uses the same trick,
- # taken from Will's answer in https://stackoverflow.com/a/70293576, to let
- # targets branch over a file target:
- #   *_src    lists the matching directories and tracks them as files,
- #   *_names  holds the same paths as an ordinary value,
- #   dirs_*   maps over *_names, one file-tracked branch per directory.
- tar_target(
-   name = dirs_all_src,
-   command = fs::dir_ls(
-     "../data-raw/reports",
-     glob = "**/all",
-     recurse = TRUE,
-     type = "directory"
-   ),
-   format = "file"
- ),
- tar_target(name = dirs_all_names, command = dirs_all_src),
- tar_target(
-   name = dirs_all,
-   command = {
-     dirs_all_src
-     dirs_all_names
-   },
-   pattern = map(dirs_all_names),
-   format = "file"
- ),
-
- tar_target(
-   name = dirs_receipts_src,
-   command = fs::dir_ls(
-     "../data-raw/reports",
-     glob = "**/receipts",
-     recurse = TRUE,
-     type = "directory"
-   ),
-   format = "file"
- ),
- tar_target(name = dirs_receipts_names, command = dirs_receipts_src),
- tar_target(
-   name = dirs_receipts,
-   command = {
-     dirs_receipts_src
-     dirs_receipts_names
-   },
-   pattern = map(dirs_receipts_names),
-   format = "file"
- ),
-
- tar_target(
-   name = dirs_expenditures_src,
-   command = fs::dir_ls(
-     "../data-raw/reports",
-     glob = "**/expenditures",
-     recurse = TRUE,
-     type = "directory"
-   ),
-   format = "file"
- ),
- tar_target(name = dirs_expenditures_names, command = dirs_expenditures_src),
- tar_target(
-   name = dirs_expenditures,
-   command = {
-     dirs_expenditures_src
-     dirs_expenditures_names
-   },
-   pattern = map(dirs_expenditures_names),
-   format = "file"
- ),
-
- # Parquet conversion: one dynamic branch per report directory. Each writer
- # receives the directory plus the raw report list, and its result is tracked
- # as a file.
- tar_target(
-   name = paths_all_parquet,
-   command = write_prepared_report_export(dirs_all, report_list_raw),
-   pattern = map(dirs_all),
-   format = "file"
- ),
-
- tar_target(
-   name = path_receipts_parquet,
-   command = write_prepared_receipts_parquet(dirs_receipts, report_list_raw),
-   pattern = map(dirs_receipts),
-   format = "file"
- ),
-
- tar_target(
-   name = path_expenditures_parquet,
-   command = write_prepared_expenditures_parquet(
-     dirs_expenditures,
-     report_list_raw
-   ),
-   pattern = map(dirs_expenditures),
-   format = "file"
- ),
-
- # Dataset directories under ../data-prep, tracked as files. Mentioning
- # paths_all_parquet first makes each directory target depend on the
- # parquet-writing branches.
- # NOTE(review): all four depend only on paths_all_parquet; the receipts and
- # expenditures directories do not reference path_receipts_parquet or
- # path_expenditures_parquet. Confirm that is intended.
- tar_target(
-   name = path_data_prep_cover,
-   command = {
-     paths_all_parquet
-     "../data-prep/cover"
-   },
-   format = "file"
- ),
- tar_target(
-   name = path_data_prep_officers,
-   command = {
-     paths_all_parquet
-     "../data-prep/officers"
-   },
-   format = "file"
- ),
- tar_target(
-   name = path_data_prep_receipts,
-   command = {
-     paths_all_parquet
-     "../data-prep/receipts"
-   },
-   format = "file"
- ),
- tar_target(
-   name = path_data_prep_expenditures,
-   command = {
-     paths_all_parquet
-     "../data-prep/expenditures"
-   },
-   format = "file"
- ),
-
-
- # Load the whole cover dataset (partitioned by sboe_id) into memory.
- tar_target(
-   name = cover_raw,
-   command = arrow::open_dataset(path_data_prep_cover, partitioning = "sboe_id") |>
-     dplyr::collect()
- ),
-
- # Report dates, built from the raw report list and the cover data.
- tar_target(
-   name = report_dates,
-   command = process_report_dates(report_list_raw, cover_raw)
- ),
- # Write report_dates to a parquet file and track that file.
- # arrow::write_parquet() returns its input invisibly, so the path is returned
- # explicitly and the target is declared with format = "file", matching the
- # other path_* targets that write files.
- tar_target(
-   name = path_report_dates,
-   command = {
-     out_path <- "../data-prep/report_dates/part-0.parquet"
-     # Make sure the destination directory exists before writing.
-     fs::dir_create(fs::path_dir(out_path))
-     arrow::write_parquet(report_dates, out_path)
-     out_path
-   },
-   format = "file"
- ),
-
- # Amendment score per report, derived from the report dates.
- tar_target(
-   name = report_amended_score,
-   command = calc_report_amended_score(report_dates)
- ),
-
- # Collect raw addresses from the officers, receipts, expenditures and
- # candidate-listing data; stored by targets in parquet format.
- # The candidate-listing file comes from the path_candidate_listing_raw
- # target; no target named path_candidate_listing exists in this pipeline.
- tar_target(
-   name = addresses_raw,
-   command = prep_collect_addresses_raw(
-     path_officers = path_data_prep_officers,
-     path_receipts = path_data_prep_receipts,
-     path_expenditures = path_data_prep_expenditures,
-     path_candidate_listing = path_candidate_listing_raw,
-     path_voters = NULL # path_voters_parquet
-   ),
-   format = "parquet"
- ),
-
- # Build the address lookup database from the collected address column;
- # the result is tracked as a file.
- tar_target(
-   name = path_addresses_db,
-   command = prepare_addresses_lookup_db(addresses_raw$address),
-   format = "file"
- ),
-
- # This report list uses the latest amended report -----
- tar_target(
-   name = report_list,
-   command = process_report_list(report_list_raw, report_amended_score)
- ),
-
- # Committees come from the cover data combined with the processed report list.
- tar_target(
-   name = committees,
-   command = prepare_committees(cover_raw, report_list)
- ),
-
- # Donations come from the receipts dataset combined with the report list.
- tar_target(
-   name = donations,
-   command = prepare_donations(path_data_prep_receipts, report_list)
- ),
-
- # Outside data sources -----
- # Candidate listing for the years 2016 through 2023.
- tar_target(
-   name = candidate_listing_raw,
-   command = get_candidate_listing(2016:2023)
- ),
- # NOTE(review): `write_parquet` is called unqualified. A format = "file"
- # target must return a path; if this resolves to arrow::write_parquet(), it
- # returns its input instead. Confirm it is a project helper returning a path.
- tar_target(
-   name = path_candidate_listing_raw,
-   command = write_parquet(
-     candidate_listing_raw,
-     "../data-prep/candidate_listing/part-0.parquet"
-   ),
-   format = "file"
- ),
-
- ## Voter registration records
- # Both targets use cue mode "never", so they are not rerun automatically.
- # Invalidate path_voters_txt to get the latest file.
- tar_target(
-   name = path_voters_txt,
-   command = voter_statewide_download(),
-   cue = tar_cue(mode = "never") #<< invalidate to get latest
- ),
- tar_target(
-   name = path_voters_parquet,
-   command = voter_statewide_convert_parquet(path_voters_txt),
-   cue = tar_cue(mode = "never"),
-   format = "file"
- ),
-
-
- # Donors ------------------------------------------------------------------
- # tar_target(
- # donors_latest,
- # prepare_donors_latest(
- # path_data_prep_receipts,
- # path_out_report_list,
- # path_addresses = "data-out/addresses"
- # )
- # ),
- # tar_target(donors_for_matching, prepare_donors_for_matching(donors_latest)),
- # tar_target(donors_for_matching_sample, sample_frac(donors_for_matching, 0.1)),
- # tar_target(
- # donors_for_matching_sample_nc,
- # donors_for_matching |> filter(grepl("^27", postal_code)) |> sample_n(10000)
- # ),
- #
- # tar_target(
- # donors_to_match,
- # {
- # # For testing, use a small sample of donors
- # # slice_sample(donors_for_matching, n = 5000)
- # donors_for_matching
- # }
- # ),
- #
- # # Build the EM matching model from a moderately sized sample
- # tar_target(donors_em_model_sample, prepare_donors_matching_sample(donors_to_match)),
- # tar_target(donors_em_model, fastlink_donors(donors_em_model_sample, estimate.only = TRUE)),
- #
- # # Then create blocks of data to match against
- # tar_target(donor_blocks_zip_pre, prepare_donor_zip_blocks(donors_to_match)),
- # tar_target(donor_blocks_city, prepare_donor_city_blocks(donors_to_match)),
- # tar_target(donor_blocks_name, prepare_donor_name_blocks(donors_to_match)),
- #
- # # Then apply the pre-trained EM model to each block
- # tar_target(
- # donor_linked_zip_pre,
- # fastlink_donor_blocks(donor_blocks_zip_pre, em.obj = donors_em_model)
- # ),
- # tar_target(
- # donor_linked_city,
- # fastlink_donor_blocks(donor_blocks_city, em.obj = donors_em_model)
- # ),
- # tar_target(
- # donor_linked_name,
- # fastlink_donor_blocks(donor_blocks_name, em.obj = donors_em_model)
- # ),
-
-
- # Candidates --------------------------------------------------------------
- # Inputs for record linkage: candidates from the officers dataset and the
- # written report list on one side, the prepared candidate listing on the other.
- tar_target(
-   name = candidates_for_matching,
-   command = prepare_candidates_for_matching(
-     path_data_prep_officers,
-     path_out_report_list
-   )
- ),
- tar_target(
-   name = candidate_listing_for_matching,
-   command = prepare_candidate_listing_for_matching(candidate_listing)
- ),
- # tar_target(
- # candidates_linked,
- # fastlink_candidates(
- # candidates_for_matching,
- # candidate_listing_for_matching
- # )
- # ),
- # tar_target(
- # candidates_matched,
- # fastlink_match_candidates(
- # candidates_for_matching,
- # candidate_listing_for_matching,
- # candidates_linked
- # )
- # ),
-
- # Candidate Listing -------------------------------------------------------
- # First derive the dedupe result from the raw listing, then build the final
- # candidate listing from the raw listing plus that dedupe result.
- tar_target(
-   name = candidate_listing_dedupe,
-   command = prep_dedupe_candidates(candidate_listing_raw)
- ),
-
- tar_target(
-   name = candidate_listing,
-   command = prep_candidate_listing(
-     candidate_listing_raw,
-     candidate_listing_dedupe
-   )
- ),
-
-
- # Output ------------------------------------------------------------------
- # Written report list; several output targets below take this path as input.
- tar_target(
-   name = path_out_report_list,
-   command = out_report_list(report_list),
-   format = "file"
- ),
-
- # This needs to be run manually, otherwise it doesn't run in {targets}
- # and throws an error: `bad value`. The "never" cue keeps {targets} from
- # rerunning it automatically.
- # NOTE(review): "data-out/addresses.parquet" has no "../" prefix, unlike the
- # "../data-raw" and "../data-prep" paths used elsewhere. Confirm the intended
- # working directory.
- tar_target(
-   name = path_out_addresses,
-   command = out_addresses(path_addresses_db, "data-out/addresses.parquet"),
-   cue = tar_cue(mode = "never"),
-   format = "file"
- ),
-
- # Cover output is built from the cover dataset and the written report list.
- tar_target(
-   name = path_out_cover,
-   command = out_cover(path_data_prep_cover, path_out_report_list),
-   format = "file"
- ),
-
- # Committees output is built from the cover output file.
- tar_target(
-   name = path_out_committees,
-   command = out_committees(path_out_cover),
-   format = "file"
- ),
-
- # Expenses: the payee output is written first, then passed to the expenses
- # output alongside the expenditures dataset and the written report list.
- tar_target(
-   name = path_out_expenses_payee,
-   command = out_expenses_payee(
-     path_data_prep_expenditures,
-     path_out_report_list
-   ),
-   format = "file"
- ),
-
- tar_target(
-   name = path_out_expenses,
-   command = out_expenses(
-     path_data_prep_expenditures,
-     path_out_expenses_payee,
-     path_out_report_list
-   ),
-   format = "file"
- ),
-
- # Receipts: the payer output is written first, then passed to the receipts
- # output alongside the receipts dataset and the written report list.
- tar_target(
-   name = path_out_receipts_payer,
-   command = out_receipts_payer(
-     path_data_prep_receipts,
-     path_out_report_list
-   ),
-   format = "file"
- ),
-
- tar_target(
-   name = path_out_receipts,
-   command = out_receipts(
-     path_data_prep_receipts,
-     path_out_receipts_payer,
-     path_out_report_list
-   ),
-   format = "file"
- ),
-
- # Candidate-listing tables: each element of `candidate_listing` is passed to
- # out_write_parquet() with an output name, and the result is tracked as a
- # file.
- tar_target(
-   name = path_out_cl_elections,
-   command = out_write_parquet(candidate_listing$cl_elections, "cl_elections"),
-   format = "file"
- ),
-
- tar_target(
-   name = path_out_cl_candidates,
-   command = out_write_parquet(candidate_listing$cl_candidates, "cl_candidates"),
-   format = "file"
- ),
-
- # Note: the target is named "alias" but writes the cl_name_on_ballot table.
- tar_target(
-   name = path_out_cl_alias,
-   command = out_write_parquet(
-     candidate_listing$cl_name_on_ballot,
-     "cl_name_on_ballot"
-   ),
-   format = "file"
- ),
-
- tar_target(
-   name = path_out_cl_contact,
-   command = out_write_parquet(candidate_listing$cl_contact, "cl_contact"),
-   format = "file"
- ),
-
- tar_target(
-   name = path_out_cl_party,
-   command = out_write_parquet(candidate_listing$cl_party, "cl_party"),
-   format = "file"
- )
-
- )
|