# Created by use_targets().
# Follow the comments below to fill in this target script.
# Then follow the manual to check and run the pipeline:
#   https://books.ropensci.org/targets/walkthrough.html#inspect-the-pipeline

# Load packages required to define the pipeline:
library(targets)

# Set target options:
tar_option_set(
  # Every entry of the DESCRIPTION "Depends" field is passed verbatim as a
  # package name, so that field must list bare package names separated by
  # ", " (no version constraints, no `R (>= x.y)` entry).
  packages = strsplit(desc::desc_get_field("Depends"), ", ")[[1]],
  # For distributed computing in tar_make(), supply a {crew} controller
  # as discussed at https://books.ropensci.org/targets/crew.html.
  controller = crew::crew_controller_local(workers = 24),
  # debug = "path_out_addresses",
  # cue = tar_cue(mode = "never"),
  # With error = "null", {targets} records NULL for a target that errors and
  # keeps running the rest of the pipeline (see ?targets::tar_option_set).
  error = "null"
)

# Run the R scripts in the R/ folder with your custom functions:
tar_source()

# Replace the target list below with your own:
list(
  # Report list ---------------------------------------------------------------
  # The CSV is tracked as a file; the result of prepare_report_list() is read
  # back with arrow::read_parquet() in the next target.
  tar_target(
    path_report_list_csv,
    "../data-raw/report_list.csv",
    format = "file"
  ),
  tar_target(path_report_list_raw, prepare_report_list(path_report_list_csv)),
  tar_target(report_list_raw, arrow::read_parquet(path_report_list_raw)),

  # Raw report directories ----------------------------------------------------
  # Each group below follows the same three-step shape:
  #   *_src   : all matching directories, tracked together as one file target
  #   *_names : the same paths as a plain (non-file) character vector
  #   dirs_*  : one file-tracked branch per directory
  tar_target(
    dirs_all_src,
    fs::dir_ls(
      "../data-raw/reports",
      glob = "**/all",
      recurse = TRUE,
      type = "directory"
    ),
    format = "file"
  ),
  # This comes from Will's answer in https://stackoverflow.com/a/70293576
  # We're basically tricking targets into letting us branch over a file target
  tar_target(dirs_all_names, dirs_all_src),
  tar_target(
    dirs_all,
    {
      # Mentioning dirs_all_src only registers it as an upstream dependency;
      # the value of each branch is the mapped element of dirs_all_names.
      dirs_all_src
      dirs_all_names
    },
    pattern = map(dirs_all_names),
    format = "file"
  ),

  tar_target(
    dirs_receipts_src,
    fs::dir_ls(
      "../data-raw/reports",
      glob = "**/receipts",
      recurse = TRUE,
      type = "directory"
    ),
    format = "file"
  ),
  tar_target(dirs_receipts_names, dirs_receipts_src),
  tar_target(
    dirs_receipts,
    {
      dirs_receipts_src
      dirs_receipts_names
    },
    pattern = map(dirs_receipts_names),
    format = "file"
  ),

  tar_target(
    dirs_expenditures_src,
    fs::dir_ls(
      "../data-raw/reports",
      glob = "**/expenditures",
      recurse = TRUE,
      type = "directory"
    ),
    format = "file"
  ),
  tar_target(dirs_expenditures_names, dirs_expenditures_src),
  tar_target(
    dirs_expenditures,
    {
      dirs_expenditures_src
      dirs_expenditures_names
    },
    pattern = map(dirs_expenditures_names),
    format = "file"
  ),

  # Prepared parquet exports (one branch per raw directory) -------------------
  tar_target(
    paths_all_parquet,
    write_prepared_report_export(dirs_all, report_list_raw),
    pattern = map(dirs_all),
    format = "file"
  ),
  tar_target(
    path_receipts_parquet,
    write_prepared_receipts_parquet(dirs_receipts, report_list_raw),
    pattern = map(dirs_receipts),
    format = "file"
  ),
  tar_target(
    path_expenditures_parquet,
    write_prepared_expenditures_parquet(dirs_expenditures, report_list_raw),
    pattern = map(dirs_expenditures),
    format = "file"
  ),

  # Prepared dataset directories ----------------------------------------------
  # Each target returns a fixed directory path. `paths_all_parquet` is named
  # inside the braces only so these targets are rebuilt after the export step.
  # NOTE(review): the receipts and expenditures directories depend only on
  # paths_all_parquet, not on path_receipts_parquet / path_expenditures_parquet.
  # Confirm that is intended given where those writers put their output.
  tar_target(
    path_data_prep_cover,
    {
      paths_all_parquet
      "../data-prep/cover"
    },
    format = "file"
  ),
  tar_target(
    path_data_prep_officers,
    {
      paths_all_parquet
      "../data-prep/officers"
    },
    format = "file"
  ),
  tar_target(
    path_data_prep_receipts,
    {
      paths_all_parquet
      "../data-prep/receipts"
    },
    format = "file"
  ),
  tar_target(
    path_data_prep_expenditures,
    {
      paths_all_parquet
      "../data-prep/expenditures"
    },
    format = "file"
  ),

  # Cover data and report dates -----------------------------------------------
  tar_target(
    cover_raw,
    arrow::open_dataset(path_data_prep_cover, partitioning = "sboe_id") |>
      dplyr::collect()
  ),
  tar_target(
    report_dates,
    process_report_dates(report_list_raw, cover_raw)
  ),
  tar_target(
    path_report_dates,
    {
      out_path <- "../data-prep/report_dates/part-0.parquet"
      fs::dir_create(fs::path_dir(out_path))
      arrow::write_parquet(report_dates, out_path)
      # Return the path rather than write_parquet()'s value (the input data)
      # so that {targets} tracks the written file itself.
      out_path
    },
    format = "file"
  ),
  tar_target(
    report_amended_score,
    calc_report_amended_score(report_dates)
  ),

  # Addresses -----------------------------------------------------------------
  tar_target(
    addresses_raw,
    prep_collect_addresses_raw(
      path_officers = path_data_prep_officers,
      path_receipts = path_data_prep_receipts,
      path_expenditures = path_data_prep_expenditures,
      path_candidate_listing = path_candidate_listing_raw,
      path_voters = NULL # path_voters_parquet
    ),
    format = "parquet"
  ),
  tar_target(
    path_addresses_db,
    prepare_addresses_lookup_db(addresses_raw$address),
    format = "file"
  ),

  # This report list uses the latest amended report -----
  tar_target(
    report_list,
    process_report_list(report_list_raw, report_amended_score)
  ),
  tar_target(committees, prepare_committees(cover_raw, report_list)),
  # tar_target(donations, prepare_donations(path_data_prep_receipts, report_list)),

  # Outside data sources -----
  tar_target(candidate_listing_raw, get_candidate_listing(2016:2023)),
  # NOTE(review): `write_parquet` is unqualified here. If it resolves to
  # arrow::write_parquet() it returns its input data rather than a path, which
  # `format = "file"` does not accept. Confirm it is a project wrapper that
  # returns the written path.
  tar_target(
    path_candidate_listing_raw,
    write_parquet(
      candidate_listing_raw,
      "../data-prep/candidate_listing/part-0.parquet"
    ),
    format = "file"
  ),

  ## Voter registration records
  # Both voter targets use the "never" cue mode, so they are not refreshed as
  # part of a normal run.
  tar_target(
    path_voters_txt,
    voter_statewide_download(),
    cue = tar_cue("never")
  ), #<< invalidate to get latest
  tar_target(
    path_out_voters,
    voter_statewide_convert_parquet(path_voters_txt),
    cue = tar_cue("never"),
    format = "file"
  ),

  # Donors ------------------------------------------------------------------
  # tar_target(
  #   donors_latest,
  #   prepare_donors_latest(
  #     path_data_prep_receipts,
  #     path_out_report_list,
  #     path_addresses = "data-out/addresses"
  #   )
  # ),
  # tar_target(donors_for_matching, prepare_donors_for_matching(donors_latest)),
  # tar_target(donors_for_matching_sample, sample_frac(donors_for_matching, 0.1)),
  # tar_target(
  #   donors_for_matching_sample_nc,
  #   donors_for_matching |> filter(grepl("^27", postal_code)) |> sample_n(10000)
  # ),
  #
  # tar_target(
  #   donors_to_match,
  #   {
  #     # For testing, use a small sample of donors
  #     # slice_sample(donors_for_matching, n = 5000)
  #     donors_for_matching
  #   }
  # ),
  #
  # # Build the EM matching model from a moderately sized sample
  # tar_target(donors_em_model_sample, prepare_donors_matching_sample(donors_to_match)),
  # tar_target(donors_em_model, fastlink_donors(donors_em_model_sample, estimate.only = TRUE)),
  #
  # # Then create blocks of data to match against
  # tar_target(donor_blocks_zip_pre, prepare_donor_zip_blocks(donors_to_match)),
  # tar_target(donor_blocks_city, prepare_donor_city_blocks(donors_to_match)),
  # tar_target(donor_blocks_name, prepare_donor_name_blocks(donors_to_match)),
  #
  # # Then apply the pre-trained EM model to each block
  # tar_target(
  #   donor_linked_zip_pre,
  #   fastlink_donor_blocks(donor_blocks_zip_pre, em.obj = donors_em_model)
  # ),
  # tar_target(
  #   donor_linked_city,
  #   fastlink_donor_blocks(donor_blocks_city, em.obj = donors_em_model)
  # ),
  # tar_target(
  #   donor_linked_name,
  #   fastlink_donor_blocks(donor_blocks_name, em.obj = donors_em_model)
  # ),

  # Candidate Listing -------------------------------------------------------
  tar_target(
    candidate_listing_dedupe,
    prep_dedupe_candidates(candidate_listing_raw)
  ),
  tar_target(
    candidate_listing,
    prep_candidate_listing(candidate_listing_raw, candidate_listing_dedupe)
  ),

  # Candidates --------------------------------------------------------------
  # The path_out_* inputs used here are defined in the Output section below.
  tar_target(
    candidates_for_matching,
    prepare_candidates_for_matching(
      path_data_prep_officers,
      path_out_report_list,
      path_out_committees
    )
  ),
  tar_target(
    candidate_listing_for_matching,
    prepare_candidate_listing_for_matching(candidate_listing)
  ),
  tar_target(
    candidates_linked,
    fastlink_candidates(
      candidates_for_matching,
      candidate_listing_for_matching
    )
  ),
  tar_target(
    committee_candidate,
    fastlink_match_candidates(
      candidates_for_matching,
      candidate_listing_for_matching,
      candidates_linked
    )
  ),

  # Output ------------------------------------------------------------------
  tar_target(
    path_out_report_list,
    out_report_list(report_list),
    format = "file"
  ),
  tar_target(
    path_out_addresses,
    # This needs to be run manually, otherwise it doesn't run in {targets}
    # and throws an error: `bad value`
    out_addresses(path_addresses_db, "data-out/addresses.parquet"),
    cue = tar_cue("never"),
    format = "file"
  ),
  tar_target(
    path_out_cover,
    out_cover(path_data_prep_cover, path_out_report_list),
    format = "file"
  ),
  tar_target(
    path_out_committees,
    out_committees(path_out_cover, path_out_report_list),
    format = "file"
  ),
  tar_target(
    path_out_officers,
    out_officers(path_data_prep_officers, path_out_report_list),
    format = "file"
  ),
  tar_target(
    path_out_expenses_payee,
    out_expenses_payee(
      path_data_prep_expenditures,
      path_out_report_list
    ),
    format = "file"
  ),
  tar_target(
    path_out_expenses,
    out_expenses(
      path_data_prep_expenditures,
      path_out_expenses_payee,
      path_out_report_list
    ),
    format = "file"
  ),
  tar_target(
    path_out_receipts_payer,
    out_receipts_payer(
      path_data_prep_receipts,
      path_out_report_list
    ),
    format = "file"
  ),
  tar_target(
    path_out_receipts,
    out_receipts(
      path_data_prep_receipts,
      path_out_receipts_payer,
      path_out_report_list
    ),
    format = "file"
  ),

  # One output file per element of the candidate_listing list.
  tar_target(
    path_out_cl_elections,
    out_write_parquet(candidate_listing$cl_elections, "cl_elections"),
    format = "file"
  ),
  tar_target(
    path_out_cl_candidates,
    out_write_parquet(candidate_listing$cl_candidates, "cl_candidates"),
    format = "file"
  ),
  tar_target(
    path_out_cl_alias,
    out_write_parquet(candidate_listing$cl_name_on_ballot, "cl_name_on_ballot"),
    format = "file"
  ),
  tar_target(
    path_out_cl_contact,
    out_write_parquet(candidate_listing$cl_contact, "cl_contact"),
    format = "file"
  ),
  tar_target(
    path_out_cl_party,
    out_write_parquet(candidate_listing$cl_party, "cl_party"),
    format = "file"
  ),
  tar_target(
    path_out_committee_candidate,
    out_write_parquet(committee_candidate, "committee_candidate"),
    format = "file"
  )
)