Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

338 lines
9.8KB

  1. # Created by use_targets().
  2. # Follow the comments below to fill in this target script.
  3. # Then follow the manual to check and run the pipeline:
  4. # https://books.ropensci.org/targets/walkthrough.html#inspect-the-pipeline
  5. # Load packages required to define the pipeline:
  6. library(targets)
  7. # Set target options:
  8. tar_option_set(
  9. packages = strsplit(desc::desc_get_field("Depends"), ", ")[[1]],
  10. # For distributed computing in tar_make(), supply a {crew} controller
  11. # as discussed at https://books.ropensci.org/targets/crew.html.
  12. controller = crew::crew_controller_local(workers = 24),
  13. # debug = "path_out_addresses",
  14. # cue = tar_cue(mode = "never"),
  15. error = "null"
  16. )
  17. # Run the R scripts in the R/ folder with your custom functions:
  18. tar_source()
  19. # Replace the target list below with your own:
  20. list(
  21. tar_target(path_report_list_csv, "../data-raw/report_list.csv", format = "file"),
  22. tar_target(path_report_list_raw, prepare_report_list(path_report_list_csv)),
  23. tar_target(report_list_raw, arrow::read_parquet(path_report_list_raw)),
  # Report export directories -----
  # Each kind of export ("all", "receipts", "expenditures") gets three targets:
  #   *_src   : all matching directories, tracked together as files
  #   *_names : the same paths as a plain character vector
  #   dirs_*  : one file-tracked branch per directory
  #
  # This comes from Will's answer in https://stackoverflow.com/a/70293576
  # We're basically tricking targets into letting us branch over a file target
  tar_target(
    name = dirs_all_src,
    command = fs::dir_ls(
      "../data-raw/reports",
      glob = "**/all",
      recurse = TRUE,
      type = "directory"
    ),
    format = "file"
  ),
  tar_target(name = dirs_all_names, command = dirs_all_src),
  tar_target(
    name = dirs_all,
    command = {
      dirs_all_src
      dirs_all_names
    },
    pattern = map(dirs_all_names),
    format = "file"
  ),

  tar_target(
    name = dirs_receipts_src,
    command = fs::dir_ls(
      "../data-raw/reports",
      glob = "**/receipts",
      recurse = TRUE,
      type = "directory"
    ),
    format = "file"
  ),
  tar_target(name = dirs_receipts_names, command = dirs_receipts_src),
  tar_target(
    name = dirs_receipts,
    command = {
      dirs_receipts_src
      dirs_receipts_names
    },
    pattern = map(dirs_receipts_names),
    format = "file"
  ),

  tar_target(
    name = dirs_expenditures_src,
    command = fs::dir_ls(
      "../data-raw/reports",
      glob = "**/expenditures",
      recurse = TRUE,
      type = "directory"
    ),
    format = "file"
  ),
  tar_target(name = dirs_expenditures_names, command = dirs_expenditures_src),
  tar_target(
    name = dirs_expenditures,
    command = {
      dirs_expenditures_src
      dirs_expenditures_names
    },
    pattern = map(dirs_expenditures_names),
    format = "file"
  ),
  # Prepared parquet exports -----
  # One dynamic branch per source directory; each branch's result is tracked
  # as a file. The write_prepared_*() functions are project helpers.
  tar_target(
    name = paths_all_parquet,
    command = write_prepared_report_export(dirs_all, report_list_raw),
    pattern = map(dirs_all),
    format = "file"
  ),
  tar_target(
    name = path_receipts_parquet,
    command = write_prepared_receipts_parquet(dirs_receipts, report_list_raw),
    pattern = map(dirs_receipts),
    format = "file"
  ),
  tar_target(
    name = path_expenditures_parquet,
    command = write_prepared_expenditures_parquet(dirs_expenditures, report_list_raw),
    pattern = map(dirs_expenditures),
    format = "file"
  ),
  # Prepared dataset directories -----
  # Each target tracks a ../data-prep/ directory as a file target. Mentioning
  # `paths_all_parquet` first makes the directory depend on that target, so it
  # is re-checked after the "all" exports have been written.
  # NOTE(review): the receipts and expenditures directories depend only on
  # `paths_all_parquet`, not on `path_receipts_parquet` /
  # `path_expenditures_parquet` -- confirm this is intended.
  tar_target(
    name = path_data_prep_cover,
    command = {
      paths_all_parquet
      "../data-prep/cover"
    },
    format = "file"
  ),
  tar_target(
    name = path_data_prep_officers,
    command = {
      paths_all_parquet
      "../data-prep/officers"
    },
    format = "file"
  ),
  tar_target(
    name = path_data_prep_receipts,
    command = {
      paths_all_parquet
      "../data-prep/receipts"
    },
    format = "file"
  ),
  tar_target(
    name = path_data_prep_expenditures,
    command = {
      paths_all_parquet
      "../data-prep/expenditures"
    },
    format = "file"
  ),
  # Cover data and report dates -----
  # Open the cover directory as an Arrow dataset partitioned by `sboe_id` and
  # pull it into memory.
  tar_target(
    name = cover_raw,
    command = dplyr::collect(
      arrow::open_dataset(path_data_prep_cover, partitioning = "sboe_id")
    )
  ),
  # process_report_dates() is a project helper combining the raw report list
  # with the collected cover data.
  tar_target(
    name = report_dates,
    command = process_report_dates(report_list_raw, cover_raw)
  ),
  # Write `report_dates` to a parquet file and track that file.
  # arrow::write_parquet() returns its input invisibly, not the output path,
  # so the path is returned explicitly; format = "file" then makes {targets}
  # track the written file, like the other path_* targets in this pipeline.
  tar_target(
    name = path_report_dates,
    command = {
      out_path <- "../data-prep/report_dates/part-0.parquet"
      # Make sure the destination directory exists before writing.
      fs::dir_create(fs::path_dir(out_path))
      arrow::write_parquet(report_dates, out_path)
      out_path
    },
    format = "file"
  ),
  # Amendment scores derived from the report dates (project helper).
  tar_target(
    name = report_amended_score,
    command = calc_report_amended_score(report_dates)
  ),

  # Addresses -----
  # Collect raw addresses from the prepared officers, receipts and
  # expenditures data plus the candidate listing; stored in parquet format.
  tar_target(
    name = addresses_raw,
    command = prep_collect_addresses_raw(
      path_officers = path_data_prep_officers,
      path_receipts = path_data_prep_receipts,
      path_expenditures = path_data_prep_expenditures,
      path_candidate_listing = path_candidate_listing_raw,
      path_voters = NULL # path_voters_parquet
    ),
    format = "parquet"
  ),
  # Build the address lookup database from the `address` column; the result is
  # tracked as a file.
  tar_target(
    name = path_addresses_db,
    command = prepare_addresses_lookup_db(addresses_raw$address),
    format = "file"
  ),
  # This report list uses the latest amended report -----
  tar_target(
    name = report_list,
    command = process_report_list(report_list_raw, report_amended_score)
  ),
  tar_target(
    name = committees,
    command = prepare_committees(cover_raw, report_list)
  ),
  # tar_target(donations, prepare_donations(path_data_prep_receipts, report_list)),
  # Outside data sources -----
  # Candidate listing for 2016 through 2023 (get_candidate_listing() is a
  # project helper).
  tar_target(
    name = candidate_listing_raw,
    command = get_candidate_listing(2016:2023)
  ),
  # Write the candidate listing to parquet and track the written file.
  # A format = "file" target must return the path(s) it wrote, so the path is
  # returned explicitly rather than relying on the return value of
  # write_parquet() (arrow's write_parquet() returns its input, not the path).
  tar_target(
    name = path_candidate_listing_raw,
    command = {
      out_path <- "../data-prep/candidate_listing/part-0.parquet"
      # Make sure the destination directory exists before writing.
      fs::dir_create(fs::path_dir(out_path))
      write_parquet(candidate_listing_raw, out_path)
      out_path
    },
    format = "file"
  ),
  ## Voter registration records
  # Both targets use a "never" cue, so they are not rerun when they are
  # merely outdated.
  tar_target(
    name = path_voters_txt,
    command = voter_statewide_download(),
    cue = tar_cue(mode = "never")
  ), #<< invalidate to get latest
  tar_target(
    name = path_out_voters,
    command = voter_statewide_convert_parquet(path_voters_txt),
    cue = tar_cue(mode = "never"),
    format = "file"
  ),
  125. # Donors ------------------------------------------------------------------
  126. # tar_target(
  127. # donors_latest,
  128. # prepare_donors_latest(
  129. # path_data_prep_receipts,
  130. # path_out_report_list,
  131. # path_addresses = "data-out/addresses"
  132. # )
  133. # ),
  134. # tar_target(donors_for_matching, prepare_donors_for_matching(donors_latest)),
  135. # tar_target(donors_for_matching_sample, sample_frac(donors_for_matching, 0.1)),
  136. # tar_target(
  137. # donors_for_matching_sample_nc,
  138. # donors_for_matching |> filter(grepl("^27", postal_code)) |> sample_n(10000)
  139. # ),
  140. #
  141. # tar_target(
  142. # donors_to_match,
  143. # {
  144. # # For testing, use a small sample of donors
  145. # # slice_sample(donors_for_matching, n = 5000)
  146. # donors_for_matching
  147. # }
  148. # ),
  149. #
  150. # # Build the EM matching model from a moderately sized sample
  151. # tar_target(donors_em_model_sample, prepare_donors_matching_sample(donors_to_match)),
  152. # tar_target(donors_em_model, fastlink_donors(donors_em_model_sample, estimate.only = TRUE)),
  153. #
  154. # # Then create blocks of data to match against
  155. # tar_target(donor_blocks_zip_pre, prepare_donor_zip_blocks(donors_to_match)),
  156. # tar_target(donor_blocks_city, prepare_donor_city_blocks(donors_to_match)),
  157. # tar_target(donor_blocks_name, prepare_donor_name_blocks(donors_to_match)),
  158. #
  159. # # Then apply the pre-trained EM model to each block
  160. # tar_target(
  161. # donor_linked_zip_pre,
  162. # fastlink_donor_blocks(donor_blocks_zip_pre, em.obj = donors_em_model)
  163. # ),
  164. # tar_target(
  165. # donor_linked_city,
  166. # fastlink_donor_blocks(donor_blocks_city, em.obj = donors_em_model)
  167. # ),
  168. # tar_target(
  169. # donor_linked_name,
  170. # fastlink_donor_blocks(donor_blocks_name, em.obj = donors_em_model)
  171. # ),
  # Candidate Listing -------------------------------------------------------
  # Deduplicate the raw listing first, then build the final listing from the
  # raw data together with the dedupe result (both are project helpers).
  tar_target(
    name = candidate_listing_dedupe,
    command = prep_dedupe_candidates(candidate_listing_raw)
  ),
  tar_target(
    name = candidate_listing,
    command = prep_candidate_listing(candidate_listing_raw, candidate_listing_dedupe)
  ),
  # Candidates --------------------------------------------------------------
  # Two inputs are prepared for record linkage: candidates taken from the
  # committee data, and the candidate listing.
  tar_target(
    name = candidates_for_matching,
    command = prepare_candidates_for_matching(
      path_data_prep_officers,
      path_out_report_list,
      path_out_committees
    )
  ),
  tar_target(
    name = candidate_listing_for_matching,
    command = prepare_candidate_listing_for_matching(candidate_listing)
  ),
  # Link the two inputs, then turn the linkage result into the
  # committee-to-candidate match table.
  tar_target(
    name = candidates_linked,
    command = fastlink_candidates(candidates_for_matching, candidate_listing_for_matching)
  ),
  tar_target(
    name = committee_candidate,
    command = fastlink_match_candidates(
      candidates_for_matching,
      candidate_listing_for_matching,
      candidates_linked
    )
  ),
  # Output ------------------------------------------------------------------
  # Every target in this section is tracked as a file.
  tar_target(
    name = path_out_report_list,
    command = out_report_list(report_list),
    format = "file"
  ),
  tar_target(
    name = path_out_addresses,
    # This needs to be run manually, otherwise it doesn't run in {targets}
    # and throws an error: `bad value`
    command = out_addresses(path_addresses_db, "data-out/addresses.parquet"),
    # "never" cue: not rerun when merely outdated (see the note above).
    cue = tar_cue(mode = "never"),
    format = "file"
  ),
  # Cover, committees and officers outputs; each takes the written report
  # list as an input. Committees are built from the cover output.
  tar_target(
    name = path_out_cover,
    command = out_cover(path_data_prep_cover, path_out_report_list),
    format = "file"
  ),
  tar_target(
    name = path_out_committees,
    command = out_committees(path_out_cover, path_out_report_list),
    format = "file"
  ),
  tar_target(
    name = path_out_officers,
    command = out_officers(path_data_prep_officers, path_out_report_list),
    format = "file"
  ),
  # Expenses: the payee table is written first and then passed to the
  # expenses output.
  tar_target(
    name = path_out_expenses_payee,
    command = out_expenses_payee(path_data_prep_expenditures, path_out_report_list),
    format = "file"
  ),
  tar_target(
    name = path_out_expenses,
    command = out_expenses(
      path_data_prep_expenditures,
      path_out_expenses_payee,
      path_out_report_list
    ),
    format = "file"
  ),
  # Receipts: same shape as expenses, with a payer table feeding the receipts
  # output.
  tar_target(
    name = path_out_receipts_payer,
    command = out_receipts_payer(path_data_prep_receipts, path_out_report_list),
    format = "file"
  ),
  tar_target(
    name = path_out_receipts,
    command = out_receipts(
      path_data_prep_receipts,
      path_out_receipts_payer,
      path_out_report_list
    ),
    format = "file"
  ),
  # Candidate listing tables: each element of `candidate_listing` is passed to
  # out_write_parquet() (project helper) together with an output name.
  tar_target(
    name = path_out_cl_elections,
    command = out_write_parquet(candidate_listing$cl_elections, "cl_elections"),
    format = "file"
  ),
  tar_target(
    name = path_out_cl_candidates,
    command = out_write_parquet(candidate_listing$cl_candidates, "cl_candidates"),
    format = "file"
  ),
  tar_target(
    name = path_out_cl_alias,
    command = out_write_parquet(candidate_listing$cl_name_on_ballot, "cl_name_on_ballot"),
    format = "file"
  ),
  tar_target(
    name = path_out_cl_contact,
    command = out_write_parquet(candidate_listing$cl_contact, "cl_contact"),
    format = "file"
  ),
  tar_target(
    name = path_out_cl_party,
    command = out_write_parquet(candidate_listing$cl_party, "cl_party"),
    format = "file"
  ),
  # Committee-to-candidate matches from the record linkage step.
  tar_target(
    name = path_out_committee_candidate,
    command = out_write_parquet(committee_candidate, "committee_candidate"),
    format = "file"
  )
)