Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

92 lines
2.2KB

  #' Write the de-duplicated receipt payers to a parquet file
  #'
  #' Keeps the prepared receipts whose `report_id` appears in the report list,
  #' fills missing text fields via `receipts_replace_na()`, reduces the rows to
  #' the distinct combinations of the payer-describing columns, and numbers the
  #' resulting rows as `payer_id` (placed as the first column).
  #'
  #' @param path_data_prep_receipts Passed to `prep_open_dataset_db()` to open
  #'   the prepared receipts.
  #' @param path_out_report_list Passed to `out_open_dataset_db()` to open the
  #'   report list used as the `report_id` filter.
  #' @return The directory part (via `dirname()`) of the written file
  #'   `data-out/receipts_payer/receipts_payer.parquet`.
  out_receipts_payer <- function(
    path_data_prep_receipts,
    path_out_report_list
  ) {
    # Project logging helper; presumably records that this step started.
    lg_info_target(lg_get_logger())

    out_file <- path("data-out", "receipts_payer", "receipts_payer.parquet")
    dir_create(path_dir(out_file))

    reports <- out_open_dataset_db(path_out_report_list)

    # Only receipts that belong to a listed report are considered.
    filtered <- semi_join(
      prep_open_dataset_db(path_data_prep_receipts),
      reports,
      by = "report_id"
    )

    # One row per unique payer description.
    unique_payers <- distinct(
      receipts_replace_na(filtered),
      org_name, is_org, is_us,
      profession, employers_name,
      street_1, city, state, full_zip, country_name
    )

    # NOTE(review): no explicit ordering is applied before row_number(), so the
    # payer_id assignment presumably follows whatever row order the source
    # yields -- confirm if ids must be stable across runs.
    payers <- unique_payers |>
      add_address_lookup(postal_code = full_zip, name = "address_lookup") |>
      mutate(payer_id = row_number(), .before = 1)

    payers |>
      collect() |>
      arrow::write_parquet(out_file)

    dirname(out_file)
  }
  #' Replace missing payer text fields with empty strings
  #'
  #' Applies `tidyr::replace_na()` with `""` as the replacement for the columns
  #' `org_name`, `profession`, `employers_name`, `street_1`, `city`, `state`,
  #' `full_zip` and `country_name`.
  #'
  #' @param receipts The receipts table to clean.
  #' @return The result of `tidyr::replace_na()` on `receipts`.
  receipts_replace_na <- function(receipts) {
    text_cols <- c(
      "org_name", "profession", "employers_name",
      "street_1", "city", "state", "full_zip", "country_name"
    )

    # Named list mapping every text column to the empty string.
    blanks <- rep(list(""), length(text_cols))
    names(blanks) <- text_cols

    tidyr::replace_na(receipts, blanks)
  }
  #' Write the receipts, keyed by `payer_id`, to a parquet file
  #'
  #' Keeps the prepared receipts whose `report_id` appears in the report list,
  #' swaps the payer-describing columns for the matching `payer_id`, adds an
  #' `is_donation` flag, and parses `occur_date` with `lubridate::mdy()`.
  #'
  #' @param path_data_prep_receipts Passed to `prep_open_dataset_db()` to open
  #'   the prepared receipts.
  #' @param path_out_receipts_payer Passed to `out_open_dataset_db()` to open
  #'   the payer table; its `address_lookup` column is dropped before use.
  #' @param path_out_report_list Passed to `out_open_dataset_db()` to open the
  #'   report list used as the `report_id` filter.
  #' @return The directory part (via `dirname()`) of the written file
  #'   `data-out/receipts/receipts.parquet`.
  out_receipts <- function(
    path_data_prep_receipts,
    path_out_receipts_payer,
    path_out_report_list
  ) {
    # Project logging helper; presumably records that this step started.
    lg_info_target(lg_get_logger())

    out_file <- path("data-out", "receipts", "receipts.parquet")
    dir_create(path_dir(out_file))

    reports <- out_open_dataset_db(path_out_report_list)
    payer_lookup <- select(
      out_open_dataset_db(path_out_receipts_payer),
      -address_lookup
    )
    filtered <- semi_join(
      prep_open_dataset_db(path_data_prep_receipts),
      reports,
      by = "report_id"
    )

    # Columns present in both tables serve as the join key and are dropped
    # afterwards, leaving payer_id in their place.
    shared_cols <- intersect(colnames(payer_lookup), colnames(filtered))

    keyed <- filtered |>
      receipts_replace_na() |>
      left_join(payer_lookup, by = shared_cols) |>
      select(-any_of(shared_cols), -name_sort)

    flagged <- keyed |>
      mutate(
        is_donation = receipt_type_code %in% c(
          "IND", "CPCM", "GEN", "PPTY", "OUTS", "NFPC"
        ),
        .after = receipt_type_code
      ) |>
      relocate(payer_id, .after = report_id)

    # Date parsing happens on the collected data, after the query is run.
    result <- collect(flagged)
    result <- mutate(result, across(occur_date, lubridate::mdy))

    arrow::write_parquet(result, out_file)
    dirname(out_file)
  }