Przeglądaj źródła

replace missing values with "" in receipts

main
Garrick Aden-Buie 2 lat temu
rodzic
commit
be5f430ffe
Nie znaleziono w bazie danych klucza dla tego podpisu
2 zmienionych plików z 23 dodań i 5 usunięć
  1. +19
    -2
      process/R/out_receipts.R
  2. +4
    -3
      process/R/prepare_candidates.R

+ 19
- 2
process/R/out_receipts.R Wyświetl plik



receipts_payer <- receipts_payer <-
receipts |> receipts |>
receipts_replace_na() |>
distinct( distinct(
org_name, org_name,
is_org, is_org,
dirname(out) dirname(out)
} }


#' Replace missing values with "" in selected receipt columns
#'
#' Hands `receipts` to `tidyr::replace_na()` together with a named list that
#' maps each of the columns below to the empty string `""`. Columns that are
#' not named in that list are not given a replacement value.
#'
#' @param receipts Receipts data to clean. Presumably a data frame (or another
#'   object `tidyr::replace_na()` can dispatch on) -- confirm against callers.
#' @return The value returned by `tidyr::replace_na()`.
receipts_replace_na <- function(receipts) {
  blank_cols <- c(
    "org_name",
    "profession",
    "employers_name",
    "street_1",
    "city",
    "state",
    "full_zip",
    "country_name"
  )

  # One "" replacement per column, keyed by column name.
  blank_values <- setNames(as.list(rep("", length(blank_cols))), blank_cols)

  tidyr::replace_na(receipts, blank_values)
}

out_receipts <- function( out_receipts <- function(
path_data_prep_receipts, path_data_prep_receipts,
path_out_receipts_payer, path_out_receipts_payer,


receipts <- receipts <-
receipts |> receipts |>
receipts_replace_na() |>
left_join(receipts_payer, by = cols_payer_common) |> left_join(receipts_payer, by = cols_payer_common) |>
select(-any_of(cols_payer_common)) |>
select(-any_of(cols_payer_common), -name_sort) |>
mutate( mutate(
is_donation = receipt_type_code %in% c("IND", "CPCM", "GEN", "PPTY", "OUTS", "NFPC"), is_donation = receipt_type_code %in% c("IND", "CPCM", "GEN", "PPTY", "OUTS", "NFPC"),
.after = receipt_type_code .after = receipt_type_code
) |> ) |>
relocate(payer_id, .after = report_id) |> relocate(payer_id, .after = report_id) |>
collect()
collect() |>
mutate(across(occur_date, lubridate::mdy))


arrow::write_parquet(receipts, out) arrow::write_parquet(receipts, out)



+ 4
- 3
process/R/prepare_candidates.R Wyświetl plik

varnames = c("name_clean", "street", "city"), varnames = c("name_clean", "street", "city"),
stringdist.match = c("name_clean", "street"), stringdist.match = c("name_clean", "street"),
partial.match = c("name_clean", "street"), partial.match = c("name_clean", "street"),
stringdist.method = "dl"
stringdist.method = "dl",
threshold.match = 0.9
) )
} }


candidates_for_matching, candidates_for_matching,
candidate_listing_for_matching, candidate_listing_for_matching,
candidates_linked, candidates_linked,
threshold.match = 0.8
threshold.match = 0.9
) )


matches <- as_tibble(matches) matches <- as_tibble(matches)


select(matches, sboe_id, candidate_id)
distinct(matches, sboe_id, candidate_id)
} }


candidates_match <- function( candidates_match <- function(

Ładowanie…
Anuluj
Zapisz