garrick
/
nc-campaign-finance


			
prep_dedupe_candidates <- function(candidate_listing_raw) {
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl, street_address, city, contest_name, county_name) |>
    mutate(candidate_id = row_number(), .before = 1)

  # first, last, street_address ----
  f_l_street <-
    candidate_names |>
    group_by(first_name, last_name, street_address) |>
    mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
    ungroup()

  # first, last, cleaned contest name
  f_l_cleaned_contest <-
    candidate_names |>
    group_by(first_name, last_name, contest_name) |>
    mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
    ungroup()

  # first, middle, last, county_name
  f_m_l_county <-
    candidate_names |>
    group_by(first_name, middle_name, last_name, county_name) |>
    mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
    ungroup()

  # first, middle initial, last, county_name
  f_mi_l_county <-
    candidate_names |>
    mutate(middle_name = stringr::str_sub(middle_name, 1, 1)) |>
    group_by(first_name, middle_name, last_name, county_name) |>
    mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
    ungroup()

  # name_on_ballot, middle_name
  nb_m <-
    candidate_names |>
    mutate(
      name_ballot_clean = sub("(Mrs?|Dr|Mr|Ms|Miss)[.]? ", "", name_on_ballot),
      name_ballot_clean = gsub("[.]", "", name_ballot_clean),
    ) |>
    group_by(name_ballot_clean, middle_name) |>
    mutate(candidate_group = dplyr::cur_group_id(), .after = 1) |>
    ungroup()

  candidate_names |>
    dedupe_update_mapping(f_l_street, candidate_id, candidate_group, name = "first, last, street") |>
    dedupe_update_mapping(f_l_cleaned_contest, candidate_id, candidate_group, name = "first, last, contest") |>
    dedupe_update_mapping(f_m_l_county, candidate_id, candidate_group, name = "first, middle, last, county") |>
    dedupe_update_mapping(f_mi_l_county, candidate_id, candidate_group, name = "first, mi, last, county") |>
    dedupe_update_mapping(nb_m, candidate_id, candidate_group, name = "ballot name, middle name")
}

prep_candidate_listing <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {

  # Give candidates a unique id from dedupe mapping
  cl_dedupe <-
    candidate_listing_dedupe |>
    group_by(candidate_group) |>
    mutate(candidate_id = cur_group_id()) |>
    ungroup() |>
    select(-candidate_group)

  # cl raw + candidate_id ----
  cl_raw <-
    candidate_listing_raw |>
    left_join(cl_dedupe, by = setdiff(names(cl_dedupe), "candidate_id"))


  # candidate_names ----
  candidate_names <-
    cl_raw |>
    distinct(candidate_id, election_dt, first_name, middle_name, last_name, name_suffix_lbl) |>
    arrange(candidate_id, election_dt)

  # candidate_name_on_ballot ----
  candidate_name_on_ballot <-
    cl_raw |>
    distinct(candidate_id, election_dt, name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
    arrange(candidate_id, election_dt)

  # candidate_address ----
  candidate_address <-
    cl_raw |>
    mutate(
      phone = coalesce(phone, office_phone, business_phone),
      street_address = toupper(street_address),
      street_address = stringr::str_replace_all(street_address, " +", " "),
      street_address = trimws(street_address),
    ) |>
    select(-office_phone, -business_phone) |>
    distinct(candidate_id, election_dt, street = street_address, city, state, zip_code, phone, email) |>
    add_address_lookup_local(street = street, postal_code = zip_code) |>
    # remove rows that are entirely empty except for candidate_id
    filter(!(is.na(street) & is.na(city) & is.na(state) & is.na(zip_code) & is.na(phone) & is.na(email))) |>
    arrange(candidate_id, election_dt)

  # candidate_party ----
  candidate_party <-
    cl_raw |>
    distinct(candidate_id, election_dt, party_candidate) |>
    mutate(party_candidate = forcats::fct_inorder(party_candidate)) |>
    filter(!is.na(party_candidate)) |>
    arrange(candidate_id, election_dt)

  # Extract contests (remaining data in candidate_listing) ----
  cols_related <- setdiff(
    c(colnames(candidate_name_on_ballot), colnames(candidate_address), colnames(candidate_party)),
    c("election_dt", colnames(candidate_names))
  )

  contests <-
    cl_raw |>
    select(election_dt:name_suffix_lbl, candidate_id) |>
    select(-any_of(cols_related)) |>
    relocate(candidate_id, .before = first_name)

  # Get current candidate information ----
  candidate_best_name <-
    candidate_names |>
    mutate(n_char = nchar(paste(first_name, middle_name, last_name, name_suffix_lbl))) |>
    arrange(desc(n_char), desc(election_dt)) |>
    slice_max(n_char, n = 1, by = candidate_id, with_ties = FALSE) |>
    select(-n_char, -election_dt)

  current_contact <-
    candidate_listing_current_contact_info(candidate_address)

  candidate_contest_count <-
    cl_raw |>
    distinct(candidate_id, election_dt, contest_name) |>
    slice_max(election_dt, by = candidate_id, n = 1)


  current_name_on_ballot <-
    candidate_name_on_ballot |>
    slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
    select(candidate_id, name_on_ballot)

  current_party <-
    candidate_party |>
    filter(!is.na(party_candidate)) |>
    group_by(candidate_id) |>
    slice_max(election_dt, n = 1) |>
    arrange(party_candidate) |>
    slice_head(n = 1) |>
    ungroup() |>
    select(-election_dt) |>
    rename(party_last = party_candidate)

  current_most_party <-
    candidate_party |>
    filter(!is.na(party_candidate)) |>
    group_by(candidate_id) |>
    count(party_candidate) |>
    slice_max(n, n = 1) |>
    arrange(party_candidate) |>
    slice_head(n = 1) |>
    ungroup() |>
    select(-n) |>
    rename(party_most = party_candidate)

  current_contest_count <-
    contests |>
    group_by(candidate_id) |>
    distinct(candidate_id, election_dt, contest_name) |>
    summarize(
      contest_n = n(),
      contest_first = min(election_dt),
      contest_latest = max(election_dt)
    )

  # Join candidates into one big table ----
  left_join_by_candidate <- function(x, y) {
    left_join(x, y, join_by(candidate_id), relationship = "one-to-one")
  }

  candidates <-
    candidate_best_name |>
    left_join_by_candidate(current_name_on_ballot) |>
    left_join_by_candidate(current_contact) |>
    left_join_by_candidate(current_party) |>
    left_join_by_candidate(current_most_party) |>
    left_join_by_candidate(current_contest_count) |>
    relocate(name_on_ballot, .before = first_name) |>
    relocate(starts_with("party"), .before = street) |>
    relocate(starts_with("contest"), .before = street) |>
    add_address_lookup_local(street = street, postal_code = zip_code) |>
    arrange(candidate_id)

  # Return list of tables
  list(
    cl_elections = contests,
    cl_candidates = candidates,
    cl_name_on_ballot = candidate_name_on_ballot,
    cl_contact = candidate_address,
    cl_party = candidate_party
  )
}

candidate_listing_current_contact_info <- function(candidate_address) {
  candidate_phone_current <-
    candidate_address |>
    filter(!is.na(phone)) |>
    slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
    select(candidate_id, phone)

  candidate_email_current <-
    candidate_address |>
    filter(!is.na(email)) |>
    slice_max(election_dt, by = candidate_id, n = 1, with_ties = FALSE) |>
    select(candidate_id, email)

  candidate_address_current <-
    candidate_address |>
    group_by(candidate_id) |>
    slice_max(election_dt, n = 1) |>
    select(-election_dt) |>
    filter(
      n_distinct(street) == 1 | !grepl("PO BOX", street)
    ) |>
    slice_head(n = 1) |>
    select(candidate_id, street:zip_code) |>
    ungroup()

  candidate_address_current |>
    left_join(candidate_phone_current, by = "candidate_id", relationship = "one-to-one") |>
    left_join(candidate_email_current, by = "candidate_id", relationship = "one-to-one")
}

# This function isn't used anymore -- I opted for a more manual approach of
# using overlapping signals. But this general idea could work in other places,
# e.g. for deduping donors.
fastlink_candidate_listing <- function(candidate_listing_raw) {
  data <-
    candidate_listing_raw |>
    distinct(
      name_on_ballot,
      first_name,
      middle_name,
      last_name,
      name_suffix_lbl,
      street_address,
      city
    ) |>
    mutate(
      name_on_ballot_clean = sub(" \\(.+?\\)\\s?", "", name_on_ballot)
    )

  linked <- fastLink::fastLink(
    data,
    data,
    varnames = c(
      "name_on_ballot_clean",
      "first_name", "middle_name", "last_name", "name_suffix_lbl",
      "street_address", "city"
    ),
    stringdist.match = c("middle_name"),
    stringdist.method = "jw",
    jw.weight = .25,
    threshold.match = 0.98
  )

  matches <-
    fastLink::getMatches(
      data,
      data,
      linked,
      threshold.match = 0.9
    ) |>
    as_tibble()

  list(
    data = data,
    linked = linked,
    matches = matches
  )
}

# This also isn't used anymore...
prep_candidates_dedupe_mapping <- function(
  candidate_listing_raw,
  candidate_listing_dedupe
) {
  # candidate names ----
  # Extract candidate names, these will be primary keys for the candidates table
  candidate_names <-
    candidate_listing_raw |>
    arrange(last_name, first_name, middle_name) |>
    distinct(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
    mutate(candidate_id = row_number(), .before = 1)

  # Find last election ----
  candidates_last_contest <-
    candidate_listing_raw |>
    group_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl) |>
    slice_max(election_dt, n = 1) |>
    distinct(contest_last = election_dt)

  deduped_ids <-
    candidate_listing_dedupe$matches |>
    left_join(
      candidate_names,
      by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl)
    ) |>
    distinct(dupe_id = dedupe.ids, candidate_id) |>
    add_count(dupe_id) |>
    filter(n > 1) |>
    select(-n) |>
    mutate(dupe_id = fct_infreq(paste(dupe_id))) |>
    arrange(dupe_id, candidate_id) |>
    group_split(dupe_id)

  mapping <- candidate_names |> select(candidate_id) |> mutate(candidate_group = candidate_id)

  for (dupes in deduped_ids) {

    map_group <- left_join(dupes[-1], mapping, by = "candidate_id")
    all_ids <- union(map_group$candidate_id, map_group$candidate_group)
    map_others <- mapping |> filter(candidate_group %in% all_ids)
    browser(expr = nrow(map_group) < nrow(map_others))

    # recompute current grouping to min of all ids
    update <-
      dplyr::union(map_group, map_others) |>
      mutate(candidate_group = min(candidate_id, candidate_group))

    mapping <- rows_update(mapping, update, by = "candidate_id")
  }

  mapping |>
    left_join(candidate_names, by = "candidate_id") |>
    left_join(candidates_last_contest, by = join_by(name_on_ballot, first_name, middle_name, last_name, name_suffix_lbl))
}