garrick
/
nc-campaign-finance


			
				
					
						
						
							
#' Update the duplicate-group mapping of a data set
#'
#' @param data Source data, must contain the `data_id` column
#' @param dupes Duplicate group mapping, must contain `dupe_id` and `data_id`
#'   columns. Each row maps a single `data_id` to a `dupe_id`, where the
#'   `dupe_id` groups duplicate rows together.
#' @param data_id The original ID in the source data
#' @param dupe_id A duplicate group ID from the de-duplication process
#' @param data_group_id The group ID added to the source data. This group ID
#'   is always the minimum ID of any grouped source IDs. The final groups
#'   created with this ID may span several duplicate groups. Defaults to
#'   `dupe_id` when not supplied.
#' @param name Label shown after "Merging" in the progress bar
#' @return The `data_id` to `data_group_id` mapping, left-joined with the
#'   remaining columns of `data`.
dedupe_update_mapping <- function(
  data,
  dupes,
  data_id,
  dupe_id,
  data_group_id,
  name = "duplicates"
) {
  # Merge the duplicate groups listed in `dupes` into a `data_id` ->
  # `data_group_id` mapping and return that mapping joined back onto `data`.
  #
  # * If `data` already carries a `data_group_id` column, it is used as the
  #   starting mapping, so repeated calls accumulate merges.
  # * Otherwise every row starts in its own group (`data_group_id == data_id`).
  # * Each duplicate group with more than one row is merged, together with any
  #   existing groups it touches, into a single group whose ID is the minimum
  #   of all IDs involved.
  data_id <- rlang::enquo(data_id)
  data_id_name <- rlang::quo_text(data_id)
  dupe_id <- rlang::enquo(dupe_id)
  dupe_id_name <- rlang::quo_text(dupe_id)
  if (missing(data_group_id)) {
    # No separate group column requested: reuse the `dupe_id` name for it.
    data_group_id <- dupe_id
  } else {
    data_group_id <- rlang::enquo(data_group_id)
  }
  data_group_id_name <- rlang::quo_text(data_group_id)

  # Fail early with a clear message when a required column is absent.
  if (!data_id_name %in% colnames(data)) {
    cli::cli_abort(
      "{.arg data} must contain a {.field {data_id_name}} column."
    )
  }
  missing_cols <- setdiff(c(data_id_name, dupe_id_name), colnames(dupes))
  if (length(missing_cols) > 0) {
    cli::cli_abort(
      "{.arg dupes} must contain column(s) {.field {missing_cols}}."
    )
  }

  mapping <-
    if (data_group_id_name %in% colnames(data)) {
      # Extract the mapping from the current dataset. The column that matters
      # is the group column (not `dupe_id`): it is the one read and rewritten
      # by the merge loop below.
      data |>
        dplyr::select(!!data_id, !!data_group_id)
    } else {
      # Initialize a new mapping table: every row is its own group.
      data |>
        dplyr::select(!!data_id) |>
        dplyr::mutate("{data_group_id_name}" := !!data_id)
    }

  # Only duplicate groups with at least two members can merge anything.
  dupes <- dplyr::group_split(dupes, !!dupe_id)
  dupes <- purrr::keep(dupes, \(d) nrow(d) > 1)

  cli::cli_progress_bar(
    name = paste("Merging", name),
    total = length(dupes),
    clear = FALSE
  )

  for (dupe in dupes) {
    # Find original ids in this dupe group
    map_group <- dplyr::semi_join(mapping, dupe, by = data_id_name)

    # Nothing to merge when none of the group's ids exist in `data`; skipping
    # also avoids calling `min()` on empty input.
    if (nrow(map_group) > 0) {
      all_ids <- union(
        dplyr::pull(map_group, !!data_id),
        dplyr::pull(map_group, !!data_group_id)
      )

      # Find any other ids related to the ids in this dupe group
      # so we can merge groups
      map_others <- dplyr::filter(mapping, !!data_group_id %in% all_ids)

      # Recompute current grouping to min of all ids. `min()` is evaluated
      # over both whole columns, yielding one value for the merged group.
      update <-
        dplyr::union(map_group, map_others) |>
        dplyr::mutate(
          "{data_group_id_name}" := min(!!data_id, !!data_group_id)
        )

      mapping <- dplyr::rows_update(mapping, update, by = data_id_name)
    }
    cli::cli_progress_update()
  }
  cli::cli_progress_done()

  # Re-attach the remaining source columns, dropping any stale group column
  # from `data` so the freshly computed mapping is the one returned.
  dplyr::left_join(
    mapping,
    data[setdiff(names(data), data_group_id_name)],
    by = data_id_name
  )
}