|
- #' @param data Source data, must contain `data_id` column
- #' @param dupes Duplicate group mapping, must contain `dupe_id` and `data_id`
- #' columns. Each row maps a single `data_id` to a `dupe_id`, where the
- #' `dupe_id` groups duplicate rows together.
- #' @param data_id The original ID in the source data
- #' @param dupe_id A duplicate group ID in from the de-duplication process
- #' @param data_group_id The group ID added to the source data. This group ID
- #' always the minimum ID of any grouped source IDs. The final groups created
- #' with this ID may span several duplicate groups.
- dedupe_update_mapping <- function(
- data,
- dupes,
- data_id,
- dupe_id,
- data_group_id,
- name = "duplicates"
- ) {
- data_id <- rlang::enquo(data_id)
- data_id_name <- rlang::quo_text(data_id)
- dupe_id <- rlang::enquo(dupe_id)
- dupe_id_name <- rlang::quo_text(dupe_id)
- if (missing(data_group_id)) {
- data_group_id <- dupe_id
- } else {
- data_group_id <- rlang::enquo(data_group_id)
- }
- data_group_id_name <- rlang::quo_text(data_group_id)
-
- mapping <-
- if (dupe_id_name %in% colnames(data)) {
- # Extract mapping from current dataset
- data |>
- select(!!data_id, !!dupe_id)
- } else {
- # Initialize a new mapping table
- data |>
- select(!!data_id) |>
- mutate("{data_group_id_name}" := !!data_id)
- }
-
- dupes <- group_split(dupes, !!dupe_id)
- dupes <- purrr::keep(dupes, \(d) nrow(d) > 1)
-
- cli::cli_progress_bar(
- name = paste("Merging", name),
- total = length(dupes),
- clear = FALSE
- )
-
- for (dupe in dupes) {
- # Find original ids in this dupe group
- map_group <-
- mapping |>
- semi_join(dupe, by = join_by(!!data_id))
-
- all_ids <- union(
- map_group |> pull(!!data_id),
- map_group |> pull(!!data_group_id)
- )
-
- # Find any other ids related to the ids in this dupe group
- # so we can merge groups
- map_others <- mapping |> filter(!!data_group_id %in% all_ids)
-
- # recompute current grouping to min of all ids
- update <-
- dplyr::union(map_group, map_others) |>
- mutate("{data_group_id_name}" := min(!!data_id, !!data_group_id))
-
- mapping <- rows_update(mapping, update, by = data_id_name)
- cli::cli_progress_update()
- }
- cli::cli_progress_done()
-
- mapping |>
- left_join(
- data[setdiff(names(data), data_group_id_name)], # use new mapping
- by = join_by(!!data_id)
- )
- }
|