#' @param data Source data, must contain `data_id` column #' @param dupes Duplicate group mapping, must contain `dupe_id` and `data_id` #' columns. Each row maps a single `data_id` to a `dupe_id`, where the #' `dupe_id` groups duplicate rows together. #' @param data_id The original ID in the source data #' @param dupe_id A duplicate group ID in from the de-duplication process #' @param data_group_id The group ID added to the source data. This group ID #' always the minimum ID of any grouped source IDs. The final groups created #' with this ID may span several duplicate groups. dedupe_update_mapping <- function( data, dupes, data_id, dupe_id, data_group_id, name = "duplicates" ) { data_id <- rlang::enquo(data_id) data_id_name <- rlang::quo_text(data_id) dupe_id <- rlang::enquo(dupe_id) dupe_id_name <- rlang::quo_text(dupe_id) if (missing(data_group_id)) { data_group_id <- dupe_id } else { data_group_id <- rlang::enquo(data_group_id) } data_group_id_name <- rlang::quo_text(data_group_id) mapping <- if (dupe_id_name %in% colnames(data)) { # Extract mapping from current dataset data |> select(!!data_id, !!dupe_id) } else { # Initialize a new mapping table data |> select(!!data_id) |> mutate("{data_group_id_name}" := !!data_id) } dupes <- group_split(dupes, !!dupe_id) dupes <- purrr::keep(dupes, \(d) nrow(d) > 1) cli::cli_progress_bar( name = paste("Merging", name), total = length(dupes), clear = FALSE ) for (dupe in dupes) { # Find original ids in this dupe group map_group <- mapping |> semi_join(dupe, by = join_by(!!data_id)) all_ids <- union( map_group |> pull(!!data_id), map_group |> pull(!!data_group_id) ) # Find any other ids related to the ids in this dupe group # so we can merge groups map_others <- mapping |> filter(!!data_group_id %in% all_ids) # recompute current grouping to min of all ids update <- dplyr::union(map_group, map_others) |> mutate("{data_group_id_name}" := min(!!data_id, !!data_group_id)) mapping <- rows_update(mapping, update, by = data_id_name) cli::cli_progress_update() } cli::cli_progress_done() mapping |> left_join( data[setdiff(names(data), data_group_id_name)], # use new mapping by = join_by(!!data_id) ) }