Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

81 lines
2.4KB

  1. #' @param data Source data, must contain `data_id` column
  2. #' @param dupes Duplicate group mapping, must contain `dupe_id` and `data_id`
  3. #' columns. Each row maps a single `data_id` to a `dupe_id`, where the
  4. #' `dupe_id` groups duplicate rows together.
  5. #' @param data_id The original ID in the source data
  6. #' @param dupe_id A duplicate group ID in from the de-duplication process
  7. #' @param data_group_id The group ID added to the source data. This group ID
  8. #' always the minimum ID of any grouped source IDs. The final groups created
  9. #' with this ID may span several duplicate groups.
  10. dedupe_update_mapping <- function(
  11. data,
  12. dupes,
  13. data_id,
  14. dupe_id,
  15. data_group_id,
  16. name = "duplicates"
  17. ) {
  18. data_id <- rlang::enquo(data_id)
  19. data_id_name <- rlang::quo_text(data_id)
  20. dupe_id <- rlang::enquo(dupe_id)
  21. dupe_id_name <- rlang::quo_text(dupe_id)
  22. if (missing(data_group_id)) {
  23. data_group_id <- dupe_id
  24. } else {
  25. data_group_id <- rlang::enquo(data_group_id)
  26. }
  27. data_group_id_name <- rlang::quo_text(data_group_id)
  28. mapping <-
  29. if (dupe_id_name %in% colnames(data)) {
  30. # Extract mapping from current dataset
  31. data |>
  32. select(!!data_id, !!dupe_id)
  33. } else {
  34. # Initialize a new mapping table
  35. data |>
  36. select(!!data_id) |>
  37. mutate("{data_group_id_name}" := !!data_id)
  38. }
  39. dupes <- group_split(dupes, !!dupe_id)
  40. dupes <- purrr::keep(dupes, \(d) nrow(d) > 1)
  41. cli::cli_progress_bar(
  42. name = paste("Merging", name),
  43. total = length(dupes),
  44. clear = FALSE
  45. )
  46. for (dupe in dupes) {
  47. # Find original ids in this dupe group
  48. map_group <-
  49. mapping |>
  50. semi_join(dupe, by = join_by(!!data_id))
  51. all_ids <- union(
  52. map_group |> pull(!!data_id),
  53. map_group |> pull(!!data_group_id)
  54. )
  55. # Find any other ids related to the ids in this dupe group
  56. # so we can merge groups
  57. map_others <- mapping |> filter(!!data_group_id %in% all_ids)
  58. # recompute current grouping to min of all ids
  59. update <-
  60. dplyr::union(map_group, map_others) |>
  61. mutate("{data_group_id_name}" := min(!!data_id, !!data_group_id))
  62. mapping <- rows_update(mapping, update, by = data_id_name)
  63. cli::cli_progress_update()
  64. }
  65. cli::cli_progress_done()
  66. mapping |>
  67. left_join(
  68. data[setdiff(names(data), data_group_id_name)], # use new mapping
  69. by = join_by(!!data_id)
  70. )
  71. }