#' Preprocess data
#'
#' Builds a row identifier (`.id`) from the `by` columns of both datasets,
#' numbers repeated identifiers (`.id_long`), and converts each dataset into
#' the long, one-row-per-cell format produced by `process_data()`.
#'
#' @param x a left dataset
#' @param y a right dataset
#' @param by a by argument for joins / set operations
#' @param fill if missing ids should be filled
#'
#' @return a list with the elements `x` and `y`, each a preprocessed dataset
#'
#' @examples
#' NULL
preprocess_data <- function(x, y, by, fill = TRUE) {

  # Appends a per-value running number so repeated values can be told apart:
  #   add_duplicate_number(c("unique", "mult", "mult", "also unique"))
  #   #> "unique-1" "mult-1" "mult-2" "also unique-1"
  add_duplicate_number <- function(a) {
    dplyr::tibble(v = a) %>%
      group_by(v) %>%
      mutate(id = paste(v, row_number(), sep = "-")) %>%
      pull(id)
  }

  x <- x %>%
    unite(one_of(by), col = ".id", remove = FALSE) %>%
    mutate(.id_long = add_duplicate_number(.id))

  y <- y %>%
    unite(one_of(by), col = ".id", remove = FALSE) %>%
    mutate(.id_long = add_duplicate_number(.id))

  ids <- dplyr::union(
    x %>% dplyr::select(.id, .id_long),
    y %>% dplyr::select(.id, .id_long)
  )

  # The first argument has to be the bare symbols `x` / `y`: process_data()
  # derives its default `side` label from the expression it is called with.
  x_ <- process_data(x, ids, by, fill = fill)

  # Shift the right dataset's tiles past the left dataset's tiles. ncol(x)
  # also counts the helper columns `.id` and `.id_long` added above.
  y_ <- process_data(y, ids, by, fill = fill) %>%
    mutate(.x = .x + ncol(x) - 1)

  list(x = x_, y = y_)
}

#' Processes the data
#'
#' Reshapes a dataset into one row per cell, prepends one header row per
#' column, optionally inserts rows for duplicate ids that only exist in the
#' other dataset, and finally adds the tile colors via `add_color()`.
#'
#' @param x a preprocessed dataset
#' @param ids a data_frame of ids (.id and .id_long)
#' @param by a vector of by-arguments
#' @param width the width of the tiles
#' @param side the side (x or y, lhs or rhs, etc); if `NA`, the expression
#'   passed as `x` is used as the label
#' @param fill if missing ids should be filled
#'
#' @return a data_frame with one row per cell and the columns `.id`,
#'   `.id_long`, `.r`, `.col`, `.val` (as character), `.x`, `.y`, `.width`,
#'   `.side` and `.color`
#'
#' @examples
#' NULL
process_data <- function(x, ids, by, width = 1, side = NA, fill = TRUE) {
  # Has to happen before `x` is reassigned, otherwise the caller's expression
  # can no longer be recovered.
  if (is.na(side)) side <- deparse(substitute(x))

  # Data columns are all columns that do not start with a dot; each one gets
  # a running horizontal position.
  x_names <- names(x) %>% str_subset("^[^\\.]")
  x_keys <- seq_along(x_names)
  names(x_keys) <- x_names

  # One header tile per data column, placed at .y = 0 above the data rows.
  header <- dplyr::tibble(
    .id = ".header",
    .id_long = paste(".header", x_names, sep = "_"),
    .r = 0,
    .col = x_names,
    .val = x_names,
    .x = x_keys,
    .y = 0
  )

  x <- x %>%
    mutate(.r = row_number()) %>%
    gather(key = ".col", value = ".val", one_of(x_names)) %>%
    # The header values are character; coerce the cell values as well so that
    # purely numeric datasets can be bound to the header rows.
    mutate(.val = as.character(.val), .x = x_keys[.col], .y = -.r) %>%
    bind_rows(header, .) %>%
    mutate(.width = width, .side = side)

  # If there are multiple values in the ids (-2, -3 etc) but they are not
  # present in x, because they only occur in the other dataset, add them here.
  id_long <- ids$.id_long
  mis_ids <- id_long[!id_long %in% x$.id_long]

  # A missing "-1" id means the id itself is absent from x; it is not a
  # missing duplicate, so it is not filled.
  mis_ids <- mis_ids[!grepl("-1$", mis_ids)]

  if (length(mis_ids) > 0 && fill) {
    mis_ids_short <- str_replace(mis_ids, "-[0-9]+$", "")

    # Copies are always taken from the unfilled data so that rows inserted in
    # an earlier iteration are not copied again.
    template <- x

    for (k in seq_along(mis_ids)) {
      short_id <- mis_ids_short[k]
      long_id <- mis_ids[k]

      hits <- which(template$.id == short_id)
      # Nothing to copy when the id does not occur in this dataset at all.
      if (length(hits) == 0) next

      # NOTE(review): all rows of the short id are copied, as before; if x
      # already holds several duplicates this inserts each of them - confirm
      # that this is intended.
      copies <- template %>%
        slice(hits) %>%
        mutate(.id_long = long_id)

      # Insert the copies directly after the first row of that id.
      irow <- which(x$.id == short_id)[1]
      x <- bind_rows(
        x %>% slice(seq_len(irow)),
        copies,
        x %>% slice(irow + seq_len(nrow(x) - irow))
      )
    }
  }

  add_color(x, ids$.id, by)
}

#' Adds Color to a processed data_frame
#'
#' Every distinct id gets one color of the ColorBrewer "Set1" palette. Cells
#' in the `by` columns are colored by their id (or `color_missing` if the
#' value is `NA`), all other cells get `color_other`.
#'
#' @param x a processed data_frame
#' @param ids a vector of ids for the color-matching
#' @param by a vector of column names that constitute the by-argument of
#'   joins/sets
#' @param color_header color for the header
#' @param color_other color for "inactive" values
#' @param color_missing color for missing values
#'
#' @return the processed data_frame with a new column .color
#'
#' @examples
#' NULL
add_color <- function(x, ids, by,
                      color_header = "#bdbdbd",
                      color_other = "#d0d0d0",
                      color_missing = "#ffffff") {
  # Repeated ids must share one color; de-duplicating also avoids spending
  # palette entries on names that are never looked up.
  ids <- unique(ids)

  colors <- c(
    color_header,
    scales::brewer_pal(type = "qual", palette = "Set1")(length(ids))
  )
  names(colors) <- c(".header", ids)

  x %>%
    mutate(
      .color = ifelse(is.na(.val), color_missing, colors[.id]),
      .color = ifelse(.col %in% by, .color, color_other)
    )
}