| deparse(substitute(x)), | deparse(substitute(x)), | ||||
| deparse(substitute(y))) | deparse(substitute(y))) | ||||
| ll <- preprocess_data(x, y, by = names(x)) | |||||
| if (type %in% c("union", "intersect", "setdiff")) { | |||||
| x <- dplyr::distinct(x) | |||||
| y <- dplyr::distinct(y) | |||||
| } | |||||
| if (type == "union_all") { | |||||
| ll <- preprocess_data(x, y, by = names(x), fill = FALSE) | |||||
| ll <- lapply(ll, function(a) | |||||
| a %>% mutate(.id_long = paste(.id_long, .side, sep = "-")) | |||||
| ) | |||||
| } else { | |||||
| ll <- preprocess_data(x, y, by = names(x)) | |||||
| } | |||||
| step0 <- bind_rows(ll$x, ll$y) %>% mutate(.frame = 0, .alpha = 1) | step0 <- bind_rows(ll$x, ll$y) %>% mutate(.frame = 0, .alpha = 1) | ||||
| if (!export %in% c("gif", "first", "last")) | if (!export %in% c("gif", "first", "last")) | ||||
| stop("export must be either gif, first, or last") | stop("export must be either gif, first, or last") | ||||
| title <- sprintf(paste0(type, "(%s, %s, by = c(\"%s\"))"), | |||||
| by_args <- ifelse(length(by) == 1, | |||||
| sprintf("\"%s\"", by), | |||||
| sprintf("c(\"%s\")", paste(by, collapse = "\", \"")) | |||||
| ) | |||||
| title <- sprintf(paste0(type, "(%s, %s, by = %s)"), | |||||
| deparse(substitute(x)), | deparse(substitute(x)), | ||||
| deparse(substitute(y)), | deparse(substitute(y)), | ||||
| paste(by, collapse = "\", \"")) | |||||
| by_args) | |||||
| if (type %in% c("semi_join", "anti_join")) { | |||||
| # for semi and anti_joins, there is no adding of multiple rows | |||||
| y <- dplyr::distinct(y) | |||||
| } | |||||
| ll <- preprocess_data(x, y, by) | ll <- preprocess_data(x, y, by) | ||||
| #' NULL | #' NULL | ||||
| move_together <- function(lhs, rhs, type) { | move_together <- function(lhs, rhs, type) { | ||||
| all_ids <- bind_rows(lhs, rhs) %>% distinct(.id) | |||||
| all <- bind_rows(lhs, rhs) | all <- bind_rows(lhs, rhs) | ||||
| x_cols <- lhs %>% distinct(col) | |||||
| y_cols <- rhs %>% distinct(col) | |||||
| # separate column and row-filter (ids) | |||||
| x_cols <- lhs %>% distinct(.col) | |||||
| y_cols <- rhs %>% distinct(.col) | |||||
| # separate header columns from ids and treat them as columns | |||||
| x_ids <- lhs %>% distinct(.id, .id_long) | |||||
| y_ids <- rhs %>% distinct(.id, .id_long) | |||||
| x_ids <- lhs %>% distinct(.id) | |||||
| y_ids <- rhs %>% distinct(.id) | |||||
| x_headers <- x_ids %>% filter(str_detect(.id_long, "^\\.header")) | |||||
| y_headers <- y_ids %>% filter(str_detect(.id_long, "^\\.header")) | |||||
| x_ids <- x_ids %>% filter(!str_detect(.id_long, "^\\.header")) | |||||
| y_ids <- y_ids %>% filter(!str_detect(.id_long, "^\\.header")) | |||||
| if (type == "full_join") { | if (type == "full_join") { | ||||
| col_combiner <- dplyr::full_join | col_combiner <- dplyr::full_join | ||||
| col_combiner <- dplyr::full_join | col_combiner <- dplyr::full_join | ||||
| row_combiner <- dplyr::right_join | row_combiner <- dplyr::right_join | ||||
| } else if (type == "semi_join") { | } else if (type == "semi_join") { | ||||
| col_combiner <- dplyr::semi_join | |||||
| col_combiner <- dplyr::left_join | |||||
| row_combiner <- dplyr::semi_join | row_combiner <- dplyr::semi_join | ||||
| } else if (type == "anti_join") { | } else if (type == "anti_join") { | ||||
| col_combiner <- dplyr::semi_join | col_combiner <- dplyr::semi_join | ||||
| } else if (type == "union_all") { | } else if (type == "union_all") { | ||||
| col_combiner <- dplyr::full_join | col_combiner <- dplyr::full_join | ||||
| row_combiner <- dplyr::union_all | row_combiner <- dplyr::union_all | ||||
| x_ids <- lhs %>% distinct(.id = .id_long) | |||||
| y_ids <- rhs %>% distinct(.id = .id_long) | |||||
| all <- all %>% rename(id_old = .id, .id = .id_long) | |||||
| # all <- all %>% rename(.id = .id_long) | |||||
| } else if (type == "intersect") { | } else if (type == "intersect") { | ||||
| col_combiner <- dplyr::full_join | col_combiner <- dplyr::full_join | ||||
| row_combiner <- dplyr::intersect | row_combiner <- dplyr::intersect | ||||
| stop("Unknown func") | stop("Unknown func") | ||||
| } | } | ||||
| take_cols <- col_combiner(x_cols, y_cols, by = "col") | |||||
| take_ids <- row_combiner(x_ids, y_ids, by = ".id") | |||||
| # make sure .header is always the first | |||||
| id_number <- which(str_detect(take_ids$.id, "^.header")) | |||||
| if (length(id_number) != 0) | |||||
| take_ids <- take_ids[c(id_number, (1:nrow(take_ids))[-id_number]), ] | |||||
| if (!any(str_detect(take_ids$.id, "^.header"))) | |||||
| take_ids <- bind_rows(data_frame(.id = ".header"), take_ids) | |||||
| take_cols <- col_combiner(x_cols, y_cols, by = ".col") | |||||
| take_ids <- row_combiner(x_ids, y_ids, by = c(".id", ".id_long")) | |||||
| take_headers <- col_combiner(x_headers, y_headers, by = c(".id", ".id_long")) | |||||
| take_ids <- bind_rows(take_headers, take_ids) | |||||
| take <- tidyr::crossing(take_ids, take_cols) | take <- tidyr::crossing(take_ids, take_cols) | ||||
| mid <- (2 + length(unique(lhs$col)) + length(unique(rhs$col))) / 2 | |||||
| mid <- (2 + length(unique(lhs$.col)) + length(unique(rhs$.col))) / 2 | |||||
| xvals <- 1:nrow(take_cols) | xvals <- 1:nrow(take_cols) | ||||
| xvals <- xvals - mean(xvals) + mid | xvals <- xvals - mean(xvals) + mid | ||||
| names(xvals) <- take_cols %>% pull(col) | |||||
| names(xvals) <- take_cols %>% pull(.col) | |||||
| n_non_header <- sum(str_detect(take_ids$.id, "^[^\\.header]")) | |||||
| yvals <- cumsum(ifelse(str_detect(take_ids$.id, "^\\.header"), 0, -1)) | |||||
| names(yvals) <- take_ids %>% pull(.id) | |||||
| yvals <- cumsum(ifelse(str_detect(take_ids$.id_long, "^\\.header"), 0, -1)) | |||||
| names(yvals) <- take_ids %>% pull(.id_long) | |||||
| take_vals <- semi_join(all, take, by = c(".id", "col")) %>% | |||||
| take_vals <- semi_join(all, take %>% select(".id", ".col"), | |||||
| by = c(".id", ".col")) %>% | |||||
| mutate(.alpha = 1, | mutate(.alpha = 1, | ||||
| .x = xvals[col], | |||||
| .y = yvals[.id]) | |||||
| if (type == "union_all") { | |||||
| take_vals <- take_vals %>% rename(.id_long = .id, .id = id_old) | |||||
| } | |||||
| .x = xvals[.col], | |||||
| .y = yvals[.id_long]) | |||||
| res <- bind_rows( | res <- bind_rows( | ||||
| # take, | # take, | ||||
| take_vals, | take_vals, | ||||
| # fade in place: | # fade in place: | ||||
| all %>% filter(!.id %in% take_ids$.id) %>% mutate(.alpha = 0), | |||||
| all %>% filter(!.id_long %in% take_ids$.id_long) %>% mutate(.alpha = 0), | |||||
| # moving fade or fade in place as well: | # moving fade or fade in place as well: | ||||
| all %>% filter(.id %in% take_ids$.id & !col %in% take_cols$col) %>% | |||||
| all %>% filter(.id_long %in% take_ids$.id_long & !.col %in% take_cols$.col) %>% | |||||
| mutate(.alpha = 0) | mutate(.alpha = 0) | ||||
| ) | ) | ||||
| return(res) | return(res) |
| } | } | ||||
| if (!".alpha" %in% names(d)) d <- d %>% mutate(.alpha = 1) | if (!".alpha" %in% names(d)) d <- d %>% mutate(.alpha = 1) | ||||
| ggplot(d, aes(x = .x, group = .id_long, y = .y, fill = .color, alpha = .alpha)) + | |||||
| d <- d %>% mutate(.item_id = paste(.id_long, .col, sep = "-")) | |||||
| ggplot(d, aes(x = .x, group = .item_id, y = .y, fill = .color, alpha = .alpha)) + | |||||
| geom_tile(width = 0.9, height = 0.9) + | geom_tile(width = 0.9, height = 0.9) + | ||||
| coord_equal() + | coord_equal() + | ||||
| geom_text(data = d %>% filter(!is.na(val)), aes(label = val), color = "white", | |||||
| geom_text(data = d %>% filter(!is.na(.val)), aes(label = .val), color = "white", | |||||
| family = text_family, size = text_size) + | family = text_family, size = text_size) + | ||||
| scale_fill_identity() + | scale_fill_identity() + | ||||
| scale_alpha_identity() + | scale_alpha_identity() + |
| #' @param x a left dataset | #' @param x a left dataset | ||||
| #' @param y a right dataset | #' @param y a right dataset | ||||
| #' @param by a by argument for joins / set operations | #' @param by a by argument for joins / set operations | ||||
| #' @param fill if missing ids should be filled | |||||
| #' | #' | ||||
| #' @return a preprocessed dataset | #' @return a preprocessed dataset | ||||
| #' | #' | ||||
| #' @examples | #' @examples | ||||
| #' NULL | #' NULL | ||||
| preprocess_data <- function(x, y, by) { | |||||
| preprocess_data <- function(x, y, by, fill = TRUE) { | |||||
| xvars <- names(x) %>% str_subset("^[^\\.]") | |||||
| yvars <- names(y) %>% str_subset("^[^\\.]") | |||||
| #' test for | |||||
| #' a <- c("unique", "mult", "mult", "also unique") | |||||
| #' add_duplicate_number(a) | |||||
| add_duplicate_number <- function(a) { | |||||
| data_frame(v = a) %>% | |||||
| group_by(v) %>% | |||||
| mutate(id = paste(v, 1:n(), sep = "-")) %>% | |||||
| pull(id) | |||||
| } | |||||
| x <- x %>% | x <- x %>% | ||||
| unite(one_of(by), col = ".id", remove = FALSE) %>% | unite(one_of(by), col = ".id", remove = FALSE) %>% | ||||
| unite(one_of(xvars), col = ".id_long", remove = FALSE) | |||||
| mutate(.id_long = add_duplicate_number(.id)) | |||||
| y <- y %>% | y <- y %>% | ||||
| unite(one_of(by), col = ".id", remove = FALSE) %>% | |||||
| unite(one_of(yvars), col = ".id_long", remove = FALSE) | |||||
| unite(one_of(by), col = ".id", remove = FALSE) %>% | |||||
| mutate(.id_long = add_duplicate_number(.id)) | |||||
| ids <- unique(c(x$.id, y$.id)) | |||||
| ids <- dplyr::union(x %>% dplyr::select(.id, .id_long), | |||||
| y %>% dplyr::select(.id, .id_long)) | |||||
| x_ <- process_data(x, ids, by) %>% | |||||
| mutate(.id_long = paste(.id_long, .side, .r, sep = "_")) | |||||
| y_ <- process_data(y, ids, by) %>% | |||||
| mutate(.x = .x + ncol(x), | |||||
| .id_long = paste(.id_long, .side, .r, sep = "_")) | |||||
| x_ <- process_data(x, ids, by, fill = fill) | |||||
| y_ <- process_data(y, ids, by, fill = fill) %>% | |||||
| mutate(.x = .x + ncol(x) - 1) | |||||
| return(list(x = x_, y = y_)) | return(list(x = x_, y = y_)) | ||||
| } | } | ||||
| #' Processes the data | #' Processes the data | ||||
| #' | #' | ||||
| #' @param x a preprocessed dataset | #' @param x a preprocessed dataset | ||||
| #' @param ids a vector of ids | |||||
| #' @param ids a data_frame of ids (.id and .id_long) | |||||
| #' @param by a vector of by-arguments | #' @param by a vector of by-arguments | ||||
| #' @param width the width of the tiles | #' @param width the width of the tiles | ||||
| #' @param side the side (x or y, lhs or rhs, etc) | #' @param side the side (x or y, lhs or rhs, etc) | ||||
| #' @param fill if missing ids should be filled | |||||
| #' | #' | ||||
| #' @return a data_frame including all necessary information | #' @return a data_frame including all necessary information | ||||
| #' | #' | ||||
| #' @examples | #' @examples | ||||
| #' NULL | #' NULL | ||||
| process_data <- function(x, ids, by, width = 1, side = NA) { | |||||
| process_data <- function(x, ids, by, width = 1, side = NA, fill = TRUE) { | |||||
| if (is.na(side)) side <- deparse(substitute(x)) | if (is.na(side)) side <- deparse(substitute(x)) | ||||
| x_names <- names(x) %>% str_subset("^[^\\.]") | x_names <- names(x) %>% str_subset("^[^\\.]") | ||||
| x <- x %>% | x <- x %>% | ||||
| mutate(.r = row_number()) %>% | mutate(.r = row_number()) %>% | ||||
| gather_(key = "col", value = "val", names(x) %>% str_subset("^[^.]")) %>% | |||||
| mutate(.x = x_keys[col], | |||||
| gather_(key = ".col", value = ".val", names(x) %>% str_subset("^[^.]")) %>% | |||||
| mutate(.x = x_keys[.col], | |||||
| .y = -.r) %>% | .y = -.r) %>% | ||||
| bind_rows(data_frame(.id = ".header", | bind_rows(data_frame(.id = ".header", | ||||
| .id_long = paste(".header", x_names, sep = "_"), | .id_long = paste(".header", x_names, sep = "_"), | ||||
| .r = 0, col = x_names, val = x_names, | |||||
| .r = 0, | |||||
| .col = x_names, | |||||
| .val = x_names, | |||||
| .x = x_keys, .y = 0), .) %>% | .x = x_keys, .y = 0), .) %>% | ||||
| mutate(.width = width, | mutate(.width = width, | ||||
| .side = side) | .side = side) | ||||
| add_color(x, ids, by) | |||||
| # if there are multiple values in the ids (-2, -3 etc) but they are not present | |||||
| # in x, because it is in the second/other dataset, add these values here | |||||
| id_long <- ids$.id_long | |||||
| mis_ids <- id_long[!id_long %in% x$.id_long] | |||||
| # if the missing value is a -1, that means the missing value comes not from | |||||
| # missing dublicate ids | |||||
| mis_ids <- str_subset(mis_ids, "[^-1]$") | |||||
| if (length(mis_ids) > 0 && fill) { | |||||
| mis_ids_short <- str_replace(mis_ids, "-[0-9]+$", "") | |||||
| # insert the missing ids at the right place | |||||
| for (i in mis_ids_short) { | |||||
| irow <- (1:nrow(x))[x$.id == i] | |||||
| irow <- irow[1] | |||||
| x <- bind_rows( | |||||
| x %>% slice(1:irow), | |||||
| x %>% filter(.id %in% mis_ids_short) %>% mutate(.id_long = mis_ids), | |||||
| x %>% slice((irow + 1):nrow(x)) | |||||
| ) | |||||
| } | |||||
| } | |||||
| res <- add_color(x, ids$.id, by) | |||||
| return(res) | |||||
| } | } | ||||
| #' Adds Color to a processed data_frame | #' Adds Color to a processed data_frame | ||||
| colors <- c(color_header, scales::brewer_pal(type = "qual", "Set1")(length(ids))) | colors <- c(color_header, scales::brewer_pal(type = "qual", "Set1")(length(ids))) | ||||
| names(colors) <- c(".header", ids) | names(colors) <- c(".header", ids) | ||||
| x %>% | |||||
| mutate(.color = ifelse(is.na(val), color_missing, colors[.id]), | |||||
| .color = ifelse(col %in% by, .color, color_other)) | |||||
| res <- x %>% | |||||
| mutate(.color = ifelse(is.na(.val), color_missing, colors[.id]), | |||||
| .color = ifelse(.col %in% by, .color, color_other)) | |||||
| return(res) | |||||
| } | } |
| > All rows from `x`, and all columns from `x` and `y`. Rows in `x` with no match in `y` will have `NA` values in the new columns. | > All rows from `x`, and all columns from `x` and `y`. Rows in `x` with no match in `y` will have `NA` values in the new columns. | ||||
| ```{r left-join, echo=T} | ```{r left-join, echo=T} | ||||
| animate_inner_join(x, y, by = "id") | |||||
| animate_left_join(x, y, by = "id") | |||||
| ``` | ``` | ||||
| > ... If there are multiple matches between `x` and `y`, all combinations of the matches are returned. | > ... If there are multiple matches between `x` and `y`, all combinations of the matches are returned. | ||||
| ```{r left-join-extra} | |||||
| # Not yet working | |||||
| # source("R/left_join_extra.R") | |||||
| ```{r left-join-extra, echo=T} | |||||
| y_extra <- bind_rows(y, data_frame(id = 2, y = "y5")) | y_extra <- bind_rows(y, data_frame(id = 2, y = "y5")) | ||||
| ``` | |||||
| y_extra # has multiple rows with the key from `x` | |||||
|  | |||||
| animate_left_join(x, y_extra, by = "id") | |||||
| ``` | |||||
| ```{r echo=TRUE} | ```{r echo=TRUE} | ||||
| y_extra # has multiple rows with the key from `x` | |||||
| left_join(x, y_extra, by = "id") | left_join(x, y_extra, by = "id") | ||||
| ``` | ``` | ||||
| > All rows from `x` where there are not matching values in `y`, keeping just columns from `x`. | > All rows from `x` where there are not matching values in `y`, keeping just columns from `x`. | ||||
| ```{r anti-join} | |||||
| ```{r anti-join, echo=T} | |||||
| animate_anti_join(x, y, by = "id") | animate_anti_join(x, y, by = "id") | ||||
| ``` | ``` | ||||
| > All unique rows from `x` and `y`. | > All unique rows from `x` and `y`. | ||||
| ```{r union, export=T} | |||||
| ```{r union, echo=T} | |||||
| animate_union(x, y) | animate_union(x, y) | ||||
| ``` | ``` | ||||
| > no match in `y` will have `NA` values in the new columns. | > no match in `y` will have `NA` values in the new columns. | ||||
| ``` r | ``` r | ||||
| animate_inner_join(x, y, by = "id") | |||||
| animate_left_join(x, y, by = "id") | |||||
| ``` | ``` | ||||
| <!-- --> | <!-- --> | ||||
| > … If there are multiple matches between `x` and `y`, all combinations | > … If there are multiple matches between `x` and `y`, all combinations | ||||
| > of the matches are returned. | > of the matches are returned. | ||||
|  | |||||
| ``` r | ``` r | ||||
| y_extra <- bind_rows(y, data_frame(id = 2, y = "y5")) | |||||
| y_extra # has multiple rows with the key from `x` | y_extra # has multiple rows with the key from `x` | ||||
| #> # A tibble: 4 x 2 | #> # A tibble: 4 x 2 | ||||
| #> id y | #> id y | ||||
| #> 2 2 y2 | #> 2 2 y2 | ||||
| #> 3 4 y4 | #> 3 4 y4 | ||||
| #> 4 2 y5 | #> 4 2 y5 | ||||
| animate_left_join(x, y_extra, by = "id") | |||||
| ``` | |||||
| <!-- --> | |||||
| ``` r | |||||
| left_join(x, y_extra, by = "id") | left_join(x, y_extra, by = "id") | ||||
| #> # A tibble: 4 x 3 | #> # A tibble: 4 x 3 | ||||
| #> id x y | #> id x y | ||||
| > All rows from `x` where there are not matching values in `y`, keeping | > All rows from `x` where there are not matching values in `y`, keeping | ||||
| > just columns from `x`. | > just columns from `x`. | ||||
| ``` r | |||||
| animate_anti_join(x, y, by = "id") | |||||
| ``` | |||||
| <!-- --> | <!-- --> | ||||
| ``` r | ``` r | ||||
| > All unique rows from `x` and `y`. | > All unique rows from `x` and `y`. | ||||
| ``` r | |||||
| animate_union(x, y) | |||||
| ``` | |||||
| <!-- --> | <!-- --> | ||||
| ``` r | ``` r |
| \alias{preprocess_data} | \alias{preprocess_data} | ||||
| \title{Preprocess data} | \title{Preprocess data} | ||||
| \usage{ | \usage{ | ||||
| preprocess_data(x, y, by) | |||||
| preprocess_data(x, y, by, fill = FALSE) | |||||
| } | } | ||||
| \arguments{ | \arguments{ | ||||
| \item{x}{a left dataset} | \item{x}{a left dataset} | ||||
| \item{y}{a right dataset} | \item{y}{a right dataset} | ||||
| \item{by}{a by argument for joins / set operations} | \item{by}{a by argument for joins / set operations} | ||||
| \item{fill}{if missing ids should be filled} | |||||
| } | } | ||||
| \value{ | \value{ | ||||
| a preprocessed dataset | a preprocessed dataset | ||||
| } | } | ||||
| \examples{ | \examples{ | ||||
| NULL | NULL | ||||
| test for | |||||
| a <- c("unique", "mult", "mult", "also unique") | |||||
| add_duplicate_number(a) | |||||
| } | } |
| \alias{process_data} | \alias{process_data} | ||||
| \title{Processes the data} | \title{Processes the data} | ||||
| \usage{ | \usage{ | ||||
| process_data(x, ids, by, width = 1, side = NA) | |||||
| process_data(x, ids, by, width = 1, side = NA, fill = TRUE) | |||||
| } | } | ||||
| \arguments{ | \arguments{ | ||||
| \item{x}{a preprocessed dataset} | \item{x}{a preprocessed dataset} | ||||
| \item{ids}{a vector of ids} | |||||
| \item{ids}{a data_frame of ids (.id and .id_long)} | |||||
| \item{by}{a vector of by-arguments} | \item{by}{a vector of by-arguments} | ||||
| \item{width}{the width of the tiles} | \item{width}{the width of the tiles} | ||||
| \item{side}{the side (x or y, lhs or rhs, etc)} | \item{side}{the side (x or y, lhs or rhs, etc)} | ||||
| \item{fill}{if missing ids should be filled} | |||||
| } | } | ||||
| \value{ | \value{ | ||||
| a data_frame including all necessary information | a data_frame including all necessary information |