Просмотр исходного кода

all joins and sets properly working

pull/10/head
David 7 лет назад
Родитель
Сommit
a29a10f3ae
8 измененных файлов: 146 добавлений и 75 удалений
  1. +25
    -3
      R/animate_helpers.R
  2. +28
    -34
      R/move_together.R
  3. +4
    -2
      R/plot_helpers.R
  4. +55
    -21
      R/process_data_helpers.R
  5. +7
    -9
      README.Rmd
  6. +17
    -3
      README.md
  7. +6
    -1
      man/preprocess_data.Rd
  8. +4
    -2
      man/process_data.Rd

+ 25
- 3
R/animate_helpers.R Просмотреть файл

deparse(substitute(x)), deparse(substitute(x)),
deparse(substitute(y))) deparse(substitute(y)))


ll <- preprocess_data(x, y, by = names(x))
if (type %in% c("union", "intersect", "setdiff")) {
x <- dplyr::distinct(x)
y <- dplyr::distinct(y)
}

if (type == "union_all") {
ll <- preprocess_data(x, y, by = names(x), fill = FALSE)
ll <- lapply(ll, function(a)
a %>% mutate(.id_long = paste(.id_long, .side, sep = "-"))
)
} else {
ll <- preprocess_data(x, y, by = names(x))
}


step0 <- bind_rows(ll$x, ll$y) %>% mutate(.frame = 0, .alpha = 1) step0 <- bind_rows(ll$x, ll$y) %>% mutate(.frame = 0, .alpha = 1)


if (!export %in% c("gif", "first", "last")) if (!export %in% c("gif", "first", "last"))
stop("export must be either gif, first, or last") stop("export must be either gif, first, or last")


title <- sprintf(paste0(type, "(%s, %s, by = c(\"%s\"))"),
by_args <- ifelse(length(by) == 1,
sprintf("\"%s\"", by),
sprintf("c(\"%s\")", paste(by, collapse = "\", \""))
)

title <- sprintf(paste0(type, "(%s, %s, by = %s)"),
deparse(substitute(x)), deparse(substitute(x)),
deparse(substitute(y)), deparse(substitute(y)),
paste(by, collapse = "\", \""))
by_args)

if (type %in% c("semi_join", "anti_join")) {
# for semi and anti_joins, there is no adding of multiple rows
y <- dplyr::distinct(y)
}


ll <- preprocess_data(x, y, by) ll <- preprocess_data(x, y, by)



+ 28
- 34
R/move_together.R Просмотреть файл

#' NULL #' NULL
move_together <- function(lhs, rhs, type) { move_together <- function(lhs, rhs, type) {


all_ids <- bind_rows(lhs, rhs) %>% distinct(.id)

all <- bind_rows(lhs, rhs) all <- bind_rows(lhs, rhs)


x_cols <- lhs %>% distinct(col)
y_cols <- rhs %>% distinct(col)
# separate column and row-filter (ids)
x_cols <- lhs %>% distinct(.col)
y_cols <- rhs %>% distinct(.col)

# separate header columns from ids and treat them as columns
x_ids <- lhs %>% distinct(.id, .id_long)
y_ids <- rhs %>% distinct(.id, .id_long)


x_ids <- lhs %>% distinct(.id)
y_ids <- rhs %>% distinct(.id)
x_headers <- x_ids %>% filter(str_detect(.id_long, "^\\.header"))
y_headers <- y_ids %>% filter(str_detect(.id_long, "^\\.header"))

x_ids <- x_ids %>% filter(!str_detect(.id_long, "^\\.header"))
y_ids <- y_ids %>% filter(!str_detect(.id_long, "^\\.header"))


if (type == "full_join") { if (type == "full_join") {
col_combiner <- dplyr::full_join col_combiner <- dplyr::full_join
col_combiner <- dplyr::full_join col_combiner <- dplyr::full_join
row_combiner <- dplyr::right_join row_combiner <- dplyr::right_join
} else if (type == "semi_join") { } else if (type == "semi_join") {
col_combiner <- dplyr::semi_join
col_combiner <- dplyr::left_join
row_combiner <- dplyr::semi_join row_combiner <- dplyr::semi_join
} else if (type == "anti_join") { } else if (type == "anti_join") {
col_combiner <- dplyr::semi_join col_combiner <- dplyr::semi_join
} else if (type == "union_all") { } else if (type == "union_all") {
col_combiner <- dplyr::full_join col_combiner <- dplyr::full_join
row_combiner <- dplyr::union_all row_combiner <- dplyr::union_all

x_ids <- lhs %>% distinct(.id = .id_long)
y_ids <- rhs %>% distinct(.id = .id_long)
all <- all %>% rename(id_old = .id, .id = .id_long)
# all <- all %>% rename(.id = .id_long)
} else if (type == "intersect") { } else if (type == "intersect") {
col_combiner <- dplyr::full_join col_combiner <- dplyr::full_join
row_combiner <- dplyr::intersect row_combiner <- dplyr::intersect
stop("Unknown func") stop("Unknown func")
} }


take_cols <- col_combiner(x_cols, y_cols, by = "col")
take_ids <- row_combiner(x_ids, y_ids, by = ".id")
# make sure .header is always the first
id_number <- which(str_detect(take_ids$.id, "^.header"))
if (length(id_number) != 0)
take_ids <- take_ids[c(id_number, (1:nrow(take_ids))[-id_number]), ]
if (!any(str_detect(take_ids$.id, "^.header")))
take_ids <- bind_rows(data_frame(.id = ".header"), take_ids)
take_cols <- col_combiner(x_cols, y_cols, by = ".col")
take_ids <- row_combiner(x_ids, y_ids, by = c(".id", ".id_long"))
take_headers <- col_combiner(x_headers, y_headers, by = c(".id", ".id_long"))

take_ids <- bind_rows(take_headers, take_ids)


take <- tidyr::crossing(take_ids, take_cols) take <- tidyr::crossing(take_ids, take_cols)


mid <- (2 + length(unique(lhs$col)) + length(unique(rhs$col))) / 2
mid <- (2 + length(unique(lhs$.col)) + length(unique(rhs$.col))) / 2
xvals <- 1:nrow(take_cols) xvals <- 1:nrow(take_cols)
xvals <- xvals - mean(xvals) + mid xvals <- xvals - mean(xvals) + mid
names(xvals) <- take_cols %>% pull(col)
names(xvals) <- take_cols %>% pull(.col)


n_non_header <- sum(str_detect(take_ids$.id, "^[^\\.header]"))
yvals <- cumsum(ifelse(str_detect(take_ids$.id, "^\\.header"), 0, -1))
names(yvals) <- take_ids %>% pull(.id)
yvals <- cumsum(ifelse(str_detect(take_ids$.id_long, "^\\.header"), 0, -1))
names(yvals) <- take_ids %>% pull(.id_long)


take_vals <- semi_join(all, take, by = c(".id", "col")) %>%
take_vals <- semi_join(all, take %>% select(".id", ".col"),
by = c(".id", ".col")) %>%
mutate(.alpha = 1, mutate(.alpha = 1,
.x = xvals[col],
.y = yvals[.id])

if (type == "union_all") {
take_vals <- take_vals %>% rename(.id_long = .id, .id = id_old)
}
.x = xvals[.col],
.y = yvals[.id_long])


res <- bind_rows( res <- bind_rows(
# take, # take,
take_vals, take_vals,
# fade in place: # fade in place:
all %>% filter(!.id %in% take_ids$.id) %>% mutate(.alpha = 0),
all %>% filter(!.id_long %in% take_ids$.id_long) %>% mutate(.alpha = 0),
# moving fade or fade in place as well: # moving fade or fade in place as well:
all %>% filter(.id %in% take_ids$.id & !col %in% take_cols$col) %>%
all %>% filter(.id_long %in% take_ids$.id_long & !.col %in% take_cols$.col) %>%
mutate(.alpha = 0) mutate(.alpha = 0)
) )
return(res) return(res)

+ 4
- 2
R/plot_helpers.R Просмотреть файл

} }


if (!".alpha" %in% names(d)) d <- d %>% mutate(.alpha = 1) if (!".alpha" %in% names(d)) d <- d %>% mutate(.alpha = 1)
ggplot(d, aes(x = .x, group = .id_long, y = .y, fill = .color, alpha = .alpha)) +
d <- d %>% mutate(.item_id = paste(.id_long, .col, sep = "-"))

ggplot(d, aes(x = .x, group = .item_id, y = .y, fill = .color, alpha = .alpha)) +
geom_tile(width = 0.9, height = 0.9) + geom_tile(width = 0.9, height = 0.9) +
coord_equal() + coord_equal() +
geom_text(data = d %>% filter(!is.na(val)), aes(label = val), color = "white",
geom_text(data = d %>% filter(!is.na(.val)), aes(label = .val), color = "white",
family = text_family, size = text_size) + family = text_family, size = text_size) +
scale_fill_identity() + scale_fill_identity() +
scale_alpha_identity() + scale_alpha_identity() +

+ 55
- 21
R/process_data_helpers.R Просмотреть файл

#' @param x a left dataset #' @param x a left dataset
#' @param y a right dataset #' @param y a right dataset
#' @param by a by argument for joins / set operations #' @param by a by argument for joins / set operations
#' @param fill if missing ids should be filled
#' #'
#' @return a preprocessed dataset #' @return a preprocessed dataset
#' #'
#' @examples #' @examples
#' NULL #' NULL
preprocess_data <- function(x, y, by) {
preprocess_data <- function(x, y, by, fill = TRUE) {


xvars <- names(x) %>% str_subset("^[^\\.]")
yvars <- names(y) %>% str_subset("^[^\\.]")
#' test for
#' a <- c("unique", "mult", "mult", "also unique")
#' add_duplicate_number(a)
add_duplicate_number <- function(a) {
data_frame(v = a) %>%
group_by(v) %>%
mutate(id = paste(v, 1:n(), sep = "-")) %>%
pull(id)
}


x <- x %>% x <- x %>%
unite(one_of(by), col = ".id", remove = FALSE) %>% unite(one_of(by), col = ".id", remove = FALSE) %>%
unite(one_of(xvars), col = ".id_long", remove = FALSE)
mutate(.id_long = add_duplicate_number(.id))


y <- y %>% y <- y %>%
unite(one_of(by), col = ".id", remove = FALSE) %>%
unite(one_of(yvars), col = ".id_long", remove = FALSE)
unite(one_of(by), col = ".id", remove = FALSE) %>%
mutate(.id_long = add_duplicate_number(.id))


ids <- unique(c(x$.id, y$.id))
ids <- dplyr::union(x %>% dplyr::select(.id, .id_long),
y %>% dplyr::select(.id, .id_long))


x_ <- process_data(x, ids, by) %>%
mutate(.id_long = paste(.id_long, .side, .r, sep = "_"))
y_ <- process_data(y, ids, by) %>%
mutate(.x = .x + ncol(x),
.id_long = paste(.id_long, .side, .r, sep = "_"))
x_ <- process_data(x, ids, by, fill = fill)
y_ <- process_data(y, ids, by, fill = fill) %>%
mutate(.x = .x + ncol(x) - 1)


return(list(x = x_, y = y_)) return(list(x = x_, y = y_))
} }
#' Processes the data #' Processes the data
#' #'
#' @param x a preprocessed dataset #' @param x a preprocessed dataset
#' @param ids a vector of ids
#' @param ids a data_frame of ids (.id and .id_long)
#' @param by a vector of by-arguments #' @param by a vector of by-arguments
#' @param width the width of the tiles #' @param width the width of the tiles
#' @param side the side (x or y, lhs or rhs, etc) #' @param side the side (x or y, lhs or rhs, etc)
#' @param fill if missing ids should be filled
#' #'
#' @return a data_frame including all necessary information #' @return a data_frame including all necessary information
#' #'
#' @examples #' @examples
#' NULL #' NULL
process_data <- function(x, ids, by, width = 1, side = NA) {
process_data <- function(x, ids, by, width = 1, side = NA, fill = TRUE) {
if (is.na(side)) side <- deparse(substitute(x)) if (is.na(side)) side <- deparse(substitute(x))


x_names <- names(x) %>% str_subset("^[^\\.]") x_names <- names(x) %>% str_subset("^[^\\.]")


x <- x %>% x <- x %>%
mutate(.r = row_number()) %>% mutate(.r = row_number()) %>%
gather_(key = "col", value = "val", names(x) %>% str_subset("^[^.]")) %>%
mutate(.x = x_keys[col],
gather_(key = ".col", value = ".val", names(x) %>% str_subset("^[^.]")) %>%
mutate(.x = x_keys[.col],
.y = -.r) %>% .y = -.r) %>%
bind_rows(data_frame(.id = ".header", bind_rows(data_frame(.id = ".header",
.id_long = paste(".header", x_names, sep = "_"), .id_long = paste(".header", x_names, sep = "_"),
.r = 0, col = x_names, val = x_names,
.r = 0,
.col = x_names,
.val = x_names,
.x = x_keys, .y = 0), .) %>% .x = x_keys, .y = 0), .) %>%
mutate(.width = width, mutate(.width = width,
.side = side) .side = side)


add_color(x, ids, by)
# if there are multiple values in the ids (-2, -3 etc) but they are not present
# in x, because it is in the second/other dataset, add these values here
id_long <- ids$.id_long
mis_ids <- id_long[!id_long %in% x$.id_long]
# if the missing value is a -1, that means the missing value comes not from
# missing dublicate ids
mis_ids <- str_subset(mis_ids, "[^-1]$")
if (length(mis_ids) > 0 && fill) {
mis_ids_short <- str_replace(mis_ids, "-[0-9]+$", "")

# insert the missing ids at the right place
for (i in mis_ids_short) {
irow <- (1:nrow(x))[x$.id == i]
irow <- irow[1]
x <- bind_rows(
x %>% slice(1:irow),
x %>% filter(.id %in% mis_ids_short) %>% mutate(.id_long = mis_ids),
x %>% slice((irow + 1):nrow(x))
)
}
}

res <- add_color(x, ids$.id, by)
return(res)
} }


#' Adds Color to a processed data_frame #' Adds Color to a processed data_frame
colors <- c(color_header, scales::brewer_pal(type = "qual", "Set1")(length(ids))) colors <- c(color_header, scales::brewer_pal(type = "qual", "Set1")(length(ids)))
names(colors) <- c(".header", ids) names(colors) <- c(".header", ids)


x %>%
mutate(.color = ifelse(is.na(val), color_missing, colors[.id]),
.color = ifelse(col %in% by, .color, color_other))
res <- x %>%
mutate(.color = ifelse(is.na(.val), color_missing, colors[.id]),
.color = ifelse(.col %in% by, .color, color_other))
return(res)
} }

+ 7
- 9
README.Rmd Просмотреть файл

> All rows from `x`, and all columns from `x` and `y`. Rows in `x` with no match in `y` will have `NA` values in the new columns. > All rows from `x`, and all columns from `x` and `y`. Rows in `x` with no match in `y` will have `NA` values in the new columns.


```{r left-join, echo=T} ```{r left-join, echo=T}
animate_inner_join(x, y, by = "id")
animate_left_join(x, y, by = "id")
``` ```






> ... If there are multiple matches between `x` and `y`, all combinations of the matches are returned. > ... If there are multiple matches between `x` and `y`, all combinations of the matches are returned.


```{r left-join-extra}
# Not yet working
# source("R/left_join_extra.R")
```{r left-join-extra, echo=T}
y_extra <- bind_rows(y, data_frame(id = 2, y = "y5")) y_extra <- bind_rows(y, data_frame(id = 2, y = "y5"))
```
y_extra # has multiple rows with the key from `x`


![](images/left-join-extra.gif)
animate_left_join(x, y_extra, by = "id")
```


```{r echo=TRUE} ```{r echo=TRUE}
y_extra # has multiple rows with the key from `x`
left_join(x, y_extra, by = "id") left_join(x, y_extra, by = "id")
``` ```




> All rows from `x` where there are not matching values in `y`, keeping just columns from `x`. > All rows from `x` where there are not matching values in `y`, keeping just columns from `x`.


```{r anti-join}
```{r anti-join, echo=T}
animate_anti_join(x, y, by = "id") animate_anti_join(x, y, by = "id")
``` ```




> All unique rows from `x` and `y`. > All unique rows from `x` and `y`.


```{r union, export=T}
```{r union, echo=T}
animate_union(x, y) animate_union(x, y)
``` ```



+ 17
- 3
README.md Просмотреть файл

> no match in `y` will have `NA` values in the new columns. > no match in `y` will have `NA` values in the new columns.


``` r ``` r
animate_inner_join(x, y, by = "id")
animate_left_join(x, y, by = "id")
``` ```


![](README_files/figure-gfm/left-join-1.gif)<!-- --> ![](README_files/figure-gfm/left-join-1.gif)<!-- -->
> … If there are multiple matches between `x` and `y`, all combinations > … If there are multiple matches between `x` and `y`, all combinations
> of the matches are returned. > of the matches are returned.


![](images/left-join-extra.gif)

``` r ``` r
y_extra <- bind_rows(y, data_frame(id = 2, y = "y5"))
y_extra # has multiple rows with the key from `x` y_extra # has multiple rows with the key from `x`
#> # A tibble: 4 x 2 #> # A tibble: 4 x 2
#> id y #> id y
#> 2 2 y2 #> 2 2 y2
#> 3 4 y4 #> 3 4 y4
#> 4 2 y5 #> 4 2 y5

animate_left_join(x, y_extra, by = "id")
```

![](README_files/figure-gfm/left-join-extra-1.gif)<!-- -->

``` r
left_join(x, y_extra, by = "id") left_join(x, y_extra, by = "id")
#> # A tibble: 4 x 3 #> # A tibble: 4 x 3
#> id x y #> id x y
> All rows from `x` where there are not matching values in `y`, keeping > All rows from `x` where there are not matching values in `y`, keeping
> just columns from `x`. > just columns from `x`.


``` r
animate_anti_join(x, y, by = "id")
```

![](README_files/figure-gfm/anti-join-1.gif)<!-- --> ![](README_files/figure-gfm/anti-join-1.gif)<!-- -->


``` r ``` r


> All unique rows from `x` and `y`. > All unique rows from `x` and `y`.


``` r
animate_union(x, y)
```

![](README_files/figure-gfm/union-1.gif)<!-- --> ![](README_files/figure-gfm/union-1.gif)<!-- -->


``` r ``` r

+ 6
- 1
man/preprocess_data.Rd Просмотреть файл

\alias{preprocess_data} \alias{preprocess_data}
\title{Preprocess data} \title{Preprocess data}
\usage{ \usage{
preprocess_data(x, y, by)
preprocess_data(x, y, by, fill = FALSE)
} }
\arguments{ \arguments{
\item{x}{a left dataset} \item{x}{a left dataset}
\item{y}{a right dataset} \item{y}{a right dataset}


\item{by}{a by argument for joins / set operations} \item{by}{a by argument for joins / set operations}

\item{fill}{if missing ids should be filled}
} }
\value{ \value{
a preprocessed dataset a preprocessed dataset
} }
\examples{ \examples{
NULL NULL
test for
a <- c("unique", "mult", "mult", "also unique")
add_duplicate_number(a)
} }

+ 4
- 2
man/process_data.Rd Просмотреть файл

\alias{process_data} \alias{process_data}
\title{Processes the data} \title{Processes the data}
\usage{ \usage{
process_data(x, ids, by, width = 1, side = NA)
process_data(x, ids, by, width = 1, side = NA, fill = TRUE)
} }
\arguments{ \arguments{
\item{x}{a preprocessed dataset} \item{x}{a preprocessed dataset}


\item{ids}{a vector of ids}
\item{ids}{a data_frame of ids (.id and .id_long)}


\item{by}{a vector of by-arguments} \item{by}{a vector of by-arguments}


\item{width}{the width of the tiles} \item{width}{the width of the tiles}


\item{side}{the side (x or y, lhs or rhs, etc)} \item{side}{the side (x or y, lhs or rhs, etc)}

\item{fill}{if missing ids should be filled}
} }
\value{ \value{
a data_frame including all necessary information a data_frame including all necessary information

Загрузка…
Отмена
Сохранить