#' Extract matched groups from regexp
#'
#' @param text Text to search
#' @param pattern regexp
#' @param global If `TRUE`, enables global pattern matching
#' @inheritDotParams base::regexec ignore.case perl fixed useBytes
run_regex <- function(
text,
pattern,
ignore.case = FALSE,
perl = FALSE,
fixed = FALSE,
useBytes = FALSE,
global = TRUE
) {
# Use regex to get matches by group, gives start index and length
m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y)))
attr(m, "global") <- global
if (!global) return(m)
mmi <- max_match_index(m)
if (any(!is.na(mmi))) {
subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring)
subtext[is.na(subtext)] <- ""
m2 <- run_regex(subtext, pattern, ignore.case, perl, fixed, useBytes)
for (i in seq_along(m2)) {
if (is.null(m2[[i]]$idx[[1]])) next
m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[i] - 1L
m2[[i]]$idx$pass <- m2[[i]]$idx$pass + 1L
m[[i]]$idx <- rbind(m[[i]]$idx, m2[[i]]$idx)
}
}
m
}
expand_matches <- function(m) {
if (m[1] == -1) return(list(NULL))
m_length <- attr(m, "match.length")
x <- purrr::map2(m, m_length, ~ c(.x, .x + .y))
x <- as.data.frame(do.call(rbind, x))
names(x) <- c("start", "end")
x$start <- ifelse(x$start == 0L, NA_integer_, x$start)
x$end <- ifelse(x$end == 0L, NA_integer_, x$end)
x$group <- 1:nrow(x) - 1L
x$pass <- 1L
x
}
max_match_index <- function(m) {
max_na <- function(x) if (is.null(x)) NA else max(x, na.rm = TRUE)
max_int <- function(x) as.integer(max(x))
purrr::map(m, "idx") %>%
purrr::modify_depth(1, ~c(start = max_na(.x$start), end = max_na(.x$end))) %>%
purrr::map_int(max_int)
}
#' Wrap matches in HTML span tags to colorize via CSS
#'
#' @param x Individual list item in list returned by [run_regex()]
#' @inheritParams view_regex
#' @keywords internal
wrap_result <- function(x, escape = FALSE, exact = FALSE) {
if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text)
text <- x$text
inserts <- x$idx
inserts$class <- sprintf("group g%02d", inserts$group)
inserts$pad <- 0L
names(inserts)[which(names(inserts) == "group")] <- "i"
for (j in seq_len(nrow(inserts))) {
if (inserts$i[j] == 0) next
if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
overlap <- filter(
inserts[1:(j-1), ],
.data$i != 0,
.data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j])
inserts[j, 'pad'] <- inserts$pad[j] + nrow(overlap)
}
inserts <- inserts %>%
tidyr::gather(type, loc, start:end) %>%
filter(!is.na(.data$loc)) %>%
dplyr::arrange(loc, class, dplyr::desc(type)) %>%
mutate(
class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
insert = ifelse(.data$type == 'start', sprintf('', .data$class), "")
)
inserts <- if (max(inserts$pass) == 1) {
collapse_span_inserts(inserts)
} else {
inserts %>%
tidyr::nest(-pass) %>%
mutate(data = purrr::map(data, collapse_span_inserts)) %>%
tidyr::unnest() %>%
group_by(loc, type) %>%
summarize(insert = paste(insert, collapse = "")) %>%
dplyr::ungroup()
}
# inserts now gives html (span open and close) to insert and loc
# first split text at inserts$loc locations,
# then recombine by zipping with inserts$insert text
# start at 0, unless there's a hit on first character
# end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
if (!(nchar(text) + 1) %in% idx_split)
idx_split <- c(idx_split, nchar(text) + 1)
text_split <- c()
for (k in seq_along(idx_split[-1])) {
text_split <- c(text_split, substr(text, idx_split[k], idx_split[k+1] - 1))
}
out <- c()
for (j in seq_along(text_split)) {
out <- c(
out,
ifelse(escape, escape_html(text_split[j]), text_split[j]),
if (!is.na(inserts$insert[j])) inserts$insert[j]
)
}
if (exact) out <- escape_backslash(out)
paste(out, collapse = '')
}
collapse_span_inserts <- function(inserts) {
inserts_g0 <- filter(inserts, class == "group g00")
inserts_other <- filter(inserts, class != "group g00")
dplyr::bind_rows(
filter(inserts_g0, type == "start"),
inserts_other,
filter(inserts_g0, type == "end")
) %>%
mutate(type = sprintf("%05d%s", 1:nrow(.), type)) %>%
group_by(.data$loc, .data$type) %>%
summarize(insert = paste(.data$insert, collapse = '')) %>%
dplyr::ungroup() %>%
mutate(type = sub("^\\d{5}", "", type))
}
#' Wraps capture groups in regex pattern in span tags to colorize with CSS
#'
#' @inheritParams view_regex
#' @keywords internal
wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
stopifnot(length(pattern) == 1)
if (escape) pattern <- escape_html(pattern)
# 1. walk characters in pattern
# 2. if current is open parens
# 1. walk backwards, counting number of "\\" until first non-"\\" char
# 2. If odd, then not an opening group
# 3. Look forward, if followed by "?" then not a capturing group
# 4. If capturing group then add opening " ", resi, " ", text, "Pattern
",
"", wrap_regex(pattern, escape, exact), "