#' Extract matched groups from regexp #' #' @param text Text to search #' @param pattern regexp #' @param global If `TRUE`, enables global pattern matching #' @inheritDotParams base::regexec ignore.case perl fixed useBytes run_regex <- function( text, pattern, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE, global = FALSE ) { # Use regex to get matches by group, gives start index and length m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes) m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y))) attr(m, "global") <- global if (!global) return(m) mmi <- max_match_index(m) if (any(!is.na(mmi))) { subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring) subtext[is.na(subtext)] <- "" m2 <- run_regex(subtext, pattern, ignore.case, perl, fixed, useBytes) for (i in seq_along(m2)) { if (is.null(m2[[i]]$idx[[1]])) next m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[i] - 1 m[[i]]$idx <- dplyr::bind_rows(m[[i]]$idx, m2[[i]]$idx) } } m } expand_matches <- function(m) { if (m[1] == -1) return(list(NULL)) m_length <- attr(m, "match.length") x <- purrr::map2(m, m_length, ~ c(.x, .x + .y)) x <- as.data.frame(do.call(rbind, x)) names(x) <- c("start", "end") x$start <- ifelse(x$start == 0L, NA_integer_, x$start) x$end <- ifelse(x$end == 0L, NA_integer_, x$end) x$group <- 1:nrow(x) - 1L dplyr::as_tibble(x, validate = FALSE) } max_match_index <- function(m) { max_na <- function(x) if (is.null(x)) NA else max(x, na.rm = TRUE) max_int <- function(x) as.integer(max(x)) purrr::map(m, "idx") %>% purrr::modify_depth(1, ~c(start = max_na(.x$start), end = max_na(.x$end))) %>% purrr::map_int(max_int) } #' Wrap matches in HTML span tags to colorize via CSS #' #' @param x Individual list item in list returned by [run_regex()] #' @inheritParams view_regex #' @keywords internal wrap_result <- function(x, escape = FALSE, exact = FALSE) { if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text) text <- x$text idx <- x$idx len_idx <- length(idx) inserts <- data.frame( i = 1:len_idx - 1, start = purrr::map_int(idx, ~ .[1]), end = purrr::map_int(idx, ~ .[2]) ) %>% mutate( # unmatched groups have start/end of zero start = ifelse(.data$start == 0, NA, .data$start), end = ifelse(.data$end == 0, NA, .data$end), class = sprintf("group g%02d", .data$i), pad = 0 ) for (j in seq_len(nrow(inserts))) { if (inserts$i[j] == 0) next if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next overlap <- filter( inserts[1:(j-1), ], .data$i != 0, .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j]) inserts[j, 'pad'] <- inserts$pad[j] + nrow(overlap) } inserts <- inserts %>% tidyr::gather(type, loc, start:end) %>% filter(!is.na(.data$loc)) %>% dplyr::arrange(loc, class, dplyr::desc(type)) %>% mutate( class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class), insert = ifelse(.data$type == 'start', sprintf('', .data$class), "") ) inserts_g0 <- filter(inserts, class == "group g00") inserts_other <- filter(inserts, class != "group g00") inserts <- dplyr::bind_rows( filter(inserts_g0, type == "start"), inserts_other, filter(inserts_g0, type == "end") ) %>% mutate(type = sprintf("%05d%s", 1:nrow(.), type)) %>% group_by(.data$loc, .data$type) %>% summarize(insert = paste(.data$insert, collapse = '')) %>% dplyr::ungroup() %>% mutate(type = sub("^\\d{5}", "", type)) # inserts now gives html (span open and close) to insert and loc # first split text at inserts$loc locations, # then recombine by zipping with inserts$insert text # start at 0, unless there's a hit on first character # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1 idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc) if (!(nchar(text) + 1) %in% idx_split) idx_split <- c(idx_split, nchar(text) + 1) text_split <- c() for (k in seq_along(idx_split[-1])) { text_split <- c(text_split, substr(text, idx_split[k], idx_split[k+1] - 1)) } out <- c() for (j in seq_along(text_split)) { out <- c( out, ifelse(escape, escape_html(text_split[j]), text_split[j]), if (!is.na(inserts$insert[j])) inserts$insert[j] ) } if (exact) out <- escape_backslash(out) paste(out, collapse = '') } #' Wraps capture groups in regex pattern in span tags to colorize with CSS #' #' @inheritParams view_regex #' @keywords internal wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) { stopifnot(length(pattern) == 1) if (escape) pattern <- escape_html(pattern) # 1. walk characters in pattern # 2. if current is open parens # 1. walk backwards, counting number of "\\" until first non-"\\" char # 2. If odd, then not an opening group # 3. Look forward, if followed by "?" then not a capturing group # 4. If capturing group then add opening "(" to out and # add TRUE for valid capture group to parens stack # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack # 3. if close parens, add closing "" to out out <- c() paren_stack <- c() group <- 0 pattern_chars <- strsplit(pattern, "")[[1]] for (i in seq_along(pattern_chars)) { is_capture_group <- FALSE if (pattern_chars[i] == "(") { backslash_count <- 0 if (i != 1) { j <- i-1 while (pattern_chars[j] == "\\" && j > 0) { backslash_count <- backslash_count + 1 j <- j - 1 } } if (backslash_count %% 2 == 0) { if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") { is_capture_group <- TRUE } } if (is_capture_group) { group <- group + 1 paren_stack <- c(TRUE, paren_stack) #push out <- c(out, paste0('(')) } else { paren_stack <- c(FALSE, paren_stack) #push out <- c(out, "(") } } else if (pattern_chars[i] == ")") { closes_capture_group <- paren_stack[1] paren_stack <- paren_stack[-1] #pop if (closes_capture_group) { out <- c(out, ")") } else { out <- c(out, ")") } } else { out <- c(out, pattern_chars[i]) } } if (exact) out <- escape_backslash(out) paste(out, collapse = "") } #' View grouped regex results #' #' View the result of the regular expression when applied to the given text. #' The default behavior renders the result as HTML and opens the file in #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`. #' #' @examples #' view_regex("example", "amp", render=FALSE) #' #' @param text Text to search #' @param pattern Regex pattern to look for #' @param render Render results to an HTML doc and open in RStudio viewer? #' @param escape Escape HTML-related characters in `text`? #' @param exact Should the regex pattern be displayed as entered by the user #' into R console or source (default)? When `TRUE`, regex is displayed with #' the double `\\\\` required for escaping backslashes in R. When `FALSE`, #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\` #' as a single `\\`). #' @param result_only Should only the result be displayed? If `FALSE`, then #' the colorized regular expression is also displayed in the output. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes #' @export view_regex <- function( text, pattern, ..., render = TRUE, escape = render, exact = escape, result_only = FALSE ) { knitr <- isTRUE(getOption('knitr.in.progress')) if (knitr) { render <- FALSE escape <- TRUE } regex_opts <- deprecate_knitr_option(...) regex_opts$text <- text regex_opts$pattern <- pattern res <- do.call(run_regex, regex_opts) res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact) res <- purrr::map_chr(res, function(resi) { result_pad <- "" if (grepl("pad\\d{2}", resi)) { max_pad <- max(stringr::str_extract_all(resi, "pad\\d{2}")[[1]]) max_pad_level <- as.integer(stringr::str_extract(max_pad, "\\d{2}")) if (max_pad_level - 3 > 0) { result_pad <- sprintf("pad%02d", max_pad_level - 3) } } paste("

", resi, "

") }) res <- paste(res, collapse = "") if (!nchar(pattern)) res <- paste("

", text, "

") if (knitr) { # embed css group_css <- htmltools::htmlDependency( name = "regexplain-groups", version = packageVersion("regexplain"), src = system.file("styles", package = "regexplain"), stylesheet = "groups.css") res <- htmltools::attachDependencies(htmltools::HTML(res), group_css) return(res) } if (!render) return(res) head <- if (!result_only) c( "---", "pagetitle: View Regex", "---", "
Pattern
", "

", wrap_regex(pattern, escape, exact), "

", "
Matches
" ) res <- c(head, res) tmp <- tempfile(fileext = ".Rmd") cat(res, file = tmp, sep = "\n") tmp_html <- suppressWarnings( rmarkdown::render( tmp, output_format = rmarkdown::html_document(css = c(system.file("styles", 'skeleton.css', package='regexplain'), system.file("styles", 'view_regex.css', package='regexplain'), system.file("styles", 'groups.css', package='regexplain')), theme = NULL, md_extensions = "-autolink_bare_uris"), quiet = TRUE )) rstudioapi::viewer(tmp_html) } deprecate_knitr_option <- function(...) { regex_opts <- list(...) if ("knitr" %in% names(regex_opts)) { warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.") } regex_opts[setdiff(names(regex_opts), "knitr")] }