🔍 An RStudio addin slash regex utility belt
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

289 lines
10KB

  1. #' Extract matched groups from regexp
  2. #'
  3. #' @param text Text to search
  4. #' @param pattern regexp
  5. #' @param global If `TRUE`, enables global pattern matching
  6. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  7. run_regex <- function(
  8. text,
  9. pattern,
  10. ignore.case = FALSE,
  11. perl = FALSE,
  12. fixed = FALSE,
  13. useBytes = FALSE,
  14. global = FALSE
  15. ) {
  16. # Use regex to get matches by group, gives start index and length
  17. m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  18. m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y)))
  19. attr(m, "global") <- global
  20. if (!global) return(m)
  21. mmi <- max_match_index(m)
  22. if (any(!is.na(mmi))) {
  23. subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring)
  24. subtext[is.na(subtext)] <- ""
  25. m2 <- run_regex(subtext, pattern, ignore.case, perl, fixed, useBytes)
  26. for (i in seq_along(m2)) {
  27. if (is.null(m2[[i]]$idx[[1]])) next
  28. m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[i] - 1
  29. m[[i]]$idx <- dplyr::bind_rows(m[[i]]$idx, m2[[i]]$idx)
  30. }
  31. }
  32. m
  33. }
  34. expand_matches <- function(m) {
  35. if (m[1] == -1) return(list(NULL))
  36. m_length <- attr(m, "match.length")
  37. x <- purrr::map2(m, m_length, ~ c(.x, .x + .y))
  38. x <- as.data.frame(do.call(rbind, x))
  39. names(x) <- c("start", "end")
  40. x$start <- ifelse(x$start == 0L, NA_integer_, x$start)
  41. x$end <- ifelse(x$end == 0L, NA_integer_, x$end)
  42. x$group <- 1:nrow(x) - 1L
  43. dplyr::as_tibble(x, validate = FALSE)
  44. }
  45. max_match_index <- function(m) {
  46. max_na <- function(x) if (is.null(x)) NA else max(x, na.rm = TRUE)
  47. max_int <- function(x) as.integer(max(x))
  48. purrr::map(m, "idx") %>%
  49. purrr::modify_depth(1, ~c(start = max_na(.x$start), end = max_na(.x$end))) %>%
  50. purrr::map_int(max_int)
  51. }
  52. #' Wrap matches in HTML span tags to colorize via CSS
  53. #'
  54. #' @param x Individual list item in list returned by [run_regex()]
  55. #' @inheritParams view_regex
  56. #' @keywords internal
  57. wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  58. if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text)
  59. text <- x$text
  60. idx <- x$idx
  61. len_idx <- length(idx)
  62. inserts <- data.frame(
  63. i = 1:len_idx - 1,
  64. start = purrr::map_int(idx, ~ .[1]),
  65. end = purrr::map_int(idx, ~ .[2])
  66. ) %>%
  67. mutate(
  68. # unmatched groups have start/end of zero
  69. start = ifelse(.data$start == 0, NA, .data$start),
  70. end = ifelse(.data$end == 0, NA, .data$end),
  71. class = sprintf("group g%02d", .data$i),
  72. pad = 0
  73. )
  74. for (j in seq_len(nrow(inserts))) {
  75. if (inserts$i[j] == 0) next
  76. if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
  77. overlap <- filter(
  78. inserts[1:(j-1), ],
  79. .data$i != 0,
  80. .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j])
  81. inserts[j, 'pad'] <- inserts$pad[j] + nrow(overlap)
  82. }
  83. inserts <- inserts %>%
  84. tidyr::gather(type, loc, start:end) %>%
  85. filter(!is.na(.data$loc)) %>%
  86. dplyr::arrange(loc, class, dplyr::desc(type)) %>%
  87. mutate(
  88. class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
  89. insert = ifelse(.data$type == 'start', sprintf('<span class="%s">', .data$class), "</span>")
  90. )
  91. inserts_g0 <- filter(inserts, class == "group g00")
  92. inserts_other <- filter(inserts, class != "group g00")
  93. inserts <- dplyr::bind_rows(
  94. filter(inserts_g0, type == "start"),
  95. inserts_other,
  96. filter(inserts_g0, type == "end")
  97. ) %>%
  98. mutate(type = sprintf("%05d%s", 1:nrow(.), type)) %>%
  99. group_by(.data$loc, .data$type) %>%
  100. summarize(insert = paste(.data$insert, collapse = '')) %>%
  101. dplyr::ungroup() %>%
  102. mutate(type = sub("^\\d{5}", "", type))
  103. # inserts now gives html (span open and close) to insert and loc
  104. # first split text at inserts$loc locations,
  105. # then recombine by zipping with inserts$insert text
  106. # start at 0, unless there's a hit on first character
  107. # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  108. idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  109. if (!(nchar(text) + 1) %in% idx_split)
  110. idx_split <- c(idx_split, nchar(text) + 1)
  111. text_split <- c()
  112. for (k in seq_along(idx_split[-1])) {
  113. text_split <- c(text_split, substr(text, idx_split[k], idx_split[k+1] - 1))
  114. }
  115. out <- c()
  116. for (j in seq_along(text_split)) {
  117. out <- c(
  118. out,
  119. ifelse(escape, escape_html(text_split[j]), text_split[j]),
  120. if (!is.na(inserts$insert[j])) inserts$insert[j]
  121. )
  122. }
  123. if (exact) out <- escape_backslash(out)
  124. paste(out, collapse = '')
  125. }
  126. #' Wraps capture groups in regex pattern in span tags to colorize with CSS
  127. #'
  128. #' @inheritParams view_regex
  129. #' @keywords internal
  130. wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  131. stopifnot(length(pattern) == 1)
  132. if (escape) pattern <- escape_html(pattern)
  133. # 1. walk characters in pattern
  134. # 2. if current is open parens
  135. # 1. walk backwards, counting number of "\\" until first non-"\\" char
  136. # 2. If odd, then not an opening group
  137. # 3. Look forward, if followed by "?" then not a capturing group
  138. # 4. If capturing group then add opening "<span...>(" to out and
  139. # add TRUE for valid capture group to parens stack
  140. # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  141. # 3. if close parens, add closing "</span>" to out
  142. out <- c()
  143. paren_stack <- c()
  144. group <- 0
  145. pattern_chars <- strsplit(pattern, "")[[1]]
  146. for (i in seq_along(pattern_chars)) {
  147. is_capture_group <- FALSE
  148. if (pattern_chars[i] == "(") {
  149. backslash_count <- 0
  150. if (i != 1) {
  151. j <- i-1
  152. while (pattern_chars[j] == "\\" && j > 0) {
  153. backslash_count <- backslash_count + 1
  154. j <- j - 1
  155. }
  156. }
  157. if (backslash_count %% 2 == 0) {
  158. if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
  159. is_capture_group <- TRUE
  160. }
  161. }
  162. if (is_capture_group) {
  163. group <- group + 1
  164. paren_stack <- c(TRUE, paren_stack) #push
  165. out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
  166. } else {
  167. paren_stack <- c(FALSE, paren_stack) #push
  168. out <- c(out, "(")
  169. }
  170. } else if (pattern_chars[i] == ")") {
  171. closes_capture_group <- paren_stack[1]
  172. paren_stack <- paren_stack[-1] #pop
  173. if (closes_capture_group) {
  174. out <- c(out, ")</span>")
  175. } else {
  176. out <- c(out, ")")
  177. }
  178. } else {
  179. out <- c(out, pattern_chars[i])
  180. }
  181. }
  182. if (exact) out <- escape_backslash(out)
  183. paste(out, collapse = "")
  184. }
  185. #' View grouped regex results
  186. #'
  187. #' View the result of the regular expression when applied to the given text.
  188. #' The default behavior renders the result as HTML and opens the file in
  189. #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
  190. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
  191. #'
  192. #' @examples
  193. #' view_regex("example", "amp", render=FALSE)
  194. #'
  195. #' @param text Text to search
  196. #' @param pattern Regex pattern to look for
  197. #' @param render Render results to an HTML doc and open in RStudio viewer?
  198. #' @param escape Escape HTML-related characters in `text`?
  199. #' @param exact Should the regex pattern be displayed as entered by the user
  200. #' into R console or source (default)? When `TRUE`, regex is displayed with
  201. #' the double `\\\\` required for escaping backslashes in R. When `FALSE`,
  202. #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
  203. #' as a single `\\`).
  204. #' @param result_only Should only the result be displayed? If `FALSE`, then
  205. #' the colorized regular expression is also displayed in the output.
  206. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  207. #' @export
  208. view_regex <- function(
  209. text,
  210. pattern,
  211. ...,
  212. render = TRUE,
  213. escape = render,
  214. exact = escape,
  215. result_only = FALSE
  216. ) {
  217. knitr <- isTRUE(getOption('knitr.in.progress'))
  218. if (knitr) {
  219. render <- FALSE
  220. escape <- TRUE
  221. }
  222. regex_opts <- deprecate_knitr_option(...)
  223. regex_opts$text <- text
  224. regex_opts$pattern <- pattern
  225. res <- do.call(run_regex, regex_opts)
  226. res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  227. res <- purrr::map_chr(res, function(resi) {
  228. result_pad <- ""
  229. if (grepl("pad\\d{2}", resi)) {
  230. max_pad <- max(stringr::str_extract_all(resi, "pad\\d{2}")[[1]])
  231. max_pad_level <- as.integer(stringr::str_extract(max_pad, "\\d{2}"))
  232. if (max_pad_level - 3 > 0) {
  233. result_pad <- sprintf("pad%02d", max_pad_level - 3)
  234. }
  235. }
  236. paste("<p class='results", result_pad, "'>", resi, "</p>")
  237. })
  238. res <- paste(res, collapse = "")
  239. if (!nchar(pattern)) res <- paste("<p class='results'>", text, "</p>")
  240. if (knitr) {
  241. # embed css
  242. group_css <- htmltools::htmlDependency(
  243. name = "regexplain-groups", version = packageVersion("regexplain"),
  244. src = system.file("styles", package = "regexplain"),
  245. stylesheet = "groups.css")
  246. res <- htmltools::attachDependencies(htmltools::HTML(res), group_css)
  247. return(res)
  248. }
  249. if (!render) return(res)
  250. head <- if (!result_only) c(
  251. "---", "pagetitle: View Regex", "---",
  252. "<h5>Pattern</h5>",
  253. "<p><pre>", wrap_regex(pattern, escape, exact), "</pre></p>",
  254. "<h5>Matches</h5>"
  255. )
  256. res <- c(head, res)
  257. tmp <- tempfile(fileext = ".Rmd")
  258. cat(res, file = tmp, sep = "\n")
  259. tmp_html <- suppressWarnings(
  260. rmarkdown::render(
  261. tmp,
  262. output_format = rmarkdown::html_document(css = c(system.file("styles", 'skeleton.css', package='regexplain'),
  263. system.file("styles", 'view_regex.css', package='regexplain'),
  264. system.file("styles", 'groups.css', package='regexplain')),
  265. theme = NULL,
  266. md_extensions = "-autolink_bare_uris"),
  267. quiet = TRUE
  268. ))
  269. rstudioapi::viewer(tmp_html)
  270. }
  271. deprecate_knitr_option <- function(...) {
  272. regex_opts <- list(...)
  273. if ("knitr" %in% names(regex_opts)) {
  274. warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.")
  275. }
  276. regex_opts[setdiff(names(regex_opts), "knitr")]
  277. }