🔍 An RStudio addin slash regex utility belt
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

259 satır
9.2KB

  1. #' Extract matched groups from regexp
  2. #'
  3. #' @param text Text to search
  4. #' @param pattern regexp
  5. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  6. run_regex <- function(
  7. text,
  8. pattern,
  9. ignore.case = FALSE,
  10. perl = FALSE,
  11. fixed = FALSE,
  12. useBytes = FALSE
  13. ) {
  14. # Use regex to get matches by group, gives start index and length
  15. m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  16. # Convert to start/end index
  17. x <- purrr::map(m, function(mi) {
  18. list(
  19. 'idx' = purrr::map2(mi, attr(mi, "match.length"),
  20. ~ if(.x[1] != -1) c(.x, .x + .y)))
  21. })
  22. # Store text and original regexc result with same hierarchy
  23. y <- purrr::map(text, ~ list(text = .))
  24. z <- purrr::map(regmatches(text, m), ~ list(m = .))
  25. # Zip text, indexes and regexc match object lists
  26. purrr::map(seq_along(x), ~ list(text = y[[.]][[1]], idx = x[[.]][[1]], m = z[[.]][[1]]))
  27. }
  28. #' Wrap matches in HTML span tags to colorize via CSS
  29. #'
  30. #' @param x Individual list item in list returned by [run_regex()]
  31. #' @inheritParams view_regex
  32. #' @keywords internal
  33. wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  34. if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text)
  35. text <- x$text
  36. idx <- x$idx
  37. len_idx <- length(idx)
  38. inserts <- data.frame(
  39. i = 1:len_idx - 1,
  40. start = purrr::map_int(idx, ~ .[1]),
  41. end = purrr::map_int(idx, ~ .[2])
  42. ) %>%
  43. mutate(
  44. # unmatched groups have start/end of zero
  45. start = ifelse(.data$start == 0, NA, .data$start),
  46. end = ifelse(.data$end == 0, NA, .data$end),
  47. class = sprintf("group g%02d", .data$i),
  48. pad = 0
  49. )
  50. for (j in seq_len(nrow(inserts))) {
  51. if (inserts$i[j] == 0) next
  52. if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
  53. overlap <- filter(
  54. inserts[1:(j-1), ],
  55. .data$i != 0,
  56. .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j])
  57. inserts[j, 'pad'] <- inserts$pad[j] + nrow(overlap)
  58. }
  59. inserts <- inserts %>%
  60. tidyr::gather(type, loc, start:end) %>%
  61. filter(!is.na(.data$loc)) %>%
  62. dplyr::arrange(loc, class, desc(type)) %>%
  63. mutate(
  64. class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
  65. insert = ifelse(.data$type == 'start', sprintf('<span class="%s">', .data$class), "</span>")
  66. )
  67. inserts_g0 <- filter(inserts, class == "group g00")
  68. inserts_other <- filter(inserts, class != "group g00")
  69. inserts <- dplyr::bind_rows(
  70. filter(inserts_g0, type == "start"),
  71. inserts_other,
  72. filter(inserts_g0, type == "end")
  73. ) %>%
  74. mutate(type = sprintf("%05d%s", 1:nrow(.), type)) %>%
  75. group_by(.data$loc, .data$type) %>%
  76. summarize(insert = paste(.data$insert, collapse = '')) %>%
  77. dplyr::ungroup() %>%
  78. mutate(type = sub("^\\d{5}", "", type))
  79. # inserts now gives html (span open and close) to insert and loc
  80. # first split text at inserts$loc locations,
  81. # then recombine by zipping with inserts$insert text
  82. # start at 0, unless there's a hit on first character
  83. # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  84. idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  85. if (!(nchar(text) + 1) %in% idx_split)
  86. idx_split <- c(idx_split, nchar(text) + 1)
  87. text_split <- c()
  88. for (k in seq_along(idx_split[-1])) {
  89. text_split <- c(text_split, substr(text, idx_split[k], idx_split[k+1] - 1))
  90. }
  91. out <- c()
  92. for (j in seq_along(text_split)) {
  93. out <- c(
  94. out,
  95. ifelse(escape, escape_html(text_split[j]), text_split[j]),
  96. if (!is.na(inserts$insert[j])) inserts$insert[j]
  97. )
  98. }
  99. if (exact) out <- escape_backslash(out)
  100. paste(out, collapse = '')
  101. }
  102. #' Wraps capture groups in regex pattern in span tags to colorize with CSS
  103. #'
  104. #' @inheritParams view_regex
  105. #' @keywords internal
  106. wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  107. stopifnot(length(pattern) == 1)
  108. if (escape) pattern <- escape_html(pattern)
  109. # 1. walk characters in pattern
  110. # 2. if current is open parens
  111. # 1. walk backwards, counting number of "\\" until first non-"\\" char
  112. # 2. If odd, then not an opening group
  113. # 3. Look forward, if followed by "?" then not a capturing group
  114. # 4. If capturing group then add opening "<span...>(" to out and
  115. # add TRUE for valid capture group to parens stack
  116. # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  117. # 3. if close parens, add closing "</span>" to out
  118. out <- c()
  119. paren_stack <- c()
  120. group <- 0
  121. pattern_chars <- strsplit(pattern, "")[[1]]
  122. for (i in seq_along(pattern_chars)) {
  123. is_capture_group <- FALSE
  124. if (pattern_chars[i] == "(") {
  125. backslash_count <- 0
  126. if (i != 1) {
  127. j <- i-1
  128. while (pattern_chars[j] == "\\" && j > 0) {
  129. backslash_count <- backslash_count + 1
  130. j <- j - 1
  131. }
  132. }
  133. if (backslash_count %% 2 == 0) {
  134. if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
  135. is_capture_group <- TRUE
  136. }
  137. }
  138. if (is_capture_group) {
  139. group <- group + 1
  140. paren_stack <- c(TRUE, paren_stack) #push
  141. out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
  142. } else {
  143. paren_stack <- c(FALSE, paren_stack) #push
  144. out <- c(out, "(")
  145. }
  146. } else if (pattern_chars[i] == ")") {
  147. closes_capture_group <- paren_stack[1]
  148. paren_stack <- paren_stack[-1] #pop
  149. if (closes_capture_group) {
  150. out <- c(out, ")</span>")
  151. } else {
  152. out <- c(out, ")")
  153. }
  154. } else {
  155. out <- c(out, pattern_chars[i])
  156. }
  157. }
  158. if (exact) out <- escape_backslash(out)
  159. paste(out, collapse = "")
  160. }
  161. #' View grouped regex results
  162. #'
  163. #' View the result of the regular expression when applied to the given text.
  164. #' The default behavior renders the result as HTML and opens the file in
  165. #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
  166. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
  167. #'
  168. #' @examples
  169. #' view_regex("example", "amp", render=FALSE)
  170. #'
  171. #' @param text Text to search
  172. #' @param pattern Regex pattern to look for
  173. #' @param render Render results to an HTML doc and open in RStudio viewer?
  174. #' @param escape Escape HTML-related characters in `text`?
  175. #' @param exact Should the regex pattern be displayed as entered by the user
  176. #' into R console or source (default)? When `TRUE`, regex is displayed with
  177. #' the double `\\\\` required for escaping backslashes in R. When `FALSE`,
  178. #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
  179. #' as a single `\\`).
  180. #' @param result_only Should only the result be displayed? If `FALSE`, then
  181. #' the colorized regular expression is also displayed in the output.
  182. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  183. #' @export
  184. view_regex <- function(
  185. text,
  186. pattern,
  187. ...,
  188. render = TRUE,
  189. escape = render,
  190. exact = escape,
  191. result_only = FALSE
  192. ) {
  193. knitr <- isTRUE(getOption('knitr.in.progress'))
  194. if (knitr) {
  195. render <- FALSE
  196. escape <- TRUE
  197. }
  198. regex_opts <- deprecate_knitr_option(...)
  199. regex_opts$text <- text
  200. regex_opts$pattern <- pattern
  201. res <- do.call(run_regex, regex_opts)
  202. res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  203. res <- purrr::map_chr(res, function(resi) {
  204. result_pad <- ""
  205. if (grepl("pad\\d{2}", resi)) {
  206. max_pad <- max(stringr::str_extract_all(resi, "pad\\d{2}")[[1]])
  207. max_pad_level <- as.integer(stringr::str_extract(max_pad, "\\d{2}"))
  208. if (max_pad_level - 3 > 0) {
  209. result_pad <- sprintf("pad%02d", max_pad_level - 3)
  210. }
  211. }
  212. paste("<p class='results", result_pad, "'>", resi, "</p>")
  213. })
  214. res <- paste(res, collapse = "")
  215. if (!nchar(pattern)) res <- paste("<p class='results'>", text, "</p>")
  216. if (knitr) {
  217. # embed css
  218. group_css <- htmltools::htmlDependency(
  219. name = "regexplain-groups", version = packageVersion("regexplain"),
  220. src = system.file("styles", package = "regexplain"),
  221. stylesheet = "groups.css")
  222. res <- htmltools::attachDependencies(htmltools::HTML(res), group_css)
  223. return(res)
  224. }
  225. if (!render) return(res)
  226. head <- if (!result_only) c(
  227. "---", "pagetitle: View Regex", "---",
  228. "<h5>Pattern</h5>",
  229. "<p><pre>", wrap_regex(pattern, escape, exact), "</pre></p>",
  230. "<h5>Matches</h5>"
  231. )
  232. res <- c(head, res)
  233. tmp <- tempfile(fileext = ".Rmd")
  234. cat(res, file = tmp, sep = "\n")
  235. tmp_html <- suppressWarnings(
  236. rmarkdown::render(
  237. tmp,
  238. output_format = rmarkdown::html_document(css = c(system.file("styles", 'skeleton.css', package='regexplain'),
  239. system.file("styles", 'view_regex.css', package='regexplain'),
  240. system.file("styles", 'groups.css', package='regexplain')),
  241. theme = NULL,
  242. md_extensions = "-autolink_bare_uris"),
  243. quiet = TRUE
  244. ))
  245. rstudioapi::viewer(tmp_html)
  246. }
  247. deprecate_knitr_option <- function(...) {
  248. regex_opts <- list(...)
  249. if ("knitr" %in% names(regex_opts)) {
  250. warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.")
  251. }
  252. regex_opts[setdiff(names(regex_opts), "knitr")]
  253. }