🔍 An RStudio addin slash regex utility belt
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

227 lines
7.9KB

  1. #' Extract matched groups from regexp
  2. #'
  3. #' @param text Text to search
  4. #' @param pattern regexp
  5. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  6. run_regex <- function(
  7. text,
  8. pattern,
  9. ignore.case = FALSE,
  10. perl = FALSE,
  11. fixed = FALSE,
  12. useBytes = FALSE
  13. ) {
  14. # Use regex to get matches by group, gives start index and length
  15. m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  16. # Convert to start/end index
  17. x <- purrr::map(m, function(mi) {
  18. list(
  19. 'idx' = purrr::map2(mi, attr(mi, "match.length"),
  20. ~ if(.x[1] != -1) c(.x, .x + .y)))
  21. })
  22. # Store text and original regexc result with same hierarchy
  23. y <- purrr::map(text, ~ list(text = .))
  24. z <- purrr::map(regmatches(text, m), ~ list(m = .))
  25. # Zip text, indexes and regexc match object lists
  26. purrr::map(seq_along(x), ~ list(text = y[[.]][[1]], idx = x[[.]][[1]], m = z[[.]][[1]]))
  27. }
  28. #' Wrap matches in HTML span tags to colorize via CSS
  29. #'
  30. #' @param x Individual list item in list returned by [run_regex()]
  31. #' @inheritParams view_regex
  32. #' @keywords internal
  33. wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  34. if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text)
  35. text <- x$text
  36. idx <- x$idx
  37. len_idx <- length(idx)
  38. inserts <- data.frame(
  39. i = 1:len_idx - 1,
  40. start = purrr::map_int(idx, ~ .[1]),
  41. end = purrr::map_int(idx, ~ .[2])
  42. ) %>%
  43. mutate(
  44. # unmatched groups have start/end of zero
  45. start = ifelse(.data$start == 0, NA, .data$start),
  46. end = ifelse(.data$end == 0, NA, .data$end),
  47. class = sprintf("group g%02d", .data$i),
  48. pad = 0
  49. )
  50. for (j in seq_len(nrow(inserts))) {
  51. if (inserts$i[j] == 0) next
  52. if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
  53. overlap <- filter(
  54. inserts[1:(j-1), ],
  55. .data$i != 0,
  56. .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j])
  57. inserts[j, 'pad'] <- inserts$pad[j] + nrow(overlap)
  58. }
  59. inserts <- inserts %>%
  60. tidyr::gather(type, loc, start:end) %>%
  61. filter(!is.na(.data$loc)) %>%
  62. mutate(
  63. class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
  64. insert = ifelse(.data$type == 'start', sprintf('<span class="%s">', .data$class), "</span>")
  65. ) %>%
  66. group_by(.data$loc, .data$type) %>%
  67. summarize(insert = paste(.data$insert, collapse = ''))
  68. # inserts now gives html (span open and close) to insert and loc
  69. # first split text at inserts$loc locations,
  70. # then recombine by zipping with inserts$insert text
  71. # start at 0, unless there's a hit on first character
  72. # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  73. idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  74. if (!(nchar(text) + 1) %in% idx_split)
  75. idx_split <- c(idx_split, nchar(text) + 1)
  76. text_split <- c()
  77. for (k in seq_along(idx_split[-1])) {
  78. text_split <- c(text_split, substr(text, idx_split[k], idx_split[k+1] - 1))
  79. }
  80. out <- c()
  81. for (j in seq_along(text_split)) {
  82. out <- c(
  83. out,
  84. ifelse(escape, escape_html(text_split[j]), text_split[j]),
  85. if (!is.na(inserts$insert[j])) inserts$insert[j]
  86. )
  87. }
  88. if (exact) out <- escape_backslash(out)
  89. paste(out, collapse = '')
  90. }
  91. #' Wraps capture groups in regex pattern in span tags to colorize with CSS
  92. #'
  93. #' @inheritParams view_regex
  94. #' @keywords internal
  95. wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  96. stopifnot(length(pattern) == 1)
  97. if (escape) pattern <- escape_html(pattern)
  98. # 1. walk characters in pattern
  99. # 2. if current is open parens
  100. # 1. walk backwards, counting number of "\\" until first non-"\\" char
  101. # 2. If odd, then not an opening group
  102. # 3. Look forward, if followed by "?" then not a capturing group
  103. # 4. If capturing group then add opening "<span...>(" to out and
  104. # add TRUE for valid capture group to parens stack
  105. # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  106. # 3. if close parens, add closing "</span>" to out
  107. out <- c()
  108. paren_stack <- c()
  109. group <- 0
  110. pattern_chars <- strsplit(pattern, "")[[1]]
  111. for (i in seq_along(pattern_chars)) {
  112. is_capture_group <- FALSE
  113. if (pattern_chars[i] == "(") {
  114. backslash_count <- 0
  115. if (i != 1) {
  116. j <- i-1
  117. while (pattern_chars[j] == "\\" && j > 0) {
  118. backslash_count <- backslash_count + 1
  119. j <- j - 1
  120. }
  121. }
  122. if (backslash_count %% 2 == 0) {
  123. if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
  124. is_capture_group <- TRUE
  125. }
  126. }
  127. if (is_capture_group) {
  128. group <- group + 1
  129. paren_stack <- c(TRUE, paren_stack) #push
  130. out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
  131. } else {
  132. paren_stack <- c(FALSE, paren_stack) #push
  133. out <- c(out, "(")
  134. }
  135. } else if (pattern_chars[i] == ")") {
  136. closes_capture_group <- paren_stack[1]
  137. paren_stack <- paren_stack[-1] #pop
  138. if (closes_capture_group) {
  139. out <- c(out, ")</span>")
  140. } else {
  141. out <- c(out, ")")
  142. }
  143. } else {
  144. out <- c(out, pattern_chars[i])
  145. }
  146. }
  147. if (exact) out <- escape_backslash(out)
  148. paste(out, collapse = "")
  149. }
  150. #' View grouped regex results
  151. #'
  152. #' View the result of the regular expression when applied to the given text.
  153. #' The default behavior renders the result as HTML and opens the file in
  154. #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
  155. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
  156. #'
  157. #' @examples
  158. #' view_regex("example", "amp", render=FALSE)
  159. #'
  160. #' @param text Text to search
  161. #' @param pattern Regex pattern to look for
  162. #' @param render Render results to an HTML doc and open in RStudio viewer?
  163. #' @param escape Escape HTML-related characters in `text`?
  164. #' @param knitr Print into knitr doc? If `TRUE`, marks text as `asis_output` and
  165. #' sets `render = FALSE` and `escape = TRUE`.
  166. #' @param exact Should the regex pattern be displayed as entered by the user
  167. #' into R console or source (default)? When `TRUE`, regex is displayed with
  168. #' the double `\\\\` required for escaping backslashes in R. When `FALSE`,
  169. #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
  170. #' as a single `\\`).
  171. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  172. #' @export
  173. view_regex <- function(
  174. text,
  175. pattern,
  176. ...,
  177. render = TRUE,
  178. escape = render,
  179. knitr = FALSE,
  180. exact = escape
  181. ) {
  182. if (knitr) {
  183. render <- FALSE
  184. escape <- TRUE
  185. }
  186. res <- run_regex(text, pattern, ...)
  187. res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  188. res <- purrr::map_chr(res, function(resi) {
  189. result_pad <- ""
  190. if (grepl("pad\\d{2}", resi)) {
  191. max_pad <- max(stringr::str_extract_all(resi, "pad\\d{2}")[[1]])
  192. max_pad_level <- as.integer(stringr::str_extract(max_pad, "\\d{2}"))
  193. if (max_pad_level - 3 > 0) {
  194. result_pad <- sprintf("pad%02d", max_pad_level - 3)
  195. }
  196. }
  197. paste("<p class='results", result_pad, "'>", resi, "</p>")
  198. })
  199. res <- paste(res, collapse = "")
  200. if (!nchar(pattern)) res <- paste("<p class='results'>", text, "</p>")
  201. if (knitr) return(knitr::asis_output(res))
  202. if (!render) return(res)
  203. head <- c(
  204. "---", "pagetitle: View Regex", "---",
  205. "<h5>Pattern</h5>",
  206. "<p><pre>", wrap_regex(pattern, escape, exact), "</pre></p>",
  207. "<h5>Matches</h5>"
  208. )
  209. res <- c(head, res)
  210. tmp <- tempfile(fileext = ".Rmd")
  211. cat(res, file = tmp, sep = "\n")
  212. tmp_html <- suppressWarnings(
  213. rmarkdown::render(
  214. tmp,
  215. output_format = rmarkdown::html_document(css = c(system.file("styles", 'skeleton.css', package='regexplain'),
  216. system.file("styles", 'style.css', package='regexplain')),
  217. theme = NULL,
  218. md_extensions = "-autolink_bare_uris"),
  219. quiet = TRUE
  220. ))
  221. rstudioapi::viewer(tmp_html)
  222. }