🔍 An RStudio addin slash regex utility belt
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

289 lines
10KB

  1. #' Extract matched groups from regexp
  2. #'
  3. #' @param text Text to search
  4. #' @param pattern regexp
  5. #' @param global If `TRUE`, enables global pattern matching
  6. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  7. run_regex <- function(
  8. text,
  9. pattern,
  10. ignore.case = FALSE,
  11. perl = FALSE,
  12. fixed = FALSE,
  13. useBytes = FALSE,
  14. global = FALSE
  15. ) {
  16. # Use regex to get matches by group, gives start index and length
  17. m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  18. m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y)))
  19. attr(m, "global") <- global
  20. if (!global) return(m)
  21. mmi <- max_match_index(m)
  22. if (any(!is.na(mmi))) {
  23. subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring)
  24. subtext[is.na(subtext)] <- ""
  25. m2 <- run_regex(subtext, pattern, ignore.case, perl, fixed, useBytes)
  26. for (i in seq_along(m2)) {
  27. if (is.null(m2[[i]]$idx[[1]])) next
  28. m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[i] - 1
  29. m[[i]]$idx <- dplyr::bind_rows(m[[i]]$idx, m2[[i]]$idx)
  30. }
  31. }
  32. m
  33. }
  34. expand_matches <- function(m) {
  35. if (m[1] == -1) return(list(NULL))
  36. m_length <- attr(m, "match.length")
  37. x <- purrr::map2(m, m_length, ~ c(.x, .x + .y))
  38. x <- as.data.frame(do.call(rbind, x))
  39. names(x) <- c("start", "end")
  40. x$start <- ifelse(x$start == 0L, NA_integer_, x$start)
  41. x$end <- ifelse(x$end == 0L, NA_integer_, x$end)
  42. x$group <- 1:nrow(x) - 1L
  43. dplyr::as_tibble(x, validate = FALSE)
  44. }
  45. max_match_index <- function(m) {
  46. max_na <- function(x) if (is.null(x)) NA else max(x, na.rm = TRUE)
  47. max_int <- function(x) as.integer(max(x))
  48. purrr::map(m, "idx") %>%
  49. purrr::modify_depth(1, ~c(start = max_na(.x$start), end = max_na(.x$end))) %>%
  50. purrr::map_int(max_int)
  51. }
  52. #' Wrap matches in HTML span tags to colorize via CSS
  53. #'
  54. #' @param x Individual list item in list returned by [run_regex()]
  55. #' @inheritParams view_regex
  56. #' @keywords internal
  57. wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  58. if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text)
  59. text <- x$text
  60. idx <- x$idx
  61. len_idx <- length(idx)
  62. inserts <- data.frame(
  63. i = 1:len_idx - 1,
  64. start = purrr::map_int(idx, ~ .[1]),
  65. end = purrr::map_int(idx, ~ .[2])
  66. ) %>%
  67. mutate(
  68. # unmatched groups have start/end of zero
  69. start = ifelse(.data$start == 0, NA, .data$start),
  70. end = ifelse(.data$end == 0, NA, .data$end),
  71. class = sprintf("group g%02d", .data$i),
  72. pad = 0
  73. )
  74. for (j in seq_len(nrow(inserts))) {
  75. if (inserts$i[j] == 0) next
  76. if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
  77. overlap <- filter(
  78. inserts[1:(j-1), ],
  79. .data$i != 0,
  80. .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j])
  81. inserts[j, 'pad'] <- inserts$pad[j] + nrow(overlap)
  82. }
  83. inserts <- inserts %>%
  84. tidyr::gather(type, loc, start:end) %>%
  85. filter(!is.na(.data$loc)) %>%
  86. dplyr::arrange(loc, class, dplyr::desc(type)) %>%
  87. mutate(
  88. class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
  89. insert = ifelse(.data$type == 'start', sprintf('<span class="%s">', .data$class), "</span>")
  90. )
  91. inserts_g0 <- filter(inserts, class == "group g00")
  92. inserts_other <- filter(inserts, class != "group g00")
  93. inserts <- dplyr::bind_rows(
  94. filter(inserts_g0, type == "start"),
  95. inserts_other,
  96. filter(inserts_g0, type == "end")
  97. ) %>%
  98. mutate(type = sprintf("%05d%s", 1:nrow(.), type)) %>%
  99. group_by(.data$loc, .data$type) %>%
  100. summarize(insert = paste(.data$insert, collapse = '')) %>%
  101. dplyr::ungroup() %>%
  102. mutate(type = sub("^\\d{5}", "", type))
  103. # inserts now gives html (span open and close) to insert and loc
  104. # first split text at inserts$loc locations,
  105. # then recombine by zipping with inserts$insert text
  106. # start at 0, unless there's a hit on first character
  107. # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  108. idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  109. if (!(nchar(text) + 1) %in% idx_split)
  110. idx_split <- c(idx_split, nchar(text) + 1)
  111. text_split <- c()
  112. for (k in seq_along(idx_split[-1])) {
  113. text_split <- c(text_split, substr(text, idx_split[k], idx_split[k+1] - 1))
  114. }
  115. out <- c()
  116. for (j in seq_along(text_split)) {
  117. out <- c(
  118. out,
  119. ifelse(escape, escape_html(text_split[j]), text_split[j]),
  120. if (!is.na(inserts$insert[j])) inserts$insert[j]
  121. )
  122. }
  123. if (exact) out <- escape_backslash(out)
  124. paste(out, collapse = '')
  125. }
  126. #' Wraps capture groups in regex pattern in span tags to colorize with CSS
  127. #'
  128. #' @inheritParams view_regex
  129. #' @keywords internal
  130. wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  131. stopifnot(length(pattern) == 1)
  132. if (escape) pattern <- escape_html(pattern)
  133. # 1. walk characters in pattern
  134. # 2. if current is open parens
  135. # 1. walk backwards, counting number of "\\" until first non-"\\" char
  136. # 2. If odd, then not an opening group
  137. # 3. Look forward, if followed by "?" then not a capturing group
  138. # 4. If capturing group then add opening "<span...>(" to out and
  139. # add TRUE for valid capture group to parens stack
  140. # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  141. # 3. if close parens, add closing "</span>" to out
  142. out <- c()
  143. paren_stack <- c()
  144. group <- 0
  145. pattern_chars <- strsplit(pattern, "")[[1]]
  146. for (i in seq_along(pattern_chars)) {
  147. is_capture_group <- FALSE
  148. if (pattern_chars[i] == "(") {
  149. backslash_count <- 0
  150. if (i != 1) {
  151. j <- i-1
  152. while (pattern_chars[j] == "\\" && j > 0) {
  153. backslash_count <- backslash_count + 1
  154. j <- j - 1
  155. }
  156. }
  157. if (backslash_count %% 2 == 0) {
  158. if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
  159. is_capture_group <- TRUE
  160. }
  161. }
  162. if (is_capture_group) {
  163. group <- group + 1
  164. paren_stack <- c(TRUE, paren_stack) #push
  165. out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
  166. } else {
  167. paren_stack <- c(FALSE, paren_stack) #push
  168. out <- c(out, "(")
  169. }
  170. } else if (pattern_chars[i] == ")") {
  171. closes_capture_group <- paren_stack[1]
  172. paren_stack <- paren_stack[-1] #pop
  173. if (closes_capture_group) {
  174. out <- c(out, ")</span>")
  175. } else {
  176. out <- c(out, ")")
  177. }
  178. } else {
  179. out <- c(out, pattern_chars[i])
  180. }
  181. }
  182. if (exact) out <- escape_backslash(out)
  183. paste(out, collapse = "")
  184. }
  185. #' View grouped regex results
  186. #'
  187. #' View the result of the regular expression when applied to the given text.
  188. #' The default behavior renders the result as HTML and opens the file in
  189. #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
  190. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
  191. #'
  192. #' @examples
  193. #' view_regex("example", "amp", render=FALSE)
  194. #'
  195. #' @param text Text to search
  196. #' @param pattern Regex pattern to look for
  197. #' @param render Render results to an HTML doc and open in RStudio viewer?
  198. #' @param escape Escape HTML-related characters in `text`?
  199. #' @param exact Should the regex pattern be displayed as entered by the user
  200. #' into R console or source (default)? When `TRUE`, regex is displayed with
  201. #' the double `\\\\` required for escaping backslashes in R. When `FALSE`,
  202. #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
  203. #' as a single `\\`).
  204. #' @param result_only Should only the result be displayed? If `FALSE`, then
  205. #' the colorized regular expression is also displayed in the output.
  206. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  207. #' @export
  208. view_regex <- function(
  209. text,
  210. pattern,
  211. ...,
  212. render = TRUE,
  213. escape = render,
  214. exact = escape,
  215. result_only = FALSE
  216. ) {
  217. knitr <- isTRUE(getOption('knitr.in.progress'))
  218. if (knitr) {
  219. render <- FALSE
  220. escape <- TRUE
  221. }
  222. regex_opts <- deprecate_knitr_option(...)
  223. regex_opts$text <- text
  224. regex_opts$pattern <- pattern
  225. res <- do.call(run_regex, regex_opts)
  226. res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  227. res <- purrr::map_chr(res, function(resi) {
  228. result_pad <- ""
  229. if (grepl("pad\\d{2}", resi)) {
  230. max_pad <- max(stringr::str_extract_all(resi, "pad\\d{2}")[[1]])
  231. max_pad_level <- as.integer(stringr::str_extract(max_pad, "\\d{2}"))
  232. if (max_pad_level - 3 > 0) {
  233. result_pad <- sprintf("pad%02d", max_pad_level - 3)
  234. }
  235. }
  236. paste("<p class='results", result_pad, "'>", resi, "</p>")
  237. })
  238. res <- paste(res, collapse = "")
  239. if (!nchar(pattern)) res <- paste("<p class='results'>", text, "</p>")
  240. if (knitr) {
  241. # embed css
  242. group_css <- htmltools::htmlDependency(
  243. name = "regexplain-groups", version = packageVersion("regexplain"),
  244. src = system.file("styles", package = "regexplain"),
  245. stylesheet = "groups.css")
  246. res <- htmltools::attachDependencies(htmltools::HTML(res), group_css)
  247. return(res)
  248. }
  249. if (!render) return(res)
  250. head <- if (!result_only) c(
  251. "---", "pagetitle: View Regex", "---",
  252. "<h5>Pattern</h5>",
  253. "<p><pre>", wrap_regex(pattern, escape, exact), "</pre></p>",
  254. "<h5>Matches</h5>"
  255. )
  256. res <- c(head, res)
  257. tmp <- tempfile(fileext = ".Rmd")
  258. cat(res, file = tmp, sep = "\n")
  259. tmp_html <- suppressWarnings(
  260. rmarkdown::render(
  261. tmp,
  262. output_format = rmarkdown::html_document(css = c(system.file("styles", 'skeleton.css', package='regexplain'),
  263. system.file("styles", 'view_regex.css', package='regexplain'),
  264. system.file("styles", 'groups.css', package='regexplain')),
  265. theme = NULL,
  266. md_extensions = "-autolink_bare_uris"),
  267. quiet = TRUE
  268. ))
  269. rstudioapi::viewer(tmp_html)
  270. }
  271. deprecate_knitr_option <- function(...) {
  272. regex_opts <- list(...)
  273. if ("knitr" %in% names(regex_opts)) {
  274. warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.")
  275. }
  276. regex_opts[setdiff(names(regex_opts), "knitr")]
  277. }