🔍 An RStudio addin slash regex utility belt
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

274 lines
9.6KB

  1. #' Extract matched groups from regexp
  2. #'
  3. #' @param text Text to search
  4. #' @param pattern regexp
  5. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  6. run_regex <- function(
  7. text,
  8. pattern,
  9. ignore.case = FALSE,
  10. perl = FALSE,
  11. fixed = FALSE,
  12. useBytes = FALSE
  13. ) {
  14. # Use regex to get matches by group, gives start index and length
  15. m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  16. m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y)))
  17. mmi <- max_match_index(m)
  18. if (any(!is.na(mmi))) {
  19. subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring)
  20. subtext[is.na(subtext)] <- ""
  21. m2 <- run_regex(subtext, pattern, ignore.case, perl, fixed, useBytes)
  22. for (i in seq_along(m2)) {
  23. if (is.null(m2[[i]]$idx[[1]])) next
  24. m[[i]]$idx <- c(m[[i]]$idx, purrr::map(m2[[i]]$idx, ~ . + mmi[i] - 1))
  25. }
  26. }
  27. m
  28. }
  29. expand_matches <- function(m) {
  30. if (m[1] == -1) return(list(NULL))
  31. m_length <- attr(m, "match.length")
  32. purrr::map2(m, m_length, ~ c(.x, .x + .y))
  33. }
  34. max_match_index <- function(m) {
  35. purrr::modify_depth(m, 1, ~purrr::pluck(., "idx")) %>%
  36. purrr::modify_depth(1, ~purrr::map_int(., ~ ifelse(is.null(.), NA, max(.)))) %>%
  37. purrr::map_int(max)
  38. }
  39. #' Wrap matches in HTML span tags to colorize via CSS
  40. #'
  41. #' @param x Individual list item in list returned by [run_regex()]
  42. #' @inheritParams view_regex
  43. #' @keywords internal
  44. wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  45. if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text)
  46. text <- x$text
  47. idx <- x$idx
  48. len_idx <- length(idx)
  49. inserts <- data.frame(
  50. i = 1:len_idx - 1,
  51. start = purrr::map_int(idx, ~ .[1]),
  52. end = purrr::map_int(idx, ~ .[2])
  53. ) %>%
  54. mutate(
  55. # unmatched groups have start/end of zero
  56. start = ifelse(.data$start == 0, NA, .data$start),
  57. end = ifelse(.data$end == 0, NA, .data$end),
  58. class = sprintf("group g%02d", .data$i),
  59. pad = 0
  60. )
  61. for (j in seq_len(nrow(inserts))) {
  62. if (inserts$i[j] == 0) next
  63. if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
  64. overlap <- filter(
  65. inserts[1:(j-1), ],
  66. .data$i != 0,
  67. .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j])
  68. inserts[j, 'pad'] <- inserts$pad[j] + nrow(overlap)
  69. }
  70. inserts <- inserts %>%
  71. tidyr::gather(type, loc, start:end) %>%
  72. filter(!is.na(.data$loc)) %>%
  73. dplyr::arrange(loc, class, dplyr::desc(type)) %>%
  74. mutate(
  75. class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
  76. insert = ifelse(.data$type == 'start', sprintf('<span class="%s">', .data$class), "</span>")
  77. )
  78. inserts_g0 <- filter(inserts, class == "group g00")
  79. inserts_other <- filter(inserts, class != "group g00")
  80. inserts <- dplyr::bind_rows(
  81. filter(inserts_g0, type == "start"),
  82. inserts_other,
  83. filter(inserts_g0, type == "end")
  84. ) %>%
  85. mutate(type = sprintf("%05d%s", 1:nrow(.), type)) %>%
  86. group_by(.data$loc, .data$type) %>%
  87. summarize(insert = paste(.data$insert, collapse = '')) %>%
  88. dplyr::ungroup() %>%
  89. mutate(type = sub("^\\d{5}", "", type))
  90. # inserts now gives html (span open and close) to insert and loc
  91. # first split text at inserts$loc locations,
  92. # then recombine by zipping with inserts$insert text
  93. # start at 0, unless there's a hit on first character
  94. # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  95. idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  96. if (!(nchar(text) + 1) %in% idx_split)
  97. idx_split <- c(idx_split, nchar(text) + 1)
  98. text_split <- c()
  99. for (k in seq_along(idx_split[-1])) {
  100. text_split <- c(text_split, substr(text, idx_split[k], idx_split[k+1] - 1))
  101. }
  102. out <- c()
  103. for (j in seq_along(text_split)) {
  104. out <- c(
  105. out,
  106. ifelse(escape, escape_html(text_split[j]), text_split[j]),
  107. if (!is.na(inserts$insert[j])) inserts$insert[j]
  108. )
  109. }
  110. if (exact) out <- escape_backslash(out)
  111. paste(out, collapse = '')
  112. }
  113. #' Wraps capture groups in regex pattern in span tags to colorize with CSS
  114. #'
  115. #' @inheritParams view_regex
  116. #' @keywords internal
  117. wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  118. stopifnot(length(pattern) == 1)
  119. if (escape) pattern <- escape_html(pattern)
  120. # 1. walk characters in pattern
  121. # 2. if current is open parens
  122. # 1. walk backwards, counting number of "\\" until first non-"\\" char
  123. # 2. If odd, then not an opening group
  124. # 3. Look forward, if followed by "?" then not a capturing group
  125. # 4. If capturing group then add opening "<span...>(" to out and
  126. # add TRUE for valid capture group to parens stack
  127. # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  128. # 3. if close parens, add closing "</span>" to out
  129. out <- c()
  130. paren_stack <- c()
  131. group <- 0
  132. pattern_chars <- strsplit(pattern, "")[[1]]
  133. for (i in seq_along(pattern_chars)) {
  134. is_capture_group <- FALSE
  135. if (pattern_chars[i] == "(") {
  136. backslash_count <- 0
  137. if (i != 1) {
  138. j <- i-1
  139. while (pattern_chars[j] == "\\" && j > 0) {
  140. backslash_count <- backslash_count + 1
  141. j <- j - 1
  142. }
  143. }
  144. if (backslash_count %% 2 == 0) {
  145. if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
  146. is_capture_group <- TRUE
  147. }
  148. }
  149. if (is_capture_group) {
  150. group <- group + 1
  151. paren_stack <- c(TRUE, paren_stack) #push
  152. out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
  153. } else {
  154. paren_stack <- c(FALSE, paren_stack) #push
  155. out <- c(out, "(")
  156. }
  157. } else if (pattern_chars[i] == ")") {
  158. closes_capture_group <- paren_stack[1]
  159. paren_stack <- paren_stack[-1] #pop
  160. if (closes_capture_group) {
  161. out <- c(out, ")</span>")
  162. } else {
  163. out <- c(out, ")")
  164. }
  165. } else {
  166. out <- c(out, pattern_chars[i])
  167. }
  168. }
  169. if (exact) out <- escape_backslash(out)
  170. paste(out, collapse = "")
  171. }
  172. #' View grouped regex results
  173. #'
  174. #' View the result of the regular expression when applied to the given text.
  175. #' The default behavior renders the result as HTML and opens the file in
  176. #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
  177. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
  178. #'
  179. #' @examples
  180. #' view_regex("example", "amp", render=FALSE)
  181. #'
  182. #' @param text Text to search
  183. #' @param pattern Regex pattern to look for
  184. #' @param render Render results to an HTML doc and open in RStudio viewer?
  185. #' @param escape Escape HTML-related characters in `text`?
  186. #' @param exact Should the regex pattern be displayed as entered by the user
  187. #' into R console or source (default)? When `TRUE`, regex is displayed with
  188. #' the double `\\\\` required for escaping backslashes in R. When `FALSE`,
  189. #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
  190. #' as a single `\\`).
  191. #' @param result_only Should only the result be displayed? If `FALSE`, then
  192. #' the colorized regular expression is also displayed in the output.
  193. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  194. #' @export
  195. view_regex <- function(
  196. text,
  197. pattern,
  198. ...,
  199. render = TRUE,
  200. escape = render,
  201. exact = escape,
  202. result_only = FALSE
  203. ) {
  204. knitr <- isTRUE(getOption('knitr.in.progress'))
  205. if (knitr) {
  206. render <- FALSE
  207. escape <- TRUE
  208. }
  209. regex_opts <- deprecate_knitr_option(...)
  210. regex_opts$text <- text
  211. regex_opts$pattern <- pattern
  212. res <- do.call(run_regex, regex_opts)
  213. res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  214. res <- purrr::map_chr(res, function(resi) {
  215. result_pad <- ""
  216. if (grepl("pad\\d{2}", resi)) {
  217. max_pad <- max(stringr::str_extract_all(resi, "pad\\d{2}")[[1]])
  218. max_pad_level <- as.integer(stringr::str_extract(max_pad, "\\d{2}"))
  219. if (max_pad_level - 3 > 0) {
  220. result_pad <- sprintf("pad%02d", max_pad_level - 3)
  221. }
  222. }
  223. paste("<p class='results", result_pad, "'>", resi, "</p>")
  224. })
  225. res <- paste(res, collapse = "")
  226. if (!nchar(pattern)) res <- paste("<p class='results'>", text, "</p>")
  227. if (knitr) {
  228. # embed css
  229. group_css <- htmltools::htmlDependency(
  230. name = "regexplain-groups", version = packageVersion("regexplain"),
  231. src = system.file("styles", package = "regexplain"),
  232. stylesheet = "groups.css")
  233. res <- htmltools::attachDependencies(htmltools::HTML(res), group_css)
  234. return(res)
  235. }
  236. if (!render) return(res)
  237. head <- if (!result_only) c(
  238. "---", "pagetitle: View Regex", "---",
  239. "<h5>Pattern</h5>",
  240. "<p><pre>", wrap_regex(pattern, escape, exact), "</pre></p>",
  241. "<h5>Matches</h5>"
  242. )
  243. res <- c(head, res)
  244. tmp <- tempfile(fileext = ".Rmd")
  245. cat(res, file = tmp, sep = "\n")
  246. tmp_html <- suppressWarnings(
  247. rmarkdown::render(
  248. tmp,
  249. output_format = rmarkdown::html_document(css = c(system.file("styles", 'skeleton.css', package='regexplain'),
  250. system.file("styles", 'view_regex.css', package='regexplain'),
  251. system.file("styles", 'groups.css', package='regexplain')),
  252. theme = NULL,
  253. md_extensions = "-autolink_bare_uris"),
  254. quiet = TRUE
  255. ))
  256. rstudioapi::viewer(tmp_html)
  257. }
  258. deprecate_knitr_option <- function(...) {
  259. regex_opts <- list(...)
  260. if ("knitr" %in% names(regex_opts)) {
  261. warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.")
  262. }
  263. regex_opts[setdiff(names(regex_opts), "knitr")]
  264. }