🔍 An RStudio addin slash regex utility belt
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

326 lines
10KB

  1. #' Extract matched groups from regexp
  2. #'
  3. #' @param text Text to search
  4. #' @param pattern regexp
  5. #' @param global If `TRUE`, enables global pattern matching
  6. #' @inheritParams base::regexec
  7. regex <- function(
  8. text,
  9. pattern,
  10. ignore.case = FALSE,
  11. perl = FALSE,
  12. fixed = FALSE,
  13. useBytes = FALSE,
  14. global = TRUE
  15. ) {
  16. # Use regex to get matches by group, gives start index and length
  17. m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  18. m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y)))
  19. attr(m, "global") <- global
  20. if (!global) {
  21. return(m)
  22. }
  23. mmi <- max_match_index(m)
  24. if (any(!is.na(mmi))) {
  25. subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring)
  26. sub_idx <- which(!is.na(subtext))
  27. m2 <- regex(subtext[sub_idx], pattern, ignore.case, perl, fixed, useBytes)
  28. for (i in seq_along(m2)) {
  29. if (is.null(m2[[i]]$idx[[1]])) next
  30. m_idx <- sub_idx[i]
  31. m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[m_idx] - 1L
  32. m2[[i]]$idx$pass <- m2[[i]]$idx$pass + 1L
  33. m[[m_idx]]$idx <- rbind(m[[m_idx]]$idx, m2[[i]]$idx)
  34. }
  35. }
  36. m
  37. }
  38. expand_matches <- function(m) {
  39. if (m[1] == -1) {
  40. return(NULL)
  41. }
  42. m_length <- attr(m, "match.length")
  43. if (identical(as.vector(m[[1]]), 1L) && m_length == 0) {
  44. return(NULL)
  45. }
  46. x <- purrr::map2(m, m_length, ~ c(.x, .x + .y))
  47. x <- as.data.frame(do.call(rbind, x))
  48. names(x) <- c("start", "end")
  49. x$start <- ifelse(x$start == 0L, NA_integer_, x$start)
  50. x$end <- ifelse(x$end == 0L, NA_integer_, x$end)
  51. x$group <- 1:nrow(x) - 1L
  52. x$pass <- 1L
  53. x
  54. }
  55. max_match_index <- function(m) {
  56. purrr::map(m, "idx") %>%
  57. purrr::map_int(function(idx) {
  58. if (!is.null(idx)) {
  59. max(idx$start, idx$end, na.rm = TRUE)
  60. } else {
  61. NA
  62. }
  63. })
  64. }
  65. #' Wrap matches in HTML span tags to colorize via CSS
  66. #'
  67. #' @param x Individual list item in list returned by [regex()]
  68. #' @inheritParams view_regex
  69. #' @keywords internal
  70. wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  71. if (is.null(x$idx[[1]])) {
  72. return(if (escape) escape_html(x$text) else x$text)
  73. }
  74. text <- x$text
  75. inserts <- x$idx
  76. inserts$class <- sprintf("group g%02d", inserts$group)
  77. inserts$pad <- 0L
  78. names(inserts)[which(names(inserts) == "group")] <- "i"
  79. for (j in seq_len(nrow(inserts))) {
  80. if (inserts$i[j] == 0) next
  81. if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
  82. overlap <- filter(
  83. inserts[1:(j - 1), ],
  84. .data$i != 0,
  85. .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j]
  86. )
  87. inserts[j, "pad"] <- inserts$pad[j] + nrow(overlap)
  88. }
  89. inserts <- dplyr::bind_rows(
  90. inserts %>% select(-.data$end, dplyr::everything(), loc = .data$start) %>% mutate(type = "start"),
  91. inserts %>% select(-.data$start, dplyr::everything(), loc = .data$end) %>% mutate(type = "end")
  92. ) %>%
  93. filter(!is.na(.data$loc)) %>%
  94. dplyr::arrange(loc, class, dplyr::desc(type)) %>%
  95. mutate(
  96. class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
  97. insert = ifelse(.data$type == "start", sprintf('<span class="%s">', .data$class), "</span>")
  98. )
  99. inserts <- if (max(inserts$pass) == 1) {
  100. collapse_span_inserts(inserts)
  101. } else {
  102. split(inserts, inserts$pass) %>%
  103. purrr::map(collapse_span_inserts) %>%
  104. dplyr::bind_rows() %>%
  105. group_by(.data$loc, .data$type) %>%
  106. summarize(insert = paste(.data$insert, collapse = "")) %>%
  107. dplyr::ungroup()
  108. }
  109. # inserts now gives html (span open and close) to insert and loc
  110. # first split text at inserts$loc locations,
  111. # then recombine by zipping with inserts$insert text
  112. # start at 0, unless there's a hit on first character
  113. # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  114. idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  115. if (!(nchar(text) + 1) %in% idx_split) {
  116. idx_split <- c(idx_split, nchar(text) + 1)
  117. }
  118. text_split <- c()
  119. for (k in seq_along(idx_split[-1])) {
  120. text_split <- c(text_split, substr(text, idx_split[k], idx_split[k + 1] - 1))
  121. }
  122. out <- c()
  123. for (j in seq_along(text_split)) {
  124. out <- c(
  125. out,
  126. ifelse(escape, escape_html(text_split[j]), text_split[j]),
  127. if (!is.na(inserts$insert[j])) inserts$insert[j]
  128. )
  129. }
  130. if (exact) out <- escape_backslash(out)
  131. paste(out, collapse = "")
  132. }
  133. collapse_span_inserts <- function(inserts) {
  134. inserts_g0 <- filter(inserts, class == "group g00")
  135. inserts_other <- filter(inserts, class != "group g00")
  136. dplyr::bind_rows(
  137. filter(inserts_g0, type == "start"),
  138. inserts_other,
  139. filter(inserts_g0, type == "end")
  140. ) %>%
  141. mutate(type = sprintf("%05d%s", dplyr::row_number(), type)) %>%
  142. group_by(.data$loc, .data$type) %>%
  143. summarize(insert = paste(.data$insert, collapse = "")) %>%
  144. dplyr::ungroup() %>%
  145. mutate(type = sub("^\\d{5}", "", type))
  146. }
  147. #' Wraps capture groups in regex pattern in span tags to colorize with CSS
  148. #'
  149. #' @inheritParams view_regex
  150. #' @keywords internal
  151. wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  152. stopifnot(length(pattern) == 1)
  153. if (escape) pattern <- escape_html(pattern)
  154. # 1. walk characters in pattern
  155. # 2. if current is open parens
  156. # 1. walk backwards, counting number of "\\" until first non-"\\" char
  157. # 2. If odd, then not an opening group
  158. # 3. Look forward, if followed by "?" then not a capturing group
  159. # 4. If capturing group then add opening "<span...>(" to out and
  160. # add TRUE for valid capture group to parens stack
  161. # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  162. # 3. if close parens, add closing "</span>" to out
  163. out <- c()
  164. paren_stack <- c()
  165. group <- 0
  166. pattern_chars <- strsplit(pattern, "")[[1]]
  167. for (i in seq_along(pattern_chars)) {
  168. is_capture_group <- FALSE
  169. if (pattern_chars[i] == "(") {
  170. backslash_count <- 0
  171. if (i != 1) {
  172. j <- i - 1
  173. while (pattern_chars[j] == "\\" && j > 0) {
  174. backslash_count <- backslash_count + 1
  175. j <- j - 1
  176. }
  177. }
  178. if (backslash_count %% 2 == 0) {
  179. if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
  180. is_capture_group <- TRUE
  181. }
  182. }
  183. if (is_capture_group) {
  184. group <- group + 1
  185. paren_stack <- c(TRUE, paren_stack) # push
  186. out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
  187. } else {
  188. paren_stack <- c(FALSE, paren_stack) # push
  189. out <- c(out, "(")
  190. }
  191. } else if (pattern_chars[i] == ")") {
  192. closes_capture_group <- paren_stack[1]
  193. paren_stack <- paren_stack[-1] # pop
  194. if (closes_capture_group) {
  195. out <- c(out, ")</span>")
  196. } else {
  197. out <- c(out, ")")
  198. }
  199. } else {
  200. out <- c(out, pattern_chars[i])
  201. }
  202. }
  203. if (exact) out <- escape_backslash(out)
  204. paste(out, collapse = "")
  205. }
  206. #' View grouped regex results
  207. #'
  208. #' View the result of the regular expression when applied to the given text.
  209. #' The default behavior renders the result as HTML and opens the file in
  210. #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
  211. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
  212. #'
  213. #' @examples
  214. #' view_regex("example", "amp", render = FALSE)
  215. #' @param text Text to search
  216. #' @param pattern Regex pattern to look for
  217. #' @param render Render results as HTML?
  218. #' @param escape Escape HTML-related characters in `text`?
  219. #' @param exact Should the regex pattern be displayed as entered by the user
  220. #' into R console or source (default)? When `TRUE`, regex is displayed with
  221. #' the double `\\\\` required for escaping backslashes in R. When `FALSE`,
  222. #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
  223. #' as a single `\\`).
  224. #' @param result_only Should only the result be displayed? If `FALSE`, then
  225. #' the colorized regular expression is also displayed in the output.
  226. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  227. #' @export
  228. view_regex <- function(
  229. text,
  230. pattern,
  231. ...,
  232. render = TRUE,
  233. escape = render,
  234. exact = escape,
  235. result_only = FALSE
  236. ) {
  237. knitr <- isTRUE(getOption("knitr.in.progress"))
  238. if (knitr) {
  239. render <- FALSE
  240. escape <- TRUE
  241. }
  242. regex_opts <- deprecate_knitr_option(...)
  243. regex_opts$text <- text
  244. regex_opts$pattern <- pattern
  245. res <- do.call(regex, regex_opts)
  246. res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  247. res <- purrr::map_chr(res, function(resi) {
  248. result_pad <- ""
  249. if (grepl("pad\\d{2}", resi)) {
  250. max_pad <- max(stringi::stri_extract_all_regex(resi, "pad\\d{2}")[[1]])
  251. max_pad_level <- as.integer(stringi::stri_extract_all_regex(max_pad, "\\d{2}"))
  252. if (max_pad_level - 3 > 0) {
  253. result_pad <- sprintf("pad%02d", max_pad_level - 3)
  254. }
  255. }
  256. paste('<p class="regexplain', result_pad, '">', resi, "</p>")
  257. })
  258. res <- paste(res, collapse = "")
  259. if (!nchar(pattern)) res <- paste("<p class='regexplain'>", text, "</p>")
  260. if (knitr) {
  261. return(
  262. htmltools::tagList(
  263. htmltools::HTML(res),
  264. regexplain_dependencies(full = FALSE)
  265. )
  266. )
  267. }
  268. if (!render) {
  269. return(res)
  270. }
  271. page <- result_page(wrap_regex(pattern, escape, exact), res, "View Regex")
  272. htmltools::browsable(page)
  273. }
  274. deprecate_knitr_option <- function(...) {
  275. regex_opts <- list(...)
  276. if ("knitr" %in% names(regex_opts)) {
  277. warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.")
  278. }
  279. regex_opts[setdiff(names(regex_opts), "knitr")]
  280. }
  281. result_page <- function(pattern, result, title = NULL) {
  282. pattern <- htmltools::HTML(pattern)
  283. result <- htmltools::HTML(result)
  284. page_title <- title
  285. htmltools::tagList(
  286. htmltools::div(
  287. class = "regexplain__result",
  288. if (!is.null(title)) htmltools::tags$head(htmltools::tags$title(page_title)),
  289. htmltools::h2("Pattern"),
  290. htmltools::pre(class = "regexplain__pattern", pattern),
  291. htmltools::h2("Matches"),
  292. result,
  293. regexplain_dependencies(full = TRUE)
  294. )
  295. )
  296. }
  297. regexplain_dependencies <- function(full = TRUE) {
  298. htmltools::htmlDependency(
  299. name = "regexplain",
  300. version = packageVersion("regexplain"),
  301. package = "regexplain",
  302. src = "styles",
  303. stylesheet = c(
  304. "groups.css",
  305. if (full) c("skeleton.css", "view_regex.css")
  306. ),
  307. all_files = FALSE
  308. )
  309. }