🔍 An RStudio addin slash regex utility belt
Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

326 lines
10KB

  1. #' Extract matched groups from regexp
  2. #'
  3. #' @param text Text to search
  4. #' @param pattern regexp
  5. #' @param global If `TRUE`, enables global pattern matching
  6. #' @inheritParams base::regexec
  7. regex <- function(
  8. text,
  9. pattern,
  10. ignore.case = FALSE,
  11. perl = FALSE,
  12. fixed = FALSE,
  13. useBytes = FALSE,
  14. global = TRUE
  15. ) {
  16. # Use regex to get matches by group, gives start index and length
  17. m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes)
  18. m <- purrr::map2(text, m, ~ list(text = .x, idx = expand_matches(.y)))
  19. attr(m, "global") <- global
  20. if (!global) {
  21. return(m)
  22. }
  23. mmi <- max_match_index(m)
  24. if (any(!is.na(mmi))) {
  25. subtext <- purrr::map_chr(m, "text") %>% purrr::map2_chr(mmi, substring)
  26. sub_idx <- which(!is.na(subtext))
  27. m2 <- regex(subtext[sub_idx], pattern, ignore.case, perl, fixed, useBytes)
  28. for (i in seq_along(m2)) {
  29. if (is.null(m2[[i]]$idx[[1]])) next
  30. m_idx <- sub_idx[i]
  31. m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[m_idx] - 1L
  32. m2[[i]]$idx$pass <- m2[[i]]$idx$pass + 1L
  33. m[[m_idx]]$idx <- rbind(m[[m_idx]]$idx, m2[[i]]$idx)
  34. }
  35. }
  36. m
  37. }
  38. expand_matches <- function(m) {
  39. if (m[1] == -1) {
  40. return(NULL)
  41. }
  42. m_length <- attr(m, "match.length")
  43. if (identical(as.vector(m[[1]]), 1L) && m_length == 0) {
  44. return(NULL)
  45. }
  46. x <- purrr::map2(m, m_length, ~ c(.x, .x + .y))
  47. x <- as.data.frame(do.call(rbind, x))
  48. names(x) <- c("start", "end")
  49. x$start <- ifelse(x$start == 0L, NA_integer_, x$start)
  50. x$end <- ifelse(x$end == 0L, NA_integer_, x$end)
  51. x$group <- 1:nrow(x) - 1L
  52. x$pass <- 1L
  53. x
  54. }
  55. max_match_index <- function(m) {
  56. purrr::map(m, "idx") %>%
  57. purrr::map_int(function(idx) {
  58. if (!is.null(idx)) {
  59. max(idx$start, idx$end, na.rm = TRUE)
  60. } else {
  61. NA
  62. }
  63. })
  64. }
  65. #' Wrap matches in HTML span tags to colorize via CSS
  66. #'
  67. #' @param x Individual list item in list returned by [regex()]
  68. #' @inheritParams view_regex
  69. #' @keywords internal
  70. wrap_result <- function(x, escape = FALSE, exact = FALSE) {
  71. if (is.null(x$idx[[1]])) {
  72. return(if (escape) escape_html(x$text) else x$text)
  73. }
  74. text <- x$text
  75. inserts <- x$idx
  76. inserts$class <- sprintf("group g%02d", inserts$group)
  77. inserts$pad <- 0L
  78. names(inserts)[which(names(inserts) == "group")] <- "i"
  79. for (j in seq_len(nrow(inserts))) {
  80. if (inserts$i[j] == 0) next
  81. if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next
  82. overlap <- filter(
  83. inserts[1:(j - 1), ],
  84. .data$i != 0,
  85. .data$start <= !!inserts$start[j] & .data$end >= !!inserts$end[j]
  86. )
  87. inserts[j, "pad"] <- inserts$pad[j] + nrow(overlap)
  88. }
  89. inserts <- dplyr::bind_rows(
  90. inserts %>% select(-.data$end, dplyr::everything(), loc = .data$start) %>% mutate(type = "start"),
  91. inserts %>% select(-.data$start, dplyr::everything(), loc = .data$end) %>% mutate(type = "end")
  92. ) %>%
  93. filter(!is.na(.data$loc)) %>%
  94. dplyr::arrange(loc, class, dplyr::desc(type)) %>%
  95. mutate(
  96. class = ifelse(.data$pad > 0, sprintf("%s pad%02d", .data$class, .data$pad), .data$class),
  97. insert = ifelse(.data$type == "start", sprintf('<span class="%s">', .data$class), "</span>")
  98. )
  99. inserts <- if (max(inserts$pass) == 1) {
  100. collapse_span_inserts(inserts)
  101. } else {
  102. split(inserts, inserts$pass) %>%
  103. purrr::map(collapse_span_inserts) %>%
  104. dplyr::bind_rows() %>%
  105. group_by(.data$loc, .data$type) %>%
  106. summarize(insert = paste(.data$insert, collapse = "")) %>%
  107. dplyr::ungroup()
  108. }
  109. # inserts now gives html (span open and close) to insert and loc
  110. # first split text at inserts$loc locations,
  111. # then recombine by zipping with inserts$insert text
  112. # start at 0, unless there's a hit on first character
  113. # end at nchar(text) + 1 because window is idx[k] to idx[k+1]-1
  114. idx_split <- c(0 - (inserts$loc[1] == 0), inserts$loc)
  115. if (!(nchar(text) + 1) %in% idx_split) {
  116. idx_split <- c(idx_split, nchar(text) + 1)
  117. }
  118. text_split <- c()
  119. for (k in seq_along(idx_split[-1])) {
  120. text_split <- c(text_split, substr(text, idx_split[k], idx_split[k + 1] - 1))
  121. }
  122. out <- c()
  123. for (j in seq_along(text_split)) {
  124. out <- c(
  125. out,
  126. ifelse(escape, escape_html(text_split[j]), text_split[j]),
  127. if (!is.na(inserts$insert[j])) inserts$insert[j]
  128. )
  129. }
  130. if (exact) out <- escape_backslash(out)
  131. paste(out, collapse = "")
  132. }
  133. collapse_span_inserts <- function(inserts) {
  134. inserts_g0 <- filter(inserts, class == "group g00")
  135. inserts_other <- filter(inserts, class != "group g00")
  136. dplyr::bind_rows(
  137. filter(inserts_g0, type == "start"),
  138. inserts_other,
  139. filter(inserts_g0, type == "end")
  140. ) %>%
  141. mutate(type = sprintf("%05d%s", dplyr::row_number(), type)) %>%
  142. group_by(.data$loc, .data$type) %>%
  143. summarize(insert = paste(.data$insert, collapse = "")) %>%
  144. dplyr::ungroup() %>%
  145. mutate(type = sub("^\\d{5}", "", type))
  146. }
  147. #' Wraps capture groups in regex pattern in span tags to colorize with CSS
  148. #'
  149. #' @inheritParams view_regex
  150. #' @keywords internal
  151. wrap_regex <- function(pattern, escape = TRUE, exact = TRUE) {
  152. stopifnot(length(pattern) == 1)
  153. if (escape) pattern <- escape_html(pattern)
  154. # 1. walk characters in pattern
  155. # 2. if current is open parens
  156. # 1. walk backwards, counting number of "\\" until first non-"\\" char
  157. # 2. If odd, then not an opening group
  158. # 3. Look forward, if followed by "?" then not a capturing group
  159. # 4. If capturing group then add opening "<span...>(" to out and
  160. # add TRUE for valid capture group to parens stack
  161. # 5. If non-capturing group, add "(" to out and FALSE for non-valid to paren stack
  162. # 3. if close parens, add closing "</span>" to out
  163. out <- c()
  164. paren_stack <- c()
  165. group <- 0
  166. pattern_chars <- strsplit(pattern, "")[[1]]
  167. for (i in seq_along(pattern_chars)) {
  168. is_capture_group <- FALSE
  169. if (pattern_chars[i] == "(") {
  170. backslash_count <- 0
  171. if (i != 1) {
  172. j <- i - 1
  173. while (pattern_chars[j] == "\\" && j > 0) {
  174. backslash_count <- backslash_count + 1
  175. j <- j - 1
  176. }
  177. }
  178. if (backslash_count %% 2 == 0) {
  179. if (i != length(pattern_chars) && pattern_chars[i + 1] != "?") {
  180. is_capture_group <- TRUE
  181. }
  182. }
  183. if (is_capture_group) {
  184. group <- group + 1
  185. paren_stack <- c(TRUE, paren_stack) # push
  186. out <- c(out, paste0('<span class="g', sprintf("%02d", group), '">('))
  187. } else {
  188. paren_stack <- c(FALSE, paren_stack) # push
  189. out <- c(out, "(")
  190. }
  191. } else if (pattern_chars[i] == ")") {
  192. closes_capture_group <- paren_stack[1]
  193. paren_stack <- paren_stack[-1] # pop
  194. if (closes_capture_group) {
  195. out <- c(out, ")</span>")
  196. } else {
  197. out <- c(out, ")")
  198. }
  199. } else {
  200. out <- c(out, pattern_chars[i])
  201. }
  202. }
  203. if (exact) out <- escape_backslash(out)
  204. paste(out, collapse = "")
  205. }
  206. #' View grouped regex results
  207. #'
  208. #' View the result of the regular expression when applied to the given text.
  209. #' The default behavior renders the result as HTML and opens the file in
  210. #' the RStudio viewer pane. If `render` is `FALSE`, the HTML itself is returned.
  211. #' If the output is destined for a [knitr] document, set `knitr` to `TRUE`.
  212. #'
  213. #' @examples
  214. #' view_regex("example", "amp", render = FALSE)
  215. #' @param text Text to search
  216. #' @param pattern Regex pattern to look for
  217. #' @param render Render results as HTML?
  218. #' @param escape Escape HTML-related characters in `text`?
  219. #' @param exact Should the regex pattern be displayed as entered by the user
  220. #' into R console or source (default)? When `TRUE`, regex is displayed with
  221. #' the double `\\\\` required for escaping backslashes in R. When `FALSE`,
  222. #' regex is displayed as interpreted by the regex engine (i.e. double `\\\\`
  223. #' as a single `\\`).
  224. #' @param result_only Should only the result be displayed? If `FALSE`, then
  225. #' the colorized regular expression is also displayed in the output.
  226. #' @inheritDotParams base::regexec ignore.case perl fixed useBytes
  227. #' @export
  228. view_regex <- function(
  229. text,
  230. pattern,
  231. ...,
  232. render = TRUE,
  233. escape = render,
  234. exact = escape,
  235. result_only = FALSE
  236. ) {
  237. knitr <- isTRUE(getOption("knitr.in.progress"))
  238. if (knitr) {
  239. render <- FALSE
  240. escape <- TRUE
  241. }
  242. regex_opts <- deprecate_knitr_option(...)
  243. regex_opts$text <- text
  244. regex_opts$pattern <- pattern
  245. res <- do.call(regex, regex_opts)
  246. res <- purrr::map_chr(res, wrap_result, escape = escape, exact = exact)
  247. res <- purrr::map_chr(res, function(resi) {
  248. result_pad <- ""
  249. if (grepl("pad\\d{2}", resi)) {
  250. max_pad <- max(stringi::stri_extract_all_regex(resi, "pad\\d{2}")[[1]])
  251. max_pad_level <- as.integer(stringi::stri_extract_all_regex(max_pad, "\\d{2}"))
  252. if (max_pad_level - 3 > 0) {
  253. result_pad <- sprintf("pad%02d", max_pad_level - 3)
  254. }
  255. }
  256. paste('<p class="regexplain', result_pad, '">', resi, "</p>")
  257. })
  258. res <- paste(res, collapse = "")
  259. if (!nchar(pattern)) res <- paste("<p class='regexplain'>", text, "</p>")
  260. if (knitr) {
  261. return(
  262. htmltools::tagList(
  263. htmltools::HTML(res),
  264. regexplain_dependencies(full = FALSE)
  265. )
  266. )
  267. }
  268. if (!render) {
  269. return(res)
  270. }
  271. page <- result_page(wrap_regex(pattern, escape, exact), res, "View Regex")
  272. htmltools::browsable(page)
  273. }
  274. deprecate_knitr_option <- function(...) {
  275. regex_opts <- list(...)
  276. if ("knitr" %in% names(regex_opts)) {
  277. warning("The `knitr` parameter of `view_regex()` has been removed. Running `view_regex()` in R Markdown is automatically detected.")
  278. }
  279. regex_opts[setdiff(names(regex_opts), "knitr")]
  280. }
  281. result_page <- function(pattern, result, title = NULL) {
  282. pattern <- htmltools::HTML(pattern)
  283. result <- htmltools::HTML(result)
  284. page_title <- title
  285. htmltools::tagList(
  286. htmltools::div(
  287. class = "regexplain__result",
  288. if (!is.null(title)) htmltools::tags$head(htmltools::tags$title(page_title)),
  289. htmltools::h2("Pattern"),
  290. htmltools::pre(class = "regexplain__pattern", pattern),
  291. htmltools::h2("Matches"),
  292. result,
  293. regexplain_dependencies(full = TRUE)
  294. )
  295. )
  296. }
  297. regexplain_dependencies <- function(full = TRUE) {
  298. htmltools::htmlDependency(
  299. name = "regexplain",
  300. version = packageVersion("regexplain"),
  301. package = "regexplain",
  302. src = "styles",
  303. stylesheet = c(
  304. "groups.css",
  305. if (full) c("skeleton.css", "view_regex.css")
  306. ),
  307. all_files = FALSE
  308. )
  309. }