| @@ -11,7 +11,7 @@ run_regex <- function( | |||
| perl = FALSE, | |||
| fixed = FALSE, | |||
| useBytes = FALSE, | |||
| global = FALSE | |||
| global = TRUE | |||
| ) { | |||
| # Use regex to get matches by group, gives start index and length | |||
| m <- regexec(pattern, text, ignore.case, perl, fixed, useBytes) | |||
| @@ -27,8 +27,8 @@ run_regex <- function( | |||
| m2 <- run_regex(subtext, pattern, ignore.case, perl, fixed, useBytes) | |||
| for (i in seq_along(m2)) { | |||
| if (is.null(m2[[i]]$idx[[1]])) next | |||
| m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[i] - 1 | |||
| m[[i]]$idx <- dplyr::bind_rows(m[[i]]$idx, m2[[i]]$idx) | |||
| m2[[i]]$idx[, c(1, 2)] <- m2[[i]]$idx[, c(1, 2)] + mmi[i] - 1L | |||
| m[[i]]$idx <- rbind(m[[i]]$idx, m2[[i]]$idx) | |||
| } | |||
| } | |||
| m | |||
| @@ -43,7 +43,7 @@ expand_matches <- function(m) { | |||
| x$start <- ifelse(x$start == 0L, NA_integer_, x$start) | |||
| x$end <- ifelse(x$end == 0L, NA_integer_, x$end) | |||
| x$group <- 1:nrow(x) - 1L | |||
| dplyr::as_tibble(x, validate = FALSE) | |||
| x | |||
| } | |||
| max_match_index <- function(m) { | |||
| @@ -64,20 +64,12 @@ max_match_index <- function(m) { | |||
| wrap_result <- function(x, escape = FALSE, exact = FALSE) { | |||
| if (is.null(x$idx[[1]])) return(if (escape) escape_html(x$text) else x$text) | |||
| text <- x$text | |||
| idx <- x$idx | |||
| len_idx <- length(idx) | |||
| inserts <- data.frame( | |||
| i = 1:len_idx - 1, | |||
| start = purrr::map_int(idx, ~ .[1]), | |||
| end = purrr::map_int(idx, ~ .[2]) | |||
| ) %>% | |||
| mutate( | |||
| # unmatched groups have start/end of zero | |||
| start = ifelse(.data$start == 0, NA, .data$start), | |||
| end = ifelse(.data$end == 0, NA, .data$end), | |||
| class = sprintf("group g%02d", .data$i), | |||
| pad = 0 | |||
| ) | |||
| inserts <- x$idx | |||
| inserts$class <- sprintf("group g%02d", inserts$group) | |||
| inserts$pad <- 0L | |||
| names(inserts)[which(names(inserts) == "group")] <- "i" | |||
| for (j in seq_len(nrow(inserts))) { | |||
| if (inserts$i[j] == 0) next | |||
| if (is.na(inserts$start[j]) || is.na(inserts$end[j])) next | |||
| @@ -5,7 +5,7 @@ | |||
| \title{Extract matched groups from regexp} | |||
| \usage{ | |||
| run_regex(text, pattern, ignore.case = FALSE, perl = FALSE, | |||
| fixed = FALSE, useBytes = FALSE, global = FALSE) | |||
| fixed = FALSE, useBytes = FALSE, global = TRUE) | |||
| } | |||
| \arguments{ | |||
| \item{text}{Text to search} | |||
| @@ -2,7 +2,7 @@ context("test-regex") | |||
| test_that("expand_matches gives data frame of indices with groups", { | |||
| m <- regexec("(a)(b)(d)?", "abcaba") | |||
| idx <- dplyr::data_frame( | |||
| idx <- data.frame( | |||
| start = c(1L, 1L, 2L, NA_integer_), | |||
| end = c(3L, 2L, 3L, NA_integer_), | |||
| group = c(0L, 1L, 2L, 3L) | |||
| @@ -10,6 +10,16 @@ test_that("expand_matches gives data frame of indices with groups", { | |||
| expect_equal(expand_matches(m[[1]]), idx) | |||
| }) | |||
| test_that("start/end indices are integers", { | |||
| text <- "ab ab" | |||
| pattern <- "(a)(b)" | |||
| m <- run_regex(text, pattern, global = TRUE) | |||
| expect_is(m[[1]]$idx$start, "integer") | |||
| expect_is(m[[1]]$idx$end, "integer") | |||
| expect_is(m[[1]]$idx$group, "integer") | |||
| }) | |||
| test_that("max_match_index works", { | |||
| m <- run_regex(c("abcaba", "aba", "z"), c("(a)(b)(d)?c?"), global = TRUE) | |||
| m <- run_regex(c("abcaba", "aba", "z"), c("(a)(b)(d)?c?"), global = FALSE) | |||
| expect_equal(max_match_index(m), c(4, 3, NA_integer_)) | |||
| }) | |||
| @@ -6,38 +6,37 @@ test_that("wrap_result handles zero length groups", { | |||
| # (?<=\()([^)]*)(?=\)) | |||
| text <- "Type 'q()' to quit R." | |||
| pattern <- "(?<=\\()([^)]*)(?=\\))" | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE)[[1]]) | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE, global = FALSE)[[1]]) | |||
| expect_equal(res, "Type 'q(<span class=\"group g00\"><span class=\"group g01\"></span></span>)' to quit R.") | |||
| }) | |||
| test_that("wrap_results generally works", { | |||
| text <- "apples" | |||
| pattern <- "apples" | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE)[[1]]) | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE, global = FALSE)[[1]]) | |||
| expect_equal(res, "<span class=\"group g00\">apples</span>") | |||
| text <- "He wheeled the bike past the winding road." | |||
| pattern <- "(a|the) ([^ ]+)" | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE)[[1]]) | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE, global = FALSE)[[1]]) | |||
| expect_equal(res, "He wheeled <span class=\"group g00\"><span class=\"group g01\">the</span> <span class=\"group g02\">bike</span></span> past the winding road.") | |||
| text <- ".15in" | |||
| pattern <- "^(auto|inherit|((\\.\\d+)|(\\d+(\\.\\d+)?))(%|in|cm|mm|em|ex|pt|pc|px|vh|vw|vmin|vmax))$" | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE)[[1]]) | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE, global = FALSE)[[1]]) | |||
| expect_equal(res, "<span class=\"group g00\"><span class=\"group g01\"><span class=\"group g02 pad01\"><span class=\"group g03 pad02\">.15</span></span><span class=\"group g06 pad01\">in</span></span></span>") | |||
| }) | |||
| test_that("wrap_results works when groups start and end at same index", { | |||
| text <- "7282298386" | |||
| pattern <- "\\(?(\\d{3})[-). ]?(\\d{3})[- .]?(\\d{4})" | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE)[[1]]) | |||
| res <- wrap_result(run_regex(text, pattern, perl = TRUE, global = FALSE)[[1]]) | |||
| expect_equal(res, "<span class=\"group g00\"><span class=\"group g01\">728</span><span class=\"group g02\">229</span><span class=\"group g03\">8386</span></span>") | |||
| }) | |||
| test_that("wrap_regex searches globally", { | |||
| testthat::skip("Global search test not ready yet") | |||
| text <- "ab ab" | |||
| pattern <- "(a)(b)" | |||
| result <- paste(rep("<span class=\"group g00\"><span class=\"group g01\">a</span><span class=\"group g02\">b</span></span>", 2), collapse = " ") | |||
| expect_equal(wrap_result(run_regex(text, pattern)[[1]]), result) | |||
| expect_equal(wrap_result(run_regex(text, pattern, global = TRUE)[[1]]), result) | |||
| }) | |||