and move octals to hex group (yeah, I know)

8 лет назад · e102d1391d
--- a/R/regex_gadget.R
+++ b/R/regex_gadget.R
@@ -380,7 +380,11 @@ regex_gadget <- function(text = NULL,

 sanitize_text_input <- function(x) {
  if (is.null(x) || !nchar(x)) return(x)
  if (grepl("\\u[0-9a-f]{4,8}|\\x[0-9a-f]{2}|\\x\\{[0-9a-f]{1,6}\\}|\\N|\\0[0-8]{1,3}", x)) {
  rx_unicode <- "\\u[0-9a-f]{4,8}"
  rx_hex <- "\\\\x[0-9a-f]{2}|\\\\x\\{[0-9a-f]{1,6}\\}"
  rx_octal <- "\\\\[0][0-7]{1,3}"
  rx_escape <- paste(rx_unicode, rx_hex, rx_octal, sep = "|")
  if (grepl(rx_escape, x, ignore.case = TRUE)) {
    try({
      y <- stringi::stri_unescape_unicode(x)
    }, silent = TRUE)
--- a/R/sysdata.rda
+++ b/R/sysdata.rda
--- a/data-raw/cheatsheet.R
+++ b/data-raw/cheatsheet.R
@@ -3,7 +3,7 @@ cheatsheet <- tibble::tribble(
  "character classes", "regular", "<code>.</code>", "any character except newline",
  "character classes", "regular", "<code>\\w</code> <code>\\d</code> <code>\\s</code>", "word, digit, whitespace",
  "character classes", "regular", "<code>\\W</code> <code>\\D</code> <code>\\S</code>", "not word, digit, whitespace",
  # "character classes", "regular", "<code>\\p{property name}</code>", "matches character with unicode property, like <code>\\p{Uppercase}</code>, see <a href=\"http://www.unicode.org/reports/tr44/#Property_Index.\">unicode property list</a>.",
  "character classes", "regular", "<code>\\p{property name}</code>", "matches character with unicode property, like <code>\\p{Uppercase}</code>, see <a href=\"http://www.unicode.org/reports/tr44/#Property_Index.\">unicode property list</a> (not supported by <em>regexplain</em>).",
  "character classes", "regular", "<code>[abc]</code>", "any of a, b or c",
  "character classes", "regular", "<code>[^abc]</code>", "not a, b, or c",
  "character classes", "regular", "<code>[a-g]</code> <code>[1-3]</code>", "character between a & g or 1 & 3",
@@ -38,12 +38,12 @@ cheatsheet <- tibble::tribble(
  "escaped characters", "hex", "<code>\\x{hhhh}</code>", "1-6 hex digits",
  "escaped characters", "hex", "<code>\\uhhhh</code>", "4 hex digitis",
  "escaped characters", "hex", "<code>\\Uhhhhhhhh</code>", "8 hex digits",
  "escaped characters", "hex", "<code>\\N{name}</code>", "Name of unicode character, e.g. <code>\\N{grinning face}</code>",
  "escaped characters", "hex", "<code>\\N{name}</code>", "Name of unicode character, e.g. <code>\\N{grinning face}</code> (not supported by <em>regexplain</em>)",
  "escaped characters", "hex", "<code>\\0ooo</code>", "octal character where \"ooo\" is 1-3 octal digits",
  "escaped characters", "control characters", "<code>\\a</code>", "bell",
  "escaped characters", "control characters", "<code>\\cX</code>", "match a control-X character",
  "escaped characters", "control characters", "<code>\\e</code>", "escape (<code>\\u001B</code>)",
  "escaped characters", "control characters", "<code>\\f</code>", "form feed (<code>\\u000C</code>)",
  "escaped characters", "control characters", "<code>\\0ooo</code>", "octal character where \"ooo\" is 1-3 octal digits",
  "groups", NA, "<code>(abc)</code>", "capture group",
  "groups", NA, "<code>\\1</code>, <code>\\2</code>, <code>\\3</code> ...", "backreference to group 1, group 2, etc.",
  "groups", NA, "<code>(?:abc)</code>", "non-capturing group, e.g. <code>\"gr(?:e|a)y\")</code>",
--- a/tests/testthat/test-sanitize_text_input.R
+++ b/tests/testthat/test-sanitize_text_input.R
@@ -0,0 +1,32 @@
 # Unit tests for sanitize_text_input(): each test_that() block below feeds it
 # a literal string and compares the returned value with expect_equal().
 context("test-sanitize_text_input.R")

 test_that("sanitizes backreferences properly", {
   # Numbered backreferences (backslash + digit 1-3) must come back unchanged.
   backrefs <- "\\1 \\2 \\3"
   expect_equal(sanitize_text_input(backrefs), backrefs)
 })

 test_that("sanitizes unicode", {
   # Literal backslash-u text followed by hex digits becomes the character
   # with that code point.
   quote_input <- "\\u2019"
   quote_expected <- "\u2019"
   expect_equal(sanitize_text_input(quote_input), quote_expected)

   # Upper-case hex digits are accepted as well (U+000D is carriage return).
   cr_input <- "\\u000D"
   cr_expected <- "\r"
   expect_equal(sanitize_text_input(cr_input), cr_expected)
 })

 test_that("sanitizes hex", {
   # Two-digit form: backslash-x followed by exactly two hex digits.
   two_digit_input <- "\\x0D"
   expect_equal(sanitize_text_input(two_digit_input), "\r")

   # Braced form: backslash-x followed by hex digits wrapped in { }.
   braced_input <- "\\x{20AC}"
   expect_equal(sanitize_text_input(braced_input), "\u20AC")
 })

 test_that("sanitizes octal", {
   # Literal backslash-zero followed by an octal digit becomes the character
   # with that octal code.
   octal_input <- "\\02"
   octal_expected <- "\002"
   expect_equal(sanitize_text_input(octal_input), octal_expected)
 })

 test_that("doesn't escape normal letters", {
   # Inputs that merely resemble the start of an escape sequence must be
   # returned exactly as they were given.

   # The letter "a": plain, as an actual bell character, and as literal
   # backslash + a.
   plain_a <- "a"
   expect_equal(sanitize_text_input(plain_a), plain_a)
   bell_char <- "\a"
   expect_equal(sanitize_text_input(bell_char), bell_char)
   backslash_a <- "\\a"
   expect_equal(sanitize_text_input(backslash_a), backslash_a)

   # The letter "x": plain, and as literal backslash + x with no hex digits.
   plain_x <- "x"
   expect_equal(sanitize_text_input(plain_x), plain_x)
   # Left disabled, as in the original test file:
   #expect_error(sanitize_text_input("\x"))
   backslash_x <- "\\x"
   expect_equal(sanitize_text_input(backslash_x), backslash_x)
 })