Просмотр исходного кода

Fix detection of octal characters in sanitize_text_input

and move octals to hex group (yeah, I know)
tags/v0.2.0
Garrick Aden-Buie 8 лет назад
Родитель
Commit
e102d1391d
4 измененных файлов: 40 добавлений и 4 удалений
  1. +5
    -1
      R/regex_gadget.R
  2. Двоичные данные
      R/sysdata.rda
  3. +3
    -3
      data-raw/cheatsheet.R
  4. +32
    -0
      tests/testthat/test-sanitize_text_input.R

+ 5
- 1
R/regex_gadget.R Просмотреть файл

@@ -380,7 +380,11 @@ regex_gadget <- function(text = NULL,

sanitize_text_input <- function(x) {
if (is.null(x) || !nchar(x)) return(x)
if (grepl("\\u[0-9a-f]{4,8}|\\x[0-9a-f]{2}|\\x\\{[0-9a-f]{1,6}\\}|\\N|\\0[0-8]{1,3}", x)) {
rx_unicode <- "\\u[0-9a-f]{4,8}"
rx_hex <- "\\\\x[0-9a-f]{2}|\\\\x\\{[0-9a-f]{1,6}\\}"
rx_octal <- "\\\\[0][0-7]{1,3}"
rx_escape <- paste(rx_unicode, rx_hex, rx_octal, sep = "|")
if (grepl(rx_escape, x, ignore.case = TRUE)) {
try({
y <- stringi::stri_unescape_unicode(x)
}, silent = TRUE)

Двоичные данные
R/sysdata.rda Просмотреть файл


+ 3
- 3
data-raw/cheatsheet.R Просмотреть файл

@@ -3,7 +3,7 @@ cheatsheet <- tibble::tribble(
"character classes", "regular", "<code>.</code>", "any character except newline",
"character classes", "regular", "<code>\\w</code> <code>\\d</code> <code>\\s</code>", "word, digit, whitespace",
"character classes", "regular", "<code>\\W</code> <code>\\D</code> <code>\\S</code>", "not word, digit, whitespace",
# "character classes", "regular", "<code>\\p{property name}</code>", "matches character with unicode property, like <code>\\p{Uppercase}</code>, see <a href=\"http://www.unicode.org/reports/tr44/#Property_Index.\">unicode property list</a>.",
"character classes", "regular", "<code>\\p{property name}</code>", "matches character with unicode property, like <code>\\p{Uppercase}</code>, see <a href=\"http://www.unicode.org/reports/tr44/#Property_Index.\">unicode property list</a> (not supported by <em>regexplain</em>).",
"character classes", "regular", "<code>[abc]</code>", "any of a, b or c",
"character classes", "regular", "<code>[^abc]</code>", "not a, b, or c",
"character classes", "regular", "<code>[a-g]</code> <code>[1-3]</code>", "character between a & g or 1 & 3",
@@ -38,12 +38,12 @@ cheatsheet <- tibble::tribble(
"escaped characters", "hex", "<code>\\x{hhhh}</code>", "1-6 hex digits",
"escaped characters", "hex", "<code>\\uhhhh</code>", "4 hex digits",
"escaped characters", "hex", "<code>\\Uhhhhhhhh</code>", "8 hex digits",
"escaped characters", "hex", "<code>\\N{name}</code>", "Name of unicode character, e.g. <code>\\N{grinning face}</code>",
"escaped characters", "hex", "<code>\\N{name}</code>", "Name of unicode character, e.g. <code>\\N{grinning face}</code> (not supported by <em>regexplain</em>)",
"escaped characters", "hex", "<code>\\0ooo</code>", "octal character where \"ooo\" is 1-3 octal digits",
"escaped characters", "control characters", "<code>\\a</code>", "bell",
"escaped characters", "control characters", "<code>\\cX</code>", "match a control-X character",
"escaped characters", "control characters", "<code>\\e</code>", "escape (<code>\\u001B</code>)",
"escaped characters", "control characters", "<code>\\f</code>", "form feed (<code>\\u000C</code>)",
"escaped characters", "control characters", "<code>\\0ooo</code>", "octal character where \"ooo\" is 1-3 octal digits",
"groups", NA, "<code>(abc)</code>", "capture group",
"groups", NA, "<code>\\1</code>, <code>\\2</code>, <code>\\3</code> ...", "backreference to group 1, group 2, etc.",
"groups", NA, "<code>(?:abc)</code>", "non-capturing group, e.g. <code>\"gr(?:e|a)y\")</code>",

+ 32
- 0
tests/testthat/test-sanitize_text_input.R Просмотреть файл

@@ -0,0 +1,32 @@
# Tests for sanitize_text_input(): escape sequences typed as literal text
# (unicode, hex, octal) versus input that is expected back unchanged.
context("test-sanitize_text_input.R")

test_that("sanitizes backreferences properly", {
  # Numbered backreferences are expected to come back exactly as typed.
  backrefs <- "\\1 \\2 \\3"
  expect_equal(sanitize_text_input(backrefs), backrefs)
})

test_that("sanitizes unicode", {
  # A backslash-u escape typed as literal text should be turned into the
  # character it names.
  right_quote <- sanitize_text_input("\\u2019")
  expect_equal(right_quote, "\u2019")

  carriage_return <- sanitize_text_input("\\u000D")
  expect_equal(carriage_return, "\r")
})

test_that("sanitizes hex", {
  # Two-digit form: backslash-x followed by exactly two hex digits.
  two_digit <- sanitize_text_input("\\x0D")
  expect_equal(two_digit, "\r")

  # Braced form: backslash-x followed by hex digits inside curly braces.
  braced <- sanitize_text_input("\\x{20AC}")
  expect_equal(braced, "\u20AC")
})

test_that("sanitizes octal", {
  # A backslash, a leading zero and octal digits typed as literal text should
  # become the corresponding control character.
  octal_two <- sanitize_text_input("\\02")
  expect_equal(octal_two, "\002")
})

test_that("doesn't escape normal letters", {
  # Plain letters, a real bell character, and a backslash followed by a bare
  # letter must all pass through untouched.
  # NOTE: a lone "\x" with no hex digits is not covered here; R rejects that
  # string literal when parsing, so it cannot be written as a test input.
  unchanged <- c("a", "\a", "\\a", "x", "\\x")
  for (input in unchanged) {
    expect_equal(sanitize_text_input(input), input)
  }
})

Загрузка…
Отмена
Сохранить