503 lines
16 KiB
Rust
503 lines
16 KiB
Rust
use std::str::pattern::*;
|
|
|
|
// Test helper for driving a searcher through a fixed script of calls.
//
// Builds a searcher for `$needle` over `$haystack`, invokes every listed
// searcher method once (in the order given), converts each return value into
// a `Step`, and asserts that the collected steps equal `$result`.
// `$testname` is handed to `assert_eq!` as the failure message.
macro_rules! search_asserts {
    ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => {{
        let mut cursor = $needle.into_searcher($haystack);
        let observed = [$(Step::from(cursor.$func())),*];
        assert_eq!(&observed[..], &$result, $testname);
    }};
}
|
|
|
|
/// Single result type for every searcher call used in these tests, so that
/// the outcomes of `next()` and of `next_match()`/`next_reject()` can be
/// collected into one array and compared in one assertion.
#[derive(PartialEq, Eq, Debug)]
enum Step {
    // All variant names that carry indices have the same length on purpose:
    // tables of expected steps then line up in columns.
    /// Converted from `SearchStep::Match`.
    Matches(usize, usize),
    /// Converted from `SearchStep::Reject`.
    Rejects(usize, usize),
    /// Converted from a `Some((a, b))` result.
    InRange(usize, usize),
    /// Converted from `SearchStep::Done` or from `None`.
    Done,
}
|
|
|
|
use self::Step::*;
|
|
|
|
/// Translates the result of `next()`/`next_back()` into a `Step`:
/// `Match` becomes `Matches`, `Reject` becomes `Rejects`, and `Done` stays
/// `Done`. Both indices are carried over unchanged.
impl From<SearchStep> for Step {
    fn from(x: SearchStep) -> Self {
        match x {
            SearchStep::Done => Self::Done,
            SearchStep::Reject(start, end) => Self::Rejects(start, end),
            SearchStep::Match(start, end) => Self::Matches(start, end),
        }
    }
}
|
|
|
|
impl From<Option<(usize, usize)>> for Step {
|
|
fn from(x: Option<(usize, usize)>) -> Self {
|
|
match x {
|
|
Some((a, b)) => InRange(a, b),
|
|
None => Done,
|
|
}
|
|
}
|
|
}
|
|
|
|
// FIXME(Manishearth) these tests focus on single-character searching (CharSearcher)
|
|
// and on next()/next_match(), not next_reject(). This is because
|
|
// the memchr changes make next_match() for single chars complex, but next_reject()
|
|
// continues to use next() under the hood. We should add more test cases for all
|
|
// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc)
|
|
|
|
#[test]
|
|
fn test_simple_iteration() {
|
|
search_asserts!(
|
|
"abcdeabcd",
|
|
'a',
|
|
"forward iteration for ASCII string",
|
|
// a b c d e a b c d EOF
|
|
[next, next, next, next, next, next, next, next, next, next],
|
|
[
|
|
Matches(0, 1),
|
|
Rejects(1, 2),
|
|
Rejects(2, 3),
|
|
Rejects(3, 4),
|
|
Rejects(4, 5),
|
|
Matches(5, 6),
|
|
Rejects(6, 7),
|
|
Rejects(7, 8),
|
|
Rejects(8, 9),
|
|
Done
|
|
]
|
|
);
|
|
|
|
search_asserts!(
|
|
"abcdeabcd",
|
|
'a',
|
|
"reverse iteration for ASCII string",
|
|
// d c b a e d c b a EOF
|
|
[
|
|
next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back,
|
|
next_back, next_back
|
|
],
|
|
[
|
|
Rejects(8, 9),
|
|
Rejects(7, 8),
|
|
Rejects(6, 7),
|
|
Matches(5, 6),
|
|
Rejects(4, 5),
|
|
Rejects(3, 4),
|
|
Rejects(2, 3),
|
|
Rejects(1, 2),
|
|
Matches(0, 1),
|
|
Done
|
|
]
|
|
);
|
|
|
|
search_asserts!(
|
|
"我爱我的猫",
|
|
'我',
|
|
"forward iteration for Chinese string",
|
|
// 我 愛 我 的 貓 EOF
|
|
[next, next, next, next, next, next],
|
|
[Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
"我的猫说meow",
|
|
'm',
|
|
"forward iteration for mixed string",
|
|
// 我 的 猫 说 m e o w EOF
|
|
[next, next, next, next, next, next, next, next, next],
|
|
[
|
|
Rejects(0, 3),
|
|
Rejects(3, 6),
|
|
Rejects(6, 9),
|
|
Rejects(9, 12),
|
|
Matches(12, 13),
|
|
Rejects(13, 14),
|
|
Rejects(14, 15),
|
|
Rejects(15, 16),
|
|
Done
|
|
]
|
|
);
|
|
|
|
search_asserts!(
|
|
"我的猫说meow",
|
|
'猫',
|
|
"reverse iteration for mixed string",
|
|
// w o e m 说 猫 的 我 EOF
|
|
[
|
|
next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back,
|
|
next_back
|
|
],
|
|
[
|
|
Rejects(15, 16),
|
|
Rejects(14, 15),
|
|
Rejects(13, 14),
|
|
Rejects(12, 13),
|
|
Rejects(9, 12),
|
|
Matches(6, 9),
|
|
Rejects(3, 6),
|
|
Rejects(0, 3),
|
|
Done
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_simple_search() {
|
|
search_asserts!(
|
|
"abcdeabcdeabcde",
|
|
'a',
|
|
"next_match for ASCII string",
|
|
[next_match, next_match, next_match, next_match],
|
|
[InRange(0, 1), InRange(5, 6), InRange(10, 11), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
"abcdeabcdeabcde",
|
|
'a',
|
|
"next_match_back for ASCII string",
|
|
[next_match_back, next_match_back, next_match_back, next_match_back],
|
|
[InRange(10, 11), InRange(5, 6), InRange(0, 1), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
"abcdeab",
|
|
'a',
|
|
"next_reject for ASCII string",
|
|
[next_reject, next_reject, next_match, next_reject, next_reject],
|
|
[InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
"abcdeabcdeabcde",
|
|
'a',
|
|
"next_reject_back for ASCII string",
|
|
[
|
|
next_reject_back,
|
|
next_reject_back,
|
|
next_match_back,
|
|
next_reject_back,
|
|
next_reject_back,
|
|
next_reject_back
|
|
],
|
|
[
|
|
InRange(14, 15),
|
|
InRange(13, 14),
|
|
InRange(10, 11),
|
|
InRange(9, 10),
|
|
InRange(8, 9),
|
|
InRange(7, 8)
|
|
]
|
|
);
|
|
}
|
|
|
|
// Á, 각, ก, 😁 all end in 0x81
// 🁀, ᘀ do not end in 0x81 but contain the byte
// ꁁ has 0x81 as its second and third bytes.
//
// The memchr-using implementations of next_match
// and next_match_back temporarily violate
// the property that the search is always on a unicode boundary,
// which is fine as long as this never reaches next() or next_back().
// So we test if next() is correct after each next_match() as well.
|
|
// Haystack shared by the multi-byte search tests in this file;
// `test_stress_indices` records the byte range of every character in it.
const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a";
|
|
|
|
#[test]
|
|
fn test_stress_indices() {
|
|
// this isn't really a test, more of documentation on the indices of each character in the stresstest string
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'x',
|
|
"Indices of characters in stress test",
|
|
[
|
|
next, next, next, next, next, next, next, next, next, next, next, next, next, next,
|
|
next, next, next, next, next, next, next
|
|
],
|
|
[
|
|
Rejects(0, 2), // Á
|
|
Rejects(2, 3), // a
|
|
Rejects(3, 7), // 🁀
|
|
Rejects(7, 8), // b
|
|
Rejects(8, 10), // Á
|
|
Rejects(10, 13), // ꁁ
|
|
Rejects(13, 14), // f
|
|
Rejects(14, 15), // g
|
|
Rejects(15, 19), // 😀
|
|
Rejects(19, 22), // 각
|
|
Rejects(22, 25), // ก
|
|
Rejects(25, 28), // ᘀ
|
|
Rejects(28, 31), // 각
|
|
Rejects(31, 32), // a
|
|
Rejects(32, 34), // Á
|
|
Rejects(34, 37), // 각
|
|
Rejects(37, 40), // ꁁ
|
|
Rejects(40, 43), // ก
|
|
Rejects(43, 47), // 😀
|
|
Rejects(47, 48), // a
|
|
Done
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_forward_search_shared_bytes() {
|
|
search_asserts!(
|
|
STRESS,
|
|
'Á',
|
|
"Forward search for two-byte Latin character",
|
|
[next_match, next_match, next_match, next_match],
|
|
[InRange(0, 2), InRange(8, 10), InRange(32, 34), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'Á',
|
|
"Forward search for two-byte Latin character; check if next() still works",
|
|
[next_match, next, next_match, next, next_match, next, next_match],
|
|
[
|
|
InRange(0, 2),
|
|
Rejects(2, 3),
|
|
InRange(8, 10),
|
|
Rejects(10, 13),
|
|
InRange(32, 34),
|
|
Rejects(34, 37),
|
|
Done
|
|
]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'각',
|
|
"Forward search for three-byte Hangul character",
|
|
[next_match, next, next_match, next_match, next_match],
|
|
[InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'각',
|
|
"Forward search for three-byte Hangul character; check if next() still works",
|
|
[next_match, next, next_match, next, next_match, next, next_match],
|
|
[
|
|
InRange(19, 22),
|
|
Rejects(22, 25),
|
|
InRange(28, 31),
|
|
Rejects(31, 32),
|
|
InRange(34, 37),
|
|
Rejects(37, 40),
|
|
Done
|
|
]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ก',
|
|
"Forward search for three-byte Thai character",
|
|
[next_match, next, next_match, next, next_match],
|
|
[InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ก',
|
|
"Forward search for three-byte Thai character; check if next() still works",
|
|
[next_match, next, next_match, next, next_match],
|
|
[InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'😁',
|
|
"Forward search for four-byte emoji",
|
|
[next_match, next, next_match, next, next_match],
|
|
[InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'😁',
|
|
"Forward search for four-byte emoji; check if next() still works",
|
|
[next_match, next, next_match, next, next_match],
|
|
[InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ꁁ',
|
|
"Forward search for three-byte Yi character with repeated bytes",
|
|
[next_match, next, next_match, next, next_match],
|
|
[InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ꁁ',
|
|
"Forward search for three-byte Yi character with repeated bytes; check if next() still works",
|
|
[next_match, next, next_match, next, next_match],
|
|
[InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_reverse_search_shared_bytes() {
|
|
search_asserts!(
|
|
STRESS,
|
|
'Á',
|
|
"Reverse search for two-byte Latin character",
|
|
[next_match_back, next_match_back, next_match_back, next_match_back],
|
|
[InRange(32, 34), InRange(8, 10), InRange(0, 2), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'Á',
|
|
"Reverse search for two-byte Latin character; check if next_back() still works",
|
|
[next_match_back, next_back, next_match_back, next_back, next_match_back, next_back],
|
|
[InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'각',
|
|
"Reverse search for three-byte Hangul character",
|
|
[next_match_back, next_back, next_match_back, next_match_back, next_match_back],
|
|
[InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'각',
|
|
"Reverse search for three-byte Hangul character; check if next_back() still works",
|
|
[
|
|
next_match_back,
|
|
next_back,
|
|
next_match_back,
|
|
next_back,
|
|
next_match_back,
|
|
next_back,
|
|
next_match_back
|
|
],
|
|
[
|
|
InRange(34, 37),
|
|
Rejects(32, 34),
|
|
InRange(28, 31),
|
|
Rejects(25, 28),
|
|
InRange(19, 22),
|
|
Rejects(15, 19),
|
|
Done
|
|
]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ก',
|
|
"Reverse search for three-byte Thai character",
|
|
[next_match_back, next_back, next_match_back, next_back, next_match_back],
|
|
[InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ก',
|
|
"Reverse search for three-byte Thai character; check if next_back() still works",
|
|
[next_match_back, next_back, next_match_back, next_back, next_match_back],
|
|
[InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'😁',
|
|
"Reverse search for four-byte emoji",
|
|
[next_match_back, next_back, next_match_back, next_back, next_match_back],
|
|
[InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'😁',
|
|
"Reverse search for four-byte emoji; check if next_back() still works",
|
|
[next_match_back, next_back, next_match_back, next_back, next_match_back],
|
|
[InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ꁁ',
|
|
"Reverse search for three-byte Yi character with repeated bytes",
|
|
[next_match_back, next_back, next_match_back, next_back, next_match_back],
|
|
[InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
|
|
);
|
|
|
|
search_asserts!(
|
|
STRESS,
|
|
'ꁁ',
|
|
"Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works",
|
|
[next_match_back, next_back, next_match_back, next_back, next_match_back],
|
|
[InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn double_ended_regression_test() {
|
|
// https://github.com/rust-lang/rust/issues/47175
|
|
// Ensures that double ended searching comes to a convergence
|
|
search_asserts!(
|
|
"abcdeabcdeabcde",
|
|
'a',
|
|
"alternating double ended search",
|
|
[next_match, next_match_back, next_match, next_match_back],
|
|
[InRange(0, 1), InRange(10, 11), InRange(5, 6), Done]
|
|
);
|
|
search_asserts!(
|
|
"abcdeabcdeabcde",
|
|
'a',
|
|
"triple double ended search for a",
|
|
[next_match, next_match_back, next_match_back, next_match_back],
|
|
[InRange(0, 1), InRange(10, 11), InRange(5, 6), Done]
|
|
);
|
|
search_asserts!(
|
|
"abcdeabcdeabcde",
|
|
'd',
|
|
"triple double ended search for d",
|
|
[next_match, next_match_back, next_match_back, next_match_back],
|
|
[InRange(3, 4), InRange(13, 14), InRange(8, 9), Done]
|
|
);
|
|
search_asserts!(
|
|
STRESS,
|
|
'Á',
|
|
"Double ended search for two-byte Latin character",
|
|
[next_match, next_match_back, next_match, next_match_back],
|
|
[InRange(0, 2), InRange(32, 34), InRange(8, 10), Done]
|
|
);
|
|
search_asserts!(
|
|
STRESS,
|
|
'각',
|
|
"Reverse double ended search for three-byte Hangul character",
|
|
[next_match_back, next_back, next_match, next, next_match_back, next_match],
|
|
[InRange(34, 37), Rejects(32, 34), InRange(19, 22), Rejects(22, 25), InRange(28, 31), Done]
|
|
);
|
|
search_asserts!(
|
|
STRESS,
|
|
'ก',
|
|
"Double ended search for three-byte Thai character",
|
|
[next_match, next_back, next, next_match_back, next_match],
|
|
[InRange(22, 25), Rejects(47, 48), Rejects(25, 28), InRange(40, 43), Done]
|
|
);
|
|
search_asserts!(
|
|
STRESS,
|
|
'😁',
|
|
"Double ended search for four-byte emoji",
|
|
[next_match_back, next, next_match, next_back, next_match],
|
|
[InRange(43, 47), Rejects(0, 2), InRange(15, 19), Rejects(40, 43), Done]
|
|
);
|
|
search_asserts!(
|
|
STRESS,
|
|
'ꁁ',
|
|
"Double ended search for three-byte Yi character with repeated bytes",
|
|
[next_match, next, next_match_back, next_back, next_match],
|
|
[InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(34, 37), Done]
|
|
);
|
|
}
|