From 370d31b93dba75ceac236d676d6a6df07217ff07 Mon Sep 17 00:00:00 2001 From: Scott McMurray Date: Thu, 4 May 2023 14:26:19 -0700 Subject: [PATCH 1/3] Constify `[u8]::is_ascii` (unstably) UTF-8 checking in `const fn` was stabilized back in 1.63, but apparently somehow ASCII checking was never const-ified, despite being simpler. --- library/core/src/array/ascii.rs | 2 +- library/core/src/lib.rs | 1 + library/core/src/slice/ascii.rs | 42 +++++++++++++++++++++++---------- library/core/src/str/mod.rs | 5 ++-- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/library/core/src/array/ascii.rs b/library/core/src/array/ascii.rs index 6750d7c0711..a942b9e4ae3 100644 --- a/library/core/src/array/ascii.rs +++ b/library/core/src/array/ascii.rs @@ -7,7 +7,7 @@ impl<const N: usize> [u8; N] { #[unstable(feature = "ascii_char", issue = "110998")] #[must_use] #[inline] - pub fn as_ascii(&self) -> Option<&[ascii::Char; N]> { + pub const fn as_ascii(&self) -> Option<&[ascii::Char; N]> { if self.is_ascii() { // SAFETY: Just checked that it's ASCII Some(unsafe { self.as_ascii_unchecked() }) diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index a535a011aaf..01cc137c24e 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -149,6 +149,7 @@ #![feature(const_slice_from_raw_parts_mut)] #![feature(const_slice_from_ref)] #![feature(const_slice_index)] +#![feature(const_slice_is_ascii)] #![feature(const_slice_ptr_len)] #![feature(const_slice_split_at_mut)] #![feature(const_str_from_utf8_unchecked_mut)] diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 7bae6692ad4..6a6c0c9ba8b 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -10,9 +10,10 @@ use crate::ops; impl [u8] { /// Checks if all bytes in this slice are within the ASCII range. 
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")] #[must_use] #[inline] - pub fn is_ascii(&self) -> bool { + pub const fn is_ascii(&self) -> bool { is_ascii(self) } @@ -21,7 +22,7 @@ impl [u8] { #[unstable(feature = "ascii_char", issue = "110998")] #[must_use] #[inline] - pub fn as_ascii(&self) -> Option<&[ascii::Char]> { + pub const fn as_ascii(&self) -> Option<&[ascii::Char]> { if self.is_ascii() { // SAFETY: Just checked that it's ASCII Some(unsafe { self.as_ascii_unchecked() }) @@ -262,7 +263,7 @@ impl<'a> fmt::Debug for EscapeAscii<'a> { /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed /// from `../str/mod.rs`, which does something similar for utf8 validation. #[inline] -fn contains_nonascii(v: usize) -> bool { +const fn contains_nonascii(v: usize) -> bool { const NONASCII_MASK: usize = usize::repeat_u8(0x80); (NONASCII_MASK & v) != 0 } @@ -280,7 +281,7 @@ fn contains_nonascii(v: usize) -> bool { /// If any of these loads produces something for which `contains_nonascii` /// (above) returns true, then we know the answer is false. #[inline] -fn is_ascii(s: &[u8]) -> bool { +const fn is_ascii(s: &[u8]) -> bool { const USIZE_SIZE: usize = mem::size_of::<usize>(); let len = s.len(); @@ -292,7 +293,16 @@ fn is_ascii(s: &[u8]) -> bool { // We also do this for architectures where `size_of::<usize>()` isn't // sufficient alignment for `usize`, because it's a weird edge case. 
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() { - return s.iter().all(|b| b.is_ascii()); + // FIXME: once iterators and closures can be used in `const fn`, + // return s.iter().all(|b| b.is_ascii()); + let mut i = 0; + while i < len { + if !s[i].is_ascii() { + return false; + } + i += 1; + } + return true; } // We always read the first word unaligned, which means `align_offset` is @@ -321,18 +331,26 @@ fn is_ascii(s: &[u8]) -> bool { // Paranoia check about alignment, since we're about to do a bunch of // unaligned loads. In practice this should be impossible barring a bug in // `align_offset` though. - debug_assert_eq!(word_ptr.addr() % mem::align_of::<usize>(), 0); + // While this method is allowed to spuriously fail in CTFE, if it doesn't + // have alignment information it should have given a `usize::MAX` for + // `align_offset` earlier, sending things through the scalar path instead of + // this one, so this check should pass if it's reachable. + debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>())); // Read subsequent words until the last aligned word, excluding the last // aligned word by itself to be done in tail check later, to ensure that // tail is always one `usize` at most to extra branch `byte_pos == len`. while byte_pos < len - USIZE_SIZE { - debug_assert!( - // Sanity check that the read is in bounds - (word_ptr.addr() + USIZE_SIZE) <= start.addr().wrapping_add(len) && - // And that our assumptions about `byte_pos` hold. - (word_ptr.addr() - start.addr()) == byte_pos - ); + // Sanity check that the read is in bounds + debug_assert!(byte_pos + USIZE_SIZE <= len); + // And that our assumptions about `byte_pos` hold. + debug_assert!(matches!( + word_ptr.cast::<u8>().guaranteed_eq(start.wrapping_add(byte_pos)), + // These are from the same allocation, so will hopefully always be + // known to match even in CTFE, but if it refuses to compare them + // that's ok since it's just a debug check anyway. 
+ None | Some(true), + )); // SAFETY: We know `word_ptr` is properly aligned (because of // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 66fa9cf6f64..ef05b25fdd0 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -2358,9 +2358,10 @@ impl str { /// assert!(!non_ascii.is_ascii()); /// ``` #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")] #[must_use] #[inline] - pub fn is_ascii(&self) -> bool { + pub const fn is_ascii(&self) -> bool { // We can treat each byte as character here: all multibyte characters // start with a byte that is not in the ASCII range, so we will stop // there already. @@ -2372,7 +2373,7 @@ impl str { #[unstable(feature = "ascii_char", issue = "110998")] #[must_use] #[inline] - pub fn as_ascii(&self) -> Option<&[ascii::Char]> { + pub const fn as_ascii(&self) -> Option<&[ascii::Char]> { // Like in `is_ascii`, we can work on the bytes directly. self.as_bytes().as_ascii() } From 1cfcf71e0428b5fa314b8e82aae2ef5858e8a79a Mon Sep 17 00:00:00 2001 From: Scott McMurray Date: Thu, 4 May 2023 14:46:17 -0700 Subject: [PATCH 2/3] Add an example that depends on `is_ascii` in a `const` --- library/core/src/array/ascii.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/library/core/src/array/ascii.rs b/library/core/src/array/ascii.rs index a942b9e4ae3..3fea9a44049 100644 --- a/library/core/src/array/ascii.rs +++ b/library/core/src/array/ascii.rs @@ -4,6 +4,19 @@ use crate::ascii; impl<const N: usize> [u8; N] { /// Converts this array of bytes into a array of ASCII characters, /// or returns `None` if any of the characters is non-ASCII. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(ascii_char)] + /// #![feature(const_option)] + /// + /// const HEX_DIGITS: [std::ascii::Char; 16] = + /// *b"0123456789abcdef".as_ascii().unwrap(); + /// + /// assert_eq!(HEX_DIGITS[1].as_str(), "1"); + /// assert_eq!(HEX_DIGITS[10].as_str(), "a"); + /// ``` #[unstable(feature = "ascii_char", issue = "110998")] #[must_use] #[inline] From c8c5a587ac637aa1521c17c631fe0070aa1dc994 Mon Sep 17 00:00:00 2001 From: Scott McMurray Date: Fri, 5 May 2023 02:29:40 -0700 Subject: [PATCH 3/3] Tune the `is_ascii` implementation used for short slices --- library/core/src/slice/ascii.rs | 29 +++++++++++++++++--------- library/core/src/slice/mod.rs | 4 ++++ tests/assembly/slice-is_ascii.rs | 35 ++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 10 deletions(-) create mode 100644 tests/assembly/slice-is_ascii.rs diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 6a6c0c9ba8b..f3311f76a7f 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -268,6 +268,24 @@ const fn contains_nonascii(v: usize) -> bool { (NONASCII_MASK & v) != 0 } +/// ASCII test *without* the chunk-at-a-time optimizations. +/// +/// This is carefully structured to produce nice small code -- it's smaller in +/// `-O` than what the "obvious" ways produce under `-C opt-level=s`. If you +/// touch it, be sure to run (and update if needed) the assembly test. +#[unstable(feature = "str_internals", issue = "none")] +#[doc(hidden)] +#[inline] +pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool { + while let [rest @ .., last] = bytes { + if !last.is_ascii() { + break; + } + bytes = rest; + } + bytes.is_empty() +} + /// Optimized ASCII test that will use usize-at-a-time operations instead of /// byte-at-a-time operations (when possible). 
/// @@ -293,16 +311,7 @@ const fn is_ascii(s: &[u8]) -> bool { // We also do this for architectures where `size_of::<usize>()` isn't // sufficient alignment for `usize`, because it's a weird edge case. if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() { - // FIXME: once iterators and closures can be used in `const fn`, - // return s.iter().all(|b| b.is_ascii()); - let mut i = 0; - while i < len { - if !s[i].is_ascii() { - return false; - } - i += 1; - } - return true; + return is_ascii_simple(s); } // We always read the first word unaligned, which means `align_offset` is diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs index d4981af90d1..4c891ba550f 100644 --- a/library/core/src/slice/mod.rs +++ b/library/core/src/slice/mod.rs @@ -44,6 +44,10 @@ mod raw; mod rotate; mod specialize; +#[unstable(feature = "str_internals", issue = "none")] +#[doc(hidden)] +pub use ascii::is_ascii_simple; + #[stable(feature = "rust1", since = "1.0.0")] pub use iter::{Chunks, ChunksMut, Windows}; #[stable(feature = "rust1", since = "1.0.0")] diff --git a/tests/assembly/slice-is_ascii.rs b/tests/assembly/slice-is_ascii.rs new file mode 100644 index 00000000000..b3e1fee15a7 --- /dev/null +++ b/tests/assembly/slice-is_ascii.rs @@ -0,0 +1,35 @@ +// revisions: WIN LIN +// [WIN] only-windows +// [LIN] only-linux +// assembly-output: emit-asm +// compile-flags: --crate-type=lib -O -C llvm-args=-x86-asm-syntax=intel +// min-llvm-version: 14 +// only-x86_64 +// ignore-sgx +// ignore-debug + +#![feature(str_internals)] + +// CHECK-LABEL: is_ascii_simple_demo: +#[no_mangle] +pub fn is_ascii_simple_demo(bytes: &[u8]) -> bool { + // Linux (System V): pointer is rdi; length is rsi + // Windows: pointer is rcx; length is rdx. 
+ + // CHECK-NOT: mov + // CHECK-NOT: test + // CHECK-NOT: cmp + + // CHECK: .[[LOOPHEAD:.+]]: + // CHECK-NEXT: mov [[TEMP:.+]], [[LEN:rsi|rdx]] + // CHECK-NEXT: sub [[LEN]], 1 + // CHECK-NEXT: jb .[[LOOPEXIT:.+]] + // CHECK-NEXT: cmp byte ptr [{{rdi|rcx}} + [[TEMP]] - 1], 0 + // CHECK-NEXT: jns .[[LOOPHEAD]] + + // CHECK-NEXT: .[[LOOPEXIT]]: + // CHECK-NEXT: test [[TEMP]], [[TEMP]] + // CHECK-NEXT: sete al + // CHECK-NEXT: ret + core::slice::is_ascii_simple(bytes) +}