Auto merge of #111222 - scottmcm:constify-is_ascii, r=thomcc
Constify `[u8]::is_ascii` (unstably). UTF-8 checking in `const fn` was stabilized back in 1.63 (#97367), but apparently somehow ASCII checking was never const-ified, despite being simpler. New constness-tracking issue for `is_ascii`: #111090. I noticed this while working on `ascii::Char`: #110998.
This commit is contained in:
commit
613a5c95ae
6 changed files with 96 additions and 15 deletions
|
@ -4,10 +4,23 @@ use crate::ascii;
|
||||||
impl<const N: usize> [u8; N] {
|
impl<const N: usize> [u8; N] {
|
||||||
/// Converts this array of bytes into a array of ASCII characters,
|
/// Converts this array of bytes into a array of ASCII characters,
|
||||||
/// or returns `None` if any of the characters is non-ASCII.
|
/// or returns `None` if any of the characters is non-ASCII.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// #![feature(ascii_char)]
|
||||||
|
/// #![feature(const_option)]
|
||||||
|
///
|
||||||
|
/// const HEX_DIGITS: [std::ascii::Char; 16] =
|
||||||
|
/// *b"0123456789abcdef".as_ascii().unwrap();
|
||||||
|
///
|
||||||
|
/// assert_eq!(HEX_DIGITS[1].as_str(), "1");
|
||||||
|
/// assert_eq!(HEX_DIGITS[10].as_str(), "a");
|
||||||
|
/// ```
|
||||||
#[unstable(feature = "ascii_char", issue = "110998")]
|
#[unstable(feature = "ascii_char", issue = "110998")]
|
||||||
#[must_use]
|
#[must_use]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn as_ascii(&self) -> Option<&[ascii::Char; N]> {
|
pub const fn as_ascii(&self) -> Option<&[ascii::Char; N]> {
|
||||||
if self.is_ascii() {
|
if self.is_ascii() {
|
||||||
// SAFETY: Just checked that it's ASCII
|
// SAFETY: Just checked that it's ASCII
|
||||||
Some(unsafe { self.as_ascii_unchecked() })
|
Some(unsafe { self.as_ascii_unchecked() })
|
||||||
|
|
|
@ -150,6 +150,7 @@
|
||||||
#![feature(const_slice_from_raw_parts_mut)]
|
#![feature(const_slice_from_raw_parts_mut)]
|
||||||
#![feature(const_slice_from_ref)]
|
#![feature(const_slice_from_ref)]
|
||||||
#![feature(const_slice_index)]
|
#![feature(const_slice_index)]
|
||||||
|
#![feature(const_slice_is_ascii)]
|
||||||
#![feature(const_slice_ptr_len)]
|
#![feature(const_slice_ptr_len)]
|
||||||
#![feature(const_slice_split_at_mut)]
|
#![feature(const_slice_split_at_mut)]
|
||||||
#![feature(const_str_from_utf8_unchecked_mut)]
|
#![feature(const_str_from_utf8_unchecked_mut)]
|
||||||
|
|
|
@ -10,9 +10,10 @@ use crate::ops;
|
||||||
impl [u8] {
|
impl [u8] {
|
||||||
/// Checks if all bytes in this slice are within the ASCII range.
|
/// Checks if all bytes in this slice are within the ASCII range.
|
||||||
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
|
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
|
||||||
|
#[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")]
|
||||||
#[must_use]
|
#[must_use]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn is_ascii(&self) -> bool {
|
pub const fn is_ascii(&self) -> bool {
|
||||||
is_ascii(self)
|
is_ascii(self)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -21,7 +22,7 @@ impl [u8] {
|
||||||
#[unstable(feature = "ascii_char", issue = "110998")]
|
#[unstable(feature = "ascii_char", issue = "110998")]
|
||||||
#[must_use]
|
#[must_use]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn as_ascii(&self) -> Option<&[ascii::Char]> {
|
pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
|
||||||
if self.is_ascii() {
|
if self.is_ascii() {
|
||||||
// SAFETY: Just checked that it's ASCII
|
// SAFETY: Just checked that it's ASCII
|
||||||
Some(unsafe { self.as_ascii_unchecked() })
|
Some(unsafe { self.as_ascii_unchecked() })
|
||||||
|
@ -262,11 +263,29 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
|
||||||
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
|
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
|
||||||
/// from `../str/mod.rs`, which does something similar for utf8 validation.
|
/// from `../str/mod.rs`, which does something similar for utf8 validation.
|
||||||
#[inline]
|
#[inline]
|
||||||
fn contains_nonascii(v: usize) -> bool {
|
const fn contains_nonascii(v: usize) -> bool {
|
||||||
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
|
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
|
||||||
(NONASCII_MASK & v) != 0
|
(NONASCII_MASK & v) != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// ASCII test *without* the chunk-at-a-time optimizations.
|
||||||
|
///
|
||||||
|
/// This is carefully structured to produce nice small code -- it's smaller in
|
||||||
|
/// `-O` than what the "obvious" ways produce under `-C opt-level=s`. If you
|
||||||
|
/// touch it, be sure to run (and update if needed) the assembly test.
|
||||||
|
#[unstable(feature = "str_internals", issue = "none")]
|
||||||
|
#[doc(hidden)]
|
||||||
|
#[inline]
|
||||||
|
pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
|
||||||
|
while let [rest @ .., last] = bytes {
|
||||||
|
if !last.is_ascii() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bytes = rest;
|
||||||
|
}
|
||||||
|
bytes.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
/// Optimized ASCII test that will use usize-at-a-time operations instead of
|
/// Optimized ASCII test that will use usize-at-a-time operations instead of
|
||||||
/// byte-at-a-time operations (when possible).
|
/// byte-at-a-time operations (when possible).
|
||||||
///
|
///
|
||||||
|
@ -280,7 +299,7 @@ fn contains_nonascii(v: usize) -> bool {
|
||||||
/// If any of these loads produces something for which `contains_nonascii`
|
/// If any of these loads produces something for which `contains_nonascii`
|
||||||
/// (above) returns true, then we know the answer is false.
|
/// (above) returns true, then we know the answer is false.
|
||||||
#[inline]
|
#[inline]
|
||||||
fn is_ascii(s: &[u8]) -> bool {
|
const fn is_ascii(s: &[u8]) -> bool {
|
||||||
const USIZE_SIZE: usize = mem::size_of::<usize>();
|
const USIZE_SIZE: usize = mem::size_of::<usize>();
|
||||||
|
|
||||||
let len = s.len();
|
let len = s.len();
|
||||||
|
@ -292,7 +311,7 @@ fn is_ascii(s: &[u8]) -> bool {
|
||||||
// We also do this for architectures where `size_of::<usize>()` isn't
|
// We also do this for architectures where `size_of::<usize>()` isn't
|
||||||
// sufficient alignment for `usize`, because it's a weird edge case.
|
// sufficient alignment for `usize`, because it's a weird edge case.
|
||||||
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
|
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
|
||||||
return s.iter().all(|b| b.is_ascii());
|
return is_ascii_simple(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We always read the first word unaligned, which means `align_offset` is
|
// We always read the first word unaligned, which means `align_offset` is
|
||||||
|
@ -321,18 +340,26 @@ fn is_ascii(s: &[u8]) -> bool {
|
||||||
// Paranoia check about alignment, since we're about to do a bunch of
|
// Paranoia check about alignment, since we're about to do a bunch of
|
||||||
// unaligned loads. In practice this should be impossible barring a bug in
|
// unaligned loads. In practice this should be impossible barring a bug in
|
||||||
// `align_offset` though.
|
// `align_offset` though.
|
||||||
debug_assert_eq!(word_ptr.addr() % mem::align_of::<usize>(), 0);
|
// While this method is allowed to spuriously fail in CTFE, if it doesn't
|
||||||
|
// have alignment information it should have given a `usize::MAX` for
|
||||||
|
// `align_offset` earlier, sending things through the scalar path instead of
|
||||||
|
// this one, so this check should pass if it's reachable.
|
||||||
|
debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
|
||||||
|
|
||||||
// Read subsequent words until the last aligned word, excluding the last
|
// Read subsequent words until the last aligned word, excluding the last
|
||||||
// aligned word by itself to be done in tail check later, to ensure that
|
// aligned word by itself to be done in tail check later, to ensure that
|
||||||
// tail is always one `usize` at most to extra branch `byte_pos == len`.
|
// tail is always one `usize` at most to extra branch `byte_pos == len`.
|
||||||
while byte_pos < len - USIZE_SIZE {
|
while byte_pos < len - USIZE_SIZE {
|
||||||
debug_assert!(
|
// Sanity check that the read is in bounds
|
||||||
// Sanity check that the read is in bounds
|
debug_assert!(byte_pos + USIZE_SIZE <= len);
|
||||||
(word_ptr.addr() + USIZE_SIZE) <= start.addr().wrapping_add(len) &&
|
// And that our assumptions about `byte_pos` hold.
|
||||||
// And that our assumptions about `byte_pos` hold.
|
debug_assert!(matches!(
|
||||||
(word_ptr.addr() - start.addr()) == byte_pos
|
word_ptr.cast::<u8>().guaranteed_eq(start.wrapping_add(byte_pos)),
|
||||||
);
|
// These are from the same allocation, so will hopefully always be
|
||||||
|
// known to match even in CTFE, but if it refuses to compare them
|
||||||
|
// that's ok since it's just a debug check anyway.
|
||||||
|
None | Some(true),
|
||||||
|
));
|
||||||
|
|
||||||
// SAFETY: We know `word_ptr` is properly aligned (because of
|
// SAFETY: We know `word_ptr` is properly aligned (because of
|
||||||
// `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
|
// `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
|
||||||
|
|
|
@ -44,6 +44,10 @@ mod raw;
|
||||||
mod rotate;
|
mod rotate;
|
||||||
mod specialize;
|
mod specialize;
|
||||||
|
|
||||||
|
#[unstable(feature = "str_internals", issue = "none")]
|
||||||
|
#[doc(hidden)]
|
||||||
|
pub use ascii::is_ascii_simple;
|
||||||
|
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
pub use iter::{Chunks, ChunksMut, Windows};
|
pub use iter::{Chunks, ChunksMut, Windows};
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
|
|
|
@ -2358,9 +2358,10 @@ impl str {
|
||||||
/// assert!(!non_ascii.is_ascii());
|
/// assert!(!non_ascii.is_ascii());
|
||||||
/// ```
|
/// ```
|
||||||
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
|
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
|
||||||
|
#[rustc_const_unstable(feature = "const_slice_is_ascii", issue = "111090")]
|
||||||
#[must_use]
|
#[must_use]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn is_ascii(&self) -> bool {
|
pub const fn is_ascii(&self) -> bool {
|
||||||
// We can treat each byte as character here: all multibyte characters
|
// We can treat each byte as character here: all multibyte characters
|
||||||
// start with a byte that is not in the ASCII range, so we will stop
|
// start with a byte that is not in the ASCII range, so we will stop
|
||||||
// there already.
|
// there already.
|
||||||
|
@ -2372,7 +2373,7 @@ impl str {
|
||||||
#[unstable(feature = "ascii_char", issue = "110998")]
|
#[unstable(feature = "ascii_char", issue = "110998")]
|
||||||
#[must_use]
|
#[must_use]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn as_ascii(&self) -> Option<&[ascii::Char]> {
|
pub const fn as_ascii(&self) -> Option<&[ascii::Char]> {
|
||||||
// Like in `is_ascii`, we can work on the bytes directly.
|
// Like in `is_ascii`, we can work on the bytes directly.
|
||||||
self.as_bytes().as_ascii()
|
self.as_bytes().as_ascii()
|
||||||
}
|
}
|
||||||
|
|
35
tests/assembly/slice-is_ascii.rs
Normal file
35
tests/assembly/slice-is_ascii.rs
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
// revisions: WIN LIN
|
||||||
|
// [WIN] only-windows
|
||||||
|
// [LIN] only-linux
|
||||||
|
// assembly-output: emit-asm
|
||||||
|
// compile-flags: --crate-type=lib -O -C llvm-args=-x86-asm-syntax=intel
|
||||||
|
// min-llvm-version: 14
|
||||||
|
// only-x86_64
|
||||||
|
// ignore-sgx
|
||||||
|
// ignore-debug
|
||||||
|
|
||||||
|
#![feature(str_internals)]
|
||||||
|
|
||||||
|
// CHECK-LABEL: is_ascii_simple_demo:
|
||||||
|
#[no_mangle]
|
||||||
|
pub fn is_ascii_simple_demo(bytes: &[u8]) -> bool {
|
||||||
|
// Linux (System V): pointer is rdi; length is rsi
|
||||||
|
// Windows: pointer is rcx; length is rdx.
|
||||||
|
|
||||||
|
// CHECK-NOT: mov
|
||||||
|
// CHECK-NOT: test
|
||||||
|
// CHECK-NOT: cmp
|
||||||
|
|
||||||
|
// CHECK: .[[LOOPHEAD:.+]]:
|
||||||
|
// CHECK-NEXT: mov [[TEMP:.+]], [[LEN:rsi|rdx]]
|
||||||
|
// CHECK-NEXT: sub [[LEN]], 1
|
||||||
|
// CHECK-NEXT: jb .[[LOOPEXIT:.+]]
|
||||||
|
// CHECK-NEXT: cmp byte ptr [{{rdi|rcx}} + [[TEMP]] - 1], 0
|
||||||
|
// CHECK-NEXT: jns .[[LOOPHEAD]]
|
||||||
|
|
||||||
|
// CHECK-NEXT: .[[LOOPEXIT]]:
|
||||||
|
// CHECK-NEXT: test [[TEMP]], [[TEMP]]
|
||||||
|
// CHECK-NEXT: sete al
|
||||||
|
// CHECK-NEXT: ret
|
||||||
|
core::slice::is_ascii_simple(bytes)
|
||||||
|
}
|
Loading…
Add table
Reference in a new issue