Auto merge of #132883 - LaihoE:vectorized_is_sorted, r=thomcc

vectorize slice::is_sorted

Benchmarks using u32 slices:
| len | New | Old |
|--------|----------------------|----------------------|
| 2      | 1.1997 ns           | 889.23 ps           |
| 4      | 1.6479 ns           | 1.5396 ns           |
| 8      | 2.5764 ns           | 2.5633 ns           |
| 16     | 5.4750 ns           | 4.7421 ns           |
| 32     | 11.344 ns           | 8.4634 ns           |
| 64     | 12.105 ns           | 18.104 ns           |
| 128    | 17.263 ns           | 33.185 ns           |
| 256    | 29.465 ns           | 60.928 ns           |
| 512    | 48.926 ns           | 116.19 ns           |
| 1024   | 85.274 ns           | 237.91 ns           |
| 2048   | 160.94 ns           | 469.53 ns           |
| 4096   | 311.60 ns           | 911.43 ns           |
| 8192   | 615.89 ns           | 2.2316 µs           |
| 16384  | 1.2619 µs           | 3.4871 µs           |
| 32768  | 2.5245 µs           | 6.9947 µs           |
| 65536  | 5.2254 µs           | 15.212 µs           |

Seems to be a bit slower on small N but much faster on large N.

Godbolt: https://rust.godbolt.org/z/Txn5MdfKn
This commit is contained in:
bors 2024-11-13 03:43:59 +00:00
commit 44f233f251

View file

@ -4097,7 +4097,23 @@ impl<T> [T] {
where
T: PartialOrd,
{
self.is_sorted_by(|a, b| a <= b)
// This odd number works the best. 32 + 1 extra due to overlapping chunk boundaries.
const CHUNK_SIZE: usize = 33;
if self.len() < CHUNK_SIZE {
return self.windows(2).all(|w| w[0] <= w[1]);
}
let mut i = 0;
// Check in chunks for autovectorization.
while i < self.len() - CHUNK_SIZE {
let chunk = &self[i..i + CHUNK_SIZE];
if !chunk.windows(2).fold(true, |acc, w| acc & (w[0] <= w[1])) {
return false;
}
// We need to ensure that chunk boundaries are also sorted.
// Overlap the next chunk with the last element of our last chunk.
i += CHUNK_SIZE - 1;
}
self[i..].windows(2).all(|w| w[0] <= w[1])
}
/// Checks if the elements of this slice are sorted using the given comparator function.