From 36e198b97a0615df965df5fe88bb052bd1bc92b1 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 13 Sep 2021 00:48:26 +0000
Subject: [PATCH 01/12] Use new bitmask intrinsics with byte arrays

---
 crates/core_simd/src/lane_count.rs       |  9 ---------
 crates/core_simd/src/masks/bitmask.rs    | 20 +++++++-------------
 crates/core_simd/src/masks/full_masks.rs | 19 ++-----------------
 3 files changed, 9 insertions(+), 39 deletions(-)
diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs
index b017e7d137e..4a5dc80049a 100644
--- a/crates/core_simd/src/lane_count.rs
+++ b/crates/core_simd/src/lane_count.rs
@@ -15,34 +15,25 @@ impl<const LANES: usize> LaneCount<LANES> {
 pub trait SupportedLaneCount: Sealed {
     #[doc(hidden)]
     type BitMask: Copy + Default + AsRef<[u8]> + AsMut<[u8]>;
-
-    #[doc(hidden)]
-    type IntBitMask;
 }
 
 impl<const LANES: usize> Sealed for LaneCount<LANES> {}
 
 impl SupportedLaneCount for LaneCount<1> {
     type BitMask = [u8; 1];
-    type IntBitMask = u8;
 }
 impl SupportedLaneCount for LaneCount<2> {
     type BitMask = [u8; 1];
-    type IntBitMask = u8;
 }
 impl SupportedLaneCount for LaneCount<4> {
     type BitMask = [u8; 1];
-    type IntBitMask = u8;
 }
 impl SupportedLaneCount for LaneCount<8> {
     type BitMask = [u8; 1];
-    type IntBitMask = u8;
 }
 impl SupportedLaneCount for LaneCount<16> {
     type BitMask = [u8; 2];
-    type IntBitMask = u16;
 }
 impl SupportedLaneCount for LaneCount<32> {
     type BitMask = [u8; 4];
-    type IntBitMask = u32;
 }
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index 2689e1a88a8..45990e9ce5f 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -1,3 +1,4 @@
+#![allow(unused_imports)]
 use super::MaskElement;
 use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Simd, SupportedLaneCount};
@@ -101,24 +102,17 @@ where
     #[inline]
     pub fn to_int(self) -> Simd<T, LANES> {
         unsafe {
-            let mask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
-                core::mem::transmute_copy(&self);
-            intrinsics::simd_select_bitmask(mask, Simd::splat(T::TRUE), Simd::splat(T::FALSE))
+            crate::intrinsics::simd_select_bitmask(
+                self.0,
+                Simd::splat(T::TRUE),
+                Simd::splat(T::FALSE),
+            )
         }
     }
 
     #[inline]
     pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
-        // TODO remove the transmute when rustc is more flexible
-        assert_eq!(
-            core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::BitMask>(),
-            core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::IntBitMask>(),
-        );
-        unsafe {
-            let mask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
-                intrinsics::simd_bitmask(value);
-            Self(core::mem::transmute_copy(&mask), PhantomData)
-        }
+        unsafe { Self(crate::intrinsics::simd_bitmask(value), PhantomData) }
     }
 
     #[cfg(feature = "generic_const_exprs")]
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index dd981cedb93..0f1edf9d2f5 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -106,15 +106,8 @@ where
     #[inline]
     pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
         unsafe {
-            // TODO remove the transmute when rustc can use arrays of u8 as bitmasks
-            assert_eq!(
-                core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::IntBitMask>(),
-                LaneCount::<LANES>::BITMASK_LEN,
-            );
-            let bitmask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
-                intrinsics::simd_bitmask(self.0);
             let mut bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN] =
-                core::mem::transmute_copy(&bitmask);
+                crate::intrinsics::simd_bitmask(self.0);
 
             // There is a bug where LLVM appears to implement this operation with the wrong
             // bit order.
@@ -142,15 +135,7 @@ where
                 }
             }
 
-            // TODO remove the transmute when rustc can use arrays of u8 as bitmasks
-            assert_eq!(
-                core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::IntBitMask>(),
-                LaneCount::<LANES>::BITMASK_LEN,
-            );
-            let bitmask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
-                core::mem::transmute_copy(&bitmask);
-
-            Self::from_int_unchecked(intrinsics::simd_select_bitmask(
+            Self::from_int_unchecked(crate::intrinsics::simd_select_bitmask(
                 bitmask,
                 Self::splat(true).to_int(),
                 Self::splat(false).to_int(),

From 429e0b66a24399b5e6746cc8bff27921ecde7370 Mon Sep 17 00:00:00 2001
From: Alex Gaynor <alex.gaynor@gmail.com>
Date: Sun, 14 Nov 2021 12:41:16 -0500
Subject: [PATCH 02/12] Update CONTRIBUTING.md for the fact that Travis is no
 longer used

---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9ba12d3a1b..9612fe871c6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,7 +15,7 @@ SIMD can be quite complex, and even a "simple" issue can be huge. If an issue is
 
 ## CI
 
-We currently have 2 CI matrices through Travis CI and GitHub Actions that will automatically build and test your change in order to verify that `std::simd`'s portable API is, in fact, portable. If your change builds locally, but does not build on either, this is likely due to a platform-specific concern that your code has not addressed. Please consult the build logs and address the error, or ask for help if you need it.
+We currently use GitHub Actions which will automatically build and test your change in order to verify that `std::simd`'s portable API is, in fact, portable. If your change builds locally, but does not build in CI, this is likely due to a platform-specific concern that your code has not addressed. Please consult the build logs and address the error, or ask for help if you need it.
 
 ## Beyond stdsimd
 

From f7b03585737b18782c5697881e0a7cb04fe8e462 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Sat, 13 Nov 2021 14:38:26 -0800
Subject: [PATCH 03/12] Sprinkle the crate with #[must_use]

---
 crates/core_simd/src/comparisons.rs      |  6 ++++++
 crates/core_simd/src/masks.rs            | 24 ++++++++++++++++++++++++
 crates/core_simd/src/masks/bitmask.rs    | 13 +++++++++++++
 crates/core_simd/src/masks/full_masks.rs | 15 +++++++++++++++
 crates/core_simd/src/select.rs           |  3 +++
 crates/core_simd/src/swizzle.rs          |  9 +++++++++
 crates/core_simd/src/vector/float.rs     | 20 ++++++++++++++++++++
 7 files changed, 90 insertions(+)

diff --git a/crates/core_simd/src/comparisons.rs b/crates/core_simd/src/comparisons.rs
index 8c51baca8ed..edef5af3687 100644
--- a/crates/core_simd/src/comparisons.rs
+++ b/crates/core_simd/src/comparisons.rs
@@ -8,12 +8,14 @@ where
 {
     /// Test if each lane is equal to the corresponding lane in `other`.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_eq(self, other: Self) -> Mask<T::Mask, LANES> {
         unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
     }
 
     /// Test if each lane is not equal to the corresponding lane in `other`.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_ne(self, other: Self) -> Mask<T::Mask, LANES> {
         unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
     }
@@ -26,24 +28,28 @@ where
 {
     /// Test if each lane is less than the corresponding lane in `other`.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_lt(self, other: Self) -> Mask<T::Mask, LANES> {
         unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
     }
 
     /// Test if each lane is greater than the corresponding lane in `other`.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_gt(self, other: Self) -> Mask<T::Mask, LANES> {
         unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
     }
 
     /// Test if each lane is less than or equal to the corresponding lane in `other`.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_le(self, other: Self) -> Mask<T::Mask, LANES> {
         unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
     }
 
     /// Test if each lane is greater than or equal to the corresponding lane in `other`.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn lanes_ge(self, other: Self) -> Mask<T::Mask, LANES> {
         unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
     }
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index d460da0d04f..191e9690313 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -129,6 +129,7 @@ where
     /// # Safety
     /// All lanes must be either 0 or -1.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
         unsafe { Self(mask_impl::Mask::from_int_unchecked(value)) }
     }
@@ -139,6 +140,7 @@ where
     /// # Panics
     /// Panics if any lane is not 0 or -1.
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn from_int(value: Simd<T, LANES>) -> Self {
         assert!(T::valid(value), "all values must be either 0 or -1",);
         unsafe { Self::from_int_unchecked(value) }
@@ -147,6 +149,7 @@ where
     /// Converts the mask to a vector of integers, where 0 represents `false` and -1
     /// represents `true`.
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
     pub fn to_int(self) -> Simd<T, LANES> {
         self.0.to_int()
     }
@@ -156,6 +159,7 @@ where
     /// # Safety
     /// `lane` must be less than `LANES`.
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
         unsafe { self.0.test_unchecked(lane) }
     }
@@ -165,6 +169,7 @@ where
     /// # Panics
     /// Panics if `lane` is greater than or equal to the number of lanes in the vector.
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn test(&self, lane: usize) -> bool {
         assert!(lane < LANES, "lane index out of range");
         unsafe { self.test_unchecked(lane) }
@@ -195,24 +200,30 @@ where
 
     /// Convert this mask to a bitmask, with one bit set per lane.
     #[cfg(feature = "generic_const_exprs")]
+    #[inline]
+    #[must_use = "method returns a new array and does not mutate the original value"]
     pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
         self.0.to_bitmask()
     }
 
     /// Convert a bitmask to a mask.
     #[cfg(feature = "generic_const_exprs")]
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn from_bitmask(bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
         Self(mask_impl::Mask::from_bitmask(bitmask))
     }
 
     /// Returns true if any lane is set, or false otherwise.
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn any(self) -> bool {
         self.0.any()
     }
 
     /// Returns true if all lanes are set, or false otherwise.
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn all(self) -> bool {
         self.0.all()
     }
@@ -245,6 +256,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     #[inline]
+    #[must_use = "method returns a defaulted mask with all lanes set to false (0)"]
     fn default() -> Self {
         Self::splat(false)
     }
@@ -256,6 +268,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     fn eq(&self, other: &Self) -> bool {
         self.0 == other.0
     }
@@ -267,6 +280,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     #[inline]
+    #[must_use = "method returns a new Ordering and does not mutate the original value"]
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         self.0.partial_cmp(&other.0)
     }
@@ -291,6 +305,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Self) -> Self {
         Self(self.0 & rhs.0)
     }
@@ -303,6 +318,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: bool) -> Self {
         self & Self::splat(rhs)
     }
@@ -315,6 +331,7 @@ where
 {
     type Output = Mask<T, LANES>;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Mask<T, LANES>) -> Mask<T, LANES> {
         Mask::splat(self) & rhs
     }
@@ -327,6 +344,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Self) -> Self {
         Self(self.0 | rhs.0)
     }
@@ -339,6 +357,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: bool) -> Self {
         self | Self::splat(rhs)
     }
@@ -351,6 +370,7 @@ where
 {
     type Output = Mask<T, LANES>;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Mask<T, LANES>) -> Mask<T, LANES> {
         Mask::splat(self) | rhs
     }
@@ -363,6 +383,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Self) -> Self::Output {
         Self(self.0 ^ rhs.0)
     }
@@ -375,6 +396,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: bool) -> Self::Output {
         self ^ Self::splat(rhs)
     }
@@ -387,6 +409,7 @@ where
 {
     type Output = Mask<T, LANES>;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Mask<T, LANES>) -> Self::Output {
         Mask::splat(self) ^ rhs
     }
@@ -399,6 +422,7 @@ where
 {
     type Output = Mask<T, LANES>;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn not(self) -> Self::Output {
         Self(!self.0)
     }
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index 45990e9ce5f..4c964cb52e1 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -74,6 +74,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn splat(value: bool) -> Self {
         let mut mask = <LaneCount<LANES> as SupportedLaneCount>::BitMask::default();
         if value {
@@ -88,6 +89,7 @@ where
     }
 
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
         (self.0.as_ref()[lane / 8] >> (lane % 8)) & 0x1 > 0
     }
@@ -100,6 +102,7 @@ where
     }
 
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
     pub fn to_int(self) -> Simd<T, LANES> {
         unsafe {
             crate::intrinsics::simd_select_bitmask(
@@ -111,12 +114,14 @@ where
     }
 
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
         unsafe { Self(crate::intrinsics::simd_bitmask(value), PhantomData) }
     }
 
     #[cfg(feature = "generic_const_exprs")]
     #[inline]
+    #[must_use = "method returns a new array and does not mutate the original value"]
     pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
         // Safety: these are the same type and we are laundering the generic
         unsafe { core::mem::transmute_copy(&self.0) }
@@ -124,12 +129,14 @@ where
 
     #[cfg(feature = "generic_const_exprs")]
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn from_bitmask(bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
         // Safety: these are the same type and we are laundering the generic
         Self(unsafe { core::mem::transmute_copy(&bitmask) }, PhantomData)
     }
 
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn convert<U>(self) -> Mask<U, LANES>
     where
         U: MaskElement,
@@ -138,11 +145,13 @@ where
     }
 
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn any(self) -> bool {
         self != Self::splat(false)
     }
 
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn all(self) -> bool {
         self == Self::splat(true)
     }
@@ -156,6 +165,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(mut self, rhs: Self) -> Self {
         for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) {
             *l &= r;
@@ -172,6 +182,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(mut self, rhs: Self) -> Self {
         for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) {
             *l |= r;
@@ -187,6 +198,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(mut self, rhs: Self) -> Self::Output {
         for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) {
             *l ^= r;
@@ -202,6 +214,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn not(mut self) -> Self::Output {
         for x in self.0.as_mut() {
             *x = !*x;
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 0f1edf9d2f5..5421ccbe3d8 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -23,6 +23,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn clone(&self) -> Self {
         *self
     }
@@ -70,11 +71,14 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn splat(value: bool) -> Self {
         Self(Simd::splat(if value { T::TRUE } else { T::FALSE }))
     }
 
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
         T::eq(self.0[lane], T::TRUE)
     }
@@ -85,16 +89,19 @@ where
     }
 
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
     pub fn to_int(self) -> Simd<T, LANES> {
         self.0
     }
 
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
         Self(value)
     }
 
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn convert<U>(self) -> Mask<U, LANES>
     where
         U: MaskElement,
@@ -104,6 +111,7 @@ where
 
     #[cfg(feature = "generic_const_exprs")]
     #[inline]
+    #[must_use = "method returns a new array and does not mutate the original value"]
     pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
         unsafe {
             let mut bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN] =
@@ -124,6 +132,7 @@ where
 
     #[cfg(feature = "generic_const_exprs")]
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn from_bitmask(mut bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
         unsafe {
             // There is a bug where LLVM appears to implement this operation with the wrong
@@ -144,11 +153,13 @@ where
     }
 
     #[inline]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn any(self) -> bool {
         unsafe { intrinsics::simd_reduce_any(self.to_int()) }
     }
 
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
     pub fn all(self) -> bool {
         unsafe { intrinsics::simd_reduce_all(self.to_int()) }
     }
@@ -171,6 +182,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Self) -> Self {
         unsafe { Self(intrinsics::simd_and(self.0, rhs.0)) }
     }
@@ -183,6 +195,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Self) -> Self {
         unsafe { Self(intrinsics::simd_or(self.0, rhs.0)) }
     }
@@ -195,6 +208,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Self) -> Self {
         unsafe { Self(intrinsics::simd_xor(self.0, rhs.0)) }
     }
@@ -207,6 +221,7 @@ where
 {
     type Output = Self;
     #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn not(self) -> Self::Output {
         Self::splat(true) ^ self
     }
diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs
index d976231a03a..5d696ebf76e 100644
--- a/crates/core_simd/src/select.rs
+++ b/crates/core_simd/src/select.rs
@@ -17,6 +17,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     fn select(mask: Mask<T::Mask, LANES>, true_values: Self, false_values: Self) -> Self {
         unsafe { intrinsics::simd_select(mask.to_int(), true_values, false_values) }
     }
@@ -35,6 +36,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     fn select(mask: Self, true_values: Self, false_values: Self) -> Self {
         mask & true_values | !mask & false_values
     }
@@ -80,6 +82,7 @@ where
     /// assert_eq!(c.to_array(), [true, false, true, false]);
     /// ```
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn select<S: Select<Self>>(self, true_values: S, false_values: S) -> S {
         S::select(self, true_values, false_values)
     }
diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 62cda68f0a9..bdc489774a5 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -87,6 +87,8 @@ pub trait Swizzle<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
     /// Create a new vector from the lanes of `vector`.
     ///
     /// Lane `i` of the output is `vector[Self::INDEX[i]]`.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     fn swizzle<T>(vector: Simd<T, INPUT_LANES>) -> Simd<T, OUTPUT_LANES>
     where
         T: SimdElement,
@@ -106,6 +108,8 @@ pub trait Swizzle2<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
     ///
     /// Lane `i` is `first[j]` when `Self::INDEX[i]` is `First(j)`, or `second[j]` when it is
     /// `Second(j)`.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     fn swizzle2<T>(
         first: Simd<T, INPUT_LANES>,
         second: Simd<T, INPUT_LANES>,
@@ -182,6 +186,7 @@ where
 {
     /// Reverse the order of the lanes in the vector.
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn reverse(self) -> Self {
         const fn reverse_index<const LANES: usize>() -> [usize; LANES] {
             let mut index = [0; LANES];
@@ -206,6 +211,7 @@ where
     /// while the last `LANES - OFFSET` elements move to the front. After calling `rotate_lanes_left`,
     /// the element previously in lane `OFFSET` will become the first element in the slice.
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn rotate_lanes_left<const OFFSET: usize>(self) -> Self {
         const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
             let offset = OFFSET % LANES;
@@ -231,6 +237,7 @@ where
     /// the end while the last `OFFSET` elements move to the front. After calling `rotate_lanes_right`,
     /// the element previously at index `LANES - OFFSET` will become the first element in the slice.
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn rotate_lanes_right<const OFFSET: usize>(self) -> Self {
         const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
             let offset = LANES - OFFSET % LANES;
@@ -273,6 +280,7 @@ where
     /// assert_eq!(y.to_array(), [2, 6, 3, 7]);
     /// ```
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn interleave(self, other: Self) -> (Self, Self) {
         const fn lo<const LANES: usize>() -> [Which; LANES] {
             let mut idx = [Which::First(0); LANES];
@@ -336,6 +344,7 @@ where
     /// assert_eq!(y.to_array(), [4, 5, 6, 7]);
     /// ```
     #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn deinterleave(self, other: Self) -> (Self, Self) {
         const fn even<const LANES: usize>() -> [Which; LANES] {
             let mut idx = [Which::First(0); LANES];
diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs
index c09d0ac84d2..4a4b23238c4 100644
--- a/crates/core_simd/src/vector/float.rs
+++ b/crates/core_simd/src/vector/float.rs
@@ -15,6 +15,7 @@ macro_rules! impl_float_vector {
             /// Raw transmutation to an unsigned integer vector type with the
             /// same size and number of lanes.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn to_bits(self) -> Simd<$bits_ty, LANES> {
                 assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Simd<$bits_ty, LANES>>());
                 unsafe { core::mem::transmute_copy(&self) }
@@ -23,6 +24,7 @@ macro_rules! impl_float_vector {
             /// Raw transmutation from an unsigned integer vector type with the
             /// same size and number of lanes.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn from_bits(bits: Simd<$bits_ty, LANES>) -> Self {
                 assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Simd<$bits_ty, LANES>>());
                 unsafe { core::mem::transmute_copy(&bits) }
@@ -31,6 +33,7 @@ macro_rules! impl_float_vector {
             /// Produces a vector where every lane has the absolute value of the
             /// equivalently-indexed lane in `self`.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn abs(self) -> Self {
                 unsafe { intrinsics::simd_fabs(self) }
             }
@@ -44,6 +47,7 @@ macro_rules! impl_float_vector {
             /// hardware in mind.
             #[cfg(feature = "std")]
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn mul_add(self, a: Self, b: Self) -> Self {
                 unsafe { intrinsics::simd_fma(self, a, b) }
             }
@@ -51,6 +55,7 @@ macro_rules! impl_float_vector {
             /// Produces a vector where every lane has the square root value
             /// of the equivalently-indexed lane in `self`
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             #[cfg(feature = "std")]
             pub fn sqrt(self) -> Self {
                 unsafe { intrinsics::simd_fsqrt(self) }
@@ -58,12 +63,14 @@ macro_rules! impl_float_vector {
 
             /// Takes the reciprocal (inverse) of each lane, `1/x`.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn recip(self) -> Self {
                 Self::splat(1.0) / self
             }
 
             /// Converts each lane from radians to degrees.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn to_degrees(self) -> Self {
                 // to_degrees uses a special constant for better precision, so extract that constant
                 self * Self::splat(<$type>::to_degrees(1.))
@@ -71,6 +78,7 @@ macro_rules! impl_float_vector {
 
             /// Converts each lane from degrees to radians.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn to_radians(self) -> Self {
                 self * Self::splat(<$type>::to_radians(1.))
             }
@@ -78,6 +86,7 @@ macro_rules! impl_float_vector {
             /// Returns true for each lane if it has a positive sign, including
             /// `+0.0`, `NaN`s with positive sign bit and positive infinity.
             #[inline]
+            #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_sign_positive(self) -> Mask<$mask_ty, LANES> {
                 !self.is_sign_negative()
             }
@@ -85,6 +94,7 @@ macro_rules! impl_float_vector {
             /// Returns true for each lane if it has a negative sign, including
             /// `-0.0`, `NaN`s with negative sign bit and negative infinity.
             #[inline]
+            #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_sign_negative(self) -> Mask<$mask_ty, LANES> {
                 let sign_bits = self.to_bits() & Simd::splat((!0 >> 1) + 1);
                 sign_bits.lanes_gt(Simd::splat(0))
@@ -92,24 +102,28 @@ macro_rules! impl_float_vector {
 
             /// Returns true for each lane if its value is `NaN`.
             #[inline]
+            #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_nan(self) -> Mask<$mask_ty, LANES> {
                 self.lanes_ne(self)
             }
 
             /// Returns true for each lane if its value is positive infinity or negative infinity.
             #[inline]
+            #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_infinite(self) -> Mask<$mask_ty, LANES> {
                 self.abs().lanes_eq(Self::splat(<$type>::INFINITY))
             }
 
             /// Returns true for each lane if its value is neither infinite nor `NaN`.
             #[inline]
+            #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_finite(self) -> Mask<$mask_ty, LANES> {
                 self.abs().lanes_lt(Self::splat(<$type>::INFINITY))
             }
 
             /// Returns true for each lane if its value is subnormal.
             #[inline]
+            #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_subnormal(self) -> Mask<$mask_ty, LANES> {
                 self.abs().lanes_ne(Self::splat(0.0)) & (self.to_bits() & Self::splat(<$type>::INFINITY).to_bits()).lanes_eq(Simd::splat(0))
             }
@@ -117,6 +131,7 @@ macro_rules! impl_float_vector {
             /// Returns true for each lane if its value is neither neither zero, infinite,
             /// subnormal, or `NaN`.
             #[inline]
+            #[must_use = "method returns a new mask and does not mutate the original value"]
             pub fn is_normal(self) -> Mask<$mask_ty, LANES> {
                 !(self.abs().lanes_eq(Self::splat(0.0)) | self.is_nan() | self.is_subnormal() | self.is_infinite())
             }
@@ -127,6 +142,7 @@ macro_rules! impl_float_vector {
             /// * `-1.0` if the number is negative, `-0.0`, or `NEG_INFINITY`
             /// * `NAN` if the number is `NAN`
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn signum(self) -> Self {
                 self.is_nan().select(Self::splat(<$type>::NAN), Self::splat(1.0).copysign(self))
             }
@@ -135,6 +151,7 @@ macro_rules! impl_float_vector {
             ///
             /// If any lane is a `NAN`, then a `NAN` with the sign of `sign` is returned.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn copysign(self, sign: Self) -> Self {
                 let sign_bit = sign.to_bits() & Self::splat(-0.).to_bits();
                 let magnitude = self.to_bits() & !Self::splat(-0.).to_bits();
@@ -145,6 +162,7 @@ macro_rules! impl_float_vector {
             ///
             /// If one of the values is `NAN`, then the other value is returned.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn min(self, other: Self) -> Self {
                 // TODO consider using an intrinsic
                 self.is_nan().select(
@@ -157,6 +175,7 @@ macro_rules! impl_float_vector {
             ///
             /// If one of the values is `NAN`, then the other value is returned.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn max(self, other: Self) -> Self {
                 // TODO consider using an intrinsic
                 self.is_nan().select(
@@ -171,6 +190,7 @@ macro_rules! impl_float_vector {
             /// greater than `max`, and the corresponding lane in `min` if the lane is less
             /// than `min`.  Otherwise returns the lane in `self`.
             #[inline]
+            #[must_use = "method returns a new vector and does not mutate the original value"]
             pub fn clamp(self, min: Self, max: Self) -> Self {
                 assert!(
                     min.lanes_le(max).all(),

From 9129ae651f744328eb691016d59163832ef0c7e9 Mon Sep 17 00:00:00 2001
From: Proloy Mishra <67726964+pro465@users.noreply.github.com>
Date: Mon, 15 Nov 2021 18:36:21 +0530
Subject: [PATCH 04/12] Fix outdated workflow badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index da536a4d6f2..db0af2da606 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # The Rust standard library's portable SIMD API
-[![Build Status](https://travis-ci.com/rust-lang/portable-simd.svg?branch=master)](https://travis-ci.com/rust-lang/portable-simd)
+![Build Status](https://github.com/rust-lang/portable-simd/actions/workflows/ci.yml/badge.svg?branch=master)
 
 Code repository for the [Portable SIMD Project Group](https://github.com/rust-lang/project-portable-simd).
 Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) for our contributing guidelines.

From ced3a05526ce70583f827b5d99a69f436126af20 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Sun, 21 Nov 2021 14:35:25 -0800
Subject: [PATCH 05/12] Attempt to support to 64 lanes

---
 crates/core_simd/src/lane_count.rs |  3 +++
 crates/test_helpers/src/lib.rs     | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs
index 4a5dc80049a..3b316f12b3e 100644
--- a/crates/core_simd/src/lane_count.rs
+++ b/crates/core_simd/src/lane_count.rs
@@ -37,3 +37,6 @@ impl SupportedLaneCount for LaneCount<16> {
 impl SupportedLaneCount for LaneCount<32> {
     type BitMask = [u8; 4];
 }
+impl SupportedLaneCount for LaneCount<64> {
+    type BitMask = [u8; 8];
+}
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 5c6478876f3..7edd6096381 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -376,6 +376,12 @@ macro_rules! test_lanes {
                 fn lanes_32() {
                     implementation::<32>();
                 }
+
+                #[test]
+                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                fn lanes_64() {
+                    implementation::<64>();
+                }
             }
         )*
     }
@@ -431,6 +437,12 @@ macro_rules! test_lanes_panic {
                 fn lanes_32() {
                     implementation::<32>();
                 }
+
+                #[test]
+                #[should_panic]
+                fn lanes_64() {
+                    implementation::<64>();
+                }
             }
         )*
     }

From 0a6992f5bfb6a2e879d23ff015ae27e2534095aa Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 23 Nov 2021 16:15:19 -0800
Subject: [PATCH 06/12] impl deref.rs<&Self> for Simd<T, _>

Instead of implementing each "deref" pattern for every single scalar,
we can use type parameters for Simd operating on &Self.
We can use a macro, but keep it cleaner and more explicit.
---
 crates/core_simd/src/ops.rs       | 62 +++------------------------
 crates/core_simd/src/ops/deref.rs | 70 +++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 56 deletions(-)
 create mode 100644 crates/core_simd/src/ops/deref.rs

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 5d7af474caf..f5683ebb2c0 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -1,5 +1,11 @@
 use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use core::ops::{Add, Mul};
+use core::ops::{BitAnd, BitOr, BitXor};
+use core::ops::{Div, Rem, Sub};
+use core::ops::{Shl, Shr};
+
+mod deref;
 
 impl<I, T, const LANES: usize> core::ops::Index<I> for Simd<T, LANES>
 where
@@ -57,42 +63,6 @@ macro_rules! impl_ref_ops {
             $(#[$attrs])*
             fn $fn($self_tok, $rhs_arg: $rhs_arg_ty) -> Self::Output $body
         }
-
-        impl<const $lanes: usize> core::ops::$trait<&'_ $rhs> for $type
-        where
-            LaneCount<$lanes2>: SupportedLaneCount,
-        {
-            type Output = <$type as core::ops::$trait<$rhs>>::Output;
-
-            $(#[$attrs])*
-            fn $fn($self_tok, $rhs_arg: &$rhs) -> Self::Output {
-                core::ops::$trait::$fn($self_tok, *$rhs_arg)
-            }
-        }
-
-        impl<const $lanes: usize> core::ops::$trait<$rhs> for &'_ $type
-        where
-            LaneCount<$lanes2>: SupportedLaneCount,
-        {
-            type Output = <$type as core::ops::$trait<$rhs>>::Output;
-
-            $(#[$attrs])*
-            fn $fn($self_tok, $rhs_arg: $rhs) -> Self::Output {
-                core::ops::$trait::$fn(*$self_tok, $rhs_arg)
-            }
-        }
-
-        impl<const $lanes: usize> core::ops::$trait<&'_ $rhs> for &'_ $type
-        where
-            LaneCount<$lanes2>: SupportedLaneCount,
-        {
-            type Output = <$type as core::ops::$trait<$rhs>>::Output;
-
-            $(#[$attrs])*
-            fn $fn($self_tok, $rhs_arg: &$rhs) -> Self::Output {
-                core::ops::$trait::$fn(*$self_tok, *$rhs_arg)
-            }
-        }
     };
 
     // binary assignment op
@@ -112,16 +82,6 @@ macro_rules! impl_ref_ops {
             $(#[$attrs])*
             fn $fn(&mut $self_tok, $rhs_arg: $rhs_arg_ty) $body
         }
-
-        impl<const $lanes: usize> core::ops::$trait<&'_ $rhs> for $type
-        where
-            LaneCount<$lanes2>: SupportedLaneCount,
-        {
-            $(#[$attrs])*
-            fn $fn(&mut $self_tok, $rhs_arg: &$rhs_arg_ty) {
-                core::ops::$trait::$fn($self_tok, *$rhs_arg)
-            }
-        }
     };
 
     // unary op
@@ -141,16 +101,6 @@ macro_rules! impl_ref_ops {
             type Output = $output;
             fn $fn($self_tok) -> Self::Output $body
         }
-
-        impl<const $lanes: usize> core::ops::$trait for &'_ $type
-        where
-            LaneCount<$lanes2>: SupportedLaneCount,
-        {
-            type Output = <$type as core::ops::$trait>::Output;
-            fn $fn($self_tok) -> Self::Output {
-                core::ops::$trait::$fn(*$self_tok)
-            }
-        }
     }
 }
 
diff --git a/crates/core_simd/src/ops/deref.rs b/crates/core_simd/src/ops/deref.rs
new file mode 100644
index 00000000000..1138b9494f6
--- /dev/null
+++ b/crates/core_simd/src/ops/deref.rs
@@ -0,0 +1,70 @@
+//! This module hacks in "implicit deref" for Simd's operators.
+//! Ideally, Rust would take care of this itself,
+//! and method calls usually handle the LHS implicitly.
+//! So, we'll manually deref the RHS.
+use super::*;
+
+macro_rules! deref_ops {
+    ($(impl<T, const LANES: usize> $trait:ident<&Self> for Simd<T, LANES> {
+            fn $call:ident(rhs: &Self)
+        })*) => {
+        $(impl<T, const LANES: usize> $trait<&Self> for Simd<T, LANES>
+        where
+            Self: $trait<Self, Output = Self>,
+            T: SimdElement,
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            type Output = Self;
+
+            #[inline]
+            #[must_use = "operator returns a new vector without mutating the inputs"]
+            fn $call(self, rhs: &Self) -> Self::Output {
+                self.$call(*rhs)
+            }
+        })*
+    }
+}
+
+deref_ops! {
+    // Arithmetic
+    impl<T, const LANES: usize> Add<&Self> for Simd<T, LANES> {
+        fn add(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> Mul<&Self> for Simd<T, LANES> {
+        fn mul(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> Sub<&Self> for Simd<T, LANES> {
+        fn sub(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> Div<&Self> for Simd<T, LANES> {
+        fn div(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> Rem<&Self> for Simd<T, LANES> {
+        fn rem(rhs: &Self)
+    }
+
+    // Bitops
+    impl<T, const LANES: usize> BitAnd<&Self> for Simd<T, LANES> {
+        fn bitand(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> BitOr<&Self> for Simd<T, LANES> {
+        fn bitor(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> BitXor<&Self> for Simd<T, LANES> {
+        fn bitxor(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> Shl<&Self> for Simd<T, LANES> {
+        fn shl(rhs: &Self)
+    }
+
+    impl<T, const LANES: usize> Shr<&Self> for Simd<T, LANES> {
+        fn shr(rhs: &Self)
+    }
+}

From 51ff9259256f8db9b5491777f0f6cce92b11bde9 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 23 Nov 2021 16:43:02 -0800
Subject: [PATCH 07/12] impl assign.rs<U> for Simd<T, _>

Instead of implementing {Op}Assign traits for individual scalar type args
to Simd<_, _>, use parametric impls that reassert the bounds of the binary op.
---
 crates/core_simd/src/ops.rs        | 166 +++--------------------------
 crates/core_simd/src/ops/assign.rs | 124 +++++++++++++++++++++
 2 files changed, 136 insertions(+), 154 deletions(-)
 create mode 100644 crates/core_simd/src/ops/assign.rs

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index f5683ebb2c0..aee5a111a82 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -5,6 +5,7 @@ use core::ops::{BitAnd, BitOr, BitXor};
 use core::ops::{Div, Rem, Sub};
 use core::ops::{Shl, Shr};
 
+mod assign;
 mod deref;
 
 impl<I, T, const LANES: usize> core::ops::Index<I> for Simd<T, LANES>
@@ -65,25 +66,6 @@ macro_rules! impl_ref_ops {
         }
     };
 
-    // binary assignment op
-    {
-        impl<const $lanes:ident: usize> core::ops::$trait:ident<$rhs:ty> for $type:ty
-        where
-            LaneCount<$lanes2:ident>: SupportedLaneCount,
-        {
-            $(#[$attrs:meta])*
-            fn $fn:ident(&mut $self_tok:ident, $rhs_arg:ident: $rhs_arg_ty:ty) $body:tt
-        }
-    } => {
-        impl<const $lanes: usize> core::ops::$trait<$rhs> for $type
-        where
-            LaneCount<$lanes2>: SupportedLaneCount,
-        {
-            $(#[$attrs])*
-            fn $fn(&mut $self_tok, $rhs_arg: $rhs_arg_ty) $body
-        }
-    };
-
     // unary op
     {
         impl<const $lanes:ident: usize> core::ops::$trait:ident for $type:ty
@@ -107,34 +89,34 @@ macro_rules! impl_ref_ops {
 /// Automatically implements operators over vectors and scalars for a particular vector.
 macro_rules! impl_op {
     { impl Add for $scalar:ty } => {
-        impl_op! { @binary $scalar, Add::add, AddAssign::add_assign, simd_add }
+        impl_op! { @binary $scalar, Add::add, simd_add }
     };
     { impl Sub for $scalar:ty } => {
-        impl_op! { @binary $scalar, Sub::sub, SubAssign::sub_assign, simd_sub }
+        impl_op! { @binary $scalar, Sub::sub, simd_sub }
     };
     { impl Mul for $scalar:ty } => {
-        impl_op! { @binary $scalar, Mul::mul, MulAssign::mul_assign, simd_mul }
+        impl_op! { @binary $scalar, Mul::mul, simd_mul }
     };
     { impl Div for $scalar:ty } => {
-        impl_op! { @binary $scalar, Div::div, DivAssign::div_assign, simd_div }
+        impl_op! { @binary $scalar, Div::div, simd_div }
     };
     { impl Rem for $scalar:ty } => {
-        impl_op! { @binary $scalar, Rem::rem, RemAssign::rem_assign, simd_rem }
+        impl_op! { @binary $scalar, Rem::rem, simd_rem }
     };
     { impl Shl for $scalar:ty } => {
-        impl_op! { @binary $scalar, Shl::shl, ShlAssign::shl_assign, simd_shl }
+        impl_op! { @binary $scalar, Shl::shl, simd_shl }
     };
     { impl Shr for $scalar:ty } => {
-        impl_op! { @binary $scalar, Shr::shr, ShrAssign::shr_assign, simd_shr }
+        impl_op! { @binary $scalar, Shr::shr, simd_shr }
     };
     { impl BitAnd for $scalar:ty } => {
-        impl_op! { @binary $scalar, BitAnd::bitand, BitAndAssign::bitand_assign, simd_and }
+        impl_op! { @binary $scalar, BitAnd::bitand, simd_and }
     };
     { impl BitOr for $scalar:ty } => {
-        impl_op! { @binary $scalar, BitOr::bitor, BitOrAssign::bitor_assign, simd_or }
+        impl_op! { @binary $scalar, BitOr::bitor, simd_or }
     };
     { impl BitXor for $scalar:ty } => {
-        impl_op! { @binary $scalar, BitXor::bitxor, BitXorAssign::bitxor_assign, simd_xor }
+        impl_op! { @binary $scalar, BitXor::bitxor, simd_xor }
     };
 
     { impl Not for $scalar:ty } => {
@@ -166,7 +148,7 @@ macro_rules! impl_op {
     };
 
     // generic binary op with assignment when output is `Self`
-    { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $assign_trait:ident :: $assign_trait_fn:ident, $intrinsic:ident } => {
+    { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $intrinsic:ident } => {
         impl_ref_ops! {
             impl<const LANES: usize> core::ops::$trait<Self> for Simd<$scalar, LANES>
             where
@@ -210,32 +192,6 @@ macro_rules! impl_op {
                 }
             }
         }
-
-        impl_ref_ops! {
-            impl<const LANES: usize> core::ops::$assign_trait<Self> for Simd<$scalar, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                #[inline]
-                fn $assign_trait_fn(&mut self, rhs: Self) {
-                    unsafe {
-                        *self = intrinsics::$intrinsic(*self, rhs);
-                    }
-                }
-            }
-        }
-
-        impl_ref_ops! {
-            impl<const LANES: usize> core::ops::$assign_trait<$scalar> for Simd<$scalar, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                #[inline]
-                fn $assign_trait_fn(&mut self, rhs: $scalar) {
-                    core::ops::$assign_trait::$assign_trait_fn(self, Self::splat(rhs));
-                }
-            }
-        }
     };
 }
 
@@ -331,30 +287,6 @@ macro_rules! impl_unsigned_int_ops {
                 }
             }
 
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::DivAssign<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn div_assign(&mut self, rhs: Self) {
-                        *self = *self / rhs;
-                    }
-                }
-            }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::DivAssign<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn div_assign(&mut self, rhs: $scalar) {
-                        *self = *self / rhs;
-                    }
-                }
-            }
-
             // remainder panics on zero divisor
             impl_ref_ops! {
                 impl<const LANES: usize> core::ops::Rem<Self> for Simd<$scalar, LANES>
@@ -421,30 +353,6 @@ macro_rules! impl_unsigned_int_ops {
                 }
             }
 
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::RemAssign<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn rem_assign(&mut self, rhs: Self) {
-                        *self = *self % rhs;
-                    }
-                }
-            }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::RemAssign<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn rem_assign(&mut self, rhs: $scalar) {
-                        *self = *self % rhs;
-                    }
-                }
-            }
-
             // shifts panic on overflow
             impl_ref_ops! {
                 impl<const LANES: usize> core::ops::Shl<Self> for Simd<$scalar, LANES>
@@ -486,31 +394,6 @@ macro_rules! impl_unsigned_int_ops {
                 }
             }
 
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::ShlAssign<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn shl_assign(&mut self, rhs: Self) {
-                        *self = *self << rhs;
-                    }
-                }
-            }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::ShlAssign<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn shl_assign(&mut self, rhs: $scalar) {
-                        *self = *self << rhs;
-                    }
-                }
-            }
-
             impl_ref_ops! {
                 impl<const LANES: usize> core::ops::Shr<Self> for Simd<$scalar, LANES>
                 where
@@ -550,31 +433,6 @@ macro_rules! impl_unsigned_int_ops {
                     }
                 }
             }
-
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::ShrAssign<Self> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn shr_assign(&mut self, rhs: Self) {
-                        *self = *self >> rhs;
-                    }
-                }
-            }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::ShrAssign<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    #[inline]
-                    fn shr_assign(&mut self, rhs: $scalar) {
-                        *self = *self >> rhs;
-                    }
-                }
-            }
         )*
     };
 }
diff --git a/crates/core_simd/src/ops/assign.rs b/crates/core_simd/src/ops/assign.rs
new file mode 100644
index 00000000000..d2b48614fc9
--- /dev/null
+++ b/crates/core_simd/src/ops/assign.rs
@@ -0,0 +1,124 @@
+//! Assignment operators
+use super::*;
+use core::ops::{AddAssign, MulAssign}; // commutative binary op-assignment
+use core::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; // commutative bit binary op-assignment
+use core::ops::{DivAssign, RemAssign, SubAssign}; // non-commutative binary op-assignment
+use core::ops::{ShlAssign, ShrAssign}; // non-commutative bit binary op-assignment
+
+// Arithmetic
+
+macro_rules! assign_ops {
+    ($(impl<T, U, const LANES: usize> $assignTrait:ident<U> for Simd<T, LANES>
+        where
+            Self: $trait:ident,
+        {
+            fn $assign_call:ident(rhs: U) {
+                $call:ident
+            }
+        })*) => {
+        $(impl<T, U, const LANES: usize> $assignTrait<U> for Simd<T, LANES>
+        where
+            Self: $trait<U, Output = Self>,
+            T: SimdElement,
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            #[inline]
+            fn $assign_call(&mut self, rhs: U) {
+                *self = self.$call(rhs);
+            }
+        })*
+    }
+}
+
+assign_ops! {
+    // Arithmetic
+    impl<T, U, const LANES: usize> AddAssign<U> for Simd<T, LANES>
+    where
+        Self: Add,
+    {
+        fn add_assign(rhs: U) {
+            add
+        }
+    }
+
+    impl<T, U, const LANES: usize> MulAssign<U> for Simd<T, LANES>
+    where
+        Self: Mul,
+    {
+        fn mul_assign(rhs: U) {
+            mul
+        }
+    }
+
+    impl<T, U, const LANES: usize> SubAssign<U> for Simd<T, LANES>
+    where
+        Self: Sub,
+    {
+        fn sub_assign(rhs: U) {
+            sub
+        }
+    }
+
+    impl<T, U, const LANES: usize> DivAssign<U> for Simd<T, LANES>
+    where
+        Self: Div,
+    {
+        fn div_assign(rhs: U) {
+            div
+        }
+    }
+    impl<T, U, const LANES: usize> RemAssign<U> for Simd<T, LANES>
+    where
+        Self: Rem,
+    {
+        fn rem_assign(rhs: U) {
+            rem
+        }
+    }
+
+    // Bitops
+    impl<T, U, const LANES: usize> BitAndAssign<U> for Simd<T, LANES>
+    where
+        Self: BitAnd,
+    {
+        fn bitand_assign(rhs: U) {
+            bitand
+        }
+    }
+
+    impl<T, U, const LANES: usize> BitOrAssign<U> for Simd<T, LANES>
+    where
+        Self: BitOr,
+    {
+        fn bitor_assign(rhs: U) {
+            bitor
+        }
+    }
+
+    impl<T, U, const LANES: usize> BitXorAssign<U> for Simd<T, LANES>
+    where
+        Self: BitXor,
+    {
+        fn bitxor_assign(rhs: U) {
+            bitxor
+        }
+    }
+
+    impl<T, U, const LANES: usize> ShlAssign<U> for Simd<T, LANES>
+    where
+        Self: Shl,
+    {
+        fn shl_assign(rhs: U) {
+            shl
+        }
+    }
+
+    impl<T, U, const LANES: usize> ShrAssign<U> for Simd<T, LANES>
+    where
+        Self: Shr,
+    {
+        fn shr_assign(rhs: U) {
+            shr
+        }
+    }
+}

From ae612100d28e3e806c6aa39e52792b3ae98907e7 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Sun, 21 Nov 2021 19:08:51 -0800
Subject: [PATCH 08/12] Generically implement horizontal_{and,or,xor}

---
 crates/core_simd/src/reduction.rs | 66 ++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/crates/core_simd/src/reduction.rs b/crates/core_simd/src/reduction.rs
index db0640aae79..e79a185816b 100644
--- a/crates/core_simd/src/reduction.rs
+++ b/crates/core_simd/src/reduction.rs
@@ -2,7 +2,8 @@ use crate::simd::intrinsics::{
     simd_reduce_add_ordered, simd_reduce_and, simd_reduce_max, simd_reduce_min,
     simd_reduce_mul_ordered, simd_reduce_or, simd_reduce_xor,
 };
-use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use core::ops::{BitAnd, BitOr, BitXor};
 
 macro_rules! impl_integer_reductions {
     { $scalar:ty } => {
@@ -22,27 +23,6 @@ macro_rules! impl_integer_reductions {
                 unsafe { simd_reduce_mul_ordered(self, 1) }
             }
 
-            /// Horizontal bitwise "and".  Returns the cumulative bitwise "and" across the lanes of
-            /// the vector.
-            #[inline]
-            pub fn horizontal_and(self) -> $scalar {
-                unsafe { simd_reduce_and(self) }
-            }
-
-            /// Horizontal bitwise "or".  Returns the cumulative bitwise "or" across the lanes of
-            /// the vector.
-            #[inline]
-            pub fn horizontal_or(self) -> $scalar {
-                unsafe { simd_reduce_or(self) }
-            }
-
-            /// Horizontal bitwise "xor".  Returns the cumulative bitwise "xor" across the lanes of
-            /// the vector.
-            #[inline]
-            pub fn horizontal_xor(self) -> $scalar {
-                unsafe { simd_reduce_xor(self) }
-            }
-
             /// Horizontal maximum.  Returns the maximum lane in the vector.
             #[inline]
             pub fn horizontal_max(self) -> $scalar {
@@ -121,3 +101,45 @@ macro_rules! impl_float_reductions {
 
 impl_float_reductions! { f32 }
 impl_float_reductions! { f64 }
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    Self: BitAnd<Self, Output = Self>,
+    T: SimdElement + BitAnd<T, Output = T>,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Horizontal bitwise "and".  Returns the cumulative bitwise "and" across the lanes of
+    /// the vector.
+    #[inline]
+    pub fn horizontal_and(self) -> T {
+        unsafe { simd_reduce_and(self) }
+    }
+}
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    Self: BitOr<Self, Output = Self>,
+    T: SimdElement + BitOr<T, Output = T>,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Horizontal bitwise "or".  Returns the cumulative bitwise "or" across the lanes of
+    /// the vector.
+    #[inline]
+    pub fn horizontal_or(self) -> T {
+        unsafe { simd_reduce_or(self) }
+    }
+}
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    Self: BitXor<Self, Output = Self>,
+    T: SimdElement + BitXor<T, Output = T>,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Horizontal bitwise "xor".  Returns the cumulative bitwise "xor" across the lanes of
+    /// the vector.
+    #[inline]
+    pub fn horizontal_xor(self) -> T {
+        unsafe { simd_reduce_xor(self) }
+    }
+}

From b2dac7124b2aa3951c7f564015d66f0fff6488aa Mon Sep 17 00:00:00 2001
From: Alexander Ronald Altman <alexanderaltman@me.com>
Date: Thu, 25 Nov 2021 00:45:28 -0800
Subject: [PATCH 09/12] Uncomment AVX512 byte vector conversions

Resolves my comment in #197, at least for now; #187 is pending but since these are already here, just commented, it seemed to make sense to me to re-enable them anyway.
---
 crates/core_simd/src/vendor/x86.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/vendor/x86.rs b/crates/core_simd/src/vendor/x86.rs
index d3c19ccc539..0dd47015ed2 100644
--- a/crates/core_simd/src/vendor/x86.rs
+++ b/crates/core_simd/src/vendor/x86.rs
@@ -8,10 +8,10 @@ use core::arch::x86_64::*;
 
 from_transmute! { unsafe u8x16 => __m128i }
 from_transmute! { unsafe u8x32 => __m256i }
-//from_transmute! { unsafe u8x64 => __m512i }
+from_transmute! { unsafe u8x64 => __m512i }
 from_transmute! { unsafe i8x16 => __m128i }
 from_transmute! { unsafe i8x32 => __m256i }
-//from_transmute! { unsafe i8x64 => __m512i }
+from_transmute! { unsafe i8x64 => __m512i }
 
 from_transmute! { unsafe u16x8 => __m128i }
 from_transmute! { unsafe u16x16 => __m256i }

From 6094f22ceb6a697bfcfc3e972170f33badc8f6ee Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 23 Nov 2021 17:36:54 -0800
Subject: [PATCH 10/12] impl unary.rs for Simd<{i,u}{8,16,32,64,size}, _>

In order to assure type soundness, these "base" impls
need to go directly on Simd<T, _> for every scalar type argument.
A bit of cleanup of ops.rs is still warranted.
---
 crates/core_simd/src/ops.rs       | 53 +--------------------
 crates/core_simd/src/ops/unary.rs | 77 +++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 52 deletions(-)
 create mode 100644 crates/core_simd/src/ops/unary.rs

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index aee5a111a82..b7da4f341d1 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -7,6 +7,7 @@ use core::ops::{Shl, Shr};
 
 mod assign;
 mod deref;
+mod unary;
 
 impl<I, T, const LANES: usize> core::ops::Index<I> for Simd<T, LANES>
 where
@@ -65,25 +66,6 @@ macro_rules! impl_ref_ops {
             fn $fn($self_tok, $rhs_arg: $rhs_arg_ty) -> Self::Output $body
         }
     };
-
-    // unary op
-    {
-        impl<const $lanes:ident: usize> core::ops::$trait:ident for $type:ty
-        where
-            LaneCount<$lanes2:ident>: SupportedLaneCount,
-        {
-            type Output = $output:ty;
-            fn $fn:ident($self_tok:ident) -> Self::Output $body:tt
-        }
-    } => {
-        impl<const $lanes: usize> core::ops::$trait for $type
-        where
-            LaneCount<$lanes2>: SupportedLaneCount,
-        {
-            type Output = $output;
-            fn $fn($self_tok) -> Self::Output $body
-        }
-    }
 }
 
 /// Automatically implements operators over vectors and scalars for a particular vector.
@@ -119,34 +101,6 @@ macro_rules! impl_op {
         impl_op! { @binary $scalar, BitXor::bitxor, simd_xor }
     };
 
-    { impl Not for $scalar:ty } => {
-        impl_ref_ops! {
-            impl<const LANES: usize> core::ops::Not for Simd<$scalar, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                type Output = Self;
-                fn not(self) -> Self::Output {
-                    self ^ Self::splat(!<$scalar>::default())
-                }
-            }
-        }
-    };
-
-    { impl Neg for $scalar:ty } => {
-        impl_ref_ops! {
-            impl<const LANES: usize> core::ops::Neg for Simd<$scalar, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                type Output = Self;
-                fn neg(self) -> Self::Output {
-                    unsafe { intrinsics::simd_neg(self) }
-                }
-            }
-        }
-    };
-
     // generic binary op with assignment when output is `Self`
     { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $intrinsic:ident } => {
         impl_ref_ops! {
@@ -204,7 +158,6 @@ macro_rules! impl_float_ops {
             impl_op! { impl Mul for $scalar }
             impl_op! { impl Div for $scalar }
             impl_op! { impl Rem for $scalar }
-            impl_op! { impl Neg for $scalar }
         )*
     };
 }
@@ -219,7 +172,6 @@ macro_rules! impl_unsigned_int_ops {
             impl_op! { impl BitAnd for $scalar }
             impl_op! { impl BitOr  for $scalar }
             impl_op! { impl BitXor for $scalar }
-            impl_op! { impl Not for $scalar }
 
             // Integers panic on divide by 0
             impl_ref_ops! {
@@ -441,9 +393,6 @@ macro_rules! impl_unsigned_int_ops {
 macro_rules! impl_signed_int_ops {
     { $($scalar:ty),* } => {
         impl_unsigned_int_ops! { $($scalar),* }
-        $( // scalar
-            impl_op! { impl Neg for $scalar }
-        )*
     };
 }
 
diff --git a/crates/core_simd/src/ops/unary.rs b/crates/core_simd/src/ops/unary.rs
new file mode 100644
index 00000000000..4ebea560fc6
--- /dev/null
+++ b/crates/core_simd/src/ops/unary.rs
@@ -0,0 +1,77 @@
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use core::ops::{Neg, Not}; // unary ops
+
+macro_rules! neg {
+    ($(impl<const LANES: usize> Neg for Simd<$scalar:ty, LANES>)*) => {
+        $(impl<const LANES: usize> Neg for Simd<$scalar, LANES>
+        where
+            $scalar: SimdElement,
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            type Output = Self;
+
+            #[inline]
+            #[must_use = "operator returns a new vector without mutating the input"]
+            fn neg(self) -> Self::Output {
+                unsafe { intrinsics::simd_neg(self) }
+            }
+        })*
+    }
+}
+
+neg! {
+    impl<const LANES: usize> Neg for Simd<f32, LANES>
+
+    impl<const LANES: usize> Neg for Simd<f64, LANES>
+
+    impl<const LANES: usize> Neg for Simd<i8, LANES>
+
+    impl<const LANES: usize> Neg for Simd<i16, LANES>
+
+    impl<const LANES: usize> Neg for Simd<i32, LANES>
+
+    impl<const LANES: usize> Neg for Simd<i64, LANES>
+
+    impl<const LANES: usize> Neg for Simd<isize, LANES>
+}
+
+macro_rules! not {
+    ($(impl<const LANES: usize> Not for Simd<$scalar:ty, LANES>)*) => {
+        $(impl<const LANES: usize> Not for Simd<$scalar, LANES>
+        where
+            $scalar: SimdElement,
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            type Output = Self;
+
+            #[inline]
+            #[must_use = "operator returns a new vector without mutating the input"]
+            fn not(self) -> Self::Output {
+                self ^ (Simd::splat(!(0 as $scalar)))
+            }
+        })*
+    }
+}
+
+not! {
+    impl<const LANES: usize> Not for Simd<i8, LANES>
+
+    impl<const LANES: usize> Not for Simd<i16, LANES>
+
+    impl<const LANES: usize> Not for Simd<i32, LANES>
+
+    impl<const LANES: usize> Not for Simd<i64, LANES>
+
+    impl<const LANES: usize> Not for Simd<isize, LANES>
+
+    impl<const LANES: usize> Not for Simd<u8, LANES>
+
+    impl<const LANES: usize> Not for Simd<u16, LANES>
+
+    impl<const LANES: usize> Not for Simd<u32, LANES>
+
+    impl<const LANES: usize> Not for Simd<u64, LANES>
+
+    impl<const LANES: usize> Not for Simd<usize, LANES>
+}

From 257fa7aa6d03157476f0d6acd9a0b4c28a3877ec Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Tue, 23 Nov 2021 17:55:14 -0800
Subject: [PATCH 11/12] Drop splats for Simd<T, _>

Unfortunately, splatting impls currently break several crates.
Rust needs more time to review possible mitigations, so
drop the impls for the `impl Add<T> for Simd<T, _>` pattern, for now.
---
 crates/core_simd/examples/nbody.rs   |  10 +-
 crates/core_simd/src/math.rs         |   8 +-
 crates/core_simd/src/ops.rs          | 138 ---------------------------
 crates/core_simd/src/vector/ptr.rs   |   4 +-
 crates/core_simd/tests/ops_macros.rs |  48 ----------
 5 files changed, 11 insertions(+), 197 deletions(-)

diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs
index 779575985ed..43280feebbd 100644
--- a/crates/core_simd/examples/nbody.rs
+++ b/crates/core_simd/examples/nbody.rs
@@ -97,7 +97,7 @@ mod nbody {
         let sun = &mut sun[0];
         for body in rest {
             let m_ratio = body.mass / SOLAR_MASS;
-            sun.v -= body.v * m_ratio;
+            sun.v -= body.v * Simd::splat(m_ratio);
         }
     }
 
@@ -143,14 +143,14 @@ mod nbody {
         let mut i = 0;
         for j in 0..N_BODIES {
             for k in j + 1..N_BODIES {
-                let f = r[i] * mag[i];
-                bodies[j].v -= f * bodies[k].mass;
-                bodies[k].v += f * bodies[j].mass;
+                let f = r[i] * Simd::splat(mag[i]);
+                bodies[j].v -= f * Simd::splat(bodies[k].mass);
+                bodies[k].v += f * Simd::splat(bodies[j].mass);
                 i += 1
             }
         }
         for body in bodies {
-            body.x += dt * body.v
+            body.x += Simd::splat(dt) * body.v
         }
     }
 
diff --git a/crates/core_simd/src/math.rs b/crates/core_simd/src/math.rs
index 2bae414ebfb..7435b6df918 100644
--- a/crates/core_simd/src/math.rs
+++ b/crates/core_simd/src/math.rs
@@ -17,7 +17,7 @@ macro_rules! impl_uint_arith {
             /// let max = Simd::splat(MAX);
             /// let unsat = x + max;
             /// let sat = x.saturating_add(max);
-            /// assert_eq!(x - 1, unsat);
+            /// assert_eq!(unsat, Simd::from_array([1, 0, MAX, MAX - 1]));
             /// assert_eq!(sat, max);
             /// ```
             #[inline]
@@ -37,7 +37,7 @@ macro_rules! impl_uint_arith {
             /// let max = Simd::splat(MAX);
             /// let unsat = x - max;
             /// let sat = x.saturating_sub(max);
-            /// assert_eq!(unsat, x + 1);
+            /// assert_eq!(unsat, Simd::from_array([3, 2, 1, 0]));
             /// assert_eq!(sat, Simd::splat(0));
             #[inline]
             pub fn saturating_sub(self, second: Self) -> Self {
@@ -105,7 +105,7 @@ macro_rules! impl_int_arith {
             #[inline]
             pub fn abs(self) -> Self {
                 const SHR: $ty = <$ty>::BITS as $ty - 1;
-                let m = self >> SHR;
+                let m = self >> Simd::splat(SHR);
                 (self^m) - m
             }
 
@@ -128,7 +128,7 @@ macro_rules! impl_int_arith {
             pub fn saturating_abs(self) -> Self {
                 // arith shift for -1 or 0 mask based on sign bit, giving 2s complement
                 const SHR: $ty = <$ty>::BITS as $ty - 1;
-                let m = self >> SHR;
+                let m = self >> Simd::splat(SHR);
                 (self^m).saturating_sub(m)
             }
 
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index b7da4f341d1..3582c57870b 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -118,34 +118,6 @@ macro_rules! impl_op {
                 }
             }
         }
-
-        impl_ref_ops! {
-            impl<const LANES: usize> core::ops::$trait<$scalar> for Simd<$scalar, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                type Output = Self;
-
-                #[inline]
-                fn $trait_fn(self, rhs: $scalar) -> Self::Output {
-                    core::ops::$trait::$trait_fn(self, Self::splat(rhs))
-                }
-            }
-        }
-
-        impl_ref_ops! {
-            impl<const LANES: usize> core::ops::$trait<Simd<$scalar, LANES>> for $scalar
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                type Output = Simd<$scalar, LANES>;
-
-                #[inline]
-                fn $trait_fn(self, rhs: Simd<$scalar, LANES>) -> Self::Output {
-                    core::ops::$trait::$trait_fn(Simd::splat(self), rhs)
-                }
-            }
-        }
     };
 }
 
@@ -202,43 +174,6 @@ macro_rules! impl_unsigned_int_ops {
                 }
             }
 
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Div<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn div(self, rhs: $scalar) -> Self::Output {
-                        if rhs == 0 {
-                            panic!("attempt to divide by zero");
-                        }
-                        if <$scalar>::MIN != 0 &&
-                            self.as_array().iter().any(|x| *x == <$scalar>::MIN) &&
-                            rhs == -1 as _ {
-                                panic!("attempt to divide with overflow");
-                        }
-                        let rhs = Self::splat(rhs);
-                        unsafe { intrinsics::simd_div(self, rhs) }
-                    }
-                }
-            }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Div<Simd<$scalar, LANES>> for $scalar
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Simd<$scalar, LANES>;
-
-                    #[inline]
-                    fn div(self, rhs: Simd<$scalar, LANES>) -> Self::Output {
-                        Simd::splat(self) / rhs
-                    }
-                }
-            }
-
             // remainder panics on zero divisor
             impl_ref_ops! {
                 impl<const LANES: usize> core::ops::Rem<Self> for Simd<$scalar, LANES>
@@ -268,43 +203,6 @@ macro_rules! impl_unsigned_int_ops {
                 }
             }
 
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Rem<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn rem(self, rhs: $scalar) -> Self::Output {
-                        if rhs == 0 {
-                            panic!("attempt to calculate the remainder with a divisor of zero");
-                        }
-                        if <$scalar>::MIN != 0 &&
-                            self.as_array().iter().any(|x| *x == <$scalar>::MIN) &&
-                            rhs == -1 as _ {
-                                panic!("attempt to calculate the remainder with overflow");
-                        }
-                        let rhs = Self::splat(rhs);
-                        unsafe { intrinsics::simd_rem(self, rhs) }
-                    }
-                }
-            }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Rem<Simd<$scalar, LANES>> for $scalar
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Simd<$scalar, LANES>;
-
-                    #[inline]
-                    fn rem(self, rhs: Simd<$scalar, LANES>) -> Self::Output {
-                        Simd::splat(self) % rhs
-                    }
-                }
-            }
-
             // shifts panic on overflow
             impl_ref_ops! {
                 impl<const LANES: usize> core::ops::Shl<Self> for Simd<$scalar, LANES>
@@ -328,24 +226,6 @@ macro_rules! impl_unsigned_int_ops {
                 }
             }
 
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Shl<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn shl(self, rhs: $scalar) -> Self::Output {
-                        if invalid_shift_rhs(rhs) {
-                            panic!("attempt to shift left with overflow");
-                        }
-                        let rhs = Self::splat(rhs);
-                        unsafe { intrinsics::simd_shl(self, rhs) }
-                    }
-                }
-            }
-
             impl_ref_ops! {
                 impl<const LANES: usize> core::ops::Shr<Self> for Simd<$scalar, LANES>
                 where
@@ -367,24 +247,6 @@ macro_rules! impl_unsigned_int_ops {
                     }
                 }
             }
-
-            impl_ref_ops! {
-                impl<const LANES: usize> core::ops::Shr<$scalar> for Simd<$scalar, LANES>
-                where
-                    LaneCount<LANES>: SupportedLaneCount,
-                {
-                    type Output = Self;
-
-                    #[inline]
-                    fn shr(self, rhs: $scalar) -> Self::Output {
-                        if invalid_shift_rhs(rhs) {
-                            panic!("attempt to shift with overflow");
-                        }
-                        let rhs = Self::splat(rhs);
-                        unsafe { intrinsics::simd_shr(self, rhs) }
-                    }
-                }
-            }
         )*
     };
 }
diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs
index ac9b98ca031..c668d9a6eae 100644
--- a/crates/core_simd/src/vector/ptr.rs
+++ b/crates/core_simd/src/vector/ptr.rs
@@ -23,7 +23,7 @@ where
     pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
         unsafe {
             let x: Simd<usize, LANES> = mem::transmute_copy(&self);
-            mem::transmute_copy(&{ x + (addend * mem::size_of::<T>()) })
+            mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::<T>())) })
         }
     }
 }
@@ -49,7 +49,7 @@ where
     pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
         unsafe {
             let x: Simd<usize, LANES> = mem::transmute_copy(&self);
-            mem::transmute_copy(&{ x + (addend * mem::size_of::<T>()) })
+            mem::transmute_copy(&{ x + (addend * Simd::splat(mem::size_of::<T>())) })
         }
     }
 }
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 31b7ee20695..43ddde4c55e 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -38,22 +38,6 @@ macro_rules! impl_binary_op_test {
                     );
                 }
 
-                fn scalar_rhs<const LANES: usize>() {
-                    test_helpers::test_binary_scalar_rhs_elementwise(
-                        &<Simd<$scalar, LANES> as core::ops::$trait<$scalar>>::$fn,
-                        &$scalar_fn,
-                        &|_, _| true,
-                    );
-                }
-
-                fn scalar_lhs<const LANES: usize>() {
-                    test_helpers::test_binary_scalar_lhs_elementwise(
-                        &<$scalar as core::ops::$trait<Simd<$scalar, LANES>>>::$fn,
-                        &$scalar_fn,
-                        &|_, _| true,
-                    );
-                }
-
                 fn assign<const LANES: usize>() {
                     test_helpers::test_binary_elementwise(
                         &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign>::$fn_assign(&mut a, b); a },
@@ -61,14 +45,6 @@ macro_rules! impl_binary_op_test {
                         &|_, _| true,
                     );
                 }
-
-                fn assign_scalar_rhs<const LANES: usize>() {
-                    test_helpers::test_binary_scalar_rhs_elementwise(
-                        &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign<$scalar>>::$fn_assign(&mut a, b); a },
-                        &$scalar_fn,
-                        &|_, _| true,
-                    );
-                }
             }
         }
     };
@@ -99,22 +75,6 @@ macro_rules! impl_binary_checked_op_test {
                     );
                 }
 
-                fn scalar_rhs<const LANES: usize>() {
-                    test_helpers::test_binary_scalar_rhs_elementwise(
-                        &<Simd<$scalar, LANES> as core::ops::$trait<$scalar>>::$fn,
-                        &$scalar_fn,
-                        &|x, y| x.iter().all(|x| $check_fn(*x, y)),
-                    );
-                }
-
-                fn scalar_lhs<const LANES: usize>() {
-                    test_helpers::test_binary_scalar_lhs_elementwise(
-                        &<$scalar as core::ops::$trait<Simd<$scalar, LANES>>>::$fn,
-                        &$scalar_fn,
-                        &|x, y| y.iter().all(|y| $check_fn(x, *y)),
-                    );
-                }
-
                 fn assign<const LANES: usize>() {
                     test_helpers::test_binary_elementwise(
                         &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign>::$fn_assign(&mut a, b); a },
@@ -122,14 +82,6 @@ macro_rules! impl_binary_checked_op_test {
                         &|x, y| x.iter().zip(y.iter()).all(|(x, y)| $check_fn(*x, *y)),
                     )
                 }
-
-                fn assign_scalar_rhs<const LANES: usize>() {
-                    test_helpers::test_binary_scalar_rhs_elementwise(
-                        &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign<$scalar>>::$fn_assign(&mut a, b); a },
-                        &$scalar_fn,
-                        &|x, y| x.iter().all(|x| $check_fn(*x, y)),
-                    )
-                }
             }
         }
     };

From 8003b043233213c6f984837d7618f92a6181a875 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 1 Dec 2021 15:02:03 -0800
Subject: [PATCH 12/12] impl Op<&'_ RHS> for &'_ LHS

---
 crates/core_simd/src/ops/deref.rs   | 114 ++++++++++++++++++++--------
 crates/core_simd/tests/autoderef.rs |  22 ++++++
 2 files changed, 106 insertions(+), 30 deletions(-)
 create mode 100644 crates/core_simd/tests/autoderef.rs

diff --git a/crates/core_simd/src/ops/deref.rs b/crates/core_simd/src/ops/deref.rs
index 1138b9494f6..9883a74c92d 100644
--- a/crates/core_simd/src/ops/deref.rs
+++ b/crates/core_simd/src/ops/deref.rs
@@ -1,70 +1,124 @@
 //! This module hacks in "implicit deref" for Simd's operators.
 //! Ideally, Rust would take care of this itself,
 //! and method calls usually handle the LHS implicitly.
-//! So, we'll manually deref the RHS.
+//! But this is not the case with arithmetic ops.
 use super::*;
 
-macro_rules! deref_ops {
-    ($(impl<T, const LANES: usize> $trait:ident<&Self> for Simd<T, LANES> {
-            fn $call:ident(rhs: &Self)
-        })*) => {
-        $(impl<T, const LANES: usize> $trait<&Self> for Simd<T, LANES>
+macro_rules! deref_lhs {
+    (impl<T, const LANES: usize> $trait:ident for $simd:ty {
+            fn $call:ident
+        }) => {
+        impl<T, const LANES: usize> $trait<$simd> for &$simd
         where
-            Self: $trait<Self, Output = Self>,
             T: SimdElement,
+            $simd: $trait<$simd, Output = $simd>,
             LaneCount<LANES>: SupportedLaneCount,
         {
-            type Output = Self;
+            type Output = Simd<T, LANES>;
 
             #[inline]
             #[must_use = "operator returns a new vector without mutating the inputs"]
-            fn $call(self, rhs: &Self) -> Self::Output {
+            fn $call(self, rhs: $simd) -> Self::Output {
+                (*self).$call(rhs)
+            }
+        }
+    };
+}
+
+macro_rules! deref_rhs {
+    (impl<T, const LANES: usize> $trait:ident for $simd:ty {
+            fn $call:ident
+        }) => {
+        impl<T, const LANES: usize> $trait<&$simd> for $simd
+        where
+            T: SimdElement,
+            $simd: $trait<$simd, Output = $simd>,
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            type Output = Simd<T, LANES>;
+
+            #[inline]
+            #[must_use = "operator returns a new vector without mutating the inputs"]
+            fn $call(self, rhs: &$simd) -> Self::Output {
                 self.$call(*rhs)
             }
-        })*
+        }
+    };
+}
+
+macro_rules! deref_ops {
+    ($(impl<T, const LANES: usize> $trait:ident for $simd:ty {
+            fn $call:ident
+        })*) => {
+        $(
+            deref_rhs! {
+                impl<T, const LANES: usize> $trait for $simd {
+                    fn $call
+                }
+            }
+            deref_lhs! {
+                impl<T, const LANES: usize> $trait for $simd {
+                    fn $call
+                }
+            }
+            impl<'lhs, 'rhs, T, const LANES: usize> $trait<&'rhs $simd> for &'lhs $simd
+            where
+                T: SimdElement,
+                $simd: $trait<$simd, Output = $simd>,
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                type Output = $simd;
+
+                #[inline]
+                #[must_use = "operator returns a new vector without mutating the inputs"]
+                fn $call(self, rhs: &$simd) -> Self::Output {
+                    (*self).$call(*rhs)
+                }
+            }
+        )*
     }
 }
 
 deref_ops! {
     // Arithmetic
-    impl<T, const LANES: usize> Add<&Self> for Simd<T, LANES> {
-        fn add(rhs: &Self)
+    impl<T, const LANES: usize> Add for Simd<T, LANES> {
+        fn add
     }
 
-    impl<T, const LANES: usize> Mul<&Self> for Simd<T, LANES> {
-        fn mul(rhs: &Self)
+    impl<T, const LANES: usize> Mul for Simd<T, LANES> {
+        fn mul
     }
 
-    impl<T, const LANES: usize> Sub<&Self> for Simd<T, LANES> {
-        fn sub(rhs: &Self)
+    impl<T, const LANES: usize> Sub for Simd<T, LANES> {
+        fn sub
     }
 
-    impl<T, const LANES: usize> Div<&Self> for Simd<T, LANES> {
-        fn div(rhs: &Self)
+    impl<T, const LANES: usize> Div for Simd<T, LANES> {
+        fn div
     }
 
-    impl<T, const LANES: usize> Rem<&Self> for Simd<T, LANES> {
-        fn rem(rhs: &Self)
+    impl<T, const LANES: usize> Rem for Simd<T, LANES> {
+        fn rem
     }
 
     // Bitops
-    impl<T, const LANES: usize> BitAnd<&Self> for Simd<T, LANES> {
-        fn bitand(rhs: &Self)
+    impl<T, const LANES: usize> BitAnd for Simd<T, LANES> {
+        fn bitand
     }
 
-    impl<T, const LANES: usize> BitOr<&Self> for Simd<T, LANES> {
-        fn bitor(rhs: &Self)
+    impl<T, const LANES: usize> BitOr for Simd<T, LANES> {
+        fn bitor
     }
 
-    impl<T, const LANES: usize> BitXor<&Self> for Simd<T, LANES> {
-        fn bitxor(rhs: &Self)
+    impl<T, const LANES: usize> BitXor for Simd<T, LANES> {
+        fn bitxor
     }
 
-    impl<T, const LANES: usize> Shl<&Self> for Simd<T, LANES> {
-        fn shl(rhs: &Self)
+    impl<T, const LANES: usize> Shl for Simd<T, LANES> {
+        fn shl
     }
 
-    impl<T, const LANES: usize> Shr<&Self> for Simd<T, LANES> {
-        fn shr(rhs: &Self)
+    impl<T, const LANES: usize> Shr for Simd<T, LANES> {
+        fn shr
     }
 }
diff --git a/crates/core_simd/tests/autoderef.rs b/crates/core_simd/tests/autoderef.rs
new file mode 100644
index 00000000000..9359da16ee5
--- /dev/null
+++ b/crates/core_simd/tests/autoderef.rs
@@ -0,0 +1,22 @@
+// Test that we handle all our "auto-deref" cases correctly.
+#![feature(portable_simd)]
+use core_simd::f32x4;
+
+#[cfg(target_arch = "wasm32")]
+use wasm_bindgen_test::*;
+
+#[cfg(target_arch = "wasm32")]
+wasm_bindgen_test_configure!(run_in_browser);
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn deref() {
+    let x = f32x4::splat(1.0);
+    let y = f32x4::splat(2.0);
+    let a = &x;
+    let b = &y;
+    assert_eq!(f32x4::splat(3.0), x + y);
+    assert_eq!(f32x4::splat(3.0), x + b);
+    assert_eq!(f32x4::splat(3.0), a + y);
+    assert_eq!(f32x4::splat(3.0), a + b);
+}