Add support for masked loads & stores #374

Closed · wants to merge 8 commits · Changes from 3 commits
9 changes: 9 additions & 0 deletions crates/core_simd/src/intrinsics.rs
@@ -107,6 +107,15 @@ extern "platform-intrinsic" {
/// like gather, but more spicy, as it writes instead of reads
pub(crate) fn simd_scatter<T, U, V>(val: T, ptr: U, mask: V);

/// like a loop of reads at increasing offsets from the same base pointer
/// val: vector of fallback values, selected wherever a lane's mask is off
/// ptr: pointer to the first element to read
/// mask: a "wide" mask of integers, selects as if simd_select(mask, read(ptr), val)
/// note, the LLVM intrinsic accepts a mask vector of `<N x i1>`
pub(crate) fn simd_masked_load<T, U, V>(val: T, ptr: U, mask: V) -> T;
/// like masked_load, but more spicy, as it writes instead of reads
pub(crate) fn simd_masked_store<T, U, V>(val: T, ptr: U, mask: V);

// {s,u}add.sat
pub(crate) fn simd_saturating_add<T>(x: T, y: T) -> T;

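For reference, a lane-wise scalar model of what the two new intrinsics compute; the `_ref` names and the fixed-size-array signatures are illustrative only, not part of the PR:

// Scalar reference semantics (a sketch): a masked load keeps `val` in every
// lane whose mask is off, and a masked store leaves memory untouched there.
fn masked_load_ref<const N: usize>(val: [u32; N], mem: &[u32; N], mask: [bool; N]) -> [u32; N] {
    let mut out = val;
    for i in 0..N {
        if mask[i] {
            out[i] = mem[i];
        }
    }
    out
}

fn masked_store_ref<const N: usize>(val: [u32; N], mem: &mut [u32; N], mask: [bool; N]) {
    for i in 0..N {
        if mask[i] {
            mem[i] = val[i];
        }
    }
}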
94 changes: 94 additions & 0 deletions crates/core_simd/src/vector.rs
@@ -1,3 +1,4 @@
use super::masks::{ToBitMask, ToBitMaskArray};
use crate::simd::{
cmp::SimdPartialOrd,
intrinsics,
@@ -311,6 +312,57 @@ where
unsafe { self.store(slice.as_mut_ptr().cast()) }
}

#[must_use]
#[inline]
pub fn masked_load_or(slice: &[T], or: Self) -> Self
where
Mask<<T as SimdElement>::Mask, N>: ToBitMask + ToBitMaskArray,
{
Self::masked_load_select(slice, Mask::splat(true), or)
}

Member:
I don't think this function does anything?

Member:
I suppose it's masking the slice length, but I'm not sure I would even call this a masked load. Also, I'm not sure this is the best way to load from slices on most architectures.

Contributor Author:
It's supposed to enable us to write extremely nice loops like this:

let mut accum = Simd::<u8, N>::default();
for i in (0..data.len()).step_by(N) {
    accum ^= Simd::masked_load_or(&data[i..], Simd::default());
}

No epilogues or scalar fallbacks needed. This could be even shorter since SimdElement implies Default?

But it's only a reality on AVX-512 for now on account of the codegen
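For contrast, the usual shape without masked loads needs an explicit epilogue; a sketch, specialized to u8x32 over the same `data` slice:

let mut accum = u8x32::splat(0);
let mut chunks = data.chunks_exact(32);
for chunk in &mut chunks {
    accum ^= u8x32::from_slice(chunk);
}
// buffered epilogue for the remainder
let rem = chunks.remainder();
let mut tail = [0u8; 32];
tail[..rem.len()].copy_from_slice(rem);
accum ^= u8x32::from_array(tail);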

Member:
I would at least rename it--it might be implemented via masked loads, but API-wise I would consider it something else entirely. This is perhaps Simd::from_slice.

Contributor Author:
After the rename -- I don't like that this function seems more discoverable than from_slice. Users should prefer from_slice if it fits their use-case. This isn't a problem with gather/scatter because those are specific names with more complex signatures, so they're hard to use by accident.

Member:
from_slice already has a branch on the length anyway; what I'm suggesting is that perhaps from_slice should instead do a masked load for len < N:

assert!(
slice.len() >= Self::LEN,
"slice length must be at least the number of elements"
);
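
i.e. roughly this shape -- a sketch of the suggestion, specialized to u8x16 (the zero fill value and the free-function form are assumptions, not the reviewer's code):

fn from_slice_padded(slice: &[u8]) -> u8x16 {
    if slice.len() >= 16 {
        // existing full-width path
        u8x16::from_slice(slice)
    } else {
        // short slice: masked load, out-of-bounds lanes become zero
        u8x16::masked_load_select(slice, Mask::splat(true), u8x16::splat(0))
    }
}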


#[must_use]
#[inline]
pub fn masked_load_select(

Member:
I would call this just masked_load, or load_select. IMO masked and select mean the same thing.

slice: &[T],
mut enable: Mask<<T as SimdElement>::Mask, N>,
or: Self,
) -> Self
where
Mask<<T as SimdElement>::Mask, N>: ToBitMask + ToBitMaskArray,
{
enable &= {
let mask = bzhi_u64(u64::MAX, core::cmp::min(N, slice.len()) as u32);
let mask_bytes: [u8; 8] = unsafe { core::mem::transmute(mask) };
let mut in_bounds_arr = Mask::splat(true).to_bitmask_array();
let len = in_bounds_arr.as_ref().len();
in_bounds_arr.as_mut().copy_from_slice(&mask_bytes[..len]);
Mask::from_bitmask_array(in_bounds_arr)
};

Contributor Author:
The previous version was generating suboptimal code. This implementation is still a bit gnarly because I wasn't able to easily cast a u64 into a Mask<_, N>. This is because ToBitMask uses u8/u16/u32/u64. I don't think we can pull in num_traits for an easy solution to this?

This version generates code that matches my handwritten AVX-512 intrinsics.
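Concretely, for an 8-lane vector over a 5-element slice, the block above boils down to this (a worked example, not code from the diff):

// min(N, slice.len()) = min(8, 5) = 5, so bzhi keeps only the low 5 bits:
let mask = bzhi_u64(u64::MAX, 5);
assert_eq!(mask, 0b0001_1111); // lanes 0..5 in bounds, lanes 5..8 masked off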

Member:
I made some (not yet merged) changes to bitmasks in #375 that may help you (with that change, it's possible to always return a u64), but I'm suspicious about this implementation, since it's not particularly portable. Bitmasks are really only efficient on x86, and most x86 for the time being will not be AVX-512. I'm confident this codegen will be horrendous on aarch64 :)

I think you should use the old version of the bounds checking, and conditionally check for x86 and N < 64 to use bitmasks. I would actually split off the x86-optimized enable-adjusting code to its own function.
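
A sketch of that split (the helper names, element type, and exact cfg condition are illustrative, not from the PR):

use core_simd::simd::{cmp::SimdPartialOrd, LaneCount, Mask, Simd, SupportedLaneCount};

// Portable "old-style" bounds check: compare each lane index against the length.
fn in_bounds_mask<const N: usize>(len: usize) -> Mask<i8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let idx = Simd::<u8, N>::from_array(core::array::from_fn(|i| i as u8));
    idx.simd_lt(Simd::splat(len.min(N) as u8))
}

fn enable_in_bounds<const N: usize>(enable: Mask<i8, N>, len: usize) -> Mask<i8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // On x86 with N < 64 this is where the bzhi/bitmask construction from the
    // current diff would be cfg-selected instead of the comparison above.
    enable & in_bounds_mask::<N>(len)
}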

Contributor Author:
I'll check it out. So far, the bitmasks haven't been a problem for me because of other codegen issues.

I hoped that LLVM would implement the fallback by splitting the vector in half at every step (log2 depth). I was imagining something like this:

fn load_select<T, const N: usize>(slice: &[T], enable: Mask<T::Mask, N>, or: Simd<T, N>) -> Simd<T, N> where ... {
    if likely(enable.all() && in_bounds(N - 1)) {
        Self::from_slice_unchecked(slice)
    } else {
        let low = Simd::<T, N/2>::load_select(slice, enable.low_half(), or.low_half());
        let high = Simd::<T, N/2>::load_select(&slice[N/2..], enable.high_half(), or.high_half());
        merge(low, high)
    }
}

But instead it generates code that's more like this:

fn load_select<T, const N: usize>(slice: &[T], enable: Mask<T::Mask, N>, or: Simd<T, N>) -> Simd<T, N> where ... {
    let mut elems = or.to_array();
    for lane_ix in 0..N {
        if in_bounds(lane_ix) && enable.test(lane_ix) {
            elems[lane_ix] = slice[lane_ix];
        }
    }
    Self::from_array(elems)
}

If I wanted to implement this for other architectures efficiently, could I do it with some form of recursive swizzle & macros? I think the potential for writing simpler code while retaining efficiency is there. The intended use case is to have the likely(enable.all() && in_bounds(N-1)) branch always be taken, except for the last iteration. I think that would lead to good codegen, since you don't need to duplicate loop bodies as you would with an epilogue, and it should also be perfectly predictable until the last iteration?

This could still be inefficient in certain cases:

  • Even on AVX-512, skipping the bitmask operations in all but the last iteration could free up the core. In my experience you can probably spare the scalar resources for this, so it doesn't seem like a big deal
  • The caller may depend on the mask for their own computation after load_select. This could be a problem if they compute the mask differently than this library does and the compiler doesn't notice -- basically the mask gets computed twice, in different ways

Member:
I don't think const generics are powerful enough to allow anything recursive, yet. I think you're being too tricky for the compiler--if you'd like to skip the checks, you need to add the explicit if enable.all() branch.
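
A sketch of that explicit branch, specialized to u8x16 (the function name is illustrative; masked_load_select is the method added in this PR):

fn load_select_fast(slice: &[u8], enable: mask8x16, or: u8x16) -> u8x16 {
    if enable.all() && slice.len() >= 16 {
        // hot path: everything enabled and in bounds, plain full-width load
        u8x16::from_slice(slice)
    } else {
        // tail / partially-enabled case: fall back to the masked path
        u8x16::masked_load_select(slice, enable, or)
    }
}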

Member:
This was obviously easier for me to implement, since I didn't write it generically, but does this look right?
https://rust.godbolt.org/z/TvYqPh4js

First, I couldn't convince the compiler to emit bzhi so I had to resort to asm. But even then--the naive approach seems more efficient to me.

Contributor Author:
Check out my comment on StackOverflow here: https://stackoverflow.com/questions/75179720/how-to-get-rust-compiler-to-emit-bzhi-instruction-without-resorting-to-platform#comment136619035_75179720

I found two ways to emit bzhi, but neither is production-ready IMO.

I will benchmark this some more. I like your constexpr array solution. When I did something similar before (but using a &'static [usize; 64]), it loaded the entire slice into multiple vector registers before shuffling.

Your solution is precise and indeed looks clean here, including for the -C target-feature=+sse4.1,+avx2 version. _mm_maskload comes from AVX/AVX2. Not sure about SSE & NEON, but it's promising.

I don't think it's valid to do len as i32; you'd need something like i32::try_from(len).unwrap_or(i32::MAX) to get a saturating cast. Otherwise, passing in a slice of length i32::MAX as usize + 1 could make the bounds check reject valid elements, right? This applies to the smaller mask element types too, like i16 and i8.

As for const generics and N/2, I'll try adding just a single branch if likely(enable.all() && in_bounds(N-1)) to see if there's a perf hit on AVX-512, because it's definitely going to be a big win for other targets.

Member:
This is silly, but maybe you can rely on the optimizer here: https://rust.godbolt.org/z/TxPYhx6K4

You could add that index function to one of the Sealed traits.

unsafe { Self::masked_load_select_ptr(slice.as_ptr(), enable, or) }
}

#[must_use]
#[inline]
pub unsafe fn masked_load_select_unchecked(
slice: &[T],
enable: Mask<<T as SimdElement>::Mask, N>,
or: Self,
) -> Self {
let ptr = slice.as_ptr();
unsafe { Self::masked_load_select_ptr(ptr, enable, or) }
}

#[must_use]
#[inline]
pub unsafe fn masked_load_select_ptr(
ptr: *const T,
enable: Mask<<T as SimdElement>::Mask, N>,
or: Self,
) -> Self {
unsafe { intrinsics::simd_masked_load(or, ptr, enable.to_int()) }
}

/// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
/// If an index is out-of-bounds, the element is instead selected from the `or` vector.
///
@@ -489,6 +541,37 @@ where
unsafe { intrinsics::simd_gather(or, source, enable.to_int()) }
}

#[inline]
pub fn masked_store(self, slice: &mut [T], mut enable: Mask<<T as SimdElement>::Mask, N>)
where
Mask<<T as SimdElement>::Mask, N>: ToBitMask + ToBitMaskArray,
{
enable &= {
let mask = bzhi_u64(u64::MAX, core::cmp::min(N, slice.len()) as u32);
let mask_bytes: [u8; 8] = unsafe { core::mem::transmute(mask) };
let mut in_bounds_arr = Mask::splat(true).to_bitmask_array();
let len = in_bounds_arr.as_ref().len();
in_bounds_arr.as_mut().copy_from_slice(&mask_bytes[..len]);
Mask::from_bitmask_array(in_bounds_arr)
};
unsafe { self.masked_store_ptr(slice.as_mut_ptr(), enable) }
}

#[inline]
pub unsafe fn masked_store_unchecked(
self,
slice: &mut [T],
enable: Mask<<T as SimdElement>::Mask, N>,
) {
let ptr = slice.as_mut_ptr();
unsafe { self.masked_store_ptr(ptr, enable) }
}

#[inline]
pub unsafe fn masked_store_ptr(self, ptr: *mut T, enable: Mask<<T as SimdElement>::Mask, N>) {
unsafe { intrinsics::simd_masked_store(self, ptr, enable.to_int()) }
}

/// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`.
/// If an index is out-of-bounds, the write is suppressed without panicking.
/// If two elements in the scattered vector would write to the same index
@@ -974,3 +1057,14 @@ where
{
type Mask = isize;
}

// This function matches the semantics of the `bzhi` instruction on x86 BMI2
// TODO: optimize it further if possible
// https://stackoverflow.com/questions/75179720/how-to-get-rust-compiler-to-emit-bzhi-instruction-without-resorting-to-platform
fn bzhi_u64(a: u64, ix: u32) -> u64 {

Contributor Author:
If we made ix a u64, this would remove the need to do core::cmp::min(N, slice.len()) as u32 at the call sites above.

Which is doubly important if we take it further:

extern "C" {
    #[link_name = "llvm.x86.bmi.bzhi.64"]
    fn x86_bmi2_bzhi_64(x: u64, y: u64) -> u64;
}

#[inline(always)]
fn bzhi_u64(a: u64, ix: u64) -> u64 {
    #[cfg(target_feature = "bmi2")]
    unsafe {
        return x86_bmi2_bzhi_64(a, ix);
    }

    if ix > 63 {
        a
    } else {
        a & (1u64 << ix) - 1
    }
}

For whatever reason the Intel intrinsic _bzhi_u64 takes ix: u32, in the C/C++ versions as well. Even a u64 as u32 cast generates a redundant mov to a 32-bit register, clearing the high bits. All of this is unnecessary because the instruction operates on 64-bit registers.

It's the difference between the current pure-Rust version,

        cmp rsi, 64
        mov r9d, 64
        cmovb r9, rsi
        bzhi r9, r8, r9

calling the core::arch::x86_64::_bzhi_u64 intrinsic with slice.len() as u32 (which is incorrect for slices longer than u32::MAX),

        mov r9d, esi
        bzhi r9, rdx, r9

and using the above code with #![feature(link_llvm_intrinsics)]:

        bzhi r9, rdx, rsi

The layers of mismatched semantics are working against us.

Contributor Author:
Nevermind, the version without min(N, slice.len()) is wrong because the instruction only looks at the lowest 8 bits. So it effectively does a modulo 256 on the second operand.

So x86_bmi2_bzhi_64(u64::MAX, 525) equals 0x1FFF (13 low bits set), because 525 % 256 = 13.

if ix > 63 {
a
} else {
a & (1u64 << ix) - 1
}
}
35 changes: 35 additions & 0 deletions crates/core_simd/tests/masked_load_store.rs
@@ -0,0 +1,35 @@
#![feature(portable_simd)]
use core_simd::simd::prelude::*;

#[cfg(target_arch = "wasm32")]
use wasm_bindgen_test::*;

#[cfg(target_arch = "wasm32")]
wasm_bindgen_test_configure!(run_in_browser);

#[test]
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
fn masked_load_store() {
let mut arr = [u8::MAX; 7];

u8x4::splat(0).masked_store(&mut arr[5..], Mask::from_array([false, true, false, true]));
// write to index 8 is OOB and dropped

Contributor Author:
I'll make the stack array larger to prove that the store doesn't spill beyond a smaller slice

assert_eq!(arr, [255u8, 255, 255, 255, 255, 255, 0]);

u8x4::from_array([0, 1, 2, 3]).masked_store(&mut arr[1..], Mask::splat(true));
assert_eq!(arr, [255u8, 0, 1, 2, 3, 255, 0]);

// read from index 7 is OOB; that lane falls back to the `or` value
assert_eq!(
u8x4::masked_load_or(&arr[4..], u8x4::splat(42)),
u8x4::from_array([3, 255, 0, 42])
);
assert_eq!(
u8x4::masked_load_select(
&arr[4..],
Mask::from_array([true, false, true, true]),
u8x4::splat(42)
),
u8x4::from_array([3, 42, 0, 42])
);
}