Skip to content

Commit

Permalink
non-temporal stores: use inline assembly
Browse files Browse the repository at this point in the history
  • Loading branch information
RalfJung committed Feb 25, 2024
1 parent 6daeb66 commit f4acd83
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 13 deletions.
18 changes: 15 additions & 3 deletions crates/core_arch/src/x86/avx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1707,7 +1707,11 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
);
}

/// Moves double-precision values from a 256-bit vector of `[4 x double]`
Expand All @@ -1730,7 +1734,11 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
intrinsics::nontemporal_store(mem_addr as *mut __m256d, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
);
}

/// Moves single-precision floating point values from a 256-bit vector
Expand All @@ -1754,7 +1762,11 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
intrinsics::nontemporal_store(mem_addr as *mut __m256, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
);
}

/// Computes the approximate reciprocal of packed single-precision (32-bit)
Expand Down
18 changes: 15 additions & 3 deletions crates/core_arch/src/x86/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28014,7 +28014,11 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
#[cfg_attr(test, assert_instr(vmovntps))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
intrinsics::nontemporal_store(mem_addr as *mut __m512, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(zmm_reg) a,
);
}

/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Expand All @@ -28035,7 +28039,11 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
intrinsics::nontemporal_store(mem_addr as *mut __m512d, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(zmm_reg) a,
);
}

/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Expand All @@ -28056,7 +28064,11 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
intrinsics::nontemporal_store(mem_addr as *mut __m512i, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(zmm_reg) a,
);
}

/// Sets packed 32-bit integers in `dst` with the supplied values.
Expand Down
2 changes: 1 addition & 1 deletion crates/core_arch/src/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#[allow(unused_imports)]
use crate::marker::Sized;
use crate::{intrinsics, mem::transmute};
use crate::mem::transmute;

#[macro_use]
mod macros;
Expand Down
6 changes: 5 additions & 1 deletion crates/core_arch/src/x86/sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2013,7 +2013,11 @@ extern "C" {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
crate::arch::asm!(
"movntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(xmm_reg) a,
);
}

#[cfg(test)]
Expand Down
18 changes: 15 additions & 3 deletions crates/core_arch/src/x86/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1330,7 +1330,11 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"movntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(xmm_reg) a,
);
}

/// Stores a 32-bit integer value in the specified memory location.
Expand All @@ -1352,7 +1356,11 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"movnti [{mem_addr}], {a:e}", // `:e` for 32bit value
mem_addr = in(reg) mem_addr,
a = in(reg) a,
);
}

/// Returns a vector where the low element is extracted from `a` and its upper
Expand Down Expand Up @@ -2547,7 +2555,11 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
intrinsics::nontemporal_store(mem_addr as *mut __m128d, a);
crate::arch::asm!(
"movntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(xmm_reg) a,
);
}

/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
Expand Down
8 changes: 6 additions & 2 deletions crates/core_arch/src/x86_64/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use crate::{
core_arch::x86::*,
intrinsics::{self, simd::*},
intrinsics::simd::*,
};

#[cfg(test)]
Expand Down Expand Up @@ -81,7 +81,11 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"movnti [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(reg) a,
);
}

/// Returns a vector whose lowest element is `a` and all higher elements are
Expand Down

0 comments on commit f4acd83

Please sign in to comment.