From 2dbe8d0b39a1b07e9cf4846d593908bed588ce9f Mon Sep 17 00:00:00 2001
From: gwenn
Date: Tue, 17 Oct 2017 00:14:09 +0200
Subject: [PATCH] Avx (#126)

* avx: _mm256_zextps128_ps256

* avx: _mm256_zextpd128_pd256

* avx: _mm256_set_m128

* avx: _mm256_set_m128d

* avx: _mm256_castpd_ps

* avx: _mm256_castps_pd

* avx: _mm256_castps_si256

* avx: _mm256_castsi256_ps

* avx: _mm256_zextsi128_si256

* avx: _mm256_set_m128i
---
 src/x86/avx.rs | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 178 insertions(+), 1 deletion(-)

diff --git a/src/x86/avx.rs b/src/x86/avx.rs
index 6757e00f1699f..781869cfa7fed 100644
--- a/src/x86/avx.rs
+++ b/src/x86/avx.rs
@@ -7,6 +7,7 @@ use stdsimd_test::assert_instr;
 use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8};
 use v128::{f32x4, f64x2, i32x4, i64x2};
 use v256::*;
+use x86::{__m128i, __m256i};
 
 /// Add packed double-precision (64-bit) floating-point elements
 /// in `a` and `b`.
@@ -1827,6 +1828,34 @@ pub unsafe fn _mm256_set1_epi64x(a: i64) -> i64x4 {
     i64x4::new(a, a, a, a)
 }
 
+/// Casts vector of type __m256d to type __m256.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castpd_ps(a: f64x4) -> f32x8 {
+    mem::transmute(a)
+}
+
+/// Casts vector of type __m256 to type __m256d.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castps_pd(a: f32x8) -> f64x4 {
+    mem::transmute(a)
+}
+
+/// Casts vector of type __m256 to type __m256i.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castps_si256(a: f32x8) -> i64x4 {
+    mem::transmute(a)
+}
+
+/// Casts vector of type __m256i to type __m256.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castsi256_ps(a: i64x4) -> f32x8 {
+    mem::transmute(a)
+}
+
 /// Casts vector of type __m256d to type __m256i.
 /// This intrinsic is only used for compilation and does not generate any
 /// instructions, thus it has zero latency.
@@ -1899,6 +1928,37 @@ pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 {
     simd_shuffle4(a, a, [0, 1, 0, 0])
 }
 
+/// Constructs a 256-bit floating-point vector of [8 x float] from a
+/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+/// the value of the source vector. The upper 128 bits are set to zero.
+#[inline(always)]
+#[target_feature = "+avx,+sse"]
+pub unsafe fn _mm256_zextps128_ps256(a: f32x4) -> f32x8 {
+    use x86::sse::_mm_setzero_ps;
+    simd_shuffle8(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The upper
+/// 128 bits are set to zero.
+#[inline(always)]
+#[target_feature = "+avx,+sse2"]
+pub unsafe fn _mm256_zextsi128_si256(a: i64x2) -> i64x4 {
+    use x86::sse2::_mm_setzero_si128;
+    simd_shuffle4(a, mem::transmute(_mm_setzero_si128()), [0, 1, 2, 3])
+}
+
+/// Constructs a 256-bit floating-point vector of [4 x double] from a
+/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+#[inline(always)]
+#[target_feature = "+avx,+sse2"]
+pub unsafe fn _mm256_zextpd128_pd256(a: f64x2) -> f64x4 {
+    use x86::sse2::_mm_setzero_pd;
+    simd_shuffle4(a, _mm_setzero_pd(), [0, 1, 2, 3])
+}
+
 /// Return vector of type `f32x8` with undefined elements.
 #[inline(always)]
 #[target_feature = "+avx"]
@@ -1920,6 +1980,34 @@ pub unsafe fn _mm256_undefined_si256() -> i64x4 {
     i64x4::splat(mem::uninitialized())
 }
 
+/// Sets packed __m256 returned vector with the supplied values.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set_m128(hi: f32x4, lo: f32x4) -> f32x8 {
+    simd_shuffle8(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
+}
+
+/// Sets packed __m256d returned vector with the supplied values.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set_m128d(hi: f64x2, lo: f64x2) -> f64x4 {
+    let hi: f32x4 = mem::transmute(hi);
+    let lo: f32x4 = mem::transmute(lo);
+    mem::transmute(_mm256_set_m128(hi, lo))
+}
+
+/// Sets packed __m256i returned vector with the supplied values.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
+    let hi: f32x4 = mem::transmute(hi);
+    let lo: f32x4 = mem::transmute(lo);
+    mem::transmute(_mm256_set_m128(hi, lo))
+}
+
 /// LLVM intrinsics used in the above functions
 #[allow(improper_ctypes)]
 extern "C" {
@@ -2070,7 +2158,7 @@ mod tests {
     use stdsimd_test::simd_test;
     use test::black_box; // Used to inhibit constant-folding.
 
-    use v128::{f32x4, f64x2, i32x4, i64x2};
+    use v128::{f32x4, f64x2, i8x16, i32x4, i64x2};
     use v256::*;
     use x86::avx;
 
@@ -3390,6 +3478,38 @@ mod tests {
         assert_eq!(r, i64x4::splat(1));
     }
 
+    #[simd_test = "avx"]
+    unsafe fn _mm256_castpd_ps() {
+        let a = f64x4::new(1., 2., 3., 4.);
+        let r = avx::_mm256_castpd_ps(a);
+        let e = f32x8::new(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_castps_pd() {
+        let a = f32x8::new(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
+        let r = avx::_mm256_castps_pd(a);
+        let e = f64x4::new(1., 2., 3., 4.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_castps_si256() {
+        let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        let r = avx::_mm256_castps_si256(a);
+        let e = i64x4::new(4611686019492741120, 4647714816524288000, 4665729215040061440, 4683743613553737728);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_castsi256_ps() {
+        let a = i64x4::new(4611686019492741120, 4647714816524288000, 4665729215040061440, 4683743613553737728);
+        let r = avx::_mm256_castsi256_ps(a);
+        let e = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx"]
     unsafe fn _mm256_castpd_si256() {
         let a = f64x4::new(1., 2., 3., 4.);
@@ -3424,4 +3544,61 @@ mod tests {
         let r = avx::_mm256_castsi256_si128(a);
         assert_eq!(r, i64x2::new(1, 2));
     }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zextps128_ps256() {
+        let a = f32x4::new(1., 2., 3., 4.);
+        let r = avx::_mm256_zextps128_ps256(a);
+        let e = f32x8::new(1., 2., 3., 4., 0., 0., 0., 0.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zextsi128_si256() {
+        let a = i64x2::new(1, 2);
+        let r = avx::_mm256_zextsi128_si256(a);
+        let e = i64x4::new(1, 2, 0, 0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zextpd128_pd256() {
+        let a = f64x2::new(1., 2.);
+        let r = avx::_mm256_zextpd128_pd256(a);
+        let e = f64x4::new(1., 2., 0., 0.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_set_m128() {
+        let hi = f32x4::new(5., 6., 7., 8.);
+        let lo = f32x4::new(1., 2., 3., 4.);
+        let r = avx::_mm256_set_m128(hi, lo);
+        let e = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_set_m128d() {
+        let hi = f64x2::new(3., 4.);
+        let lo = f64x2::new(1., 2.);
+        let r = avx::_mm256_set_m128d(hi, lo);
+        let e = f64x4::new(1., 2., 3., 4.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_set_m128i() {
+        let hi = i8x16::new(17, 18, 19, 20, 21, 22, 23, 24,
+                            25, 26, 27, 28, 29, 30, 31, 32);
+        let lo = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8,
+                            9, 10, 11, 12, 13, 14, 15, 16);
+        let r = avx::_mm256_set_m128i(hi, lo);
+        let e = i8x32::new(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32);
+        assert_eq!(r, e);
+    }
 }
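
For reference, a minimal usage sketch of the new `_mm256_set_m128` and
`_mm256_zextps128_ps256` intrinsics (not part of the patch). It assumes the
crate-internal `v128`/`v256` vector types and the `x86::avx` module path used
by the tests above. Note the Intel argument order of `_mm256_set_m128(hi, lo)`:
`lo` fills bits 0..127 and `hi` fills bits 128..255, while the `zext` variant
fills the upper half with zeros instead of a second vector.

    use v128::f32x4;
    use v256::f32x8;
    use x86::avx;

    unsafe fn demo() {
        let lo = f32x4::new(1., 2., 3., 4.);
        let hi = f32x4::new(5., 6., 7., 8.);

        // `lo` lands in the low 128 bits, `hi` in the high 128 bits.
        let v = avx::_mm256_set_m128(hi, lo);
        assert_eq!(v, f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.));

        // Zero-extension keeps `lo` and clears lanes 4..8.
        let z = avx::_mm256_zextps128_ps256(lo);
        assert_eq!(z, f32x8::new(1., 2., 3., 4., 0., 0., 0., 0.));
    }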