[libsleef] Add modified Payne Hanek argument reduction #197

Merged (15 commits, Jul 6, 2018)
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
https://github.com/shibatch/sleef/pull/192
- Power VSX target support is added to libsleef.
https://github.com/shibatch/sleef/pull/195
- Payne-Hanek-like argument reduction is added to libsleef.
https://github.com/shibatch/sleef/pull/???
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
5 changes: 4 additions & 1 deletion Jenkinsfile
@@ -4,6 +4,8 @@ pipeline {
stages {
stage('Preamble') {
parallel {
/*
// SVE testing is temporarily disabled due to compiler bug
stage('AArch64 SVE') {
agent { label 'aarch64' }
steps {
@@ -24,7 +26,8 @@
'''
}
}

*/

stage('Intel Compiler') {
agent { label 'icc' }
steps {
13 changes: 13 additions & 0 deletions src/arch/helperadvsimd.h
@@ -72,6 +72,19 @@ static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

// Basic logical operations for mask
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {
13 changes: 13 additions & 0 deletions src/arch/helperavx.h
@@ -290,6 +290,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Review comment (Collaborator): Shouldn't this be int a[VECLENSP]?

vstoreu_v_p_vi(a, vi);
return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
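
For reference, the fix the reviewer is hinting at would size the scratch buffer from the helper's vector-length macro rather than hard-coding 4 (or 8 in the single-precision variant). A minimal sketch, assuming the macros meant are the VECTLENDP/VECTLENSP pair these helper headers already define (the comment's VECLENDP/VECLENSP spelling is read as the same thing); this is an illustration of the suggestion, not the merged code:

// Hedged sketch only: the same gather, with the index buffer sized by the
// helper's double-precision vector-length macro instead of a literal 4.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int a[VECTLENDP];
  vstoreu_v_p_vi(a, vi);
  return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}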

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
@@ -477,6 +483,13 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[8];
Review comment (Collaborator): Same here, shouldn't it be int a[VECLENSP]?

vstoreu_v_p_vi2(a, vi2);
return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
4 changes: 4 additions & 0 deletions src/arch/helperavx2.h
@@ -255,6 +255,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }

//

static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
@@ -359,6 +361,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }

//

#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
4 changes: 4 additions & 0 deletions src/arch/helperavx2_128.h
@@ -228,6 +228,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
@@ -330,6 +332,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
5 changes: 5 additions & 0 deletions src/arch/helperavx512f.h
@@ -290,6 +290,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }

//

static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
@@ -417,6 +419,7 @@ static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
@@ -433,6 +436,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }

//

static INLINE vdouble vposneg_vd_vd(vdouble d) {
14 changes: 10 additions & 4 deletions src/arch/helperneon32.h
@@ -43,11 +43,8 @@ static INLINE int vtestallones_i_vo32(vopmask g) {
return vget_lane_u32(x1, 0);
}

static vfloat vloaduf(float *p) { return vld1q_f32(p); }
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }

static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }

//

@@ -210,6 +207,15 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

12 changes: 12 additions & 0 deletions src/arch/helperpower_128.h
@@ -74,6 +74,18 @@ static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { ptr[0] = v[0]; ptr[1]

static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Review comment (Collaborator): Shouldn't this also be int[VECLENDP]?

vstoreu_v_p_vi(a, vi);
return ((vdouble) { ptr[a[0]], ptr[a[1]] });
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[4];
Review comment (Collaborator): Shouldn't this also be int[VECLENSP]?

vstoreu_v_p_vi2(a, vi2);
return ((vfloat) { ptr[a[0]], ptr[a[1]], ptr[a[2]], ptr[a[3]] });
}

static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
12 changes: 12 additions & 0 deletions src/arch/helperpurec.h
@@ -304,6 +304,12 @@ static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
return vd;
}
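
This purec version doubles as the reference semantics for the new gather: lane i of the result is simply ptr[vi.i[i]]. A hypothetical call site, only to illustrate the calling convention (the table name and values below are invented for this note, not taken from the PR):

// Hypothetical usage sketch of the new gather API.
static const double coeff_table[4] = { 1.0, 0.5, 0.25, 0.125 };  // invented values

static vdouble lookup_coeffs(vint idx) {
  // Assuming every index lane is in [0, 3]; lane i of the result
  // becomes coeff_table[idx.i[i]].
  return vgather_vd_p_vi(coeff_table, idx);
}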

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
@@ -418,6 +424,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
12 changes: 12 additions & 0 deletions src/arch/helpersse2.h
@@ -267,6 +267,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Review comment (Collaborator): int a[VECLENDP];?

vstoreu_v_p_vi(a, vi);
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
@@ -373,6 +379,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
int a[4];
Review comment (Collaborator): int a[VECLENSP]?

vstoreu_v_p_vi2(a, vi);
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is useful when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
37 changes: 37 additions & 0 deletions src/arch/helpersve.h
@@ -607,6 +607,9 @@ static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
static INLINE vint vand_vi_vi_vi(vint x, vint y) {
return svand_s32_x(ptrue, x, y);
}
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) {
return svbic_s32_x(ptrue, y, x);
}
static INLINE vint vxor_vi_vi_vi(vint x, vint y) {
return sveor_s32_x(ptrue, x, y);
}
@@ -657,6 +660,15 @@ static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}

// Gather

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return svldff1_gather_s64offset_f64(ptrue, ptr, svreinterpret_s64_s32(svzip1_s32(vi, svdup_n_s32(0))));
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return svldff1_gather_s32offset_f32(ptrue, ptr, vi2);
}

// Operations for DFT

@@ -713,3 +725,28 @@ static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v);
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }

// These functions are for debugging
static double vcast_d_vd(vdouble v) {
double a[32];
Review comment (Collaborator): double a[svcntd()]. We have done it for the gnu compatibility tests too (https://github.com/shibatch/sleef/blob/master/src/libm-tester/gnuabi_compatibility.c#L50). It requires the code to be built with the C11 standard.

vstoreu_v_p_vd(a, v);
return a[0];
}
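
A minimal sketch of the change the reviewer is proposing for these debug helpers, assuming the ACLE counting intrinsics svcntd()/svcntw() and a toolchain that accepts variable-length arrays (the C11 requirement mentioned above); this only illustrates the suggestion, it is not the merged code:

// Sketch: size the spill buffer from the runtime SVE vector length instead
// of assuming the 2048-bit architectural maximum (32 doubles / 64 floats).
static double vcast_d_vd(vdouble v) {
  double a[svcntd()];   // svcntd() = number of 64-bit lanes in a vector
  vstoreu_v_p_vd(a, v);
  return a[0];
}

static float vcast_f_vf(vfloat v) {
  float a[svcntw()];    // svcntw() = number of 32-bit lanes in a vector
  vstoreu_v_p_vf(a, v);
  return a[0];
}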

static float vcast_f_vf(vfloat v) {
float a[64];
Review comment (Collaborator): int a[svcntw()];

vstoreu_v_p_vf(a, v);
return a[0];
}

static int vcast_i_vi(vint v) {
int a[64];
Review comment (Collaborator): int a[svcntw()];

vstoreu_v_p_vi(a, v);
return a[0];
}

static int vcast_i_vi2(vint2 v) {
int a[64];
Review comment (Collaborator): int a[svcntw()];

vstoreu_v_p_vi2(a, v);
return a[0];
}
12 changes: 12 additions & 0 deletions src/arch/helpervecext.h
@@ -627,6 +627,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) {
return vd;
}

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
return vd;
}

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
@@ -777,6 +783,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
4 changes: 2 additions & 2 deletions src/common/misc.h
@@ -57,14 +57,14 @@
bits. So, the maximum argument that could be correctly reduced
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
double precision calculation, the actual maximum argument that can
be correctly reduced is around 2^50 = 1.1e+15.
be correctly reduced is around 2^47.
*/

#define PI_A 3.1415926218032836914
#define PI_B 3.1786509424591713469e-08
#define PI_C 1.2246467864107188502e-16
#define PI_D 1.2736634327021899816e-24
#define TRIGRANGEMAX 1e+15
#define TRIGRANGEMAX 1e+14
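
The comment above describes how these split constants get used: the quadrant count q is subtracted off one piece of pi at a time, so the rounding error of each partial product stays below the pieces still to be subtracted. A hedged scalar sketch of that Cody-Waite style step, using fma() purely for illustration (the function name is invented and the library's vector code differs in detail):

#include <math.h>

/* Sketch only: q is the nearest integer to d/pi.  Subtracting q*pi in four
   pieces (PI_A..PI_D) keeps cancellation error small for |d| up to roughly
   TRIGRANGEMAX; beyond that a slower reduction, such as the Payne-Hanek
   path this PR adds, is needed. */
static double reduce_by_pi(double d, double q) {
  d = fma(q, -PI_A, d);
  d = fma(q, -PI_B, d);
  d = fma(q, -PI_C, d);
  d = fma(q, -PI_D, d);
  return d;
}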

/*
PI_A2 and PI_B2 are constants that satisfy the following two conditions.