[libsleef] Add modified Payne Hanek argument reduction #197

Merged (15 commits, Jul 6, 2018)
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
https://github.com/shibatch/sleef/pull/192
- Power VSX target support is added to libsleef.
https://github.com/shibatch/sleef/pull/195
- Payne-Hanek-like argument reduction is added to libsleef.
https://github.com/shibatch/sleef/pull/???
## 3.2 - 2018-02-26
### Added
- The whole build system of the project migrated from makefiles to
5 changes: 4 additions & 1 deletion Jenkinsfile
@@ -4,6 +4,8 @@ pipeline {
stages {
stage('Preamble') {
parallel {
/*
// SVE testing is temporarily disabled due to compiler bug
stage('AArch64 SVE') {
agent { label 'aarch64' }
steps {
@@ -24,7 +26,8 @@
'''
}
}

*/

stage('Intel Compiler') {
agent { label 'icc' }
steps {
13 changes: 13 additions & 0 deletions src/arch/helperadvsimd.h
@@ -72,6 +72,19 @@ static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

// Basic logical operations for mask
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {
13 changes: 13 additions & 0 deletions src/arch/helperavx.h
@@ -290,6 +290,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Review comment (Collaborator): Shouldn't this be int a[VECLENSP]?

vstoreu_v_p_vi(a, vi);
return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}
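
For reference, the fix the reviewer is hinting at would size the scratch buffer from the helper's vector-length macro rather than hard-coding 4 (or 8 in the single-precision variant). A minimal sketch, assuming the macros meant are the VECTLENDP/VECTLENSP pair these helper headers already define (the comment's VECLENDP/VECLENSP spelling is read as the same thing); this is an illustration of the suggestion, not the merged code:

// Hedged sketch only: the same gather, with the index buffer sized by the
// helper's double-precision vector-length macro instead of a literal 4.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int a[VECTLENDP];
  vstoreu_v_p_vi(a, vi);
  return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}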

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
@@ -477,6 +483,13 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[8];
Review comment (Collaborator): Same here, shouldn't it be int a[VECLENSP]?

vstoreu_v_p_vi2(a, vi2);
return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
4 changes: 4 additions & 0 deletions src/arch/helperavx2.h
@@ -255,6 +255,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }

//

static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
@@ -359,6 +361,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }

//

#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
4 changes: 4 additions & 0 deletions src/arch/helperavx2_128.h
@@ -228,6 +228,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
@@ -330,6 +332,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
5 changes: 5 additions & 0 deletions src/arch/helperavx512f.h
@@ -290,6 +290,8 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(pt
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }

//

static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
@@ -417,6 +419,7 @@ static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
@@ -433,6 +436,8 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr)
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }

//

static INLINE vdouble vposneg_vd_vd(vdouble d) {
14 changes: 10 additions & 4 deletions src/arch/helperneon32.h
@@ -43,11 +43,8 @@ static INLINE int vtestallones_i_vo32(vopmask g) {
return vget_lane_u32(x1, 0);
}

static vfloat vloaduf(float *p) { return vld1q_f32(p); }
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }

static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }

//

@@ -210,6 +207,15 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return ((vfloat) {
ptr[vgetq_lane_s32(vi2, 0)],
ptr[vgetq_lane_s32(vi2, 1)],
ptr[vgetq_lane_s32(vi2, 2)],
ptr[vgetq_lane_s32(vi2, 3)]
});
}

#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

12 changes: 12 additions & 0 deletions src/arch/helperpower_128.h
@@ -74,6 +74,18 @@ static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { ptr[0] = v[0]; ptr[1]

static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Review comment (Collaborator): Shouldn't this also be int[VECLENDP]?

vstoreu_v_p_vi(a, vi);
return ((vdouble) { ptr[a[0]], ptr[a[1]] });
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
int a[4];
Review comment (Collaborator): Shouldn't this also be int[VECLENSP]?

vstoreu_v_p_vi2(a, vi2);
return ((vfloat) { ptr[a[0]], ptr[a[1]], ptr[a[2]], ptr[a[3]] });
}

static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
12 changes: 12 additions & 0 deletions src/arch/helperpurec.h
@@ -304,6 +304,12 @@ static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
return vd;
}
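
This purec version doubles as the reference semantics for the new gather: lane i of the result is simply ptr[vi.i[i]]. A hypothetical call site, only to illustrate the calling convention (the table name and values below are invented for this note, not taken from the PR):

// Hypothetical usage sketch of the new gather API.
static const double coeff_table[4] = { 1.0, 0.5, 0.25, 0.125 };  // invented values

static vdouble lookup_coeffs(vint idx) {
  // Assuming every index lane is in [0, 3]; lane i of the result
  // becomes coeff_table[idx.i[i]].
  return vgather_vd_p_vi(coeff_table, idx);
}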

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
@@ -418,6 +424,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
12 changes: 12 additions & 0 deletions src/arch/helpersse2.h
@@ -267,6 +267,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr);
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
int a[4];
Review comment (Collaborator): int a[VECLENDP];?

vstoreu_v_p_vi(a, vi);
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) {
@@ -373,6 +379,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
int a[4];
Review comment (Collaborator): int a[VECLENSP]?

vstoreu_v_p_vi2(a, vi);
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is useful when debugging on MSVC.
static INLINE float vcast_f_vf(vfloat v) {
37 changes: 37 additions & 0 deletions src/arch/helpersve.h
@@ -607,6 +607,9 @@ static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
static INLINE vint vand_vi_vi_vi(vint x, vint y) {
return svand_s32_x(ptrue, x, y);
}
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) {
return svbic_s32_x(ptrue, y, x);
}
static INLINE vint vxor_vi_vi_vi(vint x, vint y) {
return sveor_s32_x(ptrue, x, y);
}
@@ -657,6 +660,15 @@ static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}

// Gather

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
return svldff1_gather_s64offset_f64(ptrue, ptr, svreinterpret_s64_s32(svzip1_s32(vi, svdup_n_s32(0))));
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
return svldff1_gather_s32offset_f32(ptrue, ptr, vi2);
}

// Operations for DFT

@@ -713,3 +725,28 @@ static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v);
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }

// These functions are for debugging
static double vcast_d_vd(vdouble v) {
double a[32];
Review comment (Collaborator): double a[svcntd()]. We have done it for the gnu compatibility tests too (https://github.com/shibatch/sleef/blob/master/src/libm-tester/gnuabi_compatibility.c#L50). It requires the code to be built with the C11 standard.

vstoreu_v_p_vd(a, v);
return a[0];
}
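
A minimal sketch of the change the reviewer is proposing for these debug helpers, assuming the ACLE counting intrinsics svcntd()/svcntw() and a toolchain that accepts variable-length arrays (the C11 requirement mentioned above); this only illustrates the suggestion, it is not the merged code:

// Sketch: size the spill buffer from the runtime SVE vector length instead
// of assuming the 2048-bit architectural maximum (32 doubles / 64 floats).
static double vcast_d_vd(vdouble v) {
  double a[svcntd()];   // svcntd() = number of 64-bit lanes in a vector
  vstoreu_v_p_vd(a, v);
  return a[0];
}

static float vcast_f_vf(vfloat v) {
  float a[svcntw()];    // svcntw() = number of 32-bit lanes in a vector
  vstoreu_v_p_vf(a, v);
  return a[0];
}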

static float vcast_f_vf(vfloat v) {
float a[64];
Review comment (Collaborator): int a[svcntw()];

vstoreu_v_p_vf(a, v);
return a[0];
}

static int vcast_i_vi(vint v) {
int a[64];
Review comment (Collaborator): int a[svcntw()];

vstoreu_v_p_vi(a, v);
return a[0];
}

static int vcast_i_vi2(vint2 v) {
int a[64];
Review comment (Collaborator): int a[svcntw()];

vstoreu_v_p_vi2(a, v);
return a[0];
}
12 changes: 12 additions & 0 deletions src/arch/helpervecext.h
@@ -627,6 +627,12 @@ static INLINE vdouble vloadu_vd_p(const double *ptr) {
return vd;
}

static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
vdouble vd;
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
return vd;
}

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
@@ -777,6 +783,12 @@ static INLINE vfloat vloadu_vf_p(const float *ptr) {
return vf;
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
vfloat vf;
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
return vf;
}

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
4 changes: 2 additions & 2 deletions src/common/misc.h
@@ -57,14 +57,14 @@
bits. So, the maximum argument that could be correctly reduced
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
double precision calculation, the actual maximum argument that can
be correctly reduced is around 2^50 = 1.1e+15.
be correctly reduced is around 2^47.
*/

#define PI_A 3.1415926218032836914
#define PI_B 3.1786509424591713469e-08
#define PI_C 1.2246467864107188502e-16
#define PI_D 1.2736634327021899816e-24
#define TRIGRANGEMAX 1e+15
#define TRIGRANGEMAX 1e+14
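
The comment above describes how these split constants get used: the quadrant count q is subtracted off one piece of pi at a time, so the rounding error of each partial product stays below the pieces still to be subtracted. A hedged scalar sketch of that Cody-Waite style step, using fma() purely for illustration (the function name is invented and the library's vector code differs in detail):

#include <math.h>

/* Sketch only: q is the nearest integer to d/pi.  Subtracting q*pi in four
   pieces (PI_A..PI_D) keeps cancellation error small for |d| up to roughly
   TRIGRANGEMAX; beyond that a slower reduction, such as the Payne-Hanek
   path this PR adds, is needed. */
static double reduce_by_pi(double d, double q) {
  d = fma(q, -PI_A, d);
  d = fma(q, -PI_B, d);
  d = fma(q, -PI_C, d);
  d = fma(q, -PI_D, d);
  return d;
}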

/*
PI_A2 and PI_B2 are constants that satisfy the following two conditions.