-
Notifications
You must be signed in to change notification settings - Fork 137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
This patch introduces erf, erfc, lgamma, tgamma and sinpi. #23
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
eb8b397
This patch introduces erf, erfc, lgamma, tgamma and sinpi. sinpi is j…
shibatch 031632f
* For selection from two to four values, it turned out that the gener…
shibatch 00d9052
This patch adds permutation-based value selection function for AVX512…
shibatch 6bb9fcd
Added TODO comments
shibatch File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -130,11 +130,11 @@ static INLINE vfloat vsqrt_vf_vf(vfloat d) { return vsqrtq_f32(d); } | |
|
||
// Multiply accumulate: z = z + x * y | ||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { | ||
return vmlaq_f32(z, x, y); | ||
return vfmaq_f32(z, x, y); | ||
} | ||
// Multiply subtract: z = z - x * y | ||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { | ||
return vmlsq_f32(z, x, y); | ||
return vfmsq_f32(z, x, y); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? |
||
} | ||
|
||
// |x|, -x | ||
|
@@ -290,11 +290,7 @@ static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { | |
|
||
// Multiply accumulate: z = z + x * y | ||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { | ||
return vmlaq_f64(z, x, y); | ||
} | ||
//[z = x * y - z] | ||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { | ||
return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); | ||
return vfmaq_f64(z, x, y); | ||
} | ||
|
||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { | ||
|
@@ -309,6 +305,11 @@ static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z | |
return vfmsq_f64(z, x, y); | ||
} | ||
|
||
//[z = x * y - z] | ||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { | ||
return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z)); | ||
} | ||
|
||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z | ||
return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z)); | ||
} | ||
|
@@ -350,6 +351,44 @@ static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) { | |
return vbslq_f64(vreinterpretq_u64_u32(mask), x, y); | ||
} | ||
|
||
#if 1 | ||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { | ||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); | ||
} | ||
|
||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { | ||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2)); | ||
} | ||
|
||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { | ||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3))); | ||
} | ||
#else | ||
// This implementation is slower on the current CPU models (as of May 2017). | ||
// I (Naoki Shibata) expect that on future CPU models with hardware similar to the Super Shuffle Engine, this implementation will be faster. | ||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) { | ||
uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 }, | ||
(uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 }); | ||
|
||
uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 }; | ||
return (vdouble) vqtbl1q_u8(tab, idx); | ||
} | ||
|
||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here - I think this is not optimal. Isn't the AVX code better? |
||
uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 }, | ||
vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 }, | ||
vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 }, | ||
(uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 }))); | ||
|
||
uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } }; | ||
return (vdouble) vqtbl2q_u8(tab, idx); | ||
} | ||
|
||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { | ||
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2); | ||
} | ||
#endif | ||
|
||
static INLINE vdouble vrint_vd_vd(vdouble d) { | ||
return vrndnq_f64(d); | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the reason behind this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can safely assume that FMA is faster than any other combination of multiplication and addition. I looked at the assembly output from the compiler, and vmlaq is converted into a separate multiplication and addition.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh! Strange. Could you please provide a minimal example that shows what seems to be inconsistent behavior? You might have found a bug in gcc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Multiply-accumulate instructions are all fused in aarch64, so this is not a bug.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This LGTM. I was just asking you to provide an example of code that generated a separate multiplication and addition from the `vmlaq_f32` intrinsic. No worries if you don't have time.