diff --git a/README.md b/README.md index acbfcab3..ee94a88a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ http://www.boost.org/LICENSE_1_0.txt. Contributions to this project are accepted under the same license. -Copyright Naoki Shibata and contributors 2010 - 2017. +Copyright Naoki Shibata and contributors 2010 - 2019. Main Page : http://sleef.org/ GitHub Repo : https://github.com/shibatch/sleef diff --git a/src/libm-tester/hash_cinz.txt b/src/libm-tester/hash_cinz.txt index 9981c707..f399c0aa 100644 --- a/src/libm-tester/hash_cinz.txt +++ b/src/libm-tester/hash_cinz.txt @@ -1,48 +1,48 @@ -sin u35 ec03e7577871ef943c4b1ca6a485d08c -sin u10 0f8444e11d3779dc4c28f3db353607bf -cos u35 b87d0826f08562a17c71125d39fb5919 -cos u10 9f9e4f5f3d2d84deeab3829375054fe8 -tan u35 b04ff8785a7c98694842004ccb3da186 -tan u10 9a4e53fd7e9245c7c1c8cb9cd30b1644 +sin u35 bc50dfbcbd8ef534541d1babe90860c7 +sin u10 9de2d9fcee4d16f87b463b813226b4cf +cos u35 506e34a809b80ad3603ed46ba2a574b0 +cos u10 7365446329254723b64987da76aea812 +tan u35 cfb4aacb5f14e68c3171246838ba8f57 +tan u10 5fd08e0552e3ab853439bf5fd2bd344d sincos u10 7c164edcaa45988f6165b653fc76c495 sincos u35 38fe7e261e184ed8dbf432ce6bedc5c4 sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c log u10 4855b27222d900bea47a27cadba71727 -log u35 9c60b59cc6bba46d016798b97ea939c2 -log10 u10 ee979872a5a5181905eb7d04eefb3870 +log u35 c95484de57c167da3d8d6d1baadf9ffa +log10 u10 36645e8031d873d66fd0ec2c5959f273 log1p u10 1383924fb56cf2e7eda27de21320c591 -exp u10 1a8756694ca98254c4d872646b639721 -exp2 u10 6aeddf0a30622490d3fabbe4851e3e7a +exp u10 23e2f486c7e9d2068f3f6d30a3161ffd +exp2 u10 436146f8d6dcaa4a754837108a9aa3e1 exp10 u10 9d704b310f683872a6446cfc97726a4d -expm1 u10 374817f0728c57b0ec019c3bfb141a79 -pow u10 15d42c41ce130c3a61fff3686632c8ce +expm1 u10 cd3f0b8e86943d52c278394b60e2d22e +pow u10 a0ea63b27d33262346a35c9439741075 cbrt u10 5d8bf28ac74624594fd1be9217817690 cbrt u10 3c896e03746bcf1b3f70182dfec3d93b cbrt u35 73daa306764e208aab1627ac110b10d7 cbrt u35 c29b7bf200215425b4ba948c8cc94c42 hypot u05 cc2f18e409e19a02cadf7b91fd869120 hypot u35 5194e0a554174a6145511ce3df9c1f46 -asin u10 bae9eb3b4f484295f3f8641e61808dc6 -asin u35 9bea0a03680e83cbd198262ecdb37e28 -acos u10 1c435f5e072ad2fe458ed3cb7d61efdb -acos u35 c7f12cdf2aa9b68b0476053d1727b42f -atan u10 1f13ac8525edc54818f136bd9cd8ac46 -atan u10 2b2309e489a92e208e83d1641301e59c -atan u35 c62cfeb4fa325a5b85613723723b1d98 -atan u35 515a88bf70e307daee25f24178e3791a -atan2 u10 b3cbe8bf7c3a75c4eb1a767e2331b5ca -atan2 u35 07060ed63483b89fb3c27840d200d747 -sinh u10 25b8333e58c6a39e5b0f8ba085b218ef -cosh u10 beb5e63186c0d580e11333e206aa31b1 +asin u10 86c061caec3fa2e1bc71bda4dad29f4c +asin u35 31303b88bdc00206265002d6cc5e89e4 +acos u10 0a1a403590f2ac8364f132b334920945 +acos u35 493f960c1cce57931d95a5a22a0587a3 +atan u10 e5c01c5ec2e9a535ac8d3000c346067d +atan u10 584ebe4bf2df7b60210f87f74b7d21d1 +atan u35 9d6d83e066b5a4851d44771418c9948c +atan u35 f32c1aa4caa08c6945afd1125ba8b113 +atan2 u10 dfa671b1bae503fcae52246f65da3324 +atan2 u35 afb07894347062a96dab705b34eb1763 +sinh u10 61d459b1f368087f6f23ebf8e9f0ea01 +cosh u10 f77eb95f79e274c12b4e92dc0389259b tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3 asinh u10 01136e54e2a434839530dda54f33cfdb acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e atanh u10 601a77ba8c1d5175f2808b48a41260c1 lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da -tgamma u10 74ac3717702a9586553f83cd9abfce6e +tgamma u10 6f864c3a1f17fbdf914cac7ffcd82cb7 erf u10 4031f3e285101359aea99feb5e2de3f0 -erfc u15 9b6e3d08a9d409154bcbf0ab8315f0bd +erfc u15 5e116a4316dafa742769f71e18f6f9fe fabs bef2f2ac8a4789357e580b4da4f9b9fe copysign 3219022f267464e3704f90558e8df3bc fmax 4e4f5220ccfef191864c316df0d18fc0 @@ -60,7 +60,7 @@ sinf u35 833d845950b9cbb025629fe4c040f8f6 sinf u10 9c21afa4d7d6af3fc666309c3cd647fe cosf u35 74d7f871a6553cd0019087895e2052ad cosf u10 35349e94c323c1614f22093959288010 -tanf u35 5412973e369bb09dc7d506823b6fb4e9 +tanf u35 bbb7c092d017e96d2454a38a20687735 tanf u10 227423bc04f42d76a8f68082ba696126 sincosf u10 83ecc4e3d5295056e9d8c52bc196b666 sincosf u35 533319caa49a961e4909bd6dcab40721 @@ -87,10 +87,10 @@ acosf u10 15617dd0429b90e59d2923415934c2a6 acosf u35 af0b132d9e263721f9296187dbf9b9bf atanf u10 26b77fb423104b45633cf24500237d6e atanf u10 4313d0bc2708de53f74d804aac6564d4 -atanf u35 12c1b3ad574c3cbf642690ab13bf27d1 -atanf u35 a112a95870d110dddf8a4aa7e877c556 +atanf u35 97a1797897955643c722c7d291987331 +atanf u35 7d3f47169415058e8578f11d899bfd10 atan2f u10 098a33f730fe95ce4774a991db4cee14 -atan2f u35 25a1ce0f275b31ab2d77a5acc9d25ece +atan2f u35 56fc6bd8349979f0d0b1dcdb57f68363 sinhf u10 0780a2f57df3a831718195d1ee5c19ef coshf u10 cfbb6aed408e43a7b7f053474100ff2d tanhf u10 d19f254d41e8726c748df87b95bc9acd diff --git a/src/libm-tester/hash_finz.txt b/src/libm-tester/hash_finz.txt index 9c1be57a..f756f7ae 100644 --- a/src/libm-tester/hash_finz.txt +++ b/src/libm-tester/hash_finz.txt @@ -1,48 +1,48 @@ -sin u35 2e8fd38a29df4880a9aa9e4ddca62a52 -sin u10 b0419d95daaef51278bf8fd5c347e211 -cos u35 ff31d0d21e577876dd019d0ff215e2d1 -cos u10 165b5501b94897c832aee0cca2227aba -tan u35 1e31c80ded9720a3b3fcf68eb2e7a5f1 -tan u10 bd0bec1cd3107d66ce1de78af2929a2d +sin u35 c163e4a7e9ccebb2181dcc8653367d8c +sin u10 a2bda8db1c7d23a9b2a805bf8e0ec95e +cos u35 52f902bd939d751b5b544ac70181fcff +cos u10 c07bdf6738ba6aabed3cb2d25610ad14 +tan u35 4651b9aa79fab36a07e3d4d6ef250d2f +tan u10 c98f29a62067fa63646d9bcc29a310c6 sincos u10 3fe37f4eb805505152f2b14a22a9f94e sincos u35 95a7b7f48c71febf10ec6eff796dd391 sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c log u10 4855b27222d900bea47a27cadba71727 -log u35 a0ba62c41a2c42e04bef9be158c573d1 -log10 u10 ee979872a5a5181905eb7d04eefb3870 +log u35 015f8ae899c9b921d48919dd12ef19a9 +log10 u10 36645e8031d873d66fd0ec2c5959f273 log1p u10 1383924fb56cf2e7eda27de21320c591 -exp u10 68339ce61fd14ae329e4c5f37a7f4c3c -exp2 u10 bccd14fa3878c93c7bdc8feca954344b +exp u10 084e5be89c2ad03e356078ea4f287bab +exp2 u10 6e36db9ae2cf9eca82e3d9157c622351 exp10 u10 0cc08bc6a3d08d6e61450b5370c6161e -expm1 u10 374817f0728c57b0ec019c3bfb141a79 -pow u10 a2ae975d6aa593c5f47d38cd7514ddb3 +expm1 u10 cd3f0b8e86943d52c278394b60e2d22e +pow u10 7e19796027d7c1d1999be948f90e6181 cbrt u10 5d8bf28ac74624594fd1be9217817690 cbrt u10 3c896e03746bcf1b3f70182dfec3d93b cbrt u35 fc7ee3e3e6c54365d708b752c242a947 cbrt u35 2408714a56d74f8c82389ca6772cdbc1 hypot u05 cc2f18e409e19a02cadf7b91fd869120 hypot u35 be7bbd41dffd746b70261ee773cbd4b2 -asin u10 7949c3ac5b9ca0a548a411bffe321632 -asin u35 8ae753ef1085067fe59d5b3d9037f185 -acos u10 1db2f6cdcebd9c46650c2e3c16ff6aba -acos u35 ed73f226b2bd5eed79cbf54404825e28 -atan u10 5284563c51512b9785c89ec0620f518e -atan u10 9cf30e08e5d980ae3bcbcc4be77ed2f6 -atan u35 585d2ff936bed7863b566003553c26fd -atan u35 c20c4d491e4b4265f7ccf4a0819cc82a -atan2 u10 45678ae1ca8075b8a7f40fb785c10b97 -atan2 u35 936110373f2e1c44ebb479fdcc2d549f -sinh u10 775dc1a219c3703cd86b1332e66029e4 -cosh u10 beb5e63186c0d580e11333e206aa31b1 +asin u10 8a21b7c28cdaffc9d3e53f415367932e +asin u35 9c9e8107782898e9faed6924ad1b3cb1 +acos u10 28261e4eb8331865660c814676d5c6bc +acos u35 310911130bfc45b10dabe3a072939331 +atan u10 bfaf9c60689afb923c5c9637b3a711ca +atan u10 4e6b0112b6bb5a5fe936b9a01b8a7afc +atan u35 6161b6189609f105b017d8768d0a41f1 +atan u35 6face71d8d93c69448d49ed6140e361d +atan2 u10 7ca84089be978aca61b04c5be6aaf353 +atan2 u35 6a3e764125aab2a0a13e7a0d9ec02f7f +sinh u10 61d459b1f368087f6f23ebf8e9f0ea01 +cosh u10 f77eb95f79e274c12b4e92dc0389259b tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3 asinh u10 01136e54e2a434839530dda54f33cfdb acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e atanh u10 601a77ba8c1d5175f2808b48a41260c1 lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da -tgamma u10 cc1e96362ad9626d9c0c397426abfbac +tgamma u10 cb9a93844ad1713d2ab92ff5b6398150 erf u10 3f3c9bf4f8e5768c09c472cee4475e43 -erfc u15 3465bc3addcaaa18368e654cb92320ef +erfc u15 3e247a54183eeddedc33e99c50118995 fabs bef2f2ac8a4789357e580b4da4f9b9fe copysign 3219022f267464e3704f90558e8df3bc fmax 4e4f5220ccfef191864c316df0d18fc0 @@ -60,7 +60,7 @@ sinf u35 f8f804eae1d9443103e81fec96293477 sinf u10 3f12a7381f1cbb1830d92b4ec72d21fe cosf u35 f2f3d1c9f090cde9c02439608dc7066e cosf u10 dc35f27fae65f63f0aa6ad241f8b387b -tanf u35 a1109709284ec790b31ef533c63212d6 +tanf u35 68d42ad1fb412e6b8be3853461e61213 tanf u10 97df301d4f59e67d5318b5356b703f06 sincosf u10 a97124d810ec461c135dc4fb0c059b6f sincosf u35 0cc521e52ae1227d311012c2919c1ff2 @@ -87,10 +87,10 @@ acosf u10 5180fde4b02a0ca4cd75f0a786a1bfeb acosf u35 72b0e2f9791f90f1c43570b9e9ba893f atanf u10 fa672e387a204055f735b7af98dd8a35 atanf u10 d017670c13bc221b68bc9ee5f41c4b5e -atanf u35 052c537c09b297322e825ac0b2f0339e -atanf u35 9b25b4a6d96ee5fed9ca58cc7f21cb71 +atanf u35 f592e46eaa5d29583f86d3e336f20b6b +atanf u35 e7087fe40de46921826b373d10c40954 atan2f u10 275b2fa8ee554c45551bb142db9f8197 -atan2f u35 1e3ae1d22d0ff3054b29f53760a1cade +atan2f u35 44b187851195d24bab2561eb8f4ff5d0 sinhf u10 45bc228a14c3e39eeb35e9764394a23e coshf u10 838d441e85d415ef4fb1e5c5ea966a71 tanhf u10 d19f254d41e8726c748df87b95bc9acd diff --git a/src/libm/estrin.h b/src/libm/estrin.h new file mode 100644 index 00000000..46417251 --- /dev/null +++ b/src/libm/estrin.h @@ -0,0 +1,36 @@ +// Copyright Naoki Shibata 2010 - 2019. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// These are macros for evaluating polynomials using Estrin's method + +#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0)) +#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0))) +#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0))) +#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0)) +#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0)) +#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0)) +#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0)) +#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)) +#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\ + MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)) diff --git a/src/libm/sleefdp.c b/src/libm/sleefdp.c index 6dbf5b56..c713a554 100644 --- a/src/libm/sleefdp.c +++ b/src/libm/sleefdp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata 2010 - 2018. +// Copyright Naoki Shibata 2010 - 2019. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -30,6 +30,10 @@ extern const double rempitabdp[]; #pragma fp_contract (off) #endif +#define MLA mla +#define C2V(x) (x) +#include "estrin.h" + static INLINE CONST int64_t doubleToRawLongBits(double d) { union { double f; @@ -310,7 +314,7 @@ static INLINE CONST Sleef_double2 ddadd_d2_d2_d2(Sleef_double2 x, Sleef_double2 Sleef_double2 r; #ifndef NDEBUG - if (!(checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) || (fabsk(x.x+y.x) <= fabsk(x.x) && fabsk(x.x+y.x) <= fabsk(y.x)))) { + if (!(x.x == 0 || checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) || (fabsk(x.x+y.x) <= fabsk(x.x) && fabsk(x.x+y.x) <= fabsk(y.x)))) { fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]\n", x.x, y.x); fflush(stderr); } @@ -471,25 +475,27 @@ static INLINE CONST double atan2k(double y, double x) { s = y / x; t = s * s; - u = -1.88796008463073496563746e-05; - u = mla(u, t, 0.000209850076645816976906797); - u = mla(u, t, -0.00110611831486672482563471); - u = mla(u, t, 0.00370026744188713119232403); - u = mla(u, t, -0.00889896195887655491740809); - u = mla(u, t, 0.016599329773529201970117); - u = mla(u, t, -0.0254517624932312641616861); - u = mla(u, t, 0.0337852580001353069993897); - u = mla(u, t, -0.0407629191276836500001934); - u = mla(u, t, 0.0466667150077840625632675); - u = mla(u, t, -0.0523674852303482457616113); - u = mla(u, t, 0.0587666392926673580854313); - u = mla(u, t, -0.0666573579361080525984562); - u = mla(u, t, 0.0769219538311769618355029); - u = mla(u, t, -0.090908995008245008229153); - u = mla(u, t, 0.111111105648261418443745); - u = mla(u, t, -0.14285714266771329383765); - u = mla(u, t, 0.199999999996591265594148); - u = mla(u, t, -0.333333333333311110369124); + double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8; + u = POLY19(t, t2, t4, t8, t16, + -1.88796008463073496563746e-05, + 0.000209850076645816976906797, + -0.00110611831486672482563471, + 0.00370026744188713119232403, + -0.00889896195887655491740809, + 0.016599329773529201970117, + -0.0254517624932312641616861, + 0.0337852580001353069993897, + -0.0407629191276836500001934, + 0.0466667150077840625632675, + -0.0523674852303482457616113, + 0.0587666392926673580854313, + -0.0666573579361080525984562, + 0.0769219538311769618355029, + -0.090908995008245008229153, + 0.111111105648261418443745, + -0.14285714266771329383765, + 0.199999999996591265594148, + -0.333333333333311110369124); t = u * t * s + s; t = q * (M_PI/2) + t; @@ -512,18 +518,21 @@ EXPORT CONST double xasin(double d) { int o = fabsk(d) < 0.5; double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), x = o ? fabsk(d) : SQRT(x2), u; - u = +0.3161587650653934628e-1; - u = mla(u, x2, -0.1581918243329996643e-1); - u = mla(u, x2, +0.1929045477267910674e-1); - u = mla(u, x2, +0.6606077476277170610e-2); - u = mla(u, x2, +0.1215360525577377331e-1); - u = mla(u, x2, +0.1388715184501609218e-1); - u = mla(u, x2, +0.1735956991223614604e-1); - u = mla(u, x2, +0.2237176181932048341e-1); - u = mla(u, x2, +0.3038195928038132237e-1); - u = mla(u, x2, +0.4464285681377102438e-1); - u = mla(u, x2, +0.7500000000378581611e-1); - u = mla(u, x2, +0.1666666666666497543e+0); + double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); + u = mla(u, x * x2, x); double r = o ? u : (M_PI/2 - 2*u); @@ -538,18 +547,20 @@ EXPORT CONST double xacos(double d) { double x = o ? fabsk(d) : SQRT(x2); x = fabsk(d) == 1.0 ? 0 : x; - u = +0.3161587650653934628e-1; - u = mla(u, x2, -0.1581918243329996643e-1); - u = mla(u, x2, +0.1929045477267910674e-1); - u = mla(u, x2, +0.6606077476277170610e-2); - u = mla(u, x2, +0.1215360525577377331e-1); - u = mla(u, x2, +0.1388715184501609218e-1); - u = mla(u, x2, +0.1735956991223614604e-1); - u = mla(u, x2, +0.2237176181932048341e-1); - u = mla(u, x2, +0.3038195928038132237e-1); - u = mla(u, x2, +0.4464285681377102438e-1); - u = mla(u, x2, +0.7500000000378581611e-1); - u = mla(u, x2, +0.1666666666666497543e+0); + double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); u *= x * x2; @@ -570,25 +581,27 @@ EXPORT CONST double xatan(double s) { t = s * s; - u = -1.88796008463073496563746e-05; - u = mla(u, t, 0.000209850076645816976906797); - u = mla(u, t, -0.00110611831486672482563471); - u = mla(u, t, 0.00370026744188713119232403); - u = mla(u, t, -0.00889896195887655491740809); - u = mla(u, t, 0.016599329773529201970117); - u = mla(u, t, -0.0254517624932312641616861); - u = mla(u, t, 0.0337852580001353069993897); - u = mla(u, t, -0.0407629191276836500001934); - u = mla(u, t, 0.0466667150077840625632675); - u = mla(u, t, -0.0523674852303482457616113); - u = mla(u, t, 0.0587666392926673580854313); - u = mla(u, t, -0.0666573579361080525984562); - u = mla(u, t, 0.0769219538311769618355029); - u = mla(u, t, -0.090908995008245008229153); - u = mla(u, t, 0.111111105648261418443745); - u = mla(u, t, -0.14285714266771329383765); - u = mla(u, t, 0.199999999996591265594148); - u = mla(u, t, -0.333333333333311110369124); + double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8; + u = POLY19(t, t2, t4, t8, t16, + -1.88796008463073496563746e-05, + 0.000209850076645816976906797, + -0.00110611831486672482563471, + 0.00370026744188713119232403, + -0.00889896195887655491740809, + 0.016599329773529201970117, + -0.0254517624932312641616861, + 0.0337852580001353069993897, + -0.0407629191276836500001934, + 0.0466667150077840625632675, + -0.0523674852303482457616113, + 0.0587666392926673580854313, + -0.0666573579361080525984562, + 0.0769219538311769618355029, + -0.090908995008245008229153, + 0.111111105648261418443745, + -0.14285714266771329383765, + 0.199999999996591265594148, + -0.333333333333311110369124); t = s + s * (t * u); @@ -610,29 +623,31 @@ static Sleef_double2 atan2k_u1(Sleef_double2 y, Sleef_double2 x) { t = ddsqu_d2_d2(s); t = ddnormalize_d2_d2(t); - u = 1.06298484191448746607415e-05; - u = mla(u, t.x, -0.000125620649967286867384336); - u = mla(u, t.x, 0.00070557664296393412389774); - u = mla(u, t.x, -0.00251865614498713360352999); - u = mla(u, t.x, 0.00646262899036991172313504); - u = mla(u, t.x, -0.0128281333663399031014274); - u = mla(u, t.x, 0.0208024799924145797902497); - u = mla(u, t.x, -0.0289002344784740315686289); - u = mla(u, t.x, 0.0359785005035104590853656); - u = mla(u, t.x, -0.041848579703592507506027); - u = mla(u, t.x, 0.0470843011653283988193763); - u = mla(u, t.x, -0.0524914210588448421068719); - u = mla(u, t.x, 0.0587946590969581003860434); - u = mla(u, t.x, -0.0666620884778795497194182); - u = mla(u, t.x, 0.0769225330296203768654095); - u = mla(u, t.x, -0.0909090442773387574781907); - u = mla(u, t.x, 0.111111108376896236538123); + double t2 = t.x * t.x, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8; + u = POLY17(t.x, t2, t4, t8, t16, + 1.06298484191448746607415e-05, + -0.000125620649967286867384336, + 0.00070557664296393412389774, + -0.00251865614498713360352999, + 0.00646262899036991172313504, + -0.0128281333663399031014274, + 0.0208024799924145797902497, + -0.0289002344784740315686289, + 0.0359785005035104590853656, + -0.041848579703592507506027, + 0.0470843011653283988193763, + -0.0524914210588448421068719, + 0.0587946590969581003860434, + -0.0666620884778795497194182, + 0.0769225330296203768654095, + -0.0909090442773387574781907, + 0.111111108376896236538123); u = mla(u, t.x, -0.142857142756268568062339); u = mla(u, t.x, 0.199999999997977351284817); u = mla(u, t.x, -0.333333333333317605173818); - t = ddmul_d2_d2_d(t, u); - t = ddmul_d2_d2_d2(s, ddadd_d2_d_d2(1, t)); + t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u)); + if (fabsk(s.x) < 1e-200) t = s; t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t); @@ -658,18 +673,21 @@ EXPORT CONST double xasin_u1(double d) { Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2); x = fabsk(d) == 1.0 ? dd(0, 0) : x; - u = +0.3161587650653934628e-1; - u = mla(u, x2, -0.1581918243329996643e-1); - u = mla(u, x2, +0.1929045477267910674e-1); - u = mla(u, x2, +0.6606077476277170610e-2); - u = mla(u, x2, +0.1215360525577377331e-1); - u = mla(u, x2, +0.1388715184501609218e-1); - u = mla(u, x2, +0.1735956991223614604e-1); - u = mla(u, x2, +0.2237176181932048341e-1); - u = mla(u, x2, +0.3038195928038132237e-1); - u = mla(u, x2, +0.4464285681377102438e-1); - u = mla(u, x2, +0.7500000000378581611e-1); - u = mla(u, x2, +0.1666666666666497543e+0); + double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); + u *= x2 * x.x; Sleef_double2 y = ddadd_d2_d2_d(ddsub_d2_d2_d2(dd(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), -u); @@ -685,18 +703,20 @@ EXPORT CONST double xacos_u1(double d) { Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2), w; x = fabsk(d) == 1.0 ? dd(0, 0) : x; - u = +0.3161587650653934628e-1; - u = mla(u, x2, -0.1581918243329996643e-1); - u = mla(u, x2, +0.1929045477267910674e-1); - u = mla(u, x2, +0.6606077476277170610e-2); - u = mla(u, x2, +0.1215360525577377331e-1); - u = mla(u, x2, +0.1388715184501609218e-1); - u = mla(u, x2, +0.1735956991223614604e-1); - u = mla(u, x2, +0.2237176181932048341e-1); - u = mla(u, x2, +0.3038195928038132237e-1); - u = mla(u, x2, +0.4464285681377102438e-1); - u = mla(u, x2, +0.7500000000378581611e-1); - u = mla(u, x2, +0.1666666666666497543e+0); + double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); u *= x.x * x2; @@ -802,14 +822,16 @@ EXPORT CONST double xsin(double d) { if ((ql & 1) != 0) d = -d; - u = -7.97255955009037868891952e-18; - u = mla(u, s, 2.81009972710863200091251e-15); - u = mla(u, s, -7.64712219118158833288484e-13); - u = mla(u, s, 1.60590430605664501629054e-10); - u = mla(u, s, -2.50521083763502045810755e-08); - u = mla(u, s, 2.75573192239198747630416e-06); - u = mla(u, s, -0.000198412698412696162806809); - u = mla(u, s, 0.00833333333333332974823815); + double s2 = s * s, s4 = s2 * s2; + u = POLY8(s, s2, s4, + -7.97255955009037868891952e-18, + 2.81009972710863200091251e-15, + -7.64712219118158833288484e-13, + 1.60590430605664501629054e-10, + -2.50521083763502045810755e-08, + 2.75573192239198747630416e-06, + -0.000198412698412696162806809, + 0.00833333333333332974823815); u = mla(u, s, -0.166666666666666657414808); u = mla(s, u * d, d); @@ -853,16 +875,17 @@ EXPORT CONST double xsin_u1(double d) { t = s; s = ddsqu_d2_d2(s); - u = 2.72052416138529567917983e-15; - u = mla(u, s.x, -7.6429259411395447190023e-13); - u = mla(u, s.x, 1.60589370117277896211623e-10); - u = mla(u, s.x, -2.5052106814843123359368e-08); - u = mla(u, s.x, 2.75573192104428224777379e-06); - u = mla(u, s.x, -0.000198412698412046454654947); - u = mla(u, s.x, 0.00833333333333318056201922); + double s2 = s.x * s.x, s4 = s2 * s2; + u = POLY7(s.x, s2, s4, + 2.72052416138529567917983e-15, + -7.6429259411395447190023e-13, + 1.60589370117277896211623e-10, + -2.5052106814843123359368e-08, + 2.75573192104428224777379e-06, + -0.000198412698412046454654947, + 0.00833333333333318056201922); x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); - u = ddmul_d_d2_d2(t, x); if ((ql & 1) != 0) u = -u; @@ -906,14 +929,16 @@ EXPORT CONST double xcos(double d) { if ((ql & 2) == 0) d = -d; - u = -7.97255955009037868891952e-18; - u = mla(u, s, 2.81009972710863200091251e-15); - u = mla(u, s, -7.64712219118158833288484e-13); - u = mla(u, s, 1.60590430605664501629054e-10); - u = mla(u, s, -2.50521083763502045810755e-08); - u = mla(u, s, 2.75573192239198747630416e-06); - u = mla(u, s, -0.000198412698412696162806809); - u = mla(u, s, 0.00833333333333332974823815); + double s2 = s * s, s4 = s2 * s2; + u = POLY8(s, s2, s4, + -7.97255955009037868891952e-18, + 2.81009972710863200091251e-15, + -7.64712219118158833288484e-13, + 1.60590430605664501629054e-10, + -2.50521083763502045810755e-08, + 2.75573192239198747630416e-06, + -0.000198412698412696162806809, + 0.00833333333333332974823815); u = mla(u, s, -0.166666666666666657414808); u = mla(s, u * d, d); @@ -958,16 +983,17 @@ EXPORT CONST double xcos_u1(double d) { t = s; s = ddsqu_d2_d2(s); - u = 2.72052416138529567917983e-15; - u = mla(u, s.x, -7.6429259411395447190023e-13); - u = mla(u, s.x, 1.60589370117277896211623e-10); - u = mla(u, s.x, -2.5052106814843123359368e-08); - u = mla(u, s.x, 2.75573192104428224777379e-06); - u = mla(u, s.x, -0.000198412698412046454654947); - u = mla(u, s.x, 0.00833333333333318056201922); + double s2 = s.x * s.x, s4 = s2 * s2; + u = POLY7(s.x, s2, s4, + 2.72052416138529567917983e-15, + -7.6429259411395447190023e-13, + 1.60589370117277896211623e-10, + -2.5052106814843123359368e-08, + 2.75573192104428224777379e-06, + -0.000198412698412046454654947, + 0.00833333333333318056201922); x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); - u = ddmul_d_d2_d2(t, x); if ((((int)ql) & 2) == 0) u = -u; @@ -1296,7 +1322,7 @@ EXPORT CONST double xcospi_u05(double d) { } EXPORT CONST double xtan(double d) { - double u, s, x; + double u, s, x, y; int ql; if (fabsk(d) < TRIGRANGEMAX2) { @@ -1321,37 +1347,36 @@ EXPORT CONST double xtan(double d) { if (xisinf(d) || xisnan(d)) x = SLEEF_NAN; } + x *= 0.5; s = x * x; - if ((ql & 1) != 0) x = -x; - - u = 9.99583485362149960784268e-06; - u = mla(u, s, -4.31184585467324750724175e-05); - u = mla(u, s, 0.000103573238391744000389851); - u = mla(u, s, -0.000137892809714281708733524); - u = mla(u, s, 0.000157624358465342784274554); - u = mla(u, s, -6.07500301486087879295969e-05); - u = mla(u, s, 0.000148898734751616411290179); - u = mla(u, s, 0.000219040550724571513561967); - u = mla(u, s, 0.000595799595197098359744547); - u = mla(u, s, 0.00145461240472358871965441); - u = mla(u, s, 0.0035923150771440177410343); - u = mla(u, s, 0.00886321546662684547901456); - u = mla(u, s, 0.0218694899718446938985394); - u = mla(u, s, 0.0539682539049961967903002); - u = mla(u, s, 0.133333333334818976423364); - u = mla(u, s, 0.333333333333320047664472); - + double s2 = s * s, s4 = s2 * s2; + u = POLY8(s, s2, s4, + +0.3245098826639276316e-3, + +0.5619219738114323735e-3, + +0.1460781502402784494e-2, + +0.3591611540792499519e-2, + +0.8863268409563113126e-2, + +0.2186948728185535498e-1, + +0.5396825399517272970e-1, + +0.1333333333330500581e+0); + + u = mla(u, s, +0.3333333333333343695e+0); u = mla(s, u * x, x); - if ((ql & 1) != 0) u = 1.0 / u; + y = mla(u, u, -1); + x = -2 * u; + + if ((ql & 1) != 0) { double t = x; x = y; y = -t; } + + u = x / y; return u; } EXPORT CONST double xtan_u1(double d) { double u; - Sleef_double2 s, t, x; + Sleef_double2 s, t, x, y; int ql; if (fabsk(d) < TRIGRANGEMAX2) { @@ -1377,30 +1402,29 @@ EXPORT CONST double xtan_u1(double d) { if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN; } - if ((ql & 1) != 0) s = ddneg_d2_d2(s); + t = ddscale_d2_d2_d(s, 0.5); + s = ddsqu_d2_d2(t); - t = s; - s = ddsqu_d2_d2(s); + double s2 = s.x * s.x, s4 = s2 * s2; + u = POLY8(s.x, s2, s4, + +0.3245098826639276316e-3, + +0.5619219738114323735e-3, + +0.1460781502402784494e-2, + +0.3591611540792499519e-2, + +0.8863268409563113126e-2, + +0.2186948728185535498e-1, + +0.5396825399517272970e-1, + +0.1333333333330500581e+0); - u = 1.01419718511083373224408e-05; - u = mla(u, s.x, -2.59519791585924697698614e-05); - u = mla(u, s.x, 5.23388081915899855325186e-05); - u = mla(u, s.x, -3.05033014433946488225616e-05); - u = mla(u, s.x, 7.14707504084242744267497e-05); - u = mla(u, s.x, 8.09674518280159187045078e-05); - u = mla(u, s.x, 0.000244884931879331847054404); - u = mla(u, s.x, 0.000588505168743587154904506); - u = mla(u, s.x, 0.00145612788922812427978848); - u = mla(u, s.x, 0.00359208743836906619142924); - u = mla(u, s.x, 0.00886323944362401618113356); - u = mla(u, s.x, 0.0218694882853846389592078); - u = mla(u, s.x, 0.0539682539781298417636002); - u = mla(u, s.x, 0.133333333333125941821962); - - x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(0.333333333333334980164153, u * s.x), s)); - x = ddmul_d2_d2_d2(t, x); - - if ((ql & 1) != 0) x = ddrec_d2_d2(x); + u = mla(u, s.x, +0.3333333333333343695e+0); + x = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u)); + + y = ddadd_d2_d_d2(-1, ddsqu_d2_d2(x)); + x = ddscale_d2_d2_d(x, -2); + + if ((ql & 1) != 0) { t = x; x = y; y = ddneg_d2_d2(t); } + + x = dddiv_d2_d2_d2(x, y); u = x.x + x.y; @@ -1424,16 +1448,18 @@ EXPORT CONST double xlog(double d) { x = (m-1) / (m+1); x2 = x * x; - t = 0.153487338491425068243146; - t = mla(t, x2, 0.152519917006351951593857); - t = mla(t, x2, 0.181863266251982985677316); - t = mla(t, x2, 0.222221366518767365905163); - t = mla(t, x2, 0.285714294746548025383248); - t = mla(t, x2, 0.399999999950799600689777); - t = mla(t, x2, 0.6666666666667778740063); - t = mla(t, x2, 2); - - x = x * t + 0.693147180559945286226764 * e; + double x4 = x2 * x2, x8 = x4 * x4; + + t = POLY7(x2, x4, x8, + 0.153487338491425068243146, + 0.152519917006351951593857, + 0.181863266251982985677316, + 0.222221366518767365905163, + 0.285714294746548025383248, + 0.399999999950799600689777, + 0.6666666666667778740063); + + x = x * 2 + 0.693147180559945286226764 * e + x * x2 * t; if (xisinf(d)) x = SLEEF_INFINITY; if (d < 0 || xisnan(d)) x = SLEEF_NAN; @@ -1448,18 +1474,20 @@ EXPORT CONST double xexp(double d) { s = mla(q, -L2U, d); s = mla(q, -L2L, s); - - u = 2.08860621107283687536341e-09; - u = mla(u, s, 2.51112930892876518610661e-08); - u = mla(u, s, 2.75573911234900471893338e-07); - u = mla(u, s, 2.75572362911928827629423e-06); - u = mla(u, s, 2.4801587159235472998791e-05); - u = mla(u, s, 0.000198412698960509205564975); - u = mla(u, s, 0.00138888888889774492207962); - u = mla(u, s, 0.00833333333331652721664984); - u = mla(u, s, 0.0416666666666665047591422); - u = mla(u, s, 0.166666666666666851703837); - u = mla(u, s, 0.5); + + double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4; + u = POLY11(s, s2, s4, s8, + 2.08860621107283687536341e-09, + 2.51112930892876518610661e-08, + 2.75573911234900471893338e-07, + 2.75572362911928827629423e-06, + 2.4801587159235472998791e-05, + 0.000198412698960509205564975, + 0.00138888888889774492207962, + 0.00833333333331652721664984, + 0.0416666666666665047591422, + 0.166666666666666851703837, + 0.5); u = s * s * u + s + 1; u = ldexp2k(u, q); @@ -1477,18 +1505,20 @@ static INLINE CONST double expm1k(double d) { s = mla(q, -L2U, d); s = mla(q, -L2L, s); - u = 2.08860621107283687536341e-09; - u = mla(u, s, 2.51112930892876518610661e-08); - u = mla(u, s, 2.75573911234900471893338e-07); - u = mla(u, s, 2.75572362911928827629423e-06); - u = mla(u, s, 2.4801587159235472998791e-05); - u = mla(u, s, 0.000198412698960509205564975); - u = mla(u, s, 0.00138888888889774492207962); - u = mla(u, s, 0.00833333333331652721664984); - u = mla(u, s, 0.0416666666666665047591422); - u = mla(u, s, 0.166666666666666851703837); - u = mla(u, s, 0.5); - u = s * s * u + s; + double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4; + u = POLY10(s, s2, s4, s8, + 2.08860621107283687536341e-09, + 2.51112930892876518610661e-08, + 2.75573911234900471893338e-07, + 2.75572362911928827629423e-06, + 2.4801587159235472998791e-05, + 0.000198412698960509205564975, + 0.00138888888889774492207962, + 0.00833333333331652721664984, + 0.0416666666666665047591422, + 0.166666666666666851703837); + + u = mla(s2, 0.5, s2 * s * u) + s; if (q != 0) u = ldexp2k(u + 1, q) - 1; @@ -1511,21 +1541,26 @@ static INLINE CONST Sleef_double2 logk(double d) { x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); x2 = ddsqu_d2_d2(x); - t = 0.116255524079935043668677; - t = mla(t, x2.x, 0.103239680901072952701192); - t = mla(t, x2.x, 0.117754809412463995466069); - t = mla(t, x2.x, 0.13332981086846273921509); - t = mla(t, x2.x, 0.153846227114512262845736); - t = mla(t, x2.x, 0.181818180850050775676507); - t = mla(t, x2.x, 0.222222222230083560345903); - t = mla(t, x2.x, 0.285714285714249172087875); - t = mla(t, x2.x, 0.400000000000000077715612); - Sleef_double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17); + double x4 = x2.x * x2.x, x8 = x4 * x4, x16 = x8 * x8; + t = POLY9(x2.x, x4, x8, x16, + 0.116255524079935043668677, + 0.103239680901072952701192, + 0.117754809412463995466069, + 0.13332981086846273921509, + 0.153846227114512262845736, + 0.181818180850050775676507, + 0.222222222230083560345903, + 0.285714285714249172087875, + 0.400000000000000077715612); + Sleef_double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17); s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e); s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2)); - s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(ddmul_d2_d2_d2(x2, x), - ddadd2_d2_d2_d2(ddmul_d2_d2_d(x2, t), c))); + x = ddmul_d2_d2_d2(x2, x); + s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, c)); + x = ddmul_d2_d2_d2(x2, x); + s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(x, t)); + return s; } @@ -1545,13 +1580,15 @@ EXPORT CONST double xlog_u1(double d) { x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); x2 = x.x * x.x; - t = 0.1532076988502701353e+0; - t = mla(t, x2, 0.1525629051003428716e+0); - t = mla(t, x2, 0.1818605932937785996e+0); - t = mla(t, x2, 0.2222214519839380009e+0); - t = mla(t, x2, 0.2857142932794299317e+0); - t = mla(t, x2, 0.3999999999635251990e+0); - t = mla(t, x2, 0.6666666666667333541e+0); + double x4 = x2 * x2, x8 = x4 * x4; + t = POLY7(x2, x4, x8, + 0.1532076988502701353e+0, + 0.1525629051003428716e+0, + 0.1818605932937785996e+0, + 0.2222214519839380009e+0, + 0.2857142932794299317e+0, + 0.3999999999635251990e+0, + 0.6666666666667333541e+0); s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e); s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2)); @@ -1576,20 +1613,21 @@ static INLINE CONST double expk(Sleef_double2 d) { s = ddnormalize_d2_d2(s); - u = 2.51069683420950419527139e-08; - u = mla(u, s.x, 2.76286166770270649116855e-07); - u = mla(u, s.x, 2.75572496725023574143864e-06); - u = mla(u, s.x, 2.48014973989819794114153e-05); - u = mla(u, s.x, 0.000198412698809069797676111); - u = mla(u, s.x, 0.0013888888939977128960529); - u = mla(u, s.x, 0.00833333333332371417601081); - u = mla(u, s.x, 0.0416666666665409524128449); - u = mla(u, s.x, 0.166666666666666740681535); - u = mla(u, s.x, 0.500000000000000999200722); - - t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); - - t = ddadd_d2_d_d2(1, t); + double s2 = s.x * s.x, s4 = s2 * s2, s8 = s4 * s4; + u = POLY10(s.x, s2, s4, s8, + 2.51069683420950419527139e-08, + 2.76286166770270649116855e-07, + 2.75572496725023574143864e-06, + 2.48014973989819794114153e-05, + 0.000198412698809069797676111, + 0.0013888888939977128960529, + 0.00833333333332371417601081, + 0.0416666666665409524128449, + 0.166666666666666740681535, + 0.500000000000000999200722); + + t = ddadd_d2_d_d2(1, s); + t = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); u = ldexpk(t.x + t.y, q); @@ -1739,14 +1777,16 @@ static INLINE CONST Sleef_double2 logk2(Sleef_double2 d) { x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1)); x2 = ddsqu_d2_d2(x); - - t = 0.13860436390467167910856; - t = mla(t, x2.x, 0.131699838841615374240845); - t = mla(t, x2.x, 0.153914168346271945653214); - t = mla(t, x2.x, 0.181816523941564611721589); - t = mla(t, x2.x, 0.22222224632662035403996); - t = mla(t, x2.x, 0.285714285511134091777308); - t = mla(t, x2.x, 0.400000000000914013309483); + + double x4 = x2.x * x2.x, x8 = x4 * x4; + t = POLY7(x2.x, x4, x8, + 0.13860436390467167910856, + 0.131699838841615374240845, + 0.153914168346271945653214, + 0.181816523941564611721589, + 0.22222224632662035403996, + 0.285714285511134091777308, + 0.400000000000914013309483); t = mla(t, x2.x, 0.666666666666664853302393); s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e); @@ -1876,17 +1916,20 @@ EXPORT CONST double xexp2(double d) { s = d - q; - u = +0.4434359082926529454e-9; - u = mla(u, s, +0.7073164598085707425e-8); - u = mla(u, s, +0.1017819260921760451e-6); - u = mla(u, s, +0.1321543872511327615e-5); - u = mla(u, s, +0.1525273353517584730e-4); - u = mla(u, s, +0.1540353045101147808e-3); - u = mla(u, s, +0.1333355814670499073e-2); - u = mla(u, s, +0.9618129107597600536e-2); - u = mla(u, s, +0.5550410866482046596e-1); - u = mla(u, s, +0.2402265069591012214e+0); + double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4; + u = POLY10(s, s2, s4, s8, + +0.4434359082926529454e-9, + +0.7073164598085707425e-8, + +0.1017819260921760451e-6, + +0.1321543872511327615e-5, + +0.1525273353517584730e-4, + +0.1540353045101147808e-3, + +0.1333355814670499073e-2, + +0.9618129107597600536e-2, + +0.5550410866482046596e-1, + +0.2402265069591012214e+0); u = mla(u, s, +0.6931471805599452862e+0); + u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x; u = ldexp2k(u, q); @@ -1915,6 +1958,7 @@ EXPORT CONST double xexp10(double d) { u = mla(u, s, +0.2034678592293432953e+1); u = mla(u, s, +0.2650949055239205876e+1); u = mla(u, s, +0.2302585092994045901e+1); + u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x; u = ldexp2k(u, q); @@ -1950,13 +1994,15 @@ EXPORT CONST double xlog10(double d) { x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); x2 = x.x * x.x; - t = +0.6653725819576758460e-1; - t = mla(t, x2, +0.6625722782820833712e-1); - t = mla(t, x2, +0.7898105214313944078e-1); - t = mla(t, x2, +0.9650955035715275132e-1); - t = mla(t, x2, +0.1240841409721444993e+0); - t = mla(t, x2, +0.1737177927454605086e+0); - t = mla(t, x2, +0.2895296546021972617e+0); + double x4 = x2 * x2, x8 = x4 * x4; + t = POLY7(x2, x4, x8, + +0.6653725819576758460e-1, + +0.6625722782820833712e-1, + +0.7898105214313944078e-1, + +0.9650955035715275132e-1, + +0.1240841409721444993e+0, + +0.1737177927454605086e+0, + +0.2895296546021972617e+0); s = ddmul_d2_d2_d(dd(0.30102999566398119802, -2.803728127785170339e-18), (double)e); s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, dd(0.86858896380650363334, 1.1430059694096389311e-17))); @@ -1986,14 +2032,17 @@ EXPORT CONST double xlog2(double d) { x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); x2 = x.x * x.x; - - t = +0.2211941750456081490e+0; - t = mla(t, x2, +0.2200768693152277689e+0); - t = mla(t, x2, +0.2623708057488514656e+0); - t = mla(t, x2, +0.3205977477944495502e+0); - t = mla(t, x2, +0.4121985945485324709e+0); - t = mla(t, x2, +0.5770780162997058982e+0); - t = mla(t, x2, +0.96179669392608091449 ); + + double x4 = x2 * x2, x8 = x4 * x4; + t = POLY7(x2, x4, x8, + +0.2211941750456081490e+0, + +0.2200768693152277689e+0, + +0.2623708057488514656e+0, + +0.3205977477944495502e+0, + +0.4121985945485324709e+0, + +0.5770780162997058982e+0, + +0.96179669392608091449); + s = ddadd2_d2_d_d2(e, ddmul_d2_d2_d2(x, dd(2.885390081777926774, 6.0561604995516736434e-18))); s = ddadd2_d2_d2_d(s, x2 * x.x * t); @@ -2026,13 +2075,15 @@ EXPORT CONST double xlog1p(double d) { x = dddiv_d2_d2_d2(dd(m, 0), ddadd_d2_d_d(2, m)); x2 = x.x * x.x; - t = 0.1532076988502701353e+0; - t = mla(t, x2, 0.1525629051003428716e+0); - t = mla(t, x2, 0.1818605932937785996e+0); - t = mla(t, x2, 0.2222214519839380009e+0); - t = mla(t, x2, 0.2857142932794299317e+0); - t = mla(t, x2, 0.3999999999635251990e+0); - t = mla(t, x2, 0.6666666666667333541e+0); + double x4 = x2 * x2, x8 = x4 * x4; + t = POLY7(x2, x4, x8, + 0.1532076988502701353e+0, + 0.1525629051003428716e+0, + 0.1818605932937785996e+0, + 0.2222214519839380009e+0, + 0.2857142932794299317e+0, + 0.3999999999635251990e+0, + 0.6666666666667333541e+0); s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e); s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2)); diff --git a/src/libm/sleefsimddp.c b/src/libm/sleefsimddp.c index 6b8c5eab..9227c270 100644 --- a/src/libm/sleefsimddp.c +++ b/src/libm/sleefsimddp.c @@ -1,4 +1,4 @@ -// Copyright Naoki Shibata 2010 - 2018. +// Copyright Naoki Shibata 2010 - 2019. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -194,9 +194,11 @@ extern const double rempitabdp[]; #endif /* DORENAME */ #endif /* ENABLE_SVE */ -#if defined(DETERMINISTIC) && defined(SPLIT_KERNEL) -#undef SPLIT_KERNEL -#endif +// + +#define MLA(x, y, z) vmla_vd_vd_vd_vd((x), (y), (z)) +#define C2V(c) vcast_vd_d(c) +#include "estrin.h" // @@ -440,14 +442,16 @@ EXPORT CONST VECTOR_CC vdouble xsin(vdouble d) { d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); - u = vcast_vd_d(-7.97255955009037868891952e-18); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s, s2, s4, + -7.97255955009037868891952e-18, + 2.81009972710863200091251e-15, + -7.64712219118158833288484e-13, + 1.60590430605664501629054e-10, + -2.50521083763502045810755e-08, + 2.75573192239198747630416e-06, + -0.000198412698412696162806809, + 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); @@ -510,14 +514,16 @@ EXPORT CONST VECTOR_CC vdouble xsin(vdouble d) { d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); - u = vcast_vd_d(-7.97255955009037868891952e-18); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s, s2, s4, + -7.97255955009037868891952e-18, + 2.81009972710863200091251e-15, + -7.64712219118158833288484e-13, + 1.60590430605664501629054e-10, + -2.50521083763502045810755e-08, + 2.75573192239198747630416e-06, + -0.000198412698412696162806809, + 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); @@ -569,16 +575,17 @@ EXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) { t = s; s = ddsqu_vd2_vd2(s); - u = vcast_vd_d(2.72052416138529567917983e-15); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + vdouble s2 = vmul_vd_vd_vd(s.x, s.x), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY7(s.x, s2, s4, + 2.72052416138529567917983e-15, + -7.6429259411395447190023e-13, + 1.60589370117277896211623e-10, + -2.5052106814843123359368e-08, + 2.75573192104428224777379e-06, + -0.000198412698412046454654947, + 0.00833333333333318056201922); x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); - u = ddmul_vd_vd2_vd2(t, x); u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), @@ -636,18 +643,19 @@ EXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) { t = x; s = ddsqu_vd2_vd2(x); - u = vcast_vd_d(2.72052416138529567917983e-15); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + vdouble s2 = vmul_vd_vd_vd(s.x, s.x), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY7(s.x, s2, s4, + 2.72052416138529567917983e-15, + -7.6429259411395447190023e-13, + 1.60589370117277896211623e-10, + -2.5052106814843123359368e-08, + 2.75573192104428224777379e-06, + -0.000198412698412046454654947, + 0.00833333333333318056201922); x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); - u = ddmul_vd_vd2_vd2(t, x); - + u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u))); @@ -703,14 +711,16 @@ EXPORT CONST VECTOR_CC vdouble xcos(vdouble d) { d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); - u = vcast_vd_d(-7.97255955009037868891952e-18); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s, s2, s4, + -7.97255955009037868891952e-18, + 2.81009972710863200091251e-15, + -7.64712219118158833288484e-13, + 1.60590430605664501629054e-10, + -2.50521083763502045810755e-08, + 2.75573192239198747630416e-06, + -0.000198412698412696162806809, + 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); @@ -772,14 +782,16 @@ EXPORT CONST VECTOR_CC vdouble xcos(vdouble d) { d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); - u = vcast_vd_d(-7.97255955009037868891952e-18); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s, s2, s4, + -7.97255955009037868891952e-18, + 2.81009972710863200091251e-15, + -7.64712219118158833288484e-13, + 1.60590430605664501629054e-10, + -2.50521083763502045810755e-08, + 2.75573192239198747630416e-06, + -0.000198412698412696162806809, + 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); @@ -833,16 +845,17 @@ EXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) { t = s; s = ddsqu_vd2_vd2(s); - u = vcast_vd_d(2.72052416138529567917983e-15); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + vdouble s2 = vmul_vd_vd_vd(s.x, s.x), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY7(s.x, s2, s4, + 2.72052416138529567917983e-15, + -7.6429259411395447190023e-13, + 1.60589370117277896211623e-10, + -2.5052106814843123359368e-08, + 2.75573192104428224777379e-06, + -0.000198412698412046454654947, + 0.00833333333333318056201922); x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); - u = ddmul_vd_vd2_vd2(t, x); u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u))); @@ -903,16 +916,17 @@ EXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) { t = x; s = ddsqu_vd2_vd2(x); - u = vcast_vd_d(2.72052416138529567917983e-15); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + vdouble s2 = vmul_vd_vd_vd(s.x, s.x), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY7(s.x, s2, s4, + 2.72052416138529567917983e-15, + -7.6429259411395447190023e-13, + 1.60589370117277896211623e-10, + -2.5052106814843123359368e-08, + 2.75573192104428224777379e-06, + -0.000198412698412046454654947, + 0.00833333333333318056201922); x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); - u = ddmul_vd_vd2_vd2(t, x); u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u))); @@ -1525,7 +1539,7 @@ EXPORT CONST VECTOR_CC vdouble xcospi_u05(vdouble d) { EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) { #if !defined(DETERMINISTIC) - vdouble u, s, x; + vdouble u, s, x, y; vopmask o; vint ql; @@ -1555,62 +1569,36 @@ EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) { x = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(x))); } + x = vmul_vd_vd_vd(x, vcast_vd_d(0.5)); s = vmul_vd_vd_vd(x, x); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); - x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(x))); - -#ifdef SPLIT_KERNEL - vdouble s2 = vmul_vd_vd_vd(s, s), v; - - u = vcast_vd_d(-4.31184585467324750724175e-05); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(-0.000137892809714281708733524)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(-6.07500301486087879295969e-05)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.000219040550724571513561967)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.00145461240472358871965441)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.00886321546662684547901456)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.0539682539049961967903002)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.333333333333320047664472)); - - v = vcast_vd_d(9.99583485362149960784268e-06); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000103573238391744000389851)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000157624358465342784274554)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000148898734751616411290179)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000595799595197098359744547)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.0035923150771440177410343)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.0218694899718446938985394)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.133333333334818976423364)); - - u = vmla_vd_vd_vd_vd(v, s, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(9.99583485362149960784268e-06); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-4.31184585467324750724175e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000103573238391744000389851)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000137892809714281708733524)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000157624358465342784274554)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-6.07500301486087879295969e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000148898734751616411290179)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000219040550724571513561967)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000595799595197098359744547)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145461240472358871965441)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0035923150771440177410343)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886321546662684547901456)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694899718446938985394)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539049961967903002)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333334818976423364)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333320047664472)); -#endif // #ifdef SPLIT_KERNEL - + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s, s2, s4, + +0.3245098826639276316e-3, + +0.5619219738114323735e-3, + +0.1460781502402784494e-2, + +0.3591611540792499519e-2, + +0.8863268409563113126e-2, + +0.2186948728185535498e-1, + +0.5396825399517272970e-1, + +0.1333333333330500581e+0); + + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0)); u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x); - u = vsel_vd_vo_vd_vd(o, vrec_vd_vd(u), u); + y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1)); + x = vmul_vd_vd_vd(u, vcast_vd_d(-2)); + + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); + u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x), + vsel_vd_vo_vd_vd(o, x, y)); u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); - + return u; #else // #if !defined(DETERMINISTIC) - vdouble u, s, x; + vdouble u, s, x, y; vopmask o; vint ql; @@ -1648,58 +1636,29 @@ EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) { } } - x = s; - - s = vmul_vd_vd_vd(s, s); + x = vmul_vd_vd_vd(s, vcast_vd_d(0.5)); + s = vmul_vd_vd_vd(x, x); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); - x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(x))); - -#ifdef SPLIT_KERNEL - vdouble s2 = vmul_vd_vd_vd(s, s), v; - - u = vcast_vd_d(-4.31184585467324750724175e-05); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(-0.000137892809714281708733524)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(-6.07500301486087879295969e-05)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.000219040550724571513561967)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.00145461240472358871965441)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.00886321546662684547901456)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.0539682539049961967903002)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(0.333333333333320047664472)); - - v = vcast_vd_d(9.99583485362149960784268e-06); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000103573238391744000389851)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000157624358465342784274554)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000148898734751616411290179)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.000595799595197098359744547)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.0035923150771440177410343)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.0218694899718446938985394)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(0.133333333334818976423364)); - - u = vmla_vd_vd_vd_vd(v, s, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(9.99583485362149960784268e-06); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-4.31184585467324750724175e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000103573238391744000389851)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000137892809714281708733524)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000157624358465342784274554)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-6.07500301486087879295969e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000148898734751616411290179)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000219040550724571513561967)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000595799595197098359744547)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145461240472358871965441)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0035923150771440177410343)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886321546662684547901456)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694899718446938985394)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539049961967903002)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333334818976423364)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333320047664472)); -#endif // #ifdef SPLIT_KERNEL - + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s, s2, s4, + +0.3245098826639276316e-3, + +0.5619219738114323735e-3, + +0.1460781502402784494e-2, + +0.3591611540792499519e-2, + +0.8863268409563113126e-2, + +0.2186948728185535498e-1, + +0.5396825399517272970e-1, + +0.1333333333330500581e+0); + + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0)); u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x); - u = vsel_vd_vo_vd_vd(o, vrec_vd_vd(u), u); + y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1)); + x = vmul_vd_vd_vd(u, vcast_vd_d(-2)); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); + u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x), + vsel_vd_vo_vd_vd(o, x, y)); u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); return u; @@ -1709,7 +1668,7 @@ EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) { EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) { #if !defined(DETERMINISTIC) vdouble u; - vdouble2 s, t, x; + vdouble2 s, t, x, y; vopmask o; vint ql; @@ -1742,56 +1701,31 @@ EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) { s.x = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(s.x))); s.y = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(s.y))); } - - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); - vmask n = vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))); - s.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(s.x), n)); - s.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(s.y), n)); - t = s; - s = ddsqu_vd2_vd2(s); + t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5)); + s = ddsqu_vd2_vd2(t); + + vdouble s2 = vmul_vd_vd_vd(s.x, s.x), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s.x, s2, s4, + +0.3245098826639276316e-3, + +0.5619219738114323735e-3, + +0.1460781502402784494e-2, + +0.3591611540792499519e-2, + +0.8863268409563113126e-2, + +0.2186948728185535498e-1, + +0.5396825399517272970e-1, + +0.1333333333330500581e+0); + + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.3333333333333343695e+0)); + x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u)); + + y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x)); + x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2)); + + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); -#ifdef SPLIT_KERNEL - vdouble sx2 = vmul_vd_vd_vd(s.x, s.x), v; - - u = vcast_vd_d(-2.59519791585924697698614e-05); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(-3.05033014433946488225616e-05)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(8.09674518280159187045078e-05)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.000588505168743587154904506)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.00359208743836906619142924)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.0218694882853846389592078)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.133333333333125941821962)); - - v = vcast_vd_d(1.01419718511083373224408e-05); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(5.23388081915899855325186e-05)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(7.14707504084242744267497e-05)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.000244884931879331847054404)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.00145612788922812427978848)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.00886323944362401618113356)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.0539682539781298417636002)); - - u = vmla_vd_vd_vd_vd(v, s.x, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(1.01419718511083373224408e-05); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.59519791585924697698614e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(5.23388081915899855325186e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-3.05033014433946488225616e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(7.14707504084242744267497e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(8.09674518280159187045078e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000244884931879331847054404)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000588505168743587154904506)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00145612788922812427978848)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00359208743836906619142924)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00886323944362401618113356)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0218694882853846389592078)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0539682539781298417636002)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.133333333333125941821962)); -#endif // #ifdef SPLIT_KERNEL - - x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(0.333333333333334980164153), vmul_vd_vd_vd(u, s.x)), s)); - x = ddmul_vd2_vd2_vd2(t, x); - - x = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd2(x), x); + x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x), + vsel_vd2_vo_vd2_vd2(o, x, y)); u = vadd_vd_vd_vd(x.x, x.y); @@ -1802,7 +1736,7 @@ EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) { #else // #if !defined(DETERMINISTIC) vdouble u; - vdouble2 s, t, x; + vdouble2 s, t, x, y; vopmask o; vint ql; @@ -1844,55 +1778,30 @@ EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) { } } - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); - vmask n = vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))); - s.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(s.x), n)); - s.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(s.y), n)); + t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5)); + s = ddsqu_vd2_vd2(t); - t = s; - s = ddsqu_vd2_vd2(s); + vdouble s2 = vmul_vd_vd_vd(s.x, s.x), s4 = vmul_vd_vd_vd(s2, s2); + u = POLY8(s.x, s2, s4, + +0.3245098826639276316e-3, + +0.5619219738114323735e-3, + +0.1460781502402784494e-2, + +0.3591611540792499519e-2, + +0.8863268409563113126e-2, + +0.2186948728185535498e-1, + +0.5396825399517272970e-1, + +0.1333333333330500581e+0); + + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.3333333333333343695e+0)); + x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u)); + + y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x)); + x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2)); + + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); -#ifdef SPLIT_KERNEL - vdouble sx2 = vmul_vd_vd_vd(s.x, s.x), v; - - u = vcast_vd_d(-2.59519791585924697698614e-05); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(-3.05033014433946488225616e-05)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(8.09674518280159187045078e-05)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.000588505168743587154904506)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.00359208743836906619142924)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.0218694882853846389592078)); - u = vmla_vd_vd_vd_vd(u, sx2, vcast_vd_d(0.133333333333125941821962)); - - v = vcast_vd_d(1.01419718511083373224408e-05); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(5.23388081915899855325186e-05)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(7.14707504084242744267497e-05)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.000244884931879331847054404)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.00145612788922812427978848)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.00886323944362401618113356)); - v = vmla_vd_vd_vd_vd(v, sx2, vcast_vd_d(0.0539682539781298417636002)); - - u = vmla_vd_vd_vd_vd(v, s.x, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(1.01419718511083373224408e-05); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.59519791585924697698614e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(5.23388081915899855325186e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-3.05033014433946488225616e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(7.14707504084242744267497e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(8.09674518280159187045078e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000244884931879331847054404)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000588505168743587154904506)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00145612788922812427978848)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00359208743836906619142924)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00886323944362401618113356)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0218694882853846389592078)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0539682539781298417636002)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.133333333333125941821962)); -#endif // #ifdef SPLIT_KERNEL - - x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(0.333333333333334980164153), vmul_vd_vd_vd(u, s.x)), s)); - x = ddmul_vd2_vd2_vd2(t, x); - - x = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd2(x), x); + x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x), + vsel_vd2_vo_vd2_vd2(o, x, y)); u = vadd_vd_vd_vd(x.x, x.y); @@ -1918,52 +1827,27 @@ static INLINE CONST VECTOR_CC vdouble atan2k(vdouble y, vdouble x) { s = vdiv_vd_vd_vd(s, t); t = vmul_vd_vd_vd(s, s); -#ifdef SPLIT_KERNEL - vdouble t2 = vmul_vd_vd_vd(t, t), v; - - u = vcast_vd_d(-1.88796008463073496563746e-05); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.00110611831486672482563471)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.00889896195887655491740809)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0254517624932312641616861)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0407629191276836500001934)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0523674852303482457616113)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0666573579361080525984562)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.090908995008245008229153)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.14285714266771329383765)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.333333333333311110369124)); - - v = vcast_vd_d(0.000209850076645816976906797); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.00370026744188713119232403)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.016599329773529201970117)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0337852580001353069993897)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0466667150077840625632675)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0587666392926673580854313)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0769219538311769618355029)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.111111105648261418443745)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.199999999996591265594148)); - - u = vmla_vd_vd_vd_vd(v, t, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(-1.88796008463073496563746e-05); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124)); -#endif // #ifdef SPLIT_KERNEL + vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8); + u = POLY19(t, t2, t4, t8, t16, + -1.88796008463073496563746e-05, + 0.000209850076645816976906797, + -0.00110611831486672482563471, + 0.00370026744188713119232403, + -0.00889896195887655491740809, + 0.016599329773529201970117, + -0.0254517624932312641616861, + 0.0337852580001353069993897, + -0.0407629191276836500001934, + 0.0466667150077840625632675, + -0.0523674852303482457616113, + 0.0587666392926673580854313, + -0.0666573579361080525984562, + 0.0769219538311769618355029, + -0.090908995008245008229153, + 0.111111105648261418443745, + -0.14285714266771329383765, + 0.199999999996591265594148, + -0.333333333333311110369124); t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s); t = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(M_PI/2), t); @@ -1992,57 +1876,31 @@ static INLINE CONST VECTOR_CC vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) { t = ddsqu_vd2_vd2(s); t = ddnormalize_vd2_vd2(t); -#ifdef SPLIT_KERNEL - vdouble tx3 = vmul_vd_vd_vd(vmul_vd_vd_vd(t.x, t.x), t.x), v; - - u = vcast_vd_d(0.00070557664296393412389774); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.00251865614498713360352999)); - u = vmla_vd_vd_vd_vd(u, tx3, vcast_vd_d(0.0208024799924145797902497)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0289002344784740315686289)); - u = vmla_vd_vd_vd_vd(u, tx3, vcast_vd_d(0.0470843011653283988193763)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0524914210588448421068719)); - u = vmla_vd_vd_vd_vd(u, tx3, vcast_vd_d(0.0769225330296203768654095)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0909090442773387574781907)); - u = vmla_vd_vd_vd_vd(u, tx3, vcast_vd_d(0.199999999997977351284817)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.333333333333317605173818)); - - v = vcast_vd_d(1.06298484191448746607415e-05); - v = vmla_vd_vd_vd_vd(v, t.x, vcast_vd_d(-0.000125620649967286867384336)); - v = vmla_vd_vd_vd_vd(v, tx3, vcast_vd_d(0.00646262899036991172313504)); - v = vmla_vd_vd_vd_vd(v, t.x, vcast_vd_d(-0.0128281333663399031014274)); - v = vmla_vd_vd_vd_vd(v, tx3, vcast_vd_d(0.0359785005035104590853656)); - v = vmla_vd_vd_vd_vd(v, t.x, vcast_vd_d(-0.041848579703592507506027)); - v = vmla_vd_vd_vd_vd(v, tx3, vcast_vd_d(0.0587946590969581003860434)); - v = vmla_vd_vd_vd_vd(v, t.x, vcast_vd_d(-0.0666620884778795497194182)); - v = vmla_vd_vd_vd_vd(v, tx3, vcast_vd_d(0.111111108376896236538123)); - v = vmla_vd_vd_vd_vd(v, t.x, vcast_vd_d(-0.142857142756268568062339)); - - u = vmla_vd_vd_vd_vd(v, vmul_vd_vd_vd(t.x, t.x), u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(1.06298484191448746607415e-05); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.000125620649967286867384336)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00070557664296393412389774)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.00251865614498713360352999)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00646262899036991172313504)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0128281333663399031014274)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0208024799924145797902497)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0289002344784740315686289)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0359785005035104590853656)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.041848579703592507506027)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0470843011653283988193763)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0524914210588448421068719)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0587946590969581003860434)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0666620884778795497194182)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0769225330296203768654095)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0909090442773387574781907)); - u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.111111108376896236538123)); + vdouble t2 = vmul_vd_vd_vd(t.x, t.x), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8); + u = POLY17(t.x, t2, t4, t8, t16, + 1.06298484191448746607415e-05, + -0.000125620649967286867384336, + 0.00070557664296393412389774, + -0.00251865614498713360352999, + 0.00646262899036991172313504, + -0.0128281333663399031014274, + 0.0208024799924145797902497, + -0.0289002344784740315686289, + 0.0359785005035104590853656, + -0.041848579703592507506027, + 0.0470843011653283988193763, + -0.0524914210588448421068719, + 0.0587946590969581003860434, + -0.0666620884778795497194182, + 0.0769225330296203768654095, + -0.0909090442773387574781907, + 0.111111108376896236538123); u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.142857142756268568062339)); u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.199999999997977351284817)); u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.333333333333317605173818)); -#endif // #ifdef SPLIT_KERNEL + + t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u)); - t = ddmul_vd2_vd2_vd(t, u); - t = ddmul_vd2_vd2_vd2(s, ddadd_vd2_vd_vd2(vcast_vd_d(1), t)); t = ddadd_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(1.570796326794896557998982, 6.12323399573676603586882e-17), vcast_vd_vi(q)), t); return t; @@ -2090,38 +1948,20 @@ EXPORT CONST VECTOR_CC vdouble xasin(vdouble d) { vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))); vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2)), u; -#ifdef SPLIT_KERNEL - vdouble x4 = vmul_vd_vd_vd(x2, x2), v; - - u = vcast_vd_d(-0.1581918243329996643e-1); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1666666666666497543e+0)); - - v = vcast_vd_d(+0.3161587650653934628e-1); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1929045477267910674e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1215360525577377331e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1735956991223614604e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.3038195928038132237e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.7500000000378581611e-1)); - - u = vmla_vd_vd_vd_vd(v, x2, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(+0.3161587650653934628e-1); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0)); -#endif // #ifdef SPLIT_KERNEL + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8); + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); u = vmla_vd_vd_vd_vd(u, vmul_vd_vd_vd(x, x2), x); @@ -2135,38 +1975,20 @@ EXPORT CONST VECTOR_CC vdouble xasin_u1(vdouble d) { vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2)); x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x); -#ifdef SPLIT_KERNEL - vdouble x4 = vmul_vd_vd_vd(x2, x2), v; - - u = vcast_vd_d(-0.1581918243329996643e-1); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1666666666666497543e+0)); - - v = vcast_vd_d(+0.3161587650653934628e-1); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1929045477267910674e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1215360525577377331e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1735956991223614604e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.3038195928038132237e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.7500000000378581611e-1)); - - u = vmla_vd_vd_vd_vd(v, x2, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(+0.3161587650653934628e-1); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0)); -#endif // #ifdef SPLIT_KERNEL + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8); + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x.x)); @@ -2184,38 +2006,20 @@ EXPORT CONST VECTOR_CC vdouble xacos(vdouble d) { vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2)); x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd_d(0), x); -#ifdef SPLIT_KERNEL - vdouble x4 = vmul_vd_vd_vd(x2, x2), v; - - u = vcast_vd_d(-0.1581918243329996643e-1); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1666666666666497543e+0)); - - v = vcast_vd_d(+0.3161587650653934628e-1); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1929045477267910674e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1215360525577377331e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1735956991223614604e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.3038195928038132237e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.7500000000378581611e-1)); - - u = vmla_vd_vd_vd_vd(v, x2, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(+0.3161587650653934628e-1); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0)); -#endif // #ifdef SPLIT_KERNEL + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8); + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x)); @@ -2233,38 +2037,20 @@ EXPORT CONST VECTOR_CC vdouble xacos_u1(vdouble d) { vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2)); x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x); -#ifdef SPLIT_KERNEL - vdouble x4 = vmul_vd_vd_vd(x2, x2), v; - - u = vcast_vd_d(-0.1581918243329996643e-1); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x4, vcast_vd_d(+0.1666666666666497543e+0)); - - v = vcast_vd_d(+0.3161587650653934628e-1); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1929045477267910674e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1215360525577377331e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.1735956991223614604e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.3038195928038132237e-1)); - v = vmla_vd_vd_vd_vd(v, x4, vcast_vd_d(+0.7500000000378581611e-1)); - - u = vmla_vd_vd_vd_vd(v, x2, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(+0.3161587650653934628e-1); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1)); - u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0)); -#endif // #ifdef SPLIT_KERNEL + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8); + u = POLY12(x2, x4, x8, x16, + +0.3161587650653934628e-1, + -0.1581918243329996643e-1, + +0.1929045477267910674e-1, + +0.6606077476277170610e-2, + +0.1215360525577377331e-1, + +0.1388715184501609218e-1, + +0.1735956991223614604e-1, + +0.2237176181932048341e-1, + +0.3038195928038132237e-1, + +0.4464285681377102438e-1, + +0.7500000000378581611e-1, + +0.1666666666666497543e+0); u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x.x)); @@ -2302,52 +2088,27 @@ EXPORT CONST VECTOR_CC vdouble xatan(vdouble s) { t = vmul_vd_vd_vd(s, s); -#ifdef SPLIT_KERNEL - vdouble t2 = vmul_vd_vd_vd(t, t), v; - - u = vcast_vd_d(-1.88796008463073496563746e-05); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.00110611831486672482563471)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.00889896195887655491740809)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0254517624932312641616861)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0407629191276836500001934)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0523674852303482457616113)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.0666573579361080525984562)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.090908995008245008229153)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.14285714266771329383765)); - u = vmla_vd_vd_vd_vd(u, t2, vcast_vd_d(-0.333333333333311110369124)); - - v = vcast_vd_d(0.000209850076645816976906797); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.00370026744188713119232403)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.016599329773529201970117)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0337852580001353069993897)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0466667150077840625632675)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0587666392926673580854313)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.0769219538311769618355029)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.111111105648261418443745)); - v = vmla_vd_vd_vd_vd(v, t2, vcast_vd_d(0.199999999996591265594148)); - - u = vmla_vd_vd_vd_vd(v, t, u); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(-1.88796008463073496563746e-05); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148)); - u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124)); -#endif // #ifdef SPLIT_KERNEL + vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8); + u = POLY19(t, t2, t4, t8, t16, + -1.88796008463073496563746e-05, + 0.000209850076645816976906797, + -0.00110611831486672482563471, + 0.00370026744188713119232403, + -0.00889896195887655491740809, + 0.016599329773529201970117, + -0.0254517624932312641616861, + 0.0337852580001353069993897, + -0.0407629191276836500001934, + 0.0466667150077840625632675, + -0.0523674852303482457616113, + 0.0587666392926673580854313, + -0.0666573579361080525984562, + 0.0769219538311769618355029, + -0.090908995008245008229153, + 0.111111105648261418443745, + -0.14285714266771329383765, + 0.199999999996591265594148, + -0.333333333333311110369124); t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s); @@ -2381,23 +2142,27 @@ EXPORT CONST VECTOR_CC vdouble xlog(vdouble d) { x = vdiv_vd_vd_vd(vadd_vd_vd_vd(vcast_vd_d(-1), m), vadd_vd_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(x, x); - t = vcast_vd_d(0.153487338491425068243146); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.152519917006351951593857)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.181863266251982985677316)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.222221366518767365905163)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.285714294746548025383248)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.399999999950799600689777)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.6666666666667778740063)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(2)); + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x3 = vmul_vd_vd_vd(x, x2); + t = POLY7(x2, x4, x8, + 0.153487338491425068243146, + 0.152519917006351951593857, + 0.181863266251982985677316, + 0.222221366518767365905163, + 0.285714294746548025383248, + 0.399999999950799600689777, + 0.6666666666667778740063); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) - x = vmla_vd_vd_vd_vd(x, t, vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e))); + x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e))); + x = vmla_vd_vd_vd_vd(x3, t, x); x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), x); x = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), x); x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), x); #else - x = vmla_vd_vd_vd_vd(x, t, vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), e)); + x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), e)); + x = vmla_vd_vd_vd_vd(x3, t, x); + x = vfixup_vd_vd_vd_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0); #endif @@ -2413,52 +2178,35 @@ EXPORT CONST VECTOR_CC vdouble xexp(vdouble d) { s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s); #ifdef ENABLE_FMA_DP -#ifdef SPLIT_KERNEL - vdouble s2 = vmul_vd_vd_vd(s, s), v; - - u = vcast_vd_d(+0.2081276378237164457e-8); - u = vfma_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.2755762628169491192e-6)); - u = vfma_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.2480158687479686264e-4)); - u = vfma_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.1388888888914497797e-2)); - u = vfma_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.4166666666666602598e-1)); - u = vfma_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.5000000000000000000e+0)); - - v = vcast_vd_d(+0.2511210703042288022e-7); - v = vfma_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.2755723402025388239e-5)); - v = vfma_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.1984126989855865850e-3)); - v = vfma_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.8333333333314938210e-2)); - v = vfma_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.1666666666666669072e+0)); - - u = vmla_vd_vd_vd_vd(v, s, u); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1)); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(+0.2081276378237164457e-8); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2511210703042288022e-7)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2755762628169491192e-6)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2755723402025388239e-5)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2480158687479686264e-4)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1984126989855865850e-3)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1388888888914497797e-2)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.8333333333314938210e-2)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.4166666666666602598e-1)); - u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1666666666666669072e+0)); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); + u = POLY10(s, s2, s4, s8, + +0.2081276378237164457e-8, + +0.2511210703042288022e-7, + +0.2755762628169491192e-6, + +0.2755723402025388239e-5, + +0.2480158687479686264e-4, + +0.1984126989855865850e-3, + +0.1388888888914497797e-2, + +0.8333333333314938210e-2, + +0.4166666666666602598e-1, + +0.1666666666666669072e+0); u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0)); u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1)); u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1)); -#endif // #ifdef SPLIT_KERNEL #else // #ifdef ENABLE_FMA_DP - u = vcast_vd_d(2.08860621107283687536341e-09); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.51112930892876518610661e-08)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573911234900471893338e-07)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75572362911928827629423e-06)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.4801587159235472998791e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000198412698960509205564975)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00138888888889774492207962)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333331652721664984)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665047591422)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.166666666666666851703837)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.5)); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); + u = POLY11(s, s2, s4, s8, + 2.08860621107283687536341e-09, + 2.51112930892876518610661e-08, + 2.75573911234900471893338e-07, + 2.75572362911928827629423e-06, + 2.4801587159235472998791e-05, + 0.000198412698960509205564975, + 0.00138888888889774492207962, + 0.00833333333331652721664984, + 0.0416666666666665047591422, + 0.166666666666666851703837, + 0.5); u = vadd_vd_vd_vd(vcast_vd_d(1), vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s)); #endif // #ifdef ENABLE_FMA_DP @@ -2478,18 +2226,20 @@ static INLINE CONST VECTOR_CC vdouble expm1k(vdouble d) { s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s); - u = vcast_vd_d(2.08860621107283687536341e-09); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.51112930892876518610661e-08)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573911234900471893338e-07)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75572362911928827629423e-06)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.4801587159235472998791e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000198412698960509205564975)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00138888888889774492207962)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333331652721664984)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665047591422)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.166666666666666851703837)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.5)); - u = vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); + u = POLY10(s, s2, s4, s8, + 2.08860621107283687536341e-09, + 2.51112930892876518610661e-08, + 2.75573911234900471893338e-07, + 2.75572362911928827629423e-06, + 2.4801587159235472998791e-05, + 0.000198412698960509205564975, + 0.00138888888889774492207962, + 0.00833333333331652721664984, + 0.0416666666666665047591422, + 0.166666666666666851703837); + + u = vadd_vd_vd_vd(vmla_vd_vd_vd_vd(s2, vcast_vd_d(0.5), vmul_vd_vd_vd(vmul_vd_vd_vd(s2, s), u)), s); u = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(q, vcast_vi_i(0))), u, vsub_vd_vd_vd(vldexp2_vd_vd_vi(vadd_vd_vd_vd(u, vcast_vd_d(1)), q), vcast_vd_d(1))); @@ -2516,26 +2266,30 @@ static INLINE CONST VECTOR_CC vdouble2 logk(vdouble d) { x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = ddsqu_vd2_vd2(x); - t = vcast_vd_d(0.116255524079935043668677); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.103239680901072952701192)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.117754809412463995466069)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.13332981086846273921509)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153846227114512262845736)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181818180850050775676507)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222222230083560345903)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285714249172087875)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000000077715612)); - vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17); + vdouble x4 = vmul_vd_vd_vd(x2.x, x2.x), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8); + t = POLY9(x2.x, x4, x8, x16, + 0.116255524079935043668677, + 0.103239680901072952701192, + 0.117754809412463995466069, + 0.13332981086846273921509, + 0.153846227114512262845736, + 0.181818180850050775676507, + 0.222222222230083560345903, + 0.285714285714249172087875, + 0.400000000000000077715612); + vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); #else - s = ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), e); + s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e); #endif - s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); - s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(x2, x), - ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(x2, t), c))); + x = ddmul_vd2_vd2_vd2(x2, x); + s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, c)); + x = ddmul_vd2_vd2_vd2(x2, x); + s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(x, t)); + return s; } @@ -2559,14 +2313,16 @@ EXPORT CONST VECTOR_CC vdouble xlog_u1(vdouble d) { x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(x.x, x.x); - t = vcast_vd_d(0.1532076988502701353e+0); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1525629051003428716e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1818605932937785996e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2222214519839380009e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2857142932794299317e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.3999999999635251990e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.6666666666667333541e+0)); - + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); + t = POLY7(x2, x4, x8, + 0.1532076988502701353e+0, + 0.1525629051003428716e+0, + 0.1818605932937785996e+0, + 0.2222214519839380009e+0, + 0.2857142932794299317e+0, + 0.3999999999635251990e+0, + 0.6666666666667333541e+0); + #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); #else @@ -2601,20 +2357,22 @@ static INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) { s = ddnormalize_vd2_vd2(s); - u = vcast_vd_d(2.51069683420950419527139e-08); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.76286166770270649116855e-07)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75572496725023574143864e-06)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48014973989819794114153e-05)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000198412698809069797676111)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0013888888939977128960529)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333332371417601081)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666665409524128449)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.166666666666666740681535)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.500000000000000999200722)); - - t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); + vdouble s2 = vmul_vd_vd_vd(s.x, s.x), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); + u = POLY10(s.x, s2, s4, s8, + 2.51069683420950419527139e-08, + 2.76286166770270649116855e-07, + 2.75572496725023574143864e-06, + 2.48014973989819794114153e-05, + 0.000198412698809069797676111, + 0.0013888888939977128960529, + 0.00833333333332371417601081, + 0.0416666666665409524128449, + 0.166666666666666740681535, + 0.500000000000000999200722); + + t = ddadd_vd2_vd_vd2(vcast_vd_d(1), s); + t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); - t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t); u = vadd_vd_vd_vd(t.x, t.y); u = vldexp2_vd_vd_vi(u, q); @@ -2673,22 +2431,24 @@ static INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) { s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L))); - u = vcast_vd_d(+0.1602472219709932072e-9); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2092255183563157007e-8)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2505230023782644465e-7)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2755724800902135303e-6)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2755731892386044373e-5)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2480158735605815065e-4)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.1984126984148071858e-3)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.1388888888886763255e-2)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.8333333333333347095e-2)); - u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.4166666666666669905e-1)); - - t = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(s, u), vcast_vd_d(+0.1666666666666666574e+0)); - t = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), vcast_vd_d(0.5)); - t = ddadd2_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(ddsqu_vd2_vd2(s), t)); - - t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t); + vdouble2 s2 = ddsqu_vd2_vd2(s), s4 = ddsqu_vd2_vd2(s2); + vdouble s8 = vmul_vd_vd_vd(s4.x, s4.x); + u = POLY10(s.x, s2.x, s4.x, s8, + +0.1602472219709932072e-9, + +0.2092255183563157007e-8, + +0.2505230023782644465e-7, + +0.2755724800902135303e-6, + +0.2755731892386044373e-5, + +0.2480158735605815065e-4, + +0.1984126984148071858e-3, + +0.1388888888886763255e-2, + +0.8333333333333347095e-2, + +0.4166666666666669905e-1); + + t = ddadd_vd2_vd_vd2(vcast_vd_d(0.5), ddmul_vd2_vd2_vd(s, vcast_vd_d(+0.1666666666666666574e+0))); + t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s)); + t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s)); + t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(s4, u)); t.x = vldexp2_vd_vd_vi(t.x, q); t.y = vldexp2_vd_vd_vi(t.y, q); @@ -2786,13 +2546,15 @@ static INLINE CONST VECTOR_CC vdouble2 logk2(vdouble2 d) { x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(m, vcast_vd_d(-1)), ddadd2_vd2_vd2_vd(m, vcast_vd_d(1))); x2 = ddsqu_vd2_vd2(x); - t = vcast_vd_d(0.13860436390467167910856); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.131699838841615374240845)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153914168346271945653214)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181816523941564611721589)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.22222224632662035403996)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285511134091777308)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000914013309483)); + vdouble x4 = vmul_vd_vd_vd(x2.x, x2.x), x8 = vmul_vd_vd_vd(x4, x4); + t = POLY7(x2.x, x4, x8, + 0.13860436390467167910856, + 0.131699838841615374240845, + 0.153914168346271945653214, + 0.181816523941564611721589, + 0.22222224632662035403996, + 0.285714285511134091777308, + 0.400000000000914013309483); t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666664853302393)); s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); @@ -2955,36 +2717,19 @@ EXPORT CONST VECTOR_CC vdouble xexp2(vdouble d) { s = vsub_vd_vd_vd(d, u); -#ifdef SPLIT_KERNEL - vdouble s2 = vmul_vd_vd_vd(s, s), v; - - u = vcast_vd_d(+0.4434359082926529454e-9); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.1017819260921760451e-6)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.1525273353517584730e-4)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.1333355814670499073e-2)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.5550410866482046596e-1)); - - v = vcast_vd_d(+0.7073164598085707425e-8); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.1321543872511327615e-5)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.1540353045101147808e-3)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.9618129107597600536e-2)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.2402265069591012214e+0)); - - u = vmla_vd_vd_vd_vd(u, s, v); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0)); -#else // #ifdef SPLIT_KERNEL - u = vcast_vd_d(+0.4434359082926529454e-9); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.7073164598085707425e-8)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1017819260921760451e-6)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1321543872511327615e-5)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1525273353517584730e-4)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1540353045101147808e-3)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1333355814670499073e-2)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.9618129107597600536e-2)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5550410866482046596e-1)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2402265069591012214e+0)); + vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); + u = POLY10(s, s2, s4, s8, + +0.4434359082926529454e-9, + +0.7073164598085707425e-8, + +0.1017819260921760451e-6, + +0.1321543872511327615e-5, + +0.1525273353517584730e-4, + +0.1540353045101147808e-3, + +0.1333355814670499073e-2, + +0.9618129107597600536e-2, + +0.5550410866482046596e-1, + +0.2402265069591012214e+0); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0)); -#endif // #ifdef SPLIT_KERNEL #ifdef ENABLE_FMA_DP u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1)); @@ -3007,24 +2752,6 @@ EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) { s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s); -#ifdef SPLIT_KERNEL - vdouble s2 = vmul_vd_vd_vd(s, s), v; - - u = vcast_vd_d(+0.2411463498334267652e-3); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.5013975546789733659e-2)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.6808936399446784138e-1)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.5393829292058536229e+0)); - u = vmla_vd_vd_vd_vd(u, s2, vcast_vd_d(+0.2034678592293432953e+1)); - - v = vcast_vd_d(+0.1157488415217187375e-2); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.1959762320720533080e-1)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.2069958494722676234e+0)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.1171255148908541655e+1)); - v = vmla_vd_vd_vd_vd(v, s2, vcast_vd_d(+0.2650949055239205876e+1)); - - u = vmla_vd_vd_vd_vd(u, s, v); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2302585092994045901e+1)); -#else // #ifdef SPLIT_KERNEL u = vcast_vd_d(+0.2411463498334267652e-3); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1157488415217187375e-2)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5013975546789733659e-2)); @@ -3036,7 +2763,6 @@ EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) { u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2034678592293432953e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2650949055239205876e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2302585092994045901e+1)); -#endif // #ifdef SPLIT_KERNEL #ifdef ENABLE_FMA_DP u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1)); @@ -3081,13 +2807,15 @@ EXPORT CONST VECTOR_CC vdouble xlog10(vdouble d) { x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(x.x, x.x); - t = vcast_vd_d(+0.6653725819576758460e-1); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.6625722782820833712e-1)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.7898105214313944078e-1)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.9650955035715275132e-1)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.1240841409721444993e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.1737177927454605086e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2895296546021972617e+0)); + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); + t = POLY7(x2, x4, x8, + +0.6653725819576758460e-1, + +0.6625722782820833712e-1, + +0.7898105214313944078e-1, + +0.9650955035715275132e-1, + +0.1240841409721444993e+0, + +0.1737177927454605086e+0, + +0.2895296546021972617e+0); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, -2.803728127785170339e-18), vcast_vd_vi(e)); @@ -3130,13 +2858,15 @@ EXPORT CONST VECTOR_CC vdouble xlog2(vdouble d) { x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(x.x, x.x); - t = vcast_vd_d(+0.2211941750456081490e+0); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2200768693152277689e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2623708057488514656e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.3205977477944495502e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.4121985945485324709e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.5770780162997058982e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.96179669392608091449 )); + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); + t = POLY7(x2, x4, x8, + +0.2211941750456081490e+0, + +0.2200768693152277689e+0, + +0.2623708057488514656e+0, + +0.3205977477944495502e+0, + +0.4121985945485324709e+0, + +0.5770780162997058982e+0, + +0.96179669392608091449); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddadd2_vd2_vd_vd2(vcast_vd_vi(e), @@ -3186,13 +2916,15 @@ EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) { x = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(m, vcast_vd_d(0)), ddadd_vd2_vd_vd(vcast_vd_d(2), m)); x2 = vmul_vd_vd_vd(x.x, x.x); - t = vcast_vd_d(0.1532076988502701353e+0); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1525629051003428716e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1818605932937785996e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2222214519839380009e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2857142932794299317e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.3999999999635251990e+0)); - t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.6666666666667333541e+0)); + vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); + t = POLY7(x2, x4, x8, + 0.1532076988502701353e+0, + 0.1525629051003428716e+0, + 0.1818605932937785996e+0, + 0.2222214519839380009e+0, + 0.2857142932794299317e+0, + 0.3999999999635251990e+0, + 0.6666666666667333541e+0); s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, x.x), t)); diff --git a/src/libm/sleefsimdsp.c b/src/libm/sleefsimdsp.c index f4cb68d3..c3dc6081 100644 --- a/src/libm/sleefsimdsp.c +++ b/src/libm/sleefsimdsp.c @@ -214,6 +214,12 @@ extern const float rempitabsp[]; // +#define MLA(x, y, z) vmla_vf_vf_vf_vf((x), (y), (z)) +#define C2V(c) vcast_vf_f(c) +#include "estrin.h" + +// + #include "df.h" static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vf(vfloat d) { @@ -638,12 +644,23 @@ EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) { o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x))); +#if defined(ENABLE_NEON32) u = vcast_vf_f(0.00927245803177356719970703f); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f)); +#else + vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2); + u = POLY6(s, s2, s4, + 0.00927245803177356719970703f, + 0.00331984995864331722259521f, + 0.0242998078465461730957031f, + 0.0534495301544666290283203f, + 0.133383005857467651367188f, + 0.333331853151321411132812f); +#endif u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x); @@ -691,12 +708,23 @@ EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) { o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x))); +#if defined(ENABLE_NEON32) u = vcast_vf_f(0.00927245803177356719970703f); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f)); +#else + vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2); + u = POLY6(s, s2, s4, + 0.00927245803177356719970703f, + 0.00331984995864331722259521f, + 0.0242998078465461730957031f, + 0.0534495301544666290283203f, + 0.133383005857467651367188f, + 0.333331853151321411132812f); +#endif u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x); @@ -1438,14 +1466,16 @@ EXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) { t = vmul_vf_vf_vf(s, s); - u = vcast_vf_f(0.00282363896258175373077393f); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0748900920152664184570312f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f)); + vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2); + u = POLY8(t, t2, t4, + 0.00282363896258175373077393f, + -0.0159569028764963150024414f, + 0.0425049886107444763183594f, + -0.0748900920152664184570312f, + 0.106347933411598205566406f, + -0.142027363181114196777344f, + 0.199926957488059997558594f, + -0.333331018686294555664062f); t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s); @@ -1477,14 +1507,16 @@ static INLINE CONST VECTOR_CC vfloat atan2kf(vfloat y, vfloat x) { s = vdiv_vf_vf_vf(s, t); t = vmul_vf_vf_vf(s, s); - u = vcast_vf_f(0.00282363896258175373077393f); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0748900920152664184570312f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f)); - u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f)); + vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2); + u = POLY8(t, t2, t4, + 0.00282363896258175373077393f, + -0.0159569028764963150024414f, + 0.0425049886107444763183594f, + -0.0748900920152664184570312f, + 0.106347933411598205566406f, + -0.142027363181114196777344f, + 0.199926957488059997558594f, + -0.333331018686294555664062f); t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s); t = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2)), t); @@ -1734,12 +1766,14 @@ static INLINE CONST VECTOR_CC vfloat expm1fk(vfloat d) { s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d); s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s); - u = vcast_vf_f(0.000198527617612853646278381); - u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271)); - u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359)); - u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156)); - u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219)); - u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5)); + vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2); + u = POLY6(s, s2, s4, + 0.000198527617612853646278381, + 0.00139304355252534151077271, + 0.00833336077630519866943359, + 0.0416664853692054748535156, + 0.166666671633720397949219, + 0.5); u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s); diff --git a/src/libm/sleefsp.c b/src/libm/sleefsp.c index e150c5e0..ead4e8ad 100644 --- a/src/libm/sleefsp.c +++ b/src/libm/sleefsp.c @@ -30,6 +30,10 @@ extern const float rempitabsp[]; #pragma fp_contract (off) #endif +#define MLA mlaf +#define C2V(x) (x) +#include "estrin.h" + static INLINE CONST int32_t floatToRawIntBits(float d) { union { float f; @@ -828,12 +832,14 @@ EXPORT CONST float xtanf(float d) { if ((q & 1) != 0) x = -x; - u = 0.00927245803177356719970703f; - u = mlaf(u, s, 0.00331984995864331722259521f); - u = mlaf(u, s, 0.0242998078465461730957031f); - u = mlaf(u, s, 0.0534495301544666290283203f); - u = mlaf(u, s, 0.133383005857467651367188f); - u = mlaf(u, s, 0.333331853151321411132812f); + float s2 = s * s, s4 = s2 * s2; + u = POLY6(s, s2, s4, + 0.00927245803177356719970703f, + 0.00331984995864331722259521f, + 0.0242998078465461730957031f, + 0.0534495301544666290283203f, + 0.133383005857467651367188f, + 0.333331853151321411132812f); u = mlaf(s, u * x, x); @@ -893,14 +899,16 @@ EXPORT CONST float xatanf(float s) { t = s * s; - u = 0.00282363896258175373077393f; - u = mlaf(u, t, -0.0159569028764963150024414f); - u = mlaf(u, t, 0.0425049886107444763183594f); - u = mlaf(u, t, -0.0748900920152664184570312f); - u = mlaf(u, t, 0.106347933411598205566406f); - u = mlaf(u, t, -0.142027363181114196777344f); - u = mlaf(u, t, 0.199926957488059997558594f); - u = mlaf(u, t, -0.333331018686294555664062f); + float t2 = t * t, t4 = t2 * t2; + u = POLY8(t, t2, t4, + 0.00282363896258175373077393f, + -0.0159569028764963150024414f, + 0.0425049886107444763183594f, + -0.0748900920152664184570312f, + 0.106347933411598205566406f, + -0.142027363181114196777344f, + 0.199926957488059997558594f, + -0.333331018686294555664062f); t = s + s * (t * u); @@ -920,14 +928,16 @@ static INLINE CONST float atan2kf(float y, float x) { s = y / x; t = s * s; - u = 0.00282363896258175373077393f; - u = mlaf(u, t, -0.0159569028764963150024414f); - u = mlaf(u, t, 0.0425049886107444763183594f); - u = mlaf(u, t, -0.0748900920152664184570312f); - u = mlaf(u, t, 0.106347933411598205566406f); - u = mlaf(u, t, -0.142027363181114196777344f); - u = mlaf(u, t, 0.199926957488059997558594f); - u = mlaf(u, t, -0.333331018686294555664062f); + float t2 = t * t, t4 = t2 * t2; + u = POLY8(t, t2, t4, + 0.00282363896258175373077393f, + -0.0159569028764963150024414f, + 0.0425049886107444763183594f, + -0.0748900920152664184570312f, + 0.106347933411598205566406f, + -0.142027363181114196777344f, + 0.199926957488059997558594f, + -0.333331018686294555664062f); t = u * t * s + s; t = q * (float)(M_PI/2) + t; @@ -1163,12 +1173,15 @@ static INLINE CONST float expm1kf(float d) { s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); - u = 0.000198527617612853646278381; - u = mlaf(u, s, 0.00139304355252534151077271); - u = mlaf(u, s, 0.00833336077630519866943359); - u = mlaf(u, s, 0.0416664853692054748535156); - u = mlaf(u, s, 0.166666671633720397949219); - u = mlaf(u, s, 0.5); + float s2 = s * s, s4 = s2 * s2; + u = POLY6(s, s2, s4, + 0.000198527617612853646278381, + 0.00139304355252534151077271, + 0.00833336077630519866943359, + 0.0416664853692054748535156, + 0.166666671633720397949219, + 0.5); + u = s * s * u + s; if (q != 0) u = ldexp2kf(u + 1, q) - 1;