diff --git a/benchmarks/elementary_functions_benchmark.cpp b/benchmarks/elementary_functions_benchmark.cpp index b0a6f0c565..83af680d80 100644 --- a/benchmarks/elementary_functions_benchmark.cpp +++ b/benchmarks/elementary_functions_benchmark.cpp @@ -9,6 +9,7 @@ #include "benchmarks/metric.hpp" #include "functions/cos.hpp" #include "functions/sin.hpp" +#include "numerics/sin_cos.hpp" #include "quantities/numbers.hpp" // 🧙 For π. namespace principia { @@ -17,6 +18,7 @@ namespace functions { using namespace principia::benchmarks::_metric; using namespace principia::functions::_cos; using namespace principia::functions::_sin; +using namespace principia::numerics::_sin_cos; static constexpr std::int64_t number_of_iterations = 1000; @@ -26,7 +28,7 @@ void BM_EvaluateElementaryFunction(benchmark::State& state) { using Argument = double; std::mt19937_64 random(42); - std::uniform_real_distribution<> uniformly_at(-π / 4, π / 4); + std::uniform_real_distribution<> uniformly_at(0, π / 4); Argument a[number_of_iterations]; for (std::int64_t i = 0; i < number_of_iterations; ++i) { @@ -79,6 +81,11 @@ BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, cr_sin) BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, cr_sin) ->Unit(benchmark::kNanosecond); +BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, Sin) + ->Unit(benchmark::kNanosecond); +BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, Sin) + ->Unit(benchmark::kNanosecond); + BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, std::cos) ->Unit(benchmark::kNanosecond); BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, std::cos) @@ -89,5 +96,10 @@ BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, cr_cos) BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, cr_cos) ->Unit(benchmark::kNanosecond); +BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, Cos) + ->Unit(benchmark::kNanosecond); +BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, Cos) + ->Unit(benchmark::kNanosecond); + } // namespace functions } // namespace principia diff --git a/functions/sin_cos_test.cpp b/functions/sin_cos_test.cpp index 5c2b93f57c..de665b45c0 100644 --- a/functions/sin_cos_test.cpp +++ b/functions/sin_cos_test.cpp @@ -24,7 +24,7 @@ class SinCosTest : public ::testing::Test {}; TEST_F(SinCosTest, Random) { std::mt19937_64 random(42); // TODO(phl): Negative angles. - std::uniform_real_distribution<> uniformly_at(0, π / 4); + std::uniform_real_distribution<> uniformly_at(-π / 4, π / 4); cpp_bin_float_50 max_sin_ulps_error = 0; cpp_bin_float_50 max_cos_ulps_error = 0; @@ -34,7 +34,7 @@ TEST_F(SinCosTest, Random) { #if _DEBUG static constexpr std::int64_t iterations = 100; #else - static constexpr std::int64_t iterations = 400'000; + static constexpr std::int64_t iterations = 300'000; #endif for (std::int64_t i = 0; i < iterations; ++i) { @@ -69,8 +69,8 @@ TEST_F(SinCosTest, Random) { } // This implementation is not quite correctly rounded, but not far from it. - EXPECT_LE(max_sin_ulps_error, 0.500002); - EXPECT_LE(max_cos_ulps_error, 0.500002); + EXPECT_LE(max_sin_ulps_error, 0.500003); + EXPECT_LE(max_cos_ulps_error, 0.500001); LOG(ERROR) << "Sin error: " << max_sin_ulps_error << std::setprecision(25) << " ulps for argument: " << worst_sin_argument diff --git a/numerics/double_precision.hpp b/numerics/double_precision.hpp index 8a03a181f8..cff1b09d84 100644 --- a/numerics/double_precision.hpp +++ b/numerics/double_precision.hpp @@ -3,6 +3,7 @@ #include #include "base/not_null.hpp" +#include "base/tags.hpp" #include "numerics/fma.hpp" #include "quantities/named_quantities.hpp" #include "quantities/quantities.hpp" @@ -14,6 +15,7 @@ namespace _double_precision { namespace internal { using namespace principia::base::_not_null; +using namespace principia::base::_tags; using namespace principia::numerics::_fma; using namespace principia::quantities::_named_quantities; using namespace principia::quantities::_quantities; @@ -22,8 +24,9 @@ using namespace principia::quantities::_quantities; // type of the value must be an affine space. The notations follow [HLB08]. template struct DoublePrecision final { - constexpr DoublePrecision() = default; + constexpr DoublePrecision(); + explicit constexpr DoublePrecision(uninitialized_t); explicit constexpr DoublePrecision(T const& value); // This is correct assuming that left and right have non-overlapping @@ -47,8 +50,8 @@ struct DoublePrecision final { static DoublePrecision ReadFromMessage( serialization::DoublePrecision const& message); - T value{}; - Difference error{}; + T value; + Difference error; }; // `scale` must be a signed power of two or zero. @@ -64,25 +67,29 @@ DoublePrecision> Scale(T const& scale, template DoublePrecision> TwoProduct(T const& a, U const& b); -// Returns the exact value of `a * b + c`. +// Returns the exact value of `a * b + c` if `|a * b|` is small compared to +// `|c|`. See [SZ05], section 2.1. template DoublePrecision> TwoProductAdd(T const& a, U const& b, Product const& c); -// Returns the exact value of `a * b - c`. +// Returns the exact value of `a * b - c` if `|a * b|` is small compared to +// `|c|`. See [SZ05], section 2.1. template DoublePrecision> TwoProductSubtract(T const& a, U const& b, Product const& c); -// Returns the exact value of `-a * b + c`. +// Returns the exact value of `-a * b + c` if `|a * b|` is small compared to +// `|c|`. See [SZ05], section 2.1. template DoublePrecision> TwoProductNegatedAdd(T const& a, U const& b, Product const& c); -// Returns the exact value of `-a * b - c`. +// Returns the exact value of `-a * b - c` if `|a * b|` is small compared to +// `|c|`. See [SZ05], section 2.1. template DoublePrecision> TwoProductNegatedSubtract(T const& a, diff --git a/numerics/double_precision_body.hpp b/numerics/double_precision_body.hpp index fef5416eb4..158c0c11da 100644 --- a/numerics/double_precision_body.hpp +++ b/numerics/double_precision_body.hpp @@ -107,6 +107,12 @@ struct ComponentwiseComparator { }; +template +constexpr DoublePrecision::DoublePrecision() : value{}, error{} {} + +template +constexpr DoublePrecision::DoublePrecision(uninitialized_t) {} + template constexpr DoublePrecision::DoublePrecision(T const& value) : value(value), @@ -199,7 +205,7 @@ DoublePrecision> Scale(T const & scale, CHECK_EQ(0.5, std::fabs(mantissa)) << scale; } #endif - DoublePrecision> result; + DoublePrecision> result(uninitialized); result.value = right.value * scale; result.error = right.error * scale; return result; @@ -208,7 +214,7 @@ DoublePrecision> Scale(T const & scale, template constexpr DoublePrecision> VeltkampDekkerProduct(T const& a, U const& b) { - DoublePrecision> result; + DoublePrecision> result(uninitialized); auto const& x = a; auto const& y = b; auto& z = result.value; @@ -242,7 +248,8 @@ DoublePrecision> TwoProduct(T const& a, U const& b) { if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedMultiplySubtract; - DoublePrecision> result(a * b); + DoublePrecision> result(uninitialized); + result.value = a * b; result.error = FusedMultiplySubtract(a, b, result.value); return result; } else { @@ -259,8 +266,9 @@ DoublePrecision> TwoProductAdd(T const& a, (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedMultiplyAdd; using quantities::_elementary_functions::FusedMultiplySubtract; - DoublePrecision> result(FusedMultiplyAdd(a, b, c)); - result.error = FusedMultiplySubtract(a, b, (result.value - c)); + DoublePrecision> result(uninitialized); + result.value = FusedMultiplyAdd(a, b, c); + result.error = FusedMultiplySubtract(a, b, result.value - c); return result; } else { auto result = VeltkampDekkerProduct(a, b); @@ -277,8 +285,9 @@ DoublePrecision> TwoProductSubtract(T const& a, if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedMultiplySubtract; - DoublePrecision> result(FusedMultiplySubtract(a, b, c)); - result.error = FusedMultiplySubtract(a, b, (result.value + c)); + DoublePrecision> result(uninitialized); + result.value = FusedMultiplySubtract(a, b, c); + result.error = FusedMultiplySubtract(a, b, result.value + c); return result; } else { auto result = VeltkampDekkerProduct(a, b); @@ -296,8 +305,9 @@ DoublePrecision> TwoProductNegatedAdd(T const& a, (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedNegatedMultiplyAdd; using quantities::_elementary_functions::FusedNegatedMultiplySubtract; - DoublePrecision> result(FusedNegatedMultiplyAdd(a, b, c)); - result.error = FusedNegatedMultiplySubtract(a, b, (result.value - c)); + DoublePrecision> result(uninitialized); + result.value = FusedNegatedMultiplyAdd(a, b, c); + result.error = FusedNegatedMultiplySubtract(a, b, result.value - c); return result; } else { auto result = VeltkampDekkerProduct(-a, b); @@ -315,9 +325,9 @@ TwoProductNegatedSubtract(T const& a, if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedNegatedMultiplySubtract; - DoublePrecision> result( - FusedNegatedMultiplySubtract(a, b, c)); - result.error = FusedNegatedMultiplySubtract(a, b, (result.value + c)); + DoublePrecision> result(uninitialized); + result.value = FusedNegatedMultiplySubtract(a, b, c); + result.error = FusedNegatedMultiplySubtract(a, b, result.value + c); return result; } else { auto result = VeltkampDekkerProduct(-a, b); diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 232673d634..72c77679ca 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -1,6 +1,8 @@ +#pragma once + #include "numerics/sin_cos.hpp" -#include +#include #include "numerics/accurate_tables.mathematica.h" #include "numerics/double_precision.hpp" @@ -26,6 +28,11 @@ constexpr Argument table_spacing_reciprocal = 512.0; template using Polynomial1 = HornerEvaluator; +namespace masks { +static const __m128d sign_bit = + _mm_castsi128_pd(_mm_cvtsi64_si128(0x8000'0000'0000'0000)); +} // namespace masks + // TODO(phl): Take the perturbation into account in the polynomials. template @@ -52,21 +59,27 @@ Value CosPolynomial(Argument const x) { template FORCE_INLINE(inline) Value SinImplementation(Argument const x) { - if (x < sin_near_zero_cutoff) { + __m128d x_0 = _mm_set_sd(x); + __m128d const sign = _mm_and_pd(masks::sign_bit, x_0); + x_0 = _mm_andnot_pd(masks::sign_bit, x_0); + double const abs_x = _mm_cvtsd_f64(x_0); + if (abs_x < sin_near_zero_cutoff) { double const x² = x * x; double const x³ = x² * x; return x + x³ * SinPolynomialNearZero(x²); } else { - std::int64_t const i = std::lround(x * table_spacing_reciprocal); + auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal)); auto const& accurate_values = SinCosAccurateTable[i]; double const& x₀ = accurate_values.x; - double const& sin_x₀ = accurate_values.sin_x; + double const& sin_x₀ = + _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign)); double const& cos_x₀ = accurate_values.cos_x; - double const h = x - x₀; + double const abs_h = abs_x - x₀; + double const h = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(abs_h), sign)); DoublePrecision const sin_x₀_plus_h_cos_x₀ = TwoProductAdd(cos_x₀, h, sin_x₀); - double const h² = h * h; + double const h² = abs_h * abs_h; double const h³ = h² * h; return sin_x₀_plus_h_cos_x₀.value + ((sin_x₀ * h² * CosPolynomial(h²) + @@ -78,12 +91,13 @@ Value SinImplementation(Argument const x) { template FORCE_INLINE(inline) Value CosImplementation(Argument const x) { - std::int64_t const i = std::lround(x * table_spacing_reciprocal); + double const abs_x = std::abs(x); + auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal)); auto const& accurate_values = SinCosAccurateTable[i]; double const& x₀ = accurate_values.x; double const& sin_x₀ = accurate_values.sin_x; double const& cos_x₀ = accurate_values.cos_x; - double const h = x - x₀; + double const h = abs_x - x₀; DoublePrecision const cos_x₀_minus_h_sin_x₀ = TwoProductNegatedAdd(sin_x₀, h, cos_x₀); @@ -95,12 +109,18 @@ Value CosImplementation(Argument const x) { cos_x₀_minus_h_sin_x₀.error); } -Value Sin(Argument const x) { +#if PRINCIPIA_INLINE_SIN_COS +inline +#endif +Value __cdecl Sin(Argument const x) { return UseHardwareFMA ? SinImplementation(x) : SinImplementation(x); } -Value Cos(Argument const x) { +#if PRINCIPIA_INLINE_SIN_COS +inline +#endif +Value __cdecl Cos(Argument const x) { return UseHardwareFMA ? CosImplementation(x) : CosImplementation(x); } diff --git a/numerics/sin_cos.hpp b/numerics/sin_cos.hpp index baeae24799..a500c695e9 100644 --- a/numerics/sin_cos.hpp +++ b/numerics/sin_cos.hpp @@ -5,8 +5,16 @@ namespace numerics { namespace _sin_cos { namespace internal { -double Sin(double x); -double Cos(double x); +#define PRINCIPIA_INLINE_SIN_COS 0 + +#if PRINCIPIA_INLINE_SIN_COS +inline +#endif +double __cdecl Sin(double x); +#if PRINCIPIA_INLINE_SIN_COS +inline +#endif +double __cdecl Cos(double x); } // namespace internal @@ -16,3 +24,7 @@ using internal::Sin; } // namespace _sin_cos } // namespace numerics } // namespace principia + +#if PRINCIPIA_INLINE_SIN_COS +#include "numerics/sin_cos.cpp" +#endif