Merge pull request #4112 from pleroy/Reduction

Cody-Waite argument reduction for trigonometric functions
mockingbirdnest · Oct 11, 2024 · ab9f43c · ab9f43c
2 parents 2a14d1c + 78c2598
commit ab9f43c
Show file tree

Hide file tree

Showing 7 changed files with 181 additions and 32 deletions.
diff --git a/benchmarks/elementary_functions_benchmark.cpp b/benchmarks/elementary_functions_benchmark.cpp
@@ -28,7 +28,7 @@ void BM_EvaluateElementaryFunction(benchmark::State& state) {
   using Argument = double;
 
   std::mt19937_64 random(42);
-  std::uniform_real_distribution<> uniformly_at(0, π / 4);
+  std::uniform_real_distribution<> uniformly_at(-2 * π, 2 * π);
 
   Argument a[number_of_iterations];
   for (std::int64_t i = 0; i < number_of_iterations; ++i) {

diff --git a/documentation/bibliography.bib b/documentation/bibliography.bib
@@ -718,6 +718,17 @@ @article{Fukushima2012b
   volume       = {236},
 }
 
+@article{GalBachelis1991,
+  author       = {Gal, Shmuel and Bachelis, Boris},
+  publisher    = {ACM New York, NY, USA},
+  date         = {1991-03},
+  journaltitle = {ACM Transactions on Mathematical Software},
+  number       = {1},
+  pages        = {26--45},
+  title        = {An Accurate Elementary Mathematical Library for the IEEE Floating Point Standard},
+  volume       = {17},
+}
+
 @article{Gander1985,
   author       = {Gander, Walter},
   date         = {1985-02},

diff --git a/documentation/bibliography.pdf b/documentation/bibliography.pdf
diff --git a/functions/sin_cos_test.cpp b/functions/sin_cos_test.cpp
@@ -23,13 +23,14 @@ class SinCosTest : public ::testing::Test {};
 
 TEST_F(SinCosTest, Random) {
   std::mt19937_64 random(42);
-  // TODO(phl): Negative angles.
-  std::uniform_real_distribution<> uniformly_at(-π / 4, π / 4);
+  std::uniform_real_distribution<> uniformly_at(-2 * π, 2 * π);
 
   cpp_bin_float_50 max_sin_ulps_error = 0;
   cpp_bin_float_50 max_cos_ulps_error = 0;
   double worst_sin_argument = 0;
   double worst_cos_argument = 0;
+  std::int64_t incorrectly_rounded_sin = 0;
+  std::int64_t incorrectly_rounded_cos = 0;
 
 #if _DEBUG
   static constexpr std::int64_t iterations = 100;
@@ -52,6 +53,9 @@ TEST_F(SinCosTest, Random) {
         max_sin_ulps_error = sin_ulps_error;
         worst_sin_argument = principia_argument;
       }
+      if (sin_ulps_error > 0.5) {
+        ++incorrectly_rounded_sin;
+      }
     }
     {
       auto const boost_cos =
@@ -65,19 +69,28 @@ TEST_F(SinCosTest, Random) {
         max_cos_ulps_error = cos_ulps_error;
         worst_cos_argument = principia_argument;
       }
+      if (cos_ulps_error > 0.5) {
+        ++incorrectly_rounded_cos;
+      }
     }
   }
 
   // This implementation is not quite correctly rounded, but not far from it.
-  EXPECT_LE(max_sin_ulps_error, 0.500003);
-  EXPECT_LE(max_cos_ulps_error, 0.500001);
+  EXPECT_LE(max_sin_ulps_error, 0.500002);
+  EXPECT_LE(max_cos_ulps_error, 0.500002);
+  EXPECT_LE(incorrectly_rounded_sin, 1);
+  EXPECT_LE(incorrectly_rounded_cos, 1);
 
   LOG(ERROR) << "Sin error: " << max_sin_ulps_error << std::setprecision(25)
              << " ulps for argument: " << worst_sin_argument
-             << " value: " << Sin(worst_sin_argument);
+             << " value: " << Sin(worst_sin_argument)
+             << "; incorrectly rounded: " << std::setprecision(3)
+             << incorrectly_rounded_sin / static_cast<double>(iterations);
   LOG(ERROR) << "Cos error: " << max_cos_ulps_error << std::setprecision(25)
              << " ulps for argument: " << worst_cos_argument
-             << " value: " << Cos(worst_cos_argument);
+             << " value: " << Cos(worst_cos_argument)
+             << "; incorrectly rounded: " << std::setprecision(3)
+             << incorrectly_rounded_cos / static_cast<double>(iterations);
 }
 
 }  // namespace _sin_cos

diff --git a/numerics/double_precision.hpp b/numerics/double_precision.hpp
@@ -106,6 +106,12 @@ constexpr DoublePrecision<Product<T, U>> VeltkampDekkerProduct(T const& a,
 template<typename T, typename U>
 constexpr DoublePrecision<Sum<T, U>> QuickTwoSum(T const& a, U const& b);
 
+// Computes the exact difference of a and b.  The arguments must be such that
+// |a| >= |b| or a == 0.
+template<typename T, typename U>
+constexpr DoublePrecision<Difference<T, U>> QuickTwoDifference(T const& a,
+                                                               U const& b);
+
 // Computes the exact sum of a and b.
 template<typename T, typename U>
 constexpr DoublePrecision<Sum<T, U>> TwoSum(T const& a, U const& b);
@@ -193,6 +199,7 @@ std::ostream& operator<<(std::ostream& os,
 }  // namespace internal
 
 using internal::DoublePrecision;
+using internal::QuickTwoDifference;
 using internal::QuickTwoSum;
 using internal::TwoDifference;
 using internal::TwoProduct;

diff --git a/numerics/double_precision_body.hpp b/numerics/double_precision_body.hpp
@@ -346,18 +346,36 @@ DoublePrecision<Sum<T, U>> QuickTwoSum(T const& a, U const& b) {
       << "|" << DebugString(a) << "| < |" << DebugString(b) << "|";
 #endif
   // [HLB07].
-  DoublePrecision<Sum<T, U>> result;
+  DoublePrecision<Sum<T, U>> result{uninitialized};
   auto& s = result.value;
   auto& e = result.error;
   s = a + b;
   e = b - (s - a);
   return result;
 }
 
+template<typename T, typename U>
+FORCE_INLINE(constexpr)
+DoublePrecision<Difference<T, U>> QuickTwoDifference(T const& a, U const& b) {
+#if _DEBUG
+  using quantities::_quantities::DebugString;
+  using Comparator = ComponentwiseComparator<T, U>;
+  CHECK(Comparator::GreaterThanOrEqualOrZero(a, b))
+      << "|" << DebugString(a) << "| < |" << DebugString(b) << "|";
+#endif
+  // [HLB07].
+  DoublePrecision<Sum<T, U>> result{uninitialized};
+  auto& s = result.value;
+  auto& e = result.error;
+  s = a - b;
+  e = (a - s) - b;
+  return result;
+}
+
 template<typename T, typename U>
 constexpr DoublePrecision<Sum<T, U>> TwoSum(T const& a, U const& b) {
   // [HLB07].
-  DoublePrecision<Sum<T, U>> result;
+  DoublePrecision<Sum<T, U>> result{uninitialized};
   auto& s = result.value;
   auto& e = result.error;
   s = a + b;
@@ -374,7 +392,7 @@ constexpr DoublePrecision<Difference<T, U>> TwoDifference(T const& a,
                 "Template metaprogramming went wrong");
   using Point = T;
   using Vector = Difference<T, U>;
-  DoublePrecision<Vector> result;
+  DoublePrecision<Vector> result{uninitialized};
   Vector& s = result.value;
   Vector& e = result.error;
   s = a - b;

diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp
@@ -8,6 +8,7 @@
 #include "numerics/double_precision.hpp"
 #include "numerics/fma.hpp"
 #include "numerics/polynomial_evaluators.hpp"
+#include "quantities/elementary_functions.hpp"
 
 namespace principia {
 namespace numerics {
@@ -18,13 +19,20 @@ using namespace principia::numerics::_accurate_tables;
 using namespace principia::numerics::_double_precision;
 using namespace principia::numerics::_fma;
 using namespace principia::numerics::_polynomial_evaluators;
+using namespace principia::quantities::_elementary_functions;
 
 using Argument = double;
 using Value = double;
 
 constexpr Argument sin_near_zero_cutoff = 1.0 / 1024.0;
 constexpr Argument table_spacing_reciprocal = 512.0;
 
+// π / 2 split so that the high half has 18 zeros at the end of its mantissa.
+constexpr std::int64_t π_over_2_zeroes = 18;
+constexpr Argument π_over_2_threshold = π / 2 * (1 << π_over_2_zeroes);
+constexpr Argument π_over_2_high = 0x1.921fb'5444p0;
+constexpr Argument π_over_2_low = 0x2.d1846'9898c'c5170'1b839p-40;
+
 template<FMAPolicy fma_policy>
 using Polynomial1 = HornerEvaluator<Value, Argument, 1, fma_policy>;
 
@@ -33,6 +41,42 @@ static const __m128d sign_bit =
     _mm_castsi128_pd(_mm_cvtsi64_si128(0x8000'0000'0000'0000));
 }  // namespace masks
 
+template<FMAPolicy fma_policy>
+double FusedMultiplyAdd(double const a, double const b, double const c) {
+  if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
+      (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
+    using quantities::_elementary_functions::FusedMultiplyAdd;
+    return FusedMultiplyAdd(a, b, c);
+  } else {
+    return a * b + c;
+  }
+}
+
+void Reduce(Argument const x,
+            DoublePrecision<Argument>& x_reduced,
+            std::int64_t& quadrant) {
+  if (x < π / 4 && x > -π / 4) {
+    x_reduced.value = x;
+    x_reduced.error = 0;
+    quadrant = 0;
+  } else if (x <= π_over_2_threshold && x >= -π_over_2_threshold) {
+    // We are not very sensitive to rounding errors in this expression, because
+    // in the worst case it could cause the reduced angle to jump from the
+    // vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment
+    // of the quadrant.
+    __m128d const n_128d = _mm_round_sd(
+        _mm_setzero_pd(), _mm_set_sd(x * (2 / π)), _MM_FROUND_RINT);
+    double const n_double = _mm_cvtsd_f64(n_128d);
+    std::int64_t const n = _mm_cvtsd_si64(n_128d);
+    Argument const value = x - n_double * π_over_2_high;
+    Argument const error = n_double * π_over_2_low;
+    x_reduced = QuickTwoDifference(value, error);
+    // TODO(phl): Check for value too small.
+    quadrant = n & 0b11;
+  }
+  // TODO(phl): Fallback to a more precise algorithm.
+}
+
 // TODO(phl): Take the perturbation into account in the polynomials.
 
 template<FMAPolicy fma_policy>
@@ -58,71 +102,127 @@ Value CosPolynomial(Argument const x) {
 
 template<FMAPolicy fma_policy>
 FORCE_INLINE(inline)
-Value SinImplementation(Argument const x) {
-  __m128d x_0 = _mm_set_sd(x);
-  __m128d const sign = _mm_and_pd(masks::sign_bit, x_0);
-  x_0 = _mm_andnot_pd(masks::sign_bit, x_0);
-  double const abs_x = _mm_cvtsd_f64(x_0);
+Value SinImplementation(DoublePrecision<Argument> const argument) {
+  auto const& x = argument.value;
+  auto const& e = argument.error;
+  double const abs_x = std::abs(x);
   if (abs_x < sin_near_zero_cutoff) {
     double const x² = x * x;
     double const x³ = x² * x;
-    return x + x³ * SinPolynomialNearZero<fma_policy>(x²);
+    return x + FusedMultiplyAdd<fma_policy>(
+                   x³, SinPolynomialNearZero<fma_policy>(x²), e);
   } else {
+    __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x));
     auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal));
     auto const& accurate_values = SinCosAccurateTable[i];
-    double const& x₀ = accurate_values.x;
-    double const& sin_x₀ =
+    double const x₀ =
+        _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign));
+    double const sin_x₀ =
         _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
     double const& cos_x₀ = accurate_values.cos_x;
-    double const abs_h = abs_x - x₀;
-    double const h = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(abs_h), sign));
+    // [GB91] incorporates `e` in the computation of `h`.  However, `x` and `e`
+    // don't overlap and in the first interval `x` and `h` may be of the same
+    // order of magnitude.  Instead we incorporate the terms in `e` and `e * h`
+    // later in the computation.
+    double const h = x - x₀;
 
     DoublePrecision<double> const sin_x₀_plus_h_cos_x₀ =
         TwoProductAdd<fma_policy>(cos_x₀, h, sin_x₀);
-    double const h² = abs_h * abs_h;
+    double const h² = h * (h + 2 * e);
     double const h³ = h² * h;
     return sin_x₀_plus_h_cos_x₀.value +
            ((sin_x₀ * h² * CosPolynomial<fma_policy>(h²) +
-             cos_x₀ * h³ * SinPolynomial<fma_policy>(h²)) +
+             cos_x₀ * FusedMultiplyAdd<fma_policy>(
+                          h³, SinPolynomial<fma_policy>(h²), e)) +
             sin_x₀_plus_h_cos_x₀.error);
   }
 }
 
 template<FMAPolicy fma_policy>
 FORCE_INLINE(inline)
-Value CosImplementation(Argument const x) {
+Value CosImplementation(DoublePrecision<Argument> const argument) {
+  auto const& x = argument.value;
+  auto const& e = argument.error;
   double const abs_x = std::abs(x);
+  __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x));
   auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal));
   auto const& accurate_values = SinCosAccurateTable[i];
-  double const& x₀ = accurate_values.x;
-  double const& sin_x₀ = accurate_values.sin_x;
+  double const x₀ =
+      _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign));
+  double const sin_x₀ =
+      _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
   double const& cos_x₀ = accurate_values.cos_x;
-  double const h = abs_x - x₀;
+  // [GB91] incorporates `e` in the computation of `h`.  However, `x` and `e`
+  // don't overlap and in the first interval `x` and `h` may be of the same
+  // order of magnitude.  Instead we incorporate the terms in `e` and `e * h`
+  // later in the computation.
+  double const h = x - x₀;
 
   DoublePrecision<double> const cos_x₀_minus_h_sin_x₀ =
       TwoProductNegatedAdd<fma_policy>(sin_x₀, h, cos_x₀);
-  double const h² = h * h;
+  double const h² = h * (h + 2 * e);
   double const h³ = h² * h;
   return cos_x₀_minus_h_sin_x₀.value +
          ((cos_x₀ * h² * CosPolynomial<fma_policy>(h²) -
-           sin_x₀ * h³ * SinPolynomial<fma_policy>(h²)) +
+           sin_x₀ * FusedMultiplyAdd<fma_policy>(
+                        h³, SinPolynomial<fma_policy>(h²), e)) +
           cos_x₀_minus_h_sin_x₀.error);
 }
 
 #if PRINCIPIA_INLINE_SIN_COS
 inline
 #endif
 Value __cdecl Sin(Argument const x) {
-  return UseHardwareFMA ? SinImplementation<FMAPolicy::Force>(x)
-                        : SinImplementation<FMAPolicy::Disallow>(x);
+  DoublePrecision<Argument> x_reduced;
+  std::int64_t quadrant;
+  Reduce(x, x_reduced, quadrant);
+  double value;
+  if (UseHardwareFMA) {
+    if (quadrant & 0b1) {
+      value = CosImplementation<FMAPolicy::Force>(x_reduced);
+    } else {
+      value = SinImplementation<FMAPolicy::Force>(x_reduced);
+    }
+  } else {
+    if (quadrant & 0b1) {
+      value = CosImplementation<FMAPolicy::Disallow>(x_reduced);
+    } else {
+      value = SinImplementation<FMAPolicy::Disallow>(x_reduced);
+    }
+  }
+  if (quadrant & 0b10) {
+    return -value;
+  } else {
+    return value;
+  }
 }
 
 #if PRINCIPIA_INLINE_SIN_COS
 inline
 #endif
 Value __cdecl Cos(Argument const x) {
-  return UseHardwareFMA ? CosImplementation<FMAPolicy::Force>(x)
-                        : CosImplementation<FMAPolicy::Disallow>(x);
+  DoublePrecision<Argument> x_reduced;
+  std::int64_t quadrant;
+  Reduce(x, x_reduced, quadrant);
+  double value;
+  if (UseHardwareFMA) {
+    if (quadrant & 0b1) {
+      value = SinImplementation<FMAPolicy::Force>(x_reduced);
+    } else {
+      value = CosImplementation<FMAPolicy::Force>(x_reduced);
+    }
+  } else {
+    if (quadrant & 0b1) {
+      value = SinImplementation<FMAPolicy::Disallow>(x_reduced);
+    } else {
+      value = CosImplementation<FMAPolicy::Disallow>(x_reduced);
+    }
+  }
+  if (quadrant == 1 || quadrant == 2) {
+    return -value;
+  } else {
+    return value;
+  }
 }
 
 }  // namespace internal