Skip to content

Commit

Permalink
Merge pull request #4112 from pleroy/Reduction
Browse files Browse the repository at this point in the history
Cody-Waite argument reduction for trigonometric functions
  • Loading branch information
pleroy authored Oct 11, 2024
2 parents 2a14d1c + 78c2598 commit ab9f43c
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 32 deletions.
2 changes: 1 addition & 1 deletion benchmarks/elementary_functions_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void BM_EvaluateElementaryFunction(benchmark::State& state) {
using Argument = double;

std::mt19937_64 random(42);
std::uniform_real_distribution<> uniformly_at(0, π / 4);
std::uniform_real_distribution<> uniformly_at(-2 * π, 2 * π);

Argument a[number_of_iterations];
for (std::int64_t i = 0; i < number_of_iterations; ++i) {
Expand Down
11 changes: 11 additions & 0 deletions documentation/bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,17 @@ @article{Fukushima2012b
volume = {236},
}

@article{GalBachelis1991,
author = {Gal, Shmuel and Bachelis, Boris},
publisher = {ACM New York, NY, USA},
date = {1991-03},
journaltitle = {ACM Transactions on Mathematical Software},
number = {1},
pages = {26--45},
title = {An Accurate Elementary Mathematical Library for the IEEE Floating Point Standard},
volume = {17},
}

@article{Gander1985,
author = {Gander, Walter},
date = {1985-02},
Expand Down
Binary file modified documentation/bibliography.pdf
Binary file not shown.
25 changes: 19 additions & 6 deletions functions/sin_cos_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,14 @@ class SinCosTest : public ::testing::Test {};

TEST_F(SinCosTest, Random) {
std::mt19937_64 random(42);
// TODO(phl): Negative angles.
std::uniform_real_distribution<> uniformly_at(-π / 4, π / 4);
std::uniform_real_distribution<> uniformly_at(-2 * π, 2 * π);

cpp_bin_float_50 max_sin_ulps_error = 0;
cpp_bin_float_50 max_cos_ulps_error = 0;
double worst_sin_argument = 0;
double worst_cos_argument = 0;
std::int64_t incorrectly_rounded_sin = 0;
std::int64_t incorrectly_rounded_cos = 0;

#if _DEBUG
static constexpr std::int64_t iterations = 100;
Expand All @@ -52,6 +53,9 @@ TEST_F(SinCosTest, Random) {
max_sin_ulps_error = sin_ulps_error;
worst_sin_argument = principia_argument;
}
if (sin_ulps_error > 0.5) {
++incorrectly_rounded_sin;
}
}
{
auto const boost_cos =
Expand All @@ -65,19 +69,28 @@ TEST_F(SinCosTest, Random) {
max_cos_ulps_error = cos_ulps_error;
worst_cos_argument = principia_argument;
}
if (cos_ulps_error > 0.5) {
++incorrectly_rounded_cos;
}
}
}

// This implementation is not quite correctly rounded, but not far from it.
EXPECT_LE(max_sin_ulps_error, 0.500003);
EXPECT_LE(max_cos_ulps_error, 0.500001);
EXPECT_LE(max_sin_ulps_error, 0.500002);
EXPECT_LE(max_cos_ulps_error, 0.500002);
EXPECT_LE(incorrectly_rounded_sin, 1);
EXPECT_LE(incorrectly_rounded_cos, 1);

LOG(ERROR) << "Sin error: " << max_sin_ulps_error << std::setprecision(25)
<< " ulps for argument: " << worst_sin_argument
<< " value: " << Sin(worst_sin_argument);
<< " value: " << Sin(worst_sin_argument)
<< "; incorrectly rounded: " << std::setprecision(3)
<< incorrectly_rounded_sin / static_cast<double>(iterations);
LOG(ERROR) << "Cos error: " << max_cos_ulps_error << std::setprecision(25)
<< " ulps for argument: " << worst_cos_argument
<< " value: " << Cos(worst_cos_argument);
<< " value: " << Cos(worst_cos_argument)
<< "; incorrectly rounded: " << std::setprecision(3)
<< incorrectly_rounded_cos / static_cast<double>(iterations);
}

} // namespace _sin_cos
Expand Down
7 changes: 7 additions & 0 deletions numerics/double_precision.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ constexpr DoublePrecision<Product<T, U>> VeltkampDekkerProduct(T const& a,
template<typename T, typename U>
constexpr DoublePrecision<Sum<T, U>> QuickTwoSum(T const& a, U const& b);

// Computes the exact difference of a and b. The arguments must be such that
// |a| >= |b| or a == 0.
template<typename T, typename U>
constexpr DoublePrecision<Difference<T, U>> QuickTwoDifference(T const& a,
U const& b);

// Computes the exact sum of a and b.
template<typename T, typename U>
constexpr DoublePrecision<Sum<T, U>> TwoSum(T const& a, U const& b);
Expand Down Expand Up @@ -193,6 +199,7 @@ std::ostream& operator<<(std::ostream& os,
} // namespace internal

using internal::DoublePrecision;
using internal::QuickTwoDifference;
using internal::QuickTwoSum;
using internal::TwoDifference;
using internal::TwoProduct;
Expand Down
24 changes: 21 additions & 3 deletions numerics/double_precision_body.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,18 +346,36 @@ DoublePrecision<Sum<T, U>> QuickTwoSum(T const& a, U const& b) {
<< "|" << DebugString(a) << "| < |" << DebugString(b) << "|";
#endif
// [HLB07].
DoublePrecision<Sum<T, U>> result;
DoublePrecision<Sum<T, U>> result{uninitialized};
auto& s = result.value;
auto& e = result.error;
s = a + b;
e = b - (s - a);
return result;
}

template<typename T, typename U>
FORCE_INLINE(constexpr)
DoublePrecision<Difference<T, U>> QuickTwoDifference(T const& a, U const& b) {
#if _DEBUG
using quantities::_quantities::DebugString;
using Comparator = ComponentwiseComparator<T, U>;
CHECK(Comparator::GreaterThanOrEqualOrZero(a, b))
<< "|" << DebugString(a) << "| < |" << DebugString(b) << "|";
#endif
// [HLB07].
DoublePrecision<Sum<T, U>> result{uninitialized};
auto& s = result.value;
auto& e = result.error;
s = a - b;
e = (a - s) - b;
return result;
}

template<typename T, typename U>
constexpr DoublePrecision<Sum<T, U>> TwoSum(T const& a, U const& b) {
// [HLB07].
DoublePrecision<Sum<T, U>> result;
DoublePrecision<Sum<T, U>> result{uninitialized};
auto& s = result.value;
auto& e = result.error;
s = a + b;
Expand All @@ -374,7 +392,7 @@ constexpr DoublePrecision<Difference<T, U>> TwoDifference(T const& a,
"Template metaprogramming went wrong");
using Point = T;
using Vector = Difference<T, U>;
DoublePrecision<Vector> result;
DoublePrecision<Vector> result{uninitialized};
Vector& s = result.value;
Vector& e = result.error;
s = a - b;
Expand Down
144 changes: 122 additions & 22 deletions numerics/sin_cos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "numerics/double_precision.hpp"
#include "numerics/fma.hpp"
#include "numerics/polynomial_evaluators.hpp"
#include "quantities/elementary_functions.hpp"

namespace principia {
namespace numerics {
Expand All @@ -18,13 +19,20 @@ using namespace principia::numerics::_accurate_tables;
using namespace principia::numerics::_double_precision;
using namespace principia::numerics::_fma;
using namespace principia::numerics::_polynomial_evaluators;
using namespace principia::quantities::_elementary_functions;

using Argument = double;
using Value = double;

constexpr Argument sin_near_zero_cutoff = 1.0 / 1024.0;
constexpr Argument table_spacing_reciprocal = 512.0;

// π / 2 split so that the high half has 18 zeros at the end of its mantissa.
constexpr std::int64_t π_over_2_zeroes = 18;
constexpr Argument π_over_2_threshold = π / 2 * (1 << π_over_2_zeroes);
constexpr Argument π_over_2_high = 0x1.921fb'5444p0;
constexpr Argument π_over_2_low = 0x2.d1846'9898c'c5170'1b839p-40;

template<FMAPolicy fma_policy>
using Polynomial1 = HornerEvaluator<Value, Argument, 1, fma_policy>;

Expand All @@ -33,6 +41,42 @@ static const __m128d sign_bit =
_mm_castsi128_pd(_mm_cvtsi64_si128(0x8000'0000'0000'0000));
} // namespace masks

template<FMAPolicy fma_policy>
double FusedMultiplyAdd(double const a, double const b, double const c) {
if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
using quantities::_elementary_functions::FusedMultiplyAdd;
return FusedMultiplyAdd(a, b, c);
} else {
return a * b + c;
}
}

void Reduce(Argument const x,
DoublePrecision<Argument>& x_reduced,
std::int64_t& quadrant) {
if (x < π / 4 && x > -π / 4) {
x_reduced.value = x;
x_reduced.error = 0;
quadrant = 0;
} else if (x <= π_over_2_threshold && x >= -π_over_2_threshold) {
// We are not very sensitive to rounding errors in this expression, because
// in the worst case it could cause the reduced angle to jump from the
// vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment
// of the quadrant.
__m128d const n_128d = _mm_round_sd(
_mm_setzero_pd(), _mm_set_sd(x * (2 / π)), _MM_FROUND_RINT);
double const n_double = _mm_cvtsd_f64(n_128d);
std::int64_t const n = _mm_cvtsd_si64(n_128d);
Argument const value = x - n_double * π_over_2_high;
Argument const error = n_double * π_over_2_low;
x_reduced = QuickTwoDifference(value, error);
// TODO(phl): Check for value too small.
quadrant = n & 0b11;
}
// TODO(phl): Fallback to a more precise algorithm.
}

// TODO(phl): Take the perturbation into account in the polynomials.

template<FMAPolicy fma_policy>
Expand All @@ -58,71 +102,127 @@ Value CosPolynomial(Argument const x) {
template<FMAPolicy fma_policy>
FORCE_INLINE(inline)
Value SinImplementation(Argument const x) {
__m128d x_0 = _mm_set_sd(x);
__m128d const sign = _mm_and_pd(masks::sign_bit, x_0);
x_0 = _mm_andnot_pd(masks::sign_bit, x_0);
double const abs_x = _mm_cvtsd_f64(x_0);
Value SinImplementation(DoublePrecision<Argument> const argument) {
auto const& x = argument.value;
auto const& e = argument.error;
double const abs_x = std::abs(x);
if (abs_x < sin_near_zero_cutoff) {
double const x² = x * x;
double const x³ = x² * x;
return x + x³ * SinPolynomialNearZero<fma_policy>(x²);
return x + FusedMultiplyAdd<fma_policy>(
x³, SinPolynomialNearZero<fma_policy>(x²), e);
} else {
__m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x));
auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal));
auto const& accurate_values = SinCosAccurateTable[i];
double const& x₀ = accurate_values.x;
double const& sin_x₀ =
double const x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign));
double const sin_x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
double const& cos_x₀ = accurate_values.cos_x;
double const abs_h = abs_x - x₀;
double const h = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(abs_h), sign));
// [GB91] incorporates `e` in the computation of `h`. However, `x` and `e`
// don't overlap and in the first interval `x` and `h` may be of the same
// order of magnitude. Instead we incorporate the terms in `e` and `e * h`
// later in the computation.
double const h = x - x₀;

DoublePrecision<double> const sin_x₀_plus_h_cos_x₀ =
TwoProductAdd<fma_policy>(cos_x₀, h, sin_x₀);
double const h² = abs_h * abs_h;
double const h² = h * (h + 2 * e);
double const h³ = h² * h;
return sin_x₀_plus_h_cos_x₀.value +
((sin_x₀ * h² * CosPolynomial<fma_policy>(h²) +
cos_x₀ * h³ * SinPolynomial<fma_policy>(h²)) +
cos_x₀ * FusedMultiplyAdd<fma_policy>(
h³, SinPolynomial<fma_policy>(h²), e)) +
sin_x₀_plus_h_cos_x₀.error);
}
}

template<FMAPolicy fma_policy>
FORCE_INLINE(inline)
Value CosImplementation(Argument const x) {
Value CosImplementation(DoublePrecision<Argument> const argument) {
auto const& x = argument.value;
auto const& e = argument.error;
double const abs_x = std::abs(x);
__m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x));
auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal));
auto const& accurate_values = SinCosAccurateTable[i];
double const& x₀ = accurate_values.x;
double const& sin_x₀ = accurate_values.sin_x;
double const x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign));
double const sin_x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
double const& cos_x₀ = accurate_values.cos_x;
double const h = abs_x - x₀;
// [GB91] incorporates `e` in the computation of `h`. However, `x` and `e`
// don't overlap and in the first interval `x` and `h` may be of the same
// order of magnitude. Instead we incorporate the terms in `e` and `e * h`
// later in the computation.
double const h = x - x₀;

DoublePrecision<double> const cos_x₀_minus_h_sin_x₀ =
TwoProductNegatedAdd<fma_policy>(sin_x₀, h, cos_x₀);
double const h² = h * h;
double const h² = h * (h + 2 * e);
double const h³ = h² * h;
return cos_x₀_minus_h_sin_x₀.value +
((cos_x₀ * h² * CosPolynomial<fma_policy>(h²) -
sin_x₀ * h³ * SinPolynomial<fma_policy>(h²)) +
sin_x₀ * FusedMultiplyAdd<fma_policy>(
h³, SinPolynomial<fma_policy>(h²), e)) +
cos_x₀_minus_h_sin_x₀.error);
}

#if PRINCIPIA_INLINE_SIN_COS
inline
#endif
Value __cdecl Sin(Argument const x) {
return UseHardwareFMA ? SinImplementation<FMAPolicy::Force>(x)
: SinImplementation<FMAPolicy::Disallow>(x);
DoublePrecision<Argument> x_reduced;
std::int64_t quadrant;
Reduce(x, x_reduced, quadrant);
double value;
if (UseHardwareFMA) {
if (quadrant & 0b1) {
value = CosImplementation<FMAPolicy::Force>(x_reduced);
} else {
value = SinImplementation<FMAPolicy::Force>(x_reduced);
}
} else {
if (quadrant & 0b1) {
value = CosImplementation<FMAPolicy::Disallow>(x_reduced);
} else {
value = SinImplementation<FMAPolicy::Disallow>(x_reduced);
}
}
if (quadrant & 0b10) {
return -value;
} else {
return value;
}
}

#if PRINCIPIA_INLINE_SIN_COS
inline
#endif
Value __cdecl Cos(Argument const x) {
return UseHardwareFMA ? CosImplementation<FMAPolicy::Force>(x)
: CosImplementation<FMAPolicy::Disallow>(x);
DoublePrecision<Argument> x_reduced;
std::int64_t quadrant;
Reduce(x, x_reduced, quadrant);
double value;
if (UseHardwareFMA) {
if (quadrant & 0b1) {
value = SinImplementation<FMAPolicy::Force>(x_reduced);
} else {
value = CosImplementation<FMAPolicy::Force>(x_reduced);
}
} else {
if (quadrant & 0b1) {
value = SinImplementation<FMAPolicy::Disallow>(x_reduced);
} else {
value = CosImplementation<FMAPolicy::Disallow>(x_reduced);
}
}
if (quadrant == 1 || quadrant == 2) {
return -value;
} else {
return value;
}
}

} // namespace internal
Expand Down

0 comments on commit ab9f43c

Please sign in to comment.