Skip to content

Commit

Permalink
Merge pull request #4111 from pleroy/Benchmark2
Browse files Browse the repository at this point in the history
A benchmark for our trigonometric functions and a few performance improvements
  • Loading branch information
pleroy authored Oct 7, 2024
2 parents 18696df + d8be89d commit 2a14d1c
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 36 deletions.
14 changes: 13 additions & 1 deletion benchmarks/elementary_functions_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "benchmarks/metric.hpp"
#include "functions/cos.hpp"
#include "functions/sin.hpp"
#include "numerics/sin_cos.hpp"
#include "quantities/numbers.hpp" // 🧙 For π.

namespace principia {
Expand All @@ -17,6 +18,7 @@ namespace functions {
using namespace principia::benchmarks::_metric;
using namespace principia::functions::_cos;
using namespace principia::functions::_sin;
using namespace principia::numerics::_sin_cos;

static constexpr std::int64_t number_of_iterations = 1000;

Expand All @@ -26,7 +28,7 @@ void BM_EvaluateElementaryFunction(benchmark::State& state) {
using Argument = double;

std::mt19937_64 random(42);
std::uniform_real_distribution<> uniformly_at(-π / 4, π / 4);
std::uniform_real_distribution<> uniformly_at(0, π / 4);

Argument a[number_of_iterations];
for (std::int64_t i = 0; i < number_of_iterations; ++i) {
Expand Down Expand Up @@ -79,6 +81,11 @@ BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, cr_sin)
BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, cr_sin)
->Unit(benchmark::kNanosecond);

BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, Sin)
->Unit(benchmark::kNanosecond);
BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, Sin)
->Unit(benchmark::kNanosecond);

BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, std::cos)
->Unit(benchmark::kNanosecond);
BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, std::cos)
Expand All @@ -89,5 +96,10 @@ BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, cr_cos)
BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, cr_cos)
->Unit(benchmark::kNanosecond);

BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Latency, Cos)
->Unit(benchmark::kNanosecond);
BENCHMARK_TEMPLATE(BM_EvaluateElementaryFunction, Metric::Throughput, Cos)
->Unit(benchmark::kNanosecond);

} // namespace functions
} // namespace principia
8 changes: 4 additions & 4 deletions functions/sin_cos_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class SinCosTest : public ::testing::Test {};
TEST_F(SinCosTest, Random) {
std::mt19937_64 random(42);
// TODO(phl): Negative angles.
std::uniform_real_distribution<> uniformly_at(0, π / 4);
std::uniform_real_distribution<> uniformly_at(-π / 4, π / 4);

cpp_bin_float_50 max_sin_ulps_error = 0;
cpp_bin_float_50 max_cos_ulps_error = 0;
Expand All @@ -34,7 +34,7 @@ TEST_F(SinCosTest, Random) {
#if _DEBUG
static constexpr std::int64_t iterations = 100;
#else
static constexpr std::int64_t iterations = 400'000;
static constexpr std::int64_t iterations = 300'000;
#endif

for (std::int64_t i = 0; i < iterations; ++i) {
Expand Down Expand Up @@ -69,8 +69,8 @@ TEST_F(SinCosTest, Random) {
}

// This implementation is not quite correctly rounded, but not far from it.
EXPECT_LE(max_sin_ulps_error, 0.500002);
EXPECT_LE(max_cos_ulps_error, 0.500002);
EXPECT_LE(max_sin_ulps_error, 0.500003);
EXPECT_LE(max_cos_ulps_error, 0.500001);

LOG(ERROR) << "Sin error: " << max_sin_ulps_error << std::setprecision(25)
<< " ulps for argument: " << worst_sin_argument
Expand Down
21 changes: 14 additions & 7 deletions numerics/double_precision.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <string>

#include "base/not_null.hpp"
#include "base/tags.hpp"
#include "numerics/fma.hpp"
#include "quantities/named_quantities.hpp"
#include "quantities/quantities.hpp"
Expand All @@ -14,6 +15,7 @@ namespace _double_precision {
namespace internal {

using namespace principia::base::_not_null;
using namespace principia::base::_tags;
using namespace principia::numerics::_fma;
using namespace principia::quantities::_named_quantities;
using namespace principia::quantities::_quantities;
Expand All @@ -22,8 +24,9 @@ using namespace principia::quantities::_quantities;
// type of the value must be an affine space. The notations follow [HLB08].
template<typename T>
struct DoublePrecision final {
constexpr DoublePrecision() = default;
constexpr DoublePrecision();

explicit constexpr DoublePrecision(uninitialized_t);
explicit constexpr DoublePrecision(T const& value);

// This is correct assuming that left and right have non-overlapping
Expand All @@ -47,8 +50,8 @@ struct DoublePrecision final {
static DoublePrecision ReadFromMessage(
serialization::DoublePrecision const& message);

T value{};
Difference<T> error{};
T value;
Difference<T> error;
};

// `scale` must be a signed power of two or zero.
Expand All @@ -64,25 +67,29 @@ DoublePrecision<Product<T, U>> Scale(T const& scale,
template<FMAPolicy fma_policy = FMAPolicy::Auto, typename T, typename U>
DoublePrecision<Product<T, U>> TwoProduct(T const& a, U const& b);

// Returns the exact value of `a * b + c`.
// Returns the exact value of `a * b + c` if `|a * b|` is small compared to
// `|c|`. See [SZ05], section 2.1.
template<FMAPolicy fma_policy = FMAPolicy::Auto, typename T, typename U>
DoublePrecision<Product<T, U>> TwoProductAdd(T const& a,
U const& b,
Product<T, U> const& c);

// Returns the exact value of `a * b - c`.
// Returns the exact value of `a * b - c` if `|a * b|` is small compared to
// `|c|`. See [SZ05], section 2.1.
template<FMAPolicy fma_policy = FMAPolicy::Auto, typename T, typename U>
DoublePrecision<Product<T, U>> TwoProductSubtract(T const& a,
U const& b,
Product<T, U> const& c);

// Returns the exact value of `-a * b + c`.
// Returns the exact value of `-a * b + c` if `|a * b|` is small compared to
// `|c|`. See [SZ05], section 2.1.
template<FMAPolicy fma_policy = FMAPolicy::Auto, typename T, typename U>
DoublePrecision<Product<T, U>> TwoProductNegatedAdd(T const& a,
U const& b,
Product<T, U> const& c);

// Returns the exact value of `-a * b - c`.
// Returns the exact value of `-a * b - c` if `|a * b|` is small compared to
// `|c|`. See [SZ05], section 2.1.
template<FMAPolicy fma_policy = FMAPolicy::Auto, typename T, typename U>
DoublePrecision<Product<T, U>>
TwoProductNegatedSubtract(T const& a,
Expand Down
34 changes: 22 additions & 12 deletions numerics/double_precision_body.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ struct ComponentwiseComparator<T, U> {
};


template<typename T>
constexpr DoublePrecision<T>::DoublePrecision() : value{}, error{} {}

template<typename T>
constexpr DoublePrecision<T>::DoublePrecision(uninitialized_t) {}

template<typename T>
constexpr DoublePrecision<T>::DoublePrecision(T const& value)
: value(value),
Expand Down Expand Up @@ -199,7 +205,7 @@ DoublePrecision<Product<T, U>> Scale(T const & scale,
CHECK_EQ(0.5, std::fabs(mantissa)) << scale;
}
#endif
DoublePrecision<Product<T, U>> result;
DoublePrecision<Product<T, U>> result(uninitialized);
result.value = right.value * scale;
result.error = right.error * scale;
return result;
Expand All @@ -208,7 +214,7 @@ DoublePrecision<Product<T, U>> Scale(T const & scale,
template<typename T, typename U>
constexpr DoublePrecision<Product<T, U>> VeltkampDekkerProduct(T const& a,
U const& b) {
DoublePrecision<Product<T, U>> result;
DoublePrecision<Product<T, U>> result(uninitialized);
auto const& x = a;
auto const& y = b;
auto& z = result.value;
Expand Down Expand Up @@ -242,7 +248,8 @@ DoublePrecision<Product<T, U>> TwoProduct(T const& a, U const& b) {
if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
using quantities::_elementary_functions::FusedMultiplySubtract;
DoublePrecision<Product<T, U>> result(a * b);
DoublePrecision<Product<T, U>> result(uninitialized);
result.value = a * b;
result.error = FusedMultiplySubtract(a, b, result.value);
return result;
} else {
Expand All @@ -259,8 +266,9 @@ DoublePrecision<Product<T, U>> TwoProductAdd(T const& a,
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
using quantities::_elementary_functions::FusedMultiplyAdd;
using quantities::_elementary_functions::FusedMultiplySubtract;
DoublePrecision<Product<T, U>> result(FusedMultiplyAdd(a, b, c));
result.error = FusedMultiplySubtract(a, b, (result.value - c));
DoublePrecision<Product<T, U>> result(uninitialized);
result.value = FusedMultiplyAdd(a, b, c);
result.error = FusedMultiplySubtract(a, b, result.value - c);
return result;
} else {
auto result = VeltkampDekkerProduct(a, b);
Expand All @@ -277,8 +285,9 @@ DoublePrecision<Product<T, U>> TwoProductSubtract(T const& a,
if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
using quantities::_elementary_functions::FusedMultiplySubtract;
DoublePrecision<Product<T, U>> result(FusedMultiplySubtract(a, b, c));
result.error = FusedMultiplySubtract(a, b, (result.value + c));
DoublePrecision<Product<T, U>> result(uninitialized);
result.value = FusedMultiplySubtract(a, b, c);
result.error = FusedMultiplySubtract(a, b, result.value + c);
return result;
} else {
auto result = VeltkampDekkerProduct(a, b);
Expand All @@ -296,8 +305,9 @@ DoublePrecision<Product<T, U>> TwoProductNegatedAdd(T const& a,
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
using quantities::_elementary_functions::FusedNegatedMultiplyAdd;
using quantities::_elementary_functions::FusedNegatedMultiplySubtract;
DoublePrecision<Product<T, U>> result(FusedNegatedMultiplyAdd(a, b, c));
result.error = FusedNegatedMultiplySubtract(a, b, (result.value - c));
DoublePrecision<Product<T, U>> result(uninitialized);
result.value = FusedNegatedMultiplyAdd(a, b, c);
result.error = FusedNegatedMultiplySubtract(a, b, result.value - c);
return result;
} else {
auto result = VeltkampDekkerProduct(-a, b);
Expand All @@ -315,9 +325,9 @@ TwoProductNegatedSubtract(T const& a,
if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
using quantities::_elementary_functions::FusedNegatedMultiplySubtract;
DoublePrecision<Product<T, U>> result(
FusedNegatedMultiplySubtract(a, b, c));
result.error = FusedNegatedMultiplySubtract(a, b, (result.value + c));
DoublePrecision<Product<T, U>> result(uninitialized);
result.value = FusedNegatedMultiplySubtract(a, b, c);
result.error = FusedNegatedMultiplySubtract(a, b, result.value + c);
return result;
} else {
auto result = VeltkampDekkerProduct(-a, b);
Expand Down
40 changes: 30 additions & 10 deletions numerics/sin_cos.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#pragma once

#include "numerics/sin_cos.hpp"

#include <cmath>
#include <pmmintrin.h>

#include "numerics/accurate_tables.mathematica.h"
#include "numerics/double_precision.hpp"
Expand All @@ -26,6 +28,11 @@ constexpr Argument table_spacing_reciprocal = 512.0;
template<FMAPolicy fma_policy>
using Polynomial1 = HornerEvaluator<Value, Argument, 1, fma_policy>;

namespace masks {
static const __m128d sign_bit =
_mm_castsi128_pd(_mm_cvtsi64_si128(0x8000'0000'0000'0000));
} // namespace masks

// TODO(phl): Take the perturbation into account in the polynomials.

template<FMAPolicy fma_policy>
Expand All @@ -52,21 +59,27 @@ Value CosPolynomial(Argument const x) {
template<FMAPolicy fma_policy>
FORCE_INLINE(inline)
Value SinImplementation(Argument const x) {
if (x < sin_near_zero_cutoff) {
__m128d x_0 = _mm_set_sd(x);
__m128d const sign = _mm_and_pd(masks::sign_bit, x_0);
x_0 = _mm_andnot_pd(masks::sign_bit, x_0);
double const abs_x = _mm_cvtsd_f64(x_0);
if (abs_x < sin_near_zero_cutoff) {
double const x² = x * x;
double const x³ = x² * x;
return x + x³ * SinPolynomialNearZero<fma_policy>(x²);
} else {
std::int64_t const i = std::lround(x * table_spacing_reciprocal);
auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal));
auto const& accurate_values = SinCosAccurateTable[i];
double const& x₀ = accurate_values.x;
double const& sin_x₀ = accurate_values.sin_x;
double const& sin_x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
double const& cos_x₀ = accurate_values.cos_x;
double const h = x - x₀;
double const abs_h = abs_x - x₀;
double const h = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(abs_h), sign));
DoublePrecision<double> const sin_x₀_plus_h_cos_x₀ =
TwoProductAdd<fma_policy>(cos_x₀, h, sin_x₀);
double const h² = h * h;
double const h² = abs_h * abs_h;
double const h³ = h² * h;
return sin_x₀_plus_h_cos_x₀.value +
((sin_x₀ * h² * CosPolynomial<fma_policy>(h²) +
Expand All @@ -78,12 +91,13 @@ Value SinImplementation(Argument const x) {
template<FMAPolicy fma_policy>
FORCE_INLINE(inline)
Value CosImplementation(Argument const x) {
std::int64_t const i = std::lround(x * table_spacing_reciprocal);
double const abs_x = std::abs(x);
auto const i = _mm_cvtsd_si64(_mm_set_sd(abs_x * table_spacing_reciprocal));
auto const& accurate_values = SinCosAccurateTable[i];
double const& x₀ = accurate_values.x;
double const& sin_x₀ = accurate_values.sin_x;
double const& cos_x₀ = accurate_values.cos_x;
double const h = x - x₀;
double const h = abs_x - x₀;
DoublePrecision<double> const cos_x₀_minus_h_sin_x₀ =
TwoProductNegatedAdd<fma_policy>(sin_x₀, h, cos_x₀);
Expand All @@ -95,12 +109,18 @@ Value CosImplementation(Argument const x) {
cos_x₀_minus_h_sin_x₀.error);
}
Value Sin(Argument const x) {
#if PRINCIPIA_INLINE_SIN_COS
inline
#endif
Value __cdecl Sin(Argument const x) {
return UseHardwareFMA ? SinImplementation<FMAPolicy::Force>(x)
: SinImplementation<FMAPolicy::Disallow>(x);
}
Value Cos(Argument const x) {
#if PRINCIPIA_INLINE_SIN_COS
inline
#endif
Value __cdecl Cos(Argument const x) {
return UseHardwareFMA ? CosImplementation<FMAPolicy::Force>(x)
: CosImplementation<FMAPolicy::Disallow>(x);
}
Expand Down
16 changes: 14 additions & 2 deletions numerics/sin_cos.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,16 @@ namespace numerics {
namespace _sin_cos {
namespace internal {

double Sin(double x);
double Cos(double x);
#define PRINCIPIA_INLINE_SIN_COS 0

#if PRINCIPIA_INLINE_SIN_COS
inline
#endif
double __cdecl Sin(double x);
#if PRINCIPIA_INLINE_SIN_COS
inline
#endif
double __cdecl Cos(double x);

} // namespace internal

Expand All @@ -16,3 +24,7 @@ using internal::Sin;
} // namespace _sin_cos
} // namespace numerics
} // namespace principia

#if PRINCIPIA_INLINE_SIN_COS
#include "numerics/sin_cos.cpp"
#endif

0 comments on commit 2a14d1c

Please sign in to comment.