Skip to content

Commit

Permalink
Merge pull request #4157 from eggrobin/cbrt-osaca
Browse files Browse the repository at this point in the history
Move the *CA markup macros into their own header and mark up Cbrt
  • Loading branch information
eggrobin authored Jan 4, 2025
2 parents cc70c4f + 2f30c63 commit 8c5eb1e
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 227 deletions.
81 changes: 51 additions & 30 deletions numerics/cbrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "glog/logging.h"
#include "numerics/double_precision.hpp"
#include "numerics/fma.hpp"
#include "numerics/osaca.hpp" // 🧙 For OSACA_*.
#include "quantities/elementary_functions.hpp"

namespace principia {
Expand All @@ -22,6 +23,24 @@ using namespace principia::numerics::_double_precision;
using namespace principia::numerics::_fma;
using namespace principia::quantities::_elementary_functions;

#define OSACA_ANALYSED_FUNCTION Cbrt
#define OSACA_ANALYSED_FUNCTION_NAMESPACE method_5²Z4¹FMA::
#define OSACA_ANALYSED_FUNCTION_TEMPLATE_PARAMETERS <Rounding::Correct>
#define UNDER_OSACA_HYPOTHESES(expression) \
[&] { \
constexpr double y = 3; \
constexpr double abs_y = y == 0 ? 0 : y > 0 ? y : -y; \
/* Non-constexpr values have to be taken by reference (and must not be */ \
/* used). */ \
constexpr auto CorrectionPossiblyNeeded = [](double const& r₀, \
double const& r₁, \
double const& r̃, \
double const τ) -> bool { \
return false; \
}; \
return expression; \
}()

// The computations in this file are described in documentation/cbrt.pdf; the
// identifiers match the notation in that document.

Expand Down Expand Up @@ -133,31 +152,32 @@ static_assert(σ₁⁻³ * y₁ == y₂);
static_assert(σ₂⁻³ * y₂ == y₁);

template<Rounding rounding>
double Cbrt(double const y) {
double Cbrt(double y) {
OSACA_FUNCTION_BEGIN(y, <rounding>);
__m128d Y_0 = _mm_set_sd(y);
__m128d const sign = _mm_and_pd(masks::sign_bit, Y_0);
Y_0 = _mm_andnot_pd(masks::sign_bit, Y_0);
double const abs_y = _mm_cvtsd_f64(Y_0);

if (y != y) {
OSACA_IF(y != y) {
// The usual logic will produce a qNaN when given a NaN, but will not
// preserve the payload and will signal overflows (q will be a nonsensical
// large value, and q³ will overflow). Further, the rescaling comparisons
// will signal the invalid operation exception for quiet NaNs (although that
// would be easy to work around using the unordered compare intrinsics).
return y + y;
OSACA_RETURN(y + y);
}

if (abs_y < y₁) {
if (abs_y == 0) {
return y;
OSACA_IF(abs_y < y₁) {
OSACA_IF(abs_y == 0) {
OSACA_RETURN(y);
}
return Cbrt<rounding>(y * σ₁⁻³) * σ₁;
} else if (abs_y > y₂) {
} OSACA_ELSE_IF(abs_y > y₂) {
if (abs_y == std::numeric_limits<double>::infinity()) {
return y;
OSACA_RETURN(y);
}
return Cbrt<rounding>(y * σ₂⁻³) * σ₂;
OSACA_RETURN(Cbrt<rounding>(y * σ₂⁻³) * σ₂);
}

// Step 1. The constant Γʟ²ᴄ is the result of Canon optimization for step 2.
Expand Down Expand Up @@ -197,12 +217,12 @@ double Cbrt(double const y) {
double const r₁ = x_sign_y - r₀ - Δ;

double const r̃ = r₀ + 2 * r₁;
if (rounding == Rounding::Correct &&
CorrectionPossiblyNeeded(r₀, r₁, r̃, /*τ=*/0x1.7C73DBBD9FA60p-66)) {
return _mm_cvtsd_f64(_mm_or_pd(
_mm_set_sd(CorrectLastBit(abs_y, std::abs(r₀), std::abs(r̃))), sign));
OSACA_IF(rounding == Rounding::Correct &&
CorrectionPossiblyNeeded(r₀, r₁, r̃, /*τ=*/0x1.7C73DBBD9FA60p-66)) {
OSACA_RETURN(_mm_cvtsd_f64(_mm_or_pd(
_mm_set_sd(CorrectLastBit(abs_y, std::abs(r₀), std::abs(r̃))), sign)));
}
return r₀;
OSACA_RETURN(r₀);
}
template double Cbrt<Rounding::Faithful>(double y);
template double Cbrt<Rounding::Correct>(double y);
Expand All @@ -227,31 +247,32 @@ static_assert(σ₁⁻³ * std::numeric_limits<double>::denorm_min() > y₁);
static_assert(σ₂⁻³ * std::numeric_limits<double>::max() < y₂);

template<Rounding rounding>
double Cbrt(double const y) {
double Cbrt(double y) {
OSACA_FUNCTION_BEGIN(y, <rounding>);
__m128d Y_0 = _mm_set_sd(y);
__m128d const sign = _mm_and_pd(masks::sign_bit, Y_0);
Y_0 = _mm_andnot_pd(masks::sign_bit, Y_0);
double const abs_y = _mm_cvtsd_f64(Y_0);

if (y != y) {
OSACA_IF(y != y) {
// The usual logic will produce a qNaN when given a NaN, but will not
// preserve the payload and will signal overflows (q will be a nonsensical
// large value, and q³ will overflow). Further, the rescaling comparisons
// will signal the invalid operation exception for quiet NaNs (although that
// would be easy to work around using the unordered compare intrinsics).
return y + y;
OSACA_RETURN(y + y);
}

if (abs_y < y₁) {
if (abs_y == 0) {
return y;
OSACA_IF(abs_y < y₁) {
OSACA_IF(abs_y == 0) {
OSACA_RETURN(y);
}
return Cbrt<rounding>(y * σ₁⁻³) * σ₁;
} else if (abs_y > y₂) {
if (abs_y == std::numeric_limits<double>::infinity()) {
return y;
OSACA_RETURN(Cbrt<rounding>(y * σ₁⁻³) * σ₁);
} OSACA_ELSE_IF(abs_y > y₂) {
OSACA_IF(abs_y == std::numeric_limits<double>::infinity()) {
OSACA_RETURN(y);
}
return Cbrt<rounding>(y * σ₂⁻³) * σ₂;
OSACA_RETURN(Cbrt<rounding>(y * σ₂⁻³) * σ₂);
}

// Step 1. The constant Γᴋ minimizes the error of q, as in [KB01], rather
Expand Down Expand Up @@ -299,12 +320,12 @@ double Cbrt(double const y) {
double const r₁ = FusedNegatedMultiplyAdd(Δ₁, Δ₂, x_sign_y - r₀);

double const r̃ = r₀ + 2 * r₁;
if (rounding == Rounding::Correct &&
CorrectionPossiblyNeeded(r₀, r₁, r̃, /*τ=*/0x1.E45E16EF5480Fp-76)) {
return _mm_cvtsd_f64(_mm_or_pd(
_mm_set_sd(CorrectLastBit(abs_y, std::abs(r₀), std::abs(r̃))), sign));
OSACA_IF(rounding == Rounding::Correct &&
CorrectionPossiblyNeeded(r₀, r₁, r̃, /*τ=*/0x1.E45E16EF5480Fp-76)) {
OSACA_RETURN(_mm_cvtsd_f64(_mm_or_pd(
_mm_set_sd(CorrectLastBit(abs_y, std::abs(r₀), std::abs(r̃))), sign)));
}
return r₀;
OSACA_RETURN(r₀);
}
template double Cbrt<Rounding::Faithful>(double y);
template double Cbrt<Rounding::Correct>(double y);
Expand Down
1 change: 1 addition & 0 deletions numerics/numerics.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
<ClInclude Include="newhall_matrices.mathematica.h" />
<ClInclude Include="next.hpp" />
<ClInclude Include="next_body.hpp" />
<ClInclude Include="osaca.hpp" />
<ClInclude Include="piecewise_poisson_series.hpp" />
<ClInclude Include="piecewise_poisson_series_body.hpp" />
<ClInclude Include="poisson_series.hpp" />
Expand Down
203 changes: 203 additions & 0 deletions numerics/osaca.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#pragma once

#define PRINCIPIA_USE_OSACA 0

// The macros OSACA_FUNCTION_BEGIN and OSACA_RETURN are used to analyse the
// latency of a double -> double function, as measured, e.g., by the
// nanobenchmarks; this notionally corresponds to the duration of an iteration
// of a loop `x = f(x)`.
// The latency-critical path of the function is reported as the loop-carried
// dependency by OSACA, and as the critical path by IACA in throughput analysis
// mode.
// OSACA and IACA are only suitable for benchmarking a single path. Any if
// statements, including in inline functions called by the function being
// analysed, should be replaced with OSACA_IF. The path should be determined by
// providing any necessary constexpr expressions in UNDER_OSACA_HYPOTHESES.
// If OSACA_EVALUATE_CONDITIONS is set to 1, code will be generated to evaluate
// the conditions for the branches taken (outside the loop-carried path, as they
// would be with correct branch prediction).
// OSACA_EVALUATE_CONDITIONS can be set to 0 to get a more streamlined
// dependency graph when the conditions are irrelevant. Note that this can
// result in artificially low port pressures.
#if 0
// Usage:
double f(double const x) {
double const abs_x = std::abs(x);
if (abs_x < 0.1) {
return x;
} else if (x < 0) {
return -f_positive(abs_x);
} else {
return f_positive(abs_x);
}
}
double f_positive(double const abs_x) {
if (abs_x > 3) {
return 1;
} else {
// …
}
}
// becomes:
double f(double x) { // The argument cannot be const.
OSACA_FUNCTION_BEGIN(x);
double const abs_x = std::abs(x);
OSACA_IF(abs_x < 0.1) {
OSACA_RETURN(x);
} OSACA_ELSE_IF(x < 0) { // `else OSACA_IF` works, but upsets the linter.
OSACA_RETURN(-f_positive(abs_x));
} else {
OSACA_RETURN(f_positive(abs_x));
}
}
// Other functions can have const arguments and return normally, but need to
// use OSACA_IF:
double f_positive(double const abs_x) {
OSACA_IF(abs_x > 3) {
return 1;
} else {
// …
}
}
// To analyse it near x = 5:
# define OSACA_ANALYSED_FUNCTION f
# define OSACA_ANALYSED_FUNCTION_NAMESPACE
# define OSACA_ANALYSED_FUNCTION_TEMPLATE_PARAMETERS
# define UNDER_OSACA_HYPOTHESES(expression) \
[&] { \
constexpr double x = 5; \
/* To avoid inconsistent definitions, constexpr code can be used as */ \
/* needed. */ \
constexpr double abs_x = x > 0 ? x : -x; \
return (expression); \
}()

// If multiple functions principia::ns1::f and principia::ns2::f are marked up
// for analysis in a translation unit, use
# define OSACA_ANALYSED_FUNCTION_NAMESPACE ns1::
// If f is templatized as, e.g, `template<RoundingMode mode>`, use
OSACA_FUNCTION_BEGIN(x, <mode>)
// and
# define OSACA_ANALYSED_FUNCTION_TEMPLATE_PARAMETERS \
<RoundingMode::NearestTiesToEven>
#endif
// In principle, the loop-carried dependency for function latency analysis is
// achieved by wrapping the body of the function in an infinite loop, assigning
// the result to the argument at each iteration, and adding IACA markers outside
// the loop. There are some subtleties:
// — We need to trick the compiler into believing the loop is finite, so that it
// doesn’t optimize away the end marker or even the function. This is
// achieved by exiting based on the value of `OSACA_loop_terminator`.
// — Return statements may be in if statements, and there may be several of
// them, so they cannot be the end of a loop started unconditionally. Instead
// we loop with goto.
// — We need to prevent the compiler from moving the start and end markers into
// the middle of register saving and restoring code, which would mess up the
// dependency analysis. This is done with additional conditional gotos.
// — Some volatile reads and writes are used to clarify identity of the
// registers in the generated code (where the names of `OSACA_result` and, if
// `OSACA_CARRY_LOOP_THROUGH_REGISTER` is set to 0, `OSACA_loop_carry` appear
// in movsd instructions).
//
// Carrying the loop dependency through a memory load and store can make the
// dependency graph easier to understand, as it forces any usage of the input to
// depend on the initial movsd, with the loop carried by a single backward edge
// to that initial movsd.
// If the loop is carried through a register, multiple usages of the input may
// result in multiple back edges from the final instruction that computed the
// result. Carrying the loop through the memory could also potentially prevent
// the compiler from reusing intermediate values in the next iteration, e.g., if
// the the computation of f(x) depends on -x and produces -f(x) before f(x), as
// in an even function defined in terms of its positive half, the compiler might
// reuse -f(x₀)=-x₁ instead of computing -x₁ from x₁=f(x₀). However:
// — it adds a spurious move to the latency;
// — some tools (IACA) cannot see the dependency through memory.
// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 0 to carry the loop through memory.

#define OSACA_EVALUATE_CONDITIONS 1
#define OSACA_CARRY_LOOP_THROUGH_REGISTER 1

#if PRINCIPIA_USE_OSACA

#include "intel/iacaMarks.h"

#define OSACA_QUALIFIED_ANALYSED_FUNCTION \
OSACA_ANALYSED_FUNCTION_NAMESPACE OSACA_ANALYSED_FUNCTION \
OSACA_ANALYSED_FUNCTION_TEMPLATE_PARAMETERS

static bool volatile OSACA_loop_terminator = false;

#define OSACA_FUNCTION_BEGIN(arg, ...) \
double OSACA_LOOP_CARRY_QUALIFIER OSACA_loop_carry = arg; \
OSACA_outer_loop: \
constexpr auto* OSACA_analysed_function_with_current_parameters = \
&OSACA_ANALYSED_FUNCTION __VA_ARGS__; \
if constexpr (std::string_view(__func__) == \
STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION) && \
OSACA_analysed_function_with_current_parameters == \
&OSACA_QUALIFIED_ANALYSED_FUNCTION) { \
IACA_VC64_START; \
} \
_Pragma("warning(push)"); \
_Pragma("warning(disable : 4102)"); \
OSACA_loop: \
_Pragma("warning(pop)"); \
arg = OSACA_loop_carry

#define OSACA_RETURN(result) \
do { \
if constexpr (std::string_view(__func__) == \
STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION) && \
OSACA_analysed_function_with_current_parameters == \
&OSACA_QUALIFIED_ANALYSED_FUNCTION) { \
OSACA_loop_carry = (result); \
if (!OSACA_loop_terminator) { \
goto OSACA_loop; \
} \
double volatile OSACA_result = OSACA_loop_carry; \
IACA_VC64_END; \
/* The outer loop prevents the the start and end marker from being */ \
/* interleaved with register saving and restoring moves. */ \
if (!OSACA_loop_terminator) { \
goto OSACA_outer_loop; \
} \
return OSACA_result; \
} else { \
return (result); \
} \
} while (false)

#if OSACA_CARRY_LOOP_THROUGH_REGISTER
#define OSACA_LOOP_CARRY_QUALIFIER
#else
#define OSACA_LOOP_CARRY_QUALIFIER volatile
#endif

// The branch not taken, determined by evaluating the condition
// `UNDER_OSACA_HYPOTHESES`, is eliminated by `if constexpr`; the condition is
// also compiled normally and assigned to a boolean; whether this results in any
// generated code depends on `OSACA_EVALUATE_CONDITIONS`. Note that, with
// `OSACA_EVALUATE_CONDITIONS`, in `OSACA_IF(p) { } OSACA_ELSE_IF(q) { }`, if
// `p` holds `UNDER_OSACA_HYPOTHESES`, code is generated to evaluate `p`, but
// not `q`.

#define OSACA_IF(condition) \
if constexpr (bool OSACA_CONDITION_QUALIFIER OSACA_computed_condition = \
(condition); \
UNDER_OSACA_HYPOTHESES(condition))

#if OSACA_EVALUATE_CONDITIONS
#define OSACA_CONDITION_QUALIFIER volatile
#else
#define OSACA_CONDITION_QUALIFIER
#endif

#else // if !PRINCIPIA_USE_OSACA

#define OSACA_FUNCTION_BEGIN(...) ((void) 0)
#define OSACA_RETURN(result) return (result)
#define OSACA_IF(condition) if (condition)

#endif // PRINCIPIA_USE_OSACA

#define OSACA_ELSE_IF else OSACA_IF // NOLINT
Loading

0 comments on commit 8c5eb1e

Please sign in to comment.