Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bench against vcl and optimize #5

Merged
merged 11 commits into from
Jan 1, 2025
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "benches/vcl"]
path = benches/cpp/vcl
url = https://github.com/vectorclass/version2
9 changes: 9 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,12 @@ name = "simd_addons"
[dev-dependencies]
approx = "0.5"
paste = "1.0"
cxx = "1.0"

[build-dependencies]
cxx-build = { version = "1.0", optional = true }

[features]
vectorclass_bench = ["cxx-build"]
libmvec_bench = ["cxx-build"]
cpp_bench = ["vectorclass_bench", "libmvec_bench"]
15 changes: 15 additions & 0 deletions benches/cpp/cppbench.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Declarations shared by the C++ benchmark translation units.
#pragma once
#include <cstddef>

// Rust-style spellings of the scalar types. The alias names are also pasted
// into the generated entry-point names (e.g. `exp_f32_vcl`).
using f32 = float;
using f64 = double;

// Number of elements each benchmark kernel processes per call.
// NOTE(review): benches/math.rs declares its own BENCH_POINTS (200_000);
// the two constants have to be kept in sync by hand.
static constexpr size_t BENCH_POINTS = 200'000;

// Pointer to a unary function X -> X. Used as a non-type template parameter
// so the function under test is fixed at compile time.
template <class X>
using Func = X (*)(X);

// Builds the name of a benchmark entry point: <func>_<ftype>_<suffix>.
#define CPP_BENCH_FUNC_NAME(func, ftype, suffix) func##_##ftype##_##suffix

// Expands to the signature of a benchmark entry point (no trailing `;` or
// body, so it serves for both declarations and definitions). The parameters
// are named `x` (input) and `result` (output); `__restrict__` is a compiler
// extension promising that the two buffers do not alias.
#define CPP_BENCH_FUNC_DECL(func, ftype, suffix) \
void CPP_BENCH_FUNC_NAME(func, ftype, suffix)(const ftype *__restrict__ x, ftype *__restrict__ result)
32 changes: 32 additions & 0 deletions benches/cpp/libmvecbench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "libmvecbench.hpp"

#include <cmath>

namespace {
// Applies the scalar function `func` to each of the first BENCH_POINTS
// elements of `x` and stores the results at the same indices of `result`.
// NOTE(review): presumably kept as a plain counted loop over non-aliasing
// pointers so the compiler can auto-vectorize the calls (e.g. through
// glibc's libmvec); that depends on build flags not visible here -- confirm.
template <class T, Func<T> func>
void libmvec_bench_impl(const T *__restrict__ x, T *__restrict__ result) {
for (size_t i = 0; i < BENCH_POINTS; ++i) {
result[i] = func(x[i]);
}
}
} // namespace

namespace bench {
// Defines the f32 and f64 libmvec entry points for `func`. The f32 variant
// forwards to the `f`-suffixed function (`func##f`), the f64 variant to
// `func` itself.
#define IMPL_LIBMVEC_BENCHES(func) \
CPP_BENCH_FUNC_DECL(func, f32, libmvec) { libmvec_bench_impl<f32, func##f>(x, result); } \
CPP_BENCH_FUNC_DECL(func, f64, libmvec) { libmvec_bench_impl<f64, func>(x, result); }

IMPL_LIBMVEC_BENCHES(exp)
IMPL_LIBMVEC_BENCHES(exp2)
// Written out by hand: the exported name is `exp_m1`, while the functions
// being called are `expm1f` / `expm1`, so the macro cannot derive them.
CPP_BENCH_FUNC_DECL(exp_m1, f32, libmvec) { libmvec_bench_impl<f32, expm1f>(x, result); }
CPP_BENCH_FUNC_DECL(exp_m1, f64, libmvec) { libmvec_bench_impl<f64, expm1>(x, result); }

IMPL_LIBMVEC_BENCHES(sin)
IMPL_LIBMVEC_BENCHES(cos)
IMPL_LIBMVEC_BENCHES(tan)
IMPL_LIBMVEC_BENCHES(asin)
IMPL_LIBMVEC_BENCHES(acos)
IMPL_LIBMVEC_BENCHES(atan)

#undef IMPL_LIBMVEC_BENCHES
} // namespace bench
21 changes: 21 additions & 0 deletions benches/cpp/libmvecbench.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include "cppbench.hpp"

namespace bench {
// Declares the f32 and f64 libmvec benchmark entry points for `func`
// (`<func>_f32_libmvec` and `<func>_f64_libmvec`). The macro is local to this
// header and is undefined again below.
#define DEF_LIBMVEC_BENCHES(func) \
    CPP_BENCH_FUNC_DECL(func, f32, libmvec); \
    CPP_BENCH_FUNC_DECL(func, f64, libmvec);

// Exponential family.
DEF_LIBMVEC_BENCHES(exp)
DEF_LIBMVEC_BENCHES(exp2)
DEF_LIBMVEC_BENCHES(exp_m1)
// Trigonometric functions.
DEF_LIBMVEC_BENCHES(sin)
DEF_LIBMVEC_BENCHES(cos)
DEF_LIBMVEC_BENCHES(tan)
// Inverse trigonometric functions.
DEF_LIBMVEC_BENCHES(asin)
DEF_LIBMVEC_BENCHES(acos)
DEF_LIBMVEC_BENCHES(atan)

#undef DEF_LIBMVEC_BENCHES
} // namespace bench
32 changes: 32 additions & 0 deletions benches/cpp/libmvecbench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// cxx bridge to the libmvec benchmark kernels declared in
// benches/cpp/libmvecbench.hpp (C++ namespace `bench`).
//
// Caller contract, as implied by the C++ side shown in this change:
//   * `x` must be valid for reads of BENCH_POINTS (200'000) elements;
//   * `result` must be valid for writes of the same number of elements;
//   * the two buffers must not overlap (both C++ parameters are `__restrict__`).
#[cxx::bridge(namespace = "bench")]
pub mod ffi {
unsafe extern "C++" {
include!("portable-simd-addons/benches/cpp/libmvecbench.hpp");

// f32 kernels: exponential family.
unsafe fn exp_f32_libmvec(x: *const f32, result: *mut f32);
unsafe fn exp2_f32_libmvec(x: *const f32, result: *mut f32);
unsafe fn exp_m1_f32_libmvec(x: *const f32, result: *mut f32);

// f32 kernels: trigonometric functions.
unsafe fn sin_f32_libmvec(x: *const f32, result: *mut f32);
unsafe fn cos_f32_libmvec(x: *const f32, result: *mut f32);
unsafe fn tan_f32_libmvec(x: *const f32, result: *mut f32);

// f32 kernels: inverse trigonometric functions.
unsafe fn asin_f32_libmvec(x: *const f32, result: *mut f32);
unsafe fn acos_f32_libmvec(x: *const f32, result: *mut f32);
unsafe fn atan_f32_libmvec(x: *const f32, result: *mut f32);

// f64 kernels: exponential family.
unsafe fn exp_f64_libmvec(x: *const f64, result: *mut f64);
unsafe fn exp2_f64_libmvec(x: *const f64, result: *mut f64);
unsafe fn exp_m1_f64_libmvec(x: *const f64, result: *mut f64);

// f64 kernels: trigonometric functions.
unsafe fn sin_f64_libmvec(x: *const f64, result: *mut f64);
unsafe fn cos_f64_libmvec(x: *const f64, result: *mut f64);
unsafe fn tan_f64_libmvec(x: *const f64, result: *mut f64);

// f64 kernels: inverse trigonometric functions.
unsafe fn asin_f64_libmvec(x: *const f64, result: *mut f64);
unsafe fn acos_f64_libmvec(x: *const f64, result: *mut f64);
unsafe fn atan_f64_libmvec(x: *const f64, result: *mut f64);
}
}

pub use ffi::*;
1 change: 1 addition & 0 deletions benches/cpp/vcl
Submodule vcl added at f4617d
46 changes: 46 additions & 0 deletions benches/cpp/vclbench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include "vclbench.hpp"

#define VCL_NAMESPACE vcl
#include "vcl/vectormath_exp.h"
#include "vcl/vectormath_trig.h"

namespace {
// Applies the VCL vector function `func` to BENCH_POINTS elements of `x`,
// one full vector of `Vec::size()` lanes at a time, storing into `result`.
//
// Scalar: element type of the buffers (f32 / f64).
// Vec:    VCL vector type whose lanes hold `Scalar` values.
// func:   vector function under test, fixed at compile time.
template <class Scalar, class Vec, Func<Vec> func>
void vcl_bench_impl(const Scalar *__restrict__ x, Scalar *__restrict__ result) {
    // Every iteration loads and stores a whole vector and there is no scalar
    // tail, so a BENCH_POINTS that is not a multiple of the lane count would
    // read and write past the end of the buffers. Reject that at compile time.
    static_assert(BENCH_POINTS % static_cast<size_t>(Vec::size()) == 0,
                  "BENCH_POINTS must be a multiple of the vector lane count");

    Vec x_vec;
    for (size_t i = 0; i < BENCH_POINTS; i += Vec::size()) {
        x_vec.load(x + i);
        func(x_vec).store(result + i);
    }
}
} // namespace

namespace bench {
// Defines the f32 and f64 VCL entry points for `func`, using 512-bit vectors
// (`Vec16f` for f32, `Vec8d` for f64) and the function of the same name in
// the `vcl` namespace.
#define IMPL_VCL_BENCHES(func) \
    CPP_BENCH_FUNC_DECL(func, f32, vcl) { vcl_bench_impl<f32, vcl::Vec16f, vcl::func>(x, result); } \
    CPP_BENCH_FUNC_DECL(func, f64, vcl) { vcl_bench_impl<f64, vcl::Vec8d, vcl::func>(x, result); }

IMPL_VCL_BENCHES(exp)
IMPL_VCL_BENCHES(exp2)
// Written out by hand: the exported name is `exp_m1`, while the VCL function
// is called `expm1`, so the macro cannot derive it.
CPP_BENCH_FUNC_DECL(exp_m1, f32, vcl) { vcl_bench_impl<f32, vcl::Vec16f, vcl::expm1>(x, result); }
CPP_BENCH_FUNC_DECL(exp_m1, f64, vcl) { vcl_bench_impl<f64, vcl::Vec8d, vcl::expm1>(x, result); }

IMPL_VCL_BENCHES(sin)
IMPL_VCL_BENCHES(cos)
IMPL_VCL_BENCHES(tan)
IMPL_VCL_BENCHES(asin)
IMPL_VCL_BENCHES(acos)
IMPL_VCL_BENCHES(atan)

// Two-argument kernel: result[i] = atan2(y[i], x[i]) for BENCH_POINTS
// elements, processed one Vec16f at a time. Note the parameter order (x, y)
// versus the call order (y, x).
// NOTE(review): this function is neither declared in vclbench.hpp nor bound
// in vclbench.rs as shown in this change, so nothing calls it yet.
void atan2_f32_vcl(const float *x, const float *y, float *result) {
    // Whole-vector loads/stores with no scalar tail: the element count has to
    // be a multiple of the lane count or the last iteration overruns.
    static_assert(BENCH_POINTS % static_cast<size_t>(vcl::Vec16f::size()) == 0,
                  "BENCH_POINTS must be a multiple of the vector lane count");

    vcl::Vec16f x_vec;
    vcl::Vec16f y_vec;
    for (size_t i = 0; i < BENCH_POINTS; i += vcl::Vec16f::size()) {
        x_vec.load(x + i);
        y_vec.load(y + i);
        vcl::atan2(y_vec, x_vec).store(result + i);
    }
}

#undef IMPL_VCL_BENCHES
} // namespace bench
21 changes: 21 additions & 0 deletions benches/cpp/vclbench.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include "cppbench.hpp"

namespace bench {
// Declares the f32 and f64 VCL benchmark entry points for `func`
// (`<func>_f32_vcl` and `<func>_f64_vcl`). The macro is local to this header
// and is undefined again below.
#define DEF_VCL_BENCHES(func) \
CPP_BENCH_FUNC_DECL(func, f32, vcl); \
CPP_BENCH_FUNC_DECL(func, f64, vcl);

// Exponential family.
DEF_VCL_BENCHES(exp)
DEF_VCL_BENCHES(exp2)
DEF_VCL_BENCHES(exp_m1)
// Trigonometric functions.
DEF_VCL_BENCHES(sin)
DEF_VCL_BENCHES(cos)
DEF_VCL_BENCHES(tan)
// Inverse trigonometric functions.
DEF_VCL_BENCHES(asin)
DEF_VCL_BENCHES(acos)
DEF_VCL_BENCHES(atan)

#undef DEF_VCL_BENCHES
} // namespace bench
35 changes: 35 additions & 0 deletions benches/cpp/vclbench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#[cfg(all(not(target_arch = "x86"), not(target_arch = "x86_64")))]
compile_error!("VCL supports only x86-compatible platforms");

// cxx bridge to the VCL benchmark kernels declared in
// benches/cpp/vclbench.hpp (C++ namespace `bench`).
//
// Caller contract, as implied by the C++ side shown in this change:
//   * `x` must be valid for reads of BENCH_POINTS (200'000) elements;
//   * `result` must be valid for writes of the same number of elements;
//   * the two buffers must not overlap (both C++ parameters are `__restrict__`).
#[cxx::bridge(namespace = "bench")]
pub mod ffi {
unsafe extern "C++" {
include!("portable-simd-addons/benches/cpp/vclbench.hpp");

// f32 kernels: exponential family.
unsafe fn exp_f32_vcl(x: *const f32, result: *mut f32);
unsafe fn exp2_f32_vcl(x: *const f32, result: *mut f32);
unsafe fn exp_m1_f32_vcl(x: *const f32, result: *mut f32);

// f32 kernels: trigonometric functions.
unsafe fn sin_f32_vcl(x: *const f32, result: *mut f32);
unsafe fn cos_f32_vcl(x: *const f32, result: *mut f32);
unsafe fn tan_f32_vcl(x: *const f32, result: *mut f32);

// f32 kernels: inverse trigonometric functions.
unsafe fn asin_f32_vcl(x: *const f32, result: *mut f32);
unsafe fn acos_f32_vcl(x: *const f32, result: *mut f32);
unsafe fn atan_f32_vcl(x: *const f32, result: *mut f32);

// f64 kernels: exponential family.
unsafe fn exp_f64_vcl(x: *const f64, result: *mut f64);
unsafe fn exp2_f64_vcl(x: *const f64, result: *mut f64);
unsafe fn exp_m1_f64_vcl(x: *const f64, result: *mut f64);

// f64 kernels: trigonometric functions.
unsafe fn sin_f64_vcl(x: *const f64, result: *mut f64);
unsafe fn cos_f64_vcl(x: *const f64, result: *mut f64);
unsafe fn tan_f64_vcl(x: *const f64, result: *mut f64);

// f64 kernels: inverse trigonometric functions.
unsafe fn asin_f64_vcl(x: *const f64, result: *mut f64);
unsafe fn acos_f64_vcl(x: *const f64, result: *mut f64);
unsafe fn atan_f64_vcl(x: *const f64, result: *mut f64);
}
}

pub use ffi::*;
93 changes: 72 additions & 21 deletions benches/math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,53 +6,104 @@ extern crate test;
mod common;
use common::Linspace;

use simd_addons::math::Trigonometry;
use simd_addons::math::{Exponent, Trigonometry};
use std::simd::prelude::*;

const BENCH_POINTS: usize = 200_000;

macro_rules! bench_simd_vs_scalar {
($range: expr, $func: tt, $ftype: ty $(, $coresimdfn: tt )?) => {
#[cfg(feature = "vectorclass_bench")]
#[path = "cpp/vclbench.rs"]
mod vclbench;

#[cfg(feature = "libmvec_bench")]
#[path = "cpp/libmvecbench.rs"]
mod libmvecbench;

macro_rules! bench_func {
($range: expr, $func: tt, $ftype: ty) => {
bench_func!($range, $func, $ftype, 64);
};

($range: expr, $func: tt, $ftype: ty, $vecsize: literal) => {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А в этом макросе нигде не надо black_box?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А в этом макросе нигде не надо black_box?

Тут как раз важно, что оно не black_box, т.к. это ломает оптимизации. Там даже удаление assert-а замедляет бенчмарк.

Наверное можно в black_box засунуть сразу весь слайс типа такого:

let input_data = black_box(data);
assert!(input_data.len() == BENCH_POINTS);
...

но кажется это не должно ничего поменять (я попробую).

paste::paste! {
#[bench]
fn [< bench_ $func _ $ftype _vec >](b: &mut test::Bencher) {
#[allow(clippy::all)]
let data: Vec<_> = ($range as $ftype).linspace(BENCH_POINTS).collect();
let x = data.as_slice();
let mut result_vec: Vec<_> = vec![0.0; BENCH_POINTS];
let result = result_vec.as_mut_slice();
b.iter(|| {
for x in test::black_box(data.array_chunks::<64>()) {
test::black_box(Simd::from_array(*x).$func());
assert_eq!(x.len(), BENCH_POINTS);
assert_eq!(result.len(), BENCH_POINTS);
for i in (0..BENCH_POINTS).step_by($vecsize) {
Simd::<_, $vecsize>::from_slice(&x[i..])
.$func()
.copy_to_slice(&mut result[i..]);
}
})
}

#[bench]
fn [< bench_ $func _ $ftype _scalar >](b: &mut test::Bencher) {
fn [<bench_ $func _ $ftype _scalar >](b: &mut test::Bencher) {
#[allow(clippy::all)]
let data: Vec<_> = ($range as $ftype).linspace(BENCH_POINTS).collect();
let mut result = vec![0.0; BENCH_POINTS];
b.iter(|| {
for x in test::black_box(&data) {
test::black_box(x.$func());
assert_eq!(result.len(), BENCH_POINTS);
assert_eq!(data.len(), BENCH_POINTS);
for (x, res) in std::iter::zip(&data, &mut result) {
*res = x.$func();
}
});
}

$(

#[cfg(feature = "vectorclass_bench")]
#[bench]
fn [< bench_ $func _ $ftype _core_simd >](b: &mut test::Bencher) {
#[allow(clippy::all)]
let data: Vec<_> = ($range as $ftype).linspace(BENCH_POINTS).collect();
fn [<bench_ $func _ $ftype _vcl >](b: &mut test::Bencher) {
let data: Vec<$ftype> = ($range as $ftype).linspace(BENCH_POINTS).collect();
let mut result = vec![0.0; BENCH_POINTS];
b.iter(|| {
for x in test::black_box(data.array_chunks::<64>()) {
test::black_box(unsafe { core::intrinsics::simd::$coresimdfn(Simd::from_array(*x)) });
}
})
unsafe { vclbench::[<$func _ $ftype _vcl>](data.as_ptr(), result.as_mut_ptr()) }
});
}

#[cfg(feature = "libmvec_bench")]
#[bench]
fn [<bench_ $func _ $ftype _libmvec >](b: &mut test::Bencher) {
let data: Vec<$ftype> = ($range as $ftype).linspace(BENCH_POINTS).collect();
let mut result = vec![0.0; BENCH_POINTS];
b.iter(|| {
unsafe { libmvecbench::[<$func _ $ftype _libmvec>](data.as_ptr(), result.as_mut_ptr()) }
});
}
)?
}
};
}

bench_simd_vs_scalar!(-1e4..1e4, sin, f32, simd_fsin);
bench_simd_vs_scalar!(-1e4..1e4, sin, f64, simd_fsin);
bench_simd_vs_scalar!(-1e4..1e4, cos, f32, simd_fcos);
bench_simd_vs_scalar!(-1e4..1e4, cos, f64, simd_fcos);
bench_func!(-50.0..50, exp, f32, 16);
bench_func!(-50.0..50, exp, f64);
bench_func!(-50.0..50, exp2, f32, 16);
bench_func!(-50.0..50, exp2, f64);
bench_func!(-50.0..50, exp_m1, f32, 16);
bench_func!(-50.0..50, exp_m1, f64);

bench_func!(-1e4..1e4, sin, f32);
bench_func!(-1e4..1e4, sin, f64);
bench_func!(-1e4..1e4, cos, f32);
bench_func!(-1e4..1e4, cos, f64);
bench_func!(-1e4..1e4, tan, f32);
bench_func!(-1e4..1e4, tan, f64);

bench_func!(-1.0..1.0, asin, f32);
bench_func!(-1.0..1.0, asin, f64);
bench_func!(-1.0..1.0, acos, f32);
bench_func!(-1.0..1.0, acos, f64);
bench_func!(-1e3..1e3, atan, f32);
bench_func!(-1e3..1e3, atan, f64);

// Placeholder that is presumably meant to produce the two input vectors for
// a future atan2 benchmark (judging by the name and return type -- confirm).
// It is not called anywhere in the code shown, hence `allow(unused)`;
// calling it panics through `todo!()`.
#[allow(unused)]
fn generate_atan2_bench_sample() -> (Vec<f32>, Vec<f32>) {
todo!()
}
Loading
Loading