NamorNiradnug · NamorNiradnug · Jan 1, 2025 · May 3, 2024 · May 4, 2024 · May 4, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "benches/vcl"]
+	path = benches/cpp/vcl
+	url = https://github.com/vectorclass/version2
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,3 +14,12 @@ name = "simd_addons"
 [dev-dependencies]
 approx = "0.5"
 paste = "1.0"
+cxx = "1.0"
+
+[build-dependencies]
+cxx-build = { version = "1.0", optional = true }
+
+[features]
+vectorclass_bench = ["cxx-build"]
+libmvec_bench = ["cxx-build"]
+cpp_bench = ["vectorclass_bench", "libmvec_bench"]
diff --git a/benches/cpp/cppbench.hpp b/benches/cpp/cppbench.hpp
@@ -0,0 +1,27 @@
+#pragma once
+#include <cstddef>
+
+// Shared declarations for the C++ side of the benchmarks.
+
+// Rust-style aliases so that generated kernel names (e.g. `sin_f32_vcl`)
+// match the identifiers pasted together on the Rust side.
+using f32 = float;
+using f64 = double;
+
+// Number of elements every benchmark kernel reads from `x` and writes to
+// `result`. Must stay equal to `BENCH_POINTS` in benches/math.rs, which
+// allocates the buffers that are passed in.
+static constexpr std::size_t BENCH_POINTS = 200'000;
+
+// Pointer to a unary function `X -> X`; used as a non-type template argument
+// to select the math routine being benchmarked.
+template <class X>
+using Func = X (*)(X);
+
+// Builds the kernel name `<func>_<ftype>_<suffix>`, e.g. `exp_f32_libmvec`.
+#define CPP_BENCH_FUNC_NAME(func, ftype, suffix) func##_##ftype##_##suffix
+
+// Expands to the signature shared by all unary kernels: `x` and `result`
+// point to non-overlapping buffers of BENCH_POINTS elements.
+#define CPP_BENCH_FUNC_DECL(func, ftype, suffix) \
+    void CPP_BENCH_FUNC_NAME(func, ftype, suffix)(const ftype *__restrict__ x, ftype *__restrict__ result)
diff --git a/benches/cpp/libmvecbench.cpp b/benches/cpp/libmvecbench.cpp
@@ -0,0 +1,32 @@
+#include "libmvecbench.hpp"
+
+#include <cmath>
+
+namespace {
+template <class T, Func<T> func>
+void libmvec_bench_impl(const T *__restrict__ x, T *__restrict__ result) {
+    for (size_t i = 0; i < BENCH_POINTS; ++i) {
+        result[i] = func(x[i]);
+    }
+}
+}  // namespace
+
+namespace bench {
+#define IMPL_LIBMVEC_BENCHES(func)                                                          \
+    CPP_BENCH_FUNC_DECL(func, f32, libmvec) { libmvec_bench_impl<f32, func##f>(x, result); } \
+    CPP_BENCH_FUNC_DECL(func, f64, libmvec) { libmvec_bench_impl<f64, func>(x, result); }
+
+IMPL_LIBMVEC_BENCHES(exp)
+IMPL_LIBMVEC_BENCHES(exp2)
+CPP_BENCH_FUNC_DECL(exp_m1, f32, libmvec) { libmvec_bench_impl<f32, expm1f>(x, result); }
+CPP_BENCH_FUNC_DECL(exp_m1, f64, libmvec) { libmvec_bench_impl<f64, expm1>(x, result); }
+
+IMPL_LIBMVEC_BENCHES(sin)
+IMPL_LIBMVEC_BENCHES(cos)
+IMPL_LIBMVEC_BENCHES(tan)
+IMPL_LIBMVEC_BENCHES(asin)
+IMPL_LIBMVEC_BENCHES(acos)
+IMPL_LIBMVEC_BENCHES(atan)
+
+#undef IMPL_LIBMVEC_BENCHES
+}  // namespace bench
diff --git a/benches/cpp/libmvecbench.hpp b/benches/cpp/libmvecbench.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "cppbench.hpp"
+
+// Scalar-loop kernels over <cmath> functions; defined in libmvecbench.cpp and
+// exposed to Rust through the cxx bridge in libmvecbench.rs.
+namespace bench {
+// Declares the kernel pair `<func>_f32_libmvec` / `<func>_f64_libmvec`.
+#define DEF_LIBMVEC_BENCHES(func)            \
+    CPP_BENCH_FUNC_DECL(func, f32, libmvec); \
+    CPP_BENCH_FUNC_DECL(func, f64, libmvec);
+
+DEF_LIBMVEC_BENCHES(exp)
+DEF_LIBMVEC_BENCHES(exp2)
+DEF_LIBMVEC_BENCHES(exp_m1)
+DEF_LIBMVEC_BENCHES(sin)
+DEF_LIBMVEC_BENCHES(cos)
+DEF_LIBMVEC_BENCHES(tan)
+DEF_LIBMVEC_BENCHES(asin)
+DEF_LIBMVEC_BENCHES(acos)
+DEF_LIBMVEC_BENCHES(atan)
+
+#undef DEF_LIBMVEC_BENCHES
+}  // namespace bench
diff --git a/benches/cpp/libmvecbench.rs b/benches/cpp/libmvecbench.rs
@@ -0,0 +1,38 @@
+// cxx bridge to the kernels declared in `libmvecbench.hpp`. Every function
+// takes a raw pointer to an input buffer and one to an output buffer.
+#[cxx::bridge(namespace = "bench")]
+pub mod ffi {
+    unsafe extern "C++" {
+        include!("portable-simd-addons/benches/cpp/libmvecbench.hpp");
+
+        // Safety (all functions below): the C++ side reads `BENCH_POINTS`
+        // elements from `x` and writes as many to `result`; both buffers must
+        // be at least that long and must not overlap (`__restrict__`).
+        unsafe fn exp_f32_libmvec(x: *const f32, result: *mut f32);
+        unsafe fn exp2_f32_libmvec(x: *const f32, result: *mut f32);
+        unsafe fn exp_m1_f32_libmvec(x: *const f32, result: *mut f32);
+
+        unsafe fn sin_f32_libmvec(x: *const f32, result: *mut f32);
+        unsafe fn cos_f32_libmvec(x: *const f32, result: *mut f32);
+        unsafe fn tan_f32_libmvec(x: *const f32, result: *mut f32);
+
+        unsafe fn asin_f32_libmvec(x: *const f32, result: *mut f32);
+        unsafe fn acos_f32_libmvec(x: *const f32, result: *mut f32);
+        unsafe fn atan_f32_libmvec(x: *const f32, result: *mut f32);
+
+        unsafe fn exp_f64_libmvec(x: *const f64, result: *mut f64);
+        unsafe fn exp2_f64_libmvec(x: *const f64, result: *mut f64);
+        unsafe fn exp_m1_f64_libmvec(x: *const f64, result: *mut f64);
+
+        unsafe fn sin_f64_libmvec(x: *const f64, result: *mut f64);
+        unsafe fn cos_f64_libmvec(x: *const f64, result: *mut f64);
+        unsafe fn tan_f64_libmvec(x: *const f64, result: *mut f64);
+
+        unsafe fn asin_f64_libmvec(x: *const f64, result: *mut f64);
+        unsafe fn acos_f64_libmvec(x: *const f64, result: *mut f64);
+        unsafe fn atan_f64_libmvec(x: *const f64, result: *mut f64);
+    }
+}
+
+// Re-export so callers can write `libmvecbench::sin_f32_libmvec` directly.
+pub use ffi::*;
diff --git a/benches/cpp/vcl b/benches/cpp/vcl
diff --git a/benches/cpp/vclbench.cpp b/benches/cpp/vclbench.cpp
@@ -0,0 +1,56 @@
+#include "vclbench.hpp"
+
+#define VCL_NAMESPACE vcl
+#include "vcl/vectormath_exp.h"
+#include "vcl/vectormath_trig.h"
+
+namespace {
+// Applies `func` to BENCH_POINTS elements of `x`, one `Vec` at a time, and
+// stores the results in `result`.
+template <class Scalar, class Vec, Func<Vec> func>
+void vcl_bench_impl(const Scalar *__restrict__ x, Scalar *__restrict__ result) {
+    // Every iteration loads and stores a whole vector, so a partial tail
+    // would run past the end of both buffers.
+    static_assert(BENCH_POINTS % Vec::size() == 0, "BENCH_POINTS must be a multiple of the vector width");
+    Vec x_vec;
+    for (size_t i = 0; i < BENCH_POINTS; i += Vec::size()) {
+        x_vec.load(x + i);
+        func(x_vec).store(result + i);
+    }
+}
+}  // namespace
+
+namespace bench {
+// Defines the f32 kernel (on vcl::Vec16f) and the f64 kernel (on vcl::Vec8d)
+// for `vcl::func`.
+#define IMPL_VCL_BENCHES(func)                                                                      \
+    CPP_BENCH_FUNC_DECL(func, f32, vcl) { vcl_bench_impl<f32, vcl::Vec16f, vcl::func>(x, result); } \
+    CPP_BENCH_FUNC_DECL(func, f64, vcl) { vcl_bench_impl<f64, vcl::Vec8d, vcl::func>(x, result); }
+
+IMPL_VCL_BENCHES(exp)
+IMPL_VCL_BENCHES(exp2)
+// The kernel is named `exp_m1` but calls `vcl::expm1`, so it is spelled out.
+CPP_BENCH_FUNC_DECL(exp_m1, f32, vcl) { vcl_bench_impl<f32, vcl::Vec16f, vcl::expm1>(x, result); }
+CPP_BENCH_FUNC_DECL(exp_m1, f64, vcl) { vcl_bench_impl<f64, vcl::Vec8d, vcl::expm1>(x, result); }
+
+IMPL_VCL_BENCHES(sin)
+IMPL_VCL_BENCHES(cos)
+IMPL_VCL_BENCHES(tan)
+IMPL_VCL_BENCHES(asin)
+IMPL_VCL_BENCHES(acos)
+IMPL_VCL_BENCHES(atan)
+
+// Two-argument kernel: result[i] = atan2(y[i], x[i]) for BENCH_POINTS elements.
+void atan2_f32_vcl(const f32 *__restrict__ x, const f32 *__restrict__ y, f32 *__restrict__ result) {
+    static_assert(BENCH_POINTS % vcl::Vec16f::size() == 0, "BENCH_POINTS must be a multiple of the vector width");
+    vcl::Vec16f x_vec;
+    vcl::Vec16f y_vec;
+    for (size_t i = 0; i < BENCH_POINTS; i += vcl::Vec16f::size()) {
+        x_vec.load(x + i);
+        y_vec.load(y + i);
+        vcl::atan2(y_vec, x_vec).store(result + i);
+    }
+}
+
+#undef IMPL_VCL_BENCHES
+}  // namespace bench
diff --git a/benches/cpp/vclbench.hpp b/benches/cpp/vclbench.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "cppbench.hpp"
+
+// Kernels built on the vectorclass library (VCL); defined in vclbench.cpp and
+// exposed to Rust through the cxx bridge in vclbench.rs.
+namespace bench {
+// Declares the kernel pair `<func>_f32_vcl` / `<func>_f64_vcl`.
+#define DEF_VCL_BENCHES(func)            \
+    CPP_BENCH_FUNC_DECL(func, f32, vcl); \
+    CPP_BENCH_FUNC_DECL(func, f64, vcl);
+
+DEF_VCL_BENCHES(exp)
+DEF_VCL_BENCHES(exp2)
+DEF_VCL_BENCHES(exp_m1)
+DEF_VCL_BENCHES(sin)
+DEF_VCL_BENCHES(cos)
+DEF_VCL_BENCHES(tan)
+DEF_VCL_BENCHES(asin)
+DEF_VCL_BENCHES(acos)
+DEF_VCL_BENCHES(atan)
+
+// Two-argument kernel: result[i] = atan2(y[i], x[i]) for BENCH_POINTS elements.
+void atan2_f32_vcl(const f32 *__restrict__ x, const f32 *__restrict__ y, f32 *__restrict__ result);
+
+#undef DEF_VCL_BENCHES
+}  // namespace bench
diff --git a/benches/cpp/vclbench.rs b/benches/cpp/vclbench.rs
@@ -0,0 +1,39 @@
+// Refuse to build this module on anything other than x86 / x86_64.
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+compile_error!("VCL supports only x86-compatible platforms");
+
+// cxx bridge to the kernels declared in `vclbench.hpp`. Every function takes
+// a raw pointer to an input buffer and one to an output buffer.
+#[cxx::bridge(namespace = "bench")]
+pub mod ffi {
+    unsafe extern "C++" {
+        include!("portable-simd-addons/benches/cpp/vclbench.hpp");
+
+        // Safety (all functions below): the C++ side reads `BENCH_POINTS`
+        // elements from `x` and writes as many to `result`; both buffers must
+        // be at least that long and must not overlap (`__restrict__`).
+        unsafe fn exp_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn exp_f64_vcl(x: *const f64, result: *mut f64);
+        unsafe fn exp2_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn exp2_f64_vcl(x: *const f64, result: *mut f64);
+        unsafe fn exp_m1_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn exp_m1_f64_vcl(x: *const f64, result: *mut f64);
+
+        unsafe fn sin_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn sin_f64_vcl(x: *const f64, result: *mut f64);
+        unsafe fn cos_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn cos_f64_vcl(x: *const f64, result: *mut f64);
+        unsafe fn tan_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn tan_f64_vcl(x: *const f64, result: *mut f64);
+
+        unsafe fn asin_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn asin_f64_vcl(x: *const f64, result: *mut f64);
+        unsafe fn acos_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn acos_f64_vcl(x: *const f64, result: *mut f64);
+        unsafe fn atan_f32_vcl(x: *const f32, result: *mut f32);
+        unsafe fn atan_f64_vcl(x: *const f64, result: *mut f64);
+    }
+}
+
+// Re-export so callers can write `vclbench::sin_f32_vcl` directly.
+pub use ffi::*;
diff --git a/benches/math.rs b/benches/math.rs
@@ -6,53 +6,115 @@ extern crate test;
 mod common;
 use common::Linspace;
 
-use simd_addons::math::Trigonometry;
+use simd_addons::math::{Exponent, Trigonometry};
 use std::simd::prelude::*;
 
+// Must stay equal to `BENCH_POINTS` in benches/cpp/cppbench.hpp: the C++
+// kernels process exactly that many elements of the buffers allocated here.
 const BENCH_POINTS: usize = 200_000;
 
-macro_rules! bench_simd_vs_scalar {
-    ($range: expr, $func: tt, $ftype: ty $(, $coresimdfn: tt )?) => {
+#[cfg(feature = "vectorclass_bench")]
+#[path = "cpp/vclbench.rs"]
+mod vclbench;
+
+#[cfg(feature = "libmvec_bench")]
+#[path = "cpp/libmvecbench.rs"]
+mod libmvecbench;
+
+// Generates `bench_<func>_<ftype>_{vec,scalar}` benchmarks over `BENCH_POINTS`
+// samples of `$range`, plus `_vcl` / `_libmvec` variants when the matching
+// feature is enabled. `$vecsize` is the SIMD lane count (64 if omitted).
+macro_rules! bench_func {
+    ($range: expr, $func: tt, $ftype: ty) => {
+        bench_func!($range, $func, $ftype, 64);
+    };
+
+    ($range: expr, $func: tt, $ftype: ty, $vecsize: literal) => {
         paste::paste! {
         #[bench]
         fn [< bench_ $func _ $ftype _vec >](b: &mut test::Bencher) {
             #[allow(clippy::all)]
             let data: Vec<_> = ($range as $ftype).linspace(BENCH_POINTS).collect();
+            let x = data.as_slice();
+            let mut result_vec: Vec<_> = vec![0.0; BENCH_POINTS];
+            let result = result_vec.as_mut_slice();
             b.iter(|| {
-                for x in test::black_box(data.array_chunks::<64>()) {
-                    test::black_box(Simd::from_array(*x).$func());
+                assert_eq!(x.len(), BENCH_POINTS);
+                assert_eq!(result.len(), BENCH_POINTS);
+                for i in (0..BENCH_POINTS).step_by($vecsize) {
+                    Simd::<_, $vecsize>::from_slice(&x[i..])
+                        .$func()
+                        .copy_to_slice(&mut result[i..]);
                 }
             })
         }
 
         #[bench]
-        fn [< bench_ $func _ $ftype _scalar >](b: &mut test::Bencher) {
+        fn [<bench_ $func _ $ftype _scalar >](b: &mut test::Bencher) {
             #[allow(clippy::all)]
             let data: Vec<_> = ($range as $ftype).linspace(BENCH_POINTS).collect();
+            let mut result = vec![0.0; BENCH_POINTS];
             b.iter(|| {
-                for x in test::black_box(&data) {
-                    test::black_box(x.$func());
+                assert_eq!(result.len(), BENCH_POINTS);
+                assert_eq!(data.len(), BENCH_POINTS);
+                for (x, res) in std::iter::zip(&data, &mut result) {
+                    *res = x.$func();
                 }
             });
         }
 
-        $(
+
+        #[cfg(feature = "vectorclass_bench")]
         #[bench]
-        fn [< bench_ $func _ $ftype _core_simd >](b: &mut test::Bencher) {
-            #[allow(clippy::all)]
-            let data: Vec<_> = ($range as $ftype).linspace(BENCH_POINTS).collect();
+        fn [<bench_ $func _ $ftype _vcl >](b: &mut test::Bencher) {
+            let data: Vec<$ftype> = ($range as $ftype).linspace(BENCH_POINTS).collect();
+            let mut result = vec![0.0; BENCH_POINTS];
+            // The C++ kernel reads and writes BENCH_POINTS elements unconditionally.
+            assert_eq!(data.len(), BENCH_POINTS);
             b.iter(|| {
-                for x in test::black_box(data.array_chunks::<64>()) {
-                    test::black_box(unsafe { core::intrinsics::simd::$coresimdfn(Simd::from_array(*x)) });
-                }
-            })
+                // SAFETY: `data` was checked above, `result` was allocated with
+                // BENCH_POINTS elements, and the two are separate allocations.
+                unsafe { vclbench::[<$func _ $ftype _vcl>](data.as_ptr(), result.as_mut_ptr()) }
+            });
+        }
+
+        #[cfg(feature = "libmvec_bench")]
+        #[bench]
+        fn [<bench_ $func _ $ftype _libmvec >](b: &mut test::Bencher) {
+            let data: Vec<$ftype> = ($range as $ftype).linspace(BENCH_POINTS).collect();
+            let mut result = vec![0.0; BENCH_POINTS];
+            // The C++ kernel reads and writes BENCH_POINTS elements unconditionally.
+            assert_eq!(data.len(), BENCH_POINTS);
+            b.iter(|| {
+                // SAFETY: `data` was checked above, `result` was allocated with
+                // BENCH_POINTS elements, and the two are separate allocations.
+                unsafe { libmvecbench::[<$func _ $ftype _libmvec>](data.as_ptr(), result.as_mut_ptr()) }
+            });
         }
-        )?
         }
     };
 }
 
-bench_simd_vs_scalar!(-1e4..1e4, sin, f32, simd_fsin);
-bench_simd_vs_scalar!(-1e4..1e4, sin, f64, simd_fsin);
-bench_simd_vs_scalar!(-1e4..1e4, cos, f32, simd_fcos);
-bench_simd_vs_scalar!(-1e4..1e4, cos, f64, simd_fcos);
+bench_func!(-50.0..50, exp, f32, 16);
+bench_func!(-50.0..50, exp, f64);
+bench_func!(-50.0..50, exp2, f32, 16);
+bench_func!(-50.0..50, exp2, f64);
+bench_func!(-50.0..50, exp_m1, f32, 16);
+bench_func!(-50.0..50, exp_m1, f64);
+
+bench_func!(-1e4..1e4, sin, f32);
+bench_func!(-1e4..1e4, sin, f64);
+bench_func!(-1e4..1e4, cos, f32);
+bench_func!(-1e4..1e4, cos, f64);
+bench_func!(-1e4..1e4, tan, f32);
+bench_func!(-1e4..1e4, tan, f64);
+
+bench_func!(-1.0..1.0, asin, f32);
+bench_func!(-1.0..1.0, asin, f64);
+bench_func!(-1.0..1.0, acos, f32);
+bench_func!(-1.0..1.0, acos, f64);
+bench_func!(-1e3..1e3, atan, f32);
+bench_func!(-1e3..1e3, atan, f64);
+
+#[allow(unused)]
+fn generate_atan2_bench_sample() -> (Vec<f32>, Vec<f32>) {
+    todo!()
+}