Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Take perf comparison #1

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ harness = false
[[bench]]
name = "sum_nulls_bitmap"
harness = false

[[bench]]
name = "take"
harness = false

[[bench]]
name = "take_nulls_bitmap"
harness = false
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ nonsimd_sum bitmap 2^20 f32 [541.78 us 545.16 us 549.09 us]
naive_sum bitmap 2^20 f32 [1.6740 ms 1.6922 ms 1.7149 ms]
```

### Take of values

```
core_simd_take 2^20 f32 time: [911.13 us 912.21 us 913.33 us]
naive_take 2^20 f32 time: [912.39 us 915.22 us 918.41 us]
```

### Nullable take of values (`Bitmap`)

```
core_simd_take_nulls 2^20 f32 time: [950.40 us 954.08 us 958.88 us]
naive_take_nulls 2^20 f32 time: [2.3714 ms 2.3968 ms 2.4296 ms]
```

## Bench results on default

Command:
Expand Down
34 changes: 34 additions & 0 deletions benches/take.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use criterion::{criterion_group, criterion_main, Criterion};

use simd_benches::take::*;

/// Asserts that `l` and `r` have the same length and are element-wise equal
/// within a relative tolerance of 0.1%.
///
/// # Panics
///
/// Panics if the slices differ in length, or if any pair of elements differs
/// by more than 0.1% of the larger magnitude of the two.
fn close(l: &[f32], r: &[f32]) {
    // `zip` stops at the shorter slice, so a truncated result would otherwise pass.
    assert_eq!(l.len(), r.len(), "slices differ in length");
    for (i, (l, r)) in l.iter().zip(r.iter()).enumerate() {
        // Scale by magnitude (not by the signed value) so that zero and
        // negative elements compare correctly.
        let tolerance = l.abs().max(r.abs()) * 0.001;
        assert!(
            (l - r).abs() <= tolerance,
            "element {}: {} is not close to {}",
            i,
            l,
            r
        );
    }
}

/// Registers the `take` benchmarks for input sizes 2^10, 2^12, ..., 2^20.
///
/// For every size the SIMD and naive kernels are first compared with `close`,
/// then each kernel is benchmarked under its own name.
fn add_benchmark(c: &mut Criterion) {
    let name = "";
    for log2_size in (10u32..=20).step_by(2) {
        let size = 1usize << log2_size;
        // Values 1.0, 2.0, ..., size.
        let array: Vec<f32> = (0..size).map(|x| 1.0 + x as f32).collect();
        // Identity permutation: position `i` takes element `i`.
        let indices: Vec<usize> = (0..size).collect();

        // Sanity check: both kernels must agree before anything is timed.
        let simd_result = core_simd_take(&array, &indices);
        let naive_result = naive_take(&array, &indices);
        close(&simd_result, &naive_result);

        let simd_id = format!("core_simd_take{} 2^{} f32", name, log2_size);
        c.bench_function(&simd_id, |b| b.iter(|| core_simd_take(&array, &indices)));

        let naive_id = format!("naive_take{} 2^{} f32", name, log2_size);
        c.bench_function(&naive_id, |b| b.iter(|| naive_take(&array, &indices)));
    }
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);
40 changes: 40 additions & 0 deletions benches/take_nulls_bitmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use criterion::{criterion_group, criterion_main, Criterion};

use simd_benches::bitmap_ops;
use simd_benches::take::*;

/// Asserts that `l` and `r` have the same length and are element-wise equal,
/// either within a relative tolerance of 0.1% or with both values effectively
/// zero (magnitude below `0.000001`).
///
/// # Panics
///
/// Panics if the slices differ in length, or if any pair of elements fails
/// both the relative and the near-zero check.
fn close(l: &[f32], r: &[f32]) {
    // `zip` stops at the shorter slice, so a truncated result would otherwise pass.
    assert_eq!(l.len(), r.len(), "slices differ in length");
    for (i, (l, r)) in l.iter().zip(r.iter()).enumerate() {
        // Scale by magnitude (not by the signed value) so that negative
        // elements compare correctly.
        let tolerance = l.abs().max(r.abs()) * 0.001;
        let both_near_zero = l.abs() < 0.000001 && r.abs() < 0.000001;
        assert!(
            (l - r).abs() <= tolerance || both_near_zero,
            "element {}: {} is not close to {}",
            i,
            l,
            r
        );
    }
}

/// Registers the nullable `take` benchmarks for input sizes 2^10, 2^12, ..., 2^20.
///
/// For every size a bitmap is built in which every tenth position is cleared,
/// the SIMD and naive kernels are compared with `close`, and then each kernel
/// is benchmarked under its own name.
fn add_benchmark(c: &mut Criterion) {
    let name = "";
    for log2_size in (10u32..=20).step_by(2) {
        let size = 1usize << log2_size;
        // Values 1.0, 2.0, ..., size.
        let array: Vec<f32> = (0..size).map(|x| 1.0 + x as f32).collect();

        // One bit per element; positions where `(1 + x) % 10 == 0` are
        // cleared, i.e. 10% nulls.
        let mut bits = vec![0u8; size / 8];
        for x in 0..size {
            bitmap_ops::set_bit(&mut bits, x, (1 + x) % 10 != 0);
        }
        let mask = (bits, size);

        // Identity permutation: position `i` takes element `i`.
        let indices: Vec<usize> = (0..size).collect();

        // Sanity check: both kernels must agree before anything is timed.
        let simd_result = core_simd_take_nulls(&array, &indices, &mask);
        let naive_result = naive_take_nulls(&array, &indices, &mask);
        close(&simd_result, &naive_result);

        let simd_id = format!("core_simd_take_nulls{} 2^{} f32", name, log2_size);
        c.bench_function(&simd_id, |b| {
            b.iter(|| core_simd_take_nulls(&array, &indices, &mask))
        });

        let naive_id = format!("naive_take_nulls{} 2^{} f32", name, log2_size);
        c.bench_function(&naive_id, |b| {
            b.iter(|| naive_take_nulls(&array, &indices, &mask))
        });
    }
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pub mod bitmap_ops;
pub mod sum;
pub mod sum_nulls;
pub mod sum_nulls_bitmap;
pub mod take;
70 changes: 70 additions & 0 deletions src/take.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
use core_simd::*;

use super::bitmap_ops::*;

/// Scalar reference implementation of `take`.
///
/// Returns a vector whose `i`-th element is `values[indices[i]]`.
///
/// # Panics
///
/// Panics if any index is out of bounds for `values`.
pub fn naive_take(values: &[f32], indices: &[usize]) -> Vec<f32> {
    let mut taken = Vec::with_capacity(indices.len());
    for &index in indices {
        taken.push(values[index]);
    }
    taken
}

/// Number of elements processed per SIMD operation.
const LANES: usize = 8;
/// Number of bitmap bytes that carry the bits for one group of `LANES` elements
/// (one bit per element).
const MASK_LANES: usize = LANES / 8;

/// SIMD implementation of `take` built on `core_simd` gathers.
///
/// Returns a vector of `indices.len()` elements whose `i`-th element is
/// `values[indices[i]]`. Indices are processed `LANES` at a time with a SIMD
/// gather; the trailing `indices.len() % LANES` elements are handled by a
/// scalar loop so that no output position is left unfilled.
///
/// Unlike `naive_take`, an out-of-bounds index does not panic: it yields
/// `0.0`, which is what `gather_or_default` produces for such lanes. The
/// scalar tail follows the same rule.
pub fn core_simd_take(values: &[f32], indices: &[usize]) -> Vec<f32> {
    let mut result = vec![0.0f32; indices.len()]; // todo: maybeUninit

    // Split into a part that is a whole number of SIMD groups and a short tail.
    let simd_len = indices.len() - indices.len() % LANES;
    let (simd_indices, tail_indices) = indices.split_at(simd_len);
    let (simd_out, tail_out) = result.split_at_mut(simd_len);

    simd_indices
        .chunks_exact(LANES)
        .zip(simd_out.chunks_exact_mut(LANES))
        .for_each(|(chunk, r_chunk)| {
            // `chunks_exact` guarantees exactly `LANES` elements, so this cannot fail.
            let idxs: [usize; LANES] = chunk.try_into().unwrap();
            let idxs: usizex8 = usizex8::from_array(idxs);

            let r = Simd::gather_or_default(values, idxs);
            let r: [f32; LANES] = r.to_array();

            r_chunk.copy_from_slice(&r);
        });

    // Scalar tail, mirroring the gather's out-of-bounds-to-default behaviour.
    for (out, &idx) in tail_out.iter_mut().zip(tail_indices) {
        *out = values.get(idx).copied().unwrap_or_default();
    }

    result
}

/// A bitmap used as a per-element mask: the packed bits as bytes, paired with
/// the number of bits in use. Individual bits are read through `get_bit`.
type Bitmap = (Vec<u8>, usize);

/// Scalar reference implementation of a nullable `take`.
///
/// Output position `i` holds `values[indices[i]]` when `get_bit` reports bit
/// `i` of `mask` as set, and `0.0` otherwise. The output length is the smaller
/// of `indices.len()` and the bit count `mask.1`.
///
/// # Panics
///
/// Panics if an index whose bit is set is out of bounds for `values`.
pub fn naive_take_nulls(values: &[f32], indices: &[usize], mask: &Bitmap) -> Vec<f32> {
    let (bytes, bit_len) = mask;
    // Only positions covered by both the indices and the bitmap produce output.
    let len = indices.len().min(*bit_len);

    let mut taken = Vec::with_capacity(len);
    for (position, &index) in indices[..len].iter().enumerate() {
        let value = if get_bit(bytes, position) {
            values[index]
        } else {
            0.0f32
        };
        taken.push(value);
    }
    taken
}

pub fn core_simd_take_nulls(values: &[f32], indices: &[usize], mask: &Bitmap) -> Vec<f32> {
assert_eq!(mask.1 % 16, 0); // todo: handle remainders
let chunks = indices.chunks_exact(LANES);
let mask_chunks = mask.0.chunks_exact(MASK_LANES);
//let remainder = chunks.remainder();
//let mask_remainder = mask_chunks.remainder();

let mut result = vec![0.0; indices.len()]; // todo: maybeUninit
let result_chunks = result.chunks_exact_mut(LANES);
chunks
.zip(mask_chunks)
.zip(result_chunks)
.for_each(|((chunk, mask_chunk), r_chunk)| {
let idxs: [usize; LANES] = chunk.try_into().unwrap();
let idxs: usizex8 = usizex8::from_array(idxs);

let mask: [u8; MASK_LANES] = mask_chunk.try_into().unwrap();
let mask = masksizex8::from_bitmask(mask);

let r = Simd::gather_select(&values, mask, idxs, Simd::splat(f32::default()));
let r: [f32; LANES] = r.to_array();

let r_chunk: &mut [f32; LANES] = r_chunk.try_into().unwrap();
*r_chunk = r;
});
result
}