diff --git a/Cargo.toml b/Cargo.toml
index e00dcbce12b..9e515dc16d5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -125,6 +125,7 @@ compute = ["strength_reduce", "multiversion", "lexical-core", "ahash"]
 io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
 benchmarks = ["rand"]
 simd = ["packed_simd"]
+cache_aligned = []

 [package.metadata.cargo-all-features]
 skip_feature_sets = [
diff --git a/src/alloc/alignment.rs b/src/alloc/alignment.rs
new file mode 100644
index 00000000000..dbf4602f83a
--- /dev/null
+++ b/src/alloc/alignment.rs
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: The code below is written for spatial/temporal prefetcher optimizations. Memory allocation
+// should align well with the cache-access patterns and block sizes of the storage levels, from
+// registers to non-volatile memory. These are all cache-aware alignments incorporated
+// from the [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimics Intel TBB's
+// cache_aligned_allocator, which exploits cache locality and minimizes prefetch signals,
+// resulting in less round-trip time between the layers of storage.
+// For further info: https://software.intel.com/en-us/node/506094
+
+// On 32-bit x86, microarchitectures other than NetBurst use 64-byte cache lines.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "x86")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// Intel x86_64:
+// L2D streamer from L1:
+// Loads data or instructions from memory to the second-level cache. To use the streamer,
+// organize the data or instructions in blocks of 128 bytes, aligned on 128 bytes.
+// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "x86_64")]
+pub const ALIGNMENT: usize = 1 << 7;
+
+// 24Kc:
+// Data Line Size
+// - https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00346-2B-24K-DTS-04.00.pdf
+// - https://gitlab.e.foundation/e/devices/samsung/n7100/stable_android_kernel_samsung_smdk4412/commit/2dbac10263b2f3c561de68b4c369bc679352ccee
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "mips")]
+pub const ALIGNMENT: usize = 1 << 5;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "mips64")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// Defaults for powerpc
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "powerpc")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// Defaults for the ppc 64
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "powerpc64")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// e.g. SiFive:
+// - https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/riscv/sifive-l2-cache.txt#L41
+// In general they are all the same.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "riscv")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// This size is the same across all hardware for this architecture.
+// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2s390_2include_2asm_2cache_8h.html
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "s390x")]
+pub const ALIGNMENT: usize = 1 << 8;
+
+// This size is the same across all hardware for this architecture.
+// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2sparc_2include_2asm_2cache_8h.html#a9400cc2ba37e33279bdbc510a6311fb4
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "sparc")]
+pub const ALIGNMENT: usize = 1 << 5;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "sparc64")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// On ARM, cache line sizes are fixed for both v6 and v7.
+// Board- or platform-specific overrides may need to be added later.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "thumbv6")]
+pub const ALIGNMENT: usize = 1 << 5;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "thumbv7")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// The operating system's cache size determines this.
+// There is currently no way to determine it without runtime inference.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "wasm32")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// Same as v6 and v7.
+// The list goes as follows:
+// Cortex-A, -M, -R, ARMv7, v7-M, Krait and Neoverse N use this size.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "arm")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// Combined from 4 sectors. Volta says 128.
+// To avoid hindering chunk optimizations, it is better to stay with the default size.
+// If you have smaller data with less padding, use 32 with the force option.
+// - https://devtalk.nvidia.com/default/topic/803600/variable-cache-line-width-/
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "nvptx")]
+pub const ALIGNMENT: usize = 1 << 7;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "nvptx64")]
+pub const ALIGNMENT: usize = 1 << 7;
+
+// This size is the same across all hardware for this architecture.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "aarch64")]
+pub const ALIGNMENT: usize = 1 << 6;
diff --git a/src/alloc/mod.rs b/src/alloc/mod.rs
new file mode 100644
index 00000000000..36b9f721583
--- /dev/null
+++ b/src/alloc/mod.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines memory-related functions, such as allocate/deallocate/reallocate memory
+//! regions, cache and allocation alignments.
+
+use std::mem::size_of;
+use std::ptr::NonNull;
+use std::{
+    alloc::{handle_alloc_error, Layout},
+    sync::atomic::AtomicIsize,
+};
+
+use crate::types::NativeType;
+
+mod alignment;
+
+pub use alignment::ALIGNMENT;
+
+// If this number is not zero after all objects have been dropped, there is a memory leak
+static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0);
+
+/// # Safety
+/// This pointer may only be used to check if memory is allocated.
+#[inline]
+pub unsafe fn dangling<T: NativeType>() -> NonNull<T> {
+    NonNull::new_unchecked(ALIGNMENT as *mut T)
+}
+
+/// Allocates a cache-aligned memory region able to hold `size` items of `T`, with uninitialized values.
+/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have
+/// an unknown or non-zero value and is semantically similar to `malloc`.
+pub fn allocate_aligned<T: NativeType>(size: usize) -> NonNull<T> {
+    unsafe {
+        if size == 0 {
+            dangling()
+        } else {
+            let size = size * size_of::<T>();
+            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);
+
+            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
+            let raw_ptr = std::alloc::alloc(layout) as *mut T;
+            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
+        }
+    }
+}
+
+/// Allocates a cache-aligned memory region able to hold `size` items of `T`, with all bytes set to `0`.
+/// This is more performant than using [allocate_aligned] and setting all bytes to zero
+/// and is semantically similar to `calloc`.
+pub fn allocate_aligned_zeroed<T: NativeType>(size: usize) -> NonNull<T> {
+    unsafe {
+        if size == 0 {
+            dangling()
+        } else {
+            let size = size * size_of::<T>();
+            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);
+
+            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
+            let raw_ptr = std::alloc::alloc_zeroed(layout) as *mut T;
+            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
+        }
+    }
+}
+
+/// Frees memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
+/// # Safety
+/// This function is sound iff:
+///
+/// * `ptr` was allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`]
+/// * `size` is the same size that was used to allocate that block of memory.
+pub unsafe fn free_aligned<T: NativeType>(ptr: NonNull<T>, size: usize) {
+    if size != 0 {
+        let size = size * size_of::<T>();
+        ALLOCATIONS.fetch_sub(size as isize, std::sync::atomic::Ordering::SeqCst);
+        std::alloc::dealloc(
+            ptr.as_ptr() as *mut u8,
+            Layout::from_size_align_unchecked(size, ALIGNMENT),
+        );
+    }
+}
+
+/// Reallocates memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
+/// # Safety
+/// This function is sound iff `ptr` was previously allocated by `allocate_aligned` or `allocate_aligned_zeroed` for `old_size` items.
+pub unsafe fn reallocate<T: NativeType>(
+    ptr: NonNull<T>,
+    old_size: usize,
+    new_size: usize,
+) -> NonNull<T> {
+    if old_size == 0 {
+        return allocate_aligned(new_size);
+    }
+
+    if new_size == 0 {
+        free_aligned(ptr, old_size);
+        return dangling();
+    }
+    let old_size = old_size * size_of::<T>();
+    let new_size = new_size * size_of::<T>();
+
+    ALLOCATIONS.fetch_add(
+        new_size as isize - old_size as isize,
+        std::sync::atomic::Ordering::SeqCst,
+    );
+    let raw_ptr = std::alloc::realloc(
+        ptr.as_ptr() as *mut u8,
+        Layout::from_size_align_unchecked(old_size, ALIGNMENT),
+        new_size,
+    ) as *mut T;
+    NonNull::new(raw_ptr).unwrap_or_else(|| {
+        handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT))
+    })
+}
diff --git a/src/buffer/bytes.rs b/src/buffer/bytes.rs
index 65a959a91ea..e057f0fd942 100644
--- a/src/buffer/bytes.rs
+++ b/src/buffer/bytes.rs
@@ -7,6 +7,8 @@ use std::{ptr::NonNull, sync::Arc};
 
 use crate::ffi;
 use crate::types::NativeType;
+#[cfg(feature = "cache_aligned")]
+use crate::vec::AlignedVec as Vec;
 
 /// Mode of deallocating memory regions
 pub enum Deallocation {
@@ -89,7 +91,10 @@ impl<T: NativeType> Drop for Bytes<T> {
     fn drop(&mut self) {
         match &self.deallocation {
             Deallocation::Native(capacity) => unsafe {
-                Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
+                #[cfg(feature = "cache_aligned")]
+                let _ = Vec::from_raw_parts(self.ptr, self.len, *capacity);
+                #[cfg(not(feature = "cache_aligned"))]
+                let _ = Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
             },
             // foreign interface knows how to deallocate itself.
             Deallocation::Foreign(_) => (),
diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs
index cd307434671..5632b12d90a 100644
--- a/src/buffer/immutable.rs
+++ b/src/buffer/immutable.rs
@@ -52,6 +52,16 @@ impl<T: NativeType> Buffer<T> {
         MutableBuffer::from_len_zeroed(length).into()
     }
 
+    /// Takes ownership of [`Vec`].
+    /// # Implementation
+    /// This function is `O(1)`.
+    #[cfg(not(feature = "cache_aligned"))]
+    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
+    #[inline]
+    pub fn from_vec(data: Vec<T>) -> Self {
+        MutableBuffer::from_vec(data).into()
+    }
+
     /// Auxiliary method to create a new Buffer
     pub(crate) fn from_bytes(bytes: Bytes<T>) -> Self {
         let length = bytes.len();
diff --git a/src/buffer/mutable.rs b/src/buffer/mutable.rs
index c5cfa795ff2..281917e836e 100644
--- a/src/buffer/mutable.rs
+++ b/src/buffer/mutable.rs
@@ -6,13 +6,15 @@ use crate::trusted_len::TrustedLen;
 use crate::types::{BitChunk, NativeType};
 
 use super::bytes::{Bytes, Deallocation};
+#[cfg(feature = "cache_aligned")]
+use crate::vec::AlignedVec as Vec;
 
 use super::immutable::Buffer;
 
 /// A [`MutableBuffer`] is this crates' interface to store types that are byte-like such as `i32`.
-/// It behaves like a [`Vec`], with the following differences:
-/// * memory is allocated along cache lines and in multiple of 64 bytes.
-/// * it can only hold types supported by the arrow format (`u8-u64`, `i8-i128`, `f32,f64` and [`crate::types::days_ms`])
+/// It behaves like a [`Vec`] but can only hold types supported by the arrow format
+/// (`u8-u64`, `i8-i128`, `f32,f64`, [`crate::types::days_ms`] and [`crate::types::months_days_ns`]).
+/// When the feature `cache_aligned` is active, memory is allocated along cache lines and in multiples of 64 bytes.
 /// A [`MutableBuffer`] can be converted to a [`Buffer`] via `.into`.
 /// # Example
 /// ```
@@ -28,6 +30,14 @@ pub struct MutableBuffer<T: NativeType> {
     data: Vec<T>,
 }
 
+#[cfg(not(feature = "cache_aligned"))]
+#[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
+impl<T: NativeType> From<MutableBuffer<T>> for Vec<T> {
+    fn from(data: MutableBuffer<T>) -> Self {
+        data.data
+    }
+}
+
 impl<T: NativeType> std::fmt::Debug for MutableBuffer<T> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         std::fmt::Debug::fmt(&**self, f)
@@ -55,6 +65,14 @@ impl<T: NativeType> MutableBuffer<T> {
         }
     }
 
+    /// Takes ownership of [`Vec`].
+    #[cfg(not(feature = "cache_aligned"))]
+    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
+    #[inline]
+    pub fn from_vec(data: Vec<T>) -> Self {
+        Self { data }
+    }
+
     /// Allocates a new [MutableBuffer] with `len` and capacity to be at least `len`
     /// where data is zeroed.
     /// # Example
@@ -68,9 +86,11 @@ impl<T: NativeType> MutableBuffer<T> {
     /// ```
     #[inline]
    pub fn from_len_zeroed(len: usize) -> Self {
-        Self {
-            data: vec![T::default(); len],
-        }
+        #[cfg(not(feature = "cache_aligned"))]
+        let data = vec![T::default(); len];
+        #[cfg(feature = "cache_aligned")]
+        let data = Vec::from_len_zeroed(len);
+        Self { data }
     }
 
     /// Ensures that this buffer has at least `self.len + additional` bytes. This re-allocates iff
diff --git a/src/lib.rs b/src/lib.rs
index 7a2543d6028..a6ca6598496 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,6 +4,8 @@
 #[macro_use]
 pub mod array;
+#[cfg(feature = "cache_aligned")]
+mod alloc;
 pub mod bitmap;
 pub mod buffer;
 mod endianess;
@@ -11,6 +13,8 @@ pub mod error;
 pub mod scalar;
 pub mod trusted_len;
 pub mod types;
+#[cfg(feature = "cache_aligned")]
+mod vec;
 
 #[cfg(feature = "compute")]
 #[cfg_attr(docsrs, doc(cfg(feature = "compute")))]
diff --git a/tests/it/buffer/immutable.rs b/tests/it/buffer/immutable.rs
index db969849053..5fbf711ec5f 100644
--- a/tests/it/buffer/immutable.rs
+++ b/tests/it/buffer/immutable.rs
@@ -67,3 +67,11 @@ fn debug() {
     let a = format!("{:?}", buffer);
     assert_eq!(a, "[1, 2]")
 }
+
+#[cfg(not(feature = "cache_aligned"))]
+#[test]
+fn from_vec() {
+    let buffer = Buffer::<i32>::from_vec(vec![0, 1, 2]);
+    assert_eq!(buffer.len(), 3);
+    assert_eq!(buffer.as_slice(), &[0, 1, 2]);
+}
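Usage sketch (not part of the patch): assuming the crate is consumed as `arrow2` with the default feature set, i.e. without `cache_aligned`, the `from_vec` constructors and the `From<MutableBuffer<T>> for Vec<T>` impl introduced above could be exercised roughly as follows; the element type `i32` is chosen arbitrarily for illustration.

```rust
use arrow2::buffer::{Buffer, MutableBuffer};

fn main() {
    // `Buffer::from_vec` takes ownership of the Vec in O(1): no copy is made.
    let buffer = Buffer::<i32>::from_vec(vec![0, 1, 2]);
    assert_eq!(buffer.as_slice(), &[0, 1, 2]);

    // Without `cache_aligned`, a MutableBuffer wraps a plain Vec<T>,
    // so it can be converted back into a Vec<T> without copying.
    let mutable = MutableBuffer::from_vec(vec![4i32, 5, 6]);
    let back: Vec<i32> = mutable.into();
    assert_eq!(back, vec![4, 5, 6]);
}
```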