From 70b116862f99621d692eb61315ad1db63d016ac7 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Wed, 4 Aug 2021 05:27:02 +0000 Subject: [PATCH] Added IndexRange to improve performance in compute. --- src/array/mod.rs | 2 +- src/array/specification.rs | 128 +------------------------- src/compute/sort/boolean.rs | 3 +- src/compute/sort/common.rs | 15 ++- src/compute/sort/lex_sort.rs | 3 +- src/compute/sort/mod.rs | 2 +- src/compute/sort/primitive/indices.rs | 4 +- src/compute/sort/utf8.rs | 3 +- src/compute/take/mod.rs | 4 +- src/types/index.rs | 113 +++++++++++++++++++++++ src/types/mod.rs | 2 + 11 files changed, 134 insertions(+), 145 deletions(-) create mode 100644 src/types/index.rs diff --git a/src/array/mod.rs b/src/array/mod.rs index ac9dfa79e67..772c49b46c8 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -397,7 +397,7 @@ pub use fixed_size_list::FixedSizeListArray; pub use list::{ListArray, MutableListArray}; pub use null::NullArray; pub use primitive::*; -pub use specification::{Index, Offset}; +pub use specification::Offset; pub use struct_::StructArray; pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter}; diff --git a/src/array/specification.rs b/src/array/specification.rs index 0684d9a6f54..2db3d76293e 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -2,29 +2,13 @@ use std::convert::TryFrom; use num::Num; -use crate::{ - buffer::{Buffer, MutableBuffer}, - types::{NativeType, NaturalDataType}, -}; - -/// Trait describing any type that can be used to index a slot of an array. -pub trait Index: NativeType + NaturalDataType { - fn to_usize(&self) -> usize; - fn from_usize(index: usize) -> Option; - fn is_usize() -> bool { - false - } - - fn buffer_from_range(start: usize, end: usize) -> Option>; -} +use crate::{buffer::Buffer, types::Index}; /// Trait describing types that can be used as offsets as per Arrow specification. /// This trait is only implemented for `i32` and `i64`, the two sizes part of the specification. /// # Safety /// Do not implement. -pub unsafe trait Offset: - Index + Num + Ord + std::ops::AddAssign + std::ops::Sub + num::CheckedAdd -{ +pub unsafe trait Offset: Index + Num + Ord + num::CheckedAdd { fn is_large() -> bool; fn to_isize(&self) -> isize; @@ -66,114 +50,6 @@ unsafe impl Offset for i64 { } } -impl Index for i32 { - #[inline] - fn to_usize(&self) -> usize { - *self as usize - } - - #[inline] - fn from_usize(value: usize) -> Option { - Self::try_from(value).ok() - } - - fn buffer_from_range(start: usize, end: usize) -> Option> { - let start = Self::from_usize(start); - let end = Self::from_usize(end); - match (start, end) { - (Some(start), Some(end)) => unsafe { - Some(MutableBuffer::::from_trusted_len_iter_unchecked( - start..end, - )) - }, - _ => None, - } - } -} - -impl Index for i64 { - #[inline] - fn to_usize(&self) -> usize { - *self as usize - } - - #[inline] - fn from_usize(value: usize) -> Option { - Self::try_from(value).ok() - } - - fn buffer_from_range(start: usize, end: usize) -> Option> { - let start = Self::from_usize(start); - let end = Self::from_usize(end); - match (start, end) { - (Some(start), Some(end)) => unsafe { - Some(MutableBuffer::::from_trusted_len_iter_unchecked( - start..end, - )) - }, - _ => None, - } - } -} - -impl Index for u32 { - #[inline] - fn to_usize(&self) -> usize { - *self as usize - } - - #[inline] - fn from_usize(value: usize) -> Option { - Self::try_from(value).ok() - } - - fn is_usize() -> bool { - std::mem::size_of::() == std::mem::size_of::() - } - - fn buffer_from_range(start: usize, end: usize) -> Option> { - let start = Self::from_usize(start); - let end = Self::from_usize(end); - match (start, end) { - (Some(start), Some(end)) => unsafe { - Some(MutableBuffer::::from_trusted_len_iter_unchecked( - start..end, - )) - }, - _ => None, - } - } -} - -impl Index for u64 { - #[inline] - fn to_usize(&self) -> usize { - *self as usize - } - - #[inline] - fn from_usize(value: usize) -> Option { - Self::try_from(value).ok() - } - - fn is_usize() -> bool { - std::mem::size_of::() == std::mem::size_of::() - } - - fn buffer_from_range(start: usize, end: usize) -> Option> { - let start = Self::from_usize(start); - let end = Self::from_usize(end); - match (start, end) { - (Some(start), Some(end)) => unsafe { - Some(MutableBuffer::::from_trusted_len_iter_unchecked( - start..end, - )) - }, - _ => None, - } - } -} - #[inline] pub fn check_offsets(offsets: &Buffer, values_len: usize) -> usize { assert!( diff --git a/src/compute/sort/boolean.rs b/src/compute/sort/boolean.rs index 6afdd551ec1..79ad428576f 100644 --- a/src/compute/sort/boolean.rs +++ b/src/compute/sort/boolean.rs @@ -1,6 +1,7 @@ use crate::{ - array::{Array, BooleanArray, Index, PrimitiveArray}, + array::{Array, BooleanArray, PrimitiveArray}, buffer::MutableBuffer, + types::Index, }; use super::SortOptions; diff --git a/src/compute/sort/common.rs b/src/compute/sort/common.rs index 7e909e01cd6..b2fab91f50a 100644 --- a/src/compute/sort/common.rs +++ b/src/compute/sort/common.rs @@ -1,8 +1,4 @@ -use crate::{ - array::{Index, PrimitiveArray}, - bitmap::Bitmap, - buffer::MutableBuffer, -}; +use crate::{array::PrimitiveArray, bitmap::Bitmap, buffer::MutableBuffer, types::Index}; use super::SortOptions; @@ -106,13 +102,13 @@ where let mut valids = 0; validity .iter() - .zip(0..length) + .zip(I::range(0, length).unwrap()) .for_each(|(is_valid, index)| { if is_valid { - indices[validity.null_count() + valids] = I::from_usize(index).unwrap(); + indices[validity.null_count() + valids] = index; valids += 1; } else { - indices[nulls] = I::from_usize(index).unwrap(); + indices[nulls] = index; nulls += 1; } }); @@ -154,7 +150,8 @@ where indices } else { - let mut indices = Index::buffer_from_range(0, length).unwrap(); + let mut indices = MutableBuffer::from_trusted_len_iter(I::range(0, length).unwrap()); + // Soundness: // indices are by construction `< values.len()` // limit is by construction `< values.len()` diff --git a/src/compute/sort/lex_sort.rs b/src/compute/sort/lex_sort.rs index 3a15a10b675..4e8797eba4f 100644 --- a/src/compute/sort/lex_sort.rs +++ b/src/compute/sort/lex_sort.rs @@ -3,8 +3,9 @@ use std::cmp::Ordering; use crate::compute::take; use crate::error::{ArrowError, Result}; use crate::{ - array::{ord, Array, Index, PrimitiveArray}, + array::{ord, Array, PrimitiveArray}, buffer::MutableBuffer, + types::Index, }; use super::{sort_to_indices, SortOptions}; diff --git a/src/compute/sort/mod.rs b/src/compute/sort/mod.rs index 781263f45dc..d1fcd213ad1 100644 --- a/src/compute/sort/mod.rs +++ b/src/compute/sort/mod.rs @@ -6,7 +6,7 @@ use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::{ array::*, - types::{days_ms, NativeType}, + types::{days_ms, Index, NativeType}, }; use crate::buffer::MutableBuffer; diff --git a/src/compute/sort/primitive/indices.rs b/src/compute/sort/primitive/indices.rs index d3d7f76bd36..8b229f1c8b6 100644 --- a/src/compute/sort/primitive/indices.rs +++ b/src/compute/sort/primitive/indices.rs @@ -1,6 +1,6 @@ use crate::{ - array::{Array, Index, PrimitiveArray}, - types::NativeType, + array::{Array, PrimitiveArray}, + types::{Index, NativeType}, }; use super::super::common; diff --git a/src/compute/sort/utf8.rs b/src/compute/sort/utf8.rs index 17a75aef4e9..1776b6b022e 100644 --- a/src/compute/sort/utf8.rs +++ b/src/compute/sort/utf8.rs @@ -1,5 +1,6 @@ -use crate::array::{Array, Index, Offset, PrimitiveArray, Utf8Array}; +use crate::array::{Array, Offset, PrimitiveArray, Utf8Array}; use crate::array::{DictionaryArray, DictionaryKey}; +use crate::types::Index; use super::common; use super::SortOptions; diff --git a/src/compute/take/mod.rs b/src/compute/take/mod.rs index 1517f37e404..87d3cfaa9d9 100644 --- a/src/compute/take/mod.rs +++ b/src/compute/take/mod.rs @@ -21,11 +21,9 @@ use crate::{ array::{new_empty_array, Array, NullArray, PrimitiveArray}, datatypes::{DataType, IntervalUnit}, error::Result, - types::days_ms, + types::{days_ms, Index}, }; -pub use crate::array::Index; - mod binary; mod boolean; mod dict; diff --git a/src/types/index.rs b/src/types/index.rs new file mode 100644 index 00000000000..3a6103f0028 --- /dev/null +++ b/src/types/index.rs @@ -0,0 +1,113 @@ +use std::convert::TryFrom; + +use crate::{ + trusted_len::TrustedLen, + types::{NativeType, NaturalDataType}, +}; + +/// iterator of [`Index`] equivalent to `(a..b)`. +// `Step` is unstable in Rust which does not allow (a..b) for generic `Index`. +pub struct IndexRange { + start: I, + end: I, +} + +impl IndexRange { + pub fn new(start: I, end: I) -> Self { + assert!(end >= start); + Self { start, end } + } +} + +impl Iterator for IndexRange { + type Item = I; + + #[inline] + fn next(&mut self) -> Option { + if self.start == self.end { + return None; + } + let old = self.start; + self.start += I::one(); + Some(old) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = (self.end - self.start).to_usize(); + (len, Some(len)) + } +} + +/// Safety: a range is always of known length +unsafe impl TrustedLen for IndexRange {} + +/// Trait describing any type that can be used to index a slot of an array. +pub trait Index: + NativeType + + NaturalDataType + + std::ops::AddAssign + + std::ops::Sub + + num::One + + PartialOrd +{ + fn to_usize(&self) -> usize; + fn from_usize(index: usize) -> Option; + + fn range(start: usize, end: usize) -> Option> { + let start = Self::from_usize(start); + let end = Self::from_usize(end); + match (start, end) { + (Some(start), Some(end)) => Some(IndexRange::new(start, end)), + _ => None, + } + } +} + +impl Index for i32 { + #[inline] + fn to_usize(&self) -> usize { + *self as usize + } + + #[inline] + fn from_usize(value: usize) -> Option { + Self::try_from(value).ok() + } +} + +impl Index for i64 { + #[inline] + fn to_usize(&self) -> usize { + *self as usize + } + + #[inline] + fn from_usize(value: usize) -> Option { + Self::try_from(value).ok() + } +} + +impl Index for u32 { + #[inline] + fn to_usize(&self) -> usize { + *self as usize + } + + #[inline] + fn from_usize(value: usize) -> Option { + Self::try_from(value).ok() + } +} + +impl Index for u64 { + #[inline] + fn to_usize(&self) -> usize { + *self as usize + } + + #[inline] + fn from_usize(value: usize) -> Option { + Self::try_from(value).ok() + } +} diff --git a/src/types/mod.rs b/src/types/mod.rs index 45167baceb0..fb96b099b22 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -14,7 +14,9 @@ use std::{ mod bit_chunk; pub use bit_chunk::{BitChunk, BitChunkIter}; +mod index; pub mod simd; +pub use index::*; use crate::datatypes::{DataType, IntervalUnit, TimeUnit};