-
Notifications
You must be signed in to change notification settings - Fork 867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speed up the substring kernel by about 2x
#1512
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -24,56 +24,74 @@ use crate::{ | |||||||||||||||||||
error::{ArrowError, Result}, | ||||||||||||||||||||
}; | ||||||||||||||||||||
|
||||||||||||||||||||
#[allow(clippy::unnecessary_wraps)] | ||||||||||||||||||||
fn generic_substring<OffsetSize: StringOffsetSizeTrait>( | ||||||||||||||||||||
array: &GenericStringArray<OffsetSize>, | ||||||||||||||||||||
start: OffsetSize, | ||||||||||||||||||||
length: &Option<OffsetSize>, | ||||||||||||||||||||
) -> Result<ArrayRef> { | ||||||||||||||||||||
// compute current offsets | ||||||||||||||||||||
let offsets = array.data_ref().clone().buffers()[0].clone(); | ||||||||||||||||||||
let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() }; | ||||||||||||||||||||
|
||||||||||||||||||||
// compute null bitmap (copy) | ||||||||||||||||||||
let offsets = array.value_offsets(); | ||||||||||||||||||||
let null_bit_buffer = array.data_ref().null_buffer().cloned(); | ||||||||||||||||||||
|
||||||||||||||||||||
// compute values | ||||||||||||||||||||
let values = &array.data_ref().buffers()[1]; | ||||||||||||||||||||
let values = array.value_data(); | ||||||||||||||||||||
let data = values.as_slice(); | ||||||||||||||||||||
let zero = OffsetSize::zero(); | ||||||||||||||||||||
|
||||||||||||||||||||
let mut new_values = MutableBuffer::new(0); // we have no way to estimate how much this will be. | ||||||||||||||||||||
let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1); | ||||||||||||||||||||
|
||||||||||||||||||||
let mut length_so_far = OffsetSize::zero(); | ||||||||||||||||||||
new_offsets.push(length_so_far); | ||||||||||||||||||||
(0..array.len()).for_each(|i| { | ||||||||||||||||||||
// the length of this entry | ||||||||||||||||||||
let length_i: OffsetSize = offsets[i + 1] - offsets[i]; | ||||||||||||||||||||
// compute where we should start slicing this entry | ||||||||||||||||||||
let start = offsets[i] | ||||||||||||||||||||
+ if start >= OffsetSize::zero() { | ||||||||||||||||||||
start | ||||||||||||||||||||
} else { | ||||||||||||||||||||
length_i + start | ||||||||||||||||||||
}; | ||||||||||||||||||||
|
||||||||||||||||||||
let start = start.max(offsets[i]).min(offsets[i + 1]); | ||||||||||||||||||||
// compute the length of the slice | ||||||||||||||||||||
let length: OffsetSize = length | ||||||||||||||||||||
.unwrap_or(length_i) | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same reason! We don't need to |
||||||||||||||||||||
// .max(0) is not needed as it is guaranteed | ||||||||||||||||||||
.min(offsets[i + 1] - start); // so we do not go beyond this entry | ||||||||||||||||||||
|
||||||||||||||||||||
length_so_far += length; | ||||||||||||||||||||
// calculate the start offset for each substring | ||||||||||||||||||||
// if `start` >= 0 | ||||||||||||||||||||
// then, count from the start of each string | ||||||||||||||||||||
// else, count from the end of each string | ||||||||||||||||||||
let new_starts: Vec<OffsetSize> = if start >= zero { | ||||||||||||||||||||
offsets | ||||||||||||||||||||
.windows(2) | ||||||||||||||||||||
.map(|pair| (pair[0] + start).min(pair[1])) | ||||||||||||||||||||
.collect() | ||||||||||||||||||||
} else { | ||||||||||||||||||||
offsets | ||||||||||||||||||||
.windows(2) | ||||||||||||||||||||
.map(|pair| (pair[1] + start).max(pair[0])) | ||||||||||||||||||||
.collect() | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be better to do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thinking about it, I think actually materializing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to remove There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that is not possible in Rust (I think only by Boxing or manually inlining / macros). Calculating the length in two places with something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
One way is to allow the closure not to capture the environment by adding more parameters, and the compiler will downcast closure to function. This works for calculating There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My later comment mentioned not allocating the new vec but not allocating the array at all (by moving the calculation to the iteration in
|
||||||||||||||||||||
}; | ||||||||||||||||||||
|
||||||||||||||||||||
new_offsets.push(length_so_far); | ||||||||||||||||||||
// count the length of each substring | ||||||||||||||||||||
// if `length` is given | ||||||||||||||||||||
// then, use it | ||||||||||||||||||||
// else, length is `string[new_start..].len()` | ||||||||||||||||||||
let new_length: Vec<OffsetSize> = if let Some(length) = length { | ||||||||||||||||||||
offsets[1..] | ||||||||||||||||||||
.iter() | ||||||||||||||||||||
.zip(new_starts.iter()) | ||||||||||||||||||||
.map(|(end, start)| *(length.min(&(*end - *start)))) | ||||||||||||||||||||
.collect() | ||||||||||||||||||||
} else { | ||||||||||||||||||||
offsets[1..] | ||||||||||||||||||||
.iter() | ||||||||||||||||||||
.zip(new_starts.iter()) | ||||||||||||||||||||
.map(|(end, start)| *end - *start) | ||||||||||||||||||||
.collect() | ||||||||||||||||||||
}; | ||||||||||||||||||||
|
||||||||||||||||||||
// we need usize for ranges | ||||||||||||||||||||
let start = start.to_usize().unwrap(); | ||||||||||||||||||||
let length = length.to_usize().unwrap(); | ||||||||||||||||||||
let new_offsets: Vec<OffsetSize> = [zero] | ||||||||||||||||||||
.iter() | ||||||||||||||||||||
.copied() | ||||||||||||||||||||
.chain(new_length.iter().scan(zero, |len_so_far, &len| { | ||||||||||||||||||||
*len_so_far += len; | ||||||||||||||||||||
Some(*len_so_far) | ||||||||||||||||||||
})) | ||||||||||||||||||||
.collect(); | ||||||||||||||||||||
|
||||||||||||||||||||
new_values.extend_from_slice(&data[start..start + length]); | ||||||||||||||||||||
}); | ||||||||||||||||||||
// concatenate substrings into a buffer | ||||||||||||||||||||
let new_values = { | ||||||||||||||||||||
let mut new_values = | ||||||||||||||||||||
MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap()); | ||||||||||||||||||||
new_starts | ||||||||||||||||||||
.iter() | ||||||||||||||||||||
.zip(new_length.iter()) | ||||||||||||||||||||
.map(|(start, length)| { | ||||||||||||||||||||
(start.to_usize().unwrap(), length.to_usize().unwrap()) | ||||||||||||||||||||
}) | ||||||||||||||||||||
.map(|(start, length)| &data[start..start + length]) | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||||||||||||||||||||
.for_each(|slice| new_values.extend_from_slice(slice)); | ||||||||||||||||||||
new_values | ||||||||||||||||||||
}; | ||||||||||||||||||||
|
||||||||||||||||||||
let data = unsafe { | ||||||||||||||||||||
ArrayData::new_unchecked( | ||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need to compare `start` with `zero` multiple times, so I moved this comparison outside the for loop.