Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
make substring kernel work on utf8 data
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Nov 3, 2021
1 parent ed8836f commit 8208f13
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 27 deletions.
92 changes: 65 additions & 27 deletions src/compute/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,19 @@ use crate::{
error::{ArrowError, Result},
};

/// Returns the sub-slice of `string` covering the characters in the
/// half-open range `[start, end)`, where both bounds are counted in Unicode
/// scalar values (`char`s), not bytes.
///
/// Returns `None` when:
/// * `start > end`,
/// * `start` is not the index of an existing character (this includes the
///   empty string and `start == number of chars`), or
/// * `end` is larger than the number of characters in `string`.
///
/// An empty range (`start == end`) at a valid `start` yields `Some("")`.
fn get_utf8_slice(string: &str, start: usize, end: usize) -> Option<&str> {
    // Number of characters requested; rejects inverted ranges without underflowing.
    let char_len = end.checked_sub(start)?;

    // Byte offset at which the `start`-th character begins.
    let (start_pos, _) = string.char_indices().nth(start)?;
    let tail = &string[start_pos..];

    if char_len == 0 {
        return Some(&tail[..0]);
    }

    // Locate the last requested character. Offsets yielded here are relative
    // to `tail`, so they are only ever used to index `tail` itself.
    let (last_pos, last_char) = tail.char_indices().nth(char_len - 1)?;

    // The slice must end *after* the last requested character, so add its
    // encoded width to the offset at which it starts.
    Some(&tail[..last_pos + last_char.len_utf8()])
}

fn utf8_substring<O: Offset>(array: &Utf8Array<O>, start: O, length: &Option<O>) -> Utf8Array<O> {
let validity = array.validity();
let offsets = array.offsets();
let values = array.values();
let values = array.values().as_slice();

let mut new_offsets = MutableBuffer::<O>::with_capacity(array.len() + 1);
let mut new_values = MutableBuffer::<u8>::new(); // we have no way to estimate how much this will be.
Expand All @@ -35,37 +44,66 @@ fn utf8_substring<O: Offset>(array: &Utf8Array<O>, start: O, length: &Option<O>)
new_offsets.push(length_so_far);

offsets.windows(2).for_each(|windows| {
let length_i: O = windows[1] - windows[0];
// Safety:
// invariant of the struct that these values are utf8
let str_val = unsafe {
std::str::from_utf8_unchecked(&values[windows[0].to_usize()..windows[1].to_usize()])
};

// compute where we should start slicing this entry
let start = windows[0]
+ if start >= O::zero() {
start
let start = if start >= O::zero() {
start.to_usize()
} else {
let start = (O::zero() - start).to_usize();
str_val
.char_indices()
.rev()
.nth(start)
.map(|(idx, _)| idx + 1)
.unwrap_or(0)
};

let mut iter_chars = str_val.char_indices();
let length = if let Some((start_idx, _char)) = iter_chars.nth(start) {
// length till end of str
let len_end = str_val.len() - start_idx;

// length to slice
let length = length.map(|v| v.to_usize()).unwrap_or(len_end);

// index of the char with offset `start`, and length: `length`
let end_idx = iter_chars
.nth(length.saturating_sub(1))
.map(|(idx, _)| idx)
.unwrap_or(str_val.len());
if length != 0 {
debug_assert!(std::str::from_utf8(
&values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx]
)
.is_ok());
new_values.extend_from_slice(
&values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx],
);
end_idx
} else {
length_i + start
};
let start = start.max(windows[0]).min(windows[1]);

let length: O = length
.unwrap_or(length_i)
// .max(0) is not needed as it is guaranteed
.min(windows[1] - start); // so we do not go beyond this entry
length_so_far += length;
new_offsets.push(length_so_far);

// we need usize for ranges
let start = start.to_usize();
let length = length.to_usize();

new_values.extend_from_slice(&values[start..start + length]);
0
}
} else {
0
};
new_offsets.push(O::from_usize(new_values.len()).unwrap());
});

Utf8Array::<O>::from_data(
array.data_type().clone(),
new_offsets.into(),
new_values.into(),
validity.cloned(),
)
// Safety:
// we deal with valid utf8
unsafe {
Utf8Array::<O>::from_data_unchecked(
array.data_type().clone(),
new_offsets.into(),
new_values.into(),
validity.cloned(),
)
}
}

fn binary_substring<O: Offset>(
Expand Down
8 changes: 8 additions & 0 deletions tests/it/compute/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ fn with_nulls_utf8<O: Offset>() -> Result<()> {

let result = result.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
let expected = Utf8Array::<O>::from(&expected);

assert_eq!(&expected, result);
Ok(())
})?;
Expand Down Expand Up @@ -117,6 +118,13 @@ fn without_nulls_utf8<O: Offset>() -> Result<()> {
Some(4),
vec!["llo", "", "ord"],
),
(
vec!["😇🔥🥺", "", "😇🔥🗺️"],
0,
Some(2),
vec!["😇🔥", "", "😇🔥"],
),
(vec!["π1π", "", "α1απ"], 1, Some(4), vec!["1π", "", "1απ"]),
];

cases
Expand Down

0 comments on commit 8208f13

Please sign in to comment.