From b9b02e73de758ae581ac4269d9587c8e855fa04e Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 3 Nov 2021 15:22:47 +0100 Subject: [PATCH 1/3] make substring kernel work on utf8 data --- src/compute/substring.rs | 83 ++++++++++++++++++++++------------- tests/it/compute/substring.rs | 8 ++++ 2 files changed, 61 insertions(+), 30 deletions(-) diff --git a/src/compute/substring.rs b/src/compute/substring.rs index cc1667e7b15..7ec9d35559c 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -26,46 +26,69 @@ use crate::{ fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { let validity = array.validity(); let offsets = array.offsets(); - let values = array.values(); + let values = array.values().as_slice(); let mut new_offsets = MutableBuffer::::with_capacity(array.len() + 1); let mut new_values = MutableBuffer::::new(); // we have no way to estimate how much this will be. - let mut length_so_far = O::zero(); - new_offsets.push(length_so_far); + new_offsets.push(O::zero()); offsets.windows(2).for_each(|windows| { - let length_i: O = windows[1] - windows[0]; + // Safety: + // invariant of the struct that these values are utf8 + let str_val = unsafe { + std::str::from_utf8_unchecked(&values[windows[0].to_usize()..windows[1].to_usize()]) + }; // compute where we should start slicing this entry - let start = windows[0] - + if start >= O::zero() { - start - } else { - length_i + start - }; - let start = start.max(windows[0]).min(windows[1]); - - let length: O = length - .unwrap_or(length_i) - // .max(0) is not needed as it is guaranteed - .min(windows[1] - start); // so we do not go beyond this entry - length_so_far += length; - new_offsets.push(length_so_far); - - // we need usize for ranges - let start = start.to_usize(); - let length = length.to_usize(); - - new_values.extend_from_slice(&values[start..start + length]); + let start = if start >= O::zero() { + start.to_usize() + } else { + let start = (O::zero() - start).to_usize(); + str_val + .char_indices() + .rev() + .nth(start) + .map(|(idx, _)| idx + 1) + .unwrap_or(0) + }; + + let mut iter_chars = str_val.char_indices(); + if let Some((start_idx, _char)) = iter_chars.nth(start) { + // length till end of str + let len_end = str_val.len() - start_idx; + + // length to slice + let length = length.map(|v| v.to_usize()).unwrap_or(len_end); + + // index of the char with offset `start`, and length: `length` + let end_idx = iter_chars + .nth(length.saturating_sub(1)) + .map(|(idx, _)| idx) + .unwrap_or(str_val.len()); + if length != 0 { + debug_assert!(std::str::from_utf8( + &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx] + ) + .is_ok()); + new_values.extend_from_slice( + &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx], + ); + } + } + new_offsets.push(O::from_usize(new_values.len()).unwrap()); }); - Utf8Array::::from_data( - array.data_type().clone(), - new_offsets.into(), - new_values.into(), - validity.cloned(), - ) + // Safety: + // we deal with valid utf8 + unsafe { + Utf8Array::::from_data_unchecked( + array.data_type().clone(), + new_offsets.into(), + new_values.into(), + validity.cloned(), + ) + } } fn binary_substring( diff --git a/tests/it/compute/substring.rs b/tests/it/compute/substring.rs index 3ab1ca95a4c..365615cd51f 100644 --- a/tests/it/compute/substring.rs +++ b/tests/it/compute/substring.rs @@ -48,6 +48,7 @@ fn with_nulls_utf8() -> Result<()> { let result = result.as_any().downcast_ref::>().unwrap(); let expected = Utf8Array::::from(&expected); + assert_eq!(&expected, result); Ok(()) })?; @@ -117,6 +118,13 @@ fn without_nulls_utf8() -> Result<()> { Some(4), vec!["llo", "", "ord"], ), + ( + vec!["πŸ˜‡πŸ”₯πŸ₯Ί", "", "πŸ˜‡πŸ”₯πŸ—ΊοΈ"], + 0, + Some(2), + vec!["πŸ˜‡πŸ”₯", "", "πŸ˜‡πŸ”₯"], + ), + (vec!["Ο€1Ο€", "", "Ξ±1Ξ±Ο€"], 1, Some(4), vec!["1Ο€", "", "1Ξ±Ο€"]), ]; cases From a51ef15710e4e119d6367f66be30f45d8352ffc4 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Thu, 4 Nov 2021 06:25:30 +0000 Subject: [PATCH 2/3] Removed unsafe and bound checks. --- src/compute/substring.rs | 56 ++++++++++++---------------------------- 1 file changed, 16 insertions(+), 40 deletions(-) diff --git a/src/compute/substring.rs b/src/compute/substring.rs index 7ec9d35559c..3039f1521d6 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -24,23 +24,10 @@ use crate::{ }; fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { - let validity = array.validity(); - let offsets = array.offsets(); - let values = array.values().as_slice(); - - let mut new_offsets = MutableBuffer::::with_capacity(array.len() + 1); - let mut new_values = MutableBuffer::::new(); // we have no way to estimate how much this will be. - - new_offsets.push(O::zero()); + let length = length.map(|v| v.to_usize()); - offsets.windows(2).for_each(|windows| { - // Safety: - // invariant of the struct that these values are utf8 - let str_val = unsafe { - std::str::from_utf8_unchecked(&values[windows[0].to_usize()..windows[1].to_usize()]) - }; - - // compute where we should start slicing this entry + let iter = array.values_iter().map(|str_val| { + // compute where we should start slicing this entry. let start = if start >= O::zero() { start.to_usize() } else { @@ -54,41 +41,30 @@ fn utf8_substring(array: &Utf8Array, start: O, length: &Option) }; let mut iter_chars = str_val.char_indices(); - if let Some((start_idx, _char)) = iter_chars.nth(start) { - // length till end of str + if let Some((start_idx, _)) = iter_chars.nth(start) { + // length of the str let len_end = str_val.len() - start_idx; // length to slice - let length = length.map(|v| v.to_usize()).unwrap_or(len_end); + let length = length.unwrap_or(len_end); - // index of the char with offset `start`, and length: `length` + if length == 0 { + return ""; + } + // compute let end_idx = iter_chars .nth(length.saturating_sub(1)) .map(|(idx, _)| idx) .unwrap_or(str_val.len()); - if length != 0 { - debug_assert!(std::str::from_utf8( - &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx] - ) - .is_ok()); - new_values.extend_from_slice( - &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx], - ); - } + + &str_val[start_idx..end_idx] + } else { + "" } - new_offsets.push(O::from_usize(new_values.len()).unwrap()); }); - // Safety: - // we deal with valid utf8 - unsafe { - Utf8Array::::from_data_unchecked( - array.data_type().clone(), - new_offsets.into(), - new_values.into(), - validity.cloned(), - ) - } + let new = Utf8Array::::from_trusted_len_values_iter(iter); + new.with_validity(array.validity().map(|x| x.clone())) } fn binary_substring( From 4851f34e8c06820d582792e612bd66bd802c9321 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 4 Nov 2021 09:11:31 +0100 Subject: [PATCH 3/3] Update src/compute/substring.rs Co-authored-by: Jorge Leitao --- src/compute/substring.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compute/substring.rs b/src/compute/substring.rs index 3039f1521d6..56ad9239775 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -64,7 +64,7 @@ fn utf8_substring(array: &Utf8Array, start: O, length: &Option) }); let new = Utf8Array::::from_trusted_len_values_iter(iter); - new.with_validity(array.validity().map(|x| x.clone())) + new.with_validity(array.validity().cloned()) } fn binary_substring(