From b9b02e73de758ae581ac4269d9587c8e855fa04e Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Wed, 3 Nov 2021 15:22:47 +0100
Subject: [PATCH 1/3] make substring kernel work on utf8 data

---
 src/compute/substring.rs      | 83 ++++++++++++++++++++++-------------
 tests/it/compute/substring.rs |  8 ++++
 2 files changed, 61 insertions(+), 30 deletions(-)
diff --git a/src/compute/substring.rs b/src/compute/substring.rs
index cc1667e7b15..7ec9d35559c 100644
--- a/src/compute/substring.rs
+++ b/src/compute/substring.rs
@@ -26,46 +26,69 @@ use crate::{
 fn utf8_substring<O: Offset>(array: &Utf8Array<O>, start: O, length: &Option<O>) -> Utf8Array<O> {
     let validity = array.validity();
     let offsets = array.offsets();
-    let values = array.values();
+    let values = array.values().as_slice();
 
     let mut new_offsets = MutableBuffer::<O>::with_capacity(array.len() + 1);
     let mut new_values = MutableBuffer::<u8>::new(); // we have no way to estimate how much this will be.
 
-    let mut length_so_far = O::zero();
-    new_offsets.push(length_so_far);
+    new_offsets.push(O::zero());
 
     offsets.windows(2).for_each(|windows| {
-        let length_i: O = windows[1] - windows[0];
+        // Safety:
+        // invariant of the struct that these values are utf8
+        let str_val = unsafe {
+            std::str::from_utf8_unchecked(&values[windows[0].to_usize()..windows[1].to_usize()])
+        };
 
         // compute where we should start slicing this entry
-        let start = windows[0]
-            + if start >= O::zero() {
-                start
-            } else {
-                length_i + start
-            };
-        let start = start.max(windows[0]).min(windows[1]);
-
-        let length: O = length
-            .unwrap_or(length_i)
-            // .max(0) is not needed as it is guaranteed
-            .min(windows[1] - start); // so we do not go beyond this entry
-        length_so_far += length;
-        new_offsets.push(length_so_far);
-
-        // we need usize for ranges
-        let start = start.to_usize();
-        let length = length.to_usize();
-
-        new_values.extend_from_slice(&values[start..start + length]);
+        let start = if start >= O::zero() {
+            start.to_usize()
+        } else {
+            let start = (O::zero() - start).to_usize();
+            str_val
+                .char_indices()
+                .rev()
+                .nth(start)
+                .map(|(idx, _)| idx + 1)
+                .unwrap_or(0)
+        };
+
+        let mut iter_chars = str_val.char_indices();
+        if let Some((start_idx, _char)) = iter_chars.nth(start) {
+            // length till end of str
+            let len_end = str_val.len() - start_idx;
+
+            // length to slice
+            let length = length.map(|v| v.to_usize()).unwrap_or(len_end);
+
+            // index of the char with offset `start`, and length: `length`
+            let end_idx = iter_chars
+                .nth(length.saturating_sub(1))
+                .map(|(idx, _)| idx)
+                .unwrap_or(str_val.len());
+            if length != 0 {
+                debug_assert!(std::str::from_utf8(
+                    &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx]
+                )
+                .is_ok());
+                new_values.extend_from_slice(
+                    &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx],
+                );
+            }
+        }
+        new_offsets.push(O::from_usize(new_values.len()).unwrap());
     });
 
-    Utf8Array::<O>::from_data(
-        array.data_type().clone(),
-        new_offsets.into(),
-        new_values.into(),
-        validity.cloned(),
-    )
+    // Safety:
+    // we deal with valid utf8
+    unsafe {
+        Utf8Array::<O>::from_data_unchecked(
+            array.data_type().clone(),
+            new_offsets.into(),
+            new_values.into(),
+            validity.cloned(),
+        )
+    }
 }
 
 fn binary_substring<O: Offset>(
diff --git a/tests/it/compute/substring.rs b/tests/it/compute/substring.rs
index 3ab1ca95a4c..365615cd51f 100644
--- a/tests/it/compute/substring.rs
+++ b/tests/it/compute/substring.rs
@@ -48,6 +48,7 @@ fn with_nulls_utf8<O: Offset>() -> Result<()> {
 
             let result = result.as_any().downcast_ref::<Utf8Array<O>>().unwrap();
             let expected = Utf8Array::<O>::from(&expected);
+
             assert_eq!(&expected, result);
             Ok(())
         })?;
@@ -117,6 +118,13 @@ fn without_nulls_utf8<O: Offset>() -> Result<()> {
             Some(4),
             vec!["llo", "", "ord"],
         ),
+        (
+            vec!["😇🔥🥺", "", "😇🔥🗺️"],
+            0,
+            Some(2),
+            vec!["😇🔥", "", "😇🔥"],
+        ),
+        (vec!["π1π", "", "α1απ"], 1, Some(4), vec!["1π", "", "1απ"]),
     ];
 
     cases

From a51ef15710e4e119d6367f66be30f45d8352ffc4 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao" <jorgecarleitao@gmail.com>
Date: Thu, 4 Nov 2021 06:25:30 +0000
Subject: [PATCH 2/3] Removed unsafe and bound checks.

---
 src/compute/substring.rs | 56 ++++++++++++----------------------------
 1 file changed, 16 insertions(+), 40 deletions(-)

diff --git a/src/compute/substring.rs b/src/compute/substring.rs
index 7ec9d35559c..3039f1521d6 100644
--- a/src/compute/substring.rs
+++ b/src/compute/substring.rs
@@ -24,23 +24,10 @@ use crate::{
 };
 
 fn utf8_substring<O: Offset>(array: &Utf8Array<O>, start: O, length: &Option<O>) -> Utf8Array<O> {
-    let validity = array.validity();
-    let offsets = array.offsets();
-    let values = array.values().as_slice();
-
-    let mut new_offsets = MutableBuffer::<O>::with_capacity(array.len() + 1);
-    let mut new_values = MutableBuffer::<u8>::new(); // we have no way to estimate how much this will be.
-
-    new_offsets.push(O::zero());
+    let length = length.map(|v| v.to_usize());
 
-    offsets.windows(2).for_each(|windows| {
-        // Safety:
-        // invariant of the struct that these values are utf8
-        let str_val = unsafe {
-            std::str::from_utf8_unchecked(&values[windows[0].to_usize()..windows[1].to_usize()])
-        };
-
-        // compute where we should start slicing this entry
+    let iter = array.values_iter().map(|str_val| {
+        // compute where we should start slicing this entry.
         let start = if start >= O::zero() {
             start.to_usize()
         } else {
@@ -54,41 +41,30 @@ fn utf8_substring<O: Offset>(array: &Utf8Array<O>, start: O, length: &Option<O>)
         };
 
         let mut iter_chars = str_val.char_indices();
-        if let Some((start_idx, _char)) = iter_chars.nth(start) {
-            // length till end of str
+        if let Some((start_idx, _)) = iter_chars.nth(start) {
+            // length of the str
             let len_end = str_val.len() - start_idx;
 
             // length to slice
-            let length = length.map(|v| v.to_usize()).unwrap_or(len_end);
+            let length = length.unwrap_or(len_end);
 
-            // index of the char with offset `start`, and length: `length`
+            if length == 0 {
+                return "";
+            }
+            // compute
             let end_idx = iter_chars
                 .nth(length.saturating_sub(1))
                 .map(|(idx, _)| idx)
                 .unwrap_or(str_val.len());
-            if length != 0 {
-                debug_assert!(std::str::from_utf8(
-                    &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx]
-                )
-                .is_ok());
-                new_values.extend_from_slice(
-                    &values[windows[0].to_usize() + start_idx..windows[0].to_usize() + end_idx],
-                );
-            }
+
+            &str_val[start_idx..end_idx]
+        } else {
+            ""
         }
-        new_offsets.push(O::from_usize(new_values.len()).unwrap());
     });
 
-    // Safety:
-    // we deal with valid utf8
-    unsafe {
-        Utf8Array::<O>::from_data_unchecked(
-            array.data_type().clone(),
-            new_offsets.into(),
-            new_values.into(),
-            validity.cloned(),
-        )
-    }
+    let new = Utf8Array::<O>::from_trusted_len_values_iter(iter);
+    new.with_validity(array.validity().map(|x| x.clone()))
 }
 
 fn binary_substring<O: Offset>(

From 4851f34e8c06820d582792e612bd66bd802c9321 Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Thu, 4 Nov 2021 09:11:31 +0100
Subject: [PATCH 3/3] Update src/compute/substring.rs

Co-authored-by: Jorge Leitao <jorgecarleitao@gmail.com>
---
 src/compute/substring.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compute/substring.rs b/src/compute/substring.rs
index 3039f1521d6..56ad9239775 100644
--- a/src/compute/substring.rs
+++ b/src/compute/substring.rs
@@ -64,7 +64,7 @@ fn utf8_substring<O: Offset>(array: &Utf8Array<O>, start: O, length: &Option<O>)
     });
 
     let new = Utf8Array::<O>::from_trusted_len_values_iter(iter);
-    new.with_validity(array.validity().map(|x| x.clone()))
+    new.with_validity(array.validity().cloned())
 }
 
 fn binary_substring<O: Offset>(