Skip to content

Commit

Permalink
Verify valid UTF-8 when converting byte array (#2205) (#2686)
Browse files Browse the repository at this point in the history
* Verify valid UTF-8 when converting byte array (#2205)

* Add doc comment
  • Loading branch information
tustvold authored Sep 11, 2022
1 parent 2d28010 commit 8206f01
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
15 changes: 9 additions & 6 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,13 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
}

/// Convert a list array to a string array.
/// This method is unsound because it does
/// not check the utf-8 validation for each element.
///
/// Note: this performs potentially expensive UTF-8 validation; consider using
/// [`StringBuilder`][crate::array::StringBuilder] to avoid this.
///
/// # Panics
///
/// This method panics if the array contains non-UTF-8 data.
fn from_list(v: GenericListArray<OffsetSize>) -> Self {
assert_eq!(
v.data_ref().child_data().len(),
Expand Down Expand Up @@ -164,8 +169,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
.add_buffer(child_data.buffers()[0].slice(child_data.offset()))
.null_bit_buffer(v.data().null_buffer().cloned());

let array_data = unsafe { builder.build_unchecked() };
Self::from(array_data)
Self::from(builder.build().unwrap())
}

/// Creates a [`GenericStringArray`] based on an iterator of values without nulls
Expand Down Expand Up @@ -352,8 +356,7 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
{
/// Converts a binary array into a string array by taking its underlying data
/// and re-labelling it with `Self::DATA_TYPE`.
///
/// # Panics
///
/// Panics if `build()` returns an error (the result is unwrapped).
/// NOTE(review): per the commit title this is expected to happen for
/// non-UTF-8 data — confirm against `build()`'s validation.
fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
// Reuse the existing array data; only the data type is changed.
let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
// NOTE(review): the next two lines appear to be the pre-change body kept by
// the diff view (the `+`/`-` markers are missing); the `build().unwrap()`
// line after them looks like their replacement — confirm against the commit.
let data = unsafe { builder.build_unchecked() };
Self::from(data)
Self::from(builder.build().unwrap())
}
}

Expand Down
12 changes: 10 additions & 2 deletions arrow/src/array/builder/generic_string_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::{ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait};
use crate::array::{Array, ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait};
use std::any::Any;
use std::sync::Arc;

Expand Down Expand Up @@ -67,7 +67,15 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {

/// Builds the [`GenericStringArray`] and resets this builder.
///
/// The output of the inner builder is re-labelled with the string array's
/// data type and constructed with `build_unchecked`; see the SAFETY note in
/// the body for why that is considered sound.
pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
// NOTE(review): the next line appears to be the pre-change body kept by the
// diff view (the `+`/`-` markers are missing); the statements after it look
// like its replacement — confirm against the commit.
GenericStringArray::<OffsetSize>::from(self.builder.finish())
let t = GenericStringArray::<OffsetSize>::DATA_TYPE;
let v = self.builder.finish();
// Keep the finished array's data, changing only its data type to `t`.
let builder = v.into_data().into_builder().data_type(t);

// SAFETY:
// Data must be UTF-8, as this builder only supports writing `str`.
// Offsets must be valid, as guaranteed by `GenericBinaryBuilder`.
let data = unsafe { builder.build_unchecked() };
data.into()
}

/// Returns the current values buffer as a slice
Expand Down

0 comments on commit 8206f01

Please sign in to comment.