Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Return error when reading Parquet column with invalid utf-8 bytes (#807)
Browse files Browse the repository at this point in the history
  • Loading branch information
danburkert authored Feb 5, 2022
1 parent 7d585ee commit 8330eea
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 52 deletions.
44 changes: 36 additions & 8 deletions src/array/binary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType};
use crate::{
bitmap::Bitmap,
buffer::Buffer,
datatypes::DataType,
error::{ArrowError, Result},
};

use super::{
display_fmt, display_helper,
specification::{check_offsets, check_offsets_minimal},
specification::{check_offsets_minimal, try_check_offsets},
Array, GenericBinaryArray, Offset,
};

Expand Down Expand Up @@ -61,22 +66,45 @@ impl<O: Offset> BinaryArray<O> {
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
check_offsets(&offsets, values.len());
Self::try_new(data_type, offsets, values, validity).unwrap()
}

if let Some(validity) = &validity {
assert_eq!(offsets.len() - 1, validity.len());
/// Creates a new [`BinaryArray`] from lower-level parts.
///
/// This function returns an error iff:
/// * the offsets are not monotonically increasing
/// * The last offset is not equal to the values' length.
/// * the validity's length is not equal to `offsets.len() - 1`.
/// * The `data_type`'s physical type is not equal to `Binary` or `LargeBinary`.
pub fn try_new(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Result<Self> {
try_check_offsets(&offsets, values.len())?;

if validity
.as_ref()
.map_or(false, |validity| validity.len() != offsets.len() - 1)
{
return Err(ArrowError::oos(
"validity mask length must match the number of values",
));
}

if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
return Err(ArrowError::oos(
"BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary",
));
}

Self {
Ok(Self {
data_type,
offsets,
values,
validity,
}
})
}

/// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary`
Expand Down
67 changes: 44 additions & 23 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::error::{ArrowError, Result};
use crate::types::Offset;

pub fn check_offsets_minimal<O: Offset>(offsets: &[O], values_len: usize) -> usize {
Expand All @@ -22,45 +23,65 @@ pub fn check_offsets_minimal<O: Offset>(offsets: &[O], values_len: usize) -> usi
/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
/// * any offset is larger or equal to `values_len`.
pub fn check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) {
const SIMD_CHUNK_SIZE: usize = 64;
try_check_offsets_and_utf8(offsets, values).unwrap()
}

/// # Errors
/// This function returns an error iff:
/// * the `offsets` is not monotonically increasing, or
/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
/// * any offset is larger or equal to `values_len`.
pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Result<()> {
pub fn try_check_offsets_and_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> Result<()> {
const SIMD_CHUNK_SIZE: usize = 64;
if values.is_ascii() {
check_offsets(offsets, values.len());
try_check_offsets(offsets, values.len())
} else {
offsets.windows(2).for_each(|window| {
for window in offsets.windows(2) {
let start = window[0].to_usize();
let end = window[1].to_usize();
// assert monotonicity
assert!(start <= end);
// assert bounds

// check monotonicity
if start > end {
return Err(ArrowError::oos("offsets must be monotonically increasing"));
}

// check bounds
if end > values.len() {
return Err(ArrowError::oos("offsets must not exceed values length"));
};

let slice = &values[start..end];

// Fast ASCII check per item
// fast ASCII check per item
if slice.len() < SIMD_CHUNK_SIZE && slice.is_ascii() {
return;
continue;
}

// assert utf8
simdutf8::basic::from_utf8(slice).expect("A non-utf8 string was passed.");
});
// check utf8
simdutf8::basic::from_utf8(slice)?;
}

Ok(())
}
}

/// # Panics iff:
/// * the `offsets` is not monotonically increasing, or
/// * any offset is larger or equal to `values_len`.
pub fn check_offsets<O: Offset>(offsets: &[O], values_len: usize) {
if offsets.is_empty() {
return;
}
try_check_offsets(offsets, values_len).unwrap()
}

let mut last = offsets[0];
// assert monotonicity
assert!(offsets.iter().skip(1).all(|&end| {
let monotone = last <= end;
last = end;
monotone
}));
// assert bounds
assert!(last.to_usize() <= values_len);
/// Checks that `offsets` is monotonically increasing, and all offsets are less than or equal to
/// `values_len`.
///
/// # Errors
/// Returns an error when a pair of consecutive offsets decreases, or when the final
/// (largest) offset exceeds `values_len`.
pub fn try_check_offsets<O: Offset>(offsets: &[O], values_len: usize) -> Result<()> {
    // Every adjacent pair must be non-decreasing for the offsets to be valid.
    let is_monotone = offsets.windows(2).all(|pair| pair[0] <= pair[1]);
    if !is_monotone {
        return Err(ArrowError::oos("offsets must be monotonically increasing"));
    }

    // Given monotonicity, only the last offset can breach the bound; an empty
    // slice of offsets is trivially in bounds.
    match offsets.last() {
        Some(last) if last.to_usize() > values_len => {
            Err(ArrowError::oos("offsets must not exceed values length"))
        }
        _ => Ok(()),
    }
}
44 changes: 36 additions & 8 deletions src/array/utf8/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType};
use crate::{
bitmap::Bitmap,
buffer::Buffer,
datatypes::DataType,
error::{ArrowError, Result},
};
use either::Either;

use super::{
display_fmt,
specification::{check_offsets_and_utf8, check_offsets_minimal},
specification::{check_offsets_minimal, try_check_offsets_and_utf8},
Array, GenericBinaryArray, Offset,
};

Expand Down Expand Up @@ -77,21 +82,44 @@ impl<O: Offset> Utf8Array<O> {
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
check_offsets_and_utf8(&offsets, &values);
if let Some(ref validity) = validity {
assert_eq!(offsets.len() - 1, validity.len());
Utf8Array::try_new(data_type, offsets, values, validity).unwrap()
}

/// The canonical method to create a [`Utf8Array`] out of low-end APIs.
///
/// This function returns an error iff:
/// * The `data_type`'s physical type is not consistent with the offset `O`.
/// * The `offsets` and `values` are inconsistent
/// * The `values` between consecutive `offsets` are not valid utf8
/// * The validity is not `None` and its length is different from `offsets.len() - 1`.
pub fn try_new(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Result<Self> {
try_check_offsets_and_utf8(&offsets, &values)?;
if validity
.as_ref()
.map_or(false, |validity| validity.len() != offsets.len() - 1)
{
return Err(ArrowError::oos(
"validity mask length must match the number of values",
));
}

if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
return Err(ArrowError::oos(
"Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8",
));
}

Self {
Ok(Self {
data_type,
offsets,
values,
validity,
}
})
}

/// Returns the default [`DataType`], `DataType::Utf8` or `DataType::LargeUtf8`
Expand Down
24 changes: 13 additions & 11 deletions src/io/parquet/read/binary/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,33 +150,35 @@ impl<'a> utils::PageState<'a> for State<'a> {
}

pub trait TraitBinaryArray<O: Offset>: Array + 'static {
fn from_data(
fn try_new(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self;
) -> Result<Self>
where
Self: Sized;
}

impl<O: Offset> TraitBinaryArray<O> for BinaryArray<O> {
fn from_data(
fn try_new(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::from_data(data_type, offsets, values, validity)
) -> Result<Self> {
Self::try_new(data_type, offsets, values, validity)
}
}

impl<O: Offset> TraitBinaryArray<O> for Utf8Array<O> {
fn from_data(
fn try_new(
data_type: DataType,
offsets: Buffer<O>,
values: Buffer<u8>,
validity: Option<Bitmap>,
) -> Self {
Self::from_data(data_type, offsets, values, validity)
) -> Result<Self> {
Self::try_new(data_type, offsets, values, validity)
}
}

Expand Down Expand Up @@ -268,8 +270,8 @@ pub(super) fn finish<O: Offset, A: TraitBinaryArray<O>>(
data_type: &DataType,
values: Binary<O>,
validity: MutableBitmap,
) -> A {
A::from_data(
) -> Result<A> {
A::try_new(
data_type.clone(),
values.offsets.0.into(),
values.values.into(),
Expand Down Expand Up @@ -309,7 +311,7 @@ impl<O: Offset, A: TraitBinaryArray<O>, I: DataPages> Iterator for Iter<O, A, I>
);
match maybe_state {
MaybeNext::Some(Ok((values, validity))) => {
Some(Ok(finish(&self.data_type, values, validity)))
Some(finish(&self.data_type, values, validity))
}
MaybeNext::Some(Err(e)) => Some(Err(e)),
MaybeNext::None => None,
Expand Down
2 changes: 1 addition & 1 deletion src/io/parquet/read/binary/nested.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ impl<O: Offset, A: TraitBinaryArray<O>, I: DataPages> Iterator for ArrayIterator
);
match maybe_state {
MaybeNext::Some(Ok((nested, values, validity))) => {
Some(Ok((nested, finish(&self.data_type, values, validity))))
Some(finish(&self.data_type, values, validity).map(|array| (nested, array)))
}
MaybeNext::Some(Err(e)) => Some(Err(e)),
MaybeNext::None => None,
Expand Down
27 changes: 26 additions & 1 deletion tests/it/io/parquet/read.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::fs::File;

use arrow2::array::*;
use arrow2::error::Result;
use arrow2::error::*;
use arrow2::io::parquet::read::*;

use super::*;
Expand Down Expand Up @@ -463,3 +463,28 @@ fn all_types_chunked() -> Result<()> {

Ok(())
}

#[test]
fn invalid_utf8() {
    // A hand-crafted Parquet file whose utf8 column contains a byte sequence
    // that is not valid utf-8 (note the 0xe8 inside the value bytes).
    let invalid_data: &[u8] = &[
        0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x24, 0x15, 0x28, 0x2c, 0x15, 0x02, 0x15, 0x00,
        0x15, 0x06, 0x15, 0x08, 0x00, 0x00, 0x12, 0x44, 0x02, 0x00, 0x00, 0x00, 0x03, 0xff, 0x08,
        0x00, 0x00, 0x00, 0x67, 0x6f, 0x75, 0x67, 0xe8, 0x72, 0x65, 0x73, 0x15, 0x02, 0x19, 0x2c,
        0x48, 0x0d, 0x64, 0x75, 0x63, 0x6b, 0x64, 0x62, 0x5f, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61,
        0x15, 0x02, 0x00, 0x15, 0x0c, 0x25, 0x02, 0x18, 0x02, 0x63, 0x31, 0x15, 0x00, 0x15, 0x00,
        0x00, 0x16, 0x02, 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x00, 0x1c, 0x15, 0x0c, 0x19, 0x05, 0x19,
        0x18, 0x02, 0x63, 0x31, 0x15, 0x02, 0x16, 0x02, 0x16, 0x00, 0x16, 0x4a, 0x26, 0x08, 0x00,
        0x00, 0x16, 0x00, 0x16, 0x02, 0x26, 0x08, 0x00, 0x28, 0x06, 0x44, 0x75, 0x63, 0x6b, 0x44,
        0x42, 0x00, 0x51, 0x00, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31,
    ];

    // Reading must surface the invalid bytes as an error instead of panicking
    // or silently producing a corrupt array.
    let cursor = Cursor::new(invalid_data);
    let reader = FileReader::try_new(cursor, None, Some(5), None, None).unwrap();

    let collected: Result<Vec<_>> = reader.collect();
    let error = collected.unwrap_err();
    let message = error.to_string();
    assert!(
        message.contains("invalid utf-8"),
        "unexpected error: {}",
        message
    );
}

0 comments on commit 8330eea

Please sign in to comment.