forked from jorgecarleitao/arrow2
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improved parquet stats deserialization (jorgecarleitao#962)
* Fixed stats * Fixed error in decimal stats * Fixed struct stats
- Loading branch information
1 parent
e4f2354
commit f0b45eb
Showing
19 changed files
with
1,182 additions
and
729 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,113 +1,23 @@ | ||
use std::any::Any; | ||
use std::convert::TryFrom; | ||
|
||
use crate::datatypes::DataType; | ||
use parquet2::statistics::BinaryStatistics as ParquetByteArrayStatistics; | ||
|
||
use super::Statistics; | ||
use crate::error::{ArrowError, Result}; | ||
|
||
/// Statistics of a `Binary` or `LargeBinary` parquet column | ||
#[derive(Debug, Clone, PartialEq)] | ||
pub struct BinaryStatistics { | ||
/// Number of nulls, if present | ||
pub null_count: Option<i64>, | ||
/// Number of distinct values, if present | ||
pub distinct_count: Option<i64>, | ||
/// Minimum value as raw bytes, if present | ||
pub min_value: Option<Vec<u8>>, | ||
/// Maximum value as raw bytes, if present | ||
pub max_value: Option<Vec<u8>>, | ||
} | ||
|
||
// `Statistics` trait implementation for binary columns. | ||
impl Statistics for BinaryStatistics { | ||
// NOTE(review): always reports `DataType::Binary`, even though the struct is | ||
// documented as also representing `LargeBinary` - confirm this is intended. | ||
fn data_type(&self) -> &DataType { | ||
&DataType::Binary | ||
} | ||
|
||
// Exposes `self` as `&dyn Any` so callers can downcast to the concrete type. | ||
fn as_any(&self) -> &dyn Any { | ||
self | ||
} | ||
|
||
// Returns the stored `null_count` field unchanged. | ||
fn null_count(&self) -> Option<i64> { | ||
self.null_count | ||
} | ||
} | ||
|
||
// Infallible conversion from parquet2 byte-array statistics. | ||
impl From<&ParquetByteArrayStatistics> for BinaryStatistics { | ||
// Copies the two counts and clones the optional min/max values. | ||
fn from(stats: &ParquetByteArrayStatistics) -> Self { | ||
Self { | ||
null_count: stats.null_count, | ||
distinct_count: stats.distinct_count, | ||
min_value: stats.min_value.clone(), | ||
max_value: stats.max_value.clone(), | ||
} | ||
} | ||
} | ||
|
||
/// Statistics of a string parquet column | ||
#[derive(Debug, Clone, PartialEq)] | ||
pub struct Utf8Statistics { | ||
/// Number of nulls, if present | ||
pub null_count: Option<i64>, | ||
/// Number of distinct values, if present | ||
pub distinct_count: Option<i64>, | ||
/// Minimum value as an owned string, if present | ||
pub min_value: Option<String>, | ||
/// Maximum value as an owned string, if present | ||
pub max_value: Option<String>, | ||
} | ||
|
||
// `Statistics` trait implementation for string columns. | ||
impl Statistics for Utf8Statistics { | ||
// NOTE(review): always reports `DataType::Utf8`; the dispatcher in this file | ||
// also builds this type for `LargeUtf8` - confirm this is intended. | ||
fn data_type(&self) -> &DataType { | ||
&DataType::Utf8 | ||
} | ||
|
||
// Exposes `self` as `&dyn Any` so callers can downcast to the concrete type. | ||
fn as_any(&self) -> &dyn Any { | ||
self | ||
} | ||
|
||
// Returns the stored `null_count` field unchanged. | ||
fn null_count(&self) -> Option<i64> { | ||
self.null_count | ||
} | ||
} | ||
|
||
// Fallible conversion from parquet2 byte-array statistics: the min/max bytes | ||
// must pass UTF-8 validation before they are stored as `String`s. | ||
impl TryFrom<&ParquetByteArrayStatistics> for Utf8Statistics { | ||
type Error = ArrowError; | ||
|
||
// Copies the counts and validates each present min/max value with | ||
// `simdutf8::basic::from_utf8`; a validation failure is propagated by `?`. | ||
fn try_from(stats: &ParquetByteArrayStatistics) -> Result<Self> { | ||
Ok(Self { | ||
null_count: stats.null_count, | ||
distinct_count: stats.distinct_count, | ||
min_value: stats | ||
.min_value | ||
.as_ref() | ||
.map(|x| simdutf8::basic::from_utf8(x).map(|x| x.to_string())) | ||
// `transpose` turns Option<Result<_>> into Result<Option<_>> so `?` applies. | ||
.transpose()?, | ||
max_value: stats | ||
.max_value | ||
.as_ref() | ||
.map(|x| simdutf8::basic::from_utf8(x).map(|x| x.to_string())) | ||
.transpose()?, | ||
}) | ||
} | ||
} | ||
|
||
pub(super) fn statistics_from_byte_array( | ||
stats: &ParquetByteArrayStatistics, | ||
data_type: DataType, | ||
) -> Result<Box<dyn Statistics>> { | ||
use DataType::*; | ||
Ok(match data_type { | ||
Utf8 => Box::new(Utf8Statistics::try_from(stats)?), | ||
LargeUtf8 => Box::new(Utf8Statistics::try_from(stats)?), | ||
Binary => Box::new(BinaryStatistics::from(stats)), | ||
LargeBinary => Box::new(BinaryStatistics::from(stats)), | ||
other => { | ||
return Err(ArrowError::NotYetImplemented(format!( | ||
"Can't read {:?} from parquet", | ||
other | ||
))) | ||
} | ||
}) | ||
use crate::array::{MutableArray, MutableBinaryArray, Offset}; | ||
use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; | ||
|
||
use crate::error::Result; | ||
|
||
// Appends the min and max of one parquet statistics entry to `min` and `max`. | ||
// | ||
// `from` is the optional parquet statistics; when it is `None`, or a bound is | ||
// absent, `None` is pushed to the corresponding array. | ||
// | ||
// Panics (via `unwrap`) if `min`/`max` are not `MutableBinaryArray<O>` or if | ||
// `from` is not `BinaryStatistics`. As written, the function always returns `Ok(())`. | ||
pub(super) fn push<O: Offset>( | ||
from: Option<&dyn ParquetStatistics>, | ||
min: &mut dyn MutableArray, | ||
max: &mut dyn MutableArray, | ||
) -> Result<()> { | ||
// Recover the concrete array types from the trait objects. | ||
let min = min | ||
.as_mut_any() | ||
.downcast_mut::<MutableBinaryArray<O>>() | ||
.unwrap(); | ||
let max = max | ||
.as_mut_any() | ||
.downcast_mut::<MutableBinaryArray<O>>() | ||
.unwrap(); | ||
// Recover the concrete statistics type, if statistics were provided. | ||
let from = from.map(|s| s.as_any().downcast_ref::<BinaryStatistics>().unwrap()); | ||
min.push(from.and_then(|s| s.min_value.as_ref())); | ||
max.push(from.and_then(|s| s.max_value.as_ref())); | ||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,43 +1,23 @@ | ||
use crate::datatypes::DataType; | ||
use parquet2::statistics::BooleanStatistics as ParquetBooleanStatistics; | ||
use std::any::Any; | ||
use crate::array::{MutableArray, MutableBooleanArray}; | ||
use parquet2::statistics::{BooleanStatistics, Statistics as ParquetStatistics}; | ||
|
||
use super::Statistics; | ||
use crate::error::Result; | ||
|
||
/// Statistics of a boolean parquet column | ||
#[derive(Debug, Clone, PartialEq)] | ||
pub struct BooleanStatistics { | ||
/// Number of nulls, if present | ||
pub null_count: Option<i64>, | ||
/// Number of distinct values, if present | ||
pub distinct_count: Option<i64>, | ||
/// Minimum value, if present | ||
pub min_value: Option<bool>, | ||
/// Maximum value, if present | ||
pub max_value: Option<bool>, | ||
} | ||
|
||
// `Statistics` trait implementation for boolean columns. | ||
impl Statistics for BooleanStatistics { | ||
// Always reports `DataType::Boolean`. | ||
fn data_type(&self) -> &DataType { | ||
&DataType::Boolean | ||
} | ||
|
||
// Exposes `self` as `&dyn Any` so callers can downcast to the concrete type. | ||
fn as_any(&self) -> &dyn Any { | ||
self | ||
} | ||
|
||
// Returns the stored `null_count` field unchanged. | ||
fn null_count(&self) -> Option<i64> { | ||
self.null_count | ||
} | ||
} | ||
|
||
impl From<&ParquetBooleanStatistics> for BooleanStatistics { | ||
fn from(stats: &ParquetBooleanStatistics) -> Self { | ||
Self { | ||
null_count: stats.null_count, | ||
distinct_count: stats.distinct_count, | ||
min_value: stats.min_value, | ||
max_value: stats.max_value, | ||
} | ||
} | ||
// Appends the min and max of one parquet statistics entry to `min` and `max`. | ||
// | ||
// `from` is the optional parquet statistics; when it is `None`, or a bound is | ||
// absent, `None` is pushed to the corresponding array. | ||
// | ||
// Panics (via `unwrap`) if `min`/`max` are not `MutableBooleanArray` or if | ||
// `from` is not `BooleanStatistics`. As written, the function always returns `Ok(())`. | ||
pub(super) fn push( | ||
from: Option<&dyn ParquetStatistics>, | ||
min: &mut dyn MutableArray, | ||
max: &mut dyn MutableArray, | ||
) -> Result<()> { | ||
// Recover the concrete array types from the trait objects. | ||
let min = min | ||
.as_mut_any() | ||
.downcast_mut::<MutableBooleanArray>() | ||
.unwrap(); | ||
let max = max | ||
.as_mut_any() | ||
.downcast_mut::<MutableBooleanArray>() | ||
.unwrap(); | ||
// Recover the concrete statistics type, if statistics were provided. | ||
let from = from.map(|s| s.as_any().downcast_ref::<BooleanStatistics>().unwrap()); | ||
min.push(from.and_then(|s| s.min_value)); | ||
max.push(from.and_then(|s| s.max_value)); | ||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
use crate::array::*; | ||
use crate::datatypes::{DataType, PhysicalType}; | ||
use crate::error::Result; | ||
|
||
use super::make_mutable; | ||
|
||
/// A [`MutableArray`] for dictionary data types that wraps a dynamically-typed | ||
/// mutable array holding the dictionary's values. | ||
#[derive(Debug)] | ||
pub struct DynMutableDictionary { | ||
// The full `DataType::Dictionary(..)` this array reports. | ||
data_type: DataType, | ||
/// Mutable array of the dictionary's value type | ||
pub inner: Box<dyn MutableArray>, | ||
} | ||
|
||
impl DynMutableDictionary { | ||
/// Creates a new [`DynMutableDictionary`] whose inner array is built by | ||
/// `make_mutable` for the dictionary's value type with the given `capacity`. | ||
/// | ||
/// # Errors | ||
/// Propagates any error returned by `make_mutable`. | ||
/// | ||
/// # Panics | ||
/// Panics (`unreachable!`) if `data_type` is not `DataType::Dictionary`. | ||
pub fn try_with_capacity(data_type: DataType, capacity: usize) -> Result<Self> { | ||
// Extract the value type; callers are expected to pass a dictionary type. | ||
let inner = if let DataType::Dictionary(_, inner, _) = &data_type { | ||
inner.as_ref() | ||
} else { | ||
unreachable!() | ||
}; | ||
let inner = make_mutable(inner, capacity)?; | ||
|
||
Ok(Self { data_type, inner }) | ||
} | ||
} | ||
|
||
// `MutableArray` implementation; most methods delegate to the inner values array. | ||
impl MutableArray for DynMutableDictionary { | ||
// Returns the dictionary data type given at construction. | ||
fn data_type(&self) -> &DataType { | ||
&self.data_type | ||
} | ||
|
||
// Length of the inner values array. | ||
fn len(&self) -> usize { | ||
self.inner.len() | ||
} | ||
|
||
// Validity of the inner values array. | ||
fn validity(&self) -> Option<&crate::bitmap::MutableBitmap> { | ||
self.inner.validity() | ||
} | ||
|
||
// Builds a `DictionaryArray` whose keys are `Some(0), Some(1), ...` up to the | ||
// inner length and whose values are the inner array. | ||
// NOTE(review): `inner.len() as $T` is an unchecked cast and could truncate if | ||
// the length exceeds the key type's range - confirm callers keep it small. | ||
fn as_box(&mut self) -> Box<dyn Array> { | ||
let inner = self.inner.as_arc(); | ||
match self.data_type.to_physical_type() { | ||
PhysicalType::Dictionary(key) => match_integer_type!(key, |$T| { | ||
let keys = PrimitiveArray::<$T>::from_iter((0..inner.len() as $T).map(Some)); | ||
Box::new(DictionaryArray::<$T>::from_data(keys, inner)) | ||
}), | ||
// Any non-dictionary physical type is not handled and panics. | ||
_ => todo!(), | ||
} | ||
} | ||
|
||
// Exposes `self` as `&dyn Any` for downcasting. | ||
fn as_any(&self) -> &dyn std::any::Any { | ||
self | ||
} | ||
|
||
// Exposes `self` as `&mut dyn Any` for mutable downcasting. | ||
fn as_mut_any(&mut self) -> &mut dyn std::any::Any { | ||
self | ||
} | ||
|
||
// Not implemented: calling this panics via `todo!()`. | ||
fn push_null(&mut self) { | ||
todo!() | ||
} | ||
|
||
// Not implemented: calling this panics via `todo!()`. | ||
fn shrink_to_fit(&mut self) { | ||
todo!() | ||
} | ||
} |
Oops, something went wrong.