Skip to content

Commit

Permalink
Fix parquet string statistics generation (#643)
Browse files Browse the repository at this point in the history
* Fix string statistics generation, add tests

* fix Int96 stats test

* Add notes for additional tickets
  • Loading branch information
alamb authored Aug 8, 2021
1 parent b682ef5 commit 4618ef5
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 17 deletions.
122 changes: 122 additions & 0 deletions parquet/src/column/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1687,6 +1687,128 @@ mod tests {
);
}

#[test]
fn test_bool_statistics() {
let stats = statistics_roundtrip::<BoolType>(&[true, false, false, true]);
assert!(stats.has_min_max_set());
// should it be BooleanStatistics??
// https://github.com/apache/arrow-rs/issues/659
if let Statistics::Int32(stats) = stats {
assert_eq!(stats.min(), &0);
assert_eq!(stats.max(), &1);
} else {
panic!("expecting Statistics::Int32, got {:?}", stats);
}
}

#[test]
fn test_int32_statistics() {
let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
assert!(stats.has_min_max_set());
if let Statistics::Int32(stats) = stats {
assert_eq!(stats.min(), &-2);
assert_eq!(stats.max(), &3);
} else {
panic!("expecting Statistics::Int32, got {:?}", stats);
}
}

#[test]
fn test_int64_statistics() {
let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
assert!(stats.has_min_max_set());
if let Statistics::Int64(stats) = stats {
assert_eq!(stats.min(), &-2);
assert_eq!(stats.max(), &3);
} else {
panic!("expecting Statistics::Int64, got {:?}", stats);
}
}

#[test]
fn test_int96_statistics() {
let input = vec![
Int96::from(vec![1, 20, 30]),
Int96::from(vec![3, 20, 10]),
Int96::from(vec![0, 20, 30]),
Int96::from(vec![2, 20, 30]),
]
.into_iter()
.collect::<Vec<Int96>>();

let stats = statistics_roundtrip::<Int96Type>(&input);
assert!(stats.has_min_max_set());
if let Statistics::Int96(stats) = stats {
assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30]));
assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10]));
} else {
panic!("expecting Statistics::Int96, got {:?}", stats);
}
}

#[test]
fn test_float_statistics() {
let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]);
assert!(stats.has_min_max_set());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &3.0);
} else {
panic!("expecting Statistics::Float, got {:?}", stats);
}
}

#[test]
fn test_double_statistics() {
let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0, 2.0]);
assert!(stats.has_min_max_set());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &3.0);
} else {
panic!("expecting Statistics::Double, got {:?}", stats);
}
}

#[test]
fn test_byte_array_statistics() {
let input = vec!["aawaa", "zz", "aaw", "m", "qrs"]
.iter()
.map(|&s| s.into())
.collect::<Vec<ByteArray>>();

let stats = statistics_roundtrip::<ByteArrayType>(&input);
assert!(stats.has_min_max_set());
if let Statistics::ByteArray(stats) = stats {
assert_eq!(stats.min(), &ByteArray::from("aaw"));
assert_eq!(stats.max(), &ByteArray::from("zz"));
} else {
panic!("expecting Statistics::ByteArray, got {:?}", stats);
}
}

#[test]
fn test_fixed_len_byte_array_statistics() {
let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "]
.iter()
.map(|&s| {
let b: ByteArray = s.into();
b.into()
})
.collect::<Vec<FixedLenByteArray>>();

let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input);
assert!(stats.has_min_max_set());
// should it be FixedLenByteArray?
// https://github.com/apache/arrow-rs/issues/660
if let Statistics::ByteArray(stats) = stats {
assert_eq!(stats.min(), &ByteArray::from("aaw "));
assert_eq!(stats.max(), &ByteArray::from("zz "));
} else {
panic!("expecting Statistics::ByteArray, got {:?}", stats);
}
}

#[test]
fn test_float_statistics_nan_middle() {
let stats = statistics_roundtrip::<FloatType>(&[1.0, f32::NAN, 2.0]);
Expand Down
29 changes: 12 additions & 17 deletions parquet/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray {

impl PartialOrd for ByteArray {
fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> {
if self.data.is_some() && other.data.is_some() {
match self.len().cmp(&other.len()) {
Ordering::Greater => Some(Ordering::Greater),
Ordering::Less => Some(Ordering::Less),
Ordering::Equal => {
for (v1, v2) in self.data().iter().zip(other.data().iter()) {
match v1.cmp(v2) {
Ordering::Greater => return Some(Ordering::Greater),
Ordering::Less => return Some(Ordering::Less),
_ => {}
}
}
Some(Ordering::Equal)
}
// sort nulls first (consistent with PartialCmp on Option)
//
// Since ByteBuffer doesn't implement PartialOrd, so can't
// derive an implementation
match (&self.data, &other.data) {
(None, None) => Some(Ordering::Equal),
(None, Some(_)) => Some(Ordering::Less),
(Some(_), None) => Some(Ordering::Greater),
(Some(self_data), Some(other_data)) => {
// compare slices directly
self_data.data().partial_cmp(other_data.data())
}
} else {
None
}
}
}
Expand Down Expand Up @@ -1368,7 +1363,7 @@ mod tests {
let ba4 = ByteArray::from(vec![]);
let ba5 = ByteArray::from(vec![2, 2, 3]);

assert!(ba1 > ba2);
assert!(ba1 < ba2);
assert!(ba3 > ba1);
assert!(ba1 > ba4);
assert_eq!(ba1, ba11);
Expand Down

0 comments on commit 4618ef5

Please sign in to comment.