-
Notifications
You must be signed in to change notification settings - Fork 59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improvements for dictionary encodings and nested values #7
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,38 +3,75 @@ use super::{super::ceil8, HybridEncoded}; | |
|
||
/// An iterator that, given a slice of bytes, returns `HybridEncoded` | ||
pub struct Decoder<'a> { | ||
values: &'a [u8], | ||
num_bits: u32, | ||
inner: std::iter::Flatten<HybridIterator<'a>>, | ||
} | ||
|
||
impl<'a> Decoder<'a> { | ||
pub fn new(values: &'a [u8], num_bits: u32) -> Self { | ||
Self { values, num_bits } | ||
pub fn new(values: &'a [u8], num_bits: u32, length: usize) -> Self { | ||
Self { | ||
inner: HybridIterator { | ||
values, | ||
num_bits, | ||
remaining: length, | ||
} | ||
.flatten(), | ||
} | ||
} | ||
} | ||
|
||
impl<'a> Iterator for Decoder<'a> { | ||
type Item = HybridEncoded<'a>; | ||
type Item = u32; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
self.inner.next() | ||
} | ||
} | ||
|
||
pub struct HybridIterator<'a> { | ||
values: &'a [u8], | ||
num_bits: u32, | ||
remaining: usize, | ||
} | ||
|
||
impl<'a> Iterator for HybridIterator<'a> { | ||
type Item = HybridEncoded<'a>; | ||
|
||
fn next(&mut self) -> Option<HybridEncoded<'a>> { | ||
if self.values.is_empty() { | ||
return None; | ||
} | ||
let (indicator, consumed) = uleb128::decode(self.values); | ||
self.values = &self.values[consumed..]; | ||
if indicator & 1 == 1 { | ||
// is bitpacking | ||
let bytes = (indicator as usize >> 1) * self.num_bits as usize; | ||
let result = Some(HybridEncoded::Bitpacked(&self.values[..bytes])); | ||
self.values = &self.values[bytes..]; | ||
let num_bits = self.num_bits as usize; | ||
let num_bytes = (indicator as usize >> 1) * num_bits; | ||
let length = (indicator as usize >> 1) * 8; | ||
let run_length = std::cmp::min(length, self.remaining); | ||
let result = Some(HybridEncoded::Bitpacked { | ||
compressed: &self.values[..num_bytes], | ||
num_bits, | ||
run_length, | ||
}); | ||
self.remaining -= run_length; | ||
self.values = &self.values[num_bytes..]; | ||
result | ||
} else { | ||
// is rle | ||
let run_length = indicator as usize >> 1; | ||
// repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width) | ||
let rle_bytes = ceil8(self.num_bits as usize); | ||
let result = Some(HybridEncoded::Rle(&self.values[..rle_bytes], run_length)); | ||
|
||
let pack = &self.values[0..rle_bytes]; | ||
let mut value_bytes = [0u8; std::mem::size_of::<u32>()]; | ||
pack.iter() | ||
.enumerate() | ||
.for_each(|(i, byte)| value_bytes[i] = *byte); | ||
let value = u32::from_le_bytes(value_bytes); | ||
Comment on lines
+64
to
+70
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, the reason I do not want to push this here is that, in the arrow format, validity is in bytes as LSB, which is the same as parquet when rep level = 0. This particular change would require a round trip This is why I tried to offer the "raw" bytes here, and provide a higher-end API based on So, the idea was: if people want the raw stuff, use the decoder, else, use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That sounds like it should work, seems all users call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that I realized that arrow2 is using the lower level api I understand what you actually meant :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, we already have an iterator API, that returns either variant, RLE or Bitpacked. This is made so that arrow2 can use it without having to incur the cost of bit-unpacking the bitpacked to then compare an There will be a performance degradation if we move to iterators over i32, which is why I was trying to suggest to keep the encoder as is, and offer a higher-end iterator to iterate over it with |
||
|
||
let result = Some(HybridEncoded::Rle { value, run_length }); | ||
self.values = &self.values[rle_bytes..]; | ||
self.remaining -= run_length; | ||
result | ||
} | ||
} | ||
|
@@ -44,8 +81,6 @@ impl<'a> Iterator for Decoder<'a> { | |
mod tests { | ||
use super::*; | ||
|
||
use super::super::super::bitpacking; | ||
|
||
#[test] | ||
fn basics_1() { | ||
let bit_width = 1; | ||
|
@@ -54,43 +89,32 @@ mod tests { | |
2, 0, 0, 0, // length | ||
0b00000011, 0b00001011, // data | ||
]; | ||
let expected = vec![1, 1, 0, 1, 0]; | ||
|
||
let mut decoder = Decoder::new(&values[4..6], bit_width); | ||
let decoder = Decoder::new(&values[4..6], bit_width, length); | ||
|
||
let run = decoder.next().unwrap(); | ||
let result = decoder.collect::<Vec<_>>(); | ||
|
||
if let HybridEncoded::Bitpacked(values) = run { | ||
assert_eq!(values, &[0b00001011]); | ||
let result = | ||
bitpacking::Decoder::new(values, bit_width as u8, length).collect::<Vec<_>>(); | ||
assert_eq!(result, &[1, 1, 0, 1, 0]); | ||
} else { | ||
panic!() | ||
}; | ||
assert_eq!(result, expected); | ||
} | ||
|
||
#[test] | ||
fn basics_2() { | ||
// This test was validated by the result of what pyarrow3 outputs when | ||
// the bitmap is used. | ||
let bit_width = 1; | ||
let length = 10; | ||
let values = vec![ | ||
3, 0, 0, 0, // length | ||
0b00000101, 0b11101011, 0b00000010, // data | ||
]; | ||
let expected = &[1, 1, 0, 1, 0, 1, 1, 1, 0, 1]; | ||
let expected = vec![1, 1, 0, 1, 0, 1, 1, 1, 0, 1]; | ||
|
||
let mut decoder = Decoder::new(&values[4..4 + 3], bit_width); | ||
let decoder = Decoder::new(&values[4..4 + 3], bit_width, length); | ||
|
||
let run = decoder.next().unwrap(); | ||
let result = decoder.collect::<Vec<_>>(); | ||
|
||
if let HybridEncoded::Bitpacked(values) = run { | ||
assert_eq!(values, &[0b11101011, 0b00000010]); | ||
let result = bitpacking::Decoder::new(values, bit_width as u8, 10).collect::<Vec<_>>(); | ||
assert_eq!(result, expected); | ||
} else { | ||
panic!() | ||
}; | ||
assert_eq!(result, expected); | ||
} | ||
|
||
#[test] | ||
|
@@ -102,16 +126,30 @@ mod tests { | |
0b00010000, // data | ||
0b00000001, | ||
]; | ||
let expected = vec![1_u32; length]; | ||
|
||
let mut decoder = Decoder::new(&values[4..4 + 2], bit_width); | ||
let decoder = Decoder::new(&values[4..4 + 2], bit_width, length); | ||
|
||
let run = decoder.next().unwrap(); | ||
let result = decoder.collect::<Vec<_>>(); | ||
|
||
if let HybridEncoded::Rle(values, items) = run { | ||
assert_eq!(values, &[0b00000001]); | ||
assert_eq!(items, length); | ||
} else { | ||
panic!() | ||
}; | ||
assert_eq!(result, expected); | ||
} | ||
|
||
#[test] | ||
fn rle_and_bit_packed() { | ||
let bit_width = 1; | ||
let length = 8; | ||
let values = vec![ | ||
4, 0, 0, 0, // length | ||
0b00001000, // data | ||
0b00000001, 0b00000011, 0b00001010, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These values are according to my understanding of the encoding, I did not yet validate them |
||
]; | ||
let expected = vec![1, 1, 1, 1, 0, 1, 0, 1]; | ||
|
||
let decoder = Decoder::new(&values[4..4 + 4], bit_width, length); | ||
|
||
let result = decoder.collect::<Vec<_>>(); | ||
|
||
assert_eq!(result, expected); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
use crate::encoding::get_length; | ||
|
||
/// Decodes according to [Plain strings](https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0), | ||
/// prefixes, lengths and values | ||
/// # Implementation | ||
/// This struct does not allocate on the heap. | ||
#[derive(Debug)] | ||
pub struct Decoder<'a> { | ||
values: &'a [u8], | ||
index: usize, | ||
} | ||
|
||
impl<'a> Decoder<'a> { | ||
pub fn new(values: &'a [u8]) -> Self { | ||
Self { values, index: 0 } | ||
} | ||
} | ||
|
||
impl<'a> Iterator for Decoder<'a> { | ||
type Item = &'a [u8]; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
let values = self.values; | ||
let index = self.index; | ||
if index + 4 < values.len() { | ||
let next_len = get_length(values) as usize; | ||
let next_index = index + 4 + next_len; | ||
|
||
let result = Some(&values[index + 4..next_index]); | ||
self.index = next_index; | ||
|
||
result | ||
} else { | ||
None | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
mod decoder; | ||
|
||
pub use decoder::Decoder; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These changes mostly correspond to what was previously in
rle_decode
inlevels.rs