forked from jorgecarleitao/parquet2
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added bloom filter (jorgecarleitao#99)
- Loading branch information
1 parent
8de1c3f
commit 2dfce10
Showing
8 changed files
with
272 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
use xxhash_rust::xxh64::xxh64; | ||
|
||
use crate::types::NativeType; | ||
|
||
// xxh64 seed; fixed to 0 as mandated by the parquet bloom filter specification.
const SEED: u64 = 0;
|
||
/// (xxh64) hash of a [`NativeType`]. | ||
#[inline] | ||
pub fn hash_native<T: NativeType>(value: T) -> u64 { | ||
xxh64(value.to_le_bytes().as_ref(), SEED) | ||
} | ||
|
||
/// (xxh64) hash of a sequence of bytes (e.g. ByteArray). | ||
#[inline] | ||
pub fn hash_byte<A: AsRef<[u8]>>(value: A) -> u64 { | ||
xxh64(value.as_ref(), SEED) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
//! API to read and use bloom filters
mod hash;
mod read;
mod split_block;

// hashing of native values and byte sequences (xxh64, seed 0)
pub use hash::{hash_byte, hash_native};
// reading a filter's bitset out of a parquet file
pub use read::read;
// membership operations on a split-block bloom filter bitset
pub use split_block::{insert, is_in_set};
|
||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn basics() {
        let mut bitset = vec![0; 32];

        // populate the filter with the hashes of 0..10
        (0..10i64).for_each(|value| insert(&mut bitset, hash_native(value)));

        // bloom filter produced by parquet-mr/spark for a column of i64 (0..=10)
        /*
        import pyspark.sql // 3.2.1
        spark = pyspark.sql.SparkSession.builder.getOrCreate()
        spark.conf.set("parquet.bloom.filter.enabled", True)
        spark.conf.set("parquet.bloom.filter.expected.ndv", 10)
        spark.conf.set("parquet.bloom.filter.max.bytes", 32)
        data = [(i % 10,) for i in range(100)]
        df = spark.createDataFrame(data, ["id"]).repartition(1)
        df.write.parquet("bla.parquet", mode = "overwrite")
        */
        let expected: &[u8] = &[
            24, 130, 24, 8, 134, 8, 68, 6, 2, 101, 128, 10, 64, 2, 38, 78, 114, 1, 64, 38, 1, 192,
            194, 152, 64, 70, 0, 36, 56, 121, 64, 0,
        ];
        assert_eq!(bitset, expected);

        // every inserted value is reported present; 10 (never inserted) is not
        for value in 0..11i64 {
            let found = is_in_set(&bitset, hash_native(value));
            assert_eq!(value < 10, found);
        }
    }

    #[test]
    fn binary() {
        let mut bitset = vec![0; 32];

        // populate the filter with the hashes of the strings "a0".."a9"
        for value in 0..10i64 {
            let item = format!("a{}", value);
            insert(&mut bitset, hash_byte(item));
        }

        // bloom filter produced by parquet-mr/spark for a string column f"a{i}" for i in 0..10
        let expected: &[u8] = &[
            200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5,
            99, 65, 2, 0, 224, 44, 64, 78, 96, 4,
        ];
        assert_eq!(bitset, expected);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
use std::io::{Read, Seek, SeekFrom}; | ||
|
||
use parquet_format_async_temp::{ | ||
thrift::protocol::TCompactInputProtocol, BloomFilterAlgorithm, BloomFilterCompression, | ||
BloomFilterHeader, SplitBlockAlgorithm, Uncompressed, | ||
}; | ||
|
||
use crate::{error::ParquetError, metadata::ColumnChunkMetaData}; | ||
|
||
/// Reads the bloom filter associated to [`ColumnChunkMetaData`] into `bitset`.
/// Results in an empty `bitset` if there is no associated bloom filter or the algorithm is not supported.
/// # Error
/// Errors if the column contains no metadata or the filter can't be read or deserialized.
pub fn read<R: Read + Seek>(
    column_metadata: &ColumnChunkMetaData,
    mut reader: &mut R,
    bitset: &mut Vec<u8>,
) -> Result<(), ParquetError> {
    // location of the serialized filter in the file, if one was written
    let offset = column_metadata
        .metadata()
        .ok_or_else(|| ParquetError::OutOfSpec("Column metadata is required".to_string()))?
        .bloom_filter_offset;

    let offset = if let Some(offset) = offset {
        offset as u64
    } else {
        // no bloom filter for this column chunk: signal it with an empty bitset
        bitset.clear();
        return Ok(());
    };
    reader.seek(SeekFrom::Start(offset))?;

    // deserialize header
    let mut prot = TCompactInputProtocol::new(&mut reader);
    let header = BloomFilterHeader::read_from_in_protocol(&mut prot)?;

    // unsupported algorithm: treat as "no filter" rather than erroring
    if header.algorithm != BloomFilterAlgorithm::BLOCK(SplitBlockAlgorithm {}) {
        bitset.clear();
        return Ok(());
    }
    // only uncompressed bitsets are supported; skip anything else
    if header.compression != BloomFilterCompression::UNCOMPRESSED(Uncompressed {}) {
        bitset.clear();
        return Ok(());
    }
    // read bitset
    // size the buffer to exactly `num_bytes`, reusing the existing allocation
    // when it is large enough
    if header.num_bytes as usize > bitset.capacity() {
        *bitset = vec![0; header.num_bytes as usize]
    } else {
        bitset.clear();
        bitset.resize(header.num_bytes as usize, 0); // populate with zeros
    }

    reader.read_exact(bitset)?;
    Ok(())
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
use std::convert::TryInto; | ||
|
||
/// magic numbers taken from https://github.com/apache/parquet-format/blob/master/BloomFilter.md
const SALT: [u32; 8] = [
    1203114875, 1150766481, 2284105051, 2729912477, 1884591559, 770785867, 2667333959, 1550580529,
];

/// Maps a 64-bit hash onto the index of one 32-byte block of a `len`-byte bitset.
fn hash_to_block_index(hash: u64, len: usize) -> usize {
    let number_of_blocks = len as u64 / 32;
    // multiply-shift trick from the spec: scale the top 32 bits of the hash by
    // the block count and keep the product's upper half, which maps the hash
    // uniformly onto 0..number_of_blocks
    let top_bits = hash >> 32;
    let block_index = ((top_bits * number_of_blocks) >> 32) as u32;
    block_index as usize
}

/// Builds the 8-lane mask of a key: lane `i` has exactly one bit set, chosen by
/// the top 5 bits of `x * SALT[i]` (wrapping), i.e. a shift amount in 0..32.
fn new_mask(x: u32) -> [u32; 8] {
    let mut masks = [0u32; 8];
    for (lane, salt) in masks.iter_mut().zip(SALT.iter()) {
        *lane = 0x1 << (x.wrapping_mul(*salt) >> 27);
    }
    masks
}
|
||
/// loads a block (eight little-endian u32 words) from the bitset to the stack
#[inline]
fn load_block(bitset: &[u8]) -> [u32; 8] {
    let mut words = [0u32; 8];
    for (word, bytes) in words.iter_mut().zip(bitset.chunks_exact(4)) {
        *word = u32::from_le_bytes(bytes.try_into().unwrap());
    }
    words
}
|
||
/// assigns a block from the stack to `bitset`, serializing each u32 word as
/// four little-endian bytes
#[inline]
fn unload_block(block: [u32; 8], bitset: &mut [u8]) {
    let chunks = bitset.chunks_exact_mut(4).take(8);
    for (word, chunk) in block.iter().zip(chunks) {
        // chunks_exact_mut(4) guarantees `chunk.len() == 4`, so this
        // copy_from_slice cannot panic and compiles to a 4-byte memcpy
        chunk.copy_from_slice(&word.to_le_bytes());
    }
}
|
||
/// Returns whether the `hash` is in the set | ||
pub fn is_in_set(bitset: &[u8], hash: u64) -> bool { | ||
let block_index = hash_to_block_index(hash, bitset.len()); | ||
let key = hash as u32; | ||
|
||
let mask = new_mask(key); | ||
let slice = &bitset[block_index * 32..(block_index + 1) * 32]; | ||
let block_mask = load_block(slice); | ||
|
||
for i in 0..8 { | ||
if mask[i] & block_mask[i] == 0 { | ||
return false; | ||
} | ||
} | ||
true | ||
} | ||
|
||
/// Inserts a new hash to the set | ||
pub fn insert(bitset: &mut [u8], hash: u64) { | ||
let block_index = hash_to_block_index(hash, bitset.len()); | ||
let key = hash as u32; | ||
|
||
let mask = new_mask(key); | ||
let slice = &bitset[block_index * 32..(block_index + 1) * 32]; | ||
let mut block_mask = load_block(slice); | ||
|
||
for i in 0..8 { | ||
block_mask[i] |= mask[i]; | ||
|
||
let mut_slice = &mut bitset[block_index * 32..(block_index + 1) * 32]; | ||
unload_block(block_mask, mut_slice) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters