Skip to content

Commit

Permalink
feat: hash streaming for v2 (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
thevilledev authored Dec 2, 2024
1 parent c479d29 commit df5bf6d
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 3 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

- Added `HashMap` and `HashSet` implementations to the `v2` version

- Added streaming hashing to the `v2` version

## [v0.4.0] - 2024-11-30

- Added `v2` version of the algorithm, available by importing `chibihash::v2::*`. Note that `v2` is missing `StreamingChibiHasher`.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The `v2` version will be the default in the next major version.
- Multiple ways to use ChibiHash:
1. **Direct Hashing**: One-shot hashing using `chibi_hash64()`
2. **Simple Hasher**: Basic implementation using `ChibiHasher` (implements `std::hash::Hasher`)
3. **Streaming Hasher**: Memory-efficient streaming with `StreamingChibiHasher` (implements `std::hash::Hasher`) - currently only available in `v1`
3. **Streaming Hasher**: Memory-efficient streaming with `StreamingChibiHasher` (implements `std::hash::Hasher`)
4. **BuildHasher**: `ChibiHasher` implements `BuildHasher`. This allows using ChibiHash as the default hasher for `std::collections::HashMap` and `std::collections::HashSet`. Use `ChibiHashMap` and `ChibiHashSet` types.

## Example
Expand Down
181 changes: 180 additions & 1 deletion src/v2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
//!
//! Basic usage:
//! ```rust
//! use chibihash::v2::{chibi_hash64, ChibiHasher, ChibiHashMap, ChibiHashSet};
//! use chibihash::v2::{chibi_hash64, ChibiHasher, ChibiHashMap, ChibiHashSet, StreamingChibiHasher};
//! use std::hash::Hasher;
//!
//! // Direct hashing
Expand All @@ -32,6 +32,12 @@
//! hasher.write(key);
//! println!("{:016x}", hasher.finish());
//!
//! // Streaming hashing
//! let mut hasher1 = StreamingChibiHasher::new(0);
//! hasher1.update(b"Hello, ");
//! hasher1.update(b"World!");
//! println!("{:016x}", hasher1.finalize());
//!
//! // Using BuildHasher as HashMap
//! let mut map: ChibiHashMap<String, i32> = ChibiHashMap::default();
//! map.insert("hello".to_string(), 42);
Expand Down Expand Up @@ -187,6 +193,137 @@ pub type ChibiHashMap<K, V> = BaseHashMap<K, V, ChibiHasher>;
/// A HashSet that uses ChibiHash by default
pub type ChibiHashSet<T> = BaseHashSet<T, ChibiHasher>;

/// Streaming ChibiHasher that processes data incrementally
///
/// Input is supplied in arbitrarily sized pieces through `update`, and the
/// 64-bit digest of everything supplied so far is read with `finalize`.
///
/// NOTE(review): the derived `PartialEq`/`Eq`/`Hash` compare every byte of
/// `buf`, including bytes at or beyond `buf_len` — confirm that this is the
/// intended notion of equality for two hashers.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct StreamingChibiHasher {
/// Four 64-bit state lanes; initialised from the seed in `new` and mixed with
/// each complete 32-byte block in `update`.
h: [u64; 4],
/// Running sum of the lengths of all slices passed to `update`; folded into
/// the digest by `finalize`.
total_len: u64,
/// Seed given to `new`; kept because `finalize` adds it in again.
seed: u64,
/// Holding area for input bytes that have not yet been mixed into `h`.
buf: [u8; 32],
/// Number of bytes at the front of `buf` that are pending input.
buf_len: usize,
}

impl StreamingChibiHasher {
    /// Number of bytes mixed into the lanes per round; equals the capacity of `buf`.
    const BLOCK_LEN: usize = 32;

    /// Creates a hasher for `seed` that has not consumed any input yet.
    ///
    /// The four lanes are derived from `seed` and the module constant `K`; the
    /// pending-byte buffer starts zeroed and empty, and the running length is 0.
    #[inline(always)]
    pub const fn new(seed: u64) -> Self {
        let seed2 = seed
            .wrapping_sub(K)
            .rotate_left(15)
            .wrapping_add(seed.wrapping_sub(K).rotate_left(47));

        Self {
            h: [
                seed,
                seed.wrapping_add(K),
                seed2,
                seed2.wrapping_add(K.wrapping_mul(K) ^ K),
            ],
            buf: [0; Self::BLOCK_LEN],
            buf_len: 0,
            total_len: 0,
            seed,
        }
    }

    /// Folds one full block into the four lanes.
    ///
    /// `block` must hold at least 32 bytes; only the first 32 are read, as four
    /// little-endian 64-bit stripes. Each stripe updates its own lane and then
    /// the next lane (wrapping from lane 3 back to lane 0), in that order.
    #[inline]
    fn mix_block(h: &mut [u64; 4], block: &[u8]) {
        for i in 0..4 {
            let stripe = load_u64_le(&block[i * 8..]);
            h[i] = stripe.wrapping_add(h[i]).wrapping_mul(K);
            h[(i + 1) & 3] = h[(i + 1) & 3].wrapping_add(stripe.rotate_left(27));
        }
    }

    /// Feeds `input` into the hasher.
    ///
    /// May be called any number of times with slices of any length, including
    /// empty ones. Complete 32-byte blocks are mixed immediately; a trailing
    /// partial block is kept in the internal buffer until later input completes
    /// it or `finalize` consumes it.
    pub fn update(&mut self, input: &[u8]) {
        let mut rest = input;

        // Top up a partially filled buffer first so that block boundaries stay
        // aligned to the start of the stream rather than to this call's slice.
        if self.buf_len > 0 {
            let take = rest.len().min(Self::BLOCK_LEN - self.buf_len);
            let (head, remaining) = rest.split_at(take);
            self.buf[self.buf_len..self.buf_len + take].copy_from_slice(head);
            self.buf_len += take;
            rest = remaining;

            if self.buf_len == Self::BLOCK_LEN {
                Self::mix_block(&mut self.h, &self.buf);
                // Clear the consumed bytes so the stored state does not depend
                // on how the input was chunked; otherwise the derived
                // `PartialEq`/`Hash` would see leftovers of an old block.
                self.buf = [0; Self::BLOCK_LEN];
                self.buf_len = 0;
            }
        }

        // Mix whole blocks straight from the caller's slice, without copying.
        let mut blocks = rest.chunks_exact(Self::BLOCK_LEN);
        for block in &mut blocks {
            Self::mix_block(&mut self.h, block);
        }

        // Whatever is left is shorter than a block. It is only non-empty when
        // the buffer is empty at this point, so it always fits.
        let leftover = blocks.remainder();
        self.buf[self.buf_len..self.buf_len + leftover.len()].copy_from_slice(leftover);
        self.buf_len += leftover.len();

        // Every other step of the hash wraps, so the length counter wraps too
        // instead of panicking on overflow in debug builds.
        self.total_len = self.total_len.wrapping_add(input.len() as u64);
    }

    /// Returns the 64-bit hash of all bytes passed to `update` so far.
    ///
    /// Works on a copy of the lanes and only reads the buffer, so the hasher is
    /// left untouched: calling it again yields the same value, and further
    /// `update` calls continue the same stream.
    pub fn finalize(&self) -> u64 {
        let mut h = self.h;
        let pending = &self.buf[..self.buf_len];

        // Pending bytes are consumed 8 at a time as two 32-bit words.
        let mut words = pending.chunks_exact(8);
        for word in &mut words {
            h[0] = (h[0] ^ load_u32_le(&word[0..])).wrapping_mul(K);
            h[1] = (h[1] ^ load_u32_le(&word[4..])).wrapping_mul(K);
        }

        // Final 0..=7 bytes: two (possibly overlapping) 32-bit reads when at
        // least four remain, otherwise the first, middle and last byte.
        let tail = words.remainder();
        let l = tail.len();
        if l >= 4 {
            h[2] ^= load_u32_le(&tail[0..]);
            h[3] ^= load_u32_le(&tail[l - 4..]);
        } else if l > 0 {
            h[2] ^= u64::from(tail[0]);
            h[3] ^= u64::from(tail[l / 2]) | (u64::from(tail[l - 1]) << 8);
        }

        // Collapse the four lanes into h[1].
        h[0] = h[0].wrapping_add(h[2].wrapping_mul(K).rotate_left(31) ^ (h[2] >> 31));
        h[1] = h[1].wrapping_add(h[3].wrapping_mul(K).rotate_left(31) ^ (h[3] >> 31));
        h[0] = h[0].wrapping_mul(K);
        h[0] ^= h[0] >> 31;
        h[1] = h[1].wrapping_add(h[0]);

        // Blend in the total length and the seed, then run the final mix.
        let mut x = self.total_len.wrapping_mul(K);
        x ^= x.rotate_left(29);
        x = x.wrapping_add(self.seed);
        x ^= h[1];

        x ^= x.rotate_left(15) ^ x.rotate_left(42);
        x = x.wrapping_mul(K);
        x ^= x.rotate_left(13) ^ x.rotate_left(31);

        x
    }
}

impl Hasher for StreamingChibiHasher {
fn finish(&self) -> u64 {
self.finalize()
}

fn write(&mut self, bytes: &[u8]) {
self.update(bytes);
}
}

#[inline(always)]
fn load_u32_le(bytes: &[u8]) -> u64 {
u32::from_le_bytes(bytes[..4].try_into().unwrap()) as u64
Expand Down Expand Up @@ -248,4 +385,46 @@ mod tests {
set.insert("hello".to_string());
assert!(set.contains("hello"));
}

/// Checks that `StreamingChibiHasher` reproduces `chibi_hash64` for the
/// reference vectors, regardless of how the input is chunked across `update`
/// calls. The 32- and 35-byte inputs make the split cases cross the internal
/// 32-byte buffer boundary, which a single short split never reaches.
#[test]
fn test_streaming_matches_direct() {
    // (input, seed, expected hash)
    let test_cases = [
        ("", 55555, 0x58AEE94CA9FB5092),
        ("", 0, 0xD4F69E3ECCF128FC),
        ("hi", 0, 0x92C85CA994367DAC),
        ("123", 0, 0x788A224711FF6E25),
        ("abcdefgh", 0, 0xA2E39BE0A0689B32),
        ("Hello, world!", 0, 0xABF8EB3100B2FEC7),
        ("qwertyuiopasdfghjklzxcvbnm123456", 0, 0x90FC5DB7F56967FA),
        ("qwertyuiopasdfghjklzxcvbnm123456789", 0, 0x6DCDCE02882A4975),
    ];

    for (input, seed, expected) in test_cases {
        let input_bytes = input.as_bytes();
        let direct = chibi_hash64(input_bytes, seed);
        assert_eq!(direct, expected, "Direct hash mismatch");

        // Whole input in a single update.
        let mut streaming = StreamingChibiHasher::new(seed);
        streaming.update(input_bytes);
        let streaming_result = streaming.finalize();

        assert_eq!(
            streaming_result, expected,
            "Streaming hash mismatch for input: {:?}, seed: {}, got: {:016X}, expected: {:016X}",
            input, seed, streaming_result, expected
        );

        // Every two-way split, including the empty head and the empty tail.
        for split in 0..=input_bytes.len() {
            let (head, tail) = input_bytes.split_at(split);
            let mut streaming = StreamingChibiHasher::new(seed);
            streaming.update(head);
            streaming.update(tail);
            let split_result = streaming.finalize();

            assert_eq!(
                split_result, expected,
                "Split streaming mismatch for input: {:?}, seed: {}, split at: {}, got: {:016X}, expected: {:016X}",
                input, seed, split, split_result, expected
            );
        }

        // One byte per update: the buffer is filled and flushed incrementally.
        let mut streaming = StreamingChibiHasher::new(seed);
        for chunk in input_bytes.chunks(1) {
            streaming.update(chunk);
        }
        let bytewise_result = streaming.finalize();

        assert_eq!(
            bytewise_result, expected,
            "Byte-wise streaming mismatch for input: {:?}, seed: {}, got: {:016X}, expected: {:016X}",
            input, seed, bytewise_result, expected
        );
    }

    // Test split streaming
    let (seed, expected) = (0, 0xABF8EB3100B2FEC7);
    let mut streaming = StreamingChibiHasher::new(seed);
    streaming.update(b"Hello, ");
    streaming.update(b"world!");
    assert_eq!(
        streaming.finalize(),
        expected,
        "Split streaming should match expected hash"
    );
}
}

0 comments on commit df5bf6d

Please sign in to comment.