From 8179193a0f1cc4602d05e201b4f1ae16142eec10 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 9 Jun 2021 14:09:45 -0400 Subject: [PATCH] Fix out of bounds read in bit chunk iterator (#416) (#432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix out of bounds read in bit chunk iterator * Add comment why reading one additional byte is enough Co-authored-by: Jörn Horstmann --- arrow/benches/boolean_kernels.rs | 19 +++++++++++++++ arrow/src/util/bit_chunk_iterator.rs | 36 +++++++++++++++++++++------- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/arrow/benches/boolean_kernels.rs b/arrow/benches/boolean_kernels.rs index 6559c4e4caf5..e9394f7a0231 100644 --- a/arrow/benches/boolean_kernels.rs +++ b/arrow/benches/boolean_kernels.rs @@ -45,6 +45,25 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2))); c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2))); c.bench_function("not", |b| b.iter(|| bench_not(&array1))); + + let array1_slice = array1.slice(1, size - 1); + let array1_slice = array1_slice + .as_any() + .downcast_ref::() + .unwrap(); + let array2_slice = array2.slice(1, size - 1); + let array2_slice = array2_slice + .as_any() + .downcast_ref::() + .unwrap(); + + c.bench_function("and_sliced", |b| { + b.iter(|| bench_and(&array1_slice, &array2_slice)) + }); + c.bench_function("or_sliced", |b| { + b.iter(|| bench_or(&array1_slice, &array2_slice)) + }); + c.bench_function("not_sliced", |b| b.iter(|| bench_not(&array1_slice))); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs index b9145b7af86f..ea9280cee3f1 100644 --- a/arrow/src/util/bit_chunk_iterator.rs +++ b/arrow/src/util/bit_chunk_iterator.rs @@ -35,10 +35,10 @@ impl<'a> BitChunks<'a> { let byte_offset = offset / 8; let bit_offset = offset % 8; - let chunk_bits = 8 * std::mem::size_of::(); - - let chunk_len = len / chunk_bits; - let remainder_len = len & (chunk_bits - 1); + // number of complete u64 chunks + let chunk_len = len / 64; + // number of remaining bits + let remainder_len = len % 64; BitChunks::<'a> { buffer: &buffer[byte_offset..], @@ -137,14 +137,18 @@ impl Iterator for BitChunkIterator<'_> { // so when reading as u64 on a big-endian machine, the bytes need to be swapped let current = unsafe { std::ptr::read_unaligned(raw_data.add(index)).to_le() }; - let combined = if self.bit_offset == 0 { + let bit_offset = self.bit_offset; + + let combined = if bit_offset == 0 { current } else { - let next = - unsafe { std::ptr::read_unaligned(raw_data.add(index + 1)).to_le() }; + // the constructor ensures that bit_offset is in 0..8 + // that means we need to read at most one additional byte to fill in the high bits + let next = unsafe { + std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64 + }; - current >> self.bit_offset - | (next & ((1 << self.bit_offset) - 1)) << (64 - self.bit_offset) + (current >> bit_offset) | (next << (64 - bit_offset)) }; self.index = index + 1; @@ -254,4 +258,18 @@ mod tests { bitchunks.remainder_bits() ); } + + #[test] + fn test_iter_remainder_out_of_bounds() { + // allocating a full page should trigger a fault when reading out of bounds + const ALLOC_SIZE: usize = 4 * 1024; + let input = vec![0xFF_u8; ALLOC_SIZE]; + + let buffer: Buffer = Buffer::from(input); + + let bitchunks = buffer.bit_chunks(57, ALLOC_SIZE * 8 - 57); + + assert_eq!(u64::MAX, bitchunks.iter().last().unwrap()); + assert_eq!(0x7F, bitchunks.remainder_bits()); + } }