From ee237968686b1b444755b9227b8df438655bf7f5 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Tue, 14 Sep 2021 09:17:15 +0100 Subject: [PATCH] Improved usages of unsafe and added tests for its invariants. (#403) --- src/array/binary/mod.rs | 12 +++--- src/array/fixed_size_binary/mod.rs | 15 ++++--- src/array/utf8/mod.rs | 15 +++---- tests/it/array/binary/mod.rs | 64 ++++++++++++++++++++++++++++++ tests/it/array/utf8/mod.rs | 39 +++++++++++++++++- 5 files changed, 123 insertions(+), 22 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 607e7bd3eae..eea1fdfcad4 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -95,12 +95,12 @@ impl BinaryArray { } } - /// Sets the validity bitmap on this [`BinaryArray`]. + /// Clones this [`BinaryArray`] with a different validity. /// # Panic - /// This function panics iff `validity.len() != self.len()`. + /// Panics iff `validity.len() != self.len()`. pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { - panic!("validity should be as least as large as the array") + panic!("validity's length must be equal to the array's length") } let mut arr = self.clone(); arr.validity = validity; @@ -112,7 +112,7 @@ impl BinaryArray { impl BinaryArray { /// Returns the element at index `i` /// # Panics - /// iff `i > self.len()` + /// iff `i >= self.len()` pub fn value(&self, i: usize) -> &[u8] { let offsets = self.offsets.as_slice(); let offset = offsets[i]; @@ -127,8 +127,8 @@ impl BinaryArray { /// # Safety /// Assumes that the `i < self.len`. pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let offset = *self.offsets.as_ptr().add(i); - let offset_1 = *self.offsets.as_ptr().add(i + 1); + let offset = *self.offsets.get_unchecked(i); + let offset_1 = *self.offsets.get_unchecked(i + 1); let length = (offset_1 - offset).to_usize(); let offset = offset.to_usize(); diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index 1bd31e6fef6..7d7d8a58506 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -10,7 +10,7 @@ pub use mutable::*; /// Cloning and slicing this struct is `O(1)`. #[derive(Debug, Clone)] pub struct FixedSizeBinaryArray { - size: i32, // this is redundant with `data_type`, but useful to not have to deconstruct the data_type. + size: usize, // this is redundant with `data_type`, but useful to not have to deconstruct the data_type. data_type: DataType, values: Buffer, validity: Option, @@ -34,9 +34,9 @@ impl FixedSizeBinaryArray { /// Returns a new [`FixedSizeBinaryArray`]. pub fn from_data(data_type: DataType, values: Buffer, validity: Option) -> Self { - let size = *Self::get_size(&data_type); + let size = *Self::get_size(&data_type) as usize; - assert_eq!(values.len() % (size as usize), 0); + assert_eq!(values.len() % size, 0); Self { size, @@ -83,15 +83,14 @@ impl FixedSizeBinaryArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - std::slice::from_raw_parts( - self.values.as_ptr().add(i * self.size as usize), - self.size as usize, - ) + // soundness: invariant of the function. + self.values + .get_unchecked(i * self.size..(i + 1) * self.size) } /// Returns the size pub fn size(&self) -> usize { - self.size as usize + self.size } /// Sets the validity bitmap on this [`FixedSizeBinaryArray`]. diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 530858c7e25..6033a8224b0 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -94,7 +94,7 @@ impl Utf8Array { } } - /// The same as [`Utf8Array::from_data`] but does not check for utf8. + /// The same as [`Utf8Array::from_data`] but does not check for valid utf8. /// # Safety /// `values` buffer must contain valid utf8 between every `offset` pub unsafe fn from_data_unchecked( @@ -122,13 +122,14 @@ impl Utf8Array { /// # Safety /// This function is safe `iff` `i < self.len`. pub unsafe fn value_unchecked(&self, i: usize) -> &str { - let offset = *self.offsets.as_ptr().add(i); - let offset_1 = *self.offsets.as_ptr().add(i + 1); + // soundness: the invariant of the function + let offset = *self.offsets.get_unchecked(i); + let offset_1 = *self.offsets.get_unchecked(i + 1); let length = (offset_1 - offset).to_usize(); let offset = offset.to_usize(); - // Soundness: `from_data` verifies that each slot is utf8 and offsets are built correctly. - let slice = std::slice::from_raw_parts(self.values.as_ptr().add(offset), length); + let slice = &self.values.as_slice()[offset..offset + length]; + // soundness: the invariant of the struct std::str::from_utf8_unchecked(slice) } @@ -171,8 +172,8 @@ impl Utf8Array { let offset = offset.to_usize(); let slice = &self.values.as_slice()[offset..offset + length]; - // todo: validate utf8 so that we can use the unsafe version - std::str::from_utf8(slice).unwrap() + // soundness: we always check for utf8 soundness on constructors. + unsafe { std::str::from_utf8_unchecked(slice) } } /// Returns the offsets of this [`Utf8Array`]. diff --git a/tests/it/array/binary/mod.rs b/tests/it/array/binary/mod.rs index 4b6c031936b..2cb5a374541 100644 --- a/tests/it/array/binary/mod.rs +++ b/tests/it/array/binary/mod.rs @@ -1,6 +1,7 @@ use arrow2::{ array::{Array, BinaryArray}, bitmap::Bitmap, + buffer::Buffer, datatypes::DataType, }; @@ -71,3 +72,66 @@ fn from_iter() { let a: BinaryArray = iter.collect(); assert_eq!(a.len(), 2); } + +#[test] +fn with_validity() { + let array = BinaryArray::::from(&[Some(b"hello".as_ref()), Some(b" ".as_ref()), None]); + + let array = array.with_validity(None); + + let a = array.validity(); + assert_eq!(a, &None); +} + +#[test] +#[should_panic] +fn wrong_offsets() { + let offsets = Buffer::from(&[0, 5, 4]); // invalid offsets + let values = Buffer::from(b"abbbbb"); + BinaryArray::::from_data(DataType::Binary, offsets, values, None); +} + +#[test] +#[should_panic] +fn wrong_data_type() { + let offsets = Buffer::from(&[0, 4]); + let values = Buffer::from(b"abbb"); + BinaryArray::::from_data(DataType::Int8, offsets, values, None); +} + +#[test] +#[should_panic] +fn value_with_wrong_offsets_panics() { + let offsets = Buffer::from(&[0, 10, 11, 4]); + let values = Buffer::from(b"abbb"); + // the 10-11 is not checked + let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); + + // but access is still checked (and panics) + // without checks, this would result in reading beyond bounds + array.value(0); +} + +#[test] +#[should_panic] +fn index_out_of_bounds_panics() { + let offsets = Buffer::from(&[0, 1, 2, 4]); + let values = Buffer::from(b"abbb"); + let array = BinaryArray::::from_data(DataType::Utf8, offsets, values, None); + + array.value(3); +} + +#[test] +#[should_panic] +fn value_unchecked_with_wrong_offsets_panics() { + let offsets = Buffer::from(&[0, 10, 11, 4]); + let values = Buffer::from(b"abbb"); + // the 10-11 is not checked + let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); + + // but access is still checked (and panics) + // without checks, this would result in reading beyond bounds, + // even if `0` is in bounds + unsafe { array.value_unchecked(0) }; +} diff --git a/tests/it/array/utf8/mod.rs b/tests/it/array/utf8/mod.rs index 3d664a47a71..c18dbacd880 100644 --- a/tests/it/array/utf8/mod.rs +++ b/tests/it/array/utf8/mod.rs @@ -127,7 +127,44 @@ fn wrong_offsets() { #[test] #[should_panic] fn wrong_data_type() { - let offsets = Buffer::from(&[0, 4]); // invalid offsets + let offsets = Buffer::from(&[0, 4]); let values = Buffer::from(b"abbb"); Utf8Array::::from_data(DataType::Int8, offsets, values, None); } + +#[test] +#[should_panic] +fn value_with_wrong_offsets_panics() { + let offsets = Buffer::from(&[0, 10, 11, 4]); + let values = Buffer::from(b"abbb"); + // the 10-11 is not checked + let array = Utf8Array::::from_data(DataType::Utf8, offsets, values, None); + + // but access is still checked (and panics) + // without checks, this would result in reading beyond bounds + array.value(0); +} + +#[test] +#[should_panic] +fn index_out_of_bounds_panics() { + let offsets = Buffer::from(&[0, 1, 2, 4]); + let values = Buffer::from(b"abbb"); + let array = Utf8Array::::from_data(DataType::Utf8, offsets, values, None); + + array.value(3); +} + +#[test] +#[should_panic] +fn value_unchecked_with_wrong_offsets_panics() { + let offsets = Buffer::from(&[0, 10, 11, 4]); + let values = Buffer::from(b"abbb"); + // the 10-11 is not checked + let array = Utf8Array::::from_data(DataType::Utf8, offsets, values, None); + + // but access is still checked (and panics) + // without checks, this would result in reading beyond bounds, + // even if index `0` is in bounds + unsafe { array.value_unchecked(0) }; +}