diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index a11469b220d..1a5abdcc330 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -40,7 +40,7 @@ pub use mutable::*; /// assert_eq!(array.values_iter().collect::>(), vec![[1, 2].as_ref(), &[], &[3]]); /// // the underlying representation: /// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3])); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 3])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3])); /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// ``` /// diff --git a/src/array/specification.rs b/src/array/specification.rs index 966df9f49f2..9e4b4b9653e 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -64,10 +64,12 @@ pub(crate) fn try_check_utf8>( let starts = offsets.starts(); let last = starts .iter() - .rev() .enumerate() - .find_map(|(i, offset)| (offset.to_usize() != values.len()).then(|| i + 1)) - .unwrap_or(starts.len() - 1); + .rev() + .find_map(|(i, offset)| (offset.to_usize() != values.len()).then(|| i)) + // all equal to the length implies that `offsets = [0, values.len()]` + // which is covered by `from_utf8(values)` + .unwrap_or(0); let mut any_invalid = false; for start in &starts[..=last] { @@ -117,7 +119,7 @@ mod tests { proptest! { // a bit expensive, feel free to run it when changing the code above - //#![proptest_config(ProptestConfig::with_cases(100000))] + // #![proptest_config(ProptestConfig::with_cases(100000))] #[test] #[cfg_attr(miri, ignore)] // miri and proptest do not work well fn check_utf8_validation(values in binary_strategy()) { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 36ce27b28bf..f8b8b86a8b8 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -51,7 +51,7 @@ impl> AsRef<[u8]> for StrAsBytes { /// // the underlying representation /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec())); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5])); /// # } /// ``` /// diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index 4fc4d01138d..a9cf9c199c2 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -17,10 +17,10 @@ pub fn take_values( let mut buffer = Vec::with_capacity(new_len); starts .iter() - .zip(offsets.buffer().windows(2)) - .for_each(|(start_, window)| { - let start = start_.to_usize(); - let end = (*start_ + (window[1] - window[0])).to_usize(); + .map(|start| start.to_usize()) + .zip(offsets.lengths()) + .for_each(|(start, length)| { + let end = start + length; buffer.extend_from_slice(&values[start..end]); }); buffer.into() @@ -32,27 +32,16 @@ pub fn take_no_validity( values: &[u8], indices: &[I], ) -> (OffsetsBuffer, Buffer, Option) { - let mut length = O::zero(); let mut buffer = Vec::::new(); - let offsets = offsets.buffer(); - let offsets = indices.iter().map(|index| { - let index = index.to_usize(); - let start = offsets[index]; - let length_h = offsets[index + 1] - start; - length += length_h; - - let _start = start.to_usize(); - let end = (start + length_h).to_usize(); - buffer.extend_from_slice(&values[_start..end]); - length + let lengths = indices.iter().map(|index| index.to_usize()).map(|index| { + let (start, end) = offsets.start_end(index); + // todo: remove this bound check + buffer.extend_from_slice(&values[start..end]); + end - start }); - let offsets = std::iter::once(O::zero()) - .chain(offsets) - .collect::>(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); + let offsets = Offsets::try_from_lengths(lengths).expect(""); - (offsets, buffer.into(), None) + (offsets.into(), buffer.into(), None) } // take implementation when only values contain nulls diff --git a/src/offset.rs b/src/offset.rs index 6d0878fa2b5..2337f082218 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -420,6 +420,12 @@ impl OffsetsBuffer { Self(self.0.slice_unchecked(offset, length)) } + /// Returns an iterator with the lengths of the offsets + #[inline] + pub fn lengths(&self) -> impl Iterator + '_ { + self.0.windows(2).map(|w| (w[1] - w[0]).to_usize()) + } + /// Returns the inner [`Buffer`]. #[inline] pub fn into_inner(self) -> Buffer { diff --git a/tests/it/io/ipc/read/file.rs b/tests/it/io/ipc/read/file.rs index 9d21d051f2a..515a6ede92f 100644 --- a/tests/it/io/ipc/read/file.rs +++ b/tests/it/io/ipc/read/file.rs @@ -106,6 +106,12 @@ fn read_generated_100_decimal() -> Result<()> { test_file("1.0.0-bigendian", "generated_decimal") } +#[test] +fn read_generated_duplicate_fieldnames() -> Result<()> { + test_file("1.0.0-littleendian", "generated_duplicate_fieldnames")?; + test_file("1.0.0-bigendian", "generated_duplicate_fieldnames") +} + #[test] fn read_generated_100_interval() -> Result<()> { test_file("1.0.0-littleendian", "generated_interval")?;