diff --git a/src/alloc/mod.rs b/src/alloc/mod.rs index 6ee0afaada1..0686b613a77 100644 --- a/src/alloc/mod.rs +++ b/src/alloc/mod.rs @@ -29,7 +29,7 @@ use crate::types::NativeType; mod alignment; -use alignment::ALIGNMENT; +pub use alignment::ALIGNMENT; // If this number is not zero after all objects have been `drop`, there is a memory leak pub static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0); @@ -39,8 +39,10 @@ pub fn total_allocated_bytes() -> isize { unsafe { ALLOCATIONS.load(std::sync::atomic::Ordering::SeqCst) } } +/// # Safety +/// This pointer may only be used to check if memory is allocated. #[inline] -unsafe fn dangling<T: NativeType>() -> NonNull<T> { +pub unsafe fn dangling<T: NativeType>() -> NonNull<T> { NonNull::new_unchecked(ALIGNMENT as *mut T) } diff --git a/src/array/binary/from.rs b/src/array/binary/from.rs index e8ee616610e..416047ad518 100644 --- a/src/array/binary/from.rs +++ b/src/array/binary/from.rs @@ -99,7 +99,6 @@ mod tests { let array = BinaryArray::<i32>::from(&[Some(b"hello".as_ref()), Some(b" ".as_ref()), None]); let a = array.validity().as_ref().unwrap(); - assert_eq!(a.len(), 3); - assert_eq!(a.as_slice()[0], 0b00000011); + assert_eq!(a, &Bitmap::from([true, true, false])); } } diff --git a/src/array/utf8/from.rs b/src/array/utf8/from.rs index 1e3d9857454..036e548699c 100644 --- a/src/array/utf8/from.rs +++ b/src/array/utf8/from.rs @@ -265,8 +265,7 @@ mod tests { let array = Utf8Array::<i32>::from(&[Some("hello"), Some(" "), None]); let a = array.validity().as_ref().unwrap(); - assert_eq!(a.len(), 3); - assert_eq!(a.as_slice()[0], 0b00000011); + assert_eq!(a, &Bitmap::from([true, true, false])); } #[test] diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index ca978b6ea6c..a6a3a70b2e9 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -211,19 +211,17 @@ impl Bitmap { } } -// Methods used for IPC impl Bitmap { - #[inline] - pub(crate) fn offset(&self) -> usize { - self.offset % 8 - } - - /// Returns the byte slice of this Bitmap. + /// Returns the byte slice of this [`Bitmap`], the bit offset into its first byte, and its length in bits.
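+ /// + /// A hypothetical usage sketch (illustrative only, not part of this patch): + /// ``` + /// let (slice, offset, length) = bitmap.as_slice(); + /// // the bitmap's first bit is bit `offset` of `slice[0]`; `length` bits follow it + /// let first_is_set = length > 0 && (slice[0] & (1u8 << offset)) != 0; + /// ```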
#[inline] - pub(crate) fn as_slice(&self) -> &[u8] { + pub fn as_slice(&self) -> (&[u8], usize, usize) { let start = self.offset / 8; - let len = (self.offset() + self.length).saturating_add(7) / 8; - &self.bytes[start..start + len] + let len = (self.offset % 8 + self.length).saturating_add(7) / 8; + ( + &self.bytes[start..start + len], + self.offset % 8, + self.length, + ) } } @@ -242,48 +240,3 @@ impl<'a> Bitmap { BitmapIter::<'a>::new(&self.bytes, self.offset, self.length) } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn as_slice() { - let b = Bitmap::from([true, true, true, true, true, true, true, true, true]); - - let slice = b.as_slice(); - assert_eq!(slice, &[0b11111111, 0b1]); - - assert_eq!(0, b.offset()); - } - - #[test] - fn as_slice_offset() { - let b = Bitmap::from([true, true, true, true, true, true, true, true, true]); - let b = b.slice(8, 1); - - let slice = b.as_slice(); - assert_eq!(slice, &[0b1]); - - assert_eq!(0, b.offset()); - } - - #[test] - fn as_slice_offset_middle() { - let b = Bitmap::from_u8_slice(&[0, 0, 0, 0b00010101], 27); - let b = b.slice(22, 5); - - let slice = b.as_slice(); - assert_eq!(slice, &[0, 0b00010101]); - - assert_eq!(6, b.offset()); - } - - #[test] - fn test_debug() { - let b = Bitmap::from([true, true, false, true, true, true, true, true, true]); - let b = b.slice(2, 7); - - assert_eq!(format!("{:?}", b), "[0b111110__, 0b_______1]"); - } -} diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs index f096354caff..780a479e357 100644 --- a/src/bitmap/mutable.rs +++ b/src/bitmap/mutable.rs @@ -439,7 +439,16 @@ impl MutableBitmap { /// Extends the [`MutableBitmap`] from a [`Bitmap`]. #[inline] pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) { - self.extend_from_slice(bitmap.as_slice(), bitmap.offset(), bitmap.len()); + let (slice, offset, length) = bitmap.as_slice(); + self.extend_from_slice(slice, offset, length); + } + + /// Returns the slice of bytes of this [`MutableBitmap`]. + /// Note that the last byte may not be fully used. 
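+ /// + /// A minimal sketch of the intent (hypothetical values): + /// ``` + /// let mut bitmap = MutableBitmap::new(); + /// (0..10).for_each(|_| bitmap.push(true)); + /// // 10 bits occupy two bytes; only two bits of the last byte are used + /// assert_eq!(bitmap.as_slice().len(), 2); + /// ```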
+ #[inline] + pub fn as_slice(&self) -> &[u8] { + let len = (self.length).saturating_add(7) / 8; + &self.buffer[..len] } } @@ -448,182 +457,3 @@ impl Default for MutableBitmap { Self::new() } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_trusted_len() { - let data = vec![true; 65]; - let bitmap = MutableBitmap::from_trusted_len_iter(data.into_iter()); - let bitmap: Bitmap = bitmap.into(); - assert_eq!(bitmap.len(), 65); - - assert_eq!(bitmap.as_slice()[8], 0b00000001); - } - - #[test] - fn test_trusted_len_small() { - let data = vec![true; 7]; - let bitmap = MutableBitmap::from_trusted_len_iter(data.into_iter()); - let bitmap: Bitmap = bitmap.into(); - assert_eq!(bitmap.len(), 7); - - assert_eq!(bitmap.as_slice()[0], 0b01111111); - } - - #[test] - fn test_push() { - let mut bitmap = MutableBitmap::new(); - bitmap.push(true); - bitmap.push(false); - bitmap.push(false); - for _ in 0..7 { - bitmap.push(true) - } - let bitmap: Bitmap = bitmap.into(); - assert_eq!(bitmap.len(), 10); - - assert_eq!(bitmap.as_slice(), &[0b11111001, 0b00000011]); - } - - #[test] - fn test_push_small() { - let mut bitmap = MutableBitmap::new(); - bitmap.push(true); - bitmap.push(true); - bitmap.push(false); - let bitmap: Option<Bitmap> = bitmap.into(); - let bitmap = bitmap.unwrap(); - assert_eq!(bitmap.len(), 3); - assert_eq!(bitmap.as_slice()[0], 0b00000011); - } - - #[test] - fn test_push_exact_zeros() { - let mut bitmap = MutableBitmap::new(); - for _ in 0..8 { - bitmap.push(false) - } - let bitmap: Option<Bitmap> = bitmap.into(); - let bitmap = bitmap.unwrap(); - assert_eq!(bitmap.len(), 8); - assert_eq!(bitmap.as_slice().len(), 1); - } - - #[test] - fn test_push_exact_ones() { - let mut bitmap = MutableBitmap::new(); - for _ in 0..8 { - bitmap.push(true) - } - let bitmap: Option<Bitmap> = bitmap.into(); - assert!(bitmap.is_none()); - } - - #[test] - fn test_capacity() { - let b = MutableBitmap::with_capacity(10); - assert_eq!(b.capacity(), 512); - - let b = MutableBitmap::with_capacity(512); - assert_eq!(b.capacity(), 512); - - let mut b = MutableBitmap::with_capacity(512); - b.reserve(8); - assert_eq!(b.capacity(), 512); - } - - #[test] - fn test_capacity_push() { - let mut b = MutableBitmap::with_capacity(512); - (0..512).for_each(|_| b.push(true)); - assert_eq!(b.capacity(), 512); - b.reserve(8); - assert_eq!(b.capacity(), 1024); - } - - #[test] - fn test_extend() { - let mut b = MutableBitmap::new(); - - let iter = (0..512).map(|i| i % 6 == 0); - unsafe { b.extend_from_trusted_len_iter_unchecked(iter) }; - let b: Bitmap = b.into(); - for (i, v) in b.iter().enumerate() { - assert_eq!(i % 6 == 0, v); - } - } - - #[test] - fn test_extend_offset() { - let mut b = MutableBitmap::new(); - b.push(true); - - let iter = (0..512).map(|i| i % 6 == 0); - unsafe { b.extend_from_trusted_len_iter_unchecked(iter) }; - let b: Bitmap = b.into(); - let mut iter = b.iter().enumerate(); - assert!(iter.next().unwrap().1); - for (i, v) in iter { - assert_eq!((i - 1) % 6 == 0, v); - } - } - - #[test] - fn test_set() { - let mut bitmap = MutableBitmap::from_len_zeroed(12); - bitmap.set(0, true); - assert!(bitmap.get(0)); - bitmap.set(0, false); - assert!(!bitmap.get(0)); - - bitmap.set(11, true); - assert!(bitmap.get(11)); - bitmap.set(11, false); - assert!(!bitmap.get(11)); - bitmap.set(11, true); - - let bitmap: Option<Bitmap> = bitmap.into(); - let bitmap = bitmap.unwrap(); - assert_eq!(bitmap.len(), 12); - assert_eq!(bitmap.as_slice()[0], 0b00000000); - } - - #[test] - fn test_extend_from_bitmap() { - let other = Bitmap::from(&[true,
false, true]); - let mut bitmap = MutableBitmap::new(); - - // call is optimized to perform a memcopy - bitmap.extend_from_bitmap(&other); - - assert_eq!(bitmap.len(), 3); - assert_eq!(bitmap.buffer[0], 0b00000101); - - // this call iterates over all bits - bitmap.extend_from_bitmap(&other); - - assert_eq!(bitmap.len(), 6); - assert_eq!(bitmap.buffer[0], 0b00101101); - } - - #[test] - fn test_debug() { - let mut b = MutableBitmap::new(); - assert_eq!(format!("{:?}", b), "[]"); - b.push(true); - b.push(false); - assert_eq!(format!("{:?}", b), "[0b______01]"); - b.push(false); - b.push(false); - b.push(false); - b.push(false); - b.push(true); - b.push(true); - assert_eq!(format!("{:?}", b), "[0b11000001]"); - b.push(true); - assert_eq!(format!("{:?}", b), "[0b11000001, 0b_______1]"); - } -} diff --git a/src/bitmap/utils/chunk_iterator/chunks_exact.rs b/src/bitmap/utils/chunk_iterator/chunks_exact.rs index b556506546b..87d8936f060 100644 --- a/src/bitmap/utils/chunk_iterator/chunks_exact.rs +++ b/src/bitmap/utils/chunk_iterator/chunks_exact.rs @@ -86,29 +86,3 @@ impl<T: BitChunk> BitChunkIterExact<T> for BitChunksExact<'_, T> { self.remainder() } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn basics() { - let mut iter = BitChunksExact::<u8>::new(&[0b11111111u8, 0b00000001u8], 9); - assert_eq!(iter.next().unwrap(), 0b11111111u8); - assert_eq!(iter.remainder(), 0b00000001u8); - } - - #[test] - fn basics_u16_small() { - let mut iter = BitChunksExact::<u16>::new(&[0b11111111u8], 9); - assert_eq!(iter.next(), None); - assert_eq!(iter.remainder(), 0b0000_0000_1111_1111u16); - } - - #[test] - fn basics_u16() { - let mut iter = BitChunksExact::<u16>::new(&[0b11111111u8, 0b00000001u8], 9); - assert_eq!(iter.next(), None); - assert_eq!(iter.remainder(), 0b0000_0001_1111_1111u16); - } -} diff --git a/src/bitmap/utils/chunk_iterator/mod.rs b/src/bitmap/utils/chunk_iterator/mod.rs index b896b1a6662..8528533a21c 100644 --- a/src/bitmap/utils/chunk_iterator/mod.rs +++ b/src/bitmap/utils/chunk_iterator/mod.rs @@ -191,170 +191,3 @@ impl<T: BitChunk> ExactSizeIterator for BitChunks<'_, T> { } unsafe impl<T: BitChunk> TrustedLen for BitChunks<'_, T> {} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn basics() { - let mut iter = BitChunks::<u16>::new(&[0b00000001u8, 0b00000010u8], 0, 16); - assert_eq!(iter.next().unwrap(), 0b0000_0010_0000_0001u16); - assert_eq!(iter.remainder(), 0); - } - - #[test] - fn remainder() { - let a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000010u8, 0b00000100u8], 0, 18); - assert_eq!(a.remainder(), 0b00000100u16); - } - - #[test] - fn remainder_saturating() { - let a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000010u8, 0b00000010u8], 0, 18); - assert_eq!(a.remainder(), 0b0000_0000_0000_0010u16); - } - - #[test] - fn basics_offset() { - let mut iter = BitChunks::<u16>::new(&[0b00000001u8, 0b00000011u8, 0b00000001u8], 1, 16); - assert_eq!(iter.remainder(), 0); - assert_eq!(iter.next().unwrap(), 0b1000_0001_1000_0000u16); - assert_eq!(iter.next(), None); - } - - #[test] - fn basics_offset_remainder() { - let mut a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000011u8, 0b10000001u8], 1, 15); - assert_eq!(a.next(), None); - assert_eq!(a.remainder(), 0b1000_0001_1000_0000u16); - assert_eq!(a.remainder_len(), 15); - } - - #[test] - fn offset_remainder_saturating() { - let a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000011u8, 0b00000011u8], 1, 17); - assert_eq!(a.remainder(), 0b0000_0000_0000_0001u16); - } - - #[test] - fn offset_remainder_saturating2() { - let a = BitChunks::<u64>::new(&[0b01001001u8, 0b00000001], 1, 8); -
assert_eq!(a.remainder(), 0b1010_0100u64); - } - - #[test] - fn offset_remainder_saturating3() { - let input: &[u8] = &[0b01000000, 0b01000001]; - let a = BitChunks::<u64>::new(input, 8, 2); - assert_eq!(a.remainder(), 0b0100_0001u64); - } - - #[test] - fn basics_multiple() { - let mut iter = BitChunks::<u16>::new( - &[0b00000001u8, 0b00000010u8, 0b00000100u8, 0b00001000u8], - 0, - 4 * 8, - ); - assert_eq!(iter.next().unwrap(), 0b0000_0010_0000_0001u16); - assert_eq!(iter.next().unwrap(), 0b0000_1000_0000_0100u16); - assert_eq!(iter.remainder(), 0); - } - - #[test] - fn basics_multiple_offset() { - let mut iter = BitChunks::<u16>::new( - &[ - 0b00000001u8, - 0b00000010u8, - 0b00000100u8, - 0b00001000u8, - 0b00000001u8, - ], - 1, - 4 * 8, - ); - assert_eq!(iter.next().unwrap(), 0b0000_0001_0000_0000u16); - assert_eq!(iter.next().unwrap(), 0b1000_0100_0000_0010u16); - assert_eq!(iter.remainder(), 0); - } - - #[test] - fn remainder_large() { - let input: &[u8] = &[ - 0b00100100, 0b01001001, 0b10010010, 0b00100100, 0b01001001, 0b10010010, 0b00100100, - 0b01001001, 0b10010010, 0b00100100, 0b01001001, 0b10010010, 0b00000100, - ]; - let mut iter = BitChunks::<u8>::new(input, 0, 8 * 12 + 4); - assert_eq!(iter.remainder_len(), 100 - 96); - - for j in 0..12 { - let mut a = BitChunkIter::new(iter.next().unwrap(), 8); - for i in 0..8 { - assert_eq!(a.next().unwrap(), (j * 8 + i + 1) % 3 == 0); - } - } - assert_eq!(None, iter.next()); - - use crate::types::BitChunkIter; - let expected_remainder = 0b00000100u8; - assert_eq!(iter.remainder(), expected_remainder); - - let mut a = BitChunkIter::new(expected_remainder, 8); - for i in 0..4 { - assert_eq!(a.next().unwrap(), (i + 1) % 3 == 0); - } - } - - #[test] - fn basics_1() { - let mut iter = BitChunks::<u16>::new( - &[0b00000001u8, 0b00000010u8, 0b00000100u8, 0b00001000u8], - 8, - 3 * 8, - ); - assert_eq!(iter.next().unwrap(), 0b0000_0100_0000_0010u16); - assert_eq!(iter.next(), None); - assert_eq!(iter.remainder(), 0b0000_0000_0000_1000u16); - assert_eq!(iter.remainder_len(), 8); - } - - #[test] - fn basics_2() { - let mut iter = BitChunks::<u16>::new( - &[0b00000001u8, 0b00000010u8, 0b00000100u8, 0b00001000u8], - 7, - 3 * 8, - ); - assert_eq!(iter.remainder(), 0b0000_0000_0001_0000u16); - assert_eq!(iter.next().unwrap(), 0b0000_1000_0000_0100u16); - assert_eq!(iter.next(), None); - } - - #[test] - fn remainder_1() { - let mut iter = BitChunks::<u64>::new(&[0b11111111u8, 0b00000001u8], 0, 9); - assert_eq!(iter.next(), None); - assert_eq!(iter.remainder(), 0b1_1111_1111u64); - } - - #[test] - fn remainder_2() { - // (i % 3 == 0) in bitmap - let input: &[u8] = &[ - 0b01001001, 0b10010010, 0b00100100, 0b01001001, 0b10010010, 0b00100100, 0b01001001, - 0b10010010, 0b00100100, 0b01001001, /* 73 */ - 0b10010010, /* 146 */ - 0b00100100, 0b00001001, - ]; - let offset = 10; // 8 + 2 - let length = 90; - - let mut iter = BitChunks::<u64>::new(input, offset, length); - let first: u64 = 0b0100100100100100100100100100100100100100100100100100100100100100; - assert_eq!(first, iter.next().unwrap()); - assert_eq!(iter.next(), None); - assert_eq!(iter.remainder(), 0b10010010010010010010010010u64); - } -} diff --git a/src/bitmap/utils/iterator.rs b/src/bitmap/utils/iterator.rs index 87678fa8ba7..9211bbf730d 100644 --- a/src/bitmap/utils/iterator.rs +++ b/src/bitmap/utils/iterator.rs @@ -68,51 +68,3 @@ impl<'a> DoubleEndedIterator for BitmapIter<'a> { } unsafe impl TrustedLen for BitmapIter<'_> {} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn basic() { - let values = &[0b01011011u8]; - let iter = 
BitmapIter::new(values, 0, 6); - let result = iter.collect::<Vec<_>>(); - assert_eq!(result, vec![true, true, false, true, true, false]) - } - - #[test] - fn large() { - let values = &[0b01011011u8]; - let values = std::iter::repeat(values) - .take(63) - .flatten() - .copied() - .collect::<Vec<_>>(); - let len = 63 * 8; - let iter = BitmapIter::new(&values, 0, len); - assert_eq!(iter.count(), len); - } - - #[test] - fn offset() { - let values = &[0b01011011u8]; - let iter = BitmapIter::new(values, 2, 4); - let result = iter.collect::<Vec<_>>(); - assert_eq!(result, vec![false, true, true, false]) - } - - #[test] - fn rev() { - let values = &[0b01011011u8, 0b01011011u8]; - let iter = BitmapIter::new(values, 2, 13); - let result = iter.rev().collect::<Vec<_>>(); - assert_eq!( - result, - vec![false, true, true, false, true, false, true, true, false, true, true, false, true] - .into_iter() - .rev() - .collect::<Vec<_>>() - ) - } -} diff --git a/src/bitmap/utils/mod.rs b/src/bitmap/utils/mod.rs index 9adfaa937ff..979900b2341 100644 --- a/src/bitmap/utils/mod.rs +++ b/src/bitmap/utils/mod.rs @@ -83,70 +83,3 @@ pub fn null_count(slice: &[u8], offset: usize, len: usize) -> usize { len - count } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_get_bit() { - let input: &[u8] = &[ - 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000, - 0b01000000, 0b11111111, - ]; - for i in 0..8 { - assert!(!get_bit(input, i)); - } - assert!(get_bit(input, 8)); - for i in 8 + 1..2 * 8 { - assert!(!get_bit(input, i)); - } - assert!(get_bit(input, 2 * 8 + 1)); - for i in 2 * 8 + 2..3 * 8 { - assert!(!get_bit(input, i)); - } - assert!(get_bit(input, 3 * 8 + 2)); - for i in 3 * 8 + 3..4 * 8 { - assert!(!get_bit(input, i)); - } - assert!(get_bit(input, 4 * 8 + 3)); - } - - #[test] - fn test_null_count() { - let input: &[u8] = &[ - 0b01001001, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000, - 0b01000000, 0b11111111, - ]; - assert_eq!(null_count(input, 0, 8), 8 - 3); - assert_eq!(null_count(input, 1, 7), 7 - 2); - assert_eq!(null_count(input, 1, 8), 8 - 3); - assert_eq!(null_count(input, 2, 7), 7 - 3); - assert_eq!(null_count(input, 0, 32), 32 - 6); - assert_eq!(null_count(input, 9, 2), 2); - - let input: &[u8] = &[0b01000000, 0b01000001]; - assert_eq!(null_count(input, 8, 2), 1); - assert_eq!(null_count(input, 8, 3), 2); - assert_eq!(null_count(input, 8, 4), 3); - assert_eq!(null_count(input, 8, 5), 4); - assert_eq!(null_count(input, 8, 6), 5); - assert_eq!(null_count(input, 8, 7), 5); - assert_eq!(null_count(input, 8, 8), 6); - - let input: &[u8] = &[0b01000000, 0b01010101]; - assert_eq!(null_count(input, 9, 2), 1); - assert_eq!(null_count(input, 10, 2), 1); - assert_eq!(null_count(input, 11, 2), 1); - assert_eq!(null_count(input, 12, 2), 1); - assert_eq!(null_count(input, 13, 2), 1); - assert_eq!(null_count(input, 14, 2), 1); - } - - #[test] - fn null_count_1() { - // offset = 10, len = 90 => remainder - let input: &[u8] = &[73, 146, 36, 73, 146, 36, 73, 146, 36, 73, 146, 36, 9]; - assert_eq!(null_count(input, 10, 90), 60); - } -} diff --git a/src/bitmap/utils/slice_iterator.rs b/src/bitmap/utils/slice_iterator.rs index f2633a16159..686c5f1c16b 100644 --- a/src/bitmap/utils/slice_iterator.rs +++ b/src/bitmap/utils/slice_iterator.rs @@ -28,8 +28,7 @@ pub struct SlicesIterator<'a> { impl<'a> SlicesIterator<'a> { pub fn new(values: &'a Bitmap) -> Self { - let offset = values.offset(); - let buffer = values.as_slice(); + let (buffer, offset, _) = values.as_slice(); let mut iter = 
buffer.iter(); let (current_byte, state) = match iter.next() { @@ -152,145 +151,3 @@ impl<'a> Iterator for SlicesIterator<'a> { } } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn check_invariant() { - let values = (0..8).map(|i| i % 2 != 0).collect::<Bitmap>(); - let iter = SlicesIterator::new(&values); - - let slots = iter.slots(); - - let slices = iter.collect::<Vec<_>>(); - - assert_eq!(slices, vec![(1, 1), (3, 1), (5, 1), (7, 1)]); - - let mut sum = 0; - for (_, len) in slices { - sum += len; - } - assert_eq!(sum, slots); - } - - #[test] - fn single_set() { - let values = (0..16).map(|i| i == 1).collect::<Bitmap>(); - - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - let chunks = iter.collect::<Vec<_>>(); - - assert_eq!(chunks, vec![(1, 1)]); - assert_eq!(count, 1); - } - - #[test] - fn single_unset() { - let values = (0..64).map(|i| i != 1).collect::<Bitmap>(); - - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - let chunks = iter.collect::<Vec<_>>(); - - assert_eq!(chunks, vec![(0, 1), (2, 62)]); - assert_eq!(count, 64 - 1); - } - - #[test] - fn generic() { - let values = (0..130).map(|i| i % 62 != 0).collect::<Bitmap>(); - - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - let chunks = iter.collect::<Vec<_>>(); - - assert_eq!(chunks, vec![(1, 61), (63, 61), (125, 5)]); - assert_eq!(count, 61 + 61 + 5); - } - - #[test] - fn incomplete_byte() { - let values = (0..6).map(|i| i == 1).collect::<Bitmap>(); - - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - let chunks = iter.collect::<Vec<_>>(); - - assert_eq!(chunks, vec![(1, 1)]); - assert_eq!(count, 1); - } - - #[test] - fn incomplete_byte1() { - let values = (0..12).map(|i| i == 9).collect::<Bitmap>(); - - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - let chunks = iter.collect::<Vec<_>>(); - - assert_eq!(chunks, vec![(9, 1)]); - assert_eq!(count, 1); - } - - #[test] - fn end_of_byte() { - let values = (0..16).map(|i| i != 7).collect::<Bitmap>(); - - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - let chunks = iter.collect::<Vec<_>>(); - - assert_eq!(chunks, vec![(0, 7), (8, 8)]); - assert_eq!(count, 15); - } - - #[test] - fn bla() { - let values = vec![true, true, true, true, true, true, true, false] - .into_iter() - .collect::<Bitmap>(); - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - assert_eq!(values.null_count() + iter.slots(), values.len()); - - let total = iter.into_iter().fold(0, |acc, x| acc + x.1); - - assert_eq!(count, total); - } - - #[test] - fn past_end_should_not_be_returned() { - let values = Bitmap::from_u8_slice(&[0b11111010], 3); - let iter = SlicesIterator::new(&values); - let count = iter.slots(); - assert_eq!(values.null_count() + iter.slots(), values.len()); - - let total = iter.into_iter().fold(0, |acc, x| acc + x.1); - - assert_eq!(count, total); - } - - #[test] - fn sliced() { - let values = Bitmap::from_u8_slice(&[0b11111010, 0b11111011], 16); - let values = values.slice(8, 2); - let iter = SlicesIterator::new(&values); - - let chunks = iter.collect::<Vec<_>>(); - - // the first "11" in the second byte - assert_eq!(chunks, vec![(0, 2)]); - } - - #[test] - fn remainder_1() { - let values = Bitmap::from_u8_slice(&[0, 0, 0b00000000, 0b00010101], 27); - let values = values.slice(22, 5); - let iter = SlicesIterator::new(&values); - let chunks = iter.collect::<Vec<_>>(); - assert_eq!(chunks, vec![(2, 1), (4, 1)]); - } -} diff --git a/src/bitmap/utils/zip_validity.rs b/src/bitmap/utils/zip_validity.rs index 16442bda10e..615d749ce28 100644 --- 
a/src/bitmap/utils/zip_validity.rs +++ b/src/bitmap/utils/zip_validity.rs @@ -82,112 +82,3 @@ pub fn zip_validity<T, I: Iterator<Item = T>>( ) -> ZipValidity<T, I> { ZipValidity::<T, I>::new(values, validity) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn basic() { - let a = Some(Bitmap::from([true, false])); - let values = vec![0, 1]; - let zip = zip_validity(values.into_iter(), &a); - - let a = zip.collect::<Vec<_>>(); - assert_eq!(a, vec![Some(0), None]); - } - - #[test] - fn complete() { - let a = Some(Bitmap::from([ - true, false, true, false, true, false, true, false, - ])); - let values = vec![0, 1, 2, 3, 4, 5, 6, 7]; - let zip = zip_validity(values.into_iter(), &a); - - let a = zip.collect::<Vec<_>>(); - assert_eq!( - a, - vec![Some(0), None, Some(2), None, Some(4), None, Some(6), None] - ); - } - - #[test] - fn slices() { - let a = Some(Bitmap::from([true, false])); - let offsets = vec![0, 2, 3]; - let values = vec![1, 2, 3]; - let iter = offsets.windows(2).map(|x| { - let start = x[0]; - let end = x[1]; - &values[start..end] - }); - let zip = zip_validity(iter, &a); - - let a = zip.collect::<Vec<_>>(); - assert_eq!(a, vec![Some([1, 2].as_ref()), None]); - } - - #[test] - fn byte() { - let a = Some(Bitmap::from([ - true, false, true, false, false, true, true, false, true, - ])); - let values = vec![0, 1, 2, 3, 4, 5, 6, 7, 8]; - let zip = zip_validity(values.into_iter(), &a); - - let a = zip.collect::<Vec<_>>(); - assert_eq!( - a, - vec![ - Some(0), - None, - Some(2), - None, - None, - Some(5), - Some(6), - None, - Some(8) - ] - ); - } - - #[test] - fn offset() { - let a = Bitmap::from([true, false, true, false, false, true, true, false, true]); - let a = Some(a.slice(1, 8)); - let values = vec![0, 1, 2, 3, 4, 5, 6, 7]; - let zip = zip_validity(values.into_iter(), &a); - - let a = zip.collect::<Vec<_>>(); - assert_eq!( - a, - vec![None, Some(1), None, None, Some(4), Some(5), None, Some(7)] - ); - } - - #[test] - fn none() { - let values = vec![0, 1, 2]; - let zip = zip_validity(values.into_iter(), &None); - - let a = zip.collect::<Vec<_>>(); - assert_eq!(a, vec![Some(0), Some(1), Some(2)]); - } - - #[test] - fn rev() { - let a = Bitmap::from([true, false, true, false, false, true, true, false, true]); - let a = Some(a.slice(1, 8)); - let values = vec![0, 1, 2, 3, 4, 5, 6, 7]; - let zip = zip_validity(values.into_iter(), &a); - - let result = zip.rev().collect::<Vec<_>>(); - let expected = vec![None, Some(1), None, None, Some(4), Some(5), None, Some(7)] - .into_iter() - .rev() - .collect::<Vec<_>>(); - assert_eq!(result, expected); - } -} diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 476f8ffdf28..9805703120b 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -166,76 +166,3 @@ impl<T: NativeType> FromIterator<T> for Buffer<T> { MutableBuffer::from_iter(iter).into() } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_new() { - let buffer = Buffer::<i32>::new(); - assert_eq!(buffer.len(), 0); - assert!(buffer.is_empty()); - } - - #[test] - fn test_new_zeroed() { - let buffer = Buffer::<i32>::new_zeroed(2); - assert_eq!(buffer.len(), 2); - assert!(!buffer.is_empty()); - assert_eq!(buffer.as_slice(), &[0, 0]); - } - - #[test] - fn test_from_slice() { - let buffer = Buffer::<i32>::from(&[0, 1, 2]); - assert_eq!(buffer.len(), 3); - assert_eq!(buffer.as_slice(), &[0, 1, 2]); - } - - #[test] - fn test_slice() { - let buffer = Buffer::<i32>::from(&[0, 1, 2, 3]); - let buffer = buffer.slice(1, 2); - assert_eq!(buffer.len(), 2); - assert_eq!(buffer.as_slice(), &[1, 2]); - } - - #[test] - fn test_from_iter() { - let buffer = (0..3).collect::<Buffer<i32>>(); - 
assert_eq!(buffer.len(), 3); - assert_eq!(buffer.as_slice(), &[0, 1, 2]); - } - - #[test] - fn test_from_trusted_len_iter() { - let buffer = unsafe { Buffer::<i32>::from_trusted_len_iter_unchecked(0..3) }; - assert_eq!(buffer.len(), 3); - assert_eq!(buffer.as_slice(), &[0, 1, 2]); - } - - #[test] - fn test_try_from_trusted_len_iter() { - let iter = (0..3).map(Result::<_, String>::Ok); - let buffer = unsafe { Buffer::<i32>::try_from_trusted_len_iter_unchecked(iter) }.unwrap(); - assert_eq!(buffer.len(), 3); - assert_eq!(buffer.as_slice(), &[0, 1, 2]); - } - - #[test] - fn test_as_ptr() { - let buffer = Buffer::<i32>::from(&[0, 1, 2, 3]); - let buffer = buffer.slice(1, 2); - let ptr = buffer.as_ptr(); - assert_eq!(unsafe { *ptr }, 1); - } - - #[test] - fn test_debug() { - let buffer = Buffer::<i32>::from(&[0, 1, 2, 3]); - let buffer = buffer.slice(1, 2); - let a = format!("{:?}", buffer); - assert_eq!(a, "[1, 2]") - } -} diff --git a/src/buffer/mutable.rs b/src/buffer/mutable.rs index a021c7a3438..658bce7a82b 100644 --- a/src/buffer/mutable.rs +++ b/src/buffer/mutable.rs @@ -630,156 +630,3 @@ impl<T: NativeType> MutableBuffer<T> { MutableBuffer::from_trusted_len_iter_unchecked(iter).into() } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn default() { - let b = MutableBuffer::<i32>::default(); - assert_eq!(b.len(), 0); - assert!(b.is_empty()); - } - - #[test] - fn with_capacity() { - let b = MutableBuffer::<i32>::with_capacity(6); - assert!(b.capacity() >= 6); - assert!(b.is_empty()); - } - - #[test] - fn from_len_zeroed() { - let b = MutableBuffer::<i32>::from_len_zeroed(3); - assert_eq!(b.len(), 3); - assert!(!b.is_empty()); - assert_eq!(b.as_slice(), &[0, 0, 0]); - } - - #[test] - fn resize() { - let mut b = MutableBuffer::<i32>::new(); - b.resize(3, 1); - assert_eq!(b.len(), 3); - assert_eq!(b.as_slice(), &[1, 1, 1]); - assert_eq!(b.as_mut_slice(), &[1, 1, 1]); - } - - // branch that uses alloc_zeroed - #[test] - fn resize_from_zero() { - let mut b = MutableBuffer::<i32>::new(); - b.resize(3, 0); - assert_eq!(b.len(), 3); - assert_eq!(b.as_slice(), &[0, 0, 0]); - } - - #[test] - fn resize_smaller() { - let mut b = MutableBuffer::<i32>::from_len_zeroed(3); - b.resize(2, 1); - assert_eq!(b.len(), 2); - assert_eq!(b.as_slice(), &[0, 0]); - } - - #[test] - fn extend_from_slice() { - let mut b = MutableBuffer::<i32>::from_len_zeroed(1); - b.extend_from_slice(&[1, 2]); - assert_eq!(b.len(), 3); - assert_eq!(b.as_slice(), &[0, 1, 2]); - - assert_eq!(unsafe { *b.as_ptr() }, 0); - assert_eq!(unsafe { *b.as_mut_ptr() }, 0); - } - - #[test] - fn push() { - let mut b = MutableBuffer::<i32>::new(); - for _ in 0..17 { - b.push(1); - } - assert_eq!(b.len(), 17); - } - - #[test] - fn capacity() { - let b = MutableBuffer::<i32>::with_capacity(10); - assert_eq!(b.capacity(), 64 / std::mem::size_of::<i32>()); - let b = MutableBuffer::<i32>::with_capacity(16); - assert_eq!(b.capacity(), 16); - - let b = MutableBuffer::<i32>::with_capacity(64); - assert!(b.capacity() >= 64); - - let mut b = MutableBuffer::<f32>::with_capacity(16); - b.reserve(4); - assert_eq!(b.capacity(), 16); - b.extend_from_slice(&[0.1; 16]); - b.reserve(4); - assert_eq!(b.capacity(), 32); - } - - #[test] - fn extend() { - let mut b = MutableBuffer::<i32>::new(); - b.extend(0..3); - assert_eq!(b.as_slice(), &[0, 1, 2]); - } - - #[test] - fn extend_constant() { - let mut b = MutableBuffer::<i32>::new(); - b.extend_constant(3, 1); - assert_eq!(b.as_slice(), &[1, 1, 1]); - } - - #[test] - fn from_iter() { - let b = (0..3).collect::<MutableBuffer<i32>>(); - assert_eq!(b.as_slice(), &[0, 1, 2]); - } - - #[test] - fn from_as_ref() { - let b = 
MutableBuffer::<i32>::from(&[0, 1, 2]); - assert_eq!(b.as_slice(), &[0, 1, 2]); - } - - #[test] - fn from_trusted_len_iter() { - let b = unsafe { MutableBuffer::<i32>::from_trusted_len_iter_unchecked(0..3) }; - assert_eq!(b.as_slice(), &[0, 1, 2]); - } - - #[test] - fn try_from_trusted_len_iter() { - let iter = (0..3).map(Result::<_, String>::Ok); - let buffer = - unsafe { MutableBuffer::<i32>::try_from_trusted_len_iter_unchecked(iter) }.unwrap(); - assert_eq!(buffer.len(), 3); - assert_eq!(buffer.as_slice(), &[0, 1, 2]); - } - - #[test] - fn to_buffer() { - let b = (0..3).collect::<MutableBuffer<i32>>(); - let b: Buffer<i32> = b.into(); - assert_eq!(b.as_slice(), &[0, 1, 2]); - } - - #[test] - fn to_bytes() { - let b = (0..3).collect::<MutableBuffer<i32>>(); - let b: Bytes<i32> = b.into(); - assert_eq!(b.as_ref(), &[0, 1, 2]); - } - - #[test] - fn test_debug() { - let buffer = MutableBuffer::<i32>::from(&[0, 1, 2, 3]); - let a = format!("{:?}", buffer); - assert_eq!(a, "[0, 1, 2, 3]") - } -} diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index d49411c3b34..8ec120da5c1 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -4,7 +4,7 @@ use crate::datatypes::{DataType, IntervalUnit}; use crate::types::days_ms; fn validity_size(validity: &Option<Bitmap>) -> usize { - validity.as_ref().map(|b| b.as_slice().len()).unwrap_or(0) + validity.as_ref().map(|b| b.as_slice().0.len()).unwrap_or(0) } macro_rules! dyn_primitive { @@ -55,7 +55,7 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { Null => 0, Boolean => { let array = array.as_any().downcast_ref::<BooleanArray>().unwrap(); - array.values().as_slice().len() + validity_size(array.validity()) + array.values().as_slice().0.len() + validity_size(array.validity()) } Int8 => dyn_primitive!(array, i8), Int16 => dyn_primitive!(array, i16), diff --git a/src/compute/aggregate/min_max.rs b/src/compute/aggregate/min_max.rs index 2ab1bf15346..368f2cc6eb6 100644 --- a/src/compute/aggregate/min_max.rs +++ b/src/compute/aggregate/min_max.rs @@ -108,9 +108,9 @@ where T: NativeType + Simd, T::Simd: SimdOrd<T>, { - if bitmap.offset() == 0 { - let validity_masks = - BitChunksExact::<<T::Simd as NativeSimd>::Chunk>::new(bitmap.as_slice(), bitmap.len()); + let (slice, offset, length) = bitmap.as_slice(); + if offset == 0 { + let validity_masks = BitChunksExact::<<T::Simd as NativeSimd>::Chunk>::new(slice, length); null_min_primitive_impl(values, validity_masks) } else { let validity_masks = bitmap.chunks::<<T::Simd as NativeSimd>::Chunk>(); @@ -125,9 +125,9 @@ where T: NativeType + Simd, T::Simd: SimdOrd<T>, { - if bitmap.offset() == 0 { - let validity_masks = - BitChunksExact::<<T::Simd as NativeSimd>::Chunk>::new(bitmap.as_slice(), bitmap.len()); + let (slice, offset, length) = bitmap.as_slice(); + if offset == 0 { + let validity_masks = BitChunksExact::<<T::Simd as NativeSimd>::Chunk>::new(slice, length); null_max_primitive_impl(values, validity_masks) } else { let validity_masks = bitmap.chunks::<<T::Simd as NativeSimd>::Chunk>(); diff --git a/src/compute/aggregate/sum.rs b/src/compute/aggregate/sum.rs index eb3b3ef4d2f..9179756d909 100644 --- a/src/compute/aggregate/sum.rs +++ b/src/compute/aggregate/sum.rs @@ -75,9 +75,9 @@ where T: NativeType + Simd, T::Simd: Add<Output = T::Simd> + Sum<T>, { - if bitmap.offset() == 0 { - let validity_masks = - BitChunksExact::<<T::Simd as NativeSimd>::Chunk>::new(bitmap.as_slice(), bitmap.len()); + let (slice, offset, length) = bitmap.as_slice(); + if offset == 0 { + let validity_masks = BitChunksExact::<<T::Simd as NativeSimd>::Chunk>::new(slice, length); null_sum_impl(values, validity_masks) } else { let validity_masks = bitmap.chunks::<<T::Simd as NativeSimd>::Chunk>(); diff --git a/src/compute/arithmetics/basic/add.rs b/src/compute/arithmetics/basic/add.rs index 
77e354dfa06..235e7686e7f 100644 --- a/src/compute/arithmetics/basic/add.rs +++ b/src/compute/arithmetics/basic/add.rs @@ -421,19 +421,19 @@ mod tests { let (result, overflow) = overflowing_add(&a, &b).unwrap(); let expected = Int32Array::from(&[None, None, None, Some(12)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b0000); + assert_eq!(overflow, Bitmap::from([false, false, false, false])); let a = Int8Array::from(&[Some(1i8), Some(100i8)]); let b = Int8Array::from(&[Some(1i8), Some(100i8)]); let (result, overflow) = overflowing_add(&a, &b).unwrap(); let expected = Int8Array::from(&[Some(2i8), Some(-56i8)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); // Trait testing let (result, overflow) = a.overflowing_add(&b).unwrap(); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); } #[test] @@ -488,17 +488,17 @@ mod tests { let (result, overflow) = overflowing_add_scalar(&a, &1i32); let expected = Int32Array::from(&vec![None, Some(7), None, Some(7)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b0000); + assert_eq!(overflow, Bitmap::from([false, false, false, false])); let a = Int8Array::from(&vec![Some(1i8), Some(100i8)]); let (result, overflow) = overflowing_add_scalar(&a, &100i8); let expected = Int8Array::from(&vec![Some(101i8), Some(-56i8)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); // Trait testing let (result, overflow) = a.overflowing_add(&100i8).unwrap(); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); } } diff --git a/src/compute/arithmetics/basic/mul.rs b/src/compute/arithmetics/basic/mul.rs index 73ade24550e..6038bc6f822 100644 --- a/src/compute/arithmetics/basic/mul.rs +++ b/src/compute/arithmetics/basic/mul.rs @@ -413,19 +413,19 @@ mod tests { let (result, overflow) = overflowing_mul(&a, &b).unwrap(); let expected = Int32Array::from(&[None, None, None, Some(36)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b0000); + assert_eq!(overflow, Bitmap::from([false, false, false, false])); let a = Int8Array::from(&[Some(1i8), Some(-100i8)]); let b = Int8Array::from(&[Some(1i8), Some(100i8)]); let (result, overflow) = overflowing_mul(&a, &b).unwrap(); let expected = Int8Array::from(&[Some(1i8), Some(-16i8)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); // Trait testing let (result, overflow) = a.overflowing_mul(&b).unwrap(); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); } #[test] @@ -480,17 +480,17 @@ mod tests { let (result, overflow) = overflowing_mul_scalar(&a, &1i32); let expected = Int32Array::from(&[None, Some(6), None, Some(6)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b0000); + assert_eq!(overflow, Bitmap::from([false, false, false, false])); let a = Int8Array::from(&[Some(1i8), Some(-100i8)]); let (result, overflow) = overflowing_mul_scalar(&a, &100i8); let expected = Int8Array::from(&[Some(100i8), Some(-16i8)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); // Trait testing let (result, overflow) = 
a.overflowing_mul(&100i8).unwrap(); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); } } diff --git a/src/compute/arithmetics/basic/sub.rs b/src/compute/arithmetics/basic/sub.rs index c617ac486f6..fd812c9cb90 100644 --- a/src/compute/arithmetics/basic/sub.rs +++ b/src/compute/arithmetics/basic/sub.rs @@ -414,19 +414,19 @@ mod tests { let (result, overflow) = overflowing_sub(&a, &b).unwrap(); let expected = Int32Array::from(&[None, None, None, Some(0)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b0000); + assert_eq!(overflow, Bitmap::from([false, false, false, false])); let a = Int8Array::from(&[Some(1i8), Some(-100i8)]); let b = Int8Array::from(&[Some(1i8), Some(100i8)]); let (result, overflow) = overflowing_sub(&a, &b).unwrap(); let expected = Int8Array::from(&[Some(0i8), Some(56i8)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); // Trait testing let (result, overflow) = a.overflowing_sub(&b).unwrap(); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); } #[test] @@ -481,17 +481,17 @@ mod tests { let (result, overflow) = overflowing_sub_scalar(&a, &1i32); let expected = Int32Array::from(&[None, Some(5), None, Some(5)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b0000); + assert_eq!(overflow, Bitmap::from([false, false, false, false])); let a = Int8Array::from(&[Some(1i8), Some(-100i8)]); let (result, overflow) = overflowing_sub_scalar(&a, &100i8); let expected = Int8Array::from(&[Some(-99i8), Some(56i8)]); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); // Trait testing let (result, overflow) = a.overflowing_sub(&100i8).unwrap(); assert_eq!(result, expected); - assert_eq!(overflow.as_slice()[0], 0b10); + assert_eq!(overflow, Bitmap::from([false, true])); } } diff --git a/src/ffi/array.rs b/src/ffi/array.rs index bde98378b7f..8f947d1b1c6 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -88,138 +88,3 @@ pub fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> { Ok(array) } - -#[cfg(test)] -mod tests { - use super::*; - use crate::datatypes::TimeUnit; - use crate::{error::Result, ffi}; - use std::sync::Arc; - - fn test_release(expected: impl Array + 'static) -> Result<()> { - // create a `ArrowArray` from the data. - let b: Arc<dyn Array> = Arc::new(expected); - - // export the array as 2 pointers. - let _ = ffi::export_to_c(b)?; - - Ok(()) - } - - fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> { - let b: Arc<dyn Array> = Arc::new(expected.clone()); - let expected = Box::new(expected) as Box<dyn Array>; - - // create a `ArrowArray` from the data. 
- let array = Arc::new(ffi::export_to_c(b)?); - - let (_, _) = array.references(); - - let result = try_from(array)?; - - assert_eq!(&result, &expected); - Ok(()) - } - - #[test] - fn test_u32() -> Result<()> { - let data = Int32Array::from(&[Some(2), None, Some(1), None]); - test_release(data) - } - - #[test] - fn test_u64() -> Result<()> { - let data = UInt64Array::from(&[Some(2), None, Some(1), None]); - test_round_trip(data) - } - - #[test] - fn test_i64() -> Result<()> { - let data = Int64Array::from(&[Some(2), None, Some(1), None]); - test_round_trip(data) - } - - #[test] - fn test_utf8() -> Result<()> { - let data = Utf8Array::<i32>::from(&vec![Some("a"), None, Some("bb"), None]); - test_round_trip(data) - } - - #[test] - fn test_large_utf8() -> Result<()> { - let data = Utf8Array::<i64>::from(&vec![Some("a"), None, Some("bb"), None]); - test_round_trip(data) - } - - #[test] - fn test_binary() -> Result<()> { - let data = - BinaryArray::<i32>::from(&vec![Some(b"a".as_ref()), None, Some(b"bb".as_ref()), None]); - test_round_trip(data) - } - - #[test] - fn test_timestamp_tz() -> Result<()> { - let data = Int64Array::from(&vec![Some(2), None, None]).to(DataType::Timestamp( - TimeUnit::Second, - Some("UTC".to_string()), - )); - test_round_trip(data) - } - - #[test] - fn test_large_binary() -> Result<()> { - let data = - BinaryArray::<i64>::from(&vec![Some(b"a".as_ref()), None, Some(b"bb".as_ref()), None]); - test_round_trip(data) - } - - #[test] - fn test_list() -> Result<()> { - let data = vec![ - Some(vec![Some(1i32), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let mut array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new(); - array.try_extend(data)?; - - let array: ListArray<i32> = array.into(); - - test_round_trip(array) - } - - #[test] - fn test_list_list() -> Result<()> { - let data = vec![ - Some(vec![ - Some(vec![None]), - Some(vec![Some(2)]), - Some(vec![Some(3)]), - ]), - None, - Some(vec![Some(vec![Some(4), None, Some(6)])]), - ]; - - let mut array = - MutableListArray::<i32, MutableListArray<i32, MutablePrimitiveArray<i32>>>::new(); - array.try_extend(data)?; - - let array: ListArray<i32> = array.into(); - - test_round_trip(array) - } - - #[test] - fn test_dict() -> Result<()> { - let data = vec![Some("a"), Some("a"), None, Some("b")]; - - let mut array = MutableDictionaryArray::<i32, MutableUtf8Array<i32>>::new(); - array.try_extend(data)?; - - let array: DictionaryArray<i32> = array.into(); - - test_round_trip(array) - } -} diff --git a/src/io/ipc/common.rs b/src/io/ipc/common.rs deleted file mode 100644 index 2820a968c9f..00000000000 --- a/src/io/ipc/common.rs +++ /dev/null @@ -1,87 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#[cfg(test)] -pub(crate) mod tests { - use crate::{ - datatypes::Schema, - io::json_integration::{to_record_batch, ArrowJson}, - record_batch::RecordBatch, - }; - use crate::{error::Result, io::ipc::read::read_stream_metadata}; - - use crate::io::ipc::read::StreamReader; - - use std::{collections::HashMap, convert::TryFrom, fs::File, io::Read}; - - use flate2::read::GzDecoder; - - /// Read gzipped JSON file - pub fn read_gzip_json(version: &str, file_name: &str) -> (Schema, Vec) { - let testdata = crate::util::test_util::arrow_test_data(); - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.json.gz", - testdata, version, file_name - )) - .unwrap(); - let mut gz = GzDecoder::new(&file); - let mut s = String::new(); - gz.read_to_string(&mut s).unwrap(); - // convert to Arrow JSON - let arrow_json: ArrowJson = serde_json::from_str(&s).unwrap(); - - let schema = serde_json::to_value(arrow_json.schema).unwrap(); - let schema = Schema::try_from(&schema).unwrap(); - - // read dictionaries - let mut dictionaries = HashMap::new(); - if let Some(dicts) = arrow_json.dictionaries { - for json_dict in dicts { - // TODO: convert to a concrete Arrow type - dictionaries.insert(json_dict.id, json_dict); - } - } - - let batches = arrow_json - .batches - .iter() - .map(|batch| to_record_batch(&schema, batch, &dictionaries)) - .collect::>>() - .unwrap(); - - (schema, batches) - } - - pub fn read_arrow_stream(version: &str, file_name: &str) -> (Schema, Vec) { - let testdata = crate::util::test_util::arrow_test_data(); - let mut file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, file_name - )) - .unwrap(); - - let metadata = read_stream_metadata(&mut file).unwrap(); - let reader = StreamReader::new(file, metadata); - - let schema = reader.schema(); - - ( - schema.as_ref().clone(), - reader.collect::>().unwrap(), - ) - } -} diff --git a/src/io/ipc/mod.rs b/src/io/ipc/mod.rs index e3df1cd6ee7..d7246f41745 100644 --- a/src/io/ipc/mod.rs +++ b/src/io/ipc/mod.rs @@ -25,7 +25,6 @@ #[allow(clippy::redundant_field_names)] pub mod gen; -pub(crate) mod common; mod compression; mod convert; diff --git a/src/io/ipc/read/reader.rs b/src/io/ipc/read/reader.rs index e4656c1caff..61c5f3ca6d9 100644 --- a/src/io/ipc/read/reader.rs +++ b/src/io/ipc/read/reader.rs @@ -300,165 +300,3 @@ impl<'a, R: Read + Seek> RecordBatchReader for FileReader<'a, R> { self.schema().as_ref() } } - -#[cfg(test)] -mod tests { - use std::fs::File; - - use crate::error::Result; - use crate::io::ipc::common::tests::read_gzip_json; - - use super::*; - - fn test_file(version: &str, file_name: &str) -> Result<()> { - let testdata = crate::util::test_util::arrow_test_data(); - let mut file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, file_name - ))?; - - let metadata = read_file_metadata(&mut file)?; - let reader = FileReader::new(&mut file, metadata, None); - - // read expected JSON output - let (schema, batches) = read_gzip_json(version, file_name); - - assert_eq!(&schema, reader.schema().as_ref()); - - batches.iter().zip(reader).try_for_each(|(lhs, rhs)| { - assert_eq!(lhs, &rhs?); - Result::Ok(()) - })?; - Ok(()) - } - - #[test] - fn read_generated_100_primitive() -> Result<()> { - test_file("1.0.0-littleendian", "generated_primitive")?; - test_file("1.0.0-bigendian", "generated_primitive") - } - - #[test] - fn read_generated_100_primitive_large_offsets() -> Result<()> { - test_file("1.0.0-littleendian", 
"generated_primitive_large_offsets")?; - test_file("1.0.0-bigendian", "generated_primitive_large_offsets") - } - - #[test] - fn read_generated_100_datetime() -> Result<()> { - test_file("1.0.0-littleendian", "generated_datetime")?; - test_file("1.0.0-bigendian", "generated_datetime") - } - - #[test] - fn read_generated_100_null_trivial() -> Result<()> { - test_file("1.0.0-littleendian", "generated_null_trivial")?; - test_file("1.0.0-bigendian", "generated_null_trivial") - } - - #[test] - fn read_generated_100_null() -> Result<()> { - test_file("1.0.0-littleendian", "generated_null")?; - test_file("1.0.0-bigendian", "generated_null") - } - - #[test] - fn read_generated_100_primitive_zerolength() -> Result<()> { - test_file("1.0.0-littleendian", "generated_primitive_zerolength")?; - test_file("1.0.0-bigendian", "generated_primitive_zerolength") - } - - #[test] - fn read_generated_100_primitive_primitive_no_batches() -> Result<()> { - test_file("1.0.0-littleendian", "generated_primitive_no_batches")?; - test_file("1.0.0-bigendian", "generated_primitive_no_batches") - } - - #[test] - fn read_generated_100_dictionary() -> Result<()> { - test_file("1.0.0-littleendian", "generated_dictionary")?; - test_file("1.0.0-bigendian", "generated_dictionary") - } - - #[test] - fn read_100_custom_metadata() -> Result<()> { - test_file("1.0.0-littleendian", "generated_custom_metadata")?; - test_file("1.0.0-bigendian", "generated_custom_metadata") - } - - #[test] - fn read_generated_100_nested_large_offsets() -> Result<()> { - test_file("1.0.0-littleendian", "generated_nested_large_offsets")?; - test_file("1.0.0-bigendian", "generated_nested_large_offsets") - } - - #[test] - fn read_generated_100_nested() -> Result<()> { - test_file("1.0.0-littleendian", "generated_nested")?; - test_file("1.0.0-bigendian", "generated_nested") - } - - #[test] - fn read_generated_100_dictionary_unsigned() -> Result<()> { - test_file("1.0.0-littleendian", "generated_dictionary_unsigned")?; - test_file("1.0.0-bigendian", "generated_dictionary_unsigned") - } - - #[test] - fn read_generated_100_decimal() -> Result<()> { - test_file("1.0.0-littleendian", "generated_decimal")?; - test_file("1.0.0-bigendian", "generated_decimal") - } - - #[test] - fn read_generated_100_interval() -> Result<()> { - test_file("1.0.0-littleendian", "generated_interval")?; - test_file("1.0.0-bigendian", "generated_interval") - } - - #[test] - fn read_generated_100_union() -> Result<()> { - test_file("1.0.0-littleendian", "generated_union") - } - - #[test] - fn read_generated_017_union() -> Result<()> { - test_file("0.17.1", "generated_union") - } - - #[test] - fn read_generated_200_compression_lz4() -> Result<()> { - test_file("2.0.0-compression", "generated_lz4") - } - - #[test] - fn read_generated_200_compression_zstd() -> Result<()> { - test_file("2.0.0-compression", "generated_zstd") - } - - fn test_projection(version: &str, file_name: &str, column: usize) -> Result<()> { - let testdata = crate::util::test_util::arrow_test_data(); - let mut file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, file_name - ))?; - - let metadata = read_file_metadata(&mut file)?; - let mut reader = FileReader::new(&mut file, metadata, Some(vec![column])); - - assert_eq!(reader.schema().fields().len(), 1); - - reader.try_for_each(|rhs| { - assert_eq!(rhs?.num_columns(), 1); - Result::Ok(()) - })?; - Ok(()) - } - - #[test] - fn read_projected() -> Result<()> { - test_projection("1.0.0-littleendian", "generated_primitive", 
1)?; - test_projection("1.0.0-littleendian", "generated_dictionary", 2)?; - test_projection("1.0.0-littleendian", "generated_nested", 1) - } -} diff --git a/src/io/ipc/read/stream.rs b/src/io/ipc/read/stream.rs index ddf432145dc..5018f7953be 100644 --- a/src/io/ipc/read/stream.rs +++ b/src/io/ipc/read/stream.rs @@ -239,101 +239,3 @@ impl RecordBatchReader for StreamReader { self.metadata.schema.as_ref() } } - -#[cfg(test)] -mod tests { - use super::*; - - use crate::io::ipc::common::tests::read_gzip_json; - - use std::fs::File; - - fn test_file(version: &str, file_name: &str) -> Result<()> { - let testdata = crate::util::test_util::arrow_test_data(); - let mut file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, file_name - ))?; - - let metadata = read_stream_metadata(&mut file)?; - let reader = StreamReader::new(file, metadata); - - // read expected JSON output - let (schema, batches) = read_gzip_json(version, file_name); - - assert_eq!(&schema, reader.schema().as_ref()); - - batches - .iter() - .zip(reader.map(|x| x.unwrap())) - .for_each(|(lhs, rhs)| { - assert_eq!(lhs, &rhs); - }); - Ok(()) - } - - #[test] - fn read_generated_100_primitive() -> Result<()> { - test_file("1.0.0-littleendian", "generated_primitive") - } - - #[test] - fn read_generated_100_datetime() -> Result<()> { - test_file("1.0.0-littleendian", "generated_datetime") - } - - #[test] - fn read_generated_100_null_trivial() -> Result<()> { - test_file("1.0.0-littleendian", "generated_null_trivial") - } - - #[test] - fn read_generated_100_null() -> Result<()> { - test_file("1.0.0-littleendian", "generated_null") - } - - #[test] - fn read_generated_100_primitive_zerolength() -> Result<()> { - test_file("1.0.0-littleendian", "generated_primitive_zerolength") - } - - #[test] - fn read_generated_100_primitive_primitive_no_batches() -> Result<()> { - test_file("1.0.0-littleendian", "generated_primitive_no_batches") - } - - #[test] - fn read_generated_100_dictionary() -> Result<()> { - test_file("1.0.0-littleendian", "generated_dictionary") - } - - #[test] - fn read_generated_100_nested() -> Result<()> { - test_file("1.0.0-littleendian", "generated_nested") - } - - #[test] - fn read_generated_100_interval() -> Result<()> { - test_file("1.0.0-littleendian", "generated_interval") - } - - #[test] - fn read_generated_100_decimal() -> Result<()> { - test_file("1.0.0-littleendian", "generated_decimal") - } - - #[test] - fn read_generated_200_compression_lz4() -> Result<()> { - test_file("2.0.0-compression", "generated_lz4") - } - - #[test] - fn read_generated_200_compression_zstd() -> Result<()> { - test_file("2.0.0-compression", "generated_zstd") - } - - #[test] - fn read_generated_017_union() -> Result<()> { - test_file("0.17.1", "generated_union") - } -} diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 0fa669452cf..3fc5296c235 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -542,12 +542,14 @@ fn write_bitmap( match bitmap { Some(bitmap) => { assert_eq!(bitmap.len(), length); - if bitmap.offset() != 0 { + let (slice, slice_offset, _) = bitmap.as_slice(); + if slice_offset != 0 { // case where we can't slice the bitmap as the offsets are not multiple of 8 let bytes = Bitmap::from_trusted_len_iter(bitmap.iter()); - write_bytes(bytes.as_slice(), buffers, arrow_data, offset) + let (slice, _, _) = bytes.as_slice(); + write_bytes(slice, buffers, arrow_data, offset) } else { - write_bytes(bitmap.as_slice(), buffers, arrow_data, 
offset) + write_bytes(slice, buffers, arrow_data, offset) } } None => { diff --git a/src/io/ipc/write/stream.rs b/src/io/ipc/write/stream.rs index e29d8fa9149..5a52341b092 100644 --- a/src/io/ipc/write/stream.rs +++ b/src/io/ipc/write/stream.rs @@ -108,131 +108,3 @@ impl Drop for StreamWriter { } } } - -#[cfg(test)] -mod tests { - use std::io::Cursor; - - use super::super::super::gen; - use super::*; - - use crate::io::ipc::read::StreamReader; - use crate::io::ipc::{ - common::tests::{read_arrow_stream, read_gzip_json}, - read::read_stream_metadata, - }; - - fn test_file(version: &str, file_name: &str) { - let (schema, batches) = read_arrow_stream(version, file_name); - - let mut result = Vec::::new(); - - // write IPC version 5 - { - let options = - IpcWriteOptions::try_new(8, false, gen::Schema::MetadataVersion::V5).unwrap(); - let mut writer = - StreamWriter::try_new_with_options(&mut result, &schema, options).unwrap(); - for batch in batches { - writer.write(&batch).unwrap(); - } - writer.finish().unwrap(); - } - - let mut reader = Cursor::new(result); - let metadata = read_stream_metadata(&mut reader).unwrap(); - let reader = StreamReader::new(reader, metadata); - - let schema = reader.schema().clone(); - - // read expected JSON output - let (expected_schema, expected_batches) = read_gzip_json(version, file_name); - - assert_eq!(schema.as_ref(), &expected_schema); - - let batches = reader.collect::>>().unwrap(); - - assert_eq!(batches, expected_batches); - } - - #[test] - fn write_100_primitive() { - test_file("1.0.0-littleendian", "generated_primitive"); - } - - #[test] - fn write_100_datetime() { - test_file("1.0.0-littleendian", "generated_datetime"); - } - - #[test] - fn write_100_dictionary_unsigned() { - test_file("1.0.0-littleendian", "generated_dictionary_unsigned"); - } - - #[test] - fn write_100_dictionary() { - test_file("1.0.0-littleendian", "generated_dictionary"); - } - - #[test] - fn write_100_interval() { - test_file("1.0.0-littleendian", "generated_interval"); - } - - #[test] - fn write_100_large_batch() { - // this takes too long for unit-tests. It has been passing... 
- //test_file("1.0.0-littleendian", "generated_large_batch"); - } - - #[test] - fn write_100_nested() { - test_file("1.0.0-littleendian", "generated_nested"); - } - - #[test] - fn write_100_nested_large_offsets() { - test_file("1.0.0-littleendian", "generated_nested_large_offsets"); - } - - #[test] - fn write_100_null_trivial() { - test_file("1.0.0-littleendian", "generated_null_trivial"); - } - - #[test] - fn write_100_null() { - test_file("1.0.0-littleendian", "generated_null"); - } - - #[test] - fn write_100_primitive_large_offsets() { - test_file("1.0.0-littleendian", "generated_primitive_large_offsets"); - } - - //#[test] - //fn write_100_recursive_nested() { - //test_file("1.0.0-littleendian", "generated_recursive_nested"); - //} - - #[test] - fn write_100_primitive_no_batches() { - test_file("1.0.0-littleendian", "generated_primitive_no_batches"); - } - - #[test] - fn write_100_primitive_zerolength() { - test_file("1.0.0-littleendian", "generated_primitive_zerolength"); - } - - #[test] - fn write_100_custom_metadata() { - test_file("1.0.0-littleendian", "generated_custom_metadata"); - } - - #[test] - fn write_100_decimal() { - test_file("1.0.0-littleendian", "generated_decimal"); - } -} diff --git a/src/io/ipc/write/writer.rs b/src/io/ipc/write/writer.rs index e642648914d..df477757e02 100644 --- a/src/io/ipc/write/writer.rs +++ b/src/io/ipc/write/writer.rs @@ -162,209 +162,3 @@ impl<'a, W: Write> Drop for FileWriter<'a, W> { } } } - -#[cfg(test)] -mod tests { - use std::io::Cursor; - - use super::*; - - use crate::error::Result; - use crate::io::ipc::{ - common::tests::read_gzip_json, - read::{read_file_metadata, FileReader}, - }; - - fn test_round_trip(batch: RecordBatch) -> Result<()> { - let mut result = Vec::::new(); - - // write IPC version 5 - { - let options = IpcWriteOptions::try_new(8, false, gen::Schema::MetadataVersion::V5)?; - let mut writer = - FileWriter::try_new_with_options(&mut result, batch.schema(), options)?; - writer.write(&batch)?; - writer.finish()?; - } - let mut reader = Cursor::new(result); - let metadata = read_file_metadata(&mut reader)?; - let schema = metadata.schema().clone(); - - let reader = FileReader::new(&mut reader, metadata, None); - - // read expected JSON output - let (expected_schema, expected_batches) = (batch.schema().clone(), vec![batch]); - - assert_eq!(schema.as_ref(), expected_schema.as_ref()); - - let batches = reader.collect::>>()?; - - assert_eq!(batches, expected_batches); - Ok(()) - } - - fn test_file(version: &str, file_name: &str) -> Result<()> { - let (schema, batches) = read_gzip_json(version, file_name); - - let mut result = Vec::::new(); - - // write IPC version 5 - { - let options = IpcWriteOptions::try_new(8, false, gen::Schema::MetadataVersion::V5)?; - let mut writer = FileWriter::try_new_with_options(&mut result, &schema, options)?; - for batch in batches { - writer.write(&batch)?; - } - writer.finish()?; - } - let mut reader = Cursor::new(result); - let metadata = read_file_metadata(&mut reader)?; - let schema = metadata.schema().clone(); - - let reader = FileReader::new(&mut reader, metadata, None); - - // read expected JSON output - let (expected_schema, expected_batches) = read_gzip_json(version, file_name); - - assert_eq!(schema.as_ref(), &expected_schema); - - let batches = reader.collect::>>()?; - - assert_eq!(batches, expected_batches); - Ok(()) - } - - #[test] - fn write_100_primitive() -> Result<()> { - test_file("1.0.0-littleendian", "generated_primitive")?; - test_file("1.0.0-bigendian", "generated_primitive") 
-
-    #[test]
-    fn write_100_primitive() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_primitive")?;
-        test_file("1.0.0-bigendian", "generated_primitive")
-    }
-
-    #[test]
-    fn write_100_datetime() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_datetime")?;
-        test_file("1.0.0-bigendian", "generated_datetime")
-    }
-
-    #[test]
-    fn write_100_dictionary_unsigned() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_dictionary_unsigned")?;
-        test_file("1.0.0-bigendian", "generated_dictionary_unsigned")
-    }
-
-    #[test]
-    fn write_100_dictionary() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_dictionary")?;
-        test_file("1.0.0-bigendian", "generated_dictionary")
-    }
-
-    #[test]
-    fn write_100_interval() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_interval")?;
-        test_file("1.0.0-bigendian", "generated_interval")
-    }
-
-    #[test]
-    fn write_100_large_batch() -> Result<()> {
-        // this takes too long for unit-tests. It has been passing...
-        //test_file("1.0.0-littleendian", "generated_large_batch");
-        Ok(())
-    }
-
-    #[test]
-    fn write_100_nested() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_nested")?;
-        test_file("1.0.0-bigendian", "generated_nested")
-    }
-
-    #[test]
-    fn write_100_nested_large_offsets() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_nested_large_offsets")?;
-        test_file("1.0.0-bigendian", "generated_nested_large_offsets")
-    }
-
-    #[test]
-    fn write_100_null_trivial() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_null_trivial")?;
-        test_file("1.0.0-bigendian", "generated_null_trivial")
-    }
-
-    #[test]
-    fn write_100_null() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_null")?;
-        test_file("1.0.0-bigendian", "generated_null")
-    }
-
-    #[test]
-    fn write_100_primitive_large_offsets() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_primitive_large_offsets")?;
-        test_file("1.0.0-bigendian", "generated_primitive_large_offsets")
-    }
-
-    #[test]
-    fn write_100_primitive_no_batches() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_primitive_no_batches")?;
-        test_file("1.0.0-bigendian", "generated_primitive_no_batches")
-    }
-
-    #[test]
-    fn write_100_primitive_zerolength() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_primitive_zerolength")?;
-        test_file("1.0.0-bigendian", "generated_primitive_zerolength")
-    }
-
-    #[test]
-    fn write_0141_primitive_zerolength() -> Result<()> {
-        test_file("0.14.1", "generated_primitive_zerolength")
-    }
-
-    #[test]
-    fn write_100_custom_metadata() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_custom_metadata")?;
-        test_file("1.0.0-bigendian", "generated_custom_metadata")
-    }
-
-    #[test]
-    fn write_100_decimal() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_decimal")?;
-        test_file("1.0.0-bigendian", "generated_decimal")
-    }
-
-    #[test]
-    fn write_100_union() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_union")?;
-        test_file("1.0.0-bigendian", "generated_union")
-    }
-
-    #[test]
-    fn write_generated_017_union() -> Result<()> {
-        test_file("0.17.1", "generated_union")
-    }
-
-    #[test]
-    fn write_sliced_utf8() -> Result<()> {
-        use crate::array::{Array, Utf8Array};
-        use std::sync::Arc;
-        let array =
-            Arc::new(Utf8Array::<i32>::from_slice(["aa", "bb"]).slice(1, 1)) as Arc<dyn Array>;
-        let batch = RecordBatch::try_from_iter(vec![("a", array)]).unwrap();
-        test_round_trip(batch)
-    }
-
-    #[test]
-    fn write_sliced_list() -> Result<()> {
-        use crate::array::{MutableListArray, MutablePrimitiveArray, TryExtend};
-
-        let data = vec![
-            Some(vec![Some(1i32), Some(2), Some(3)]),
-            None,
-            Some(vec![Some(4), None, Some(6)]),
-        ];
-
-        let mut array
= MutableListArray::>::new(); - array.try_extend(data).unwrap(); - let array = array.into_arc().slice(1, 2).into(); - let batch = RecordBatch::try_from_iter(vec![("a", array)]).unwrap(); - test_round_trip(batch) - } -} diff --git a/src/io/json/read/reader.rs b/src/io/json/read/reader.rs index c78fb92e2e6..0fe8fed271e 100644 --- a/src/io/json/read/reader.rs +++ b/src/io/json/read/reader.rs @@ -283,664 +283,3 @@ impl ReaderBuilder { )) } } - -#[cfg(test)] -mod tests { - use flate2::read::GzDecoder; - use std::{io::Cursor, sync::Arc}; - - use crate::{bitmap::Bitmap, buffer::Buffer, error::Result, io::json::read::infer_json_schema}; - - use super::*; - use std::{fs::File, io::SeekFrom}; - - #[test] - fn test_json_basic() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(1, b.0); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(2, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(3, d.0); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!((2.0 - bb.value(0)).abs() < f64::EPSILON); - assert!((-3.5 - bb.value(1)).abs() < f64::EPSILON); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!cc.value(0)); - assert!(cc.value(10)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!("4", dd.value(0)); - assert_eq!("text", dd.value(8)); - } - - #[test] - fn test_json_basic_with_nulls() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(bb.is_valid(0)); - assert!(!bb.is_valid(2)); - assert!(!bb.is_valid(11)); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(cc.is_valid(0)); - assert!(!cc.is_valid(4)); - 
assert!(!cc.is_valid(11)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(!dd.is_valid(0)); - assert!(dd.is_valid(1)); - assert!(!dd.is_valid(4)); - assert!(!dd.is_valid(11)); - } - - #[test] - fn test_json_basic_schema() { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - Field::new("d", DataType::Utf8, false), - ])); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - schema.clone(), - 1024, - None, - ); - let reader_schema = reader.schema(); - assert_eq!(reader_schema, &schema); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int32, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float32, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - // test that a 64bit value is returned as null due to overflowing - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!((2.0 - bb.value(0)).abs() < f32::EPSILON); - assert!((-3.5 - bb.value(1)).abs() < f32::EPSILON); - } - - #[test] - fn test_json_basic_schema_projection() { - // We test implicit and explicit projection: - // Implicit: omitting fields from a schema - // Explicit: supplying a vec of fields to take - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - ])); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - schema, - 1024, - Some(vec!["a".to_string(), "c".to_string()]), - ); - let reader_schema = reader.schema().clone(); - let expected_schema = Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("c", DataType::Boolean, false), - ]); - assert_eq!(reader_schema.as_ref(), &expected_schema); - - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(2, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let batch_schema = batch.schema(); - assert_eq!(&reader_schema, batch_schema); - - let a = batch_schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int32, a.1.data_type()); - let c = batch_schema.column_with_name("c").unwrap(); - assert_eq!(1, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); - } - - #[test] - fn test_json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = 
schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); - assert_eq!(9, bb.len()); - assert!((2.0 - bb.value(0)).abs() < f64::EPSILON); - assert!((-6.1 - bb.value(5)).abs() < f64::EPSILON); - assert!(!bb.is_valid(7)); - - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let cc = cc.values(); - let cc = cc.as_any().downcast_ref::().unwrap(); - assert_eq!(6, cc.len()); - assert!(!cc.value(0)); - assert!(!cc.value(4)); - assert!(!cc.is_valid(5)); - } - - #[test] - fn test_invalid_json_infer_schema() { - let re = infer_json_schema_from_seekable( - &mut BufReader::new(File::open("test/data/uk_cities_with_headers.csv").unwrap()), - None, - ); - assert_eq!( - re.err().unwrap().to_string(), - "External error: expected value at line 1 column 1", - ); - } - - #[test] - fn test_invalid_json_read_record() { - let schema = Arc::new(Schema::new(vec![Field::new( - "a", - DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/uk_cities_with_headers.csv").unwrap()) - .unwrap(); - assert_eq!( - reader.next().err().unwrap().to_string(), - "External error: expected value at line 1 column 1", - ); - } - - #[test] - fn test_mixed_json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/mixed_arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let schema = Arc::new(infer_json_schema(&mut reader, None).unwrap()); - file.seek(SeekFrom::Start(0)).unwrap(); - - let reader = BufReader::new(GzDecoder::new(&file)); - let mut reader = Reader::from_buf_reader(reader, schema, 64, None); - let batch_gz = reader.next().unwrap().unwrap(); - - for batch in vec![batch, batch_gz] { - assert_eq!(4, batch.num_columns()); - assert_eq!(4, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - d.1.data_type() - ); - - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); - assert_eq!(9, bb.len()); - assert!((-6.1 - bb.value(8)).abs() < f64::EPSILON); - - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let cc = cc.values(); - let cc 
= cc.as_any().downcast_ref::().unwrap(); - let cc_expected = BooleanArray::from(vec![Some(false), Some(true), Some(false), None]); - assert_eq!(cc, &cc_expected); - - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let dd = dd.values(); - let dd = dd.as_any().downcast_ref::>().unwrap(); - assert_eq!( - dd, - &Utf8Array::::from_slice(&["1", "false", "array", "2.4"]) - ); - } - } - - #[test] - fn test_nested_struct_json_arrays() { - let d_field = Field::new("d", DataType::Utf8, true); - let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); - let a_field = Field::new( - "a", - DataType::Struct(vec![ - Field::new("b", DataType::Boolean, true), - c_field.clone(), - ]), - true, - ); - let schema = Arc::new(Schema::new(vec![a_field])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/nested_structs.json").unwrap()) - .unwrap(); - - // build expected output - let d = Utf8Array::::from(&vec![Some("text"), None, Some("text"), None]); - let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); - - let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); - let expected = StructArray::from_data( - vec![Field::new("b", DataType::Boolean, true), c_field], - vec![Arc::new(b), Arc::new(c)], - None, - ); - - // compare `a` with result from json reader - let batch = reader.next().unwrap().unwrap(); - let read = batch.column(0); - assert_eq!(expected, read.as_ref()); - } - - #[test] - fn test_nested_list_json_arrays() { - let d_field = Field::new("d", DataType::Utf8, true); - let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); - let b_field = Field::new("b", DataType::Boolean, true); - let a_struct_field = Field::new( - "a", - DataType::Struct(vec![b_field.clone(), c_field.clone()]), - true, - ); - let a_list_data_type = DataType::List(Box::new(a_struct_field)); - let a_field = Field::new("a", a_list_data_type.clone(), true); - let schema = Arc::new(Schema::new(vec![a_field])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let json_content = r#" - {"a": [{"b": true, "c": {"d": "a_text"}}, {"b": false, "c": {"d": "b_text"}}]} - {"a": [{"b": false, "c": null}]} - {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]} - {"a": null} - {"a": []} - "#; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); - - // build expected output - let d = Utf8Array::::from(&vec![ - Some("a_text"), - Some("b_text"), - None, - Some("c_text"), - Some("d_text"), - None, - ]); - - let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); - - let b = BooleanArray::from(vec![ - Some(true), - Some(false), - Some(false), - Some(true), - None, - Some(true), - ]); - let a_struct = StructArray::from_data( - vec![b_field, c_field], - vec![Arc::new(b) as Arc, Arc::new(c) as Arc], - None, - ); - let expected = ListArray::from_data( - a_list_data_type, - Buffer::from([0i32, 2, 3, 6, 6, 6]), - Arc::new(a_struct) as Arc, - Some(Bitmap::from_u8_slice([0b00010111], 5)), - ); - - // compare `a` with result from json reader - let batch = reader.next().unwrap().unwrap(); - let read = batch.column(0); - assert_eq!(expected, read.as_ref()); - } - - #[test] - fn test_dictionary_from_json_basic_with_nulls() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new( - "d", - DataType::Dictionary(Box::new(DataType::Int16), 
Box::new(DataType::Utf8)), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - let data_type = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); - assert_eq!(&data_type, d.1.data_type()); - - let result = batch.column(d.0); - - let values = vec![ - None, - Some("4"), - Some("text"), - Some("4"), - None, - None, - Some("4"), - None, - Some("text"), - Some("4"), - Some("4"), - None, - ]; - - let mut expected = MutableDictionaryArray::>::new(); - expected.try_extend(values)?; - let expected: DictionaryArray = expected.into(); - - assert_eq!(expected, result.as_ref()); - Ok(()) - } - - #[test] - fn test_skip_empty_lines() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " - {\"a\": 1} - - {\"a\": 2} - - {\"a\": 3}"; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = reader.schema(); - let c = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, c.1.data_type()); - } - - #[test] - fn test_row_type_validation() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " - [1, \"hello\"] - \"world\""; - let re = builder.build(Cursor::new(json_content)); - assert_eq!( - re.err().unwrap().to_string(), - r#"Expected JSON record to be an object, found Array([Number(1), String("hello")])"#, - ); - } - - #[test] - fn test_list_of_string_dictionary_from_json_with_nulls() -> Result<()> { - let data_type = DataType::List(Box::new(Field::new( - "item", - DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true, - ))); - - let schema = Arc::new(Schema::new(vec![Field::new( - "events", - data_type.clone(), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/list_string_dict_nested_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let events = schema.column_with_name("events").unwrap(); - assert_eq!(&data_type, events.1.data_type()); - - let expected = vec![ - Some(vec![None, Some("Elect Leader"), Some("Do Ballot")]), - Some(vec![ - Some("Do Ballot"), - None, - Some("Send Data"), - Some("Elect Leader"), - ]), - Some(vec![Some("Send Data")]), - ]; - - type A = MutableDictionaryArray>; - - let mut array = MutableListArray::::new(); - array.try_extend(expected)?; - - let expected: ListArray = array.into(); - - assert_eq!(expected, batch.column(0).as_ref()); - Ok(()) - } - - #[test] - fn test_with_multiple_batches() { - let builder = ReaderBuilder::new() - .infer_schema(Some(4)) - .with_batch_size(5); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - - let mut num_records = Vec::new(); - 
while let Some(rb) = reader.next().unwrap() { - num_records.push(rb.num_rows()); - } - - assert_eq!(vec![5, 5, 2], num_records); - } - - #[test] - fn test_json_infer_schema() { - let schema = Schema::new(vec![ - Field::new("a", DataType::Int64, true), - Field::new( - "b", - DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - true, - ), - Field::new( - "c", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - true, - ), - Field::new( - "d", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - true, - ), - ]); - - let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); - let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); - - assert_eq!(inferred_schema, schema); - - let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); - - assert_eq!(inferred_schema, schema); - } -} diff --git a/src/io/json/write/mod.rs b/src/io/json/write/mod.rs index 3f97ae18143..7915dad3416 100644 --- a/src/io/json/write/mod.rs +++ b/src/io/json/write/mod.rs @@ -19,246 +19,3 @@ mod serialize; mod writer; pub use serialize::write_record_batches; pub use writer::*; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use crate::array::*; - use crate::bitmap::Bitmap; - use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field, Schema}; - use crate::record_batch::RecordBatch; - - use super::*; - - #[test] - fn write_simple_rows() { - let schema = Schema::new(vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Utf8, false), - ]); - - let a = Int32Array::from([Some(1), Some(2), Some(3), None, Some(5)]); - let b = Utf8Array::::from(&vec![Some("a"), Some("b"), Some("c"), Some("d"), None]); - - let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); - - let mut buf = Vec::new(); - { - let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); - } - - assert_eq!( - String::from_utf8(buf).unwrap(), - r#"{"c1":1,"c2":"a"} -{"c1":2,"c2":"b"} -{"c1":3,"c2":"c"} -{"c1":null,"c2":"d"} -{"c1":5,"c2":null} -"# - ); - } - - #[test] - fn write_nested_structs() { - let c121 = Field::new("c121", DataType::Utf8, false); - let fields = vec![ - Field::new("c11", DataType::Int32, false), - Field::new("c12", DataType::Struct(vec![c121.clone()]), false), - ]; - let schema = Schema::new(vec![ - Field::new("c1", DataType::Struct(fields.clone()), false), - Field::new("c2", DataType::Utf8, false), - ]); - - let c1 = StructArray::from_data( - fields, - vec![ - Arc::new(Int32Array::from(&[Some(1), None, Some(5)])), - Arc::new(StructArray::from_data( - vec![c121], - vec![Arc::new(Utf8Array::::from(&vec![ - Some("e"), - Some("f"), - Some("g"), - ]))], - None, - )), - ], - None, - ); - - let c2 = Utf8Array::::from(&vec![Some("a"), Some("b"), Some("c")]); - - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); - - let mut buf = Vec::new(); - { - let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); - } - - assert_eq!( - String::from_utf8(buf).unwrap(), - r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"} -{"c1":{"c11":null,"c12":{"c121":"f"}},"c2":"b"} -{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"} -"# - ); - } - - #[test] - fn write_struct_with_list_field() { - let list_datatype = 
DataType::List(Box::new(Field::new("c_list", DataType::Utf8, false))); - let field_c1 = Field::new("c1", list_datatype, false); - let field_c2 = Field::new("c2", DataType::Int32, false); - let schema = Schema::new(vec![field_c1, field_c2]); - - let iter = vec![vec!["a", "a1"], vec!["b"], vec!["c"], vec!["d"], vec!["e"]]; - - let iter = iter - .into_iter() - .map(|x| x.into_iter().map(Some).collect::>()) - .map(Some); - let mut a = MutableListArray::>::new_with_field( - MutableUtf8Array::::new(), - "c_list", - false, - ); - a.try_extend(iter).unwrap(); - let a: ListArray = a.into(); - - let b = PrimitiveArray::from_slice(&vec![1, 2, 3, 4, 5]).to(DataType::Int32); - - let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); - - let mut buf = Vec::new(); - { - let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); - } - - assert_eq!( - String::from_utf8(buf).unwrap(), - r#"{"c1":["a","a1"],"c2":1} -{"c1":["b"],"c2":2} -{"c1":["c"],"c2":3} -{"c1":["d"],"c2":4} -{"c1":["e"],"c2":5} -"# - ); - } - - #[test] - fn write_nested_list() { - let list_inner = DataType::List(Box::new(Field::new("b", DataType::Int32, false))); - let list_datatype = DataType::List(Box::new(Field::new("a", list_inner, false))); - let field_c1 = Field::new("c1", list_datatype, true); - let field_c2 = Field::new("c2", DataType::Utf8, true); - let schema = Schema::new(vec![field_c1, field_c2]); - - let iter = vec![ - vec![Some(vec![Some(1), Some(2)]), Some(vec![Some(3)])], - vec![], - vec![Some(vec![Some(4), Some(5), Some(6)])], - ]; - - let iter = iter.into_iter().map(Some); - - let inner = MutableListArray::>::new_with_field( - MutablePrimitiveArray::::new(), - "b", - false, - ); - let mut c1 = - MutableListArray::>>::new_with_field(inner, "a", false); - c1.try_extend(iter).unwrap(); - let c1: ListArray = c1.into(); - - let c2 = Utf8Array::::from(&vec![Some("foo"), Some("bar"), None]); - - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); - - let mut buf = Vec::new(); - { - let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); - } - - assert_eq!( - String::from_utf8(buf).unwrap(), - r#"{"c1":[[1,2],[3]],"c2":"foo"} -{"c1":[],"c2":"bar"} -{"c1":[[4,5,6]],"c2":null} -"# - ); - } - - #[test] - fn write_list_of_struct() { - let inner = vec![Field::new("c121", DataType::Utf8, false)]; - let fields = vec![ - Field::new("c11", DataType::Int32, false), - Field::new("c12", DataType::Struct(inner.clone()), false), - ]; - let c1_datatype = DataType::List(Box::new(Field::new( - "s", - DataType::Struct(fields.clone()), - false, - ))); - let field_c1 = Field::new("c1", c1_datatype.clone(), true); - let field_c2 = Field::new("c2", DataType::Int32, false); - let schema = Schema::new(vec![field_c1, field_c2]); - - let s = StructArray::from_data( - fields, - vec![ - Arc::new(Int32Array::from(&[Some(1), None, Some(5)])), - Arc::new(StructArray::from_data( - inner, - vec![Arc::new(Utf8Array::::from(&vec![ - Some("e"), - Some("f"), - Some("g"), - ]))], - None, - )), - ], - None, - ); - - // list column rows (c1): - // [{"c11": 1, "c12": {"c121": "e"}}, {"c12": {"c121": "f"}}], - // null, - // [{"c11": 5, "c12": {"c121": "g"}}] - let c1 = ListArray::::from_data( - c1_datatype, - Buffer::from(&[0, 2, 2, 3]), - Arc::new(s), - Some(Bitmap::from_u8_slice([0b00000101], 3)), - ); - - let c2 = Int32Array::from_slice(&[1, 2, 3]); - - let batch = - RecordBatch::try_new(Arc::new(schema), 
vec![Arc::new(c1), Arc::new(c2)]).unwrap(); - - let mut buf = Vec::new(); - { - let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); - } - - assert_eq!( - String::from_utf8(buf).unwrap(), - r#"{"c1":[{"c11":1,"c12":{"c121":"e"}},{"c11":null,"c12":{"c121":"f"}}],"c2":1} -{"c1":null,"c2":2} -{"c1":[{"c11":5,"c12":{"c121":"g"}}],"c2":3} -"# - ); - } -} diff --git a/src/io/parquet/mod.rs b/src/io/parquet/mod.rs index 0ac846cabb5..404b4e38086 100644 --- a/src/io/parquet/mod.rs +++ b/src/io/parquet/mod.rs @@ -10,507 +10,3 @@ impl From for ArrowError { ArrowError::External("".to_string(), Box::new(error)) } } - -#[cfg(test)] -mod tests { - use crate::array::*; - use crate::bitmap::Bitmap; - use crate::buffer::Buffer; - use crate::datatypes::*; - - use crate::error::Result; - use crate::io::parquet::read; - use crate::io::parquet::read::statistics::*; - use std::io::{Read, Seek}; - use std::sync::Arc; - - type ArrayStats = ( - Arc, - Option>, - ); - - pub fn read_column( - mut reader: R, - row_group: usize, - column: usize, - ) -> Result { - let metadata = read::read_metadata(&mut reader)?; - - let mut reader = read::RecordReader::try_new( - reader, - Some(vec![column]), - None, - Arc::new(|_, _| true), - None, - )?; - - let statistics = metadata.row_groups[row_group] - .column(column) - .statistics() - .map(|x| read::statistics::deserialize_statistics(x?.as_ref())) - .transpose()?; - - Ok((reader.next().unwrap()?.columns()[0].clone(), statistics)) - } - - pub fn pyarrow_nested_nullable(column: usize) -> Box { - let offsets = Buffer::::from([0, 2, 2, 5, 8, 8, 11, 11, 12]); - - let values = match column { - 0 => { - // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] - Arc::new(PrimitiveArray::::from(&[ - Some(0), - Some(1), - Some(2), - None, - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - ])) as Arc - } - 1 | 2 => { - // [[0, 1], None, [2, 0, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] - Arc::new(PrimitiveArray::::from(&[ - Some(0), - Some(1), - Some(2), - Some(0), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - ])) as Arc - } - 3 => Arc::new(PrimitiveArray::::from(&[ - Some(0), - Some(1), - Some(2), - None, - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - ])) as Arc, - 4 => Arc::new(BooleanArray::from(&[ - Some(false), - Some(true), - Some(true), - None, - Some(false), - Some(true), - Some(false), - Some(true), - Some(false), - Some(false), - Some(false), - Some(true), - ])) as Arc, - /* - string = [ - ["Hello", "bbb"], - None, - ["aa", None, ""], - ["bbb", "aa", "ccc"], - [], - ["abc", "bbb", "bbb"], - None, - [""], - ] - */ - 5 => Arc::new(Utf8Array::::from(&[ - Some("Hello".to_string()), - Some("bbb".to_string()), - Some("aa".to_string()), - None, - Some("".to_string()), - Some("bbb".to_string()), - Some("aa".to_string()), - Some("ccc".to_string()), - Some("abc".to_string()), - Some("bbb".to_string()), - Some("bbb".to_string()), - Some("".to_string()), - ])), - 6 => Arc::new(BinaryArray::::from(&[ - Some(b"Hello".to_vec()), - Some(b"bbb".to_vec()), - Some(b"aa".to_vec()), - None, - Some(b"".to_vec()), - Some(b"bbb".to_vec()), - Some(b"aa".to_vec()), - Some(b"ccc".to_vec()), - Some(b"abc".to_vec()), - Some(b"bbb".to_vec()), - Some(b"bbb".to_vec()), - Some(b"".to_vec()), - ])), - _ => unreachable!(), - }; - - match column { - 0 | 1 | 3 | 4 | 5 | 6 => { - let field = match column { - 0 => Field::new("item", 
DataType::Int64, true), - 1 => Field::new("item", DataType::Int64, false), - 3 => Field::new("item", DataType::Int16, true), - 4 => Field::new("item", DataType::Boolean, true), - 5 => Field::new("item", DataType::Utf8, true), - 6 => Field::new("item", DataType::LargeBinary, true), - _ => unreachable!(), - }; - - let validity = Some(Bitmap::from([ - true, false, true, true, true, true, false, true, - ])); - let data_type = DataType::List(Box::new(field)); - Box::new(ListArray::::from_data( - data_type, offsets, values, validity, - )) - } - 2 => { - // [[0, 1], [], [2, None, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] - let data_type = - DataType::List(Box::new(Field::new("item", DataType::Int64, false))); - Box::new(ListArray::::from_data( - data_type, offsets, values, None, - )) - } - _ => unreachable!(), - } - } - - pub fn pyarrow_nullable(column: usize) -> Box { - let i64_values = &[ - Some(0), - Some(1), - None, - Some(3), - None, - Some(5), - Some(6), - Some(7), - None, - Some(9), - ]; - - match column { - 0 => Box::new(PrimitiveArray::::from(i64_values)), - 1 => Box::new(PrimitiveArray::::from(&[ - Some(0.0), - Some(1.0), - None, - Some(3.0), - None, - Some(5.0), - Some(6.0), - Some(7.0), - None, - Some(9.0), - ])), - 2 => Box::new(Utf8Array::::from(&[ - Some("Hello".to_string()), - None, - Some("aa".to_string()), - Some("".to_string()), - None, - Some("abc".to_string()), - None, - None, - Some("def".to_string()), - Some("aaa".to_string()), - ])), - 3 => Box::new(BooleanArray::from(&[ - Some(true), - None, - Some(false), - Some(false), - None, - Some(true), - None, - None, - Some(true), - Some(true), - ])), - 4 => Box::new( - PrimitiveArray::::from(i64_values) - .to(DataType::Timestamp(TimeUnit::Millisecond, None)), - ), - 5 => { - let values = i64_values - .iter() - .map(|x| x.map(|x| x as u32)) - .collect::>(); - Box::new(PrimitiveArray::::from(values)) - } - 6 => { - let keys = PrimitiveArray::::from([Some(0), Some(1), None, Some(1)]); - let values = Arc::new(PrimitiveArray::::from_slice([10, 200])); - Box::new(DictionaryArray::::from_data(keys, values)) - } - _ => unreachable!(), - } - } - - pub fn pyarrow_nullable_statistics(column: usize) -> Option> { - Some(match column { - 0 => Box::new(PrimitiveStatistics:: { - data_type: DataType::Int64, - distinct_count: None, - null_count: Some(3), - min_value: Some(0), - max_value: Some(9), - }), - 1 => Box::new(PrimitiveStatistics:: { - data_type: DataType::Float64, - distinct_count: None, - null_count: Some(3), - min_value: Some(0.0), - max_value: Some(9.0), - }), - 2 => Box::new(Utf8Statistics { - null_count: Some(4), - distinct_count: None, - min_value: Some("".to_string()), - max_value: Some("def".to_string()), - }), - 3 => Box::new(BooleanStatistics { - null_count: Some(4), - distinct_count: None, - - min_value: Some(false), - max_value: Some(true), - }), - 4 => Box::new(PrimitiveStatistics:: { - data_type: DataType::Timestamp(TimeUnit::Millisecond, None), - distinct_count: None, - null_count: Some(3), - min_value: Some(0), - max_value: Some(9), - }), - 5 => Box::new(PrimitiveStatistics:: { - data_type: DataType::UInt32, - null_count: Some(3), - distinct_count: None, - - min_value: Some(0), - max_value: Some(9), - }), - 6 => return None, - _ => unreachable!(), - }) - } - - // these values match the values in `integration` - pub fn pyarrow_required(column: usize) -> Box { - let i64_values = &[ - Some(0), - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - ]; - - match column { - 0 => 
Box::new(PrimitiveArray::<i64>::from(i64_values).to(DataType::Int64)),
-            3 => Box::new(BooleanArray::from_slice(&[
-                true, true, false, false, false, true, true, true, true, true,
-            ])),
-            2 => Box::new(Utf8Array::<i32>::from_slice(&[
-                "Hello", "bbb", "aa", "", "bbb", "abc", "bbb", "bbb", "def", "aaa",
-            ])),
-            _ => unreachable!(),
-        }
-    }
-
-    pub fn pyarrow_required_statistics(column: usize) -> Option<Box<dyn Statistics>> {
-        Some(match column {
-            0 => Box::new(PrimitiveStatistics::<i64> {
-                data_type: DataType::Int64,
-                null_count: Some(0),
-                distinct_count: None,
-                min_value: Some(0),
-                max_value: Some(9),
-            }),
-            3 => Box::new(BooleanStatistics {
-                null_count: Some(0),
-                distinct_count: None,
-                min_value: Some(false),
-                max_value: Some(true),
-            }),
-            2 => Box::new(Utf8Statistics {
-                null_count: Some(0),
-                distinct_count: None,
-                min_value: Some("".to_string()),
-                max_value: Some("def".to_string()),
-            }),
-            _ => unreachable!(),
-        })
-    }
-
-    pub fn pyarrow_nested_nullable_statistics(column: usize) -> Option<Box<dyn Statistics>> {
-        Some(match column {
-            3 => Box::new(PrimitiveStatistics::<i16> {
-                data_type: DataType::Int16,
-                distinct_count: None,
-                null_count: Some(1),
-                min_value: Some(0),
-                max_value: Some(10),
-            }),
-            4 => Box::new(BooleanStatistics {
-                distinct_count: None,
-                null_count: Some(1),
-                min_value: Some(false),
-                max_value: Some(true),
-            }),
-            5 => Box::new(Utf8Statistics {
-                distinct_count: None,
-                null_count: Some(1),
-                min_value: Some("".to_string()),
-                max_value: Some("def".to_string()),
-            }),
-            6 => Box::new(BinaryStatistics {
-                distinct_count: None,
-                null_count: Some(1),
-                min_value: Some(b"".to_vec()),
-                max_value: Some(b"def".to_vec()),
-            }),
-            _ => Box::new(PrimitiveStatistics::<i64> {
-                data_type: DataType::Int64,
-                distinct_count: None,
-                null_count: Some(3),
-                min_value: Some(0),
-                max_value: Some(9),
-            }),
-        })
-    }
-}
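// A minimal sketch (an aside, not part of this diff): how the fixtures above
// pair with `read_column`. The path follows the pattern used by
// `test_pyarrow_integration` further below; the function name is hypothetical.
fn check_basic_nullable_int64() -> Result<()> {
    let mut file = File::open("fixtures/pyarrow3/v1/basic_nullable_10.parquet")?;
    // (reader, row group 0, column 0)
    let (array, statistics) = read_column(&mut file, 0, 0)?;
    // column 0 of the "basic, nullable" fixture is the Int64 column
    assert_eq!(pyarrow_nullable(0).as_ref(), array.as_ref());
    assert_eq!(pyarrow_nullable_statistics(0), statistics);
    Ok(())
}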
-
-/// Round-trip with parquet using the same integration files used for IPC integration tests.
-#[cfg(test)]
-mod tests_integration {
-    use std::sync::Arc;
-
-    use super::write::Compression;
-    use crate::array::{Array, PrimitiveArray, Utf8Array};
-    use crate::datatypes::DataType;
-    use crate::datatypes::TimeUnit;
-    use crate::datatypes::*;
-    use crate::record_batch::*;
-
-    use crate::error::Result;
-    use crate::io::ipc::common::tests::read_gzip_json;
-    use crate::io::parquet::read;
-    use crate::io::parquet::write::*;
-    use std::io::Cursor;
-
-    fn integration_write(schema: &Schema, batches: &[RecordBatch]) -> Result<Vec<u8>> {
-        let options = WriteOptions {
-            write_statistics: true,
-            compression: Compression::Uncompressed,
-            version: Version::V1,
-        };
-
-        let parquet_schema = to_parquet_schema(schema)?;
-        let descriptors = parquet_schema.columns().to_vec().into_iter();
-
-        let row_groups = batches.iter().map(|batch| {
-            let iterator = DynIter::new(batch.columns().iter().zip(descriptors.clone()).map(
-                |(array, type_)| {
-                    Ok(DynIter::new(std::iter::once(array_to_page(
-                        array.as_ref(),
-                        type_,
-                        options,
-                        Encoding::Plain,
-                    ))))
-                },
-            ));
-            Ok(iterator)
-        });
-
-        let mut writer = Cursor::new(vec![]);
-
-        write_file(
-            &mut writer,
-            row_groups,
-            schema,
-            parquet_schema,
-            options,
-            None,
-        )?;
-
-        Ok(writer.into_inner())
-    }
-
-    fn integration_read(data: &[u8]) -> Result<(Arc<Schema>, Vec<RecordBatch>)> {
-        let reader = Cursor::new(data);
-        let reader = read::RecordReader::try_new(reader, None, None, Arc::new(|_, _| true), None)?;
-        let schema = reader.schema().clone();
-        let batches = reader.collect::<Result<Vec<_>>>()?;
-
-        Ok((schema, batches))
-    }
-
-    fn test_file(version: &str, file_name: &str) -> Result<()> {
-        let (schema, batches) = read_gzip_json(version, file_name);
-
-        let data = integration_write(&schema, &batches)?;
-
-        let (read_schema, read_batches) = integration_read(&data)?;
-
-        assert_eq!(&schema, read_schema.as_ref());
-        assert_eq!(batches, read_batches);
-
-        Ok(())
-    }
-
-    #[test]
-    fn roundtrip_100_primitive() -> Result<()> {
-        test_file("1.0.0-littleendian", "generated_primitive")?;
-        test_file("1.0.0-bigendian", "generated_primitive")
-    }
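// A minimal sketch (an aside, not part of this diff) of the helpers above used
// in isolation: one batch, one Int64 column, written and read back. The
// function name is illustrative; all calls appear in the surrounding tests.
fn roundtrip_single_int64_column() -> Result<()> {
    let array = PrimitiveArray::<i64>::from_slice(&[1, 2, 3]);
    let schema = Schema::new(vec![Field::new("a", DataType::Int64, false)]);
    let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)])?;
    // write to an in-memory parquet file, then decode it again
    let data = integration_write(&schema, &[batch.clone()])?;
    let (read_schema, read_batches) = integration_read(&data)?;
    assert_eq!(read_schema.as_ref(), &schema);
    assert_eq!(read_batches, vec![batch]);
    Ok(())
}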
-
-    /// Tests that when arrow-specific types (Duration and LargeUtf8) are written to parquet, we can
-    /// round-trip their logical types.
-    #[test]
-    fn test_arrow_type() -> Result<()> {
-        let dt1 = DataType::Duration(TimeUnit::Second);
-        let array = PrimitiveArray::<i64>::from([Some(1), None, Some(2)]).to(dt1.clone());
-        let array2 = Utf8Array::<i64>::from([Some("a"), None, Some("bb")]);
-        let schema = Schema::new(vec![
-            Field::new("a1", dt1, true),
-            Field::new("a2", array2.data_type().clone(), true),
-        ]);
-        let batch = RecordBatch::try_new(
-            Arc::new(schema.clone()),
-            vec![Arc::new(array), Arc::new(array2)],
-        )?;
-
-        let r = integration_write(&schema, &[batch.clone()])?;
-
-        let (new_schema, new_batches) = integration_read(&r)?;
-
-        assert_eq!(new_schema.as_ref(), &schema);
-        assert_eq!(new_batches, vec![batch]);
-        Ok(())
-    }
-}
diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs
index 8d86ab05d03..81797a698f5 100644
--- a/src/io/parquet/read/mod.rs
+++ b/src/io/parquet/read/mod.rs
@@ -252,270 +252,3 @@ pub async fn page_stream_to_array
-
-#[cfg(test)]
-mod tests {
-    use std::fs::File;
-
-    use super::*;
-    use super::super::tests::*;
-
-    fn test_pyarrow_integration(
-        column: usize,
-        version: usize,
-        type_: &str,
-        use_dict: bool,
-        required: bool,
-    ) -> Result<()> {
-        if std::env::var("ARROW2_IGNORE_PARQUET").is_ok() {
-            return Ok(());
-        }
-        let use_dict = if use_dict { "dict/" } else { "" };
-        let path = if required {
-            format!(
-                "fixtures/pyarrow3/v{}/{}{}_{}_10.parquet",
-                version, use_dict, type_, "required"
-            )
-        } else {
-            format!(
-                "fixtures/pyarrow3/v{}/{}{}_{}_10.parquet",
-                version, use_dict, type_, "nullable"
-            )
-        };
-        let mut file = File::open(path).unwrap();
-        let (array, statistics) = read_column(&mut file, 0, column)?;
-
-        let expected = match (type_, required) {
-            ("basic", true) => pyarrow_required(column),
-            ("basic", false) => pyarrow_nullable(column),
-            ("nested", false) => pyarrow_nested_nullable(column),
-            _ => unreachable!(),
-        };
-
-        let expected_statistics = match (type_, required) {
-            ("basic", true) => pyarrow_required_statistics(column),
-            ("basic", false) => pyarrow_nullable_statistics(column),
-            ("nested", false) => pyarrow_nested_nullable_statistics(column),
-            _ => unreachable!(),
-        };
-
-        assert_eq!(expected.as_ref(), array.as_ref());
-        assert_eq!(expected_statistics, statistics);
-
-        Ok(())
-    }
-
-    #[test]
-    fn v1_int64_nullable() -> Result<()> {
-        test_pyarrow_integration(0, 1, "basic", false, false)
-    }
-
-    #[test]
-    fn v1_int64_required() -> Result<()> {
-        test_pyarrow_integration(0, 1, "basic", false, true)
-    }
-
-    #[test]
-    fn v1_float64_nullable() -> Result<()> {
-        test_pyarrow_integration(1, 1, "basic", false, false)
-    }
-
-    #[test]
-    fn v1_utf8_nullable() -> Result<()> {
-        test_pyarrow_integration(2, 1, "basic", false, false)
-    }
-
-    #[test]
-    fn v1_utf8_required() -> Result<()> {
-        test_pyarrow_integration(2, 1, "basic", false, true)
-    }
-
-    #[test]
-    fn v1_boolean_nullable() -> Result<()> {
-        test_pyarrow_integration(3, 1, "basic", false, false)
-    }
-
-    #[test]
-    fn v1_boolean_required() -> Result<()> {
-        test_pyarrow_integration(3, 1, "basic", false, true)
-    }
-
-    #[test]
-    fn v1_timestamp_nullable() -> Result<()> {
-        test_pyarrow_integration(4, 1, "basic", false, false)
-    }
-
-    #[test]
-    #[ignore] // pyarrow issue; see https://issues.apache.org/jira/browse/ARROW-12201
-    fn v1_u32_nullable() -> Result<()> {
-        test_pyarrow_integration(5, 1, "basic", false, false)
-    }
-
-    #[test]
-    fn v2_int64_nullable() -> Result<()> {
-        test_pyarrow_integration(0, 2, "basic", false, false)
-    }
-
-    #[test]
-    fn v2_int64_nullable_dict() -> Result<()> {
-        test_pyarrow_integration(0, 2, "basic", true, false)
-    }
-
-    #[test]
-    fn v1_int64_nullable_dict() -> Result<()> {
-        test_pyarrow_integration(0, 1, "basic", true, false)
-    }
-
-    #[test]
-    fn v2_utf8_nullable() -> Result<()> {
test_pyarrow_integration(2, 2, "basic", false, false) - } - - #[test] - fn v2_utf8_required() -> Result<()> { - test_pyarrow_integration(2, 2, "basic", false, true) - } - - #[test] - fn v2_utf8_nullable_dict() -> Result<()> { - test_pyarrow_integration(2, 2, "basic", true, false) - } - - #[test] - fn v1_utf8_nullable_dict() -> Result<()> { - test_pyarrow_integration(2, 1, "basic", true, false) - } - - #[test] - fn v2_boolean_nullable() -> Result<()> { - test_pyarrow_integration(3, 2, "basic", false, false) - } - - #[test] - fn v2_boolean_required() -> Result<()> { - test_pyarrow_integration(3, 2, "basic", false, true) - } - - #[test] - fn v2_nested_int64_nullable() -> Result<()> { - test_pyarrow_integration(0, 2, "nested", false, false) - } - - #[test] - fn v1_nested_int64_nullable() -> Result<()> { - test_pyarrow_integration(0, 1, "nested", false, false) - } - - #[test] - fn v2_nested_int64_nullable_required() -> Result<()> { - test_pyarrow_integration(1, 2, "nested", false, false) - } - - #[test] - fn v1_nested_int64_nullable_required() -> Result<()> { - test_pyarrow_integration(1, 1, "nested", false, false) - } - - #[test] - fn v2_nested_int64_required_required() -> Result<()> { - test_pyarrow_integration(2, 2, "nested", false, false) - } - - #[test] - fn v1_nested_int64_required_required() -> Result<()> { - test_pyarrow_integration(2, 1, "nested", false, false) - } - - #[test] - fn v2_nested_i16() -> Result<()> { - test_pyarrow_integration(3, 2, "nested", false, false) - } - - #[test] - fn v1_nested_i16() -> Result<()> { - test_pyarrow_integration(3, 1, "nested", false, false) - } - - #[test] - fn v2_nested_bool() -> Result<()> { - test_pyarrow_integration(4, 2, "nested", false, false) - } - - #[test] - fn v1_nested_bool() -> Result<()> { - test_pyarrow_integration(4, 1, "nested", false, false) - } - - #[test] - fn v2_nested_utf8() -> Result<()> { - test_pyarrow_integration(5, 2, "nested", false, false) - } - - #[test] - fn v1_nested_utf8() -> Result<()> { - test_pyarrow_integration(5, 1, "nested", false, false) - } - - #[test] - fn v2_nested_large_binary() -> Result<()> { - test_pyarrow_integration(6, 2, "nested", false, false) - } - - #[test] - fn v1_nested_large_binary() -> Result<()> { - test_pyarrow_integration(6, 1, "nested", false, false) - } - - /*#[test] - fn v2_nested_nested() { - let _ = test_pyarrow_integration(7, 1, "nested",false, false); - }*/ -} - -#[cfg(test)] -mod tests_integration { - use crate::array::{BinaryArray, Float32Array, Int32Array}; - - use super::*; - use std::sync::Arc; - - #[test] - fn all_types() -> Result<()> { - let path = "testing/parquet-testing/data/alltypes_plain.parquet"; - let reader = std::fs::File::open(path)?; - - let reader = RecordReader::try_new(reader, None, None, Arc::new(|_, _| true), None)?; - - let batches = reader.collect::>>()?; - assert_eq!(batches.len(), 1); - - let result = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(result, &Int32Array::from_slice([4, 5, 6, 7, 2, 3, 0, 1])); - - let result = batches[0] - .column(6) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!( - result, - &Float32Array::from_slice([0.0, 1.1, 0.0, 1.1, 0.0, 1.1, 0.0, 1.1]) - ); - - let result = batches[0] - .column(9) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!( - result, - &BinaryArray::::from_slice([[48], [49], [48], [49], [48], [49], [48], [49]]) - ); - - Ok(()) - } -} diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 2b4c0f906fa..9973f19abf8 100644 --- 
a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -478,376 +478,3 @@ fn nested_array_to_page( _ => todo!(), } } - -#[cfg(test)] -mod tests { - use super::*; - - use crate::{error::Result, record_batch::RecordBatch}; - use std::io::Cursor; - - use super::super::tests::*; - - fn round_trip( - column: usize, - nullable: bool, - nested: bool, - version: Version, - compression: Compression, - encoding: Encoding, - ) -> Result<()> { - let (array, statistics) = if nested { - ( - pyarrow_nested_nullable(column), - pyarrow_nested_nullable_statistics(column), - ) - } else if nullable { - ( - pyarrow_nullable(column), - pyarrow_nullable_statistics(column), - ) - } else { - ( - pyarrow_required(column), - pyarrow_required_statistics(column), - ) - }; - let array: Arc = array.into(); - - let field = Field::new("a1", array.data_type().clone(), nullable); - let schema = Schema::new(vec![field]); - - let options = WriteOptions { - write_statistics: true, - compression, - version, - }; - - let parquet_schema = to_parquet_schema(&schema)?; - - let iter = vec![RecordBatch::try_new( - Arc::new(schema.clone()), - vec![array.clone()], - )]; - - let row_groups = - RowGroupIterator::try_new(iter.into_iter(), &schema, options, vec![encoding])?; - - let mut writer = Cursor::new(vec![]); - write_file( - &mut writer, - row_groups, - &schema, - parquet_schema, - options, - None, - )?; - - let data = writer.into_inner(); - - let (result, stats) = read_column(&mut Cursor::new(data), 0, 0)?; - assert_eq!(array.as_ref(), result.as_ref()); - assert_eq!(statistics.as_ref(), stats.as_ref()); - Ok(()) - } - - #[test] - fn test_int64_optional_v1() -> Result<()> { - round_trip( - 0, - true, - false, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_int64_required_v1() -> Result<()> { - round_trip( - 0, - false, - false, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_int64_optional_v2() -> Result<()> { - round_trip( - 0, - true, - false, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_int64_optional_v2_compressed() -> Result<()> { - round_trip( - 0, - true, - false, - Version::V2, - Compression::Snappy, - Encoding::Plain, - ) - } - - #[test] - fn test_utf8_optional_v1() -> Result<()> { - round_trip( - 2, - true, - false, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_utf8_required_v1() -> Result<()> { - round_trip( - 2, - false, - false, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_utf8_optional_v2() -> Result<()> { - round_trip( - 2, - true, - false, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_utf8_required_v2() -> Result<()> { - round_trip( - 2, - false, - false, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_utf8_optional_v2_compressed() -> Result<()> { - round_trip( - 2, - true, - false, - Version::V2, - Compression::Snappy, - Encoding::Plain, - ) - } - - #[test] - fn test_utf8_required_v2_compressed() -> Result<()> { - round_trip( - 2, - false, - false, - Version::V2, - Compression::Snappy, - Encoding::Plain, - ) - } - - #[test] - fn test_bool_optional_v1() -> Result<()> { - round_trip( - 3, - true, - false, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_bool_required_v1() -> Result<()> { - round_trip( - 3, - false, - false, - Version::V1, - 
Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_bool_optional_v2_uncompressed() -> Result<()> { - round_trip( - 3, - true, - false, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_bool_required_v2_uncompressed() -> Result<()> { - round_trip( - 3, - false, - false, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_bool_required_v2_compressed() -> Result<()> { - round_trip( - 3, - false, - false, - Version::V2, - Compression::Snappy, - Encoding::Plain, - ) - } - - #[test] - fn test_list_int64_optional_v2() -> Result<()> { - round_trip( - 0, - true, - true, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_list_int64_optional_v1() -> Result<()> { - round_trip( - 0, - true, - true, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_list_bool_optional_v2() -> Result<()> { - round_trip( - 4, - true, - true, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_list_bool_optional_v1() -> Result<()> { - round_trip( - 4, - true, - true, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_list_utf8_optional_v2() -> Result<()> { - round_trip( - 5, - true, - true, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_list_utf8_optional_v1() -> Result<()> { - round_trip( - 5, - true, - true, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_list_large_binary_optional_v2() -> Result<()> { - round_trip( - 6, - true, - true, - Version::V2, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_list_large_binary_optional_v1() -> Result<()> { - round_trip( - 6, - true, - true, - Version::V1, - Compression::Uncompressed, - Encoding::Plain, - ) - } - - #[test] - fn test_utf8_optional_v2_delta() -> Result<()> { - round_trip( - 2, - true, - false, - Version::V2, - Compression::Uncompressed, - Encoding::DeltaLengthByteArray, - ) - } - - #[test] - fn test_i32_optional_v2_dict() -> Result<()> { - round_trip( - 6, - true, - false, - Version::V2, - Compression::Uncompressed, - Encoding::RleDictionary, - ) - } -} diff --git a/src/io/print.rs b/src/io/print.rs index be65b955980..a96654249b6 100644 --- a/src/io/print.rs +++ b/src/io/print.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::*, record_batch::RecordBatch}; +use crate::{array::get_display, record_batch::RecordBatch}; use comfy_table::{Cell, Table}; @@ -64,398 +64,3 @@ fn create_table(results: &[RecordBatch]) -> Table { } table } - -#[cfg(test)] -mod tests { - use crate::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::*, error::Result}; - - use super::*; - use std::sync::Arc; - - #[test] - fn test_write() -> Result<()> { - // define a schema. - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Int32, true), - ])); - - // define data. 
- let batch = RecordBatch::try_new( - schema, - vec![ - Arc::new(Utf8Array::::from(vec![ - Some("a"), - Some("b"), - None, - Some("d"), - ])), - Arc::new(Int32Array::from(vec![Some(1), None, Some(10), Some(100)])), - ], - )?; - - let table = write(&[batch]); - - let expected = vec![ - "+---+-----+", - "| a | b |", - "+---+-----+", - "| a | 1 |", - "| b | |", - "| | 10 |", - "| d | 100 |", - "+---+-----+", - ]; - - let actual: Vec<&str> = table.lines().collect(); - - assert_eq!(expected, actual, "Actual result:\n{}", table); - - Ok(()) - } - - #[test] - fn test_write_null() -> Result<()> { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Int32, true), - Field::new("c", DataType::Null, true), - ])); - - let num_rows = 4; - let arrays = schema - .fields() - .iter() - .map(|f| new_null_array(f.data_type().clone(), num_rows).into()) - .collect(); - - // define data (null) - let batch = RecordBatch::try_new(schema, arrays)?; - - let table = write(&[batch]); - - let expected = vec![ - "+---+---+---+", - "| a | b | c |", - "+---+---+---+", - "| | | |", - "| | | |", - "| | | |", - "| | | |", - "+---+---+---+", - ]; - - let actual: Vec<&str> = table.lines().collect(); - - assert_eq!(expected, actual, "Actual result:\n{:#?}", table); - Ok(()) - } - - #[test] - fn test_write_dictionary() -> Result<()> { - // define a schema. - let field_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); - let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); - - let mut array = MutableDictionaryArray::>::new(); - - array.try_extend(vec![Some("one"), None, Some("three")])?; - let array = array.into_arc(); - - let batch = RecordBatch::try_new(schema, vec![array])?; - - let table = write(&[batch]); - - let expected = vec![ - "+-------+", - "| d1 |", - "+-------+", - "| one |", - "| |", - "| three |", - "+-------+", - ]; - - let actual: Vec<&str> = table.lines().collect(); - - assert_eq!(expected, actual, "Actual result:\n{}", table); - - Ok(()) - } - - /// Generate an array with type $ARRAYTYPE with a numeric value of - /// $VALUE, and compare $EXPECTED_RESULT to the output of - /// formatting that array with `write` - macro_rules! 
check_datetime { - ($ty:ty, $datatype:expr, $value:expr, $EXPECTED_RESULT:expr) => { - let array = Arc::new(PrimitiveArray::<$ty>::from(&[Some($value), None]).to($datatype)); - - let schema = Arc::new(Schema::new(vec![Field::new( - "f", - array.data_type().clone(), - true, - )])); - let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); - - let table = write(&[batch]); - - let expected = $EXPECTED_RESULT; - let actual: Vec<&str> = table.lines().collect(); - - assert_eq!(expected, actual, "Actual result:\n\n{:#?}\n\n", actual); - }; - } - - #[test] - fn test_write_timestamp_second() { - let expected = vec![ - "+---------------------+", - "| f |", - "+---------------------+", - "| 1970-05-09 14:25:11 |", - "| |", - "+---------------------+", - ]; - check_datetime!( - i64, - DataType::Timestamp(TimeUnit::Second, None), - 11111111, - expected - ); - } - - #[test] - fn test_write_timestamp_second_with_tz() { - let expected = vec![ - "+-------------------------+", - "| f |", - "+-------------------------+", - "| 1970-05-09 14:25:11 UTC |", - "| |", - "+-------------------------+", - ]; - check_datetime!( - i64, - DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), - 11111111, - expected - ); - } - - #[test] - fn test_write_timestamp_millisecond() { - let expected = vec![ - "+-------------------------+", - "| f |", - "+-------------------------+", - "| 1970-01-01 03:05:11.111 |", - "| |", - "+-------------------------+", - ]; - check_datetime!( - i64, - DataType::Timestamp(TimeUnit::Millisecond, None), - 11111111, - expected - ); - } - - #[test] - fn test_write_timestamp_microsecond() { - let expected = vec![ - "+----------------------------+", - "| f |", - "+----------------------------+", - "| 1970-01-01 00:00:11.111111 |", - "| |", - "+----------------------------+", - ]; - check_datetime!( - i64, - DataType::Timestamp(TimeUnit::Microsecond, None), - 11111111, - expected - ); - } - - #[test] - fn test_write_timestamp_nanosecond() { - let expected = vec![ - "+-------------------------------+", - "| f |", - "+-------------------------------+", - "| 1970-01-01 00:00:00.011111111 |", - "| |", - "+-------------------------------+", - ]; - check_datetime!( - i64, - DataType::Timestamp(TimeUnit::Nanosecond, None), - 11111111, - expected - ); - } - - #[test] - fn test_write_date_32() { - let expected = vec![ - "+------------+", - "| f |", - "+------------+", - "| 1973-05-19 |", - "| |", - "+------------+", - ]; - check_datetime!(i32, DataType::Date32, 1234, expected); - } - - #[test] - fn test_write_date_64() { - let expected = vec![ - "+------------+", - "| f |", - "+------------+", - "| 2005-03-18 |", - "| |", - "+------------+", - ]; - check_datetime!(i64, DataType::Date64, 1111111100000, expected); - } - - #[test] - fn test_write_time_32_second() { - let expected = vec![ - "+----------+", - "| f |", - "+----------+", - "| 00:18:31 |", - "| |", - "+----------+", - ]; - check_datetime!(i32, DataType::Time32(TimeUnit::Second), 1111, expected); - } - - #[test] - fn test_write_time_32_millisecond() { - let expected = vec![ - "+--------------+", - "| f |", - "+--------------+", - "| 03:05:11.111 |", - "| |", - "+--------------+", - ]; - check_datetime!( - i32, - DataType::Time32(TimeUnit::Millisecond), - 11111111, - expected - ); - } - - #[test] - fn test_write_time_64_microsecond() { - let expected = vec![ - "+-----------------+", - "| f |", - "+-----------------+", - "| 00:00:11.111111 |", - "| |", - "+-----------------+", - ]; - check_datetime!( - i64, - 
-
-    #[test]
-    fn test_write_time_64_nanosecond() {
-        let expected = vec![
-            "+--------------------+",
-            "| f                  |",
-            "+--------------------+",
-            "| 00:00:00.011111111 |",
-            "|                    |",
-            "+--------------------+",
-        ];
-        check_datetime!(
-            i64,
-            DataType::Time64(TimeUnit::Nanosecond),
-            11111111,
-            expected
-        );
-    }
-
-    #[test]
-    fn test_write_struct() -> Result<()> {
-        let fields = vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Utf8, true),
-        ];
-        let values = vec![
-            Arc::new(Int32Array::from(&[Some(1), None, Some(2)])) as Arc<dyn Array>,
-            Arc::new(Utf8Array::<i32>::from(&[Some("a"), Some("b"), Some("c")])) as Arc<dyn Array>,
-        ];
-
-        let validity = Some(Bitmap::from(&[true, false, true]));
-
-        let array = StructArray::from_data(fields, values, validity);
-
-        let schema = Schema::new(vec![Field::new("a", array.data_type().clone(), true)]);
-
-        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?;
-
-        let table = write(&[batch]);
-
-        let expected = vec![
-            "+--------------+",
-            "| a            |",
-            "+--------------+",
-            "| {a: 1, b: a} |",
-            "|              |",
-            "| {a: 2, b: c} |",
-            "+--------------+",
-        ];
-
-        let actual: Vec<&str> = table.lines().collect();
-
-        assert_eq!(expected, actual, "Actual result:\n{}", table);
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_write_union() -> Result<()> {
-        let fields = vec![
-            Field::new("a", DataType::Int32, true),
-            Field::new("b", DataType::Utf8, true),
-        ];
-        let data_type = DataType::Union(fields, None, true);
-        let types = Buffer::from(&[0, 0, 1]);
-        let fields = vec![
-            Arc::new(Int32Array::from(&[Some(1), None, Some(2)])) as Arc<dyn Array>,
-            Arc::new(Utf8Array::<i32>::from(&[Some("a"), Some("b"), Some("c")])) as Arc<dyn Array>,
-        ];
-
-        let array = UnionArray::from_data(data_type, types, fields, None);
-
-        let schema = Schema::new(vec![Field::new("a", array.data_type().clone(), true)]);
-
-        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?;
-
-        let table = write(&[batch]);
-
-        let expected = vec![
-            "+---+", "| a |", "+---+", "| 1 |", "|   |", "| c |", "+---+",
-        ];
-
-        let actual: Vec<&str> = table.lines().collect();
-
-        assert_eq!(expected, actual, "Actual result:\n{}", table);
-
-        Ok(())
-    }
-}
diff --git a/src/util/mod.rs b/src/util/mod.rs
index 4c7cccba1c8..53b658b8416 100644
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@@ -39,8 +39,5 @@ pub fn lexical_to_string<N: lexical_core::ToLexical>(n: N) -> String {
     unsafe { String::from_utf8_unchecked(lexical_to_bytes(n)) }
 }
 
-#[cfg(test)]
-pub mod test_util;
-
 #[cfg(feature = "benchmarks")]
 pub mod bench_util;
diff --git a/tests/it/alloc.rs b/tests/it/alloc.rs
new file mode 100644
index 00000000000..bfd35bcae1c
--- /dev/null
+++ b/tests/it/alloc.rs
@@ -0,0 +1,49 @@
+use arrow2::alloc::*;
+
+#[test]
+fn allocate_dangling() {
+    let p = allocate_aligned::<u8>(0);
+    assert_eq!(0, (p.as_ptr() as usize) % ALIGNMENT);
+}
+
+#[test]
+fn allocate() {
+    let p = allocate_aligned::<u8>(1024);
+    assert_eq!(0, (p.as_ptr() as usize) % ALIGNMENT);
+    unsafe { free_aligned(p, 1024) };
+}
+
+#[test]
+fn allocate_zeroed() {
+    let p = allocate_aligned_zeroed::<u8>(1024);
+    assert_eq!(0, (p.as_ptr() as usize) % ALIGNMENT);
+    unsafe { free_aligned(p, 1024) };
+}
+
+#[test]
+fn reallocate_from_zero() {
+    let ptr = allocate_aligned::<u8>(0);
+    let ptr = unsafe { reallocate(ptr, 0, 512) };
+    unsafe { free_aligned(ptr, 512) };
+}
+
+#[test]
+fn reallocate_from_alloc() {
+    let ptr = allocate_aligned::<u8>(32);
+    let ptr = unsafe { reallocate(ptr, 32, 64) };
+    unsafe { free_aligned(ptr, 64) };
+}
+
+#[test]
+fn reallocate_smaller() {
+    let ptr = allocate_aligned::<u8>(32);
+    let ptr = unsafe { reallocate(ptr, 32, 16) };
+    unsafe { free_aligned(ptr, 16) };
+}
+
+#[test]
+fn reallocate_to_zero() {
+    let ptr = allocate_aligned::<u8>(32);
+    let ptr = unsafe { reallocate(ptr, 32, 0) };
+    assert_eq!(ptr, unsafe { dangling() });
+}
diff --git a/tests/it/bitmap/immutable.rs b/tests/it/bitmap/immutable.rs
new file mode 100644
index 00000000000..e6b13d76d94
--- /dev/null
+++ b/tests/it/bitmap/immutable.rs
@@ -0,0 +1,41 @@
+use arrow2::bitmap::Bitmap;
+
+#[test]
+fn as_slice() {
+    let b = Bitmap::from([true, true, true, true, true, true, true, true, true]);
+
+    let (slice, offset, length) = b.as_slice();
+    assert_eq!(slice, &[0b11111111, 0b1]);
+    assert_eq!(offset, 0);
+    assert_eq!(length, 9);
+}
+
+#[test]
+fn as_slice_offset() {
+    let b = Bitmap::from([true, true, true, true, true, true, true, true, true]);
+    let b = b.slice(8, 1);
+
+    let (slice, offset, length) = b.as_slice();
+    assert_eq!(slice, &[0b1]);
+    assert_eq!(offset, 0);
+    assert_eq!(length, 1);
+}
+
+#[test]
+fn as_slice_offset_middle() {
+    let b = Bitmap::from_u8_slice(&[0, 0, 0, 0b00010101], 27);
+    let b = b.slice(22, 5);
+
+    let (slice, offset, length) = b.as_slice();
+    assert_eq!(slice, &[0, 0b00010101]);
+    assert_eq!(offset, 6);
+    assert_eq!(length, 5);
+}
+
+#[test]
+fn debug() {
+    let b = Bitmap::from([true, true, false, true, true, true, true, true, true]);
+    let b = b.slice(2, 7);
+
+    assert_eq!(format!("{:?}", b), "[0b111110__, 0b_______1]");
+}
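Note on the assertions above: `Bitmap::as_slice` now returns the backing bytes together with a bit offset (always below 8) and a bit length, instead of pre-shifted bytes. A minimal sketch of how a caller would read logical bit `i` from that triplet (`get_bit_at` is an illustrative helper, not part of the crate):

    fn get_bit_at(slice: &[u8], offset: usize, length: usize, i: usize) -> bool {
        assert!(i < length);
        let bit = offset + i; // bits are LSB-first within each byte
        (slice[bit / 8] >> (bit % 8)) & 1 == 1
    }

For the `as_slice_offset_middle` case, `get_bit_at(&[0, 0b00010101], 6, 5, 2)` lands on bit 8, i.e. the lowest bit of the second byte, which is set.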
diff --git a/tests/it/bitmap/mod.rs b/tests/it/bitmap/mod.rs
new file mode 100644
index 00000000000..85a86d9c1fe
--- /dev/null
+++ b/tests/it/bitmap/mod.rs
@@ -0,0 +1,3 @@
+mod immutable;
+mod mutable;
+mod utils;
diff --git a/tests/it/bitmap/mutable.rs b/tests/it/bitmap/mutable.rs
new file mode 100644
index 00000000000..419c0ab8699
--- /dev/null
+++ b/tests/it/bitmap/mutable.rs
@@ -0,0 +1,175 @@
+use arrow2::bitmap::{Bitmap, MutableBitmap};
+
+#[test]
+fn trusted_len() {
+    let data = vec![true; 65];
+    let bitmap = MutableBitmap::from_trusted_len_iter(data.into_iter());
+    let bitmap: Bitmap = bitmap.into();
+    assert_eq!(bitmap.len(), 65);
+
+    assert_eq!(bitmap.as_slice().0[8], 0b00000001);
+}
+
+#[test]
+fn trusted_len_small() {
+    let data = vec![true; 7];
+    let bitmap = MutableBitmap::from_trusted_len_iter(data.into_iter());
+    let bitmap: Bitmap = bitmap.into();
+    assert_eq!(bitmap.len(), 7);
+
+    assert_eq!(bitmap.as_slice().0[0], 0b01111111);
+}
+
+#[test]
+fn push() {
+    let mut bitmap = MutableBitmap::new();
+    bitmap.push(true);
+    bitmap.push(false);
+    bitmap.push(false);
+    for _ in 0..7 {
+        bitmap.push(true)
+    }
+    let bitmap: Bitmap = bitmap.into();
+    assert_eq!(bitmap.len(), 10);
+
+    assert_eq!(bitmap.as_slice().0, &[0b11111001, 0b00000011]);
+}
+
+#[test]
+fn push_small() {
+    let mut bitmap = MutableBitmap::new();
+    bitmap.push(true);
+    bitmap.push(true);
+    bitmap.push(false);
+    let bitmap: Option<Bitmap> = bitmap.into();
+    let bitmap = bitmap.unwrap();
+    assert_eq!(bitmap.len(), 3);
+    assert_eq!(bitmap.as_slice().0[0], 0b00000011);
+}
+
+#[test]
+fn push_exact_zeros() {
+    let mut bitmap = MutableBitmap::new();
+    for _ in 0..8 {
+        bitmap.push(false)
+    }
+    let bitmap: Option<Bitmap> = bitmap.into();
+    let bitmap = bitmap.unwrap();
+    assert_eq!(bitmap.len(), 8);
+    assert_eq!(bitmap.as_slice().0.len(), 1);
+}
+
+#[test]
+fn push_exact_ones() {
+    let mut bitmap = MutableBitmap::new();
+    for _ in 0..8 {
+        bitmap.push(true)
+    }
+    let bitmap: Option<Bitmap> = bitmap.into();
+    assert!(bitmap.is_none());
+}
+
+#[test]
+fn capacity() {
+    let b = MutableBitmap::with_capacity(10);
+    assert_eq!(b.capacity(), 512);
+
+    let b = MutableBitmap::with_capacity(512);
+    assert_eq!(b.capacity(), 512);
+
+    let mut b = MutableBitmap::with_capacity(512);
+    b.reserve(8);
+    assert_eq!(b.capacity(), 512);
+}
+
+#[test]
+fn capacity_push() {
+    let mut b = MutableBitmap::with_capacity(512);
+    (0..512).for_each(|_| b.push(true));
+    assert_eq!(b.capacity(), 512);
+    b.reserve(8);
+    assert_eq!(b.capacity(), 1024);
+}
+
+#[test]
+fn extend() {
+    let mut b = MutableBitmap::new();
+
+    let iter = (0..512).map(|i| i % 6 == 0);
+    unsafe { b.extend_from_trusted_len_iter_unchecked(iter) };
+    let b: Bitmap = b.into();
+    for (i, v) in b.iter().enumerate() {
+        assert_eq!(i % 6 == 0, v);
+    }
+}
+
+#[test]
+fn extend_offset() {
+    let mut b = MutableBitmap::new();
+    b.push(true);
+
+    let iter = (0..512).map(|i| i % 6 == 0);
+    unsafe { b.extend_from_trusted_len_iter_unchecked(iter) };
+    let b: Bitmap = b.into();
+    let mut iter = b.iter().enumerate();
+    assert!(iter.next().unwrap().1);
+    for (i, v) in iter {
+        assert_eq!((i - 1) % 6 == 0, v);
+    }
+}
+
+#[test]
+fn set() {
+    let mut bitmap = MutableBitmap::from_len_zeroed(12);
+    bitmap.set(0, true);
+    assert!(bitmap.get(0));
+    bitmap.set(0, false);
+    assert!(!bitmap.get(0));
+
+    bitmap.set(11, true);
+    assert!(bitmap.get(11));
+    bitmap.set(11, false);
+    assert!(!bitmap.get(11));
+    bitmap.set(11, true);
+
+    let bitmap: Option<Bitmap> = bitmap.into();
+    let bitmap = bitmap.unwrap();
+    assert_eq!(bitmap.len(), 12);
+    assert_eq!(bitmap.as_slice().0[0], 0b00000000);
+}
+
+#[test]
+fn extend_from_bitmap() {
+    let other = Bitmap::from(&[true, false, true]);
+    let mut bitmap = MutableBitmap::new();
+
+    // call is optimized to perform a memcopy
+    bitmap.extend_from_bitmap(&other);
+
+    assert_eq!(bitmap.len(), 3);
+    assert_eq!(bitmap.as_slice()[0], 0b00000101);
+
+    // this call iterates over all bits
+    bitmap.extend_from_bitmap(&other);
+
+    assert_eq!(bitmap.len(), 6);
+    assert_eq!(bitmap.as_slice()[0], 0b00101101);
+}
+
+#[test]
+fn debug() {
+    let mut b = MutableBitmap::new();
+    assert_eq!(format!("{:?}", b), "[]");
+    b.push(true);
+    b.push(false);
+    assert_eq!(format!("{:?}", b), "[0b______01]");
+    b.push(false);
+    b.push(false);
+    b.push(false);
+    b.push(false);
+    b.push(true);
+    b.push(true);
+    assert_eq!(format!("{:?}", b), "[0b11000001]");
+    b.push(true);
+    assert_eq!(format!("{:?}", b), "[0b11000001, 0b_______1]");
+}
diff --git a/tests/it/bitmap/utils/bit_chunks_exact.rs b/tests/it/bitmap/utils/bit_chunks_exact.rs
new file mode 100644
index 00000000000..755b8ddb9e6
--- /dev/null
+++ b/tests/it/bitmap/utils/bit_chunks_exact.rs
@@ -0,0 +1,22 @@
+use arrow2::bitmap::utils::BitChunksExact;
+
+#[test]
+fn basics() {
+    let mut iter = BitChunksExact::<u8>::new(&[0b11111111u8, 0b00000001u8], 9);
+    assert_eq!(iter.next().unwrap(), 0b11111111u8);
+    assert_eq!(iter.remainder(), 0b00000001u8);
+}
+
+#[test]
+fn basics_u16_small() {
+    let mut iter = BitChunksExact::<u16>::new(&[0b11111111u8], 9);
+    assert_eq!(iter.next(), None);
+    assert_eq!(iter.remainder(), 0b0000_0000_1111_1111u16);
+}
+
+#[test]
+fn basics_u16() {
+    let mut iter = BitChunksExact::<u16>::new(&[0b11111111u8, 0b00000001u8], 9);
+    assert_eq!(iter.next(), None);
+    assert_eq!(iter.remainder(), 0b0000_0001_1111_1111u16);
+}
diff --git a/tests/it/bitmap/utils/chunk_iter.rs b/tests/it/bitmap/utils/chunk_iter.rs
new file mode 100644
index 00000000000..3bbad3f88cf
--- /dev/null
+++ b/tests/it/bitmap/utils/chunk_iter.rs
@@ -0,0 +1,163 @@
+use arrow2::bitmap::utils::BitChunks;
+use arrow2::types::BitChunkIter;
+
+#[test]
+fn basics() {
+    let mut iter = BitChunks::<u16>::new(&[0b00000001u8, 0b00000010u8], 0, 16);
+    assert_eq!(iter.next().unwrap(), 0b0000_0010_0000_0001u16);
+    assert_eq!(iter.remainder(), 0);
+}
+
+#[test]
+fn remainder() {
+    let a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000010u8, 0b00000100u8], 0, 18);
+    assert_eq!(a.remainder(), 0b00000100u16);
+}
+
+#[test]
+fn remainder_saturating() {
+    let a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000010u8, 0b00000010u8], 0, 18);
+    assert_eq!(a.remainder(), 0b0000_0000_0000_0010u16);
+}
+
+#[test]
+fn basics_offset() {
+    let mut iter = BitChunks::<u16>::new(&[0b00000001u8, 0b00000011u8, 0b00000001u8], 1, 16);
+    assert_eq!(iter.remainder(), 0);
+    assert_eq!(iter.next().unwrap(), 0b1000_0001_1000_0000u16);
+    assert_eq!(iter.next(), None);
+}
+
+#[test]
+fn basics_offset_remainder() {
+    let mut a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000011u8, 0b10000001u8], 1, 15);
+    assert_eq!(a.next(), None);
+    assert_eq!(a.remainder(), 0b1000_0001_1000_0000u16);
+    assert_eq!(a.remainder_len(), 15);
+}
+
+#[test]
+fn offset_remainder_saturating() {
+    let a = BitChunks::<u16>::new(&[0b00000001u8, 0b00000011u8, 0b00000011u8], 1, 17);
+    assert_eq!(a.remainder(), 0b0000_0000_0000_0001u16);
+}
+
+#[test]
+fn offset_remainder_saturating2() {
+    let a = BitChunks::<u64>::new(&[0b01001001u8, 0b00000001], 1, 8);
+    assert_eq!(a.remainder(), 0b1010_0100u64);
+}
+
+#[test]
+fn offset_remainder_saturating3() {
+    let input: &[u8] = &[0b01000000, 0b01000001];
+    let a = BitChunks::<u64>::new(input, 8, 2);
+    assert_eq!(a.remainder(), 0b0100_0001u64);
+}
+
+#[test]
+fn basics_multiple() {
+    let mut iter = BitChunks::<u16>::new(
+        &[0b00000001u8, 0b00000010u8, 0b00000100u8, 0b00001000u8],
+        0,
+        4 * 8,
+    );
+    assert_eq!(iter.next().unwrap(), 0b0000_0010_0000_0001u16);
+    assert_eq!(iter.next().unwrap(), 0b0000_1000_0000_0100u16);
+    assert_eq!(iter.remainder(), 0);
+}
+
+#[test]
+fn basics_multiple_offset() {
+    let mut iter = BitChunks::<u16>::new(
+        &[
+            0b00000001u8,
+            0b00000010u8,
+            0b00000100u8,
+            0b00001000u8,
+            0b00000001u8,
+        ],
+        1,
+        4 * 8,
+    );
+    assert_eq!(iter.next().unwrap(), 0b0000_0001_0000_0000u16);
+    assert_eq!(iter.next().unwrap(), 0b1000_0100_0000_0010u16);
+    assert_eq!(iter.remainder(), 0);
+}
+
+#[test]
+fn remainder_large() {
+    let input: &[u8] = &[
+        0b00100100, 0b01001001, 0b10010010, 0b00100100, 0b01001001, 0b10010010, 0b00100100,
+        0b01001001, 0b10010010, 0b00100100, 0b01001001, 0b10010010, 0b00000100,
+    ];
+    let mut iter = BitChunks::<u8>::new(input, 0, 8 * 12 + 4);
+    assert_eq!(iter.remainder_len(), 100 - 96);
+
+    for j in 0..12 {
+        let mut a = BitChunkIter::new(iter.next().unwrap(), 8);
+        for i in 0..8 {
+            assert_eq!(a.next().unwrap(), (j * 8 + i + 1) % 3 == 0);
+        }
+    }
+    assert_eq!(None, iter.next());
+
+    let expected_remainder = 0b00000100u8;
+    assert_eq!(iter.remainder(), expected_remainder);
+
+    let mut a = BitChunkIter::new(expected_remainder, 8);
+    for i in 0..4 {
+        assert_eq!(a.next().unwrap(), (i + 1) % 3 == 0);
+    }
+}
+
+#[test]
+fn basics_1() {
+    let mut iter = BitChunks::<u16>::new(
+        &[0b00000001u8, 0b00000010u8, 0b00000100u8, 0b00001000u8],
+        8,
+        3 * 8,
+    );
+    assert_eq!(iter.next().unwrap(), 0b0000_0100_0000_0010u16);
+    assert_eq!(iter.next(), None);
+    assert_eq!(iter.remainder(), 0b0000_0000_0000_1000u16);
+    assert_eq!(iter.remainder_len(), 8);
+}
+
+#[test]
+fn basics_2() {
+    let mut iter = BitChunks::<u16>::new(
+        &[0b00000001u8, 0b00000010u8, 0b00000100u8, 0b00001000u8],
+        7,
+        3 * 8,
+    );
+    assert_eq!(iter.remainder(), 0b0000_0000_0001_0000u16);
+    assert_eq!(iter.next().unwrap(), 0b0000_1000_0000_0100u16);
+    assert_eq!(iter.next(), None);
+}
+
+#[test]
+fn remainder_1() {
+    let mut iter = BitChunks::<u64>::new(&[0b11111111u8, 0b00000001u8], 0, 9);
+    assert_eq!(iter.next(), None);
+    assert_eq!(iter.remainder(), 0b1_1111_1111u64);
+}
+
+#[test]
+fn remainder_2() {
+    // (i % 3 == 0) in bitmap
+    let input: &[u8] = &[
+        0b01001001, 0b10010010, 0b00100100, 0b01001001, 0b10010010, 0b00100100, 0b01001001,
+        0b10010010, 0b00100100, 0b01001001, /* 73 */
+        0b10010010, /* 146 */
+        0b00100100, 0b00001001,
+    ];
+    let offset = 10; // 8 + 2
+    let length = 90;
+
+    let mut iter = BitChunks::<u64>::new(input, offset, length);
+    let first: u64 = 0b0100100100100100100100100100100100100100100100100100100100100100;
+    assert_eq!(first, iter.next().unwrap());
+    assert_eq!(iter.next(), None);
+    assert_eq!(iter.remainder(), 0b10010010010010010010010010u64);
+}
diff --git a/tests/it/bitmap/utils/iterator.rs b/tests/it/bitmap/utils/iterator.rs
new file mode 100644
index 00000000000..1f1d56d39d0
--- /dev/null
+++ b/tests/it/bitmap/utils/iterator.rs
@@ -0,0 +1,44 @@
+use arrow2::bitmap::utils::BitmapIter;
+
+#[test]
+fn basic() {
+    let values = &[0b01011011u8];
+    let iter = BitmapIter::new(values, 0, 6);
+    let result = iter.collect::<Vec<_>>();
+    assert_eq!(result, vec![true, true, false, true, true, false])
+}
+
+#[test]
+fn large() {
+    let values = &[0b01011011u8];
+    let values = std::iter::repeat(values)
+        .take(63)
+        .flatten()
+        .copied()
+        .collect::<Vec<u8>>();
+    let len = 63 * 8;
+    let iter = BitmapIter::new(&values, 0, len);
+    assert_eq!(iter.count(), len);
+}
+
+#[test]
+fn offset() {
+    let values = &[0b01011011u8];
+    let iter = BitmapIter::new(values, 2, 4);
+    let result = iter.collect::<Vec<_>>();
+    assert_eq!(result, vec![false, true, true, false])
+}
+
+#[test]
+fn rev() {
+    let values = &[0b01011011u8, 0b01011011u8];
+    let iter = BitmapIter::new(values, 2, 13);
+    let result = iter.rev().collect::<Vec<_>>();
+    assert_eq!(
+        result,
+        vec![false, true, true, false, true, false, true, true, false, true, true, false, true]
+            .into_iter()
+            .rev()
+            .collect::<Vec<_>>()
+    )
+}
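The `offset` test above pins down the bit order: `BitmapIter` yields bits LSB-first within each byte, starting at the given bit offset. A one-line equivalent of that test under the same convention (a free-standing sketch, not crate code):

    let byte = 0b01011011u8;
    let bits: Vec<bool> = (2..2 + 4).map(|i| (byte >> i) & 1 == 1).collect();
    assert_eq!(bits, vec![false, true, true, false]);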
diff --git a/tests/it/bitmap/utils/mod.rs b/tests/it/bitmap/utils/mod.rs
new file mode 100644
index 00000000000..cc30620dae4
--- /dev/null
+++ b/tests/it/bitmap/utils/mod.rs
@@ -0,0 +1,68 @@
+use arrow2::bitmap::utils::*;
+
+mod chunk_iter;
+mod iterator;
+mod slice_iterator;
+mod zip_validity;
+
+#[test]
+fn get_bit_basics() {
+    let input: &[u8] = &[
+        0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000,
+        0b01000000, 0b11111111,
+    ];
+    for i in 0..8 {
+        assert!(!get_bit(input, i));
+    }
+    assert!(get_bit(input, 8));
+    for i in 8 + 1..2 * 8 {
+        assert!(!get_bit(input, i));
+    }
+    assert!(get_bit(input, 2 * 8 + 1));
+    for i in 2 * 8 + 2..3 * 8 {
+        assert!(!get_bit(input, i));
+    }
+    assert!(get_bit(input, 3 * 8 + 2));
+    for i in 3 * 8 + 3..4 * 8 {
+        assert!(!get_bit(input, i));
+    }
+    assert!(get_bit(input, 4 * 8 + 3));
+}
+
+#[test]
+fn null_count_basics() {
+    let input: &[u8] = &[
+        0b01001001, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000,
+        0b01000000, 0b11111111,
+    ];
+    assert_eq!(null_count(input, 0, 8), 8 - 3);
+    assert_eq!(null_count(input, 1, 7), 7 - 2);
+    assert_eq!(null_count(input, 1, 8), 8 - 3);
+    assert_eq!(null_count(input, 2, 7), 7 - 3);
+    assert_eq!(null_count(input, 0, 32), 32 - 6);
+    assert_eq!(null_count(input, 9, 2), 2);
+
+    let input: &[u8] = &[0b01000000, 0b01000001];
+    assert_eq!(null_count(input, 8, 2), 1);
+    assert_eq!(null_count(input, 8, 3), 2);
+    assert_eq!(null_count(input, 8, 4), 3);
+    assert_eq!(null_count(input, 8, 5), 4);
+    assert_eq!(null_count(input, 8, 6), 5);
+    assert_eq!(null_count(input, 8, 7), 5);
+    assert_eq!(null_count(input, 8, 8), 6);
+
+    let input: &[u8] = &[0b01000000, 0b01010101];
+    assert_eq!(null_count(input, 9, 2), 1);
+    assert_eq!(null_count(input, 10, 2), 1);
+    assert_eq!(null_count(input, 11, 2), 1);
+    assert_eq!(null_count(input, 12, 2), 1);
+    assert_eq!(null_count(input, 13, 2), 1);
+    assert_eq!(null_count(input, 14, 2), 1);
+}
+
+#[test]
+fn null_count_1() {
+    // offset = 10, len = 90 => remainder
+    let input: &[u8] = &[73, 146, 36, 73, 146, 36, 73, 146, 36, 73, 146, 36, 9];
+    assert_eq!(null_count(input, 10, 90), 60);
+}
diff --git a/tests/it/bitmap/utils/slice_iterator.rs b/tests/it/bitmap/utils/slice_iterator.rs
new file mode 100644
index 00000000000..47bfcf2fbd0
--- /dev/null
+++ b/tests/it/bitmap/utils/slice_iterator.rs
@@ -0,0 +1,144 @@
+use arrow2::bitmap::utils::SlicesIterator;
+use arrow2::bitmap::Bitmap;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn check_invariant() {
+        let values = (0..8).map(|i| i % 2 != 0).collect::<Bitmap>();
+        let iter = SlicesIterator::new(&values);
+
+        let slots = iter.slots();
+
+        let slices = iter.collect::<Vec<_>>();
+
+        assert_eq!(slices, vec![(1, 1), (3, 1), (5, 1), (7, 1)]);
+
+        let mut sum = 0;
+        for (_, len) in slices {
+            sum += len;
+        }
+        assert_eq!(sum, slots);
+    }
+
+    #[test]
+    fn single_set() {
+        let values = (0..16).map(|i| i == 1).collect::<Bitmap>();
+
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        let chunks = iter.collect::<Vec<_>>();
+
+        assert_eq!(chunks, vec![(1, 1)]);
+        assert_eq!(count, 1);
+    }
+
+    #[test]
+    fn single_unset() {
+        let values = (0..64).map(|i| i != 1).collect::<Bitmap>();
+
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        let chunks = iter.collect::<Vec<_>>();
+
+        assert_eq!(chunks, vec![(0, 1), (2, 62)]);
+        assert_eq!(count, 64 - 1);
+    }
+
+    #[test]
+    fn generic() {
+        let values = (0..130).map(|i| i % 62 != 0).collect::<Bitmap>();
+
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        let chunks = iter.collect::<Vec<_>>();
+
+        assert_eq!(chunks, vec![(1, 61), (63, 61), (125, 5)]);
+        assert_eq!(count, 61 + 61 + 5);
+    }
+
+    #[test]
+    fn incomplete_byte() {
+        let values = (0..6).map(|i| i == 1).collect::<Bitmap>();
+
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        let chunks = iter.collect::<Vec<_>>();
+
+        assert_eq!(chunks, vec![(1, 1)]);
+        assert_eq!(count, 1);
+    }
+
+    #[test]
+    fn incomplete_byte1() {
+        let values = (0..12).map(|i| i == 9).collect::<Bitmap>();
+
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        let chunks = iter.collect::<Vec<_>>();
+
+        assert_eq!(chunks, vec![(9, 1)]);
+        assert_eq!(count, 1);
+    }
+
+    #[test]
+    fn end_of_byte() {
+        let values = (0..16).map(|i| i != 7).collect::<Bitmap>();
+
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        let chunks = iter.collect::<Vec<_>>();
+
+        assert_eq!(chunks, vec![(0, 7), (8, 8)]);
+        assert_eq!(count, 15);
+    }
+
+    #[test]
+    fn bla() {
+        let values = vec![true, true, true, true, true, true, true, false]
+            .into_iter()
+            .collect::<Bitmap>();
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        assert_eq!(values.null_count() + iter.slots(), values.len());
+
+        let total = iter.into_iter().fold(0, |acc, x| acc + x.1);
+
+        assert_eq!(count, total);
+    }
+
+    #[test]
+    fn past_end_should_not_be_returned() {
+        let values = Bitmap::from_u8_slice(&[0b11111010], 3);
+        let iter = SlicesIterator::new(&values);
+        let count = iter.slots();
+        assert_eq!(values.null_count() + iter.slots(), values.len());
+
+        let total = iter.into_iter().fold(0, |acc, x| acc + x.1);
+
+        assert_eq!(count, total);
+    }
+
+    #[test]
+    fn sliced() {
+        let values = Bitmap::from_u8_slice(&[0b11111010, 0b11111011], 16);
+        let values = values.slice(8, 2);
+        let iter = SlicesIterator::new(&values);
+
+        let chunks = iter.collect::<Vec<_>>();
+
+        // the first "11" in the second byte
+        assert_eq!(chunks, vec![(0, 2)]);
+    }
+
+    #[test]
+    fn remainder_1() {
+        let values = Bitmap::from_u8_slice(&[0, 0, 0b00000000, 0b00010101], 27);
+        let values = values.slice(22, 5);
+        let iter = SlicesIterator::new(&values);
+        let chunks = iter.collect::<Vec<_>>();
+        assert_eq!(chunks, vec![(2, 1), (4, 1)]);
+    }
+}
diff --git a/tests/it/bitmap/utils/zip_validity.rs b/tests/it/bitmap/utils/zip_validity.rs
new file mode 100644
index 00000000000..b49ec623810
--- /dev/null
+++ b/tests/it/bitmap/utils/zip_validity.rs
@@ -0,0 +1,105 @@
+use arrow2::bitmap::{utils::zip_validity, Bitmap};
+
+#[test]
+fn basic() {
+    let a = Some(Bitmap::from([true, false]));
+    let values = vec![0, 1];
+    let zip = zip_validity(values.into_iter(), &a);
+
+    let a = zip.collect::<Vec<_>>();
+    assert_eq!(a, vec![Some(0), None]);
+}
+
+#[test]
+fn complete() {
+    let a = Some(Bitmap::from([
+        true, false, true, false, true, false, true, false,
+    ]));
+    let values = vec![0, 1, 2, 3, 4, 5, 6, 7];
+    let zip = zip_validity(values.into_iter(), &a);
+
+    let a = zip.collect::<Vec<_>>();
+    assert_eq!(
+        a,
+        vec![Some(0), None, Some(2), None, Some(4), None, Some(6), None]
+    );
+}
+
+#[test]
+fn slices() {
+    let a = Some(Bitmap::from([true, false]));
+    let offsets = vec![0, 2, 3];
+    let values = vec![1, 2, 3];
+    let iter = offsets.windows(2).map(|x| {
+        let start = x[0];
+        let end = x[1];
+        &values[start..end]
+    });
+    let zip = zip_validity(iter, &a);
+
+    let a = zip.collect::<Vec<_>>();
+    assert_eq!(a, vec![Some([1, 2].as_ref()), None]);
+}
+
+#[test]
+fn byte() {
+    let a = Some(Bitmap::from([
+        true, false, true, false, false, true, true, false, true,
+    ]));
+    let values = vec![0, 1, 2, 3, 4, 5, 6, 7, 8];
+    let zip = zip_validity(values.into_iter(), &a);
+
+    let a = zip.collect::<Vec<_>>();
+    assert_eq!(
+        a,
+        vec![
+            Some(0),
+            None,
+            Some(2),
+            None,
+            None,
+            Some(5),
+            Some(6),
+            None,
+            Some(8)
+        ]
+    );
+}
+
+#[test]
+fn offset() {
+    let a = Bitmap::from([true, false, true, false, false, true, true, false, true]);
+    let a = Some(a.slice(1, 8));
+    let values = vec![0, 1, 2, 3, 4, 5, 6, 7];
+    let zip = zip_validity(values.into_iter(), &a);
+
+    let a = zip.collect::<Vec<_>>();
+    assert_eq!(
+        a,
+        vec![None, Some(1), None, None, Some(4), Some(5), None, Some(7)]
+    );
+}
+
+#[test]
+fn none() {
+    let values = vec![0, 1, 2];
+    let zip = zip_validity(values.into_iter(), &None);
+
+    let a = zip.collect::<Vec<_>>();
+    assert_eq!(a, vec![Some(0), Some(1), Some(2)]);
+}
+
+#[test]
+fn rev() {
+    let a = Bitmap::from([true, false, true, false, false, true, true, false, true]);
+    let a = Some(a.slice(1, 8));
+    let values = vec![0, 1, 2, 3, 4, 5, 6, 7];
+    let zip = zip_validity(values.into_iter(), &a);
+
+    let result = zip.rev().collect::<Vec<_>>();
+    let expected = vec![None, Some(1), None, None, Some(4), Some(5), None, Some(7)]
+        .into_iter()
+        .rev()
+        .collect::<Vec<_>>();
+    assert_eq!(result, expected);
+}
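These tests fix the contract of `zip_validity`: with `Some(bitmap)` it yields `Some(value)` where the bit is set and `None` where it is cleared, and with `None` it passes every value through as `Some`. A minimal usage sketch built from the same API as the tests:

    use arrow2::bitmap::{utils::zip_validity, Bitmap};

    let validity = Some(Bitmap::from([true, false, true]));
    let zipped: Vec<Option<i32>> = zip_validity(vec![10, 20, 30].into_iter(), &validity).collect();
    assert_eq!(zipped, vec![Some(10), None, Some(30)]);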
diff --git a/tests/it/buffer/immutable.rs b/tests/it/buffer/immutable.rs
new file mode 100644
index 00000000000..db969849053
--- /dev/null
+++ b/tests/it/buffer/immutable.rs
@@ -0,0 +1,69 @@
+use arrow2::buffer::Buffer;
+
+#[test]
+fn new() {
+    let buffer = Buffer::<i32>::new();
+    assert_eq!(buffer.len(), 0);
+    assert!(buffer.is_empty());
+}
+
+#[test]
+fn new_zeroed() {
+    let buffer = Buffer::<i32>::new_zeroed(2);
+    assert_eq!(buffer.len(), 2);
+    assert!(!buffer.is_empty());
+    assert_eq!(buffer.as_slice(), &[0, 0]);
+}
+
+#[test]
+fn from_slice() {
+    let buffer = Buffer::<i32>::from(&[0, 1, 2]);
+    assert_eq!(buffer.len(), 3);
+    assert_eq!(buffer.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn slice() {
+    let buffer = Buffer::<i32>::from(&[0, 1, 2, 3]);
+    let buffer = buffer.slice(1, 2);
+    assert_eq!(buffer.len(), 2);
+    assert_eq!(buffer.as_slice(), &[1, 2]);
+}
+
+#[test]
+fn from_iter() {
+    let buffer = (0..3).collect::<Buffer<i32>>();
+    assert_eq!(buffer.len(), 3);
+    assert_eq!(buffer.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn from_trusted_len_iter() {
+    let buffer = unsafe { Buffer::<i32>::from_trusted_len_iter_unchecked(0..3) };
+    assert_eq!(buffer.len(), 3);
+    assert_eq!(buffer.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn try_from_trusted_len_iter() {
+    let iter = (0..3).map(Result::<_, String>::Ok);
+    let buffer = unsafe { Buffer::<i32>::try_from_trusted_len_iter_unchecked(iter) }.unwrap();
+    assert_eq!(buffer.len(), 3);
+    assert_eq!(buffer.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn as_ptr() {
+    let buffer = Buffer::<i32>::from(&[0, 1, 2, 3]);
+    let buffer = buffer.slice(1, 2);
+    let ptr = buffer.as_ptr();
+    assert_eq!(unsafe { *ptr }, 1);
+}
+
+#[test]
+fn debug() {
+    let buffer = Buffer::<i32>::from(&[0, 1, 2, 3]);
+    let buffer = buffer.slice(1, 2);
+    let a = format!("{:?}", buffer);
+    assert_eq!(a, "[1, 2]")
+}
diff --git a/tests/it/buffer/mod.rs b/tests/it/buffer/mod.rs
new file mode 100644
index 00000000000..2ad875845b5
--- /dev/null
+++ b/tests/it/buffer/mod.rs
@@ -0,0 +1,2 @@
+mod immutable;
+mod mutable;
diff --git a/tests/it/buffer/mutable.rs b/tests/it/buffer/mutable.rs
new file mode 100644
index 00000000000..cb6134b0807
--- /dev/null
+++ b/tests/it/buffer/mutable.rs
@@ -0,0 +1,142 @@
+use arrow2::buffer::{Buffer, MutableBuffer};
+
+#[test]
+fn default() {
+    let b = MutableBuffer::<i32>::default();
+    assert_eq!(b.len(), 0);
+    assert!(b.is_empty());
+}
+
+#[test]
+fn with_capacity() {
+    let b = MutableBuffer::<i32>::with_capacity(6);
+    assert!(b.capacity() >= 6);
+    assert!(b.is_empty());
+}
+
+#[test]
+fn from_len_zeroed() {
+    let b = MutableBuffer::<i32>::from_len_zeroed(3);
+    assert_eq!(b.len(), 3);
+    assert!(!b.is_empty());
+    assert_eq!(b.as_slice(), &[0, 0, 0]);
+}
+
+#[test]
+fn resize() {
+    let mut b = MutableBuffer::<i32>::new();
+    b.resize(3, 1);
+    assert_eq!(b.len(), 3);
+    assert_eq!(b.as_slice(), &[1, 1, 1]);
+    assert_eq!(b.as_mut_slice(), &[1, 1, 1]);
+}
+
+// branch that uses alloc_zeroed
+#[test]
+fn resize_from_zero() {
+    let mut b = MutableBuffer::<i32>::new();
+    b.resize(3, 0);
+    assert_eq!(b.len(), 3);
+    assert_eq!(b.as_slice(), &[0, 0, 0]);
+}
+
+#[test]
+fn resize_smaller() {
+    let mut b = MutableBuffer::<i32>::from_len_zeroed(3);
+    b.resize(2, 1);
+    assert_eq!(b.len(), 2);
+    assert_eq!(b.as_slice(), &[0, 0]);
+}
+
+#[test]
+fn extend_from_slice() {
+    let mut b = MutableBuffer::<i32>::from_len_zeroed(1);
+    b.extend_from_slice(&[1, 2]);
+    assert_eq!(b.len(), 3);
+    assert_eq!(b.as_slice(), &[0, 1, 2]);
+
+    assert_eq!(unsafe { *b.as_ptr() }, 0);
+    assert_eq!(unsafe { *b.as_mut_ptr() }, 0);
+}
+
+#[test]
+fn push() {
+    let mut b = MutableBuffer::<i32>::new();
+    for _ in 0..17 {
+        b.push(1);
+    }
+    assert_eq!(b.len(), 17);
+}
+
+#[test]
+fn capacity() {
+    let b = MutableBuffer::<f32>::with_capacity(10);
+    assert_eq!(b.capacity(), 64 / std::mem::size_of::<f32>());
+    let b = MutableBuffer::<f32>::with_capacity(16);
+    assert_eq!(b.capacity(), 16);
+
+    let b = MutableBuffer::<f32>::with_capacity(64);
+    assert!(b.capacity() >= 64);
+
+    let mut b = MutableBuffer::<f32>::with_capacity(16);
+    b.reserve(4);
+    assert_eq!(b.capacity(), 16);
+    b.extend_from_slice(&[0.1; 16]);
+    b.reserve(4);
+    assert_eq!(b.capacity(), 32);
+}
+
+#[test]
+fn extend() {
+    let mut b = MutableBuffer::<i32>::new();
+    b.extend(0..3);
+    assert_eq!(b.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn extend_constant() {
+    let mut b = MutableBuffer::<i32>::new();
+    b.extend_constant(3, 1);
+    assert_eq!(b.as_slice(), &[1, 1, 1]);
+}
+
+#[test]
+fn from_iter() {
+    let b = (0..3).collect::<MutableBuffer<i32>>();
+    assert_eq!(b.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn from_as_ref() {
+    let b = MutableBuffer::<i32>::from(&[0, 1, 2]);
+    assert_eq!(b.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn from_trusted_len_iter() {
+    let b = unsafe { MutableBuffer::<i32>::from_trusted_len_iter_unchecked(0..3) };
+    assert_eq!(b.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn try_from_trusted_len_iter() {
+    let iter = (0..3).map(Result::<_, String>::Ok);
+    let buffer =
+        unsafe { MutableBuffer::<i32>::try_from_trusted_len_iter_unchecked(iter) }.unwrap();
+    assert_eq!(buffer.len(), 3);
+    assert_eq!(buffer.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn to_buffer() {
+    let b = (0..3).collect::<MutableBuffer<i32>>();
+    let b: Buffer<i32> = b.into();
+    assert_eq!(b.as_slice(), &[0, 1, 2]);
+}
+
+#[test]
+fn debug() {
+    let buffer = MutableBuffer::<i32>::from(&[0, 1, 2, 3]);
+    let a = format!("{:?}", buffer);
+    assert_eq!(a, "[0, 1, 2, 3]")
+}
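The `capacity` test encodes the allocator's rounding: a `MutableBuffer` reserves whole 64-byte cache lines, so the smallest non-empty capacity is `64 / size_of::<T>()` elements, and growth doubles from there. For the 4-byte element type assumed in the reconstruction above (the `<f32>` parameter is inferred from the `[0.1; 16]` literal, not preserved in the source):

    let min = 64 / std::mem::size_of::<f32>(); // one cache line
    assert_eq!(min, 16); // matches with_capacity(10) rounding up to 16 elements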
diff --git a/tests/it/ffi.rs b/tests/it/ffi.rs
new file mode 100644
index 00000000000..ec45c79a7f5
--- /dev/null
+++ b/tests/it/ffi.rs
@@ -0,0 +1,132 @@
+use arrow2::array::*;
+use arrow2::datatypes::{DataType, TimeUnit};
+use arrow2::ffi::try_from;
+use arrow2::{error::Result, ffi};
+use std::sync::Arc;
+
+fn test_release(expected: impl Array + 'static) -> Result<()> {
+    // create a `ArrowArray` from the data.
+    let b: Arc<dyn Array> = Arc::new(expected);
+
+    // export the array as 2 pointers.
+    let _ = ffi::export_to_c(b)?;
+
+    Ok(())
+}
+
+fn test_round_trip(expected: impl Array + Clone + 'static) -> Result<()> {
+    let b: Arc<dyn Array> = Arc::new(expected.clone());
+    let expected = Box::new(expected) as Box<dyn Array>;
+
+    // create a `ArrowArray` from the data.
+    let array = Arc::new(ffi::export_to_c(b)?);
+
+    let (_, _) = array.references();
+
+    let result = try_from(array)?;
+
+    assert_eq!(&result, &expected);
+    Ok(())
+}
+
+#[test]
+fn test_u32() -> Result<()> {
+    let data = Int32Array::from(&[Some(2), None, Some(1), None]);
+    test_release(data)
+}
+
+#[test]
+fn test_u64() -> Result<()> {
+    let data = UInt64Array::from(&[Some(2), None, Some(1), None]);
+    test_round_trip(data)
+}
+
+#[test]
+fn test_i64() -> Result<()> {
+    let data = Int64Array::from(&[Some(2), None, Some(1), None]);
+    test_round_trip(data)
+}
+
+#[test]
+fn test_utf8() -> Result<()> {
+    let data = Utf8Array::<i32>::from(&vec![Some("a"), None, Some("bb"), None]);
+    test_round_trip(data)
+}
+
+#[test]
+fn test_large_utf8() -> Result<()> {
+    let data = Utf8Array::<i64>::from(&vec![Some("a"), None, Some("bb"), None]);
+    test_round_trip(data)
+}
+
+#[test]
+fn test_binary() -> Result<()> {
+    let data =
+        BinaryArray::<i32>::from(&vec![Some(b"a".as_ref()), None, Some(b"bb".as_ref()), None]);
+    test_round_trip(data)
+}
+
+#[test]
+fn test_timestamp_tz() -> Result<()> {
+    let data = Int64Array::from(&vec![Some(2), None, None]).to(DataType::Timestamp(
+        TimeUnit::Second,
+        Some("UTC".to_string()),
+    ));
+    test_round_trip(data)
+}
+
+#[test]
+fn test_large_binary() -> Result<()> {
+    let data =
+        BinaryArray::<i64>::from(&vec![Some(b"a".as_ref()), None, Some(b"bb".as_ref()), None]);
+    test_round_trip(data)
+}
+
+#[test]
+fn test_list() -> Result<()> {
+    let data = vec![
+        Some(vec![Some(1i32), Some(2), Some(3)]),
+        None,
+        Some(vec![Some(4), None, Some(6)]),
+    ];
+
+    let mut array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
+    array.try_extend(data)?;
+
+    let array: ListArray<i32> = array.into();
+
+    test_round_trip(array)
+}
+
+#[test]
+fn test_list_list() -> Result<()> {
+    let data = vec![
+        Some(vec![
+            Some(vec![None]),
+            Some(vec![Some(2)]),
+            Some(vec![Some(3)]),
+        ]),
+        None,
+        Some(vec![Some(vec![Some(4), None, Some(6)])]),
+    ];
+
+    let mut array =
+        MutableListArray::<i32, MutableListArray<i32, MutablePrimitiveArray<i32>>>::new();
+    array.try_extend(data)?;
+
+    let array: ListArray<i32> = array.into();
+
+    test_round_trip(array)
+}
+
+#[test]
+fn test_dict() -> Result<()> {
+    let data = vec![Some("a"), Some("a"), None, Some("b")];
+
+    let mut array = MutableDictionaryArray::<i32, MutableUtf8Array<i32>>::new();
+    array.try_extend(data)?;
+
+    let array: DictionaryArray<i32> = array.into();
+
+    test_round_trip(array)
+}
diff --git a/tests/it/io/ipc/common.rs b/tests/it/io/ipc/common.rs
new file mode 100644
index 00000000000..f9f2be91fe5
--- /dev/null
+++ b/tests/it/io/ipc/common.rs
@@ -0,0 +1,67 @@
+use std::{collections::HashMap, convert::TryFrom, fs::File, io::Read};
+
+use arrow2::{
+    datatypes::Schema,
+    error::Result,
+    io::ipc::read::read_stream_metadata,
+    io::ipc::read::StreamReader,
+    io::json_integration::{to_record_batch, ArrowJson},
+    record_batch::RecordBatch,
+};
+
+use flate2::read::GzDecoder;
+
+/// Read gzipped JSON file
+pub fn read_gzip_json(version: &str, file_name: &str) -> (Schema, Vec<RecordBatch>) {
+    let testdata = crate::test_util::arrow_test_data();
+    let file = File::open(format!(
+        "{}/arrow-ipc-stream/integration/{}/{}.json.gz",
+        testdata, version, file_name
+    ))
+    .unwrap();
+    let mut gz = GzDecoder::new(&file);
+    let mut s = String::new();
+    gz.read_to_string(&mut s).unwrap();
+    // convert to Arrow JSON
+    let arrow_json: ArrowJson = serde_json::from_str(&s).unwrap();
+
+    let schema = serde_json::to_value(arrow_json.schema).unwrap();
+    let schema = Schema::try_from(&schema).unwrap();
+
+    // read dictionaries
+    let mut dictionaries = HashMap::new();
+    if let Some(dicts) = arrow_json.dictionaries {
+        for json_dict in dicts {
+            // TODO: convert to a concrete Arrow type
+            dictionaries.insert(json_dict.id, json_dict);
+        }
+    }
+
+    let batches = arrow_json
+        .batches
+        .iter()
+        .map(|batch| to_record_batch(&schema, batch, &dictionaries))
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    (schema, batches)
+}
+
+pub fn read_arrow_stream(version: &str, file_name: &str) -> (Schema, Vec<RecordBatch>) {
+    let testdata = crate::test_util::arrow_test_data();
+    let mut file = File::open(format!(
+        "{}/arrow-ipc-stream/integration/{}/{}.stream",
+        testdata, version, file_name
+    ))
+    .unwrap();
+
+    let metadata = read_stream_metadata(&mut file).unwrap();
+    let reader = StreamReader::new(file, metadata);
+
+    let schema = reader.schema();
+
+    (
+        schema.as_ref().clone(),
+        reader.collect::<Result<_>>().unwrap(),
+    )
+}
diff --git a/tests/it/io/ipc/mod.rs b/tests/it/io/ipc/mod.rs
new file mode 100644
index 00000000000..fd285864a7a
--- /dev/null
+++ b/tests/it/io/ipc/mod.rs
@@ -0,0 +1,5 @@
+mod common;
+mod read;
+mod write;
+
+pub use common::read_gzip_json;
diff --git a/tests/it/io/ipc/read/file.rs b/tests/it/io/ipc/read/file.rs
new file mode 100644
index 00000000000..08e45b68c27
--- /dev/null
+++ b/tests/it/io/ipc/read/file.rs
@@ -0,0 +1,159 @@
+use std::fs::File;
+
+use arrow2::error::Result;
+use arrow2::io::ipc::read::*;
+
+use super::super::common::read_gzip_json;
+
+fn test_file(version: &str, file_name: &str) -> Result<()> {
+    let testdata = crate::test_util::arrow_test_data();
+    let mut file = File::open(format!(
+        "{}/arrow-ipc-stream/integration/{}/{}.arrow_file",
+        testdata, version, file_name
+    ))?;
+
+    let metadata = read_file_metadata(&mut file)?;
+    let reader = FileReader::new(&mut file, metadata, None);
+
+    // read expected JSON output
+    let (schema, batches) = read_gzip_json(version, file_name);
+
+    assert_eq!(&schema, reader.schema().as_ref());
+
+    batches.iter().zip(reader).try_for_each(|(lhs, rhs)| {
+        assert_eq!(lhs, &rhs?);
+        Result::Ok(())
+    })?;
+    Ok(())
+}
+
+#[test]
+fn read_generated_100_primitive() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive")?;
+    test_file("1.0.0-bigendian", "generated_primitive")
+}
+
+#[test]
+fn read_generated_100_primitive_large_offsets() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_large_offsets")?;
+    test_file("1.0.0-bigendian", "generated_primitive_large_offsets")
+}
+
+#[test]
+fn read_generated_100_datetime() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_datetime")?;
+    test_file("1.0.0-bigendian", "generated_datetime")
+}
+
+#[test]
+fn read_generated_100_null_trivial() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_null_trivial")?;
+    test_file("1.0.0-bigendian", "generated_null_trivial")
+}
+
+#[test]
+fn read_generated_100_null() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_null")?;
+    test_file("1.0.0-bigendian", "generated_null")
+}
+
+#[test]
+fn read_generated_100_primitive_zerolength() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_zerolength")?;
+    test_file("1.0.0-bigendian", "generated_primitive_zerolength")
+}
+
+#[test]
+fn read_generated_100_primitive_primitive_no_batches() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_no_batches")?;
+    test_file("1.0.0-bigendian", "generated_primitive_no_batches")
+}
+
+#[test]
+fn read_generated_100_dictionary() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_dictionary")?;
+    test_file("1.0.0-bigendian", "generated_dictionary")
+}
+
+#[test]
+fn read_100_custom_metadata() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_custom_metadata")?;
+    test_file("1.0.0-bigendian", "generated_custom_metadata")
+}
+
+#[test]
+fn read_generated_100_nested_large_offsets() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_nested_large_offsets")?;
+    test_file("1.0.0-bigendian", "generated_nested_large_offsets")
+}
+
+#[test]
+fn read_generated_100_nested() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_nested")?;
+    test_file("1.0.0-bigendian", "generated_nested")
+}
+
+#[test]
+fn read_generated_100_dictionary_unsigned() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_dictionary_unsigned")?;
+    test_file("1.0.0-bigendian", "generated_dictionary_unsigned")
+}
+
+#[test]
+fn read_generated_100_decimal() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_decimal")?;
+    test_file("1.0.0-bigendian", "generated_decimal")
+}
+
+#[test]
+fn read_generated_100_interval() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_interval")?;
+    test_file("1.0.0-bigendian", "generated_interval")
+}
+
+#[test]
+fn read_generated_100_union() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_union")?;
+    test_file("1.0.0-bigendian", "generated_union")
+}
+
+#[test]
+fn read_generated_017_union() -> Result<()> {
+    test_file("0.17.1", "generated_union")
+}
+
+#[test]
+fn read_generated_200_compression_lz4() -> Result<()> {
+    test_file("2.0.0-compression", "generated_lz4")
+}
+
+#[test]
+fn read_generated_200_compression_zstd() -> Result<()> {
+    test_file("2.0.0-compression", "generated_zstd")
+}
+
+fn test_projection(version: &str, file_name: &str, column: usize) -> Result<()> {
+    let testdata = crate::test_util::arrow_test_data();
+    let mut file = File::open(format!(
+        "{}/arrow-ipc-stream/integration/{}/{}.arrow_file",
+        testdata, version, file_name
+    ))?;
+
+    let metadata = read_file_metadata(&mut file)?;
+    let mut reader = FileReader::new(&mut file, metadata, Some(vec![column]));
+
+    assert_eq!(reader.schema().fields().len(), 1);
+
+    reader.try_for_each(|rhs| {
+        assert_eq!(rhs?.num_columns(), 1);
+        Result::Ok(())
+    })?;
+    Ok(())
+}
+
+#[test]
+fn read_projected() -> Result<()> {
+    test_projection("1.0.0-littleendian", "generated_primitive", 1)?;
+    test_projection("1.0.0-littleendian", "generated_dictionary", 2)?;
+    test_projection("1.0.0-littleendian", "generated_nested", 1)
+}
diff --git a/tests/it/io/ipc/read/mod.rs b/tests/it/io/ipc/read/mod.rs
new file mode 100644
index 00000000000..6d1510b36c4
--- /dev/null
+++ b/tests/it/io/ipc/read/mod.rs
@@ -0,0 +1,2 @@
+mod file;
+mod stream;
diff --git a/tests/it/io/ipc/read/stream.rs b/tests/it/io/ipc/read/stream.rs
new file mode 100644
index 00000000000..9aa5e31a1bc
--- /dev/null
+++ b/tests/it/io/ipc/read/stream.rs
@@ -0,0 +1,101 @@
+use std::fs::File;
+
+use arrow2::error::Result;
+use arrow2::io::ipc::read::*;
+
+use crate::io::ipc::common::read_gzip_json;
+
+fn test_file(version: &str, file_name: &str) -> Result<()> {
+    let testdata = crate::test_util::arrow_test_data();
+    let mut file = File::open(format!(
+        "{}/arrow-ipc-stream/integration/{}/{}.stream",
+        testdata, version, file_name
+    ))?;
+
+    let metadata = read_stream_metadata(&mut file)?;
+    let reader = StreamReader::new(file, metadata);
+
+    // read expected JSON output
+    let (schema, batches) = read_gzip_json(version, file_name);
+
+    assert_eq!(&schema, reader.schema().as_ref());
+
+    batches
+        .iter()
+        .zip(reader.map(|x| x.unwrap()))
+        .for_each(|(lhs, rhs)| {
+            assert_eq!(lhs, &rhs);
+        });
+    Ok(())
+}
+
+#[test]
+fn read_generated_100_primitive() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive")
+}
+
+#[test]
+fn read_generated_100_datetime() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_datetime")
+}
+
+#[test]
+fn read_generated_100_null_trivial() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_null_trivial")
+}
+
+#[test]
+fn read_generated_100_null() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_null")
+}
+
+#[test]
+fn read_generated_100_primitive_zerolength() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_zerolength")
+}
+
+#[test]
+fn read_generated_100_primitive_primitive_no_batches() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_no_batches")
+}
+
+#[test]
+fn read_generated_100_dictionary() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_dictionary")
+}
+
+#[test]
+fn read_generated_100_nested() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_nested")
+}
+
+#[test]
+fn read_generated_100_interval() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_interval")
+}
+
+#[test]
+fn read_generated_100_decimal() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_decimal")
+}
+
+#[test]
+fn read_generated_100_union() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_union")?;
+    test_file("1.0.0-bigendian", "generated_union")
+}
+
+#[test]
+fn read_generated_017_union() -> Result<()> {
+    test_file("0.17.1", "generated_union")
+}
+
+#[test]
+fn read_generated_200_compression_lz4() -> Result<()> {
+    test_file("2.0.0-compression", "generated_lz4")
+}
+
+#[test]
+fn read_generated_200_compression_zstd() -> Result<()> {
+    test_file("2.0.0-compression", "generated_zstd")
+}
diff --git a/tests/it/io/ipc/write/file.rs b/tests/it/io/ipc/write/file.rs
new file mode 100644
index 00000000000..b1ae89a727b
--- /dev/null
+++ b/tests/it/io/ipc/write/file.rs
@@ -0,0 +1,197 @@
+use std::io::Cursor;
+
+use arrow2::array::*;
+use arrow2::error::Result;
+use arrow2::io::ipc::read::{read_file_metadata, FileReader};
+use arrow2::io::ipc::write::*;
+use arrow2::record_batch::RecordBatch;
+
+use crate::io::ipc::common::read_gzip_json;
+
+fn test_round_trip(batch: RecordBatch) -> Result<()> {
+    let mut result = Vec::<u8>::new();
+
+    // write IPC version 5
+    {
+        let options = IpcWriteOptions::try_new(8, false, MetadataVersion::V5)?;
+        let mut writer = FileWriter::try_new_with_options(&mut result, batch.schema(), options)?;
+        writer.write(&batch)?;
+        writer.finish()?;
+    }
+    let mut reader = Cursor::new(result);
+    let metadata = read_file_metadata(&mut reader)?;
+    let schema = metadata.schema().clone();
+
+    let reader = FileReader::new(&mut reader, metadata, None);
+
+    // read expected JSON output
+    let (expected_schema, expected_batches) = (batch.schema().clone(), vec![batch]);
+
+    assert_eq!(schema.as_ref(), expected_schema.as_ref());
+
+    let batches = reader.collect::<Result<Vec<_>>>()?;
+
+    assert_eq!(batches, expected_batches);
+    Ok(())
+}
+
+fn test_file(version: &str, file_name: &str) -> Result<()> {
+    let (schema, batches) = read_gzip_json(version, file_name);
+
+    let mut result = Vec::<u8>::new();
+
+    // write IPC version 5
+    {
+        let options = IpcWriteOptions::try_new(8, false, MetadataVersion::V5)?;
+        let mut writer = FileWriter::try_new_with_options(&mut result, &schema, options)?;
+        for batch in batches {
+            writer.write(&batch)?;
+        }
+        writer.finish()?;
+    }
+    let mut reader = Cursor::new(result);
+    let metadata = read_file_metadata(&mut reader)?;
+    let schema = metadata.schema().clone();
+
+    let reader = FileReader::new(&mut reader, metadata, None);
+
+    // read expected JSON output
+    let (expected_schema, expected_batches) = read_gzip_json(version, file_name);
+
+    assert_eq!(schema.as_ref(), &expected_schema);
+
+    let batches = reader.collect::<Result<Vec<_>>>()?;
+
+    assert_eq!(batches, expected_batches);
+    Ok(())
+}
+
+#[test]
+fn write_100_primitive() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive")?;
+    test_file("1.0.0-bigendian", "generated_primitive")
+}
+
+#[test]
+fn write_100_datetime() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_datetime")?;
+    test_file("1.0.0-bigendian", "generated_datetime")
+}
+
+#[test]
+fn write_100_dictionary_unsigned() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_dictionary_unsigned")?;
+    test_file("1.0.0-bigendian", "generated_dictionary_unsigned")
+}
+
+#[test]
+fn write_100_dictionary() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_dictionary")?;
+    test_file("1.0.0-bigendian", "generated_dictionary")
+}
+
+#[test]
+fn write_100_interval() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_interval")?;
+    test_file("1.0.0-bigendian", "generated_interval")
+}
+
+#[test]
+fn write_100_large_batch() -> Result<()> {
+    // this takes too long for unit-tests. It has been passing...
+    //test_file("1.0.0-littleendian", "generated_large_batch");
+    Ok(())
+}
+
+#[test]
+fn write_100_nested() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_nested")?;
+    test_file("1.0.0-bigendian", "generated_nested")
+}
+
+#[test]
+fn write_100_nested_large_offsets() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_nested_large_offsets")?;
+    test_file("1.0.0-bigendian", "generated_nested_large_offsets")
+}
+
+#[test]
+fn write_100_null_trivial() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_null_trivial")?;
+    test_file("1.0.0-bigendian", "generated_null_trivial")
+}
+
+#[test]
+fn write_100_null() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_null")?;
+    test_file("1.0.0-bigendian", "generated_null")
+}
+
+#[test]
+fn write_100_primitive_large_offsets() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_large_offsets")?;
+    test_file("1.0.0-bigendian", "generated_primitive_large_offsets")
+}
+
+#[test]
+fn write_100_primitive_no_batches() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_no_batches")?;
+    test_file("1.0.0-bigendian", "generated_primitive_no_batches")
+}
+
+#[test]
+fn write_100_primitive_zerolength() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive_zerolength")?;
+    test_file("1.0.0-bigendian", "generated_primitive_zerolength")
+}
+
+#[test]
+fn write_0141_primitive_zerolength() -> Result<()> {
+    test_file("0.14.1", "generated_primitive_zerolength")
+}
+
+#[test]
+fn write_100_custom_metadata() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_custom_metadata")?;
+    test_file("1.0.0-bigendian", "generated_custom_metadata")
+}
+
+#[test]
+fn write_100_decimal() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_decimal")?;
+    test_file("1.0.0-bigendian", "generated_decimal")
+}
+
+#[test]
+fn write_100_union() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_union")?;
+    test_file("1.0.0-bigendian", "generated_union")
+}
+
+#[test]
+fn write_generated_017_union() -> Result<()> {
+    test_file("0.17.1", "generated_union")
+}
+
+#[test]
+fn write_sliced_utf8() -> Result<()> {
+    use std::sync::Arc;
+    let array = Arc::new(Utf8Array::<i32>::from_slice(["aa", "bb"]).slice(1, 1)) as Arc<dyn Array>;
+    let batch = RecordBatch::try_from_iter(vec![("a", array)]).unwrap();
+    test_round_trip(batch)
+}
+
+#[test]
+fn write_sliced_list() -> Result<()> {
+    let data = vec![
+        Some(vec![Some(1i32), Some(2), Some(3)]),
+        None,
+        Some(vec![Some(4), None, Some(6)]),
+    ];
+
+    let mut array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
+    array.try_extend(data).unwrap();
+    let array = array.into_arc().slice(1, 2).into();
+    let batch = RecordBatch::try_from_iter(vec![("a", array)]).unwrap();
+    test_round_trip(batch)
+}
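`write_sliced_utf8` and `write_sliced_list` guard a subtle IPC path: a sliced array carries a non-zero offset, and the writer must emit only the selected region rather than the full underlying buffers. A sketch of the scenario they cover, using the same API as the tests:

    use arrow2::array::Utf8Array;

    let array = Utf8Array::<i32>::from_slice(["aa", "bb"]).slice(1, 1);
    assert_eq!(array.value(0), "bb"); // only this value must survive the round trip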
diff --git a/tests/it/io/ipc/write/mod.rs b/tests/it/io/ipc/write/mod.rs
new file mode 100644
index 00000000000..6d1510b36c4
--- /dev/null
+++ b/tests/it/io/ipc/write/mod.rs
@@ -0,0 +1,2 @@
+mod file;
+mod stream;
diff --git a/tests/it/io/ipc/write/stream.rs b/tests/it/io/ipc/write/stream.rs
new file mode 100644
index 00000000000..c13831839bf
--- /dev/null
+++ b/tests/it/io/ipc/write/stream.rs
@@ -0,0 +1,131 @@
+use std::io::Cursor;
+
+use arrow2::error::Result;
+use arrow2::io::ipc::read::read_stream_metadata;
+use arrow2::io::ipc::read::StreamReader;
+use arrow2::io::ipc::write::{IpcWriteOptions, MetadataVersion, StreamWriter};
+
+use crate::io::ipc::common::read_arrow_stream;
+use crate::io::ipc::common::read_gzip_json;
+
+fn test_file(version: &str, file_name: &str) {
+    let (schema, batches) = read_arrow_stream(version, file_name);
+
+    let mut result = Vec::<u8>::new();
+
+    // write IPC version 5
+    {
+        let options = IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap();
+        let mut writer = StreamWriter::try_new_with_options(&mut result, &schema, options).unwrap();
+        for batch in batches {
+            writer.write(&batch).unwrap();
+        }
+        writer.finish().unwrap();
+    }
+
+    let mut reader = Cursor::new(result);
+    let metadata = read_stream_metadata(&mut reader).unwrap();
+    let reader = StreamReader::new(reader, metadata);
+
+    let schema = reader.schema().clone();
+
+    // read expected JSON output
+    let (expected_schema, expected_batches) = read_gzip_json(version, file_name);
+
+    assert_eq!(schema.as_ref(), &expected_schema);
+
+    let batches = reader.collect::<Result<Vec<_>>>().unwrap();
+
+    assert_eq!(batches, expected_batches);
+}
+
+#[test]
+fn write_100_primitive() {
+    test_file("1.0.0-littleendian", "generated_primitive");
+}
+
+#[test]
+fn write_100_datetime() {
+    test_file("1.0.0-littleendian", "generated_datetime");
+}
+
+#[test]
+fn write_100_dictionary_unsigned() {
+    test_file("1.0.0-littleendian", "generated_dictionary_unsigned");
+}
+
+#[test]
+fn write_100_dictionary() {
+    test_file("1.0.0-littleendian", "generated_dictionary");
+}
+
+#[test]
+fn write_100_interval() {
+    test_file("1.0.0-littleendian", "generated_interval");
+}
+
+#[test]
+fn write_100_large_batch() {
+    // this takes too long for unit-tests. It has been passing...
+    //test_file("1.0.0-littleendian", "generated_large_batch");
+}
+
+#[test]
+fn write_100_nested() {
+    test_file("1.0.0-littleendian", "generated_nested");
+}
+
+#[test]
+fn write_100_nested_large_offsets() {
+    test_file("1.0.0-littleendian", "generated_nested_large_offsets");
+}
+
+#[test]
+fn write_100_null_trivial() {
+    test_file("1.0.0-littleendian", "generated_null_trivial");
+}
+
+#[test]
+fn write_100_null() {
+    test_file("1.0.0-littleendian", "generated_null");
+}
+
+#[test]
+fn write_100_primitive_large_offsets() {
+    test_file("1.0.0-littleendian", "generated_primitive_large_offsets");
+}
+
+#[test]
+fn write_100_union() {
+    test_file("1.0.0-littleendian", "generated_union");
+}
+
+#[test]
+fn write_generated_017_union() {
+    test_file("0.17.1", "generated_union");
+}
+
+//#[test]
+//fn write_100_recursive_nested() {
+//test_file("1.0.0-littleendian", "generated_recursive_nested");
+//}
+
+#[test]
+fn write_100_primitive_no_batches() {
+    test_file("1.0.0-littleendian", "generated_primitive_no_batches");
+}
+
+#[test]
+fn write_100_primitive_zerolength() {
+    test_file("1.0.0-littleendian", "generated_primitive_zerolength");
+}
+
+#[test]
+fn write_100_custom_metadata() {
+    test_file("1.0.0-littleendian", "generated_custom_metadata");
+}
+
+#[test]
+fn write_100_decimal() {
+    test_file("1.0.0-littleendian", "generated_decimal");
+}
diff --git a/tests/it/io/json/mod.rs b/tests/it/io/json/mod.rs
new file mode 100644
index 00000000000..f7cdac66078
--- /dev/null
+++ b/tests/it/io/json/mod.rs
@@ -0,0 +1,2 @@
+mod read;
+mod write;
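The JSON tests that follow exercise both schema paths of the reader: inference via `ReaderBuilder::new().infer_schema(None)` and an explicit schema passed to `Reader::new`. The inferred path, reduced to its essentials (the same calls as in the tests; the file path is the repository's test fixture):

    use std::fs::File;
    use arrow2::io::json::{Reader, ReaderBuilder};

    let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64);
    let mut reader: Reader<File> = builder
        .build::<File>(File::open("test/data/basic.json").unwrap())
        .unwrap();
    let batch = reader.next().unwrap().unwrap();
    assert_eq!(batch.num_rows(), 12);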
diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs
new file mode 100644
index 00000000000..78ef6be9c69
--- /dev/null
+++ b/tests/it/io/json/read.rs
@@ -0,0 +1,661 @@
+use flate2::read::GzDecoder;
+use std::io::BufReader;
+use std::{
+    fs::File,
+    io::{Seek, SeekFrom},
+};
+use std::{io::Cursor, sync::Arc};
+
+use arrow2::array::*;
+use arrow2::datatypes::*;
+use arrow2::{bitmap::Bitmap, buffer::Buffer, error::Result, io::json::*};
+
+#[test]
+fn test_json_basic() {
+    let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64);
+    let mut reader: Reader<File> = builder
+        .build::<File>(File::open("test/data/basic.json").unwrap())
+        .unwrap();
+    let batch = reader.next().unwrap().unwrap();
+
+    assert_eq!(4, batch.num_columns());
+    assert_eq!(12, batch.num_rows());
+
+    let schema = reader.schema();
+    let batch_schema = batch.schema();
+    assert_eq!(schema, batch_schema);
+
+    let a = schema.column_with_name("a").unwrap();
+    assert_eq!(0, a.0);
+    assert_eq!(&DataType::Int64, a.1.data_type());
+    let b = schema.column_with_name("b").unwrap();
+    assert_eq!(1, b.0);
+    assert_eq!(&DataType::Float64, b.1.data_type());
+    let c = schema.column_with_name("c").unwrap();
+    assert_eq!(2, c.0);
+    assert_eq!(&DataType::Boolean, c.1.data_type());
+    let d = schema.column_with_name("d").unwrap();
+    assert_eq!(3, d.0);
+    assert_eq!(&DataType::Utf8, d.1.data_type());
+
+    let aa = batch
+        .column(a.0)
+        .as_any()
+        .downcast_ref::<PrimitiveArray<i64>>()
+        .unwrap();
+    assert_eq!(1, aa.value(0));
+    assert_eq!(-10, aa.value(1));
+    let bb = batch
+        .column(b.0)
+        .as_any()
+        .downcast_ref::<PrimitiveArray<f64>>()
+        .unwrap();
+    assert!((2.0 - bb.value(0)).abs() < f64::EPSILON);
+    assert!((-3.5 - bb.value(1)).abs() < f64::EPSILON);
+    let cc = batch
+        .column(c.0)
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .unwrap();
+    assert!(!cc.value(0));
+    assert!(cc.value(10));
+    let dd = batch
+        .column(d.0)
+        .as_any()
+        .downcast_ref::<Utf8Array<i32>>()
+        .unwrap();
+    assert_eq!("4", dd.value(0));
+    assert_eq!("text", dd.value(8));
+}
+
+#[test]
+fn test_json_basic_with_nulls() {
+    let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64);
+    let mut reader: Reader<File> = builder
+        .build::<File>(File::open("test/data/basic_nulls.json").unwrap())
+        .unwrap();
+    let batch = reader.next().unwrap().unwrap();
+
+    assert_eq!(4, batch.num_columns());
+    assert_eq!(12, batch.num_rows());
+
+    let schema = reader.schema();
+    let batch_schema = batch.schema();
+    assert_eq!(schema, batch_schema);
+
+    let a = schema.column_with_name("a").unwrap();
+    assert_eq!(&DataType::Int64, a.1.data_type());
+    let b = schema.column_with_name("b").unwrap();
+    assert_eq!(&DataType::Float64, b.1.data_type());
+    let c = schema.column_with_name("c").unwrap();
+    assert_eq!(&DataType::Boolean, c.1.data_type());
+    let d = schema.column_with_name("d").unwrap();
+    assert_eq!(&DataType::Utf8, d.1.data_type());
+
+    let aa = batch
+        .column(a.0)
+        .as_any()
+        .downcast_ref::<PrimitiveArray<i64>>()
+        .unwrap();
+    assert!(aa.is_valid(0));
+    assert!(!aa.is_valid(1));
+    assert!(!aa.is_valid(11));
+    let bb = batch
+        .column(b.0)
+        .as_any()
+        .downcast_ref::<PrimitiveArray<f64>>()
+        .unwrap();
+    assert!(bb.is_valid(0));
+    assert!(!bb.is_valid(2));
+    assert!(!bb.is_valid(11));
+    let cc = batch
+        .column(c.0)
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .unwrap();
+    assert!(cc.is_valid(0));
+    assert!(!cc.is_valid(4));
+    assert!(!cc.is_valid(11));
+    let dd = batch
+        .column(d.0)
+        .as_any()
+        .downcast_ref::<Utf8Array<i32>>()
+        .unwrap();
+    assert!(!dd.is_valid(0));
+    assert!(dd.is_valid(1));
+    assert!(!dd.is_valid(4));
+    assert!(!dd.is_valid(11));
+}
Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("c", DataType::Boolean, false), + ]); + assert_eq!(reader_schema.as_ref(), &expected_schema); + + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(2, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let batch_schema = batch.schema(); + assert_eq!(&reader_schema, batch_schema); + + let a = batch_schema.column_with_name("a").unwrap(); + assert_eq!(0, a.0); + assert_eq!(&DataType::Int32, a.1.data_type()); + let c = batch_schema.column_with_name("c").unwrap(); + assert_eq!(1, c.0); + assert_eq!(&DataType::Boolean, c.1.data_type()); +} + +#[test] +fn test_json_arrays() { + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/arrays.json").unwrap()) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(4, batch.num_columns()); + assert_eq!(3, batch.num_rows()); + + let schema = batch.schema(); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Int64, a.1.data_type()); + let b = schema.column_with_name("b").unwrap(); + assert_eq!( + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + b.1.data_type() + ); + let c = schema.column_with_name("c").unwrap(); + assert_eq!( + &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + c.1.data_type() + ); + let d = schema.column_with_name("d").unwrap(); + assert_eq!(&DataType::Utf8, d.1.data_type()); + + let aa = batch + .column(a.0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1, aa.value(0)); + assert_eq!(-10, aa.value(1)); + let bb = batch + .column(b.0) + .as_any() + .downcast_ref::>() + .unwrap(); + let bb = bb.values(); + let bb = bb.as_any().downcast_ref::().unwrap(); + assert_eq!(9, bb.len()); + assert!((2.0 - bb.value(0)).abs() < f64::EPSILON); + assert!((-6.1 - bb.value(5)).abs() < f64::EPSILON); + assert!(!bb.is_valid(7)); + + let cc = batch + .column(c.0) + .as_any() + .downcast_ref::>() + .unwrap(); + let cc = cc.values(); + let cc = cc.as_any().downcast_ref::().unwrap(); + assert_eq!(6, cc.len()); + assert!(!cc.value(0)); + assert!(!cc.value(4)); + assert!(!cc.is_valid(5)); +} + +#[test] +fn test_invalid_json_infer_schema() { + let re = infer_json_schema_from_seekable( + &mut BufReader::new(File::open("test/data/uk_cities_with_headers.csv").unwrap()), + None, + ); + assert_eq!( + re.err().unwrap().to_string(), + "External error: expected value at line 1 column 1", + ); +} + +#[test] +fn test_invalid_json_read_record() { + let schema = Arc::new(Schema::new(vec![Field::new( + "a", + DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]), + true, + )])); + let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/uk_cities_with_headers.csv").unwrap()) + .unwrap(); + assert_eq!( + reader.next().err().unwrap().to_string(), + "External error: expected value at line 1 column 1", + ); +} + +#[test] +fn test_mixed_json_arrays() { + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/mixed_arrays.json").unwrap()) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + + let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); + let mut reader = BufReader::new(GzDecoder::new(&file)); + let schema = Arc::new(infer_json_schema(&mut reader, None).unwrap()); + 
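+    // schema inference consumed the gzip stream; rewind the underlying file so a
+    // fresh decoder can re-read the records from the start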
file.seek(SeekFrom::Start(0)).unwrap(); + + let reader = BufReader::new(GzDecoder::new(&file)); + let mut reader = Reader::from_buf_reader(reader, schema, 64, None); + let batch_gz = reader.next().unwrap().unwrap(); + + for batch in vec![batch, batch_gz] { + assert_eq!(4, batch.num_columns()); + assert_eq!(4, batch.num_rows()); + + let schema = batch.schema(); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Int64, a.1.data_type()); + let b = schema.column_with_name("b").unwrap(); + assert_eq!( + &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + b.1.data_type() + ); + let c = schema.column_with_name("c").unwrap(); + assert_eq!( + &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + c.1.data_type() + ); + let d = schema.column_with_name("d").unwrap(); + assert_eq!( + &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + d.1.data_type() + ); + + let bb = batch + .column(b.0) + .as_any() + .downcast_ref::>() + .unwrap(); + let bb = bb.values(); + let bb = bb.as_any().downcast_ref::().unwrap(); + assert_eq!(9, bb.len()); + assert!((-6.1 - bb.value(8)).abs() < f64::EPSILON); + + let cc = batch + .column(c.0) + .as_any() + .downcast_ref::>() + .unwrap(); + let cc = cc.values(); + let cc = cc.as_any().downcast_ref::().unwrap(); + let cc_expected = BooleanArray::from(vec![Some(false), Some(true), Some(false), None]); + assert_eq!(cc, &cc_expected); + + let dd = batch + .column(d.0) + .as_any() + .downcast_ref::>() + .unwrap(); + let dd = dd.values(); + let dd = dd.as_any().downcast_ref::>().unwrap(); + assert_eq!( + dd, + &Utf8Array::::from_slice(&["1", "false", "array", "2.4"]) + ); + } +} + +#[test] +fn test_nested_struct_json_arrays() { + let d_field = Field::new("d", DataType::Utf8, true); + let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); + let a_field = Field::new( + "a", + DataType::Struct(vec![ + Field::new("b", DataType::Boolean, true), + c_field.clone(), + ]), + true, + ); + let schema = Arc::new(Schema::new(vec![a_field])); + let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/nested_structs.json").unwrap()) + .unwrap(); + + // build expected output + let d = Utf8Array::::from(&vec![Some("text"), None, Some("text"), None]); + let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + + let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); + let expected = StructArray::from_data( + vec![Field::new("b", DataType::Boolean, true), c_field], + vec![Arc::new(b), Arc::new(c)], + None, + ); + + // compare `a` with result from json reader + let batch = reader.next().unwrap().unwrap(); + let read = batch.column(0); + assert_eq!(expected, read.as_ref()); +} + +#[test] +fn test_nested_list_json_arrays() { + let d_field = Field::new("d", DataType::Utf8, true); + let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); + let b_field = Field::new("b", DataType::Boolean, true); + let a_struct_field = Field::new( + "a", + DataType::Struct(vec![b_field.clone(), c_field.clone()]), + true, + ); + let a_list_data_type = DataType::List(Box::new(a_struct_field)); + let a_field = Field::new("a", a_list_data_type.clone(), true); + let schema = Arc::new(Schema::new(vec![a_field])); + let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); + let json_content = r#" + {"a": [{"b": true, "c": {"d": "a_text"}}, {"b": false, "c": {"d": 
"b_text"}}]} + {"a": [{"b": false, "c": null}]} + {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]} + {"a": null} + {"a": []} + "#; + let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + + // build expected output + let d = Utf8Array::::from(&vec![ + Some("a_text"), + Some("b_text"), + None, + Some("c_text"), + Some("d_text"), + None, + ]); + + let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + + let b = BooleanArray::from(vec![ + Some(true), + Some(false), + Some(false), + Some(true), + None, + Some(true), + ]); + let a_struct = StructArray::from_data( + vec![b_field, c_field], + vec![Arc::new(b) as Arc, Arc::new(c) as Arc], + None, + ); + let expected = ListArray::from_data( + a_list_data_type, + Buffer::from([0i32, 2, 3, 6, 6, 6]), + Arc::new(a_struct) as Arc, + Some(Bitmap::from_u8_slice([0b00010111], 5)), + ); + + // compare `a` with result from json reader + let batch = reader.next().unwrap().unwrap(); + let read = batch.column(0); + assert_eq!(expected, read.as_ref()); +} + +#[test] +fn test_dictionary_from_json_basic_with_nulls() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "d", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + true, + )])); + let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/basic_nulls.json").unwrap()) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let d = schema.column_with_name("d").unwrap(); + let data_type = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + assert_eq!(&data_type, d.1.data_type()); + + let result = batch.column(d.0); + + let values = vec![ + None, + Some("4"), + Some("text"), + Some("4"), + None, + None, + Some("4"), + None, + Some("text"), + Some("4"), + Some("4"), + None, + ]; + + let mut expected = MutableDictionaryArray::>::new(); + expected.try_extend(values)?; + let expected: DictionaryArray = expected.into(); + + assert_eq!(expected, result.as_ref()); + Ok(()) +} + +#[test] +fn test_skip_empty_lines() { + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let json_content = " + {\"a\": 1} + + {\"a\": 2} + + {\"a\": 3}"; + let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(3, batch.num_rows()); + + let schema = reader.schema(); + let c = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Int64, c.1.data_type()); +} + +#[test] +fn test_row_type_validation() { + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let json_content = " + [1, \"hello\"] + \"world\""; + let re = builder.build(Cursor::new(json_content)); + assert_eq!( + re.err().unwrap().to_string(), + r#"Expected JSON record to be an object, found Array([Number(1), String("hello")])"#, + ); +} + +#[test] +fn test_list_of_string_dictionary_from_json_with_nulls() -> Result<()> { + let data_type = DataType::List(Box::new(Field::new( + "item", + DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + true, + ))); + + let schema = Arc::new(Schema::new(vec![Field::new( + "events", + data_type.clone(), + true, + )])); + 
let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/list_string_dict_nested_nulls.json").unwrap()) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(3, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let events = schema.column_with_name("events").unwrap(); + assert_eq!(&data_type, events.1.data_type()); + + let expected = vec![ + Some(vec![None, Some("Elect Leader"), Some("Do Ballot")]), + Some(vec![ + Some("Do Ballot"), + None, + Some("Send Data"), + Some("Elect Leader"), + ]), + Some(vec![Some("Send Data")]), + ]; + + type A = MutableDictionaryArray>; + + let mut array = MutableListArray::::new(); + array.try_extend(expected)?; + + let expected: ListArray = array.into(); + + assert_eq!(expected, batch.column(0).as_ref()); + Ok(()) +} + +#[test] +fn test_with_multiple_batches() { + let builder = ReaderBuilder::new() + .infer_schema(Some(4)) + .with_batch_size(5); + let mut reader: Reader = builder + .build::(File::open("test/data/basic_nulls.json").unwrap()) + .unwrap(); + + let mut num_records = Vec::new(); + while let Some(rb) = reader.next().unwrap() { + num_records.push(rb.num_rows()); + } + + assert_eq!(vec![5, 5, 2], num_records); +} + +#[test] +fn test_json_infer_schema() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new( + "b", + DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + true, + ), + Field::new( + "c", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "d", + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + true, + ), + ]); + + let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); + let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); + + assert_eq!(inferred_schema, schema); + + let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); + let mut reader = BufReader::new(GzDecoder::new(&file)); + let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); + + assert_eq!(inferred_schema, schema); +} diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs new file mode 100644 index 00000000000..8c9cedd9246 --- /dev/null +++ b/tests/it/io/json/write.rs @@ -0,0 +1,239 @@ +use std::sync::Arc; + +use arrow2::{ + array::*, + bitmap::Bitmap, + buffer::Buffer, + datatypes::{DataType, Field, Schema}, + io::json::LineDelimitedWriter, + record_batch::RecordBatch, +}; + +#[test] +fn write_simple_rows() { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Utf8, false), + ]); + + let a = Int32Array::from([Some(1), Some(2), Some(3), None, Some(5)]); + let b = Utf8Array::::from(&vec![Some("a"), Some("b"), Some("c"), Some("d"), None]); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[batch]).unwrap(); + } + + assert_eq!( + String::from_utf8(buf).unwrap(), + r#"{"c1":1,"c2":"a"} +{"c1":2,"c2":"b"} +{"c1":3,"c2":"c"} +{"c1":null,"c2":"d"} +{"c1":5,"c2":null} +"# + ); +} + +#[test] +fn write_nested_structs() { + let c121 = Field::new("c121", DataType::Utf8, false); + let fields = vec![ + Field::new("c11", DataType::Int32, false), 
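+        // "c12" nests a second struct with the single utf8 field `c121`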
+ Field::new("c12", DataType::Struct(vec![c121.clone()]), false), + ]; + let schema = Schema::new(vec![ + Field::new("c1", DataType::Struct(fields.clone()), false), + Field::new("c2", DataType::Utf8, false), + ]); + + let c1 = StructArray::from_data( + fields, + vec![ + Arc::new(Int32Array::from(&[Some(1), None, Some(5)])), + Arc::new(StructArray::from_data( + vec![c121], + vec![Arc::new(Utf8Array::::from(&vec![ + Some("e"), + Some("f"), + Some("g"), + ]))], + None, + )), + ], + None, + ); + + let c2 = Utf8Array::::from(&vec![Some("a"), Some("b"), Some("c")]); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[batch]).unwrap(); + } + + assert_eq!( + String::from_utf8(buf).unwrap(), + r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"} +{"c1":{"c11":null,"c12":{"c121":"f"}},"c2":"b"} +{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"} +"# + ); +} + +#[test] +fn write_struct_with_list_field() { + let list_datatype = DataType::List(Box::new(Field::new("c_list", DataType::Utf8, false))); + let field_c1 = Field::new("c1", list_datatype, false); + let field_c2 = Field::new("c2", DataType::Int32, false); + let schema = Schema::new(vec![field_c1, field_c2]); + + let iter = vec![vec!["a", "a1"], vec!["b"], vec!["c"], vec!["d"], vec!["e"]]; + + let iter = iter + .into_iter() + .map(|x| x.into_iter().map(Some).collect::>()) + .map(Some); + let mut a = MutableListArray::>::new_with_field( + MutableUtf8Array::::new(), + "c_list", + false, + ); + a.try_extend(iter).unwrap(); + let a: ListArray = a.into(); + + let b = PrimitiveArray::from_slice(&vec![1, 2, 3, 4, 5]).to(DataType::Int32); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[batch]).unwrap(); + } + + assert_eq!( + String::from_utf8(buf).unwrap(), + r#"{"c1":["a","a1"],"c2":1} +{"c1":["b"],"c2":2} +{"c1":["c"],"c2":3} +{"c1":["d"],"c2":4} +{"c1":["e"],"c2":5} +"# + ); +} + +#[test] +fn write_nested_list() { + let list_inner = DataType::List(Box::new(Field::new("b", DataType::Int32, false))); + let list_datatype = DataType::List(Box::new(Field::new("a", list_inner, false))); + let field_c1 = Field::new("c1", list_datatype, true); + let field_c2 = Field::new("c2", DataType::Utf8, true); + let schema = Schema::new(vec![field_c1, field_c2]); + + let iter = vec![ + vec![Some(vec![Some(1), Some(2)]), Some(vec![Some(3)])], + vec![], + vec![Some(vec![Some(4), Some(5), Some(6)])], + ]; + + let iter = iter.into_iter().map(Some); + + let inner = MutableListArray::>::new_with_field( + MutablePrimitiveArray::::new(), + "b", + false, + ); + let mut c1 = + MutableListArray::>>::new_with_field( + inner, "a", false, + ); + c1.try_extend(iter).unwrap(); + let c1: ListArray = c1.into(); + + let c2 = Utf8Array::::from(&vec![Some("foo"), Some("bar"), None]); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[batch]).unwrap(); + } + + assert_eq!( + String::from_utf8(buf).unwrap(), + r#"{"c1":[[1,2],[3]],"c2":"foo"} +{"c1":[],"c2":"bar"} +{"c1":[[4,5,6]],"c2":null} +"# + ); +} + +#[test] +fn write_list_of_struct() { + let inner = vec![Field::new("c121", DataType::Utf8, false)]; + let fields = vec![ + 
Field::new("c11", DataType::Int32, false), + Field::new("c12", DataType::Struct(inner.clone()), false), + ]; + let c1_datatype = DataType::List(Box::new(Field::new( + "s", + DataType::Struct(fields.clone()), + false, + ))); + let field_c1 = Field::new("c1", c1_datatype.clone(), true); + let field_c2 = Field::new("c2", DataType::Int32, false); + let schema = Schema::new(vec![field_c1, field_c2]); + + let s = StructArray::from_data( + fields, + vec![ + Arc::new(Int32Array::from(&[Some(1), None, Some(5)])), + Arc::new(StructArray::from_data( + inner, + vec![Arc::new(Utf8Array::::from(&vec![ + Some("e"), + Some("f"), + Some("g"), + ]))], + None, + )), + ], + None, + ); + + // list column rows (c1): + // [{"c11": 1, "c12": {"c121": "e"}}, {"c12": {"c121": "f"}}], + // null, + // [{"c11": 5, "c12": {"c121": "g"}}] + let c1 = ListArray::::from_data( + c1_datatype, + Buffer::from(&[0, 2, 2, 3]), + Arc::new(s), + Some(Bitmap::from_u8_slice([0b00000101], 3)), + ); + + let c2 = Int32Array::from_slice(&[1, 2, 3]); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[batch]).unwrap(); + } + + assert_eq!( + String::from_utf8(buf).unwrap(), + r#"{"c1":[{"c11":1,"c12":{"c121":"e"}},{"c11":null,"c12":{"c121":"f"}}],"c2":1} +{"c1":null,"c2":2} +{"c1":[{"c11":5,"c12":{"c121":"g"}}],"c2":3} +"# + ); +} diff --git a/tests/it/io/mod.rs b/tests/it/io/mod.rs new file mode 100644 index 00000000000..d5c4636d733 --- /dev/null +++ b/tests/it/io/mod.rs @@ -0,0 +1,11 @@ +#[cfg(feature = "io_print")] +mod print; + +#[cfg(feature = "io_json")] +mod json; + +#[cfg(feature = "io_ipc")] +mod ipc; + +#[cfg(feature = "io_parquet")] +mod parquet; diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs new file mode 100644 index 00000000000..4628dc9747d --- /dev/null +++ b/tests/it/io/parquet/mod.rs @@ -0,0 +1,481 @@ +use std::io::{Cursor, Read, Seek}; +use std::sync::Arc; + +use arrow2::{ + array::*, bitmap::Bitmap, buffer::Buffer, datatypes::*, error::Result, + io::parquet::read::statistics::*, io::parquet::read::*, io::parquet::write::*, + record_batch::RecordBatch, +}; + +use crate::io::ipc::read_gzip_json; + +mod read; +mod write; + +type ArrayStats = (Arc, Option>); + +pub fn read_column( + mut reader: R, + row_group: usize, + column: usize, +) -> Result { + let metadata = read_metadata(&mut reader)?; + + let mut reader = RecordReader::try_new( + reader, + Some(vec![column]), + None, + Arc::new(|_, _| true), + None, + )?; + + let statistics = metadata.row_groups[row_group] + .column(column) + .statistics() + .map(|x| statistics::deserialize_statistics(x?.as_ref())) + .transpose()?; + + Ok((reader.next().unwrap()?.columns()[0].clone(), statistics)) +} + +pub fn pyarrow_nested_nullable(column: usize) -> Box { + let offsets = Buffer::::from([0, 2, 2, 5, 8, 8, 11, 11, 12]); + + let values = match column { + 0 => { + // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] + Arc::new(PrimitiveArray::::from(&[ + Some(0), + Some(1), + Some(2), + None, + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + ])) as Arc + } + 1 | 2 => { + // [[0, 1], None, [2, 0, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] + Arc::new(PrimitiveArray::::from(&[ + Some(0), + Some(1), + Some(2), + Some(0), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + ])) as Arc + } + 3 => 
Arc::new(PrimitiveArray::::from(&[ + Some(0), + Some(1), + Some(2), + None, + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + ])) as Arc, + 4 => Arc::new(BooleanArray::from(&[ + Some(false), + Some(true), + Some(true), + None, + Some(false), + Some(true), + Some(false), + Some(true), + Some(false), + Some(false), + Some(false), + Some(true), + ])) as Arc, + /* + string = [ + ["Hello", "bbb"], + None, + ["aa", None, ""], + ["bbb", "aa", "ccc"], + [], + ["abc", "bbb", "bbb"], + None, + [""], + ] + */ + 5 => Arc::new(Utf8Array::::from(&[ + Some("Hello".to_string()), + Some("bbb".to_string()), + Some("aa".to_string()), + None, + Some("".to_string()), + Some("bbb".to_string()), + Some("aa".to_string()), + Some("ccc".to_string()), + Some("abc".to_string()), + Some("bbb".to_string()), + Some("bbb".to_string()), + Some("".to_string()), + ])), + 6 => Arc::new(BinaryArray::::from(&[ + Some(b"Hello".to_vec()), + Some(b"bbb".to_vec()), + Some(b"aa".to_vec()), + None, + Some(b"".to_vec()), + Some(b"bbb".to_vec()), + Some(b"aa".to_vec()), + Some(b"ccc".to_vec()), + Some(b"abc".to_vec()), + Some(b"bbb".to_vec()), + Some(b"bbb".to_vec()), + Some(b"".to_vec()), + ])), + _ => unreachable!(), + }; + + match column { + 0 | 1 | 3 | 4 | 5 | 6 => { + let field = match column { + 0 => Field::new("item", DataType::Int64, true), + 1 => Field::new("item", DataType::Int64, false), + 3 => Field::new("item", DataType::Int16, true), + 4 => Field::new("item", DataType::Boolean, true), + 5 => Field::new("item", DataType::Utf8, true), + 6 => Field::new("item", DataType::LargeBinary, true), + _ => unreachable!(), + }; + + let validity = Some(Bitmap::from([ + true, false, true, true, true, true, false, true, + ])); + let data_type = DataType::List(Box::new(field)); + Box::new(ListArray::::from_data( + data_type, offsets, values, validity, + )) + } + 2 => { + // [[0, 1], [], [2, None, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] + let data_type = DataType::List(Box::new(Field::new("item", DataType::Int64, false))); + Box::new(ListArray::::from_data( + data_type, offsets, values, None, + )) + } + _ => unreachable!(), + } +} + +pub fn pyarrow_nullable(column: usize) -> Box { + let i64_values = &[ + Some(0), + Some(1), + None, + Some(3), + None, + Some(5), + Some(6), + Some(7), + None, + Some(9), + ]; + + match column { + 0 => Box::new(PrimitiveArray::::from(i64_values)), + 1 => Box::new(PrimitiveArray::::from(&[ + Some(0.0), + Some(1.0), + None, + Some(3.0), + None, + Some(5.0), + Some(6.0), + Some(7.0), + None, + Some(9.0), + ])), + 2 => Box::new(Utf8Array::::from(&[ + Some("Hello".to_string()), + None, + Some("aa".to_string()), + Some("".to_string()), + None, + Some("abc".to_string()), + None, + None, + Some("def".to_string()), + Some("aaa".to_string()), + ])), + 3 => Box::new(BooleanArray::from(&[ + Some(true), + None, + Some(false), + Some(false), + None, + Some(true), + None, + None, + Some(true), + Some(true), + ])), + 4 => Box::new( + PrimitiveArray::::from(i64_values) + .to(DataType::Timestamp(TimeUnit::Millisecond, None)), + ), + 5 => { + let values = i64_values + .iter() + .map(|x| x.map(|x| x as u32)) + .collect::>(); + Box::new(PrimitiveArray::::from(values)) + } + 6 => { + let keys = PrimitiveArray::::from([Some(0), Some(1), None, Some(1)]); + let values = Arc::new(PrimitiveArray::::from_slice([10, 200])); + Box::new(DictionaryArray::::from_data(keys, values)) + } + _ => unreachable!(), + } +} + +pub fn pyarrow_nullable_statistics(column: usize) -> Option> { + Some(match column { 
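+        // the statistics pyarrow wrote for each column of `pyarrow_nullable`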
+ 0 => Box::new(PrimitiveStatistics:: { + data_type: DataType::Int64, + distinct_count: None, + null_count: Some(3), + min_value: Some(0), + max_value: Some(9), + }), + 1 => Box::new(PrimitiveStatistics:: { + data_type: DataType::Float64, + distinct_count: None, + null_count: Some(3), + min_value: Some(0.0), + max_value: Some(9.0), + }), + 2 => Box::new(Utf8Statistics { + null_count: Some(4), + distinct_count: None, + min_value: Some("".to_string()), + max_value: Some("def".to_string()), + }), + 3 => Box::new(BooleanStatistics { + null_count: Some(4), + distinct_count: None, + + min_value: Some(false), + max_value: Some(true), + }), + 4 => Box::new(PrimitiveStatistics:: { + data_type: DataType::Timestamp(TimeUnit::Millisecond, None), + distinct_count: None, + null_count: Some(3), + min_value: Some(0), + max_value: Some(9), + }), + 5 => Box::new(PrimitiveStatistics:: { + data_type: DataType::UInt32, + null_count: Some(3), + distinct_count: None, + + min_value: Some(0), + max_value: Some(9), + }), + 6 => return None, + _ => unreachable!(), + }) +} + +// these values match the values in `integration` +pub fn pyarrow_required(column: usize) -> Box { + let i64_values = &[ + Some(0), + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + ]; + + match column { + 0 => Box::new(PrimitiveArray::::from(i64_values).to(DataType::Int64)), + 3 => Box::new(BooleanArray::from_slice(&[ + true, true, false, false, false, true, true, true, true, true, + ])), + 2 => Box::new(Utf8Array::::from_slice(&[ + "Hello", "bbb", "aa", "", "bbb", "abc", "bbb", "bbb", "def", "aaa", + ])), + _ => unreachable!(), + } +} + +pub fn pyarrow_required_statistics(column: usize) -> Option> { + Some(match column { + 0 => Box::new(PrimitiveStatistics:: { + data_type: DataType::Int64, + null_count: Some(0), + distinct_count: None, + min_value: Some(0), + max_value: Some(9), + }), + 3 => Box::new(BooleanStatistics { + null_count: Some(0), + distinct_count: None, + min_value: Some(false), + max_value: Some(true), + }), + 2 => Box::new(Utf8Statistics { + null_count: Some(0), + distinct_count: None, + min_value: Some("".to_string()), + max_value: Some("def".to_string()), + }), + _ => unreachable!(), + }) +} + +pub fn pyarrow_nested_nullable_statistics(column: usize) -> Option> { + Some(match column { + 3 => Box::new(PrimitiveStatistics:: { + data_type: DataType::Int16, + distinct_count: None, + null_count: Some(1), + min_value: Some(0), + max_value: Some(10), + }), + 4 => Box::new(BooleanStatistics { + distinct_count: None, + null_count: Some(1), + min_value: Some(false), + max_value: Some(true), + }), + 5 => Box::new(Utf8Statistics { + distinct_count: None, + null_count: Some(1), + min_value: Some("".to_string()), + max_value: Some("def".to_string()), + }), + 6 => Box::new(BinaryStatistics { + distinct_count: None, + null_count: Some(1), + min_value: Some(b"".to_vec()), + max_value: Some(b"def".to_vec()), + }), + _ => Box::new(PrimitiveStatistics:: { + data_type: DataType::Int64, + distinct_count: None, + null_count: Some(3), + min_value: Some(0), + max_value: Some(9), + }), + }) +} + +/// Round-trip with parquet using the same integration files used for IPC integration tests. 
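+///
+/// The intended flow, sketched below, matches the `test_file` helper defined further down:
+/// write the batches with `integration_write`, then read them back with `integration_read`.
+///
+/// ```ignore
+/// let (schema, batches) = read_gzip_json("1.0.0-littleendian", "generated_primitive");
+/// let data = integration_write(&schema, &batches)?;
+/// let (read_schema, read_batches) = integration_read(&data)?;
+/// assert_eq!(&schema, read_schema.as_ref());
+/// assert_eq!(batches, read_batches);
+/// ```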
+fn integration_write(schema: &Schema, batches: &[RecordBatch]) -> Result<Vec<u8>> {
+    let options = WriteOptions {
+        write_statistics: true,
+        compression: Compression::Uncompressed,
+        version: Version::V1,
+    };
+
+    let parquet_schema = to_parquet_schema(schema)?;
+    let descriptors = parquet_schema.columns().to_vec().into_iter();
+
+    let row_groups = batches.iter().map(|batch| {
+        let iterator = DynIter::new(batch.columns().iter().zip(descriptors.clone()).map(
+            |(array, type_)| {
+                Ok(DynIter::new(std::iter::once(array_to_page(
+                    array.as_ref(),
+                    type_,
+                    options,
+                    Encoding::Plain,
+                ))))
+            },
+        ));
+        Ok(iterator)
+    });
+
+    let mut writer = Cursor::new(vec![]);
+
+    write_file(
+        &mut writer,
+        row_groups,
+        schema,
+        parquet_schema,
+        options,
+        None,
+    )?;
+
+    Ok(writer.into_inner())
+}
+
+fn integration_read(data: &[u8]) -> Result<(Arc<Schema>, Vec<RecordBatch>)> {
+    let reader = Cursor::new(data);
+    let reader = RecordReader::try_new(reader, None, None, Arc::new(|_, _| true), None)?;
+    let schema = reader.schema().clone();
+    let batches = reader.collect::<Result<Vec<_>>>()?;
+
+    Ok((schema, batches))
+}
+
+fn test_file(version: &str, file_name: &str) -> Result<()> {
+    let (schema, batches) = read_gzip_json(version, file_name);
+
+    let data = integration_write(&schema, &batches)?;
+
+    let (read_schema, read_batches) = integration_read(&data)?;
+
+    assert_eq!(&schema, read_schema.as_ref());
+    assert_eq!(batches, read_batches);
+
+    Ok(())
+}
+
+#[test]
+fn roundtrip_100_primitive() -> Result<()> {
+    test_file("1.0.0-littleendian", "generated_primitive")?;
+    test_file("1.0.0-bigendian", "generated_primitive")
+}
+
+/// Tests that when arrow-specific types (Duration and LargeUtf8) are written to parquet, we can
+/// round-trip their logical types.
+#[test]
+fn arrow_type() -> Result<()> {
+    let dt1 = DataType::Duration(TimeUnit::Second);
+    let array = PrimitiveArray::<i64>::from([Some(1), None, Some(2)]).to(dt1.clone());
+    let array2 = Utf8Array::<i32>::from([Some("a"), None, Some("bb")]);
+    let schema = Schema::new(vec![
+        Field::new("a1", dt1, true),
+        Field::new("a2", array2.data_type().clone(), true),
+    ]);
+    let batch = RecordBatch::try_new(
+        Arc::new(schema.clone()),
+        vec![Arc::new(array), Arc::new(array2)],
+    )?;
+
+    let r = integration_write(&schema, &[batch.clone()])?;
+
+    let (new_schema, new_batches) = integration_read(&r)?;
+
+    assert_eq!(new_schema.as_ref(), &schema);
+    assert_eq!(new_batches, vec![batch]);
+    Ok(())
+}
diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs
new file mode 100644
index 00000000000..31663927883
--- /dev/null
+++ b/tests/it/io/parquet/read.rs
@@ -0,0 +1,259 @@
+use std::fs::File;
+use std::sync::Arc;
+
+use arrow2::array::*;
+use arrow2::error::Result;
+use arrow2::io::parquet::read::*;
+
+use super::*;
+
+fn test_pyarrow_integration(
+    column: usize,
+    version: usize,
+    type_: &str,
+    use_dict: bool,
+    required: bool,
+) -> Result<()> {
+    if std::env::var("ARROW2_IGNORE_PARQUET").is_ok() {
+        return Ok(());
+    }
+    let use_dict = if use_dict { "dict/" } else { "" };
+    let path = if required {
+        format!(
+            "fixtures/pyarrow3/v{}/{}{}_{}_10.parquet",
+            version, use_dict, type_, "required"
+        )
+    } else {
+        format!(
+            "fixtures/pyarrow3/v{}/{}{}_{}_10.parquet",
+            version, use_dict, type_, "nullable"
+        )
+    };
+    let mut file = File::open(path).unwrap();
+    let (array, statistics) = read_column(&mut file, 0, column)?;
+
+    let expected = match (type_, required) {
+        ("basic", true) => pyarrow_required(column),
+        ("basic", false) => pyarrow_nullable(column),
+        ("nested", false) =>
pyarrow_nested_nullable(column), + _ => unreachable!(), + }; + + let expected_statistics = match (type_, required) { + ("basic", true) => pyarrow_required_statistics(column), + ("basic", false) => pyarrow_nullable_statistics(column), + ("nested", false) => pyarrow_nested_nullable_statistics(column), + _ => unreachable!(), + }; + + assert_eq!(expected.as_ref(), array.as_ref()); + assert_eq!(expected_statistics, statistics); + + Ok(()) +} + +#[test] +fn v1_int64_nullable() -> Result<()> { + test_pyarrow_integration(0, 1, "basic", false, false) +} + +#[test] +fn v1_int64_required() -> Result<()> { + test_pyarrow_integration(0, 1, "basic", false, true) +} + +#[test] +fn v1_float64_nullable() -> Result<()> { + test_pyarrow_integration(1, 1, "basic", false, false) +} + +#[test] +fn v1_utf8_nullable() -> Result<()> { + test_pyarrow_integration(2, 1, "basic", false, false) +} + +#[test] +fn v1_utf8_required() -> Result<()> { + test_pyarrow_integration(2, 1, "basic", false, true) +} + +#[test] +fn v1_boolean_nullable() -> Result<()> { + test_pyarrow_integration(3, 1, "basic", false, false) +} + +#[test] +fn v1_boolean_required() -> Result<()> { + test_pyarrow_integration(3, 1, "basic", false, true) +} + +#[test] +fn v1_timestamp_nullable() -> Result<()> { + test_pyarrow_integration(4, 1, "basic", false, false) +} + +#[test] +#[ignore] // pyarrow issue; see https://issues.apache.org/jira/browse/ARROW-12201 +fn v1_u32_nullable() -> Result<()> { + test_pyarrow_integration(5, 1, "basic", false, false) +} + +#[test] +fn v2_int64_nullable() -> Result<()> { + test_pyarrow_integration(0, 2, "basic", false, false) +} + +#[test] +fn v2_int64_nullable_dict() -> Result<()> { + test_pyarrow_integration(0, 2, "basic", true, false) +} + +#[test] +fn v1_int64_nullable_dict() -> Result<()> { + test_pyarrow_integration(0, 1, "basic", true, false) +} + +#[test] +fn v2_utf8_nullable() -> Result<()> { + test_pyarrow_integration(2, 2, "basic", false, false) +} + +#[test] +fn v2_utf8_required() -> Result<()> { + test_pyarrow_integration(2, 2, "basic", false, true) +} + +#[test] +fn v2_utf8_nullable_dict() -> Result<()> { + test_pyarrow_integration(2, 2, "basic", true, false) +} + +#[test] +fn v1_utf8_nullable_dict() -> Result<()> { + test_pyarrow_integration(2, 1, "basic", true, false) +} + +#[test] +fn v2_boolean_nullable() -> Result<()> { + test_pyarrow_integration(3, 2, "basic", false, false) +} + +#[test] +fn v2_boolean_required() -> Result<()> { + test_pyarrow_integration(3, 2, "basic", false, true) +} + +#[test] +fn v2_nested_int64_nullable() -> Result<()> { + test_pyarrow_integration(0, 2, "nested", false, false) +} + +#[test] +fn v1_nested_int64_nullable() -> Result<()> { + test_pyarrow_integration(0, 1, "nested", false, false) +} + +#[test] +fn v2_nested_int64_nullable_required() -> Result<()> { + test_pyarrow_integration(1, 2, "nested", false, false) +} + +#[test] +fn v1_nested_int64_nullable_required() -> Result<()> { + test_pyarrow_integration(1, 1, "nested", false, false) +} + +#[test] +fn v2_nested_int64_required_required() -> Result<()> { + test_pyarrow_integration(2, 2, "nested", false, false) +} + +#[test] +fn v1_nested_int64_required_required() -> Result<()> { + test_pyarrow_integration(2, 1, "nested", false, false) +} + +#[test] +fn v2_nested_i16() -> Result<()> { + test_pyarrow_integration(3, 2, "nested", false, false) +} + +#[test] +fn v1_nested_i16() -> Result<()> { + test_pyarrow_integration(3, 1, "nested", false, false) +} + +#[test] +fn v2_nested_bool() -> Result<()> { + 
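+    // column 4 of the nested fixture is a list of booleans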
test_pyarrow_integration(4, 2, "nested", false, false) +} + +#[test] +fn v1_nested_bool() -> Result<()> { + test_pyarrow_integration(4, 1, "nested", false, false) +} + +#[test] +fn v2_nested_utf8() -> Result<()> { + test_pyarrow_integration(5, 2, "nested", false, false) +} + +#[test] +fn v1_nested_utf8() -> Result<()> { + test_pyarrow_integration(5, 1, "nested", false, false) +} + +#[test] +fn v2_nested_large_binary() -> Result<()> { + test_pyarrow_integration(6, 2, "nested", false, false) +} + +#[test] +fn v1_nested_large_binary() -> Result<()> { + test_pyarrow_integration(6, 1, "nested", false, false) +} + +/*#[test] +fn v2_nested_nested() { + let _ = test_pyarrow_integration(7, 1, "nested",false, false); +}*/ + +#[test] +fn all_types() -> Result<()> { + let path = "testing/parquet-testing/data/alltypes_plain.parquet"; + let reader = std::fs::File::open(path)?; + + let reader = RecordReader::try_new(reader, None, None, Arc::new(|_, _| true), None)?; + + let batches = reader.collect::>>()?; + assert_eq!(batches.len(), 1); + + let result = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(result, &Int32Array::from_slice([4, 5, 6, 7, 2, 3, 0, 1])); + + let result = batches[0] + .column(6) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + result, + &Float32Array::from_slice([0.0, 1.1, 0.0, 1.1, 0.0, 1.1, 0.0, 1.1]) + ); + + let result = batches[0] + .column(9) + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!( + result, + &BinaryArray::::from_slice([[48], [49], [48], [49], [48], [49], [48], [49]]) + ); + + Ok(()) +} diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs new file mode 100644 index 00000000000..3e3b1d14618 --- /dev/null +++ b/tests/it/io/parquet/write.rs @@ -0,0 +1,368 @@ +use std::io::Cursor; + +use arrow2::io::parquet::write::*; +use arrow2::{error::Result, record_batch::RecordBatch}; + +use super::*; + +fn round_trip( + column: usize, + nullable: bool, + nested: bool, + version: Version, + compression: Compression, + encoding: Encoding, +) -> Result<()> { + let (array, statistics) = if nested { + ( + pyarrow_nested_nullable(column), + pyarrow_nested_nullable_statistics(column), + ) + } else if nullable { + ( + pyarrow_nullable(column), + pyarrow_nullable_statistics(column), + ) + } else { + ( + pyarrow_required(column), + pyarrow_required_statistics(column), + ) + }; + let array: Arc = array.into(); + + let field = Field::new("a1", array.data_type().clone(), nullable); + let schema = Schema::new(vec![field]); + + let options = WriteOptions { + write_statistics: true, + compression, + version, + }; + + let parquet_schema = to_parquet_schema(&schema)?; + + let iter = vec![RecordBatch::try_new( + Arc::new(schema.clone()), + vec![array.clone()], + )]; + + let row_groups = RowGroupIterator::try_new(iter.into_iter(), &schema, options, vec![encoding])?; + + let mut writer = Cursor::new(vec![]); + write_file( + &mut writer, + row_groups, + &schema, + parquet_schema, + options, + None, + )?; + + let data = writer.into_inner(); + + let (result, stats) = read_column(&mut Cursor::new(data), 0, 0)?; + assert_eq!(array.as_ref(), result.as_ref()); + assert_eq!(statistics.as_ref(), stats.as_ref()); + Ok(()) +} + +#[test] +fn test_int64_optional_v1() -> Result<()> { + round_trip( + 0, + true, + false, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_int64_required_v1() -> Result<()> { + round_trip( + 0, + false, + false, + Version::V1, + Compression::Uncompressed, + 
Encoding::Plain, + ) +} + +#[test] +fn test_int64_optional_v2() -> Result<()> { + round_trip( + 0, + true, + false, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_int64_optional_v2_compressed() -> Result<()> { + round_trip( + 0, + true, + false, + Version::V2, + Compression::Snappy, + Encoding::Plain, + ) +} + +#[test] +fn test_utf8_optional_v1() -> Result<()> { + round_trip( + 2, + true, + false, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_utf8_required_v1() -> Result<()> { + round_trip( + 2, + false, + false, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_utf8_optional_v2() -> Result<()> { + round_trip( + 2, + true, + false, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_utf8_required_v2() -> Result<()> { + round_trip( + 2, + false, + false, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_utf8_optional_v2_compressed() -> Result<()> { + round_trip( + 2, + true, + false, + Version::V2, + Compression::Snappy, + Encoding::Plain, + ) +} + +#[test] +fn test_utf8_required_v2_compressed() -> Result<()> { + round_trip( + 2, + false, + false, + Version::V2, + Compression::Snappy, + Encoding::Plain, + ) +} + +#[test] +fn test_bool_optional_v1() -> Result<()> { + round_trip( + 3, + true, + false, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_bool_required_v1() -> Result<()> { + round_trip( + 3, + false, + false, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_bool_optional_v2_uncompressed() -> Result<()> { + round_trip( + 3, + true, + false, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_bool_required_v2_uncompressed() -> Result<()> { + round_trip( + 3, + false, + false, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_bool_required_v2_compressed() -> Result<()> { + round_trip( + 3, + false, + false, + Version::V2, + Compression::Snappy, + Encoding::Plain, + ) +} + +#[test] +fn test_list_int64_optional_v2() -> Result<()> { + round_trip( + 0, + true, + true, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_list_int64_optional_v1() -> Result<()> { + round_trip( + 0, + true, + true, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_list_bool_optional_v2() -> Result<()> { + round_trip( + 4, + true, + true, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_list_bool_optional_v1() -> Result<()> { + round_trip( + 4, + true, + true, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_list_utf8_optional_v2() -> Result<()> { + round_trip( + 5, + true, + true, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_list_utf8_optional_v1() -> Result<()> { + round_trip( + 5, + true, + true, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_list_large_binary_optional_v2() -> Result<()> { + round_trip( + 6, + true, + true, + Version::V2, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_list_large_binary_optional_v1() -> Result<()> { + round_trip( + 6, + true, + true, + Version::V1, + Compression::Uncompressed, + Encoding::Plain, + ) +} + +#[test] +fn test_utf8_optional_v2_delta() 
-> Result<()> {
+    round_trip(
+        2,
+        true,
+        false,
+        Version::V2,
+        Compression::Uncompressed,
+        Encoding::DeltaLengthByteArray,
+    )
+}
+
+#[test]
+fn test_i32_optional_v2_dict() -> Result<()> {
+    round_trip(
+        6,
+        true,
+        false,
+        Version::V2,
+        Compression::Uncompressed,
+        Encoding::RleDictionary,
+    )
+}
diff --git a/tests/it/io/print.rs b/tests/it/io/print.rs
new file mode 100644
index 00000000000..ad15ee6dcd1
--- /dev/null
+++ b/tests/it/io/print.rs
@@ -0,0 +1,393 @@
+use std::sync::Arc;
+
+use arrow2::{
+    array::*, bitmap::Bitmap, buffer::Buffer, datatypes::*, error::Result, io::print::*,
+    record_batch::RecordBatch,
+};
+
+#[test]
+fn test_write() -> Result<()> {
+    // define a schema.
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, true),
+        Field::new("b", DataType::Int32, true),
+    ]));
+
+    // define data.
+    let batch = RecordBatch::try_new(
+        schema,
+        vec![
+            Arc::new(Utf8Array::<i32>::from(vec![
+                Some("a"),
+                Some("b"),
+                None,
+                Some("d"),
+            ])),
+            Arc::new(Int32Array::from(vec![Some(1), None, Some(10), Some(100)])),
+        ],
+    )?;
+
+    let table = write(&[batch]);
+
+    let expected = vec![
+        "+---+-----+",
+        "| a | b   |",
+        "+---+-----+",
+        "| a | 1   |",
+        "| b |     |",
+        "|   | 10  |",
+        "| d | 100 |",
+        "+---+-----+",
+    ];
+
+    let actual: Vec<&str> = table.lines().collect();
+
+    assert_eq!(expected, actual, "Actual result:\n{}", table);
+
+    Ok(())
+}
+
+#[test]
+fn test_write_null() -> Result<()> {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Utf8, true),
+        Field::new("b", DataType::Int32, true),
+        Field::new("c", DataType::Null, true),
+    ]));
+
+    let num_rows = 4;
+    let arrays = schema
+        .fields()
+        .iter()
+        .map(|f| new_null_array(f.data_type().clone(), num_rows).into())
+        .collect();
+
+    // define data (null)
+    let batch = RecordBatch::try_new(schema, arrays)?;
+
+    let table = write(&[batch]);
+
+    let expected = vec![
+        "+---+---+---+",
+        "| a | b | c |",
+        "+---+---+---+",
+        "|   |   |   |",
+        "|   |   |   |",
+        "|   |   |   |",
+        "|   |   |   |",
+        "+---+---+---+",
+    ];
+
+    let actual: Vec<&str> = table.lines().collect();
+
+    assert_eq!(expected, actual, "Actual result:\n{:#?}", table);
+    Ok(())
+}
+
+#[test]
+fn test_write_dictionary() -> Result<()> {
+    // define a schema.
+    let field_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
+    let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)]));
+
+    let mut array = MutableDictionaryArray::<i32, MutableUtf8Array<i32>>::new();
+
+    array.try_extend(vec![Some("one"), None, Some("three")])?;
+    let array = array.into_arc();
+
+    let batch = RecordBatch::try_new(schema, vec![array])?;
+
+    let table = write(&[batch]);
+
+    let expected = vec![
+        "+-------+",
+        "| d1    |",
+        "+-------+",
+        "| one   |",
+        "|       |",
+        "| three |",
+        "+-------+",
+    ];
+
+    let actual: Vec<&str> = table.lines().collect();
+
+    assert_eq!(expected, actual, "Actual result:\n{}", table);
+
+    Ok(())
+}
+
+/// Generate an array of type $ty and logical type $datatype holding the numeric
+/// value $value, and compare $EXPECTED_RESULT to the output of formatting that
+/// array with `write`
+macro_rules! 
check_datetime { + ($ty:ty, $datatype:expr, $value:expr, $EXPECTED_RESULT:expr) => { + let array = Arc::new(PrimitiveArray::<$ty>::from(&[Some($value), None]).to($datatype)); + + let schema = Arc::new(Schema::new(vec![Field::new( + "f", + array.data_type().clone(), + true, + )])); + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); + + let table = write(&[batch]); + + let expected = $EXPECTED_RESULT; + let actual: Vec<&str> = table.lines().collect(); + + assert_eq!(expected, actual, "Actual result:\n\n{:#?}\n\n", actual); + }; +} + +#[test] +fn test_write_timestamp_second() { + let expected = vec![ + "+---------------------+", + "| f |", + "+---------------------+", + "| 1970-05-09 14:25:11 |", + "| |", + "+---------------------+", + ]; + check_datetime!( + i64, + DataType::Timestamp(TimeUnit::Second, None), + 11111111, + expected + ); +} + +#[test] +fn test_write_timestamp_second_with_tz() { + let expected = vec![ + "+-------------------------+", + "| f |", + "+-------------------------+", + "| 1970-05-09 14:25:11 UTC |", + "| |", + "+-------------------------+", + ]; + check_datetime!( + i64, + DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), + 11111111, + expected + ); +} + +#[test] +fn test_write_timestamp_millisecond() { + let expected = vec![ + "+-------------------------+", + "| f |", + "+-------------------------+", + "| 1970-01-01 03:05:11.111 |", + "| |", + "+-------------------------+", + ]; + check_datetime!( + i64, + DataType::Timestamp(TimeUnit::Millisecond, None), + 11111111, + expected + ); +} + +#[test] +fn test_write_timestamp_microsecond() { + let expected = vec![ + "+----------------------------+", + "| f |", + "+----------------------------+", + "| 1970-01-01 00:00:11.111111 |", + "| |", + "+----------------------------+", + ]; + check_datetime!( + i64, + DataType::Timestamp(TimeUnit::Microsecond, None), + 11111111, + expected + ); +} + +#[test] +fn test_write_timestamp_nanosecond() { + let expected = vec![ + "+-------------------------------+", + "| f |", + "+-------------------------------+", + "| 1970-01-01 00:00:00.011111111 |", + "| |", + "+-------------------------------+", + ]; + check_datetime!( + i64, + DataType::Timestamp(TimeUnit::Nanosecond, None), + 11111111, + expected + ); +} + +#[test] +fn test_write_date_32() { + let expected = vec![ + "+------------+", + "| f |", + "+------------+", + "| 1973-05-19 |", + "| |", + "+------------+", + ]; + check_datetime!(i32, DataType::Date32, 1234, expected); +} + +#[test] +fn test_write_date_64() { + let expected = vec![ + "+------------+", + "| f |", + "+------------+", + "| 2005-03-18 |", + "| |", + "+------------+", + ]; + check_datetime!(i64, DataType::Date64, 1111111100000, expected); +} + +#[test] +fn test_write_time_32_second() { + let expected = vec![ + "+----------+", + "| f |", + "+----------+", + "| 00:18:31 |", + "| |", + "+----------+", + ]; + check_datetime!(i32, DataType::Time32(TimeUnit::Second), 1111, expected); +} + +#[test] +fn test_write_time_32_millisecond() { + let expected = vec![ + "+--------------+", + "| f |", + "+--------------+", + "| 03:05:11.111 |", + "| |", + "+--------------+", + ]; + check_datetime!( + i32, + DataType::Time32(TimeUnit::Millisecond), + 11111111, + expected + ); +} + +#[test] +fn test_write_time_64_microsecond() { + let expected = vec![ + "+-----------------+", + "| f |", + "+-----------------+", + "| 00:00:11.111111 |", + "| |", + "+-----------------+", + ]; + check_datetime!( + i64, + DataType::Time64(TimeUnit::Microsecond), + 11111111, + 
expected + ); +} + +#[test] +fn test_write_time_64_nanosecond() { + let expected = vec![ + "+--------------------+", + "| f |", + "+--------------------+", + "| 00:00:00.011111111 |", + "| |", + "+--------------------+", + ]; + check_datetime!( + i64, + DataType::Time64(TimeUnit::Nanosecond), + 11111111, + expected + ); +} + +#[test] +fn test_write_struct() -> Result<()> { + let fields = vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Utf8, true), + ]; + let values = vec![ + Arc::new(Int32Array::from(&[Some(1), None, Some(2)])) as Arc, + Arc::new(Utf8Array::::from(&[Some("a"), Some("b"), Some("c")])) as Arc, + ]; + + let validity = Some(Bitmap::from(&[true, false, true])); + + let array = StructArray::from_data(fields, values, validity); + + let schema = Schema::new(vec![Field::new("a", array.data_type().clone(), true)]); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?; + + let table = write(&[batch]); + + let expected = vec![ + "+--------------+", + "| a |", + "+--------------+", + "| {a: 1, b: a} |", + "| |", + "| {a: 2, b: c} |", + "+--------------+", + ]; + + let actual: Vec<&str> = table.lines().collect(); + + assert_eq!(expected, actual, "Actual result:\n{}", table); + + Ok(()) +} + +#[test] +fn test_write_union() -> Result<()> { + let fields = vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Utf8, true), + ]; + let data_type = DataType::Union(fields, None, true); + let types = Buffer::from(&[0, 0, 1]); + let fields = vec![ + Arc::new(Int32Array::from(&[Some(1), None, Some(2)])) as Arc, + Arc::new(Utf8Array::::from(&[Some("a"), Some("b"), Some("c")])) as Arc, + ]; + + let array = UnionArray::from_data(data_type, types, fields, None); + + let schema = Schema::new(vec![Field::new("a", array.data_type().clone(), true)]); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?; + + let table = write(&[batch]); + + let expected = vec![ + "+---+", "| a |", "+---+", "| 1 |", "| |", "| c |", "+---+", + ]; + + let actual: Vec<&str> = table.lines().collect(); + + assert_eq!(expected, actual, "Actual result:\n{}", table); + + Ok(()) +} diff --git a/tests/it/main.rs b/tests/it/main.rs new file mode 100644 index 00000000000..c7074cf22d0 --- /dev/null +++ b/tests/it/main.rs @@ -0,0 +1,7 @@ +mod alloc; +mod bitmap; +mod buffer; +mod ffi; + +mod io; +mod test_util; diff --git a/src/util/test_util.rs b/tests/it/test_util.rs similarity index 100% rename from src/util/test_util.rs rename to tests/it/test_util.rs