Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added Utf8Sequence
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Apr 16, 2022
1 parent 60130f4 commit ed22b6f
Show file tree
Hide file tree
Showing 24 changed files with 642 additions and 3 deletions.
226 changes: 226 additions & 0 deletions src/array/display.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
use crate::{
array::*,
datatypes::{DataType, IntervalUnit, TimeUnit},
temporal_conversions,
};

/// Expands to a boxed closure mapping a row index to a `String`.
///
/// `$source` is downcast (through `as_any`) to the concrete array type `$concrete`; the
/// expansion panics via `unwrap` if the downcast fails. For every requested row, `$render` is
/// applied to `value(row)` of the concrete array and its result is formatted with `{}`.
macro_rules! dyn_display {
    ($source:expr, $concrete:ty, $render:expr) => {{
        let typed = $source.as_any().downcast_ref::<$concrete>().unwrap();
        Box::new(move |index: usize| {
            let shown = $render(typed.value(index));
            format!("{}", shown)
        })
    }};
}

/// Shorthand for `dyn_display!` when the concrete array is a `PrimitiveArray<$native>`.
///
/// All three arguments are forwarded unchanged apart from wrapping the native type.
macro_rules! dyn_primitive {
    ($source:expr, $native:ty, $render:expr) => {{
        dyn_display!($source, PrimitiveArray<$native>, $render)
    }};
}

/// Renders a binary value as the concatenation of its bytes, each written as `0b` followed by
/// eight zero-padded binary digits.
fn binary_to_string(bytes: &[u8]) -> String {
    bytes.iter().map(|byte| format!("{:#010b}", byte)).collect()
}

/// Renders the values of a list item as `[v0, v1, ...]` using [`get_value_display`]
/// (i.e. nulls inside `values` are not taken into account).
fn list_to_string(values: Box<dyn Array>) -> String {
    let display = get_value_display(values.as_ref());
    let items = (0..values.len()).map(display).collect::<Vec<String>>();
    format!("[{}]", items.join(", "))
}

/// Renders an unscaled decimal `value` with `scale` fractional digits.
///
/// The fractional part is zero-padded to `scale` digits and the sign is emitted once, in front
/// of the integer part, so that e.g. `105` with scale `2` is `1.05` and `-5` is `-0.05`.
fn decimal_to_string(value: i128, scale: u32) -> String {
    let factor = 10u128.pow(scale);
    let sign = if value < 0 { "-" } else { "" };
    // `unsigned_abs` is also well-defined for `i128::MIN`.
    let magnitude = value.unsigned_abs();
    format!(
        "{}{}.{:0width$}",
        sign,
        magnitude / factor,
        magnitude % factor,
        width = scale as usize
    )
}

/// Returns a function of index returning the string representation of the _value_ of `array`.
/// This does not take nulls into account.
///
/// The concrete array type is selected from `array.data_type()` and the array is downcast once,
/// up front; the returned closure borrows `array` for `'a`.
///
/// # Panics
/// This function, or the closure it returns, panics when:
/// * the data type is `Float16`, `Time32` with a unit other than second/millisecond or
///   `Time64` with a unit other than microsecond/nanosecond (`unreachable!`);
/// * the data type is `Map` or `Extension` (`todo!`);
/// * a `Timestamp` timezone cannot be parsed as an offset (and, with the `chrono-tz` feature,
///   neither as a timezone name);
/// * `array` is not of the concrete type implied by its data type (failed downcast).
pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> String + 'a> {
    use DataType::*;
    match array.data_type() {
        Null => Box::new(|_: usize| "".to_string()),
        Boolean => {
            let a = array.as_any().downcast_ref::<BooleanArray>().unwrap();
            Box::new(move |row: usize| format!("{}", a.value(row)))
        }
        Int8 => dyn_primitive!(array, i8, |x| x),
        Int16 => dyn_primitive!(array, i16, |x| x),
        Int32 => dyn_primitive!(array, i32, |x| x),
        Int64 => dyn_primitive!(array, i64, |x| x),
        UInt8 => dyn_primitive!(array, u8, |x| x),
        UInt16 => dyn_primitive!(array, u16, |x| x),
        UInt32 => dyn_primitive!(array, u32, |x| x),
        UInt64 => dyn_primitive!(array, u64, |x| x),
        Float16 => unreachable!(),
        Float32 => dyn_primitive!(array, f32, |x| x),
        Float64 => dyn_primitive!(array, f64, |x| x),
        Date32 => dyn_primitive!(array, i32, temporal_conversions::date32_to_date),
        Date64 => dyn_primitive!(array, i64, temporal_conversions::date64_to_date),
        Time32(TimeUnit::Second) => {
            dyn_primitive!(array, i32, temporal_conversions::time32s_to_time)
        }
        Time32(TimeUnit::Millisecond) => {
            dyn_primitive!(array, i32, temporal_conversions::time32ms_to_time)
        }
        Time32(_) => unreachable!(), // remaining are not valid
        Time64(TimeUnit::Microsecond) => {
            dyn_primitive!(array, i64, temporal_conversions::time64us_to_time)
        }
        Time64(TimeUnit::Nanosecond) => {
            dyn_primitive!(array, i64, temporal_conversions::time64ns_to_time)
        }
        Time64(_) => unreachable!(), // remaining are not valid
        Timestamp(time_unit, tz) => {
            if let Some(tz) = tz {
                // First try to interpret the timezone as a fixed offset; only fall back to a
                // named timezone when the `chrono-tz` feature is available.
                let timezone = temporal_conversions::parse_offset(tz);
                match timezone {
                    Ok(timezone) => {
                        dyn_primitive!(array, i64, |time| {
                            temporal_conversions::timestamp_to_datetime(time, *time_unit, &timezone)
                        })
                    }
                    #[cfg(feature = "chrono-tz")]
                    Err(_) => {
                        let timezone = temporal_conversions::parse_offset_tz(tz).unwrap();
                        dyn_primitive!(array, i64, |time| {
                            temporal_conversions::timestamp_to_datetime(time, *time_unit, &timezone)
                        })
                    }
                    #[cfg(not(feature = "chrono-tz"))]
                    _ => panic!(
                        "Invalid Offset format (must be [-]00:00) or chrono-tz feature not active"
                    ),
                }
            } else {
                dyn_primitive!(array, i64, |time| {
                    temporal_conversions::timestamp_to_naive_datetime(time, *time_unit)
                })
            }
        }
        Interval(IntervalUnit::YearMonth) => {
            dyn_primitive!(array, i32, |x| format!("{}m", x))
        }
        Interval(IntervalUnit::DayTime) => {
            dyn_primitive!(array, days_ms, |x: days_ms| format!(
                "{}d{}ms",
                x.days(),
                x.milliseconds()
            ))
        }
        Interval(IntervalUnit::MonthDayNano) => {
            dyn_primitive!(array, months_days_ns, |x: months_days_ns| format!(
                "{}m{}d{}ns",
                x.months(),
                x.days(),
                x.ns()
            ))
        }
        Duration(TimeUnit::Second) => dyn_primitive!(array, i64, |x| format!("{}s", x)),
        Duration(TimeUnit::Millisecond) => dyn_primitive!(array, i64, |x| format!("{}ms", x)),
        Duration(TimeUnit::Microsecond) => dyn_primitive!(array, i64, |x| format!("{}us", x)),
        Duration(TimeUnit::Nanosecond) => dyn_primitive!(array, i64, |x| format!("{}ns", x)),
        Binary => dyn_display!(array, BinaryArray<i32>, binary_to_string),
        LargeBinary => dyn_display!(array, BinaryArray<i64>, binary_to_string),
        FixedSizeBinary(_) => dyn_display!(array, FixedSizeBinaryArray, binary_to_string),
        Utf8 => dyn_display!(array, Utf8Array<i32>, |x| x),
        LargeUtf8 => dyn_display!(array, Utf8Array<i64>, |x| x),
        Utf8Sequence => dyn_display!(array, StringSequenceArray<i32>, |x| x),
        LargeUtf8Sequence => dyn_display!(array, StringSequenceArray<i64>, |x| x),
        Decimal(_, scale) => {
            // The number 999.99 has a precision of 5 and scale of 2
            let scale = *scale as u32;
            dyn_primitive!(array, i128, |x: i128| decimal_to_string(x, scale))
        }
        List(_) => dyn_display!(array, ListArray<i32>, list_to_string),
        FixedSizeList(_, _) => dyn_display!(array, FixedSizeListArray, list_to_string),
        LargeList(_) => dyn_display!(array, ListArray<i64>, list_to_string),
        Dictionary(key_type, ..) => match_integer_type!(key_type, |$T| {
            let a = array
                .as_any()
                .downcast_ref::<DictionaryArray<$T>>()
                .unwrap();
            let keys = a.keys();
            // The values are displayed null-aware; a null key displays as an empty string.
            let display = get_display(a.values().as_ref());
            Box::new(move |row: usize| {
                if keys.is_null(row) {
                    "".to_string()
                } else {
                    display(keys.value(row) as usize)
                }
            })
        }),
        Map(_, _) => todo!(),
        Struct(_) => {
            let a = array.as_any().downcast_ref::<StructArray>().unwrap();
            let displays = a
                .values()
                .iter()
                .map(|x| get_value_display(x.as_ref()))
                .collect::<Vec<_>>();
            Box::new(move |row: usize| {
                // Rendered as `{name0: value0, name1: value1}`; `{}` when there are no fields.
                let entries = displays
                    .iter()
                    .zip(a.fields().iter().map(|f| &f.name))
                    .map(|(f, name)| format!("{}: {}", name, f(row)))
                    .collect::<Vec<String>>();
                format!("{{{}}}", entries.join(", "))
            })
        }
        Union(_, _, _) => {
            let array = array.as_any().downcast_ref::<UnionArray>().unwrap();
            Box::new(move |row: usize| {
                // Resolve which child field holds this row and display that child's slot.
                let (field, index) = array.index(row);
                get_display(array.fields()[field].as_ref())(index)
            })
        }
        Extension(_, _, _) => todo!(),
    }
}

/// Returns a closure that maps an index to the string representation of that slot of `array`.
///
/// Unlike [`get_value_display`], validity is consulted first: a null slot yields an empty
/// string, any other slot is rendered by the closure obtained from [`get_value_display`].
pub fn get_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> String + 'a> {
    let render_value = get_value_display(array);
    Box::new(move |index| match array.is_null(index) {
        true => "".to_string(),
        false => render_value(index),
    })
}
11 changes: 11 additions & 0 deletions src/array/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ mod primitive;
mod struct_;
mod union;
mod utf8;
mod utf8_sequence;

impl PartialEq for dyn Array + '_ {
fn eq(&self, that: &dyn Array) -> bool {
Expand Down Expand Up @@ -201,6 +202,16 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool {
let rhs = rhs.as_any().downcast_ref().unwrap();
utf8::equal::<i64>(lhs, rhs)
}
Utf8Sequence => {
let lhs = lhs.as_any().downcast_ref().unwrap();
let rhs = rhs.as_any().downcast_ref().unwrap();
utf8_sequence::equal::<i32>(lhs, rhs)
}
LargeUtf8Sequence => {
let lhs = lhs.as_any().downcast_ref().unwrap();
let rhs = rhs.as_any().downcast_ref().unwrap();
utf8_sequence::equal::<i64>(lhs, rhs)
}
Binary => {
let lhs = lhs.as_any().downcast_ref().unwrap();
let rhs = rhs.as_any().downcast_ref().unwrap();
Expand Down
5 changes: 5 additions & 0 deletions src/array/equal/utf8_sequence.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
use crate::array::{Offset, StringSequenceArray};

/// Returns whether two [`StringSequenceArray`]s are equal.
///
/// The arrays are equal when their data types match, their lengths match and their iterators
/// yield equal items in the same order. The checks run in that order and stop at the first
/// mismatch.
pub(super) fn equal<O: Offset>(lhs: &StringSequenceArray<O>, rhs: &StringSequenceArray<O>) -> bool {
    if lhs.data_type() != rhs.data_type() {
        return false;
    }
    if lhs.len() != rhs.len() {
        return false;
    }
    lhs.iter().eq(rhs.iter())
}
2 changes: 2 additions & 0 deletions src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ pub fn offset_buffers_children_dictionary(array: &dyn Array) -> BuffersChildren
FixedSizeBinary => ffi_dyn!(array, FixedSizeBinaryArray),
Utf8 => ffi_dyn!(array, Utf8Array::<i32>),
LargeUtf8 => ffi_dyn!(array, Utf8Array::<i64>),
Utf8Sequence => todo!("Arrow does not yet support exporting sequence views via FFI"),
LargeUtf8Sequence => todo!("Arrow does not yet support exporting sequence views via FFI"),
List => ffi_dyn!(array, ListArray::<i32>),
LargeList => ffi_dyn!(array, ListArray::<i64>),
FixedSizeList => ffi_dyn!(array, FixedSizeListArray),
Expand Down
1 change: 1 addition & 0 deletions src/array/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ pub fn get_value_display<'a, F: Write + 'a>(
f,
)
}),
Utf8Sequence | LargeUtf8Sequence => todo!(),
LargeUtf8 => Box::new(|f, index| {
super::utf8::fmt::write_value::<i64, _>(
array.as_any().downcast_ref().unwrap(),
Expand Down
1 change: 1 addition & 0 deletions src/array/growable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,5 +138,6 @@ pub fn make_growable<'a>(
))
})
}
_ => todo!("Sequence views"),
}
}
10 changes: 10 additions & 0 deletions src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ impl std::fmt::Debug for dyn Array + '_ {
FixedSizeBinary => fmt_dyn!(self, FixedSizeBinaryArray, f),
Utf8 => fmt_dyn!(self, Utf8Array::<i32>, f),
LargeUtf8 => fmt_dyn!(self, Utf8Array::<i64>, f),
Utf8Sequence => fmt_dyn!(self, StringSequenceArray::<i32>, f),
LargeUtf8Sequence => fmt_dyn!(self, StringSequenceArray::<i64>, f),
List => fmt_dyn!(self, ListArray::<i32>, f),
LargeList => fmt_dyn!(self, ListArray::<i64>, f),
FixedSizeList => fmt_dyn!(self, FixedSizeListArray, f),
Expand Down Expand Up @@ -256,6 +258,8 @@ pub fn new_empty_array(data_type: DataType) -> Box<dyn Array> {
FixedSizeBinary => Box::new(FixedSizeBinaryArray::new_empty(data_type)),
Utf8 => Box::new(Utf8Array::<i32>::new_empty(data_type)),
LargeUtf8 => Box::new(Utf8Array::<i64>::new_empty(data_type)),
Utf8Sequence => Box::new(StringSequenceArray::<i32>::new_empty(data_type)),
LargeUtf8Sequence => Box::new(StringSequenceArray::<i64>::new_empty(data_type)),
List => Box::new(ListArray::<i32>::new_empty(data_type)),
LargeList => Box::new(ListArray::<i64>::new_empty(data_type)),
FixedSizeList => Box::new(FixedSizeListArray::new_empty(data_type)),
Expand Down Expand Up @@ -286,6 +290,8 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box<dyn Array> {
FixedSizeBinary => Box::new(FixedSizeBinaryArray::new_null(data_type, length)),
Utf8 => Box::new(Utf8Array::<i32>::new_null(data_type, length)),
LargeUtf8 => Box::new(Utf8Array::<i64>::new_null(data_type, length)),
Utf8Sequence => Box::new(StringSequenceArray::<i32>::new_null(data_type, length)),
LargeUtf8Sequence => Box::new(StringSequenceArray::<i64>::new_null(data_type, length)),
List => Box::new(ListArray::<i32>::new_null(data_type, length)),
LargeList => Box::new(ListArray::<i64>::new_null(data_type, length)),
FixedSizeList => Box::new(FixedSizeListArray::new_null(data_type, length)),
Expand Down Expand Up @@ -324,6 +330,8 @@ pub fn clone(array: &dyn Array) -> Box<dyn Array> {
FixedSizeBinary => clone_dyn!(array, FixedSizeBinaryArray),
Utf8 => clone_dyn!(array, Utf8Array::<i32>),
LargeUtf8 => clone_dyn!(array, Utf8Array::<i64>),
Utf8Sequence => clone_dyn!(array, StringSequenceArray::<i32>),
LargeUtf8Sequence => clone_dyn!(array, StringSequenceArray::<i64>),
List => clone_dyn!(array, ListArray::<i32>),
LargeList => clone_dyn!(array, ListArray::<i64>),
FixedSizeList => clone_dyn!(array, FixedSizeListArray),
Expand Down Expand Up @@ -356,6 +364,7 @@ mod map;
mod null;
mod primitive;
mod specification;
mod string_sequence;
mod struct_;
mod union;
mod utf8;
Expand All @@ -379,6 +388,7 @@ pub use list::{ListArray, ListValuesIter, MutableListArray};
pub use map::MapArray;
pub use null::NullArray;
pub use primitive::*;
pub use string_sequence::StringSequenceArray;
pub use struct_::StructArray;
pub use union::UnionArray;
pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter};
Expand Down
27 changes: 27 additions & 0 deletions src/array/string_sequence/fmt.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
use std::fmt::{Debug, Formatter, Result, Write};

use super::super::fmt::write_vec;
use super::super::Offset;
use super::StringSequenceArray;

/// Writes the value of `array` at position `index` into `f` using its `{}` formatting.
///
/// Validity is not consulted here; only `array.value(index)` is written.
pub fn write_value<O: Offset, W: Write>(
    array: &StringSequenceArray<O>,
    index: usize,
    f: &mut W,
) -> Result {
    let item = array.value(index);
    f.write_fmt(format_args!("{}", item))
}

/// Debug output: the array's type name (prefixed with `Large` for large offsets) followed by
/// its items as rendered by `write_vec`, with `"None"` passed as the placeholder for null slots.
impl<O: Offset> Debug for StringSequenceArray<O> {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let type_name = match O::is_large() {
            true => "LargeStringSequenceArray",
            false => "StringSequenceArray",
        };
        write!(f, "{}", type_name)?;

        // Per-item writer handed to `write_vec`; it renders a single slot of `self`.
        let write_item = |out: &mut Formatter, position| write_value(self, position, out);
        write_vec(f, write_item, self.validity(), self.len(), "None", false)
    }
}
Loading

0 comments on commit ed22b6f

Please sign in to comment.