Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Moved is_ordered (#711)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Dec 26, 2021
1 parent 64043a0 commit f33a41f
Show file tree
Hide file tree
Showing 35 changed files with 110 additions and 181 deletions.
2 changes: 1 addition & 1 deletion arrow-parquet-integration-testing/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ fn main() -> Result<()> {
.fields()
.iter()
.map(|x| match x.data_type() {
DataType::Dictionary(_, _) => Encoding::RleDictionary,
DataType::Dictionary(..) => Encoding::RleDictionary,
DataType::Utf8 | DataType::LargeUtf8 => {
if utf8_encoding == "delta" {
Encoding::DeltaLengthByteArray
Expand Down
5 changes: 3 additions & 2 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ impl<K: DictionaryKey> DictionaryArray<K> {

/// The canonical method to create a new [`DictionaryArray`].
pub fn from_data(keys: PrimitiveArray<K>, values: Arc<dyn Array>) -> Self {
let data_type = DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()));
let data_type =
DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()), false);

Self {
data_type,
Expand Down Expand Up @@ -165,7 +166,7 @@ impl<K: DictionaryKey> DictionaryArray<K> {
impl<K: DictionaryKey> DictionaryArray<K> {
pub(crate) fn get_child(data_type: &DataType) -> &DataType {
match data_type {
DataType::Dictionary(_, values) => values.as_ref(),
DataType::Dictionary(_, values, _) => values.as_ref(),
DataType::Extension(_, inner, _) => Self::get_child(inner),
_ => panic!("DictionaryArray must be initialized with DataType::Dictionary"),
}
Expand Down
12 changes: 10 additions & 2 deletions src/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ impl<K: DictionaryKey, M: MutableArray> From<MutableDictionaryArray<K, M>> for D
impl<K: DictionaryKey, M: MutableArray> From<M> for MutableDictionaryArray<K, M> {
fn from(values: M) -> Self {
Self {
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand All @@ -44,7 +48,11 @@ impl<K: DictionaryKey, M: MutableArray + Default> MutableDictionaryArray<K, M> {
pub fn new() -> Self {
let values = M::default();
Self {
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand Down
2 changes: 1 addition & 1 deletion src/array/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> Strin
};
dyn_display!(array, ListArray<i64>, f)
}
Dictionary(key_type, _) => match_integer_type!(key_type, |$T| {
Dictionary(key_type, ..) => match_integer_type!(key_type, |$T| {
let a = array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
Expand Down
2 changes: 1 addition & 1 deletion src/array/ord.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result<DynComparato
(LargeUtf8, LargeUtf8) => compare_string::<i64>(left, right),
(Binary, Binary) => compare_binary::<i32>(left, right),
(LargeBinary, LargeBinary) => compare_binary::<i64>(left, right),
(Dictionary(key_type_lhs, _), Dictionary(key_type_rhs, _)) => {
(Dictionary(key_type_lhs, ..), Dictionary(key_type_rhs, ..)) => {
match (key_type_lhs, key_type_rhs) {
(IntegerType::UInt8, IntegerType::UInt8) => dyn_dict!(u8, left, right),
(IntegerType::UInt16, IntegerType::UInt16) => dyn_dict!(u16, left, right),
Expand Down
2 changes: 1 addition & 1 deletion src/compute/arithmetics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ pub fn neg(array: &dyn Array) -> Box<dyn Array> {

/// Whether [`neg`] is supported for a given [`DataType`]
pub fn can_neg(data_type: &DataType) -> bool {
if let DataType::Dictionary(_, values) = data_type.to_logical_type() {
if let DataType::Dictionary(_, values, _) = data_type.to_logical_type() {
return can_neg(values.as_ref());
}

Expand Down
2 changes: 1 addition & 1 deletion src/compute/cast/dictionary_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ pub(super) fn dictionary_cast_dyn<K: DictionaryKey>(
let values = array.values();

match to_type {
DataType::Dictionary(to_keys_type, to_values_type) => {
DataType::Dictionary(to_keys_type, to_values_type, _) => {
let values = cast(values.as_ref(), to_values_type, options)?.into();

// create the appropriate array type
Expand Down
82 changes: 13 additions & 69 deletions src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,40 +80,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
match (from_type, to_type) {
(
Null,
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
)
| (
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
Null,
) => true,
(Struct(_), _) => false,
Expand All @@ -127,11 +99,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(List(list_from), LargeList(list_to)) if list_from == list_to => true,
(LargeList(list_from), List(list_to)) if list_from == list_to => true,
(_, List(list_to)) => can_cast_types(from_type, list_to.data_type()),
(Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => {
(Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => {
can_cast_types(from_value_type, to_value_type)
}
(Dictionary(_, value_type), _) => can_cast_types(value_type, to_type),
(_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type),
(Dictionary(_, value_type, _), _) => can_cast_types(value_type, to_type),
(_, Dictionary(_, value_type, _)) => can_cast_types(from_type, value_type),

(_, Boolean) => is_numeric(from_type),
(Boolean, _) => {
Expand Down Expand Up @@ -376,40 +348,12 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
match (from_type, to_type) {
(
Null,
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
)
| (
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
Null,
) => Ok(new_null_array(to_type.clone(), array.len())),
(Struct(_), _) => Err(ArrowError::NotYetImplemented(
Expand Down Expand Up @@ -449,10 +393,10 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
Ok(Box::new(list_array))
}

(Dictionary(index_type, _), _) => match_integer_type!(index_type, |$T| {
(Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| {
dictionary_cast_dyn::<$T>(array, to_type, options)
}),
(_, Dictionary(index_type, value_type)) => match_integer_type!(index_type, |$T| {
(_, Dictionary(index_type, value_type, _)) => match_integer_type!(index_type, |$T| {
cast_to_dictionary::<$T>(array, value_type, options)
}),
(_, Boolean) => match from_type {
Expand Down
4 changes: 2 additions & 2 deletions src/compute/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ pub fn sort_to_indices<I: Index>(
))),
}
}
DataType::Dictionary(key_type, value_type) => match value_type.as_ref() {
DataType::Dictionary(key_type, value_type, _) => match value_type.as_ref() {
DataType::Utf8 => Ok(sort_dict::<I, i32>(values, key_type, options, limit)),
DataType::LargeUtf8 => Ok(sort_dict::<I, i64>(values, key_type, options, limit)),
t => Err(ArrowError::NotYetImplemented(format!(
Expand Down Expand Up @@ -282,7 +282,7 @@ pub fn can_sort(data_type: &DataType) -> bool {
| DataType::UInt64
)
}
DataType::Dictionary(_, value_type) => {
DataType::Dictionary(_, value_type, _) => {
matches!(*value_type.as_ref(), DataType::Utf8 | DataType::LargeUtf8)
}
_ => false,
Expand Down
2 changes: 1 addition & 1 deletion src/compute/take/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,6 @@ pub fn can_take(data_type: &DataType) -> bool {
| DataType::Struct(_)
| DataType::List(_)
| DataType::LargeList(_)
| DataType::Dictionary(_, _)
| DataType::Dictionary(..)
)
}
26 changes: 2 additions & 24 deletions src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ pub struct Field {
pub nullable: bool,
/// The dictionary id of this field (currently un-used)
pub dict_id: i64,
/// Whether the dictionary's values are ordered
pub dict_is_ordered: bool,
/// A map of key-value pairs containing additional custom meta data.
pub metadata: Option<BTreeMap<String, String>>,
}
Expand All @@ -44,7 +42,6 @@ impl std::hash::Hash for Field {
self.name.hash(state);
self.data_type.hash(state);
self.nullable.hash(state);
self.dict_is_ordered.hash(state);
self.metadata.hash(state);
}
}
Expand All @@ -54,7 +51,6 @@ impl PartialEq for Field {
self.name == other.name
&& self.data_type == other.data_type
&& self.nullable == other.nullable
&& self.dict_is_ordered == other.dict_is_ordered
&& self.metadata == other.metadata
}
}
Expand All @@ -67,7 +63,6 @@ impl Field {
data_type,
nullable,
dict_id: 0,
dict_is_ordered: false,
metadata: None,
}
}
Expand All @@ -78,14 +73,12 @@ impl Field {
data_type: DataType,
nullable: bool,
dict_id: i64,
dict_is_ordered: bool,
) -> Self {
Field {
name: name.into(),
data_type,
nullable,
dict_id,
dict_is_ordered,
metadata: None,
}
}
Expand All @@ -98,7 +91,6 @@ impl Field {
data_type: self.data_type,
nullable: self.nullable,
dict_id: self.dict_id,
dict_is_ordered: self.dict_is_ordered,
metadata: Some(metadata),
}
}
Expand Down Expand Up @@ -143,16 +135,7 @@ impl Field {
#[inline]
pub const fn dict_id(&self) -> Option<i64> {
match self.data_type {
DataType::Dictionary(_, _) => Some(self.dict_id),
_ => None,
}
}

/// Returns whether this [`Field`]'s dictionary is ordered, if this is a dictionary type.
#[inline]
pub const fn dict_is_ordered(&self) -> Option<bool> {
match self.data_type {
DataType::Dictionary(_, _) => Some(self.dict_is_ordered),
DataType::Dictionary(_, _, _) => Some(self.dict_id),
_ => None,
}
}
Expand Down Expand Up @@ -197,11 +180,6 @@ impl Field {
"Fail to merge schema Field due to conflicting dict_id".to_string(),
));
}
if from.dict_is_ordered != self.dict_is_ordered {
return Err(ArrowError::InvalidArgumentError(
"Fail to merge schema Field due to conflicting dict_is_ordered".to_string(),
));
}
match &mut self.data_type {
DataType::Struct(nested_fields) => match &from.data_type {
DataType::Struct(from_nested_fields) => {
Expand Down Expand Up @@ -270,7 +248,7 @@ impl Field {
| DataType::Interval(_)
| DataType::LargeList(_)
| DataType::List(_)
| DataType::Dictionary(_, _)
| DataType::Dictionary(_, _, _)
| DataType::FixedSizeList(_, _)
| DataType::FixedSizeBinary(_)
| DataType::Utf8
Expand Down
4 changes: 2 additions & 2 deletions src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ pub enum DataType {
///
/// This type is mostly used to represent low cardinality string
/// arrays or a limited set of primitive types as integers.
Dictionary(IntegerType, Box<DataType>),
Dictionary(IntegerType, Box<DataType>, bool),
/// Decimal value with precision and scale
/// precision is the number of digits in the number and
/// scale is the number of decimal places.
Expand Down Expand Up @@ -261,7 +261,7 @@ impl DataType {
Struct(_) => PhysicalType::Struct,
Union(_, _, _) => PhysicalType::Union,
Map(_, _) => PhysicalType::Map,
Dictionary(key, _) => PhysicalType::Dictionary(*key),
Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
Extension(_, key, _) => key.to_physical_type(),
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/ffi/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ fn create_dictionary(
field: &Field,
parent: Arc<ArrowArray>,
) -> Result<Option<ArrowArrayChild<'static>>> {
if let DataType::Dictionary(_, values) = field.data_type() {
if let DataType::Dictionary(_, values, _) = field.data_type() {
let field = Field::new("", values.as_ref().clone(), true);
assert!(!array.dictionary.is_null());
let array = unsafe { &*array.dictionary };
Expand Down
9 changes: 5 additions & 4 deletions src/ffi/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ impl Ffi_ArrowSchema {
.collect::<Box<_>>();
let n_children = children_ptr.len() as i64;

let dictionary = if let DataType::Dictionary(_, values) = field.data_type() {
flags += field.dict_is_ordered().unwrap_or_default() as i64;
let dictionary = if let DataType::Dictionary(_, values, is_ordered) = field.data_type() {
flags += *is_ordered as i64;
// we do not store field info in the dict values, so can't recover it all :(
let field = Field::new("", values.as_ref().clone(), true);
Some(Box::new(Ffi_ArrowSchema::new(&field)))
Expand Down Expand Up @@ -214,7 +214,8 @@ pub(crate) unsafe fn to_field(schema: &Ffi_ArrowSchema) -> Result<Field> {
let data_type = if let Some(dictionary) = dictionary {
let indices = to_integer_type(schema.format())?;
let values = to_field(dictionary)?;
DataType::Dictionary(indices, Box::new(values.data_type().clone()))
let is_ordered = schema.flags & 1 == 1;
DataType::Dictionary(indices, Box::new(values.data_type().clone()), is_ordered)
} else {
to_data_type(schema)?
};
Expand Down Expand Up @@ -449,7 +450,7 @@ fn to_format(data_type: &DataType) -> String {
r
}
DataType::Map(_, _) => "+m".to_string(),
DataType::Dictionary(index, _) => to_format(&(*index).into()),
DataType::Dictionary(index, _, _) => to_format(&(*index).into()),
DataType::Extension(_, inner, _) => to_format(inner.as_ref()),
}
}
Expand Down
1 change: 1 addition & 0 deletions src/io/avro/read/nested.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ impl FixedItemsUtf8Dictionary {
data_type: DataType::Dictionary(
IntegerType::Int32,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<i32>::with_capacity(capacity),
values,
Expand Down
Loading

0 comments on commit f33a41f

Please sign in to comment.