From a6812464e80b0349f91135286f1cb541ae902d54 Mon Sep 17 00:00:00 2001 From: zhyass <34016424+zhyass@users.noreply.github.com> Date: Wed, 1 Sep 2021 22:33:35 +0800 Subject: [PATCH 1/4] Remove DataType::Utf8 --- Cargo.lock | 35 +++-- common/arrow/Cargo.toml | 4 +- .../src/kernels/data_block_group_by_hash.rs | 4 +- common/datavalues/src/arrays/arithmetic.rs | 27 ++-- .../datavalues/src/arrays/binary/builder.rs | 44 ------- common/datavalues/src/arrays/comparison.rs | 60 +++++---- common/datavalues/src/arrays/list/builder.rs | 26 ++-- common/datavalues/src/arrays/mod.rs | 6 +- common/datavalues/src/arrays/ops/agg.rs | 20 ++- common/datavalues/src/arrays/ops/apply.rs | 12 +- common/datavalues/src/arrays/ops/cast.rs | 6 +- common/datavalues/src/arrays/ops/contain.rs | 2 +- common/datavalues/src/arrays/ops/fill.rs | 35 ++--- .../datavalues/src/arrays/ops/group_hash.rs | 5 +- common/datavalues/src/arrays/ops/if.rs | 9 +- common/datavalues/src/arrays/ops/scatter.rs | 51 +------- .../datavalues/src/arrays/ops/scatter_test.rs | 16 +-- common/datavalues/src/arrays/ops/take.rs | 67 +++++----- .../datavalues/src/arrays/ops/take_random.rs | 18 +-- .../datavalues/src/arrays/ops/take_single.rs | 10 +- common/datavalues/src/arrays/ops/to_values.rs | 30 +---- common/datavalues/src/arrays/ops/vec_hash.rs | 30 +---- .../datavalues/src/arrays/string/builder.rs | 122 ++++++++++++++++++ .../src/arrays/{binary => string}/iterator.rs | 28 ++-- .../src/arrays/{binary => string}/mod.rs | 69 +++++++++- common/datavalues/src/arrays/trusted_len.rs | 4 +- .../datavalues/src/arrays/upstream_traits.rs | 32 ++--- common/datavalues/src/data_group_value.rs | 8 +- common/datavalues/src/data_value.rs | 83 +++++------- common/datavalues/src/data_value_ops.rs | 4 +- common/datavalues/src/macros.rs | 20 +-- common/datavalues/src/prelude.rs | 1 - common/datavalues/src/series/arithmetic.rs | 3 +- common/datavalues/src/series/comparison.rs | 4 +- common/datavalues/src/series/de.rs | 2 +- 
common/datavalues/src/series/series_impl.rs | 29 ++--- common/datavalues/src/series/wrap.rs | 24 +--- common/datavalues/src/types/data_df_type.rs | 4 +- common/datavalues/src/types/data_type.rs | 9 +- .../src/types/data_type_coercion.rs | 14 -- .../src/types/physical_data_type.rs | 9 +- .../src/types/serializations/mod.rs | 9 +- .../serializations/{binary.rs => string.rs} | 6 +- .../src/types/serializations/utf8.rs | 37 ------ .../src/aggregates/aggregate_arg_min_max.rs | 30 ++--- .../src/aggregates/aggregate_min_max.rs | 32 ++--- .../src/scalars/expressions/expression.rs | 6 +- .../functions/src/scalars/hashes/siphash.rs | 3 +- .../src/scalars/strings/substring.rs | 2 +- common/functions/src/scalars/udfs/database.rs | 2 +- .../src/scalars/udfs/to_type_name.rs | 4 +- common/functions/src/scalars/udfs/version.rs | 2 +- common/io/src/binary_de.rs | 4 +- common/io/src/binary_ser.rs | 6 +- common/io/src/binary_write.rs | 16 +++ common/planners/src/plan_builder.rs | 2 +- common/planners/src/plan_explain.rs | 2 +- common/planners/src/plan_expression.rs | 7 +- .../planners/src/plan_expression_literal.rs | 8 +- .../src/datasources/system/clusters_table.rs | 8 +- .../datasources/system/contributors_table.rs | 6 +- .../src/datasources/system/databases_table.rs | 6 +- .../src/datasources/system/functions_table.rs | 6 +- .../src/datasources/system/processes_table.rs | 31 +++-- .../src/datasources/system/settings_table.rs | 16 +-- query/src/datasources/system/tables_table.rs | 16 +-- query/src/datasources/system/tracing_table.rs | 8 +- .../system/tracing_table_stream.rs | 8 +- query/src/functions/context_function.rs | 8 +- .../interpreter_describe_table.rs | 6 +- query/src/interpreters/interpreter_explain.rs | 15 ++- query/src/interpreters/interpreter_setting.rs | 2 +- .../interpreter_show_create_table.rs | 8 +- .../optimizers/optimizer_statistics_exact.rs | 2 +- .../optimizer_statistics_exact_test.rs | 4 +- .../transforms/group_by/aggregator.rs | 6 +- 
.../group_by/aggregator_keys_builder.rs | 4 +- .../group_by/aggregator_polymorphic_keys.rs | 9 +- .../transforms/transform_aggregator_final.rs | 4 +- .../transform_aggregator_partial.rs | 4 +- .../transforms/transform_group_by_final.rs | 12 +- .../clickhouse/writers/query_writer.rs | 34 ++--- .../mysql/writers/query_result_writer.rs | 3 +- query/src/sessions/settings.rs | 40 +++--- query/src/sql/plan_parser.rs | 16 +-- query/src/sql/sql_common.rs | 10 +- query/src/tests/parquet.rs | 2 +- .../02_0001_function_to_type_name.result | 2 +- .../0_stateless/08_0000_optimizer.result | 4 +- .../0_stateless/10_0000_describe_table.result | 4 +- 90 files changed, 715 insertions(+), 753 deletions(-) delete mode 100644 common/datavalues/src/arrays/binary/builder.rs create mode 100644 common/datavalues/src/arrays/string/builder.rs rename common/datavalues/src/arrays/{binary => string}/iterator.rs (80%) rename common/datavalues/src/arrays/{binary => string}/mod.rs (64%) rename common/datavalues/src/types/serializations/{binary.rs => string.rs} (88%) delete mode 100644 common/datavalues/src/types/serializations/utf8.rs diff --git a/Cargo.lock b/Cargo.lock index 52c8d75f899cb..01599768032dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,7 +111,7 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "arrow-flight" version = "0.1.0" -source = "git+https://github.com/datafuse-extras/arrow2?rev=7765067#77650672233bd7bbb9839a2a616f11ebffa15807" +source = "git+https://github.com/zhyass/arrow2?rev=05f2b5c#05f2b5c129c7f83cade7f371f50c87658613135e" dependencies = [ "arrow2", "bytes", @@ -125,7 +125,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.4.0" -source = "git+https://github.com/datafuse-extras/arrow2?rev=7765067#77650672233bd7bbb9839a2a616f11ebffa15807" +source = "git+https://github.com/zhyass/arrow2?rev=05f2b5c#05f2b5c129c7f83cade7f371f50c87658613135e" dependencies = [ "ahash 0.7.4", "base64", @@ -142,9 +142,9 @@ 
dependencies = [ "lexical-core", "lz4", "multiversion", - "num", + "num-traits", "packed_simd_2", - "parquet2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "parquet2 0.4.0", "rand 0.8.4", "regex", "serde", @@ -736,7 +736,7 @@ version = "0.1.0" dependencies = [ "arrow-flight", "arrow2", - "parquet2 0.3.0 (git+https://github.com/datafuse-extras/parquet2?rev=d28330f)", + "parquet2 0.3.0", ] [[package]] @@ -3633,11 +3633,23 @@ dependencies = [ "ordered-float 1.1.1", ] +[[package]] +name = "parquet-format-async-temp" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03abc2f9c83fe9ceec83f47c76cc071bfd56caba33794340330f35623ab1f544" +dependencies = [ + "async-trait", + "byteorder", + "futures", + "integer-encoding 3.0.2", + "ordered-float 1.1.1", +] + [[package]] name = "parquet2" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6b9cfd5adabbb93efd417dcee71a2ebbb0f4978b1d8e65ab7fce84b72b966bc" +source = "git+https://github.com/datafuse-extras/parquet2?rev=d28330f#d28330f92b8f0f69931dc78c7b3ce4f0a0f892bf" dependencies = [ "async-stream", "bitpacking", @@ -3645,7 +3657,7 @@ dependencies = [ "flate2", "futures", "lz4", - "parquet-format-async-temp", + "parquet-format-async-temp 0.1.1", "snap", "streaming-iterator", "zstd", @@ -3653,8 +3665,9 @@ dependencies = [ [[package]] name = "parquet2" -version = "0.3.0" -source = "git+https://github.com/datafuse-extras/parquet2?rev=d28330f#d28330f92b8f0f69931dc78c7b3ce4f0a0f892bf" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73758f76c8c842dfd846dfbc6ee05779aff626908531fffc3305a30cbccc55e7" dependencies = [ "async-stream", "bitpacking", @@ -3662,7 +3675,7 @@ dependencies = [ "flate2", "futures", "lz4", - "parquet-format-async-temp", + "parquet-format-async-temp 0.2.0", "snap", "streaming-iterator", "zstd", diff --git a/common/arrow/Cargo.toml b/common/arrow/Cargo.toml 
index 0dfece825f195..3801c5efadc1e 100644 --- a/common/arrow/Cargo.toml +++ b/common/arrow/Cargo.toml @@ -15,8 +15,8 @@ simd = ["arrow/simd"] # Workspace dependencies # Github dependencies -arrow = { package = "arrow2", git="https://github.com/datafuse-extras/arrow2", rev = "7765067" } -arrow-flight = { git="https://github.com/datafuse-extras/arrow2", rev = "7765067" } +arrow = { package = "arrow2", git="https://github.com/zhyass/arrow2", rev = "05f2b5c" } +arrow-flight = { git="https://github.com/zhyass/arrow2", rev = "05f2b5c" } parquet = {package = "parquet2", git = "https://github.com/datafuse-extras/parquet2", rev = "d28330f"} # Crates.io dependencies diff --git a/common/datablocks/src/kernels/data_block_group_by_hash.rs b/common/datablocks/src/kernels/data_block_group_by_hash.rs index 271b5590c097e..bac74a9dd1194 100644 --- a/common/datablocks/src/kernels/data_block_group_by_hash.rs +++ b/common/datablocks/src/kernels/data_block_group_by_hash.rs @@ -141,7 +141,7 @@ impl HashMethodKind { } pub fn data_type(&self) -> DataType { match self { - HashMethodKind::Serializer(_) => DataType::Binary, + HashMethodKind::Serializer(_) => DataType::String, HashMethodKind::KeysU8(_) => DataType::UInt8, HashMethodKind::KeysU16(_) => DataType::UInt16, HashMethodKind::KeysU32(_) => DataType::UInt32, @@ -155,7 +155,7 @@ pub struct HashMethodSerializer {} impl HashMethodSerializer { #[inline] - pub fn get_key(&self, array: &DFBinaryArray, row: usize) -> Vec { + pub fn get_key(&self, array: &DFStringArray, row: usize) -> Vec { let v = array.inner().value(row); v.to_owned() } diff --git a/common/datavalues/src/arrays/arithmetic.rs b/common/datavalues/src/arrays/arithmetic.rs index aea7a8ebb2af3..8b0648a6ad187 100644 --- a/common/datavalues/src/arrays/arithmetic.rs +++ b/common/datavalues/src/arrays/arithmetic.rs @@ -312,16 +312,15 @@ where } } -fn concat_strings(l: &str, r: &str) -> String { - // fastest way to concat strings according to 
https://github.com/hoodie/concatenation_benchmarks-rs - let mut s = String::with_capacity(l.len() + r.len()); - s.push_str(l); - s.push_str(r); +fn concat_strings(l: &[u8], r: &[u8]) -> Vec { + let mut s = Vec::with_capacity(l.len() + r.len()); + s.extend_from_slice(l); + s.extend_from_slice(r); s } -impl Add for &DFUtf8Array { - type Output = Result; +impl Add for &DFStringArray { + type Output = Result; fn add(self, rhs: Self) -> Self::Output { // broadcasting path @@ -329,7 +328,7 @@ impl Add for &DFUtf8Array { let rhs = rhs.get(0); return match rhs { Some(rhs) => self.add(rhs), - None => Ok(DFUtf8Array::full_null(self.len())), + None => Ok(DFStringArray::full_null(self.len())), }; } @@ -346,18 +345,18 @@ impl Add for &DFUtf8Array { } } -impl Add for DFUtf8Array { - type Output = Result; +impl Add for DFStringArray { + type Output = Result; fn add(self, rhs: Self) -> Self::Output { (&self).add(&rhs) } } -impl Add<&str> for &DFUtf8Array { - type Output = Result; +impl Add<&[u8]> for &DFStringArray { + type Output = Result; - fn add(self, rhs: &str) -> Self::Output { + fn add(self, rhs: &[u8]) -> Self::Output { Ok(match self.null_count() { 0 => self .into_no_null_iter() @@ -401,5 +400,5 @@ where } impl Pow for DFBooleanArray {} -impl Pow for DFUtf8Array {} +impl Pow for DFStringArray {} impl Pow for DFListArray {} diff --git a/common/datavalues/src/arrays/binary/builder.rs b/common/datavalues/src/arrays/binary/builder.rs deleted file mode 100644 index b4dd9f23919fa..0000000000000 --- a/common/datavalues/src/arrays/binary/builder.rs +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2020 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_arrow::arrow::array::MutableArray; -use common_arrow::arrow::array::MutableBinaryArray; - -use crate::prelude::*; - -pub struct BinaryArrayBuilder { - builder: MutableBinaryArray, -} - -impl BinaryArrayBuilder { - pub fn with_capacity(capacity: usize) -> Self { - Self { - builder: MutableBinaryArray::::with_capacity(capacity), - } - } - - pub fn append_value(&mut self, value: impl AsRef<[u8]>) { - self.builder.push(Some(value)) - } - - #[inline] - pub fn append_null(&mut self) { - self.builder.push_null(); - } - - pub fn finish(&mut self) -> DFBinaryArray { - let array = self.builder.as_arc(); - DFBinaryArray::from_arrow_array(array.as_ref()) - } -} diff --git a/common/datavalues/src/arrays/comparison.rs b/common/datavalues/src/arrays/comparison.rs index bb74409769967..a926f34e773b8 100644 --- a/common/datavalues/src/arrays/comparison.rs +++ b/common/datavalues/src/arrays/comparison.rs @@ -14,10 +14,10 @@ use std::fmt::Debug; +use common_arrow::arrow::compute::comparison::binary_compare_scalar; use common_arrow::arrow::compute::comparison::boolean_compare_scalar; use common_arrow::arrow::compute::comparison::compare; use common_arrow::arrow::compute::comparison::primitive_compare_scalar; -use common_arrow::arrow::compute::comparison::utf8_compare_scalar; use common_arrow::arrow::compute::comparison::Operator; use common_arrow::arrow::compute::comparison::Simd8; use common_arrow::arrow::compute::like; @@ -209,40 +209,40 @@ impl ArrayCompare<&DFBooleanArray> for DFBooleanArray { } } -impl DFUtf8Array { - fn comparison(&self, rhs: 
&DFUtf8Array, op: Operator) -> Result { +impl DFStringArray { + fn comparison(&self, rhs: &DFStringArray, op: Operator) -> Result { let array = compare(&self.array, &rhs.array, op)?; Ok(array.into()) } - fn comparison_scalar(&self, rhs: &str, op: Operator) -> Result { - let array = utf8_compare_scalar(&self.array, rhs, op); + fn comparison_scalar(&self, rhs: &[u8], op: Operator) -> Result { + let array = binary_compare_scalar(&self.array, rhs, op); Ok(array.into()) } - // pub fn like_utf8(lhs: &Utf8Array, rhs: &Utf8Array) - fn like(&self, rhs: &DFUtf8Array) -> Result { - let array = like::like_utf8(&self.array, &rhs.array)?; + // pub fn like_binary(lhs: &BinaryArray, rhs: &BinaryArray) + fn like(&self, rhs: &DFStringArray) -> Result { + let array = like::like_binary(&self.array, &rhs.array)?; Ok(array.into()) } - fn like_scalar(&self, rhs: &str) -> Result { - let array = like::like_utf8_scalar(&self.array, rhs)?; + fn like_scalar(&self, rhs: &[u8]) -> Result { + let array = like::like_binary_scalar(&self.array, rhs)?; Ok(array.into()) } - fn nlike(&self, rhs: &DFUtf8Array) -> Result { - let array = like::nlike_utf8(&self.array, &rhs.array)?; + fn nlike(&self, rhs: &DFStringArray) -> Result { + let array = like::nlike_binary(&self.array, &rhs.array)?; Ok(array.into()) } - fn nlike_scalar(&self, rhs: &str) -> Result { - let array = like::nlike_utf8_scalar(&self.array, rhs)?; + fn nlike_scalar(&self, rhs: &[u8]) -> Result { + let array = like::nlike_binary_scalar(&self.array, rhs)?; Ok(array.into()) } } -macro_rules! impl_like_utf8 { +macro_rules! impl_like_string { ($self:ident, $rhs:ident, $op:ident, $scalar_op:ident) => {{ // broadcast if $rhs.len() == 1 { @@ -254,7 +254,7 @@ macro_rules! 
impl_like_utf8 { } else if $self.len() == 1 { if let Some(value) = $self.get(0) { let it = (0..$rhs.len()).map(|_| value); - let left = DFUtf8Array::new_from_iter(it); + let left = DFStringArray::new_from_iter(it); left.$op($rhs) } else { Ok(DFBooleanArray::full(false, $rhs.len())) @@ -265,47 +265,45 @@ macro_rules! impl_like_utf8 { }}; } -impl ArrayCompare<&DFUtf8Array> for DFUtf8Array { - fn eq(&self, rhs: &DFUtf8Array) -> Result { +impl ArrayCompare<&DFStringArray> for DFStringArray { + fn eq(&self, rhs: &DFStringArray) -> Result { impl_cmp_common! {self, rhs, Eq, eq} } - fn neq(&self, rhs: &DFUtf8Array) -> Result { + fn neq(&self, rhs: &DFStringArray) -> Result { impl_cmp_common! {self, rhs, Neq, neq} } - fn gt(&self, rhs: &DFUtf8Array) -> Result { + fn gt(&self, rhs: &DFStringArray) -> Result { impl_cmp_common! {self, rhs, Gt, lt_eq} } - fn gt_eq(&self, rhs: &DFUtf8Array) -> Result { + fn gt_eq(&self, rhs: &DFStringArray) -> Result { impl_cmp_common! {self, rhs, GtEq, lt} } - fn lt(&self, rhs: &DFUtf8Array) -> Result { + fn lt(&self, rhs: &DFStringArray) -> Result { impl_cmp_common! {self, rhs, Lt, gt_eq} } - fn lt_eq(&self, rhs: &DFUtf8Array) -> Result { + fn lt_eq(&self, rhs: &DFStringArray) -> Result { impl_cmp_common! {self, rhs, LtEq, gt} } - fn like(&self, rhs: &DFUtf8Array) -> Result { - impl_like_utf8! {self, rhs, like, like_scalar} + fn like(&self, rhs: &DFStringArray) -> Result { + impl_like_string! {self, rhs, like, like_scalar} } - fn nlike(&self, rhs: &DFUtf8Array) -> Result { - impl_like_utf8! {self, rhs, nlike, nlike_scalar} + fn nlike(&self, rhs: &DFStringArray) -> Result { + impl_like_string! {self, rhs, nlike, nlike_scalar} } } impl ArrayCompare<&DFNullArray> for DFNullArray {} -impl ArrayCompare<&DFBinaryArray> for DFBinaryArray {} - impl ArrayCompare<&DFStructArray> for DFStructArray {} -macro_rules! impl_cmp_numeric_utf8_list { +macro_rules! 
impl_cmp_numeric_string_list { ($self:ident, $rhs:ident, $cmp_method:ident) => {{ match ($self.null_count(), $rhs.null_count()) { (0, 0) => $self @@ -339,7 +337,7 @@ macro_rules! impl_cmp_numeric_utf8_list { impl ArrayCompare<&DFListArray> for DFListArray { fn eq(&self, rhs: &DFListArray) -> Result { - Ok(impl_cmp_numeric_utf8_list!(self, rhs, series_equal)) + Ok(impl_cmp_numeric_string_list!(self, rhs, series_equal)) } fn neq(&self, rhs: &DFListArray) -> Result { diff --git a/common/datavalues/src/arrays/list/builder.rs b/common/datavalues/src/arrays/list/builder.rs index 67e48d63afb2d..40be5bac75e38 100644 --- a/common/datavalues/src/arrays/list/builder.rs +++ b/common/datavalues/src/arrays/list/builder.rs @@ -25,7 +25,7 @@ pub trait ListBuilderTrait { } type LargeListPrimitiveBuilder = MutableListArray>; -type LargeListUtf8Builder = MutableListArray>; +type LargeListBinaryBuilder = MutableListArray>; type LargeListBooleanBuilder = MutableListArray; pub struct ListPrimitiveArrayBuilder @@ -102,21 +102,21 @@ where } } -pub struct ListUtf8ArrayBuilder { - builder: LargeListUtf8Builder, +pub struct ListStringArrayBuilder { + builder: LargeListBinaryBuilder, } -type LargeMutableUtf8Array = MutableUtf8Array; -impl ListUtf8ArrayBuilder { +type LargeMutableBinaryArray = MutableBinaryArray; +impl ListStringArrayBuilder { pub fn with_capacity(values_capacity: usize, capacity: usize) -> Self { - let values = LargeMutableUtf8Array::with_capacity(values_capacity); - let builder = LargeListUtf8Builder::new_with_capacity(values, capacity); + let values = LargeMutableBinaryArray::with_capacity(values_capacity); + let builder = LargeListBinaryBuilder::new_with_capacity(values, capacity); - ListUtf8ArrayBuilder { builder } + ListStringArrayBuilder { builder } } } -impl ListBuilderTrait for ListUtf8ArrayBuilder { +impl ListBuilderTrait for ListStringArrayBuilder { fn append_opt_series(&mut self, opt_s: Option<&Series>) { match opt_s { Some(s) => self.append_series(s), @@ -133,7 
+133,7 @@ impl ListBuilderTrait for ListUtf8ArrayBuilder { #[inline] fn append_series(&mut self, s: &Series) { - let ca = s.utf8().unwrap(); + let ca = s.string().unwrap(); let value_builder = self.builder.mut_values(); value_builder.try_extend(ca).unwrap(); self.builder.try_push_valid().unwrap(); @@ -202,16 +202,16 @@ pub fn get_list_builder( Box::new(builder) }}; } - macro_rules! get_utf8_builder { + macro_rules! get_string_builder { () => {{ - let builder = ListUtf8ArrayBuilder::with_capacity(value_capacity, list_capacity); + let builder = ListStringArrayBuilder::with_capacity(value_capacity, list_capacity); Box::new(builder) }}; } match_data_type_apply_macro!( dt, get_primitive_builder, - get_utf8_builder, + get_string_builder, get_bool_builder ) } diff --git a/common/datavalues/src/arrays/mod.rs b/common/datavalues/src/arrays/mod.rs index 4d7f7473ddf3d..2c608ab175fe1 100644 --- a/common/datavalues/src/arrays/mod.rs +++ b/common/datavalues/src/arrays/mod.rs @@ -23,16 +23,14 @@ mod ops; mod trusted_len; mod upstream_traits; -mod binary; mod boolean; mod list; mod null; mod primitive; +mod string; mod r#struct; -mod utf8; pub use arithmetic::*; -pub use binary::*; pub use boolean::*; pub use builder::*; pub use comparison::*; @@ -41,6 +39,6 @@ pub use null::*; pub use ops::*; pub use primitive::*; pub use r#struct::*; +pub use string::*; pub use trusted_len::*; pub use upstream_traits::*; -pub use utf8::*; diff --git a/common/datavalues/src/arrays/ops/agg.rs b/common/datavalues/src/arrays/ops/agg.rs index bb8076f443698..e6874460c68a1 100644 --- a/common/datavalues/src/arrays/ops/agg.rs +++ b/common/datavalues/src/arrays/ops/agg.rs @@ -294,13 +294,13 @@ impl ArrayAgg for DFBooleanArray { } } -impl ArrayAgg for DFUtf8Array { +impl ArrayAgg for DFStringArray { fn min(&self) -> Result { if self.all_is_null() { - return Ok(DataValue::Utf8(None)); + return Ok(DataValue::String(None)); } - Ok(match aggregate::min_string(self.inner()) { + Ok(match 
aggregate::min_binary(self.inner()) { Some(x) => x.into(), None => DataValue::from(self.data_type()), }) @@ -308,10 +308,10 @@ impl ArrayAgg for DFUtf8Array { fn max(&self) -> Result { if self.all_is_null() { - return Ok(DataValue::Utf8(None)); + return Ok(DataValue::String(None)); } - Ok(match aggregate::max_string(self.inner()) { + Ok(match aggregate::max_binary(self.inner()) { Some(x) => x.into(), None => DataValue::from(self.data_type()), }) @@ -321,7 +321,7 @@ impl ArrayAgg for DFUtf8Array { if self.all_is_null() { return Ok(DataValue::Struct(vec![ (0_u64).into(), - DataValue::Utf8(None), + DataValue::String(None), ])); } let value = self @@ -339,7 +339,7 @@ impl ArrayAgg for DFUtf8Array { Ok(match value { Some((index, value)) => DataValue::Struct(vec![(index as u64).into(), value.into()]), - None => DataValue::Struct(vec![(0_u64).into(), DataValue::Utf8(None)]), + None => DataValue::Struct(vec![(0_u64).into(), DataValue::String(None)]), }) } @@ -347,7 +347,7 @@ impl ArrayAgg for DFUtf8Array { if self.all_is_null() { return Ok(DataValue::Struct(vec![ (0_u64).into(), - DataValue::Utf8(None), + DataValue::String(None), ])); } let value = self @@ -357,15 +357,13 @@ impl ArrayAgg for DFUtf8Array { Ok(match value { Some((index, value)) => DataValue::Struct(vec![(index as u64).into(), value.into()]), - None => DataValue::Struct(vec![(0_u64).into(), DataValue::Utf8(None)]), + None => DataValue::Struct(vec![(0_u64).into(), DataValue::String(None)]), }) } } impl ArrayAgg for DFListArray {} -impl ArrayAgg for DFBinaryArray {} - impl ArrayAgg for DFNullArray {} impl ArrayAgg for DFStructArray {} diff --git a/common/datavalues/src/arrays/ops/apply.rs b/common/datavalues/src/arrays/ops/apply.rs index 29f2d57bbbdd5..cb46185f3b18e 100644 --- a/common/datavalues/src/arrays/ops/apply.rs +++ b/common/datavalues/src/arrays/ops/apply.rs @@ -167,10 +167,10 @@ impl<'a> ArrayApply<'a, bool, bool> for DFBooleanArray { } } -impl<'a> ArrayApply<'a, &'a str, Cow<'a, str>> for 
DFUtf8Array { +impl<'a> ArrayApply<'a, &'a [u8], Cow<'a, [u8]>> for DFStringArray { fn apply_cast_numeric(&'a self, f: F) -> DFPrimitiveArray where - F: Fn(&'a str) -> S + Copy, + F: Fn(&'a [u8]) -> S + Copy, S: DFPrimitiveType, { let arr = self.inner(); @@ -183,7 +183,7 @@ impl<'a> ArrayApply<'a, &'a str, Cow<'a, str>> for DFUtf8Array { fn branch_apply_cast_numeric_no_null(&'a self, f: F) -> DFPrimitiveArray where - F: Fn(Option<&'a str>) -> S + Copy, + F: Fn(Option<&'a [u8]>) -> S + Copy, S: DFPrimitiveType, { let av: AlignedVec<_> = AlignedVec::<_>::from_trusted_len_iter(self.inner().iter().map(f)); @@ -192,16 +192,16 @@ impl<'a> ArrayApply<'a, &'a str, Cow<'a, str>> for DFUtf8Array { } fn apply(&'a self, f: F) -> Self - where F: Fn(&'a str) -> Cow<'a, str> + Copy { + where F: Fn(&'a [u8]) -> Cow<'a, [u8]> + Copy { apply!(self, f) } fn apply_with_idx(&'a self, f: F) -> Self - where F: Fn((usize, &'a str)) -> Cow<'a, str> + Copy { + where F: Fn((usize, &'a [u8])) -> Cow<'a, [u8]> + Copy { apply_enumerate!(self, f) } fn apply_with_idx_on_opt(&'a self, f: F) -> Self - where F: Fn((usize, Option<&'a str>)) -> Option> + Copy { + where F: Fn((usize, Option<&'a [u8]>)) -> Option> + Copy { self.into_iter().enumerate().map(f).collect() } } diff --git a/common/datavalues/src/arrays/ops/cast.rs b/common/datavalues/src/arrays/ops/cast.rs index 9cd4d49a96b94..03516d095026a 100644 --- a/common/datavalues/src/arrays/ops/cast.rs +++ b/common/datavalues/src/arrays/ops/cast.rs @@ -50,7 +50,7 @@ where T: DFPrimitiveType } } -impl ArrayCast for DFUtf8Array { +impl ArrayCast for DFStringArray { fn cast_with_type(&self, data_type: &DataType) -> Result { cast_ca(&self.array, data_type) } @@ -67,7 +67,6 @@ impl ArrayCast for DFNullArray { match data_type { DataType::Null => Ok(self.clone().into_series()), DataType::Boolean => Ok(DFBooleanArray::full_null(self.len()).into_series()), - DataType::Utf8 => Ok(DFUtf8Array::full_null(self.len()).into_series()), DataType::UInt8 => 
Ok(DFUInt8Array::full_null(self.len()).into_series()), DataType::UInt16 => Ok(DFUInt16Array::full_null(self.len()).into_series()), DataType::UInt32 => Ok(DFUInt32Array::full_null(self.len()).into_series()), @@ -78,7 +77,7 @@ impl ArrayCast for DFNullArray { DataType::Int64 => Ok(DFInt64Array::full_null(self.len()).into_series()), DataType::Float32 => Ok(DFFloat32Array::full_null(self.len()).into_series()), DataType::Float64 => Ok(DFFloat64Array::full_null(self.len()).into_series()), - DataType::Binary => Ok(DFBinaryArray::full_null(self.len()).into_series()), + DataType::String => Ok(DFStringArray::full_null(self.len()).into_series()), DataType::List(_) => Ok(DFListArray::full_null(self.len()).into_series()), _ => Err(ErrorCode::BadDataValueType(format!( @@ -90,5 +89,4 @@ impl ArrayCast for DFNullArray { } impl ArrayCast for DFListArray {} -impl ArrayCast for DFBinaryArray {} impl ArrayCast for DFStructArray {} diff --git a/common/datavalues/src/arrays/ops/contain.rs b/common/datavalues/src/arrays/ops/contain.rs index f694cab4f9bbb..118e8a4a7fef2 100644 --- a/common/datavalues/src/arrays/ops/contain.rs +++ b/common/datavalues/src/arrays/ops/contain.rs @@ -51,7 +51,7 @@ where T: DFPrimitiveType } } -impl ArrayContain for DFUtf8Array { +impl ArrayContain for DFStringArray { fn contain(&self, list: &DFListArray) -> Result where Self: std::marker::Sized { contain_internal!(self, list) diff --git a/common/datavalues/src/arrays/ops/fill.rs b/common/datavalues/src/arrays/ops/fill.rs index 18ae0824b9172..e19aa9eaf3def 100644 --- a/common/datavalues/src/arrays/ops/fill.rs +++ b/common/datavalues/src/arrays/ops/fill.rs @@ -109,40 +109,21 @@ impl ArrayFullNull for DFBooleanArray { } } -impl<'a> ArrayFull<&'a str> for DFUtf8Array { - fn full(value: &'a str, length: usize) -> Self { - let mut builder = Utf8ArrayBuilder::with_capacity(length * value.len()); - - for _ in 0..length { - builder.append_value(value); - } - builder.finish() - } -} - -impl ArrayFullNull for DFUtf8Array 
{ - fn full_null(length: usize) -> Self { - (0..length) - .map::, _>(|_| None) - .collect::() - } -} - impl ArrayFull<&Series> for DFListArray { - fn full(_value: &Series, _length: usize) -> DFListArray { + fn full(_value: &Series, _length: usize) -> Self { todo!() } } impl ArrayFullNull for DFListArray { - fn full_null(_length: usize) -> DFListArray { + fn full_null(_length: usize) -> Self { todo!() } } -impl ArrayFull<&[u8]> for DFBinaryArray { - fn full(value: &[u8], length: usize) -> DFBinaryArray { - let mut builder = BinaryArrayBuilder::with_capacity(length); +impl ArrayFull<&[u8]> for DFStringArray { + fn full(value: &[u8], length: usize) -> Self { + let mut builder = StringArrayBuilder::with_capacity(length); for _ in 0..length { builder.append_value(value); } @@ -150,9 +131,9 @@ impl ArrayFull<&[u8]> for DFBinaryArray { } } -impl ArrayFullNull for DFBinaryArray { - fn full_null(length: usize) -> DFBinaryArray { - let mut builder = BinaryArrayBuilder::with_capacity(length); +impl ArrayFullNull for DFStringArray { + fn full_null(length: usize) -> Self { + let mut builder = StringArrayBuilder::with_capacity(length); for _ in 0..length { builder.append_null(); } diff --git a/common/datavalues/src/arrays/ops/group_hash.rs b/common/datavalues/src/arrays/ops/group_hash.rs index 544d93520f81d..e2c83527e3d74 100644 --- a/common/datavalues/src/arrays/ops/group_hash.rs +++ b/common/datavalues/src/arrays/ops/group_hash.rs @@ -98,17 +98,16 @@ impl GroupHash for DFBooleanArray { } } -impl GroupHash for DFUtf8Array { +impl GroupHash for DFStringArray { fn serialize(&self, vec: &mut Vec>) -> Result<()> { assert_eq!(vec.len(), self.len()); for (value, vec) in self.into_no_null_iter().zip(vec.iter_mut()) { - BinaryWrite::write_string(vec, value)?; + BinaryWrite::write_binary(vec, &value)?; } Ok(()) } } impl GroupHash for DFListArray {} -impl GroupHash for DFBinaryArray {} impl GroupHash for DFNullArray {} impl GroupHash for DFStructArray {} diff --git 
a/common/datavalues/src/arrays/ops/if.rs b/common/datavalues/src/arrays/ops/if.rs index 4264c42e57f42..04c234b5d641e 100644 --- a/common/datavalues/src/arrays/ops/if.rs +++ b/common/datavalues/src/arrays/ops/if.rs @@ -72,7 +72,7 @@ macro_rules! impl_if_common { }}; } -macro_rules! impl_if_bool_utf8 { +macro_rules! impl_if_bool_string { ($predicate:ident, $lhs:ident, $rhs:ident) => {{ match ($predicate.len(), $lhs.len(), $rhs.len()) { (1, b, c) if b == c || b == 1 || c == 1 => { @@ -144,13 +144,13 @@ where T: DFPrimitiveType impl ArrayIf for DFBooleanArray { fn if_then_else(&self, rhs: &Self, predicate: &DFBooleanArray) -> Result { - impl_if_bool_utf8! {predicate, self, rhs} + impl_if_bool_string! {predicate, self, rhs} } } -impl ArrayIf for DFUtf8Array { +impl ArrayIf for DFStringArray { fn if_then_else(&self, rhs: &Self, predicate: &DFBooleanArray) -> Result { - impl_if_bool_utf8! {predicate, self, rhs} + impl_if_bool_string! {predicate, self, rhs} } } @@ -162,4 +162,3 @@ impl ArrayIf for DFNullArray { impl ArrayIf for DFListArray {} impl ArrayIf for DFStructArray {} -impl ArrayIf for DFBinaryArray {} diff --git a/common/datavalues/src/arrays/ops/scatter.rs b/common/datavalues/src/arrays/ops/scatter.rs index 0944a6cc479e0..c22ce43713d08 100644 --- a/common/datavalues/src/arrays/ops/scatter.rs +++ b/common/datavalues/src/arrays/ops/scatter.rs @@ -18,10 +18,9 @@ use common_exception::ErrorCode; use common_exception::Result; use crate::arrays::get_list_builder; -use crate::arrays::BinaryArrayBuilder; use crate::arrays::BooleanArrayBuilder; use crate::arrays::PrimitiveArrayBuilder; -use crate::arrays::Utf8ArrayBuilder; +use crate::arrays::StringArrayBuilder; use crate::prelude::*; use crate::utils::get_iter_capacity; @@ -95,46 +94,6 @@ where T: DFPrimitiveType } } -impl ArrayScatter for DFUtf8Array { - unsafe fn scatter_unchecked( - &self, - indices: &mut dyn Iterator, - scattered_size: usize, - ) -> Result> - where - Self: std::marker::Sized, - { - let array = 
self.inner(); - let mut builders = Vec::with_capacity(scattered_size); - - for _i in 0..scattered_size { - builders.push(Utf8ArrayBuilder::with_capacity(self.len())); - } - - match self.null_count() { - 0 => { - indices.zip(0..self.len()).for_each(|(index, row)| { - builders[index as usize].append_value(array.value(row)); - }); - } - _ => { - indices.zip(0..self.len()).for_each(|(index, row)| { - if self.is_null(row) { - builders[index as usize].append_null(); - } else { - builders[index as usize].append_value(array.value(row)); - } - }); - } - } - - Ok(builders - .iter_mut() - .map(|builder| builder.finish()) - .collect()) - } -} - impl ArrayScatter for DFBooleanArray { unsafe fn scatter_unchecked( &self, @@ -219,7 +178,7 @@ impl ArrayScatter for DFListArray { } } -impl ArrayScatter for DFBinaryArray { +impl ArrayScatter for DFStringArray { unsafe fn scatter_unchecked( &self, indices: &mut dyn Iterator, @@ -231,14 +190,14 @@ impl ArrayScatter for DFBinaryArray { let mut builders = Vec::with_capacity(scattered_size); let guess_scattered_len = ((self.len() as f64) * 1.1 / (scattered_size as f64)) as usize; for _i in 0..scattered_size { - let builder = BinaryArrayBuilder::with_capacity(guess_scattered_len); + let builder = StringArrayBuilder::with_capacity(guess_scattered_len); builders.push(builder); } - let binary_data = self.inner(); + let string_data = self.inner(); for (i, index) in indices.enumerate() { if !self.is_null(i as usize) { - builders[index as usize].append_value(binary_data.value(i as usize)); + builders[index as usize].append_value(string_data.value(i as usize)); } else { builders[index as usize].append_null(); } diff --git a/common/datavalues/src/arrays/ops/scatter_test.rs b/common/datavalues/src/arrays/ops/scatter_test.rs index 1c487d2ba22ef..9bbe1fa554eb5 100644 --- a/common/datavalues/src/arrays/ops/scatter_test.rs +++ b/common/datavalues/src/arrays/ops/scatter_test.rs @@ -59,15 +59,15 @@ fn test_scatter() -> Result<()> { 
array_vec[1].into_no_null_iter().collect::>() ); - // Test BinaryArray - let mut binary_builder = BinaryArrayBuilder::with_capacity(8); - binary_builder.append_value(&"12"); - binary_builder.append_value(&"ab"); - binary_builder.append_value(&"c1"); - binary_builder.append_value(&"32"); - let df_binary_array = binary_builder.finish(); + // Test StringArray + let mut string_builder = StringArrayBuilder::with_capacity(8); + string_builder.append_value(&"12"); + string_builder.append_value(&"ab"); + string_builder.append_value(&"c1"); + string_builder.append_value(&"32"); + let df_string_array = string_builder.finish(); let indices = vec![1, 0, 0, 1]; - let array_vec = unsafe { df_binary_array.scatter_unchecked(&mut indices.into_iter(), 2)? }; + let array_vec = unsafe { df_string_array.scatter_unchecked(&mut indices.into_iter(), 2)? }; let values: Vec> = (0..array_vec[0].len()) .map(|idx| array_vec[0].inner().value(idx).to_vec()) diff --git a/common/datavalues/src/arrays/ops/take.rs b/common/datavalues/src/arrays/ops/take.rs index 28519882fd318..53a74d977c461 100644 --- a/common/datavalues/src/arrays/ops/take.rs +++ b/common/datavalues/src/arrays/ops/take.rs @@ -167,90 +167,89 @@ impl ArrayTake for DFBooleanArray { } } -impl ArrayTake for DFUtf8Array { +impl ArrayTake for DFListArray { unsafe fn take_unchecked(&self, indices: TakeIdx) -> Result where Self: std::marker::Sized, I: Iterator, INulls: Iterator>, { - let str_array = self.inner(); + self.take(indices) + } + + fn take(&self, indices: TakeIdx) -> Result + where + Self: std::marker::Sized, + I: Iterator, + INulls: Iterator>, + { + let list_array = self.inner(); match indices { TakeIdx::Array(array) => { - let taked_array = take::take(str_array, array)?; + let taked_array = take::take(list_array, array)?; Ok(Self::from_arrow_array(taked_array.as_ref())) } TakeIdx::Iter(iter) => { if self.is_empty() { return Ok(Self::full_null(iter.size_hint().0)); } - let taked_array = take_utf8_iter_unchecked(str_array, iter); 
- Ok(Self::from(taked_array)) + let ca: DFListArray = take_iter_n_arrays!(self, iter); + Ok(ca) } TakeIdx::IterNulls(iter) => { if self.is_empty() { return Ok(Self::full_null(iter.size_hint().0)); } - let taked_array = take_utf8_opt_iter_unchecked(str_array, iter); - Ok(Self::from(taked_array)) + + let ca: DFListArray = take_opt_iter_n_arrays!(self, iter); + Ok(ca) } } } - - fn take(&self, indices: TakeIdx) -> Result - where - Self: std::marker::Sized, - I: Iterator, - INulls: Iterator>, - { - unsafe { self.take_unchecked(indices) } - } } -impl ArrayTake for DFListArray { +impl ArrayTake for DFStringArray { unsafe fn take_unchecked(&self, indices: TakeIdx) -> Result where Self: std::marker::Sized, I: Iterator, INulls: Iterator>, { - self.take(indices) - } - - fn take(&self, indices: TakeIdx) -> Result - where - Self: std::marker::Sized, - I: Iterator, - INulls: Iterator>, - { - let list_array = self.inner(); + let str_array = self.inner(); match indices { TakeIdx::Array(array) => { - let taked_array = take::take(list_array, array)?; + let taked_array = take::take(str_array, array)?; Ok(Self::from_arrow_array(taked_array.as_ref())) } TakeIdx::Iter(iter) => { if self.is_empty() { return Ok(Self::full_null(iter.size_hint().0)); } - let ca: DFListArray = take_iter_n_arrays!(self, iter); - Ok(ca) + let taked_array = take_string_iter_unchecked(str_array, iter); + Ok(Self::from(taked_array)) } TakeIdx::IterNulls(iter) => { if self.is_empty() { return Ok(Self::full_null(iter.size_hint().0)); } - - let ca: DFListArray = take_opt_iter_n_arrays!(self, iter); - Ok(ca) + let taked_array = take_string_opt_iter_unchecked(str_array, iter); + Ok(Self::from(taked_array)) } } } + + fn take(&self, indices: TakeIdx) -> Result + where + Self: std::marker::Sized, + I: Iterator, + INulls: Iterator>, + { + unsafe { self.take_unchecked(indices) } + } } impl ArrayTake for DFNullArray {} impl ArrayTake for DFStructArray {} -impl ArrayTake for DFBinaryArray {} pub trait AsTakeIndex { fn 
as_take_iter<'a>(&'a self) -> Box + 'a>; diff --git a/common/datavalues/src/arrays/ops/take_random.rs b/common/datavalues/src/arrays/ops/take_random.rs index ae317635d61df..8f3ec1463e5f3 100644 --- a/common/datavalues/src/arrays/ops/take_random.rs +++ b/common/datavalues/src/arrays/ops/take_random.rs @@ -40,7 +40,7 @@ pub trait TakeRandom { } // Utility trait because associated type needs a lifetime -pub trait TakeRandomUtf8 { +pub trait TakeRandomString { type Item; /// Get a nullable value by index. @@ -155,12 +155,12 @@ where T: DFPrimitiveType } } -pub struct Utf8TakeRandom<'a> { - arr: &'a LargeUtf8Array, +pub struct StringTakeRandom<'a> { + arr: &'a LargeBinaryArray, } -impl<'a> TakeRandom for Utf8TakeRandom<'a> { - type Item = &'a str; +impl<'a> TakeRandom for StringTakeRandom<'a> { + type Item = &'a [u8]; #[inline] fn get(&self, index: usize) -> Option { @@ -173,13 +173,13 @@ impl<'a> TakeRandom for Utf8TakeRandom<'a> { } } -impl<'a> IntoTakeRandom<'a> for &'a DFUtf8Array { - type Item = &'a str; - type TakeRandom = TakeRandBranch, Utf8TakeRandom<'a>>; +impl<'a> IntoTakeRandom<'a> for &'a DFStringArray { + type Item = &'a [u8]; + type TakeRandom = TakeRandBranch, StringTakeRandom<'a>>; fn take_rand(&self) -> Self::TakeRandom { let arr = self.inner(); - let t = Utf8TakeRandom { arr }; + let t = StringTakeRandom { arr }; TakeRandBranch::Single(t) } } diff --git a/common/datavalues/src/arrays/ops/take_single.rs b/common/datavalues/src/arrays/ops/take_single.rs index e48f2871b2971..78ada6e2619b2 100644 --- a/common/datavalues/src/arrays/ops/take_single.rs +++ b/common/datavalues/src/arrays/ops/take_single.rs @@ -17,7 +17,7 @@ use common_arrow::arrow::array::Array; use common_arrow::arrow::array::ArrayRef; use super::take_random::TakeRandom; -use super::take_random::TakeRandomUtf8; +use super::take_random::TakeRandomString; use crate::prelude::*; macro_rules! 
impl_take_random_get { @@ -88,8 +88,8 @@ impl TakeRandom for DFBooleanArray { } } -impl<'a> TakeRandom for &'a DFUtf8Array { - type Item = &'a str; +impl<'a> TakeRandom for &'a DFStringArray { + type Item = &'a [u8]; #[inline] fn get(&self, index: usize) -> Option { @@ -106,8 +106,8 @@ impl<'a> TakeRandom for &'a DFUtf8Array { // extra trait such that it also works without extra reference. // Autoref will insert the reference and -impl<'a> TakeRandomUtf8 for &'a DFUtf8Array { - type Item = &'a str; +impl<'a> TakeRandomString for &'a DFStringArray { + type Item = &'a [u8]; #[inline] fn get(self, index: usize) -> Option { diff --git a/common/datavalues/src/arrays/ops/to_values.rs b/common/datavalues/src/arrays/ops/to_values.rs index fa68fd6480859..3dad9c36741aa 100644 --- a/common/datavalues/src/arrays/ops/to_values.rs +++ b/common/datavalues/src/arrays/ops/to_values.rs @@ -64,28 +64,6 @@ where } } -impl ToValues for DFUtf8Array { - fn to_values(&self) -> Result> { - let mut values = Vec::with_capacity(self.len()); - let array = self.inner(); - - if array.null_count() == 0 { - for index in 0..self.len() { - values.push(DataValue::Utf8(Some(array.value(index).to_string()))) - } - } else { - for index in 0..self.len() { - match array.is_null(index) { - true => values.push(DataValue::Utf8(None)), - false => values.push(DataValue::Utf8(Some(array.value(index).to_string()))), - } - } - } - - Ok(values) - } -} - impl ToValues for DFBooleanArray { fn to_values(&self) -> Result> { let mut values = Vec::with_capacity(self.len()); @@ -108,20 +86,20 @@ impl ToValues for DFBooleanArray { } } -impl ToValues for DFBinaryArray { +impl ToValues for DFStringArray { fn to_values(&self) -> Result> { let mut values = Vec::with_capacity(self.len()); let array = self.inner(); if array.null_count() == 0 { for index in 0..self.len() { - values.push(DataValue::Binary(Some(array.value(index).to_vec()))) + values.push(DataValue::String(Some(array.value(index).to_vec()))) } } else { for index 
in 0..self.len() { match array.is_null(index) { - true => values.push(DataValue::Binary(None)), - false => values.push(DataValue::Binary(Some(array.value(index).to_vec()))), + true => values.push(DataValue::String(None)), + false => values.push(DataValue::String(Some(array.value(index).to_vec()))), } } } diff --git a/common/datavalues/src/arrays/ops/vec_hash.rs b/common/datavalues/src/arrays/ops/vec_hash.rs index 3b91f79222e2f..1bce8e70fd412 100644 --- a/common/datavalues/src/arrays/ops/vec_hash.rs +++ b/common/datavalues/src/arrays/ops/vec_hash.rs @@ -46,7 +46,7 @@ where } } -impl VecHash for DFUtf8Array { +impl VecHash for DFBooleanArray { fn vec_hash(&self, hasher: DFHasher) -> Result { Ok(self.apply_cast_numeric(|v| { let mut h = hasher.clone_initial(); @@ -56,9 +56,10 @@ impl VecHash for DFUtf8Array { } } -impl VecHash for DFBooleanArray { +impl VecHash for DFFloat32Array { fn vec_hash(&self, hasher: DFHasher) -> Result { Ok(self.apply_cast_numeric(|v| { + let v = v.to_bits(); let mut h = hasher.clone_initial(); v.hash(&mut h); h.finish() @@ -66,7 +67,7 @@ impl VecHash for DFBooleanArray { } } -impl VecHash for DFFloat32Array { +impl VecHash for DFFloat64Array { fn vec_hash(&self, hasher: DFHasher) -> Result { Ok(self.apply_cast_numeric(|v| { let v = v.to_bits(); @@ -76,10 +77,10 @@ impl VecHash for DFFloat32Array { })) } } -impl VecHash for DFFloat64Array { + +impl VecHash for DFStringArray { fn vec_hash(&self, hasher: DFHasher) -> Result { Ok(self.apply_cast_numeric(|v| { - let v = v.to_bits(); let mut h = hasher.clone_initial(); v.hash(&mut h); h.finish() @@ -87,25 +88,6 @@ impl VecHash for DFFloat64Array { } } -impl VecHash for DFBinaryArray { - fn vec_hash(&self, hasher: DFHasher) -> Result { - let binary_data = self.inner(); - let mut builder = PrimitiveArrayBuilder::::with_capacity(self.len()); - - (0..self.len()).for_each(|index| { - if self.is_null(index) { - builder.append_null(); - } else { - let mut h = hasher.clone_initial(); - 
h.write(binary_data.value(index)); - builder.append_value(h.finish()); - } - }); - - Ok(builder.finish()) - } -} - impl VecHash for DFListArray { fn vec_hash(&self, _hasher: DFHasher) -> Result { Err(ErrorCode::BadDataValueType(format!( diff --git a/common/datavalues/src/arrays/string/builder.rs b/common/datavalues/src/arrays/string/builder.rs new file mode 100644 index 0000000000000..95719cd2f4897 --- /dev/null +++ b/common/datavalues/src/arrays/string/builder.rs @@ -0,0 +1,122 @@ +// Copyright 2020 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_arrow::arrow::array::*; +use common_exception::Result; + +use crate::prelude::*; +use crate::utils::get_iter_capacity; + +pub struct StringArrayBuilder { + builder: MutableBinaryArray, +} + +impl StringArrayBuilder { + pub fn with_capacity(capacity: usize) -> Self { + Self { + builder: MutableBinaryArray::::with_capacity(capacity), + } + } + + pub fn append_value(&mut self, value: impl AsRef<[u8]>) { + self.builder.push(Some(value)) + } + + #[inline] + pub fn append_null(&mut self) { + self.builder.push_null(); + } + + #[inline] + pub fn append_option>(&mut self, opt: Option) { + match opt { + Some(s) => self.append_value(s), + None => self.append_null(), + } + } + + pub fn finish(&mut self) -> DFStringArray { + let array = self.builder.as_arc(); + DFStringArray::from_arrow_array(array.as_ref()) + } +} + +impl ArrayDeserializer for StringArrayBuilder { + fn de(&mut self, reader: &mut &[u8]) -> Result<()> { + self.append_value(reader.clone()); + Ok(()) + } + + fn de_batch(&mut self, reader: &[u8], step: usize, rows: usize) -> Result<()> { + for row in 0..rows { + let reader = &reader[step * row..]; + self.append_value(reader.clone()); + } + Ok(()) + } + + fn finish_to_series(&mut self) -> Series { + self.finish().into_series() + } + + fn de_text(&mut self, reader: &[u8]) { + self.append_value(reader) + } + + fn de_null(&mut self) { + self.append_null() + } +} + +impl NewDataArray for DFStringArray +where S: AsRef<[u8]> +{ + fn new_from_slice(v: &[S]) -> Self { + let values_size = v.iter().fold(0, |acc, s| acc + s.as_ref().len()); + let mut builder = StringArrayBuilder::with_capacity(values_size); + v.iter().for_each(|val| { + builder.append_value(val.as_ref()); + }); + + builder.finish() + } + + fn new_from_opt_slice(opt_v: &[Option]) -> Self { + let values_size = opt_v.iter().fold(0, |acc, s| match s { + Some(s) => acc + s.as_ref().len(), + None => acc, + }); + let mut builder = StringArrayBuilder::with_capacity(values_size); + 
opt_v.iter().for_each(|opt| match opt { + Some(v) => builder.append_value(v.as_ref()), + None => builder.append_null(), + }); + builder.finish() + } + + fn new_from_opt_iter(it: impl Iterator>) -> Self { + let cap = get_iter_capacity(&it); + let mut builder = StringArrayBuilder::with_capacity(cap * 5); + it.for_each(|opt| builder.append_option(opt)); + builder.finish() + } + + /// Create a new DataArray from an iterator. + fn new_from_iter(it: impl Iterator) -> Self { + let cap = get_iter_capacity(&it); + let mut builder = StringArrayBuilder::with_capacity(cap * 5); + it.for_each(|v| builder.append_value(v)); + builder.finish() + } +} diff --git a/common/datavalues/src/arrays/binary/iterator.rs b/common/datavalues/src/arrays/string/iterator.rs similarity index 80% rename from common/datavalues/src/arrays/binary/iterator.rs rename to common/datavalues/src/arrays/string/iterator.rs index 19fd2eb0d812a..efaaedb8ab80e 100644 --- a/common/datavalues/src/arrays/binary/iterator.rs +++ b/common/datavalues/src/arrays/string/iterator.rs @@ -19,37 +19,37 @@ use common_arrow::arrow::trusted_len::TrustedLen; use crate::prelude::*; -impl<'a> IntoIterator for &'a DFBinaryArray { +impl<'a> IntoIterator for &'a DFStringArray { type Item = Option<&'a [u8]>; - type IntoIter = ZipValidity<'a, &'a [u8], BinaryValueIter<'a, i64>>; + type IntoIter = ZipValidity<'a, &'a [u8], StringValueIter<'a, i64>>; fn into_iter(self) -> Self::IntoIter { zip_validity( - BinaryValueIter::new(&self.array), + StringValueIter::new(&self.array), self.array.validity().as_ref().map(|x| x.iter()), ) } } -impl DFBinaryArray { +impl DFStringArray { pub fn into_no_null_iter<'a>(&'a self) -> impl TrustedLen + '_ + Send + Sync { - BinaryIterNoNull::new(self.inner()) + StringIterNoNull::new(self.inner()) } } /// Iterator over slices of `&[u8]`. 
#[derive(Debug, Clone)] -pub struct BinaryValueIter<'a, O: Offset> { +pub struct StringValueIter<'a, O: Offset> { array: &'a BinaryArray, index: usize, } -impl<'a, O: Offset> BinaryValueIter<'a, O> { +impl<'a, O: Offset> StringValueIter<'a, O> { pub fn new(array: &'a BinaryArray) -> Self { Self { array, index: 0 } } } -impl<'a, O: Offset> Iterator for BinaryValueIter<'a, O> { +impl<'a, O: Offset> Iterator for StringValueIter<'a, O> { type Item = &'a [u8]; #[inline] @@ -71,19 +71,19 @@ impl<'a, O: Offset> Iterator for BinaryValueIter<'a, O> { } /// all arrays have known size. -impl<'a> ExactSizeIterator for BinaryIterNoNull<'a> {} -unsafe impl<'a> TrustedLen for BinaryIterNoNull<'a> {} +impl<'a> ExactSizeIterator for StringIterNoNull<'a> {} +unsafe impl<'a> TrustedLen for StringIterNoNull<'a> {} -pub struct BinaryIterNoNull<'a> { +pub struct StringIterNoNull<'a> { array: &'a LargeBinaryArray, current: usize, current_end: usize, } -impl<'a> BinaryIterNoNull<'a> { +impl<'a> StringIterNoNull<'a> { /// create a new iterator pub fn new(array: &'a LargeBinaryArray) -> Self { - BinaryIterNoNull { + StringIterNoNull { array, current: 0, current_end: array.len(), @@ -91,7 +91,7 @@ impl<'a> BinaryIterNoNull<'a> { } } -impl<'a> Iterator for BinaryIterNoNull<'a> { +impl<'a> Iterator for StringIterNoNull<'a> { type Item = &'a [u8]; fn next(&mut self) -> Option { diff --git a/common/datavalues/src/arrays/binary/mod.rs b/common/datavalues/src/arrays/string/mod.rs similarity index 64% rename from common/datavalues/src/arrays/binary/mod.rs rename to common/datavalues/src/arrays/string/mod.rs index 1d96d8ef85e12..7e6670410ecc9 100644 --- a/common/datavalues/src/arrays/binary/mod.rs +++ b/common/datavalues/src/arrays/string/mod.rs @@ -25,17 +25,17 @@ pub use iterator::*; use crate::prelude::*; #[derive(Debug, Clone)] -pub struct DFBinaryArray { +pub struct DFStringArray { pub(crate) array: LargeBinaryArray, } -impl From for DFBinaryArray { +impl From for DFStringArray { fn from(array: 
LargeBinaryArray) -> Self { Self { array } } } -impl DFBinaryArray { +impl DFStringArray { pub fn new(array: LargeBinaryArray) -> Self { Self { array } } @@ -55,7 +55,7 @@ impl DFBinaryArray { } pub fn data_type(&self) -> &DataType { - &DataType::Binary + &DataType::String } /// # Safety @@ -82,6 +82,11 @@ impl DFBinaryArray { self.array.null_count() } + #[inline] + pub fn all_is_null(&self) -> bool { + self.null_count() == self.len() + } + #[inline] pub fn is_null(&self, i: usize) -> bool { self.array.is_null(i) @@ -128,3 +133,59 @@ impl DFBinaryArray { e.collect() } } + +/// # Safety +/// Note this doesn't do any bound checking, for performance reason. +pub unsafe fn take_string_iter_unchecked>( + arr: &LargeBinaryArray, + indices: I, +) -> LargeBinaryArray { + match arr.null_count() { + 0 => { + let iter = indices + .into_iter() + .map(|idx| Some(arr.value_unchecked(idx))); + LargeBinaryArray::from_trusted_len_iter_unchecked(iter) + } + _ => { + let iter = indices.into_iter().map(|idx| { + if arr.is_null(idx) { + None + } else { + Some(arr.value_unchecked(idx)) + } + }); + LargeBinaryArray::from_trusted_len_iter_unchecked(iter) + } + } +} + +/// # Safety +/// Note this doesn't do any bound checking, for performance reason. 
+pub unsafe fn take_string_opt_iter_unchecked>>( + arr: &LargeBinaryArray, + indices: I, +) -> LargeBinaryArray { + match arr.null_count() { + 0 => { + let iter = indices + .into_iter() + .map(|opt_idx| opt_idx.map(|idx| arr.value_unchecked(idx))); + + LargeBinaryArray::from_trusted_len_iter_unchecked(iter) + } + _ => { + let iter = indices.into_iter().map(|opt_idx| { + opt_idx.and_then(|idx| { + if arr.is_null(idx) { + None + } else { + Some(arr.value_unchecked(idx)) + } + }) + }); + + LargeBinaryArray::from_trusted_len_iter_unchecked(iter) + } + } +} diff --git a/common/datavalues/src/arrays/trusted_len.rs b/common/datavalues/src/arrays/trusted_len.rs index 98c47a94dfbb4..1636d4944b1cb 100644 --- a/common/datavalues/src/arrays/trusted_len.rs +++ b/common/datavalues/src/arrays/trusted_len.rs @@ -103,8 +103,8 @@ impl FromTrustedLenIterator for NoNull { iter.collect() } } -impl FromTrustedLenIterator for DFUtf8Array -where Ptr: DFAsRef +impl FromTrustedLenIterator for DFStringArray +where Ptr: DFAsRef<[u8]> { fn from_iter_trusted_length>(iter: I) -> Self { iter.collect() diff --git a/common/datavalues/src/arrays/upstream_traits.rs b/common/datavalues/src/arrays/upstream_traits.rs index 7a1a16071a1fb..9bad7512950a8 100644 --- a/common/datavalues/src/arrays/upstream_traits.rs +++ b/common/datavalues/src/arrays/upstream_traits.rs @@ -91,14 +91,14 @@ impl FromIterator for NoNull { } } -// FromIterator for Utf8Type variants.Array +// FromIterator for StringType variants.Array -impl FromIterator> for DFUtf8Array -where Ptr: AsRef +impl FromIterator> for DFStringArray +where Ptr: AsRef<[u8]> { fn from_iter>>(iter: I) -> Self { // 2021-02-07: this was ~30% faster than with the builder. - let arr = LargeUtf8Array::from_iter(iter); + let arr = LargeBinaryArray::from_iter(iter); arr.into() } } @@ -106,33 +106,33 @@ where Ptr: AsRef /// Local AsRef trait to circumvent the orphan rule. 
pub trait DFAsRef: AsRef {} -impl DFAsRef for String {} -impl DFAsRef for &str {} +impl DFAsRef<[u8]> for Vec {} +impl DFAsRef<[u8]> for &[u8] {} // &["foo", "bar"] -impl DFAsRef for &&str {} -impl<'a> DFAsRef for Cow<'a, str> {} +impl DFAsRef<[u8]> for &&[u8] {} +impl<'a> DFAsRef<[u8]> for Cow<'a, [u8]> {} -impl FromIterator for DFUtf8Array -where Ptr: DFAsRef +impl FromIterator for DFStringArray +where Ptr: DFAsRef<[u8]> { fn from_iter>(iter: I) -> Self { - let arr = LargeUtf8Array::from_iter_values(iter.into_iter()); + let arr = LargeBinaryArray::from_iter_values(iter.into_iter()); arr.into() } } /// From trait -impl<'a> From<&'a DFUtf8Array> for Vec> { - fn from(ca: &'a DFUtf8Array) -> Self { +impl<'a> From<&'a DFStringArray> for Vec> { + fn from(ca: &'a DFStringArray) -> Self { ca.inner().iter().collect() } } -impl From for Vec> { - fn from(ca: DFUtf8Array) -> Self { +impl From for Vec>> { + fn from(ca: DFStringArray) -> Self { ca.inner() .iter() - .map(|opt| opt.map(|s| s.to_string())) + .map(|opt| opt.map(|s| s.to_vec())) .collect() } } diff --git a/common/datavalues/src/data_group_value.rs b/common/datavalues/src/data_group_value.rs index 9ce6f9973283a..b9a487eef31bb 100644 --- a/common/datavalues/src/data_group_value.rs +++ b/common/datavalues/src/data_group_value.rs @@ -34,7 +34,7 @@ pub enum DataGroupValue { Int16(i16), Int32(i32), Int64(i64), - Utf8(Box), + String(Box>), Boolean(bool), } @@ -54,7 +54,7 @@ impl TryFrom<&DataValue> for DataGroupValue { DataValue::UInt16(Some(v)) => DataGroupValue::UInt16(*v), DataValue::UInt32(Some(v)) => DataGroupValue::UInt32(*v), DataValue::UInt64(Some(v)) => DataGroupValue::UInt64(*v), - DataValue::Utf8(Some(v)) => DataGroupValue::Utf8(Box::new(v.clone())), + DataValue::String(Some(v)) => DataGroupValue::String(Box::new(v.clone())), DataValue::Float32(None) | DataValue::Float64(None) @@ -67,7 +67,7 @@ impl TryFrom<&DataValue> for DataGroupValue { | DataValue::UInt16(None) | DataValue::UInt32(None) | 
DataValue::UInt64(None) - | DataValue::Utf8(None) => { + | DataValue::String(None) => { return Err(ErrorCode::BadDataValueType(format!( "Cannot convert a DataValue holding NULL ({:?})", value @@ -98,7 +98,7 @@ impl From<&DataGroupValue> for DataValue { DataGroupValue::UInt16(v) => DataValue::UInt16(Some(*v)), DataGroupValue::UInt32(v) => DataValue::UInt32(Some(*v)), DataGroupValue::UInt64(v) => DataValue::UInt64(Some(*v)), - DataGroupValue::Utf8(v) => DataValue::Utf8(Some(v.to_string())), + DataGroupValue::String(v) => DataValue::String(Some(v.to_vec())), } } } diff --git a/common/datavalues/src/data_value.rs b/common/datavalues/src/data_value.rs index befeec8f3c673..85c6e71de9b31 100644 --- a/common/datavalues/src/data_value.rs +++ b/common/datavalues/src/data_value.rs @@ -28,7 +28,7 @@ use common_io::prelude::*; use crate::arrays::ListBooleanArrayBuilder; use crate::arrays::ListBuilderTrait; use crate::arrays::ListPrimitiveArrayBuilder; -use crate::arrays::ListUtf8ArrayBuilder; +use crate::arrays::ListStringArrayBuilder; use crate::prelude::*; use crate::series::IntoSeries; use crate::series::Series; @@ -50,8 +50,7 @@ pub enum DataValue { UInt64(Option), Float32(Option), Float64(Option), - Binary(Option>), - Utf8(Option), + String(Option>), // Container struct. 
List(Option>, DataType), @@ -75,8 +74,7 @@ impl DataValue { | DataValue::UInt64(None) | DataValue::Float32(None) | DataValue::Float64(None) - | DataValue::Binary(None) - | DataValue::Utf8(None) + | DataValue::String(None) | DataValue::Null | DataValue::List(None, _) ) @@ -96,7 +94,6 @@ impl DataValue { DataValue::UInt64(_) => DataType::UInt64, DataValue::Float32(_) => DataType::Float32, DataValue::Float64(_) => DataType::Float64, - DataValue::Utf8(_) => DataType::Utf8, DataValue::List(_, data_type) => { DataType::List(Box::new(DataField::new("item", data_type.clone(), true))) } @@ -111,7 +108,7 @@ impl DataValue { .collect::>(); DataType::Struct(fields) } - DataValue::Binary(_) => DataType::Binary, + DataValue::String(_) => DataType::String, } } @@ -137,17 +134,10 @@ impl DataValue { DataValue::UInt64(values) => Ok(build_constant_series! {DFUInt64Array, values, size}), DataValue::Float32(values) => Ok(build_constant_series! {DFFloat32Array, values, size}), DataValue::Float64(values) => Ok(build_constant_series! {DFFloat64Array, values, size}), - - DataValue::Utf8(values) => match values { - None => Ok(DFUtf8Array::full_null(size).into_series()), - Some(v) => Ok(DFUtf8Array::full(v.deref(), size).into_series()), - }, - - DataValue::Binary(values) => match values { - None => Ok(DFBinaryArray::full_null(size).into_series()), - Some(v) => Ok(DFBinaryArray::full(v.deref(), size).into_series()), + DataValue::String(values) => match values { + None => Ok(DFStringArray::full_null(size).into_series()), + Some(v) => Ok(DFStringArray::full(v.deref(), size).into_series()), }, - DataValue::List(values, data_type) => match data_type { DataType::Int8 => build_list_series! {i8, values, size, data_type }, DataType::Int16 => build_list_series! 
{i16, values, size, data_type }, @@ -177,8 +167,8 @@ impl DataValue { } Ok(builder.finish().into_series()) } - DataType::Utf8 => { - let mut builder = ListUtf8ArrayBuilder::with_capacity(0, size); + DataType::String => { + let mut builder = ListStringArrayBuilder::with_capacity(0, size); match values { Some(v) => { let series = DataValue::try_into_data_array(v, data_type)?; @@ -274,7 +264,19 @@ typed_cast_from_data_value_to_std!(UInt64, u64); typed_cast_from_data_value_to_std!(Float32, f32); typed_cast_from_data_value_to_std!(Float64, f64); typed_cast_from_data_value_to_std!(Boolean, bool); -typed_cast_from_data_value_to_std!(Utf8, String); + +impl DFTryFrom for Vec { + fn try_from(value: DataValue) -> Result { + match value { + DataValue::String(Some(inner_value)) => Ok(inner_value), + _ => Err(ErrorCode::BadDataValueType(format!( + "DataValue Error: Cannot convert {:?} to {}", + value, + std::any::type_name::() + ))), + } + } +} std_to_data_value!(Int8, i8); std_to_data_value!(Int16, i16); @@ -288,40 +290,28 @@ std_to_data_value!(Float32, f32); std_to_data_value!(Float64, f64); std_to_data_value!(Boolean, bool); -impl From<&str> for DataValue { - fn from(x: &str) -> Self { - DataValue::Utf8(Some(x.to_string())) +impl From<&[u8]> for DataValue { + fn from(x: &[u8]) -> Self { + DataValue::String(Some(x.to_vec())) } } -impl From> for DataValue { - fn from(x: Option<&str>) -> Self { - let x = x.map(|c| c.to_string()); +impl From> for DataValue { + fn from(x: Option<&[u8]>) -> Self { + let x = x.map(|c| c.to_vec()); DataValue::from(x) } } -impl From for DataValue { - fn from(x: String) -> Self { - DataValue::Utf8(Some(x)) - } -} - -impl From> for DataValue { - fn from(x: Option) -> Self { - DataValue::Utf8(x) - } -} - impl From> for DataValue { fn from(x: Vec) -> Self { - DataValue::Binary(Some(x)) + DataValue::String(Some(x)) } } impl From>> for DataValue { fn from(x: Option>) -> Self { - DataValue::Binary(x) + DataValue::String(x) } } @@ -340,13 +330,12 @@ impl 
From<&DataType> for DataValue { DataType::UInt64 => DataValue::UInt64(None), DataType::Float32 => DataValue::Float32(None), DataType::Float64 => DataValue::Float64(None), - DataType::Utf8 => DataValue::Utf8(None), DataType::Date16 => DataValue::UInt16(None), DataType::Date32 => DataValue::UInt32(None), DataType::DateTime32 => DataValue::UInt32(None), DataType::List(f) => DataValue::List(None, f.data_type().clone()), DataType::Struct(_) => DataValue::Struct(vec![]), - DataType::Binary => DataValue::Binary(None), + DataType::String => DataValue::String(None), } } } @@ -375,9 +364,8 @@ impl fmt::Display for DataValue { DataValue::UInt16(v) => format_data_value_with_option!(f, v), DataValue::UInt32(v) => format_data_value_with_option!(f, v), DataValue::UInt64(v) => format_data_value_with_option!(f, v), - DataValue::Utf8(v) => format_data_value_with_option!(f, v), - DataValue::Binary(None) => write!(f, "NULL"), - DataValue::Binary(Some(v)) => { + DataValue::String(None) => write!(f, "NULL"), + DataValue::String(Some(v)) => { for c in v { write!(f, "{:02x}", c)?; } @@ -418,9 +406,8 @@ impl fmt::Debug for DataValue { DataValue::UInt64(v) => format_data_value_with_option!(f, v), DataValue::Float32(v) => format_data_value_with_option!(f, v), DataValue::Float64(v) => format_data_value_with_option!(f, v), - DataValue::Utf8(v) => format_data_value_with_option!(f, v), - DataValue::Binary(None) => write!(f, "{}", self), - DataValue::Binary(Some(_)) => write!(f, "\"{}\"", self), + DataValue::String(None) => write!(f, "{}", self), + DataValue::String(Some(_)) => write!(f, "{}", self), DataValue::List(_, _) => write!(f, "[{}]", self), DataValue::Struct(v) => write!(f, "{:?}", v), } diff --git a/common/datavalues/src/data_value_ops.rs b/common/datavalues/src/data_value_ops.rs index ffffe13eeaeec..e2363fdce6315 100644 --- a/common/datavalues/src/data_value_ops.rs +++ b/common/datavalues/src/data_value_ops.rs @@ -21,7 +21,7 @@ impl DataValue { pub fn custom_display(&self, 
single_quote: bool) -> String { let s = self.to_string(); if single_quote { - if let DataValue::Utf8(Some(_)) = self { + if let DataValue::String(Some(_)) = self { return format!("'{}'", s); } } @@ -84,7 +84,7 @@ impl DataValue { try_build_array! {PrimitiveArrayBuilder, f64, Float64, values} } DataType::Boolean => try_build_array! {values}, - DataType::Utf8 => try_build_array! {Utf8, values}, + DataType::String => try_build_array! {String, values}, other => Result::Err(ErrorCode::BadDataValueType(format!( "Unexpected type:{} for DataValue List", other diff --git a/common/datavalues/src/macros.rs b/common/datavalues/src/macros.rs index 7cdce361f08b9..41c5ccb6a5a4a 100644 --- a/common/datavalues/src/macros.rs +++ b/common/datavalues/src/macros.rs @@ -30,10 +30,10 @@ macro_rules! dispatch_numeric_types { #[macro_export] macro_rules! match_data_type_apply_macro_ca { - ($self:expr, $macro:ident, $macro_utf8:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ + ($self:expr, $macro:ident, $macro_string:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ match $self.data_type() { - DataType::Utf8 => $macro_utf8!($self.utf8().unwrap() $(, $opt_args)*), + DataType::String => $macro_string!($self.string().unwrap() $(, $opt_args)*), DataType::Boolean => $macro_bool!($self.bool().unwrap() $(, $opt_args)*), DataType::UInt8 => $macro!($self.u8().unwrap() $(, $opt_args)*), DataType::UInt16 => $macro!($self.u16().unwrap() $(, $opt_args)*), @@ -52,7 +52,7 @@ macro_rules! match_data_type_apply_macro_ca { }}; } -// doesn't include Bool and Utf8 +// doesn't include Bool and String #[macro_export] macro_rules! apply_method_numeric_series { ($self:ident, $method:ident, $($args:expr),*) => { @@ -77,9 +77,9 @@ macro_rules! apply_method_numeric_series { #[macro_export] macro_rules! 
match_data_type_apply_macro { - ($obj:expr, $macro:ident, $macro_utf8:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ + ($obj:expr, $macro:ident, $macro_string:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{ match $obj { - DataType::Utf8 => $macro_utf8!($($opt_args)*), + DataType::String => $macro_string!($($opt_args)*), DataType::Boolean => $macro_bool!($($opt_args)*), DataType::UInt8 => $macro!(u8 $(, $opt_args)*), DataType::UInt16 => $macro!(u16 $(, $opt_args)*), @@ -192,13 +192,13 @@ macro_rules! try_build_array { Ok(builder.finish().into_series()) }}; - // utf8 - ($utf8:ident, $VALUES:expr) => {{ - let mut builder = Utf8ArrayBuilder::with_capacity($VALUES.len()); + // String + ($string:ident, $VALUES:expr) => {{ + let mut builder = StringArrayBuilder::with_capacity($VALUES.len()); for value in $VALUES.iter() { match value { - DataValue::Utf8(Some(v)) => builder.append_value(v), - DataValue::Utf8(None) => builder.append_null(), + DataValue::String(Some(v)) => builder.append_value(v), + DataValue::String(None) => builder.append_null(), _ => unreachable!(), } } diff --git a/common/datavalues/src/prelude.rs b/common/datavalues/src/prelude.rs index 4486c0e3989a9..ab123dd5f914d 100644 --- a/common/datavalues/src/prelude.rs +++ b/common/datavalues/src/prelude.rs @@ -50,6 +50,5 @@ pub use crate::DataValueLogicOperator; pub use crate::DataValueLogicOperator::*; pub type AlignedVec = common_arrow::arrow::buffer::MutableBuffer; -pub type LargeUtf8Array = common_arrow::arrow::array::Utf8Array; pub type LargeBinaryArray = common_arrow::arrow::array::BinaryArray; pub type LargeListArray = common_arrow::arrow::array::ListArray; diff --git a/common/datavalues/src/series/arithmetic.rs b/common/datavalues/src/series/arithmetic.rs index 1faf57cf92d53..b8debd51b7b26 100644 --- a/common/datavalues/src/series/arithmetic.rs +++ b/common/datavalues/src/series/arithmetic.rs @@ -184,7 +184,7 @@ where } } -impl NumOpsDispatch for DFUtf8Array { +impl NumOpsDispatch for 
DFStringArray { fn add_to(&self, rhs: &Series) -> Result { let rhs = unsafe { self.unpack(rhs)? }; let out = (self + rhs)?; @@ -193,7 +193,6 @@ impl NumOpsDispatch for DFUtf8Array { } impl NumOpsDispatch for DFBooleanArray {} impl NumOpsDispatch for DFListArray {} -impl NumOpsDispatch for DFBinaryArray {} impl NumOpsDispatch for DFNullArray {} impl NumOpsDispatch for DFStructArray {} diff --git a/common/datavalues/src/series/comparison.rs b/common/datavalues/src/series/comparison.rs index 9d96fc1dfa723..acf946371be90 100644 --- a/common/datavalues/src/series/comparison.rs +++ b/common/datavalues/src/series/comparison.rs @@ -24,7 +24,7 @@ macro_rules! impl_compare { ($self:expr, $rhs:expr, $method:ident) => {{ match $self.data_type() { DataType::Boolean => $self.bool().unwrap().$method($rhs.bool().unwrap()), - DataType::Utf8 => $self.utf8().unwrap().$method($rhs.utf8().unwrap()), + DataType::String => $self.string().unwrap().$method($rhs.string().unwrap()), DataType::UInt8 => $self.u8().unwrap().$method($rhs.u8().unwrap()), DataType::UInt16 => $self.u16().unwrap().$method($rhs.u16().unwrap()), DataType::UInt32 => $self.u32().unwrap().$method($rhs.u32().unwrap()), @@ -63,7 +63,7 @@ fn null_to_boolean(s: &Series) -> DFBooleanArray { fn coerce_cmp_lhs_rhs(lhs: &Series, rhs: &Series) -> Result<(Series, Series)> { if lhs.data_type() == rhs.data_type() - && (lhs.data_type() == &DataType::Utf8 || lhs.data_type() == &DataType::Boolean) + && (lhs.data_type() == &DataType::String || lhs.data_type() == &DataType::Boolean) { return Ok((lhs.clone(), rhs.clone())); } diff --git a/common/datavalues/src/series/de.rs b/common/datavalues/src/series/de.rs index c6efda940f7cb..55e53151bcd63 100644 --- a/common/datavalues/src/series/de.rs +++ b/common/datavalues/src/series/de.rs @@ -34,7 +34,7 @@ impl DataType { match self { DataType::Boolean => Ok(Box::new(BooleanArrayBuilder::with_capacity(capacity))), - DataType::Utf8 => Ok(Box::new(Utf8ArrayBuilder::with_capacity(capacity))), + 
DataType::String => Ok(Box::new(StringArrayBuilder::with_capacity(capacity))), other => Err(ErrorCode::BadDataValueType(format!( "create_deserializer does not support type '{:?}'", diff --git a/common/datavalues/src/series/series_impl.rs b/common/datavalues/src/series/series_impl.rs index 17f418b53f316..254bec6ebbd66 100644 --- a/common/datavalues/src/series/series_impl.rs +++ b/common/datavalues/src/series/series_impl.rs @@ -167,18 +167,10 @@ pub trait SeriesTrait: Send + Sync + fmt::Debug { ))) } - /// Unpack to DFArray of data_type utf8 - fn utf8(&self) -> Result<&DFUtf8Array> { + /// Unpack to DFArray of data_type string + fn string(&self) -> Result<&DFStringArray> { Err(ErrorCode::IllegalDataType(format!( - "{:?} != utf8", - self.data_type() - ))) - } - - /// Unpack to DFArray of data_type binary - fn binary(&self) -> Result<&DFBinaryArray> { - Err(ErrorCode::IllegalDataType(format!( - "{:?} != binary", + "{:?} != string", self.data_type() ))) } @@ -222,19 +214,18 @@ macro_rules! impl_from { }; } -impl<'a, T: AsRef<[&'a str]>> SeriesFrom for Series { +impl<'a, T: AsRef<[&'a [u8]]>> SeriesFrom for Series { fn new(v: T) -> Self { - DFUtf8Array::new_from_slice(v.as_ref()).into_series() + DFStringArray::new_from_slice(v.as_ref()).into_series() } } -impl<'a, T: AsRef<[Option<&'a str>]>> SeriesFrom]> for Series { +impl<'a, T: AsRef<[Option<&'a [u8]>]>> SeriesFrom]> for Series { fn new(v: T) -> Self { - DFUtf8Array::new_from_opt_slice(v.as_ref()).into_series() + DFStringArray::new_from_opt_slice(v.as_ref()).into_series() } } -impl_from!([String], DFUtf8Array, new_from_slice); impl_from!([bool], DFBooleanArray, new_from_slice); impl_from!([u8], DFUInt8Array, new_from_slice); impl_from!([u16], DFUInt16Array, new_from_slice); @@ -246,8 +237,8 @@ impl_from!([i32], DFInt32Array, new_from_slice); impl_from!([i64], DFInt64Array, new_from_slice); impl_from!([f32], DFFloat32Array, new_from_slice); impl_from!([f64], DFFloat64Array, new_from_slice); +impl_from!([Vec], 
DFStringArray, new_from_slice); -impl_from!([Option], DFUtf8Array, new_from_opt_slice); impl_from!([Option], DFBooleanArray, new_from_opt_slice); impl_from!([Option], DFUInt8Array, new_from_opt_slice); impl_from!([Option], DFUInt16Array, new_from_opt_slice); @@ -259,6 +250,7 @@ impl_from!([Option], DFInt32Array, new_from_opt_slice); impl_from!([Option], DFInt64Array, new_from_opt_slice); impl_from!([Option], DFFloat32Array, new_from_opt_slice); impl_from!([Option], DFFloat64Array, new_from_opt_slice); +impl_from!([Option>], DFStringArray, new_from_opt_slice); impl Series { /// Check if series are equal. Note that `None == None` evaluates to `false` @@ -319,11 +311,10 @@ impl IntoSeries for ArrayRef { DataType::Float32 => DFFloat32Array::from_arrow_array(self.as_ref()).into_series(), DataType::Float64 => DFFloat64Array::from_arrow_array(self.as_ref()).into_series(), - DataType::Utf8 => DFUtf8Array::from_arrow_array(self.as_ref()).into_series(), DataType::List(_) => DFListArray::from_arrow_array(self.as_ref()).into_series(), DataType::Struct(_) => DFStructArray::from_arrow_array(self.as_ref()).into_series(), - DataType::Binary => DFBinaryArray::from_arrow_array(self.as_ref()).into_series(), + DataType::String => DFStringArray::from_arrow_array(self.as_ref()).into_series(), _ => unreachable!(), } diff --git a/common/datavalues/src/series/wrap.rs b/common/datavalues/src/series/wrap.rs index 98cbce154c990..e833495483b9f 100644 --- a/common/datavalues/src/series/wrap.rs +++ b/common/datavalues/src/series/wrap.rs @@ -290,24 +290,13 @@ macro_rules! 
impl_dyn_array { } } - fn utf8(&self) -> Result<&DFUtf8Array> { - if matches!(self.0.data_type(), &DataType::Utf8) { - unsafe { Ok(&*(self as *const dyn SeriesTrait as *const DFUtf8Array)) } + /// Unpack to DFArray of data_type string + fn string(&self) -> Result<&DFStringArray> { + if matches!(self.0.data_type(), &DataType::String) { + unsafe { Ok(&*(self as *const dyn SeriesTrait as *const DFStringArray)) } } else { Err(ErrorCode::IllegalDataType(format!( - "cannot unpack Series of type {:?} into utf8", - self.data_type(), - ))) - } - } - - /// Unpack to DFArray of data_type binary - fn binary(&self) -> Result<&DFBinaryArray> { - if matches!(self.0.data_type(), &DataType::Binary) { - unsafe { Ok(&*(self as *const dyn SeriesTrait as *const DFBinaryArray)) } - } else { - Err(ErrorCode::IllegalDataType(format!( - "cannot unpack Series of type {:?} into binary", + "cannot unpack Series of type {:?} into string", self.data_type(), ))) } @@ -351,8 +340,7 @@ impl_dyn_array!(DFInt8Array); impl_dyn_array!(DFInt16Array); impl_dyn_array!(DFInt32Array); impl_dyn_array!(DFInt64Array); -impl_dyn_array!(DFUtf8Array); impl_dyn_array!(DFListArray); impl_dyn_array!(DFBooleanArray); -impl_dyn_array!(DFBinaryArray); +impl_dyn_array!(DFStringArray); impl_dyn_array!(DFStructArray); diff --git a/common/datavalues/src/types/data_df_type.rs b/common/datavalues/src/types/data_df_type.rs index c758f3531557b..b0c2d0aa5e2a2 100644 --- a/common/datavalues/src/types/data_df_type.rs +++ b/common/datavalues/src/types/data_df_type.rs @@ -48,15 +48,13 @@ impl_df_datatype!(f32, Float32); impl_df_datatype!(f64, Float64); impl_df_datatype!(bool, Boolean); -impl_df_datatype!(String, Utf8); - #[derive(Debug)] pub struct Null; impl_df_datatype!(Null, Null); impl DFDataType for Vec { fn data_type() -> DataType { - DataType::Binary + DataType::String } } diff --git a/common/datavalues/src/types/data_type.rs b/common/datavalues/src/types/data_type.rs index c8b962fdd566f..ed1de181f8456 100644 --- 
a/common/datavalues/src/types/data_type.rs +++ b/common/datavalues/src/types/data_type.rs @@ -36,7 +36,6 @@ pub enum DataType { Int64, Float32, Float64, - Utf8, /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (16 bits), it's physical type is UInt16 Date16, @@ -50,7 +49,7 @@ pub enum DataType { List(Box), Struct(Vec), - Binary, + String, } #[derive( @@ -115,7 +114,6 @@ impl DataType { Int64 => ArrowDataType::Int64, Float32 => ArrowDataType::Float32, Float64 => ArrowDataType::Float64, - Utf8 => ArrowDataType::LargeUtf8, Date16 => ArrowDataType::UInt16, Date32 => ArrowDataType::UInt32, DateTime32 => ArrowDataType::UInt32, @@ -124,7 +122,7 @@ impl DataType { let arrows_fields = fs.iter().map(|f| f.to_arrow()).collect(); ArrowDataType::Struct(arrows_fields) } - Binary => ArrowDataType::LargeBinary, + String => ArrowDataType::LargeBinary, } } } @@ -155,8 +153,7 @@ impl From<&ArrowDataType> for DataType { let f: DataField = (f.as_ref()).into(); DataType::List(Box::new(f)) } - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => DataType::Utf8, - ArrowDataType::Binary | ArrowDataType::LargeBinary => DataType::Binary, + ArrowDataType::Binary | ArrowDataType::LargeBinary => DataType::String, // this is safe, because we define the datatype firstly _ => { diff --git a/common/datavalues/src/types/data_type_coercion.rs b/common/datavalues/src/types/data_type_coercion.rs index 5d6c643a933e2..0e0c10d706df8 100644 --- a/common/datavalues/src/types/data_type_coercion.rs +++ b/common/datavalues/src/types/data_type_coercion.rs @@ -111,20 +111,6 @@ pub fn construct_numeric_type( } } -/// Coercion rules for dictionary values (aka the type of the dictionary itself) - -/// Coercion rules for Strings: the type that both lhs and rhs can be -/// casted to for the purpose of a string computation -pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Result { - match (lhs_type, rhs_type) { - (DataType::Utf8, DataType::Utf8) => 
Ok(DataType::Utf8), - _ => Result::Err(ErrorCode::BadDataValueType(format!( - "Can't construct type from {} and {}", - lhs_type, rhs_type - ))), - } -} - /// Coercion rule for numerical types: The type that both lhs and rhs /// can be casted to for numerical calculation, while maintaining /// maximum precision diff --git a/common/datavalues/src/types/physical_data_type.rs b/common/datavalues/src/types/physical_data_type.rs index a46defc28dcd0..d7470059bb061 100644 --- a/common/datavalues/src/types/physical_data_type.rs +++ b/common/datavalues/src/types/physical_data_type.rs @@ -31,10 +31,9 @@ pub enum PhysicalDataType { Int64, Float32, Float64, - Utf8, List(Box), Struct(Vec), - Binary, + String, } impl From for PhysicalDataType { @@ -53,10 +52,9 @@ impl From for PhysicalDataType { DataType::Int64 => Int64, DataType::Float32 => Float32, DataType::Float64 => Float64, - DataType::Utf8 => Utf8, DataType::List(x) => List(x), DataType::Struct(x) => Struct(x), - DataType::Binary => Binary, + DataType::String => String, } } } @@ -77,10 +75,9 @@ impl From for DataType { PhysicalDataType::Int64 => Int64, PhysicalDataType::Float32 => Float32, PhysicalDataType::Float64 => Float64, - PhysicalDataType::Utf8 => Utf8, PhysicalDataType::List(x) => List(x), PhysicalDataType::Struct(x) => Struct(x), - PhysicalDataType::Binary => Binary, + PhysicalDataType::String => String, } } } diff --git a/common/datavalues/src/types/serializations/mod.rs b/common/datavalues/src/types/serializations/mod.rs index 83042bc12d734..04974917343d6 100644 --- a/common/datavalues/src/types/serializations/mod.rs +++ b/common/datavalues/src/types/serializations/mod.rs @@ -17,21 +17,19 @@ use common_exception::Result; use crate::prelude::DataColumn; use crate::DataType; -mod binary; mod boolean; mod date; mod date_time; mod null; mod number; -mod utf8; +mod string; -pub use binary::*; pub use boolean::*; pub use date::*; pub use date_time::*; pub use null::*; pub use number::*; -pub use utf8::*; +pub use 
string::*; pub trait TypeSerializer { fn serialize_strings(&self, column: &DataColumn) -> Result>; @@ -52,11 +50,10 @@ impl DataType { DataType::Int64 => Box::new(NumberSerializer::::default()), DataType::Float32 => Box::new(NumberSerializer::::default()), DataType::Float64 => Box::new(NumberSerializer::::default()), - DataType::Utf8 => Box::new(Utf8Serializer {}), DataType::Date16 => Box::new(DateSerializer::::default()), DataType::Date32 => Box::new(DateSerializer::::default()), DataType::DateTime32 => Box::new(DateTimeSerializer::::default()), - DataType::Binary => Box::new(BinarySerializer {}), + DataType::String => Box::new(StringSerializer {}), DataType::List(_) => todo!(), DataType::Struct(_) => todo!(), } diff --git a/common/datavalues/src/types/serializations/binary.rs b/common/datavalues/src/types/serializations/string.rs similarity index 88% rename from common/datavalues/src/types/serializations/binary.rs rename to common/datavalues/src/types/serializations/string.rs index 4fc08d056ec28..f835893e11bdf 100644 --- a/common/datavalues/src/types/serializations/binary.rs +++ b/common/datavalues/src/types/serializations/string.rs @@ -16,12 +16,12 @@ use common_exception::Result; use crate::prelude::*; -pub struct BinarySerializer {} +pub struct StringSerializer {} -impl TypeSerializer for BinarySerializer { +impl TypeSerializer for StringSerializer { fn serialize_strings(&self, column: &DataColumn) -> Result> { let array = column.to_array()?; - let array: &DFBinaryArray = array.static_cast(); + let array: &DFStringArray = array.static_cast(); let result: Vec = array .into_iter() diff --git a/common/datavalues/src/types/serializations/utf8.rs b/common/datavalues/src/types/serializations/utf8.rs deleted file mode 100644 index 1deab6bdb891f..0000000000000 --- a/common/datavalues/src/types/serializations/utf8.rs +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2020 Datafuse Labs. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_exception::Result; - -use crate::prelude::DFUtf8Array; -use crate::prelude::DataColumn; -use crate::TypeSerializer; - -pub struct Utf8Serializer {} - -impl TypeSerializer for Utf8Serializer { - fn serialize_strings(&self, column: &DataColumn) -> Result> { - let array = column.to_array()?; - let array: &DFUtf8Array = array.static_cast(); - - let result: Vec = array - .into_iter() - .map(|x| { - x.map(|v| v.to_string()) - .unwrap_or_else(|| "NULL".to_owned()) - }) - .collect(); - Ok(result) - } -} diff --git a/common/functions/src/aggregates/aggregate_arg_min_max.rs b/common/functions/src/aggregates/aggregate_arg_min_max.rs index fce3331c7e2e3..6f3b9f3073499 100644 --- a/common/functions/src/aggregates/aggregate_arg_min_max.rs +++ b/common/functions/src/aggregates/aggregate_arg_min_max.rs @@ -51,8 +51,8 @@ struct NumericState { pub data: DataValue, } -struct Utf8State { - pub value: Option, +struct StringState { + pub value: Option>, pub data: DataValue, } @@ -161,27 +161,27 @@ where } } -impl Utf8State { - fn merge_value(&mut self, data: DataValue, other: &str, is_min: bool) { +impl StringState { + fn merge_value(&mut self, data: DataValue, other: &[u8], is_min: bool) { match &self.value { Some(a) => { - let ord = a.as_str().partial_cmp(other); + let ord = a.as_slice().partial_cmp(other); match (ord, is_min) { (Some(Ordering::Greater), true) | (Some(Ordering::Less), false) => { - self.value = 
Some(other.to_string()); + self.value = Some(other.to_vec()); self.data = data; } _ => {} } } _ => { - self.value = Some(other.to_string()); + self.value = Some(other.to_vec()); self.data = data; } } } } -impl AggregateArgMinMaxState for Utf8State { +impl AggregateArgMinMaxState for StringState { fn new(data_type: &DataType) -> Self { Self { value: None, @@ -197,7 +197,7 @@ impl AggregateArgMinMaxState for Utf8State { _rows: usize, is_min: bool, ) -> Result<()> { - let array: &DFUtf8Array = data_series.static_cast(); + let array: &DFStringArray = data_series.static_cast(); array .into_iter() .zip(places.iter().enumerate()) @@ -224,7 +224,7 @@ impl AggregateArgMinMaxState for Utf8State { if index_value[0].is_null() { return Ok(()); } - let value: Result = DFTryFrom::try_from(index_value[1].clone()); + let value: Result> = DFTryFrom::try_from(index_value[1].clone()); if let Ok(other) = value { let data = data_series.try_get(index_value[0].as_u64()? as usize)?; @@ -237,7 +237,7 @@ impl AggregateArgMinMaxState for Utf8State { fn merge(&mut self, rhs: &Self, is_min: bool) -> Result<()> { if let Some(other) = &rhs.value { - self.merge_value(rhs.data.clone(), other.as_str(), is_min); + self.merge_value(rhs.data.clone(), other.as_slice(), is_min); } Ok(()) } @@ -248,7 +248,7 @@ impl AggregateArgMinMaxState for Utf8State { } fn deserialize(&mut self, reader: &mut &[u8]) -> Result<()> { - self.value = Option::::deserialize(reader)?; + self.value = Option::>::deserialize(reader)?; self.data = DataValue::deserialize(reader)?; Ok(()) } @@ -396,14 +396,14 @@ pub fn try_create_aggregate_arg_minmax_function( let data_type = arguments[1].data_type(); dispatch_numeric_types! 
{creator, data_type.clone(), is_min, display_name, arguments} - if data_type == &DataType::Utf8 { + if data_type == &DataType::String { if is_min { - return AggregateArgMinMaxFunction::::try_create_arg_min( + return AggregateArgMinMaxFunction::::try_create_arg_min( display_name, arguments, ); } else { - return AggregateArgMinMaxFunction::::try_create_arg_max( + return AggregateArgMinMaxFunction::::try_create_arg_max( display_name, arguments, ); diff --git a/common/functions/src/aggregates/aggregate_min_max.rs b/common/functions/src/aggregates/aggregate_min_max.rs index 25c0ea856ad27..19937dd075a4f 100644 --- a/common/functions/src/aggregates/aggregate_min_max.rs +++ b/common/functions/src/aggregates/aggregate_min_max.rs @@ -49,8 +49,8 @@ struct NumericState { pub value: Option, } -struct Utf8State { - pub value: Option, +struct StringState { + pub value: Option>, } impl NumericState @@ -131,23 +131,23 @@ where } } -impl Utf8State { - fn merge_value(&mut self, other: &str, is_min: bool) { +impl StringState { + fn merge_value(&mut self, other: &[u8], is_min: bool) { match &self.value { Some(a) => { - let ord = a.as_str().partial_cmp(other); + let ord = a.as_slice().partial_cmp(other); match (ord, is_min) { (Some(Ordering::Greater), true) | (Some(Ordering::Less), false) => { - self.value = Some(other.to_string()) + self.value = Some(other.to_vec()) } _ => {} } } - _ => self.value = Some(other.to_string()), + _ => self.value = Some(other.to_vec()), } } } -impl AggregateMinMaxState for Utf8State { +impl AggregateMinMaxState for StringState { fn default() -> Self { Self { value: None } } @@ -159,7 +159,7 @@ impl AggregateMinMaxState for Utf8State { _rows: usize, is_min: bool, ) -> Result<()> { - let array: &DFUtf8Array = series.static_cast(); + let array: &DFStringArray = series.static_cast(); array.into_iter().zip(places.iter()).for_each(|(x, place)| { let place = place.next(offset); if let Some(x) = x { @@ -172,16 +172,16 @@ impl AggregateMinMaxState for Utf8State { fn 
add_batch(&mut self, series: &Series, is_min: bool) -> Result<()> { let c = if is_min { series.min() } else { series.max() }?; - let other: Result = DFTryFrom::try_from(c); + let other: Result> = DFTryFrom::try_from(c); if let Ok(other) = other { - self.merge_value(other.as_str(), is_min); + self.merge_value(other.as_slice(), is_min); } Ok(()) } fn merge(&mut self, rhs: &Self, is_min: bool) -> Result<()> { if let Some(other) = &rhs.value { - self.merge_value(other.as_str(), is_min); + self.merge_value(other.as_slice(), is_min); } Ok(()) } @@ -190,7 +190,7 @@ impl AggregateMinMaxState for Utf8State { self.value.serialize_to_buf(writer) } fn deserialize(&mut self, reader: &mut &[u8]) -> Result<()> { - self.value = Option::::deserialize(reader)?; + self.value = Option::>::deserialize(reader)?; Ok(()) } @@ -331,11 +331,11 @@ pub fn try_create_aggregate_minmax_function( let data_type = arguments[0].data_type(); dispatch_numeric_types! {creator, data_type.clone(), is_min, display_name, arguments} - if data_type == &DataType::Utf8 { + if data_type == &DataType::String { if is_min { - return AggregateMinMaxFunction::::try_create_min(display_name, arguments); + return AggregateMinMaxFunction::::try_create_min(display_name, arguments); } else { - return AggregateMinMaxFunction::::try_create_max(display_name, arguments); + return AggregateMinMaxFunction::::try_create_max(display_name, arguments); } } diff --git a/common/functions/src/scalars/expressions/expression.rs b/common/functions/src/scalars/expressions/expression.rs index cb571868290ae..9083f737a6d58 100644 --- a/common/functions/src/scalars/expressions/expression.rs +++ b/common/functions/src/scalars/expressions/expression.rs @@ -49,16 +49,12 @@ impl ToCastFunction { Int64, Float32, Float64, - Utf8, Date16, Date32, DateTime32, - Binary + String } // aliases - map.insert("toString".into(), |display_name| { - CastFunction::create(display_name.to_string(), DataType::Utf8) - }); map.insert("toDate".into(), |display_name| 
{ CastFunction::create(display_name.to_string(), DataType::Date16) }); diff --git a/common/functions/src/scalars/hashes/siphash.rs b/common/functions/src/scalars/hashes/siphash.rs index ce3335e8cef7a..de0ca509d62d8 100644 --- a/common/functions/src/scalars/hashes/siphash.rs +++ b/common/functions/src/scalars/hashes/siphash.rs @@ -61,8 +61,7 @@ impl Function for SipHashFunction { | DataType::Date16 | DataType::Date32 | DataType::DateTime32 - | DataType::Utf8 - | DataType::Binary => Ok(DataType::UInt64), + | DataType::String => Ok(DataType::UInt64), _ => Result::Err(ErrorCode::BadArguments(format!( "Function Error: {} does not support {} type parameters", self.display_name, args[0] diff --git a/common/functions/src/scalars/strings/substring.rs b/common/functions/src/scalars/strings/substring.rs index f8c4c06c5d91a..abe77db109b0d 100644 --- a/common/functions/src/scalars/strings/substring.rs +++ b/common/functions/src/scalars/strings/substring.rs @@ -41,7 +41,7 @@ impl Function for SubstringFunction { } fn return_type(&self, _args: &[DataType]) -> Result { - Ok(DataType::Utf8) + Ok(DataType::String) } fn nullable(&self, _input_schema: &DataSchema) -> Result { diff --git a/common/functions/src/scalars/udfs/database.rs b/common/functions/src/scalars/udfs/database.rs index 12c3cb3d54cd5..3bf4ec3a10c97 100644 --- a/common/functions/src/scalars/udfs/database.rs +++ b/common/functions/src/scalars/udfs/database.rs @@ -38,7 +38,7 @@ impl Function for DatabaseFunction { } fn return_type(&self, _args: &[DataType]) -> Result { - Ok(DataType::Utf8) + Ok(DataType::String) } fn nullable(&self, _input_schema: &DataSchema) -> Result { diff --git a/common/functions/src/scalars/udfs/to_type_name.rs b/common/functions/src/scalars/udfs/to_type_name.rs index 2e79e24b332dd..5944055f85a29 100644 --- a/common/functions/src/scalars/udfs/to_type_name.rs +++ b/common/functions/src/scalars/udfs/to_type_name.rs @@ -42,7 +42,7 @@ impl Function for ToTypeNameFunction { } fn return_type(&self, 
_args: &[DataType]) -> Result { - Ok(DataType::Utf8) + Ok(DataType::String) } fn nullable(&self, _input_schema: &DataSchema) -> Result { @@ -52,7 +52,7 @@ impl Function for ToTypeNameFunction { fn eval(&self, columns: &DataColumnsWithField, input_rows: usize) -> Result { let type_name = format!("{}", columns[0].data_type()); Ok(DataColumn::Constant( - DataValue::Utf8(Some(type_name)), + DataValue::String(Some(type_name.into_bytes())), input_rows, )) } diff --git a/common/functions/src/scalars/udfs/version.rs b/common/functions/src/scalars/udfs/version.rs index de09079437176..89005b3f8f3b4 100644 --- a/common/functions/src/scalars/udfs/version.rs +++ b/common/functions/src/scalars/udfs/version.rs @@ -41,7 +41,7 @@ impl Function for VersionFunction { } fn return_type(&self, _args: &[DataType]) -> Result { - Ok(DataType::Utf8) + Ok(DataType::String) } fn nullable(&self, _input_schema: &DataSchema) -> Result { diff --git a/common/io/src/binary_de.rs b/common/io/src/binary_de.rs index 7f0ae7d891c32..c0b42afede203 100644 --- a/common/io/src/binary_de.rs +++ b/common/io/src/binary_de.rs @@ -35,12 +35,12 @@ macro_rules! apply_scalar_de { // primitive types and boolean apply_scalar_de! {u8, u16, u32, u64, i8, i16, i32, i64, f32, f64, bool} -impl BinaryDe for String { +impl BinaryDe for Vec { fn deserialize(reader: &mut R) -> Result { let str_len = reader.read_uvarint()? as usize; let mut buffer = vec![0_u8; str_len]; reader.read_exact(buffer.as_mut())?; - Ok(String::from_utf8(buffer)?) + Ok(buffer) } } diff --git a/common/io/src/binary_ser.rs b/common/io/src/binary_ser.rs index ae6a70ab468f9..1b8034f5cae45 100644 --- a/common/io/src/binary_ser.rs +++ b/common/io/src/binary_ser.rs @@ -40,16 +40,16 @@ macro_rules! apply_scalar_ser { // primitive types and boolean apply_scalar_ser! 
{u8, u16, u32, u64, i8, i16, i32, i64, f32, f64, bool} -impl BinarySer for String { +impl BinarySer for Vec { fn serialize(&self, writer: &mut W) -> Result<()> { - let bytes = self.as_bytes(); + let bytes = self.as_slice(); writer.write_uvarint(bytes.len() as u64)?; writer.write_all(bytes)?; Ok(()) } fn serialize_to_buf(&self, writer: &mut W) -> Result<()> { - let bytes = self.as_bytes(); + let bytes = self.as_slice(); writer.write_uvarint(bytes.len() as u64)?; writer.put_slice(bytes); Ok(()) diff --git a/common/io/src/binary_write.rs b/common/io/src/binary_write.rs index 5d656f657efb4..dd8c07d5fb274 100644 --- a/common/io/src/binary_write.rs +++ b/common/io/src/binary_write.rs @@ -26,6 +26,7 @@ pub trait BinaryWrite { fn write_string(&mut self, text: impl AsRef) -> Result<()>; fn write_uvarint(&mut self, v: u64) -> Result<()>; + fn write_binary(&mut self, text: impl AsRef<[u8]>) -> Result<()>; fn write_opt_scalar(&mut self, v: &Option) -> Result<()> where V: Marshal + StatBuffer { @@ -63,6 +64,13 @@ where T: std::io::Write self.write_all(&scratch[..ln])?; Ok(()) } + + fn write_binary(&mut self, text: impl AsRef<[u8]>) -> Result<()> { + let bytes = text.as_ref(); + self.write_uvarint(bytes.len() as u64)?; + self.write_all(bytes)?; + Ok(()) + } } // Another trait like BinaryWrite @@ -83,6 +91,7 @@ pub trait BinaryWriteBuf { } fn write_string(&mut self, text: impl AsRef) -> Result<()>; fn write_uvarint(&mut self, v: u64) -> Result<()>; + fn write_binary(&mut self, text: impl AsRef<[u8]>) -> Result<()>; } // We must ensure there are enough buffer to write because BytesMut do not implicitly grow the buffer. @@ -111,6 +120,13 @@ where T: BufMut self.put_slice(&scratch[..ln]); Ok(()) } + + fn write_binary(&mut self, text: impl AsRef<[u8]>) -> Result<()> { + let bytes = text.as_ref(); + self.write_uvarint(bytes.len() as u64)?; + self.put_slice(bytes); + Ok(()) + } } // put_uvarint encodes a uint64 into buf and returns the number of bytes written. 
diff --git a/common/planners/src/plan_builder.rs b/common/planners/src/plan_builder.rs index a1dac8453c587..837fa41419592 100644 --- a/common/planners/src/plan_builder.rs +++ b/common/planners/src/plan_builder.rs @@ -130,7 +130,7 @@ impl PlanBuilder { let mut partial_fields = fields .iter() - .map(|f| DataField::new(f.name(), DataType::Binary, false)) + .map(|f| DataField::new(f.name(), DataType::String, false)) .collect::>(); if !group_expr.is_empty() { diff --git a/common/planners/src/plan_explain.rs b/common/planners/src/plan_explain.rs index fc185c8fce08a..8822605f78fbf 100644 --- a/common/planners/src/plan_explain.rs +++ b/common/planners/src/plan_explain.rs @@ -36,7 +36,7 @@ pub struct ExplainPlan { impl ExplainPlan { pub fn schema(&self) -> DataSchemaRef { - DataSchemaRefExt::create(vec![DataField::new("explain", DataType::Utf8, false)]) + DataSchemaRefExt::create(vec![DataField::new("explain", DataType::String, false)]) } pub fn set_input(&mut self, node: &PlanNode) { diff --git a/common/planners/src/plan_expression.rs b/common/planners/src/plan_expression.rs index fa4d3477d46ef..560138098dea2 100644 --- a/common/planners/src/plan_expression.rs +++ b/common/planners/src/plan_expression.rs @@ -145,8 +145,11 @@ impl Expression { } => match column_name { Some(name) => name.clone(), None => { - if let DataValue::Utf8(Some(_)) = value { - format!("'{:?}'", value) + if let DataValue::String(Some(v)) = value { + match std::str::from_utf8(v) { + Ok(v) => format!("'{}'", v), + Err(_e) => format!("{:?}", value), + } } else { format!("{:?}", value) } diff --git a/common/planners/src/plan_expression_literal.rs b/common/planners/src/plan_expression_literal.rs index 7791b9ecdcb66..fcf0e6bb16510 100644 --- a/common/planners/src/plan_expression_literal.rs +++ b/common/planners/src/plan_expression_literal.rs @@ -20,15 +20,15 @@ pub trait Literal { fn to_literal(&self) -> Expression; } -impl Literal for &str { +impl Literal for &[u8] { fn to_literal(&self) -> Expression { - 
Expression::create_literal(DataValue::Utf8(Some(self.to_string()))) + Expression::create_literal(DataValue::String(Some(self.to_vec()))) } } -impl Literal for String { +impl Literal for Vec { fn to_literal(&self) -> Expression { - Expression::create_literal(DataValue::Utf8(Some(self.clone()))) + Expression::create_literal(DataValue::String(Some(self.clone()))) } } diff --git a/query/src/datasources/system/clusters_table.rs b/query/src/datasources/system/clusters_table.rs index 557f3373b8ed5..298caf10d881b 100644 --- a/query/src/datasources/system/clusters_table.rs +++ b/query/src/datasources/system/clusters_table.rs @@ -36,8 +36,8 @@ impl ClustersTable { pub fn create() -> Self { ClustersTable { schema: DataSchemaRefExt::create(vec![ - DataField::new("name", DataType::Utf8, false), - DataField::new("host", DataType::Utf8, false), + DataField::new("name", DataType::String, false), + DataField::new("host", DataType::String, false), DataField::new("port", DataType::UInt16, false), DataField::new("priority", DataType::UInt8, false), ]), @@ -96,12 +96,12 @@ impl Table for ClustersTable { _source_plan: &ReadDataSourcePlan, ) -> Result { let nodes = ctx.try_get_cluster()?.get_nodes()?; - let names: Vec<&str> = nodes.iter().map(|x| x.name.as_str()).collect(); + let names: Vec<&[u8]> = nodes.iter().map(|x| x.name.as_bytes()).collect(); let hosts = nodes .iter() .map(|x| x.address.hostname()) .collect::>(); - let hostnames = hosts.iter().map(|x| x.as_str()).collect::>(); + let hostnames = hosts.iter().map(|x| x.as_bytes()).collect::>(); let ports: Vec = nodes.iter().map(|x| x.address.port()).collect(); let priorities: Vec = nodes.iter().map(|x| x.priority).collect(); let block = DataBlock::create_by_array(self.schema.clone(), vec![ diff --git a/query/src/datasources/system/contributors_table.rs b/query/src/datasources/system/contributors_table.rs index 525c8ab8692ed..1cfcad9bb4cc5 100644 --- a/query/src/datasources/system/contributors_table.rs +++ 
b/query/src/datasources/system/contributors_table.rs @@ -35,7 +35,7 @@ pub struct ContributorsTable { impl ContributorsTable { pub fn create() -> Self { ContributorsTable { - schema: DataSchemaRefExt::create(vec![DataField::new("name", DataType::Utf8, false)]), + schema: DataSchemaRefExt::create(vec![DataField::new("name", DataType::String, false)]), } } } @@ -90,9 +90,9 @@ impl Table for ContributorsTable { _ctx: DatafuseQueryContextRef, _source_plan: &ReadDataSourcePlan, ) -> Result { - let contributors: Vec<&str> = env!("FUSE_COMMIT_AUTHORS") + let contributors: Vec<&[u8]> = env!("FUSE_COMMIT_AUTHORS") .split_terminator(',') - .map(|x| x.trim()) + .map(|x| x.trim().as_bytes()) .collect(); let block = DataBlock::create_by_array(self.schema.clone(), vec![Series::new(contributors)]); diff --git a/query/src/datasources/system/databases_table.rs b/query/src/datasources/system/databases_table.rs index c0efdfa4d2863..66f74e9798af6 100644 --- a/query/src/datasources/system/databases_table.rs +++ b/query/src/datasources/system/databases_table.rs @@ -36,7 +36,7 @@ pub struct DatabasesTable { impl DatabasesTable { pub fn create() -> Self { DatabasesTable { - schema: DataSchemaRefExt::create(vec![DataField::new("name", DataType::Utf8, false)]), + schema: DataSchemaRefExt::create(vec![DataField::new("name", DataType::String, false)]), } } } @@ -94,9 +94,9 @@ impl Table for DatabasesTable { ctx.get_catalog() .get_databases() .map(|databases_name| -> SendableDataBlockStream { - let databases_name_str: Vec<&str> = databases_name + let databases_name_str: Vec<&[u8]> = databases_name .iter() - .map(|database| database.name()) + .map(|database| database.name().as_bytes()) .collect(); let block = DataBlock::create_by_array(self.schema.clone(), vec![Series::new( diff --git a/query/src/datasources/system/functions_table.rs b/query/src/datasources/system/functions_table.rs index 919462d62946e..3b5f33f1a1864 100644 --- a/query/src/datasources/system/functions_table.rs +++ 
b/query/src/datasources/system/functions_table.rs @@ -38,7 +38,7 @@ impl FunctionsTable { pub fn create() -> Self { FunctionsTable { schema: DataSchemaRefExt::create(vec![ - DataField::new("name", DataType::Utf8, false), + DataField::new("name", DataType::String, false), DataField::new("is_aggregate", DataType::Boolean, false), ]), } @@ -98,10 +98,10 @@ impl Table for FunctionsTable { let func_names = FunctionFactory::registered_names(); let aggr_func_names = AggregateFunctionFactory::registered_names(); - let names: Vec<&str> = func_names + let names: Vec<&[u8]> = func_names .iter() .chain(aggr_func_names.iter()) - .map(|x| x.as_ref()) + .map(|x| x.as_bytes()) .collect(); let is_aggregate = (0..names.len()) diff --git a/query/src/datasources/system/processes_table.rs b/query/src/datasources/system/processes_table.rs index 39939613c3ab6..324688e2606d0 100644 --- a/query/src/datasources/system/processes_table.rs +++ b/query/src/datasources/system/processes_table.rs @@ -42,23 +42,26 @@ impl ProcessesTable { pub fn create() -> Self { ProcessesTable { schema: DataSchemaRefExt::create(vec![ - DataField::new("id", DataType::Utf8, false), - DataField::new("type", DataType::Utf8, false), - DataField::new("host", DataType::Utf8, true), - DataField::new("state", DataType::Utf8, false), - DataField::new("database", DataType::Utf8, false), - DataField::new("extra_info", DataType::Utf8, true), + DataField::new("id", DataType::String, false), + DataField::new("type", DataType::String, false), + DataField::new("host", DataType::String, true), + DataField::new("state", DataType::String, false), + DataField::new("database", DataType::String, false), + DataField::new("extra_info", DataType::String, true), ]), } } - fn process_host(process_info: &ProcessInfo) -> Option { + fn process_host(process_info: &ProcessInfo) -> Option> { let client_address = process_info.client_address; - client_address.as_ref().map(ToString::to_string) + client_address.as_ref().map(|s| 
s.to_string().into_bytes()) } - fn process_extra_info(process_info: &ProcessInfo) -> Option { - process_info.session_extra_info.clone() + fn process_extra_info(process_info: &ProcessInfo) -> Option> { + process_info + .session_extra_info + .clone() + .map(|s| s.into_bytes()) } } @@ -123,10 +126,10 @@ impl Table for ProcessesTable { let mut processes_extra_info = Vec::with_capacity(processes_info.len()); for process_info in &processes_info { - processes_id.push(process_info.id.clone()); - processes_type.push(process_info.typ.clone()); - processes_state.push(process_info.state.clone()); - processes_database.push(process_info.database.clone()); + processes_id.push(process_info.id.clone().into_bytes()); + processes_type.push(process_info.typ.clone().into_bytes()); + processes_state.push(process_info.state.clone().into_bytes()); + processes_database.push(process_info.database.clone().into_bytes()); processes_host.push(ProcessesTable::process_host(process_info)); processes_extra_info.push(ProcessesTable::process_extra_info(process_info)); } diff --git a/query/src/datasources/system/settings_table.rs b/query/src/datasources/system/settings_table.rs index 54a09a8f2e8aa..827319ee246af 100644 --- a/query/src/datasources/system/settings_table.rs +++ b/query/src/datasources/system/settings_table.rs @@ -36,10 +36,10 @@ impl SettingsTable { pub fn create() -> Self { SettingsTable { schema: DataSchemaRefExt::create(vec![ - DataField::new("name", DataType::Utf8, false), - DataField::new("value", DataType::Utf8, false), - DataField::new("default_value", DataType::Utf8, false), - DataField::new("description", DataType::Utf8, false), + DataField::new("name", DataType::String, false), + DataField::new("value", DataType::String, false), + DataField::new("default_value", DataType::String, false), + DataField::new("description", DataType::String, false), ]), } } @@ -110,10 +110,10 @@ impl Table for SettingsTable { } } - let names: Vec<&str> = names.iter().map(|x| x.as_str()).collect(); - 
let values: Vec<&str> = values.iter().map(|x| x.as_str()).collect(); - let default_values: Vec<&str> = default_values.iter().map(|x| x.as_str()).collect(); - let descs: Vec<&str> = descs.iter().map(|x| x.as_str()).collect(); + let names: Vec<&[u8]> = names.iter().map(|x| x.as_bytes()).collect(); + let values: Vec<&[u8]> = values.iter().map(|x| x.as_bytes()).collect(); + let default_values: Vec<&[u8]> = default_values.iter().map(|x| x.as_bytes()).collect(); + let descs: Vec<&[u8]> = descs.iter().map(|x| x.as_bytes()).collect(); let block = DataBlock::create_by_array(self.schema.clone(), vec![ Series::new(names), Series::new(values), diff --git a/query/src/datasources/system/tables_table.rs b/query/src/datasources/system/tables_table.rs index be909bffef809..dab7f26dcce62 100644 --- a/query/src/datasources/system/tables_table.rs +++ b/query/src/datasources/system/tables_table.rs @@ -37,9 +37,9 @@ impl TablesTable { pub fn create() -> Self { TablesTable { schema: DataSchemaRefExt::create(vec![ - DataField::new("database", DataType::Utf8, false), - DataField::new("name", DataType::Utf8, false), - DataField::new("engine", DataType::Utf8, false), + DataField::new("database", DataType::String, false), + DataField::new("name", DataType::String, false), + DataField::new("engine", DataType::String, false), ]), } } @@ -104,14 +104,14 @@ impl Table for TablesTable { } } - let databases: Vec<&str> = database_tables.iter().map(|(d, _)| d.as_str()).collect(); - let names: Vec<&str> = database_tables + let databases: Vec<&[u8]> = database_tables.iter().map(|(d, _)| d.as_bytes()).collect(); + let names: Vec<&[u8]> = database_tables .iter() - .map(|(_, v)| v.raw().name()) + .map(|(_, v)| v.raw().name().as_bytes()) .collect(); - let engines: Vec<&str> = database_tables + let engines: Vec<&[u8]> = database_tables .iter() - .map(|(_, v)| v.raw().engine()) + .map(|(_, v)| v.raw().engine().as_bytes()) .collect(); let block = DataBlock::create_by_array(self.schema.clone(), vec![ diff --git 
a/query/src/datasources/system/tracing_table.rs b/query/src/datasources/system/tracing_table.rs index 99e9ad24d24a4..8835df69ced0d 100644 --- a/query/src/datasources/system/tracing_table.rs +++ b/query/src/datasources/system/tracing_table.rs @@ -43,12 +43,12 @@ impl TracingTable { TracingTable { schema: DataSchemaRefExt::create(vec![ DataField::new("v", DataType::Int64, false), - DataField::new("name", DataType::Utf8, false), - DataField::new("msg", DataType::Utf8, false), + DataField::new("name", DataType::String, false), + DataField::new("msg", DataType::String, false), DataField::new("level", DataType::Int8, false), - DataField::new("hostname", DataType::Utf8, false), + DataField::new("hostname", DataType::String, false), DataField::new("pid", DataType::Int64, false), - DataField::new("time", DataType::Utf8, false), + DataField::new("time", DataType::String, false), ]), } } diff --git a/query/src/datasources/system/tracing_table_stream.rs b/query/src/datasources/system/tracing_table_stream.rs index 0599f5295e045..8211c60a6d6e6 100644 --- a/query/src/datasources/system/tracing_table_stream.rs +++ b/query/src/datasources/system/tracing_table_stream.rs @@ -89,10 +89,10 @@ impl TracingTableStream { self.limit_offset += 1; } - let names: Vec<&str> = name_col.iter().map(|x| x.as_str()).collect(); - let msgs: Vec<&str> = msg_col.iter().map(|x| x.as_str()).collect(); - let hosts: Vec<&str> = host_col.iter().map(|x| x.as_str()).collect(); - let times: Vec<&str> = time_col.iter().map(|x| x.as_str()).collect(); + let names: Vec<&[u8]> = name_col.iter().map(|x| x.as_bytes()).collect(); + let msgs: Vec<&[u8]> = msg_col.iter().map(|x| x.as_bytes()).collect(); + let hosts: Vec<&[u8]> = host_col.iter().map(|x| x.as_bytes()).collect(); + let times: Vec<&[u8]> = time_col.iter().map(|x| x.as_bytes()).collect(); let block = DataBlock::create_by_array(self.schema.clone(), vec![ Series::new(version_col), diff --git a/query/src/functions/context_function.rs 
b/query/src/functions/context_function.rs index 5b05aaa6fde3e..1271b66bd5f73 100644 --- a/query/src/functions/context_function.rs +++ b/query/src/functions/context_function.rs @@ -39,11 +39,11 @@ impl ContextFunction { } Ok(match name.to_lowercase().as_str() { - "database" => vec![Expression::create_literal(DataValue::Utf8(Some( - ctx.get_current_database(), + "database" => vec![Expression::create_literal(DataValue::String(Some( + ctx.get_current_database().into_bytes(), )))], - "version" => vec![Expression::create_literal(DataValue::Utf8(Some( - ctx.get_fuse_version(), + "version" => vec![Expression::create_literal(DataValue::String(Some( + ctx.get_fuse_version().into_bytes(), )))], _ => vec![], }) diff --git a/query/src/interpreters/interpreter_describe_table.rs b/query/src/interpreters/interpreter_describe_table.rs index 094491f5bd8ea..103fc931170b9 100644 --- a/query/src/interpreters/interpreter_describe_table.rs +++ b/query/src/interpreters/interpreter_describe_table.rs @@ -64,9 +64,9 @@ impl Interpreter for DescribeTableInterpreter { "NO".to_string() }); } - let names: Vec<&str> = names.iter().map(|x| x.as_str()).collect(); - let types: Vec<&str> = types.iter().map(|x| x.as_str()).collect(); - let nulls: Vec<&str> = nulls.iter().map(|x| x.as_str()).collect(); + let names: Vec<&[u8]> = names.iter().map(|x| x.as_bytes()).collect(); + let types: Vec<&[u8]> = types.iter().map(|x| x.as_bytes()).collect(); + let nulls: Vec<&[u8]> = nulls.iter().map(|x| x.as_bytes()).collect(); let desc_schema = self.plan.schema(); diff --git a/query/src/interpreters/interpreter_explain.rs b/query/src/interpreters/interpreter_explain.rs index 52bf63c116391..b2f64b26ae442 100644 --- a/query/src/interpreters/interpreter_explain.rs +++ b/query/src/interpreters/interpreter_explain.rs @@ -70,6 +70,7 @@ impl ExplainInterpreter { let formatted_plan = Series::new( format!("{}", plan.display_graphviz()) .lines() + .map(|s| s.as_bytes()) .collect::>(), ); Ok(DataBlock::create_by_array(schema, 
vec![formatted_plan])) @@ -78,7 +79,12 @@ impl ExplainInterpreter { fn explain_syntax(&self) -> Result { let schema = self.schema(); let plan = Optimizers::create(self.ctx.clone()).optimize(&self.explain.input)?; - let formatted_plan = Series::new(format!("{:?}", plan).lines().collect::>()); + let formatted_plan = Series::new( + format!("{:?}", plan) + .lines() + .map(|s| s.as_bytes()) + .collect::>(), + ); Ok(DataBlock::create_by_array(schema, vec![formatted_plan])) } @@ -87,7 +93,12 @@ impl ExplainInterpreter { let plan = Optimizers::without_scatters(self.ctx.clone()).optimize(&self.explain.input)?; let pipeline_builder = PipelineBuilder::create(self.ctx.clone()); let pipeline = pipeline_builder.build(&plan)?; - let formatted_pipeline = Series::new(format!("{:?}", pipeline).lines().collect::>()); + let formatted_pipeline = Series::new( + format!("{:?}", pipeline) + .lines() + .map(|s| s.as_bytes()) + .collect::>(), + ); Ok(DataBlock::create_by_array(schema, vec![formatted_pipeline])) } } diff --git a/query/src/interpreters/interpreter_setting.rs b/query/src/interpreters/interpreter_setting.rs index a49065f3206bc..0ba1b02da11db 100644 --- a/query/src/interpreters/interpreter_setting.rs +++ b/query/src/interpreters/interpreter_setting.rs @@ -61,7 +61,7 @@ impl Interpreter for SettingInterpreter { } } - let schema = DataSchemaRefExt::create(vec![DataField::new("set", DataType::Utf8, false)]); + let schema = DataSchemaRefExt::create(vec![DataField::new("set", DataType::String, false)]); Ok(Box::pin(DataBlockStream::create(schema, None, vec![]))) } } diff --git a/query/src/interpreters/interpreter_show_create_table.rs b/query/src/interpreters/interpreter_show_create_table.rs index cb0df4be3fcb9..452849a18cabc 100644 --- a/query/src/interpreters/interpreter_show_create_table.rs +++ b/query/src/interpreters/interpreter_show_create_table.rs @@ -70,14 +70,14 @@ impl Interpreter for ShowCreateTableInterpreter { table_info.push_str(table_engine.as_str()); let show_fields = 
vec![ - DataField::new("Table", DataType::Utf8, false), - DataField::new("Create Table", DataType::Utf8, false), + DataField::new("Table", DataType::String, false), + DataField::new("Create Table", DataType::String, false), ]; let show_schema = DataSchemaRefExt::create(show_fields); let block = DataBlock::create_by_array(show_schema.clone(), vec![ - Series::new(vec![name]), - Series::new(vec![table_info]), + Series::new(vec![name.as_bytes()]), + Series::new(vec![table_info.into_bytes()]), ]); debug!("Show create table executor result: {:?}", block); diff --git a/query/src/optimizers/optimizer_statistics_exact.rs b/query/src/optimizers/optimizer_statistics_exact.rs index 34fb36c7c7c16..abd9207a1549c 100644 --- a/query/src/optimizers/optimizer_statistics_exact.rs +++ b/query/src/optimizers/optimizer_statistics_exact.rs @@ -95,7 +95,7 @@ impl PlanRewriter for StatisticsExactImpl<'_> { })?; let mut body: Vec = Vec::new(); body.write_uvarint(read_source_plan.statistics.read_rows as u64)?; - let expr = Expression::create_literal(DataValue::Binary(Some(body))); + let expr = Expression::create_literal(DataValue::String(Some(body))); PlanBuilder::from(&dummy_read_plan) .expression(&[expr.clone()], "Exact Statistics")? .project(&[expr.alias("count(0)")])? 
diff --git a/query/src/optimizers/optimizer_statistics_exact_test.rs b/query/src/optimizers/optimizer_statistics_exact_test.rs index 7e61a2c3d6395..b5a62d2ca5a68 100644 --- a/query/src/optimizers/optimizer_statistics_exact_test.rs +++ b/query/src/optimizers/optimizer_statistics_exact_test.rs @@ -78,8 +78,8 @@ mod tests { let expect = "\ Projection: count(0):UInt64\ \n AggregatorFinal: groupBy=[[]], aggr=[[count(0)]]\ - \n Projection: 904e as count(0):Binary\ - \n Expression: 904e:Binary (Exact Statistics)\ + \n Projection: 904e as count(0):String\ + \n Expression: 904e:String (Exact Statistics)\ \n ReadDataSource: scan partitions: [1], scan schema: [dummy:UInt8], statistics: [read_rows: 1, read_bytes: 1]"; let actual = format!("{:?}", optimized); assert_eq!(expect, actual); diff --git a/query/src/pipelines/transforms/group_by/aggregator.rs b/query/src/pipelines/transforms/group_by/aggregator.rs index 6130e296e4559..389e61cab5696 100644 --- a/query/src/pipelines/transforms/group_by/aggregator.rs +++ b/query/src/pipelines/transforms/group_by/aggregator.rs @@ -14,7 +14,7 @@ use common_datablocks::DataBlock; use common_datablocks::HashMethod; -use common_datavalues::arrays::BinaryArrayBuilder; +use common_datavalues::arrays::StringArrayBuilder; use common_datavalues::columns::DataColumn; use common_datavalues::prelude::IntoSeries; use common_datavalues::prelude::Series; @@ -190,8 +190,8 @@ impl> Aggregator { let offsets_aggregate_states = &aggregator_params.offsets_aggregate_states; // Builders. 
- let mut state_builders: Vec = (0..aggr_len) - .map(|_| BinaryArrayBuilder::with_capacity(groups.len() * 4)) + let mut state_builders: Vec = (0..aggr_len) + .map(|_| StringArrayBuilder::with_capacity(groups.len() * 4)) .collect(); let mut group_key_builder = self.method.state_array_builder(groups.len()); diff --git a/query/src/pipelines/transforms/group_by/aggregator_keys_builder.rs b/query/src/pipelines/transforms/group_by/aggregator_keys_builder.rs index 5e9abce3091ac..749568e73c09a 100644 --- a/query/src/pipelines/transforms/group_by/aggregator_keys_builder.rs +++ b/query/src/pipelines/transforms/group_by/aggregator_keys_builder.rs @@ -15,8 +15,8 @@ use common_datablocks::HashMethod; use common_datablocks::HashMethodFixedKeys; use common_datavalues::arrays::ArrayBuilder; -use common_datavalues::arrays::BinaryArrayBuilder; use common_datavalues::arrays::PrimitiveArrayBuilder; +use common_datavalues::arrays::StringArrayBuilder; use common_datavalues::prelude::*; use common_datavalues::DFPrimitiveType; @@ -52,7 +52,7 @@ where } pub struct SerializedKeysArrayBuilder { - pub inner_builder: BinaryArrayBuilder, + pub inner_builder: StringArrayBuilder, } impl KeysArrayBuilder for SerializedKeysArrayBuilder { diff --git a/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs b/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs index 81b4aba98a9c3..1dd50bf8377ca 100644 --- a/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs +++ b/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs @@ -19,8 +19,9 @@ use common_datablocks::HashMethodKeysU32; use common_datablocks::HashMethodKeysU64; use common_datablocks::HashMethodKeysU8; use common_datablocks::HashMethodSerializer; -use common_datavalues::arrays::BinaryArrayBuilder; use common_datavalues::arrays::PrimitiveArrayBuilder; +use common_datavalues::arrays::StringArrayBuilder; + use crate::common::HashTable; use 
crate::pipelines::transforms::group_by::aggregator_keys_builder::FixedKeysArrayBuilder; @@ -44,7 +45,7 @@ use crate::pipelines::transforms::group_by::AggregatorState; // use bumpalo::Bump; // use datafuse_query::common::HashTable; // use common_datablocks::HashMethodSerializer; -// use common_datavalues::arrays::BinaryArrayBuilder; +// use common_datavalues::arrays::StringArrayBuilder; // use datafuse_query::pipelines::transforms::group_by::PolymorphicKeysHelper; // use datafuse_query::pipelines::transforms::group_by::aggregator_state::SerializedKeysAggregatorState; // use datafuse_query::pipelines::transforms::group_by::aggregator_keys_builder::SerializedKeysArrayBuilder; @@ -62,7 +63,7 @@ use crate::pipelines::transforms::group_by::AggregatorState; // type ArrayBuilder = SerializedKeysArrayBuilder; // fn state_array_builder(&self, capacity: usize) -> Self::ArrayBuilder { // SerializedKeysArrayBuilder { -// inner_builder: BinaryArrayBuilder::with_capacity(capacity), +// inner_builder: StringArrayBuilder::with_capacity(capacity), // } // } // } @@ -150,7 +151,7 @@ impl PolymorphicKeysHelper for HashMethodSerializer { type ArrayBuilder = SerializedKeysArrayBuilder; fn state_array_builder(&self, capacity: usize) -> Self::ArrayBuilder { SerializedKeysArrayBuilder { - inner_builder: BinaryArrayBuilder::with_capacity(capacity), + inner_builder: StringArrayBuilder::with_capacity(capacity), } } } diff --git a/query/src/pipelines/transforms/transform_aggregator_final.rs b/query/src/pipelines/transforms/transform_aggregator_final.rs index 08de9250fa7da..6ee5f2192b9dc 100644 --- a/query/src/pipelines/transforms/transform_aggregator_final.rs +++ b/query/src/pipelines/transforms/transform_aggregator_final.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use std::time::Instant; use common_datablocks::DataBlock; -use common_datavalues::prelude::DFBinaryArray; +use common_datavalues::prelude::DFStringArray; use common_datavalues::DataSchemaRef; use common_exception::Result; use 
common_functions::aggregates::get_layout_offsets; @@ -104,7 +104,7 @@ impl Processor for AggregatorFinalTransform { let place = places[idx].into(); let binary_array = block.column(idx).to_array()?; - let binary_array: &DFBinaryArray = binary_array.binary()?; + let binary_array: &DFStringArray = binary_array.string()?; let array = binary_array.inner(); let mut data = array.value(0); diff --git a/query/src/pipelines/transforms/transform_aggregator_partial.rs b/query/src/pipelines/transforms/transform_aggregator_partial.rs index 9a3f3bb8eb56d..e5cdcf75b6dac 100644 --- a/query/src/pipelines/transforms/transform_aggregator_partial.rs +++ b/query/src/pipelines/transforms/transform_aggregator_partial.rs @@ -18,7 +18,7 @@ use std::time::Instant; use bumpalo::Bump; use common_datablocks::DataBlock; -use common_datavalues::arrays::BinaryArrayBuilder; +use common_datavalues::arrays::StringArrayBuilder; use common_datavalues::prelude::*; use common_exception::Result; use common_functions::aggregates::get_layout_offsets; @@ -133,7 +133,7 @@ impl Processor for AggregatorPartialTransform { for (idx, func) in funcs.iter().enumerate() { let place = places[idx].into(); func.serialize(place, &mut bytes)?; - let mut array_builder = BinaryArrayBuilder::with_capacity(4); + let mut array_builder = StringArrayBuilder::with_capacity(4); array_builder.append_value(&bytes[..]); bytes.clear(); let array = array_builder.finish(); diff --git a/query/src/pipelines/transforms/transform_group_by_final.rs b/query/src/pipelines/transforms/transform_group_by_final.rs index a918fe6eecff0..538e3d7203ee8 100644 --- a/query/src/pipelines/transforms/transform_group_by_final.rs +++ b/query/src/pipelines/transforms/transform_group_by_final.rs @@ -131,7 +131,7 @@ impl Processor for GroupByFinalTransform { let mut states_binary_arrays = Vec::with_capacity(states_series.len()); for agg in states_series.iter().take(aggr_funcs_len) { - let aggr_array: &DFBinaryArray = agg.binary()?; + let aggr_array: 
&DFStringArray = agg.string()?; let aggr_array = aggr_array.inner(); states_binary_arrays.push(aggr_array); } @@ -230,19 +230,19 @@ impl Processor for GroupByFinalTransform { ($method: ident, $apply: ident) => {{ match $method { HashMethodKind::Serializer(hash_method) => { - apply! { hash_method, &DFBinaryArray, binary, RwLock, usize, ahash::RandomState>>} + apply! { hash_method, &DFStringArray, string, RwLock, usize, ahash::RandomState>>} } HashMethodKind::KeysU8(hash_method) => { - apply! { hash_method , &DFUInt8Array, u8, RwLock> } + apply! { hash_method , &DFUInt8Array, u8, RwLock> } } HashMethodKind::KeysU16(hash_method) => { - apply! { hash_method , &DFUInt16Array, u16, RwLock> } + apply! { hash_method , &DFUInt16Array, u16, RwLock> } } HashMethodKind::KeysU32(hash_method) => { - apply! { hash_method , &DFUInt32Array, u32, RwLock> } + apply! { hash_method , &DFUInt32Array, u32, RwLock> } } HashMethodKind::KeysU64(hash_method) => { - apply! { hash_method , &DFUInt64Array, u64, RwLock> } + apply! { hash_method , &DFUInt64Array, u64, RwLock> } } } }}; diff --git a/query/src/servers/clickhouse/writers/query_writer.rs b/query/src/servers/clickhouse/writers/query_writer.rs index c2d4ee40dba93..16763ead8e06c 100644 --- a/query/src/servers/clickhouse/writers/query_writer.rs +++ b/query/src/servers/clickhouse/writers/query_writer.rs @@ -209,7 +209,7 @@ pub fn to_clickhouse_block(block: DataBlock) -> Result { DataType::UInt64 => result.column(name, column.u64()?.collect_values()), DataType::Float32 => result.column(name, column.f32()?.collect_values()), DataType::Float64 => result.column(name, column.f64()?.collect_values()), - DataType::Utf8 => result.column(name, column.utf8()?.collect_values()), + DataType::String => result.column(name, column.string()?.collect_values()), DataType::Boolean => { let v: Vec> = column .bool()? 
@@ -287,8 +287,8 @@ pub fn to_clickhouse_block(block: DataBlock) -> Result { DataType::Float64 => { result.column(name, column.f64()?.inner().values().as_slice().to_vec()) } - DataType::Utf8 => { - let vs: Vec<&str> = column.utf8()?.into_no_null_iter().collect(); + DataType::String => { + let vs: Vec<&[u8]> = column.string()?.into_no_null_iter().collect(); result.column(name, vs) } DataType::Boolean => { @@ -345,14 +345,10 @@ pub fn from_clickhouse_block(schema: DataSchemaRef, block: Block) -> Result { Ok(DFFloat64Array::new_from_iter(col.iter::()?.copied()).into_series()) } - SqlType::String => Ok(DFUtf8Array::new_from_iter( - col.iter::<&[u8]>()?.map(|c| String::from_utf8_lossy(c)), - ) - .into_series()), - SqlType::FixedString(_) => Ok(DFUtf8Array::new_from_iter( - col.iter::<&[u8]>()?.map(|c| String::from_utf8_lossy(c)), - ) - .into_series()), + SqlType::String => Ok(DFStringArray::new_from_iter(col.iter::<&[u8]>()?).into_series()), + SqlType::FixedString(_) => { + Ok(DFStringArray::new_from_iter(col.iter::<&[u8]>()?).into_series()) + } SqlType::Nullable(SqlType::UInt8) => Ok(DFUInt8Array::new_from_opt_iter( col.iter::>()?.map(|c| c.copied()), @@ -395,16 +391,12 @@ pub fn from_clickhouse_block(schema: DataSchemaRef, block: Block) -> Result>()?.map(|c| c.copied()), ) .into_series()), - SqlType::Nullable(SqlType::String) => Ok(DFUtf8Array::new_from_opt_iter( - col.iter::>()? - .map(|c| c.map(|d| String::from_utf8_lossy(d))), - ) - .into_series()), - SqlType::Nullable(SqlType::FixedString(_)) => Ok(DFUtf8Array::new_from_opt_iter( - col.iter::>()? 
- .map(|c| c.map(|d| String::from_utf8_lossy(d))), - ) - .into_series()), + SqlType::Nullable(SqlType::String) => { + Ok(DFStringArray::new_from_opt_iter(col.iter::>()?).into_series()) + } + SqlType::Nullable(SqlType::FixedString(_)) => { + Ok(DFStringArray::new_from_opt_iter(col.iter::>()?).into_series()) + } other => Err(CHError::Other(Cow::from(format!( "Unsupported type: {:?}", diff --git a/query/src/servers/mysql/writers/query_result_writer.rs b/query/src/servers/mysql/writers/query_result_writer.rs index f3b8b6c7981f0..697e0ff0af6fa 100644 --- a/query/src/servers/mysql/writers/query_result_writer.rs +++ b/query/src/servers/mysql/writers/query_result_writer.rs @@ -60,8 +60,7 @@ impl<'a, W: std::io::Write> DFQueryResultWriter<'a, W> { DataType::UInt64 => Ok(ColumnType::MYSQL_TYPE_LONG), DataType::Float32 => Ok(ColumnType::MYSQL_TYPE_FLOAT), DataType::Float64 => Ok(ColumnType::MYSQL_TYPE_FLOAT), - DataType::Utf8 => Ok(ColumnType::MYSQL_TYPE_VARCHAR), - DataType::Binary => Ok(ColumnType::MYSQL_TYPE_VARCHAR), + DataType::String => Ok(ColumnType::MYSQL_TYPE_VARCHAR), DataType::Boolean => Ok(ColumnType::MYSQL_TYPE_SHORT), DataType::Date16 | DataType::Date32 => Ok(ColumnType::MYSQL_TYPE_DATE), DataType::DateTime32 => Ok(ColumnType::MYSQL_TYPE_DATETIME), diff --git a/query/src/sessions/settings.rs b/query/src/sessions/settings.rs index 9bb78122ce4cd..53e5d00e54abf 100644 --- a/query/src/sessions/settings.rs +++ b/query/src/sessions/settings.rs @@ -27,11 +27,11 @@ pub struct Settings { impl Settings { apply_macros! { apply_getter_setter_settings, apply_initial_settings, apply_update_settings, - ("max_block_size", u64, 10000, "Maximum block size for reading".to_string()), - ("max_threads", u64, 16, "The maximum number of threads to execute the request. By default, it is determined automatically.".to_string()), - ("flight_client_timeout", u64, 60, "Max duration the flight client request is allowed to take in seconds. 
By default, it is 60 seconds".to_string()), - ("min_distributed_rows", u64, 100000000, "Minimum distributed read rows. In cluster mode, when read rows exceeds this value, the local table converted to distributed query.".to_string()), - ("min_distributed_bytes", u64, 500 * 1024 * 1024, "Minimum distributed read bytes. In cluster mode, when read bytes exceeds this value, the local table converted to distributed query.".to_string()) + ("max_block_size", u64, 10000, "Maximum block size for reading".as_bytes().to_vec()), + ("max_threads", u64, 16, "The maximum number of threads to execute the request. By default, it is determined automatically.".as_bytes().to_vec()), + ("flight_client_timeout", u64, 60, "Max duration the flight client request is allowed to take in seconds. By default, it is 60 seconds".as_bytes().to_vec()), + ("min_distributed_rows", u64, 100000000, "Minimum distributed read rows. In cluster mode, when read rows exceeds this value, the local table converted to distributed query.".as_bytes().to_vec()), + ("min_distributed_bytes", u64, 500 * 1024 * 1024, "Minimum distributed read bytes. 
In cluster mode, when read bytes exceeds this value, the local table converted to distributed query.".as_bytes().to_vec()) } pub fn try_create() -> Result> { @@ -68,12 +68,12 @@ impl SettingsBase { // TODO, to use macro generate this codes #[allow(unused)] - pub fn try_set_u64(&self, key: &'static str, val: u64, desc: String) -> Result<()> { + pub fn try_set_u64(&self, key: &'static str, val: u64, desc: Vec) -> Result<()> { let mut settings = self.settings.write(); let setting_val = DataValue::Struct(vec![ DataValue::UInt64(Some(val)), DataValue::UInt64(Some(val)), - DataValue::Utf8(Some(desc)), + DataValue::String(Some(desc)), ]); settings.insert(key, setting_val); Ok(()) @@ -117,12 +117,12 @@ impl SettingsBase { } #[allow(unused)] - pub fn try_set_i64(&self, key: &'static str, val: i64, desc: String) -> Result<()> { + pub fn try_set_i64(&self, key: &'static str, val: i64, desc: Vec) -> Result<()> { let mut settings = self.settings.write(); let setting_val = DataValue::Struct(vec![ DataValue::Int64(Some(val)), DataValue::Int64(Some(val)), - DataValue::Utf8(Some(desc)), + DataValue::String(Some(desc)), ]); settings.insert(key, setting_val); Ok(()) @@ -166,12 +166,12 @@ impl SettingsBase { } #[allow(unused)] - pub fn try_set_f64(&self, key: &'static str, val: f64, desc: String) -> Result<()> { + pub fn try_set_f64(&self, key: &'static str, val: f64, desc: Vec) -> Result<()> { let mut settings = self.settings.write(); let setting_val = DataValue::Struct(vec![ DataValue::Float64(Some(val)), DataValue::Float64(Some(val)), - DataValue::Utf8(Some(desc)), + DataValue::String(Some(desc)), ]); settings.insert(key, setting_val); Ok(()) @@ -215,20 +215,20 @@ impl SettingsBase { } #[allow(unused)] - pub fn try_set_string(&self, key: &'static str, val: String, desc: String) -> Result<()> { + pub fn try_set_string(&self, key: &'static str, val: Vec, desc: Vec) -> Result<()> { let mut settings = self.settings.write(); let default_value = val.clone(); let setting_val = 
DataValue::Struct(vec![ - DataValue::Utf8(Some(val)), - DataValue::Utf8(Some(default_value)), - DataValue::Utf8(Some(desc)), + DataValue::String(Some(val)), + DataValue::String(Some(default_value)), + DataValue::String(Some(desc)), ]); settings.insert(key, setting_val); Ok(()) } #[allow(unused)] - pub fn try_update_string(&self, key: &'static str, val: String) -> Result<()> { + pub fn try_update_string(&self, key: &'static str, val: Vec) -> Result<()> { let mut settings = self.settings.write(); let setting_val = settings .get(key) @@ -236,7 +236,7 @@ impl SettingsBase { if let DataValue::Struct(values) = setting_val { let v = DataValue::Struct(vec![ - DataValue::Utf8(Some(val)), + DataValue::String(Some(val)), values[1].clone(), values[2].clone(), ]); @@ -246,14 +246,14 @@ impl SettingsBase { } #[allow(unused)] - pub fn try_get_string(&self, key: &str) -> Result { + pub fn try_get_string(&self, key: &str) -> Result> { let settings = self.settings.read(); let setting_val = settings .get(key) .ok_or_else(|| ErrorCode::UnknownVariable(format!("Unknown variable: {:?}", key)))?; if let DataValue::Struct(values) = setting_val { - if let DataValue::Utf8(Some(result)) = values[0].clone() { + if let DataValue::String(Some(result)) = values[0].clone() { return Ok(result); } } @@ -271,7 +271,7 @@ impl SettingsBase { for (k, v) in settings.iter() { if let DataValue::Struct(values) = v { let res = DataValue::Struct(vec![ - DataValue::Utf8(Some(k.to_string())), + DataValue::String(Some(k.as_bytes().to_vec())), values[0].clone(), values[1].clone(), values[2].clone(), diff --git a/query/src/sql/plan_parser.rs b/query/src/sql/plan_parser.rs index 4d81b5683db88..ed12c86d742d6 100644 --- a/query/src/sql/plan_parser.rs +++ b/query/src/sql/plan_parser.rs @@ -342,8 +342,8 @@ impl PlanParser { } let fields = vec![ - DataField::new("Table", DataType::Utf8, false), - DataField::new("Create Table", DataType::Utf8, false), + DataField::new("Table", DataType::String, false), + 
DataField::new("Create Table", DataType::String, false), ]; let schema = DataSchemaRefExt::create(fields); @@ -368,9 +368,9 @@ impl PlanParser { } let schema = DataSchemaRefExt::create(vec![ - DataField::new("Field", DataType::Utf8, false), - DataField::new("Type", DataType::Utf8, false), - DataField::new("Null", DataType::Utf8, false), + DataField::new("Field", DataType::String, false), + DataField::new("Type", DataType::String, false), + DataField::new("Null", DataType::String, false), ]); Ok(PlanNode::DescribeTable(DescribeTablePlan { @@ -895,7 +895,7 @@ impl PlanParser { DataValue::try_from_literal(n).map(Expression::create_literal) } sqlparser::ast::Value::SingleQuotedString(ref value) => Ok(Expression::create_literal( - DataValue::Utf8(Some(value.clone())), + DataValue::String(Some(value.clone().into_bytes())), )), sqlparser::ast::Value::Boolean(b) => { Ok(Expression::create_literal(DataValue::Boolean(Some(*b)))) @@ -1005,8 +1005,8 @@ impl PlanParser { sqlparser::ast::Expr::Wildcard => Ok(Expression::Wildcard), sqlparser::ast::Expr::TypedString { data_type, value } => { SQLCommon::make_data_type(data_type).map(|data_type| Expression::Cast { - expr: Box::new(Expression::create_literal(DataValue::Utf8(Some( - value.clone(), + expr: Box::new(Expression::create_literal(DataValue::String(Some( + value.clone().into_bytes(), )))), data_type, }) diff --git a/query/src/sql/sql_common.rs b/query/src/sql/sql_common.rs index 3b90bf8bc163c..337821202c5c4 100644 --- a/query/src/sql/sql_common.rs +++ b/query/src/sql/sql_common.rs @@ -27,10 +27,10 @@ impl SQLCommon { SQLDataType::Int => Ok(DataType::Int32), SQLDataType::TinyInt => Ok(DataType::Int8), SQLDataType::SmallInt => Ok(DataType::Int16), - SQLDataType::Char(_) => Ok(DataType::Utf8), - SQLDataType::Varchar(_) => Ok(DataType::Utf8), - SQLDataType::String => Ok(DataType::Utf8), - SQLDataType::Text => Ok(DataType::Utf8), + SQLDataType::Char(_) => Ok(DataType::String), + SQLDataType::Varchar(_) => Ok(DataType::String), + 
SQLDataType::String => Ok(DataType::String), + SQLDataType::Text => Ok(DataType::String), SQLDataType::Decimal(_, _) => Ok(DataType::Float64), SQLDataType::Float(_) => Ok(DataType::Float32), SQLDataType::Real | SQLDataType::Double => Ok(DataType::Float64), @@ -53,7 +53,7 @@ impl SQLCommon { "INT64" => Ok(DataType::Int64), "FLOAT32" => Ok(DataType::Float32), "FLOAT64" => Ok(DataType::Float64), - "STRING" => Ok(DataType::Utf8), + "STRING" => Ok(DataType::String), _ => Result::Err(ErrorCode::IllegalDataType(format!( "The SQL data type {:?} is not implemented", diff --git a/query/src/tests/parquet.rs b/query/src/tests/parquet.rs index e02d74ae1a4fa..14223a877b94b 100644 --- a/query/src/tests/parquet.rs +++ b/query/src/tests/parquet.rs @@ -37,7 +37,7 @@ impl ParquetTestData { pub fn write_parquet(&self, path: &str) { let schema = DataSchemaRefExt::create(vec![ - DataField::new("name", DataType::Utf8, true), + DataField::new("name", DataType::String, true), DataField::new("age", DataType::Int32, false), ]); diff --git a/tests/suites/0_stateless/02_0001_function_to_type_name.result b/tests/suites/0_stateless/02_0001_function_to_type_name.result index 287c02e62c5d5..7f6ee0e24d27c 100644 --- a/tests/suites/0_stateless/02_0001_function_to_type_name.result +++ b/tests/suites/0_stateless/02_0001_function_to_type_name.result @@ -1,6 +1,6 @@ UInt64 UInt64 Int64 Float64 UInt64 -Utf8 Utf8 +String String === TEST_numeric_coercion UInt8 OP UInt8 UInt16 Int16 UInt16 Float64 UInt8 OP UInt16 UInt32 Int32 UInt32 Float64 diff --git a/tests/suites/0_stateless/08_0000_optimizer.result b/tests/suites/0_stateless/08_0000_optimizer.result index 11a85a5063124..95db3e646d246 100644 --- a/tests/suites/0_stateless/08_0000_optimizer.result +++ b/tests/suites/0_stateless/08_0000_optimizer.result @@ -6,6 +6,6 @@ Projection: max((number + 1)) as c1:UInt64, ((number % 3) + 1) as c2:UInt16 Expression: ((number % 3) + 1):UInt16, (number + 1):UInt64 (Before GroupBy) ReadDataSource: scan partitions: [16], 
scan schema: [number:UInt64], statistics: [read_rows: 10000, read_bytes: 80000] projection push down: push (name and value) to read datasource -Projection: name:Utf8 +Projection: name:String Filter: (value > 10) - ReadDataSource: scan partitions: [1], scan schema: [name:Utf8, value:Utf8], statistics: [read_rows: 0, read_bytes: 0] + ReadDataSource: scan partitions: [1], scan schema: [name:String, value:String], statistics: [read_rows: 0, read_bytes: 0] diff --git a/tests/suites/0_stateless/10_0000_describe_table.result b/tests/suites/0_stateless/10_0000_describe_table.result index da4c1bf041432..711abc6a1e464 100644 --- a/tests/suites/0_stateless/10_0000_describe_table.result +++ b/tests/suites/0_stateless/10_0000_describe_table.result @@ -1,10 +1,10 @@ a Int64 NO b Int32 NO -c Utf8 NO +c String NO d Int16 NO e Date16 NO a Int64 NO b Int32 NO -c Utf8 NO +c String NO d Int16 NO e Date16 NO From f115bb7a1d2324bc63a01006a1af79e7ddf85aac Mon Sep 17 00:00:00 2001 From: zhyass <34016424+zhyass@users.noreply.github.com> Date: Thu, 2 Sep 2021 14:20:46 +0800 Subject: [PATCH 2/4] Update the arrow2 --- Cargo.lock | 4 ++-- common/arrow/Cargo.toml | 4 ++-- common/datavalues/src/arrays/boolean/mod.rs | 2 ++ common/datavalues/src/arrays/ops/group_hash.rs | 2 +- common/datavalues/src/data_array_filter.rs | 3 ++- common/datavalues/src/data_value.rs | 6 ++++-- common/streams/src/stream_limit_by.rs | 3 ++- .../transforms/group_by/aggregator_polymorphic_keys.rs | 1 - 8 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 01599768032dc..e3d254ebd23ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,7 +111,7 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "arrow-flight" version = "0.1.0" -source = "git+https://github.com/zhyass/arrow2?rev=05f2b5c#05f2b5c129c7f83cade7f371f50c87658613135e" +source = "git+https://github.com/zhyass/arrow2?rev=2e3de5a#2e3de5a753dcadc31195de4d62e039c13e1e69d0" 
dependencies = [ "arrow2", "bytes", @@ -125,7 +125,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.4.0" -source = "git+https://github.com/zhyass/arrow2?rev=05f2b5c#05f2b5c129c7f83cade7f371f50c87658613135e" +source = "git+https://github.com/zhyass/arrow2?rev=2e3de5a#2e3de5a753dcadc31195de4d62e039c13e1e69d0" dependencies = [ "ahash 0.7.4", "base64", diff --git a/common/arrow/Cargo.toml b/common/arrow/Cargo.toml index 3801c5efadc1e..fde0270659039 100644 --- a/common/arrow/Cargo.toml +++ b/common/arrow/Cargo.toml @@ -15,8 +15,8 @@ simd = ["arrow/simd"] # Workspace dependencies # Github dependencies -arrow = { package = "arrow2", git="https://github.com/zhyass/arrow2", rev = "05f2b5c" } -arrow-flight = { git="https://github.com/zhyass/arrow2", rev = "05f2b5c" } +arrow = { package = "arrow2", git="https://github.com/zhyass/arrow2", rev = "2e3de5a" } +arrow-flight = { git="https://github.com/zhyass/arrow2", rev = "2e3de5a" } parquet = {package = "parquet2", git = "https://github.com/datafuse-extras/parquet2", rev = "d28330f"} # Crates.io dependencies diff --git a/common/datavalues/src/arrays/boolean/mod.rs b/common/datavalues/src/arrays/boolean/mod.rs index 3e8004087ad3f..0bb764ffcd864 100644 --- a/common/datavalues/src/arrays/boolean/mod.rs +++ b/common/datavalues/src/arrays/boolean/mod.rs @@ -18,6 +18,7 @@ use common_arrow::arrow::array::*; use common_arrow::arrow::bitmap::Bitmap; use common_arrow::arrow::bitmap::MutableBitmap; use common_arrow::arrow::compute::aggregate; +use common_arrow::arrow::datatypes::DataType as ArrowType; use common_exception::ErrorCode; use common_exception::Result; @@ -160,6 +161,7 @@ pub unsafe fn take_bool_iter_unchecked>( 0 => { let iter = indices.into_iter().map(|idx| arr.value(idx)); BooleanArray::from_data( + ArrowType::Boolean, MutableBitmap::from_trusted_len_iter_unchecked(iter).into(), None, ) diff --git a/common/datavalues/src/arrays/ops/group_hash.rs b/common/datavalues/src/arrays/ops/group_hash.rs index 
e2c83527e3d74..b39bf57a13a5a 100644 --- a/common/datavalues/src/arrays/ops/group_hash.rs +++ b/common/datavalues/src/arrays/ops/group_hash.rs @@ -102,7 +102,7 @@ impl GroupHash for DFStringArray { fn serialize(&self, vec: &mut Vec>) -> Result<()> { assert_eq!(vec.len(), self.len()); for (value, vec) in self.into_no_null_iter().zip(vec.iter_mut()) { - BinaryWrite::write_binary(vec, &value)?; + BinaryWrite::write_binary(vec, value)?; } Ok(()) } diff --git a/common/datavalues/src/data_array_filter.rs b/common/datavalues/src/data_array_filter.rs index b8846d4b88d9f..f3b675aa53890 100644 --- a/common/datavalues/src/data_array_filter.rs +++ b/common/datavalues/src/data_array_filter.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use common_arrow::arrow::array::*; use common_arrow::arrow::compute::filter::build_filter; +use common_arrow::arrow::datatypes::DataType as ArrowType; use common_exception::Result; use crate::prelude::*; @@ -55,7 +56,7 @@ impl DataArrayFilter { let mask = array.values(); if let Some(v) = array.validity() { let mask = mask.bitand(v); - return DFBooleanArray::new(BooleanArray::from_data(mask, None)); + return DFBooleanArray::new(BooleanArray::from_data(ArrowType::Boolean, mask, None)); } filter.clone() } diff --git a/common/datavalues/src/data_value.rs b/common/datavalues/src/data_value.rs index 85c6e71de9b31..70a6afb922dfe 100644 --- a/common/datavalues/src/data_value.rs +++ b/common/datavalues/src/data_value.rs @@ -20,6 +20,7 @@ use std::ops::Deref; use std::sync::Arc; use common_arrow::arrow::array::*; +use common_arrow::arrow::datatypes::DataType as ArrowType; use common_arrow::arrow::datatypes::Field as ArrowField; use common_exception::ErrorCode; use common_exception::Result; @@ -119,7 +120,7 @@ impl DataValue { pub fn to_series_with_size(&self, size: usize) -> Result { match self { DataValue::Null => { - let array = NullArray::new_null(size); + let array = NullArray::new_null(ArrowType::Null, size); let array: DFNullArray = array.into(); 
Ok(array.into_series()) } @@ -202,7 +203,8 @@ impl DataValue { arrays.push(val_array); } - let r: DFStructArray = StructArray::from_data(fields, arrays, None).into(); + let r: DFStructArray = + StructArray::from_data(ArrowType::Struct(fields), arrays, None).into(); Ok(r.into_series()) } } diff --git a/common/streams/src/stream_limit_by.rs b/common/streams/src/stream_limit_by.rs index 490d6cee46109..924c3d0f0978e 100644 --- a/common/streams/src/stream_limit_by.rs +++ b/common/streams/src/stream_limit_by.rs @@ -19,6 +19,7 @@ use std::task::Poll; use common_arrow::arrow; use common_arrow::arrow::array::BooleanArray; +use common_arrow::arrow::datatypes::DataType as ArrowType; use common_arrow::arrow::bitmap::MutableBitmap; use common_datablocks::DataBlock; use common_datablocks::HashMethod; @@ -70,7 +71,7 @@ impl LimitByStream { } } - let array = BooleanArray::from_data(filter.into(), None); + let array = BooleanArray::from_data(ArrowType::Boolean,filter.into(), None); let batch = block.clone().try_into()?; let batch = arrow::compute::filter::filter_record_batch(&batch, &array)?; Some(batch.try_into()).transpose() diff --git a/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs b/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs index 1dd50bf8377ca..790e30f1d673a 100644 --- a/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs +++ b/query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs @@ -22,7 +22,6 @@ use common_datablocks::HashMethodSerializer; use common_datavalues::arrays::PrimitiveArrayBuilder; use common_datavalues::arrays::StringArrayBuilder; - use crate::common::HashTable; use crate::pipelines::transforms::group_by::aggregator_keys_builder::FixedKeysArrayBuilder; use crate::pipelines::transforms::group_by::aggregator_keys_builder::KeysArrayBuilder; From 4cccbafbcd7a2427e01a74c6ac7f98861b7010e4 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Thu, 2 Sep 2021 09:59:22
+0800 Subject: [PATCH 3/4] Fix binary de --- common/datavalues/src/arrays/string/builder.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common/datavalues/src/arrays/string/builder.rs b/common/datavalues/src/arrays/string/builder.rs index 95719cd2f4897..c77b53f24c02c 100644 --- a/common/datavalues/src/arrays/string/builder.rs +++ b/common/datavalues/src/arrays/string/builder.rs @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::io::Read; + use common_arrow::arrow::array::*; use common_exception::Result; +use common_io::prelude::BinaryRead; use crate::prelude::*; use crate::utils::get_iter_capacity; @@ -54,6 +57,9 @@ impl StringArrayBuilder { impl ArrayDeserializer for StringArrayBuilder { fn de(&mut self, reader: &mut &[u8]) -> Result<()> { + let offset: u64 = reader.read_uvarint()?; + let mut values: Vec = Vec::with_capacity(offset as usize); + reader.read_exact(&mut values)?; self.append_value(reader.clone()); Ok(()) } From 7fc1d5445072c8c2e3ae9066a7933471328c069d Mon Sep 17 00:00:00 2001 From: zhyass <34016424+zhyass@users.noreply.github.com> Date: Thu, 2 Sep 2021 14:24:30 +0800 Subject: [PATCH 4/4] Fix tests --- Cargo.lock | 4 +- common/arrow/Cargo.toml | 4 +- .../src/kernels/data_block_concat_test.rs | 2 +- .../kernels/data_block_group_by_hash_test.rs | 2 +- .../src/kernels/data_block_group_by_test.rs | 2 +- .../src/kernels/data_block_sort_test.rs | 4 +- .../src/kernels/data_block_take_test.rs | 2 +- common/datavalues/src/arrays/ops/agg_test.rs | 12 +- .../datavalues/src/arrays/ops/apply_test.rs | 32 +-- .../datavalues/src/arrays/ops/contain_test.rs | 20 +- common/datavalues/src/arrays/ops/fill_test.rs | 32 +-- common/datavalues/src/arrays/ops/if_test.rs | 17 +- .../datavalues/src/arrays/ops/scatter_test.rs | 16 +- .../src/arrays/ops/take_random_test.rs | 18 +- common/datavalues/src/arrays/ops/take_test.rs | 20 +- common/datavalues/src/arrays/primitive/mod.rs | 17 +- 
.../datavalues/src/arrays/string/builder.rs | 4 +- .../arrays/{utf8 => string}/builder_test.rs | 18 +- common/datavalues/src/arrays/string/mod.rs | 3 + common/datavalues/src/arrays/utf8/builder.rs | 136 ------------ common/datavalues/src/arrays/utf8/iterator.rs | 75 ------- common/datavalues/src/arrays/utf8/mod.rs | 194 ------------------ common/datavalues/src/data_group_value.rs | 4 +- common/datavalues/src/data_value.rs | 14 +- .../datavalues/src/series/arithmetic_test.rs | 12 +- common/datavalues/src/series/series_impl.rs | 12 ++ .../src/scalars/expressions/cast_test.rs | 2 +- .../src/scalars/strings/substring_test.rs | 2 +- .../src/scalars/udfs/database_test.rs | 2 +- .../src/scalars/udfs/version_test.rs | 2 +- common/indexing/src/index_min_max_test.rs | 6 +- common/indexing/src/index_partition_test.rs | 4 +- common/indexing/src/index_sparse_test.rs | 10 +- .../planners/src/plan_describe_table_test.rs | 12 +- common/planners/src/plan_explain_test.rs | 2 +- common/planners/src/plan_expression_test.rs | 4 +- common/planners/src/plan_projection_test.rs | 6 +- common/planners/src/plan_scan_test.rs | 4 +- common/planners/src/plan_select_test.rs | 4 +- common/streams/src/sources/source_test.rs | 4 +- common/streams/src/stream_datablock_test.rs | 2 +- common/streams/src/stream_limit_by.rs | 4 +- common/streams/src/stream_limit_by_test.rs | 2 +- common/streams/src/stream_skip_test.rs | 16 +- .../interpreter_describe_table_test.rs | 2 +- .../interpreter_show_create_table_test.rs | 2 +- .../interpreter_table_create_test.rs | 2 +- .../optimizer_constant_folding_test.rs | 10 +- .../optimizer_projection_push_down_test.rs | 58 +++--- .../optimizer_statistics_exact_test.rs | 6 +- .../transform_group_by_partial_test.rs | 18 +- query/src/sessions/settings.rs | 36 ++-- query/src/sql/plan_parser_test.rs | 6 +- store/src/api/rpc/flight_service_test.rs | 4 +- store/src/data_part/appender_test.rs | 4 +- .../02_0009_function_siphash64.result | 4 +- 
.../08_0000_optimizer_cluster.result | 4 +- .../sqlstatement/conversion-functions/cast.md | 10 +- 58 files changed, 275 insertions(+), 655 deletions(-) rename common/datavalues/src/arrays/{utf8 => string}/builder_test.rs (74%) delete mode 100644 common/datavalues/src/arrays/utf8/builder.rs delete mode 100644 common/datavalues/src/arrays/utf8/iterator.rs delete mode 100644 common/datavalues/src/arrays/utf8/mod.rs diff --git a/Cargo.lock b/Cargo.lock index e3d254ebd23ae..d5c75c96cc914 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,7 +111,7 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "arrow-flight" version = "0.1.0" -source = "git+https://github.com/zhyass/arrow2?rev=2e3de5a#2e3de5a753dcadc31195de4d62e039c13e1e69d0" +source = "git+https://github.com/zhyass/arrow2?rev=23682f0#23682f033d6e473a8292e441414795e843b47188" dependencies = [ "arrow2", "bytes", @@ -125,7 +125,7 @@ dependencies = [ [[package]] name = "arrow2" version = "0.4.0" -source = "git+https://github.com/zhyass/arrow2?rev=2e3de5a#2e3de5a753dcadc31195de4d62e039c13e1e69d0" +source = "git+https://github.com/zhyass/arrow2?rev=23682f0#23682f033d6e473a8292e441414795e843b47188" dependencies = [ "ahash 0.7.4", "base64", diff --git a/common/arrow/Cargo.toml b/common/arrow/Cargo.toml index fde0270659039..d3f38513de93a 100644 --- a/common/arrow/Cargo.toml +++ b/common/arrow/Cargo.toml @@ -15,8 +15,8 @@ simd = ["arrow/simd"] # Workspace dependencies # Github dependencies -arrow = { package = "arrow2", git="https://github.com/zhyass/arrow2", rev = "2e3de5a" } -arrow-flight = { git="https://github.com/zhyass/arrow2", rev = "2e3de5a" } +arrow = { package = "arrow2", git="https://github.com/zhyass/arrow2", rev = "23682f0" } +arrow-flight = { git="https://github.com/zhyass/arrow2", rev = "23682f0" } parquet = {package = "parquet2", git = "https://github.com/datafuse-extras/parquet2", rev = "d28330f"} # Crates.io dependencies diff --git 
a/common/datablocks/src/kernels/data_block_concat_test.rs b/common/datablocks/src/kernels/data_block_concat_test.rs index 20d9b0f65f14f..2d6176ae0c223 100644 --- a/common/datablocks/src/kernels/data_block_concat_test.rs +++ b/common/datablocks/src/kernels/data_block_concat_test.rs @@ -23,7 +23,7 @@ use crate::*; fn test_data_block_concat() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ DataField::new("a", DataType::Int64, false), - DataField::new("b", DataType::Utf8, false), + DataField::new("b", DataType::String, false), ]); let blocks = vec![ diff --git a/common/datablocks/src/kernels/data_block_group_by_hash_test.rs b/common/datablocks/src/kernels/data_block_group_by_hash_test.rs index 246a926c7da9f..2f75f39f8367b 100644 --- a/common/datablocks/src/kernels/data_block_group_by_hash_test.rs +++ b/common/datablocks/src/kernels/data_block_group_by_hash_test.rs @@ -23,7 +23,7 @@ fn test_data_block_group_by_hash() -> Result<()> { DataField::new("a", DataType::Int8, false), DataField::new("b", DataType::Int8, false), DataField::new("c", DataType::Int8, false), - DataField::new("x", DataType::Utf8, false), + DataField::new("x", DataType::String, false), ]); let block = DataBlock::create_by_array(schema.clone(), vec![ diff --git a/common/datablocks/src/kernels/data_block_group_by_test.rs b/common/datablocks/src/kernels/data_block_group_by_test.rs index 9b1570fb0a61a..eabf260109ea7 100644 --- a/common/datablocks/src/kernels/data_block_group_by_test.rs +++ b/common/datablocks/src/kernels/data_block_group_by_test.rs @@ -21,7 +21,7 @@ use crate::*; fn test_data_block_group_by() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ DataField::new("a", DataType::Int8, false), - DataField::new("b", DataType::Utf8, false), + DataField::new("b", DataType::String, false), ]); let block = DataBlock::create_by_array(schema.clone(), vec![ diff --git a/common/datablocks/src/kernels/data_block_sort_test.rs b/common/datablocks/src/kernels/data_block_sort_test.rs index 
94021cbf95051..7208841756e84 100644 --- a/common/datablocks/src/kernels/data_block_sort_test.rs +++ b/common/datablocks/src/kernels/data_block_sort_test.rs @@ -21,7 +21,7 @@ use crate::*; fn test_data_block_sort() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ DataField::new("a", DataType::Int64, false), - DataField::new("b", DataType::Utf8, false), + DataField::new("b", DataType::String, false), ]); let raw = DataBlock::create_by_array(schema.clone(), vec![ @@ -77,7 +77,7 @@ fn test_data_block_sort() -> Result<()> { fn test_data_block_merge_sort() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ DataField::new("a", DataType::Int64, false), - DataField::new("b", DataType::Utf8, false), + DataField::new("b", DataType::String, false), ]); let raw1 = DataBlock::create_by_array(schema.clone(), vec![ diff --git a/common/datablocks/src/kernels/data_block_take_test.rs b/common/datablocks/src/kernels/data_block_take_test.rs index be32f3929a25c..a9df94fa258f2 100644 --- a/common/datablocks/src/kernels/data_block_take_test.rs +++ b/common/datablocks/src/kernels/data_block_take_test.rs @@ -21,7 +21,7 @@ use crate::*; fn test_data_block_take() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ DataField::new("a", DataType::Int64, false), - DataField::new("b", DataType::Utf8, false), + DataField::new("b", DataType::String, false), ]); let raw = DataBlock::create_by_array(schema.clone(), vec![ diff --git a/common/datavalues/src/arrays/ops/agg_test.rs b/common/datavalues/src/arrays/ops/agg_test.rs index 62affdb97a4d0..511e29b299541 100644 --- a/common/datavalues/src/arrays/ops/agg_test.rs +++ b/common/datavalues/src/arrays/ops/agg_test.rs @@ -76,8 +76,8 @@ fn test_boolean_array_agg() -> Result<()> { } #[test] -fn test_utf8_array_agg() -> Result<()> { - let array = DFUtf8Array::new_from_slice(&vec!["h", "e", "l", "o"]); +fn test_string_array_agg() -> Result<()> { + let array = DFStringArray::new_from_slice(&vec!["h", "e", "l", "o"]); let value = [ 
array.max()?, @@ -87,15 +87,15 @@ fn test_utf8_array_agg() -> Result<()> { ]; let expected = [ - DataValue::Utf8(Some("o".to_string())), - DataValue::Utf8(Some("e".to_string())), + DataValue::String(Some("o".as_bytes().to_vec())), + DataValue::String(Some("e".as_bytes().to_vec())), DataValue::Struct(vec![ DataValue::UInt64(Some(1)), - DataValue::Utf8(Some("e".to_string())), + DataValue::String(Some("e".as_bytes().to_vec())), ]), DataValue::Struct(vec![ DataValue::UInt64(Some(3)), - DataValue::Utf8(Some("o".to_string())), + DataValue::String(Some("o".as_bytes().to_vec())), ]), ]; let len = value.len(); diff --git a/common/datavalues/src/arrays/ops/apply_test.rs b/common/datavalues/src/arrays/ops/apply_test.rs index 9a20feb50cbc1..bc5087214ddb0 100644 --- a/common/datavalues/src/arrays/ops/apply_test.rs +++ b/common/datavalues/src/arrays/ops/apply_test.rs @@ -46,8 +46,8 @@ fn new_test_boolean_array(cap: usize, begin: i32, end: i32) -> DFBooleanArray { builder.finish() } -fn new_test_utf8_array(cap: usize, begin: i32, end: i32) -> DFUtf8Array { - let mut builder = Utf8ArrayBuilder::with_capacity(cap); +fn new_test_string_array(cap: usize, begin: i32, end: i32) -> DFStringArray { + let mut builder = StringArrayBuilder::with_capacity(cap); let s = vec!["ax", "by", "cz", "dm", "13"]; (begin..end).for_each(|index| { @@ -164,15 +164,15 @@ fn test_boolean_array_apply() -> Result<()> { } #[test] -fn test_utf8_array_apply() -> Result<()> { +fn test_string_array_apply() -> Result<()> { // array=[null, "by", "cz", null, "13"] - let array = new_test_utf8_array(5, 0, 5); + let array = new_test_string_array(5, 0, 5); let arrays = vec![ array.apply(|arr| Cow::from(&arr[1..])), array.apply_with_idx(|(_, arr)| Cow::from(&arr[..1])), array.apply_with_idx_on_opt(|(_, arr)| match arr { Some(v) => Some(Cow::from(&v[0..])), - None => Some(Cow::from("ff")), + None => Some(Cow::from("ff".as_bytes())), }), ]; @@ -190,26 +190,26 @@ fn test_utf8_array_apply() -> Result<()> { assert_eq!(2, 
values[0].null_count()); assert_eq!(true, values[0].is_null(0)); - assert_eq!("y", values[0].value(1)); - assert_eq!("z", values[0].value(2)); + assert_eq!(b"y", values[0].value(1)); + assert_eq!(b"z", values[0].value(2)); assert_eq!(true, values[0].is_null(3)); - assert_eq!("3", values[0].value(4)); + assert_eq!(b"3", values[0].value(4)); assert_eq!(true, values[0].is_null(3)); assert_eq!(2, values[1].null_count()); assert_eq!(true, values[1].is_null(0)); - assert_eq!("b", values[1].value(1)); - assert_eq!("c", values[1].value(2)); + assert_eq!(b"b", values[1].value(1)); + assert_eq!(b"c", values[1].value(2)); assert_eq!(true, values[1].is_null(3)); - assert_eq!("1", values[1].value(4)); + assert_eq!(b"1", values[1].value(4)); assert_eq!(true, values[1].is_null(3)); assert_eq!(0, values[2].null_count()); - assert_eq!("ff", values[2].value(0)); - assert_eq!("by", values[2].value(1)); - assert_eq!("cz", values[2].value(2)); - assert_eq!("ff", values[2].value(3)); - assert_eq!("13", values[2].value(4)); + assert_eq!(b"ff", values[2].value(0)); + assert_eq!(b"by", values[2].value(1)); + assert_eq!(b"cz", values[2].value(2)); + assert_eq!(b"ff", values[2].value(3)); + assert_eq!(b"13", values[2].value(4)); assert_eq!(2, cast_values[0].null_count()); assert_eq!(true, cast_values[0].is_null(0)); diff --git a/common/datavalues/src/arrays/ops/contain_test.rs b/common/datavalues/src/arrays/ops/contain_test.rs index 9b5ae390f0c37..6fd7e8db0b36d 100644 --- a/common/datavalues/src/arrays/ops/contain_test.rs +++ b/common/datavalues/src/arrays/ops/contain_test.rs @@ -33,22 +33,22 @@ fn test_contain() -> Result<()> { let values = boolean?.collect_values(); assert_eq!(&[Some(true), Some(false), Some(true)], values.as_slice()); - // Test DFUtf8Array - let mut utf8_builder = Utf8ArrayBuilder::with_capacity(3); - utf8_builder.append_value("1a"); - utf8_builder.append_value("2b"); - utf8_builder.append_value("3c"); - utf8_builder.append_value("4d"); - let df_utf8_array = 
utf8_builder.finish(); - - let mut builder = get_list_builder(&DataType::Utf8, 12, 1); + // Test DFStringArray + let mut string_builder = StringArrayBuilder::with_capacity(3); + string_builder.append_value("1a"); + string_builder.append_value("2b"); + string_builder.append_value("3c"); + string_builder.append_value("4d"); + let df_string_array = string_builder.finish(); + + let mut builder = get_list_builder(&DataType::String, 12, 1); builder.append_series(&Series::new(vec!["2b", "4d"])); builder.append_series(&Series::new(vec!["2b", "4d"])); builder.append_series(&Series::new(vec!["2b", "4d"])); builder.append_series(&Series::new(vec!["2b", "4d"])); let df_list = builder.finish(); - let boolean = df_utf8_array.contain(&df_list); + let boolean = df_string_array.contain(&df_list); let values = boolean?.collect_values(); assert_eq!( &[Some(false), Some(true), Some(false), Some(true)], diff --git a/common/datavalues/src/arrays/ops/fill_test.rs b/common/datavalues/src/arrays/ops/fill_test.rs index c9dfe20307d82..aa4fe1a5c9fe6 100644 --- a/common/datavalues/src/arrays/ops/fill_test.rs +++ b/common/datavalues/src/arrays/ops/fill_test.rs @@ -42,22 +42,22 @@ fn test_array_fill() -> Result<()> { assert_eq!(true, df_boolean_array.is_null(1)); assert_eq!(true, df_boolean_array.is_null(2)); - // Test full for Utf8Array - let mut df_utf8_array = DFUtf8Array::full("ab", 3); - assert_eq!(0, df_utf8_array.null_count()); - assert_eq!(false, df_utf8_array.is_null(0)); - assert_eq!(false, df_utf8_array.is_null(1)); - assert_eq!(false, df_utf8_array.is_null(2)); - assert_eq!("ab", df_utf8_array.inner().value(0)); - assert_eq!("ab", df_utf8_array.inner().value(1)); - assert_eq!("ab", df_utf8_array.inner().value(2)); - - // Test full_null for Utf8Array - df_utf8_array = DFUtf8Array::full_null(3); - assert_eq!(3, df_utf8_array.null_count()); - assert_eq!(true, df_utf8_array.is_null(0)); - assert_eq!(true, df_utf8_array.is_null(1)); - assert_eq!(true, df_utf8_array.is_null(2)); + // Test 
full for StringArray + let mut df_string_array = DFStringArray::full("ab".as_bytes(), 3); + assert_eq!(0, df_string_array.null_count()); + assert_eq!(false, df_string_array.is_null(0)); + assert_eq!(false, df_string_array.is_null(1)); + assert_eq!(false, df_string_array.is_null(2)); + assert_eq!("ab".as_bytes(), df_string_array.inner().value(0)); + assert_eq!("ab".as_bytes(), df_string_array.inner().value(1)); + assert_eq!("ab".as_bytes(), df_string_array.inner().value(2)); + + // Test full_null for StringArray + df_string_array = DFStringArray::full_null(3); + assert_eq!(3, df_string_array.null_count()); + assert_eq!(true, df_string_array.is_null(0)); + assert_eq!(true, df_string_array.is_null(1)); + assert_eq!(true, df_string_array.is_null(2)); Ok(()) } diff --git a/common/datavalues/src/arrays/ops/if_test.rs b/common/datavalues/src/arrays/ops/if_test.rs index 30303ae75bd15..e076c2f56dff0 100644 --- a/common/datavalues/src/arrays/ops/if_test.rs +++ b/common/datavalues/src/arrays/ops/if_test.rs @@ -17,6 +17,7 @@ use common_arrow::arrow::array::NullArray; use common_arrow::arrow::array::UInt64Array; use common_arrow::arrow::compute::comparison::compare_scalar; use common_arrow::arrow::compute::comparison::Operator; +use common_arrow::arrow::datatypes::DataType as ArrowType; use common_arrow::arrow::scalar::PrimitiveScalar; use common_exception::Result; @@ -58,19 +59,19 @@ fn test_array_if() -> Result<()> { assert_eq!(true, res.inner().value(1)); assert_eq!(true, res.inner().value(2)); - // DFUtf8Array. - let lhs = DFUtf8Array::new_from_slice(&["a"]); - let rhs = DFUtf8Array::new_from_slice(&["b"]); + // DFStringArray. 
+ let lhs = DFStringArray::new_from_slice(&["a"]); + let rhs = DFStringArray::new_from_slice(&["b"]); let res = lhs.if_then_else(&rhs, &conds[0])?; assert_eq!(3, res.len()); - assert_eq!("a", res.inner().value(0)); - assert_eq!("b", res.inner().value(1)); - assert_eq!("a", res.inner().value(2)); + assert_eq!(b"a", res.inner().value(0)); + assert_eq!(b"b", res.inner().value(1)); + assert_eq!(b"a", res.inner().value(2)); // DFNullArray. - let lhs = NullArray::new_null(2); + let lhs = NullArray::new_null(ArrowType::Null, 2); let lhs: DFNullArray = lhs.into(); - let rhs = NullArray::new_null(1); + let rhs = NullArray::new_null(ArrowType::Null, 1); let rhs: DFNullArray = rhs.into(); let res = lhs.if_then_else(&rhs, &conds[0])?; assert_eq!(2, res.len()); diff --git a/common/datavalues/src/arrays/ops/scatter_test.rs b/common/datavalues/src/arrays/ops/scatter_test.rs index 9bbe1fa554eb5..964646ebf2e3a 100644 --- a/common/datavalues/src/arrays/ops/scatter_test.rs +++ b/common/datavalues/src/arrays/ops/scatter_test.rs @@ -33,16 +33,16 @@ fn test_scatter() -> Result<()> { assert_eq!(&[2u16, 6], array_vec[2].inner().values().as_slice()); assert_eq!(&[3u16, 5, 8], array_vec[3].inner().values().as_slice()); - // Test DFUint16Array - let df_utf8_array = DFUtf8Array::new_from_slice(&["a", "b", "c", "d"]); + // Test DFStringArray + let df_string_array = DFStringArray::new_from_slice(&["a", "b", "c", "d"]); let indices = vec![1, 0, 1, 1]; - assert_eq!(df_utf8_array.len(), indices.len()); + assert_eq!(df_string_array.len(), indices.len()); - let array_vec = unsafe { df_utf8_array.scatter_unchecked(&mut indices.into_iter(), 2)? }; - let v1: Vec<&str> = array_vec[0].into_no_null_iter().collect(); - let v2: Vec<&str> = array_vec[1].into_no_null_iter().collect(); - assert_eq!(vec!["b"], v1); - assert_eq!(vec!["a", "c", "d"], v2); + let array_vec = unsafe { df_string_array.scatter_unchecked(&mut indices.into_iter(), 2)? 
}; + let v1: Vec<&[u8]> = array_vec[0].into_no_null_iter().collect(); + let v2: Vec<&[u8]> = array_vec[1].into_no_null_iter().collect(); + assert_eq!(vec![b"b"], v1); + assert_eq!(vec![b"a", b"c", b"d"], v2); // Test BooleanArray let df_bool_array = DFBooleanArray::new_from_slice(&[true, false, true, false]); diff --git a/common/datavalues/src/arrays/ops/take_random_test.rs b/common/datavalues/src/arrays/ops/take_random_test.rs index 0cf501ce633c0..2370edec643fa 100644 --- a/common/datavalues/src/arrays/ops/take_random_test.rs +++ b/common/datavalues/src/arrays/ops/take_random_test.rs @@ -52,18 +52,18 @@ fn test_take_random() -> Result<()> { let expected = Series::new(vec![1_u16, 2, 3]); assert!(result.series_equal(&expected)); - // Test DFUtf8Array - let mut utf8_builder = Utf8ArrayBuilder::with_capacity(3); - utf8_builder.append_value("1a"); - utf8_builder.append_value("2b"); - utf8_builder.append_value("3c"); - let df_utf8_array = &utf8_builder.finish(); + // Test DFStringArray + let mut string_builder = StringArrayBuilder::with_capacity(3); + string_builder.append_value("1a"); + string_builder.append_value("2b"); + string_builder.append_value("3c"); + let df_string_array = &string_builder.finish(); // Create TakeRandBranch for the array - let taker = df_utf8_array.take_rand(); - assert_eq!(Some("1a"), taker.get(0)); + let taker = df_string_array.take_rand(); + assert_eq!(Some("1a".as_bytes()), taker.get(0)); // Test get_unchecked let result = unsafe { taker.get_unchecked(1) }; - assert_eq!("2b", result); + assert_eq!(b"2b", result); Ok(()) } diff --git a/common/datavalues/src/arrays/ops/take_test.rs b/common/datavalues/src/arrays/ops/take_test.rs index 546f55c8e1ad7..d6091b08ef9fe 100644 --- a/common/datavalues/src/arrays/ops/take_test.rs +++ b/common/datavalues/src/arrays/ops/take_test.rs @@ -60,21 +60,21 @@ fn test_take() -> Result<()> { let expected = Series::new(vec![7_u16, 8, 9]); assert!(vs[0].series_equal(&expected)); - // Test DFUtf8Array - let mut 
utf8_builder = Utf8ArrayBuilder::with_capacity(3); - utf8_builder.append_value("1a"); - utf8_builder.append_value("2b"); - utf8_builder.append_value("3c"); - let df_utf8_array = &utf8_builder.finish(); + // Test DFStringArray + let mut string_builder = StringArrayBuilder::with_capacity(3); + string_builder.append_value("1a"); + string_builder.append_value("2b"); + string_builder.append_value("3c"); + let df_string_array = &string_builder.finish(); let index = TakeIdx::from(vec![0, 1].into_iter()); - let take_res = df_utf8_array.take(index)?; + let take_res = df_string_array.take(index)?; let vs: Vec<_> = take_res.into_no_null_iter().collect(); - assert_eq!(&vs, &["1a", "2b"]); + assert_eq!(&vs, &[b"1a", b"2b"]); let index = TakeIdx::from(vec![2, 1].into_iter()); - let take_res = unsafe { df_utf8_array.take_unchecked(index)? }; + let take_res = unsafe { df_string_array.take_unchecked(index)? }; let vs: Vec<_> = take_res.into_no_null_iter().collect(); - assert_eq!(&vs, &["3c", "2b"]); + assert_eq!(&vs, &[b"3c", b"2b"]); Ok(()) } diff --git a/common/datavalues/src/arrays/primitive/mod.rs b/common/datavalues/src/arrays/primitive/mod.rs index 96dedc6b6831f..9954154b431ff 100644 --- a/common/datavalues/src/arrays/primitive/mod.rs +++ b/common/datavalues/src/arrays/primitive/mod.rs @@ -12,15 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_arrow::arrow::array::Array; -use common_arrow::arrow::array::PrimitiveArray; -use common_arrow::arrow::bitmap::Bitmap; -use common_arrow::arrow::buffer::Buffer; -use common_exception::ErrorCode; -use common_exception::Result; - -use crate::prelude::*; - mod builder; mod iterator; @@ -28,8 +19,16 @@ mod iterator; mod builder_test; pub use builder::*; +use common_arrow::arrow::array::Array; +use common_arrow::arrow::array::PrimitiveArray; +use common_arrow::arrow::bitmap::Bitmap; +use common_arrow::arrow::buffer::Buffer; +use common_exception::ErrorCode; +use common_exception::Result; pub use iterator::*; +use crate::prelude::*; + /// DFPrimitiveArray is generic struct which wrapped arrow's PrimitiveArray #[derive(Debug, Clone)] pub struct DFPrimitiveArray { diff --git a/common/datavalues/src/arrays/string/builder.rs b/common/datavalues/src/arrays/string/builder.rs index c77b53f24c02c..9ed4c6930b68e 100644 --- a/common/datavalues/src/arrays/string/builder.rs +++ b/common/datavalues/src/arrays/string/builder.rs @@ -60,14 +60,14 @@ impl ArrayDeserializer for StringArrayBuilder { let offset: u64 = reader.read_uvarint()?; let mut values: Vec = Vec::with_capacity(offset as usize); reader.read_exact(&mut values)?; - self.append_value(reader.clone()); + self.append_value(reader); Ok(()) } fn de_batch(&mut self, reader: &[u8], step: usize, rows: usize) -> Result<()> { for row in 0..rows { let reader = &reader[step * row..]; - self.append_value(reader.clone()); + self.append_value(reader); } Ok(()) } diff --git a/common/datavalues/src/arrays/utf8/builder_test.rs b/common/datavalues/src/arrays/string/builder_test.rs similarity index 74% rename from common/datavalues/src/arrays/utf8/builder_test.rs rename to common/datavalues/src/arrays/string/builder_test.rs index 27e8f148aba99..504d04226bd1d 100644 --- a/common/datavalues/src/arrays/utf8/builder_test.rs +++ b/common/datavalues/src/arrays/string/builder_test.rs @@ -16,15 +16,15 @@ use crate::prelude::*; #[test] fn 
test_empty_array() { - let mut builder = Utf8ArrayBuilder::with_capacity(16); + let mut builder = StringArrayBuilder::with_capacity(16); let data_array = builder.finish(); assert_eq!(true, data_array.is_empty()); - assert_eq!(&DataType::Utf8, data_array.data_type()); + assert_eq!(&DataType::String, data_array.data_type()); } #[test] fn test_fill_data() { - let mut builder = Utf8ArrayBuilder::with_capacity(16); + let mut builder = StringArrayBuilder::with_capacity(16); builder.append_value("你好"); builder.append_option(Some("\u{1F378}")); builder.append_null(); @@ -33,19 +33,19 @@ fn test_fill_data() { let mut iter = data_array.into_iter(); assert_eq!(3, data_array.len()); - assert_eq!(Some(Some("你好")), iter.next()); - assert_eq!(Some(Some("🍸")), iter.next()); + assert_eq!(Some(Some("你好".as_bytes())), iter.next()); + assert_eq!(Some(Some("🍸".as_bytes())), iter.next()); assert_eq!(Some(None), iter.next()); assert_eq!(None, iter.next()); } #[test] fn test_new_from_opt_slice() { - let data_array = DFUtf8Array::new_from_opt_slice(&[Some("你好"), None]); + let data_array = DFStringArray::new_from_opt_slice(&[Some("你好"), None]); let mut iter = data_array.into_iter(); assert_eq!(2, data_array.len()); - assert_eq!(Some(Some("你好")), iter.next()); + assert_eq!(Some(Some("你好".as_bytes())), iter.next()); assert_eq!(Some(None), iter.next()); assert_eq!(None, iter.next()); } @@ -55,11 +55,11 @@ fn test_new_from_opt_iter() { let v = vec![None, Some("你好"), None]; let mut iter = v.into_iter(); iter.next(); // move iterator and create data array from second element - let data_array = DFUtf8Array::new_from_opt_iter(iter); + let data_array = DFStringArray::new_from_opt_iter(iter); let mut iter = data_array.into_iter(); assert_eq!(2, data_array.len()); - assert_eq!(Some(Some("你好")), iter.next()); + assert_eq!(Some(Some("你好".as_bytes())), iter.next()); assert_eq!(Some(None), iter.next()); assert_eq!(None, iter.next()); } diff --git a/common/datavalues/src/arrays/string/mod.rs 
b/common/datavalues/src/arrays/string/mod.rs index 7e6670410ecc9..3869424d1ef87 100644 --- a/common/datavalues/src/arrays/string/mod.rs +++ b/common/datavalues/src/arrays/string/mod.rs @@ -15,6 +15,9 @@ mod builder; mod iterator; +#[cfg(test)] +mod builder_test; + pub use builder::*; use common_arrow::arrow::array::*; use common_arrow::arrow::bitmap::Bitmap; diff --git a/common/datavalues/src/arrays/utf8/builder.rs b/common/datavalues/src/arrays/utf8/builder.rs deleted file mode 100644 index 80a974badc3a0..0000000000000 --- a/common/datavalues/src/arrays/utf8/builder.rs +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2020 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_arrow::arrow::array::*; -use common_exception::Result; -use common_io::prelude::BinaryRead; - -use crate::prelude::*; -use crate::utils::get_iter_capacity; - -pub struct Utf8ArrayBuilder { - pub builder: MutableUtf8Array, -} - -impl Utf8ArrayBuilder { - /// Create a new UtfArrayBuilder - /// - /// # Arguments - /// - /// * `capacity` - Number of string elements in the final array. 
- pub fn with_capacity(bytes_capacity: usize) -> Self { - Utf8ArrayBuilder { - builder: MutableUtf8Array::with_capacity(bytes_capacity), - } - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value>(&mut self, v: S) { - self.builder.push(Some(v)) - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.builder.push_null(); - } - - #[inline] - pub fn append_option>(&mut self, opt: Option) { - match opt { - Some(s) => self.append_value(s.as_ref()), - None => self.append_null(), - } - } - - pub fn finish(&mut self) -> DFUtf8Array { - let array = self.builder.as_arc(); - DFUtf8Array::from_arrow_array(array.as_ref()) - } -} - -impl ArrayDeserializer for Utf8ArrayBuilder { - fn de(&mut self, reader: &mut &[u8]) -> Result<()> { - let value: String = reader.read_string()?; - self.append_value(value); - Ok(()) - } - - fn de_batch(&mut self, reader: &[u8], step: usize, rows: usize) -> Result<()> { - for row in 0..rows { - let mut reader = &reader[step * row..]; - let value: String = reader.read_string()?; - self.append_value(&value); - } - Ok(()) - } - - fn finish_to_series(&mut self) -> Series { - self.finish().into_series() - } - - fn de_text(&mut self, reader: &[u8]) { - match std::str::from_utf8(reader) { - Ok(v) => self.append_value(v), - Err(_) => self.append_null(), - } - } - - fn de_null(&mut self) { - self.append_null() - } -} - -impl NewDataArray for DFUtf8Array -where S: AsRef -{ - fn new_from_slice(v: &[S]) -> Self { - let values_size = v.iter().fold(0, |acc, s| acc + s.as_ref().len()); - let mut builder = Utf8ArrayBuilder::with_capacity(values_size); - v.iter().for_each(|val| { - builder.append_value(val.as_ref()); - }); - - builder.finish() - } - - fn new_from_opt_slice(opt_v: &[Option]) -> Self { - let values_size = opt_v.iter().fold(0, |acc, s| match s { - Some(s) => acc + s.as_ref().len(), - None => acc, - }); - let mut builder = Utf8ArrayBuilder::with_capacity(values_size); - 
opt_v.iter().for_each(|opt| match opt { - Some(v) => builder.append_value(v.as_ref()), - None => builder.append_null(), - }); - builder.finish() - } - - fn new_from_opt_iter(it: impl Iterator>) -> Self { - let cap = get_iter_capacity(&it); - let mut builder = Utf8ArrayBuilder::with_capacity(cap * 5); - it.for_each(|opt| builder.append_option(opt)); - builder.finish() - } - - /// Create a new DataArray from an iterator. - fn new_from_iter(it: impl Iterator) -> Self { - let cap = get_iter_capacity(&it); - let mut builder = Utf8ArrayBuilder::with_capacity(cap * 5); - it.for_each(|v| builder.append_value(v)); - builder.finish() - } -} diff --git a/common/datavalues/src/arrays/utf8/iterator.rs b/common/datavalues/src/arrays/utf8/iterator.rs deleted file mode 100644 index 0f637b7da0310..0000000000000 --- a/common/datavalues/src/arrays/utf8/iterator.rs +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2020 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_arrow::arrow::array::*; -use common_arrow::arrow::bitmap::utils::ZipValidity; -use common_arrow::arrow::trusted_len::TrustedLen; - -use crate::prelude::*; - -impl<'a> IntoIterator for &'a DFUtf8Array { - type Item = Option<&'a str>; - type IntoIter = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i64>>; - fn into_iter(self) -> Self::IntoIter { - self.array.iter() - } -} - -/// all arrays have known size. 
-impl<'a> ExactSizeIterator for Utf8IterNoNull<'a> {} -unsafe impl<'a> TrustedLen for Utf8IterNoNull<'a> {} - -pub struct Utf8IterNoNull<'a> { - array: &'a LargeUtf8Array, - current: usize, - current_end: usize, -} - -impl<'a> Utf8IterNoNull<'a> { - /// create a new iterator - pub fn new(array: &'a LargeUtf8Array) -> Self { - Utf8IterNoNull { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a> Iterator for Utf8IterNoNull<'a> { - type Item = &'a str; - - fn next(&mut self) -> Option { - if self.current == self.current_end { - None - } else { - let old = self.current; - self.current += 1; - unsafe { Some(self.array.value_unchecked(old)) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } -} - -impl DFUtf8Array { - pub fn into_no_null_iter<'a>(&'a self) -> impl TrustedLen + '_ + Send + Sync { - Utf8IterNoNull::new(self.inner()) - } -} diff --git a/common/datavalues/src/arrays/utf8/mod.rs b/common/datavalues/src/arrays/utf8/mod.rs deleted file mode 100644 index 19935f9fd77d0..0000000000000 --- a/common/datavalues/src/arrays/utf8/mod.rs +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright 2020 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use common_arrow::arrow::array::*; -use common_arrow::arrow::bitmap::Bitmap; -use common_exception::ErrorCode; -use common_exception::Result; - -use crate::prelude::*; - -mod builder; -mod iterator; - -#[cfg(test)] -mod builder_test; - -pub use builder::*; -pub use iterator::*; - -#[derive(Debug, Clone)] -pub struct DFUtf8Array { - pub(crate) array: LargeUtf8Array, -} - -impl From for DFUtf8Array { - fn from(array: LargeUtf8Array) -> Self { - Self { array } - } -} - -impl DFUtf8Array { - pub fn new(array: LargeUtf8Array) -> Self { - Self { array } - } - - pub fn from_arrow_array(array: &dyn Array) -> Self { - Self::new( - array - .as_any() - .downcast_ref::() - .unwrap() - .clone(), - ) - } - - pub fn data_type(&self) -> &DataType { - &DataType::Utf8 - } - - pub fn inner(&self) -> &LargeUtf8Array { - &self.array - } - - /// # Safety - /// Note this doesn't do any bound checking, for performance reason. - pub unsafe fn try_get(&self, index: usize) -> Result { - let v = match self.array.is_null(index) { - true => None, - false => Some(self.array.value_unchecked(index)), - }; - Ok(v.into()) - } - - pub fn len(&self) -> usize { - self.array.len() - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - #[inline] - pub fn null_count(&self) -> usize { - self.array.null_count() - } - - #[inline] - pub fn is_null(&self, i: usize) -> bool { - self.array.is_null(i) - } - - #[inline] - pub fn all_is_null(&self) -> bool { - self.null_count() == self.len() - } - - #[inline] - /// Get the null count and the buffer of bits representing null values - pub fn null_bits(&self) -> (usize, &Option) { - (self.array.null_count(), self.array.validity()) - } - - /// Take a view of top n elements - pub fn limit(&self, num_elements: usize) -> Self { - self.slice(0, num_elements) - } - - pub fn slice(&self, offset: usize, length: usize) -> Self { - let array = self.array.slice(offset, length); - Self::new(array) - } - - /// Unpack a array to the same physical type. 
- /// - /// # Safety - /// - /// This is unsafe as the data_type may be uncorrect and - /// is assumed to be correct in other unsafe code. - pub unsafe fn unpack(&self, array: &Series) -> Result<&Self> { - let array_trait = &**array; - if self.data_type() == array.data_type() { - let ca = &*(array_trait as *const dyn SeriesTrait as *const Self); - Ok(ca) - } else { - Err(ErrorCode::IllegalDataType(format!( - "cannot unpack array {:?} into matching type {:?}", - array, - self.data_type() - ))) - } - } - - pub fn collect_values(&self) -> Vec> { - self.inner().iter().collect() - } -} - -/// # Safety -/// Note this doesn't do any bound checking, for performance reason. -pub unsafe fn take_utf8_iter_unchecked>( - arr: &LargeUtf8Array, - indices: I, -) -> LargeUtf8Array { - match arr.null_count() { - 0 => { - let iter = indices - .into_iter() - .map(|idx| Some(arr.value_unchecked(idx))); - LargeUtf8Array::from_trusted_len_iter_unchecked(iter) - } - _ => { - let iter = indices.into_iter().map(|idx| { - if arr.is_null(idx) { - None - } else { - Some(arr.value_unchecked(idx)) - } - }); - LargeUtf8Array::from_trusted_len_iter_unchecked(iter) - } - } -} - -/// # Safety -/// Note this doesn't do any bound checking, for performance reason. 
-pub unsafe fn take_utf8_opt_iter_unchecked>>( - arr: &LargeUtf8Array, - indices: I, -) -> LargeUtf8Array { - match arr.null_count() { - 0 => { - let iter = indices - .into_iter() - .map(|opt_idx| opt_idx.map(|idx| arr.value_unchecked(idx))); - - LargeUtf8Array::from_trusted_len_iter_unchecked(iter) - } - _ => { - let iter = indices.into_iter().map(|opt_idx| { - opt_idx.and_then(|idx| { - if arr.is_null(idx) { - None - } else { - Some(arr.value_unchecked(idx)) - } - }) - }); - - LargeUtf8Array::from_trusted_len_iter_unchecked(iter) - } - } -} diff --git a/common/datavalues/src/data_group_value.rs b/common/datavalues/src/data_group_value.rs index b9a487eef31bb..5aedd58772b58 100644 --- a/common/datavalues/src/data_group_value.rs +++ b/common/datavalues/src/data_group_value.rs @@ -34,7 +34,7 @@ pub enum DataGroupValue { Int16(i16), Int32(i32), Int64(i64), - String(Box>), + String(Vec), Boolean(bool), } @@ -54,7 +54,7 @@ impl TryFrom<&DataValue> for DataGroupValue { DataValue::UInt16(Some(v)) => DataGroupValue::UInt16(*v), DataValue::UInt32(Some(v)) => DataGroupValue::UInt32(*v), DataValue::UInt64(Some(v)) => DataGroupValue::UInt64(*v), - DataValue::String(Some(v)) => DataGroupValue::String(Box::new(v.clone())), + DataValue::String(Some(v)) => DataGroupValue::String(v.clone()), DataValue::Float32(None) | DataValue::Float64(None) diff --git a/common/datavalues/src/data_value.rs b/common/datavalues/src/data_value.rs index 70a6afb922dfe..5ce499ca6e485 100644 --- a/common/datavalues/src/data_value.rs +++ b/common/datavalues/src/data_value.rs @@ -367,13 +367,15 @@ impl fmt::Display for DataValue { DataValue::UInt32(v) => format_data_value_with_option!(f, v), DataValue::UInt64(v) => format_data_value_with_option!(f, v), DataValue::String(None) => write!(f, "NULL"), - DataValue::String(Some(v)) => { - for c in v { - write!(f, "{:02x}", c)?; + DataValue::String(Some(v)) => match std::str::from_utf8(v) { + Ok(v) => write!(f, "{}", v), + Err(_e) => { + for c in v { + write!(f, 
"{:02x}", c)?; + } + Ok(()) } - Ok(()) - } - + }, DataValue::List(None, ..) => write!(f, "NULL"), DataValue::List(Some(v), ..) => { write!( diff --git a/common/datavalues/src/series/arithmetic_test.rs b/common/datavalues/src/series/arithmetic_test.rs index 9588ccaaa7389..99c7dff41ea3f 100644 --- a/common/datavalues/src/series/arithmetic_test.rs +++ b/common/datavalues/src/series/arithmetic_test.rs @@ -108,7 +108,9 @@ fn test_arithmetic_series() { Series::new(vec![5.0f64, 5.0, 5.0, 5.0]), Series::new(vec![5.0f64, 5.0, 5.0, 5.0]), ], - error: vec!["Code: 10, displayText = DataValue Error: Unsupported (Utf8) plus (Utf8)."], + error: vec![ + "Code: 10, displayText = DataValue Error: Unsupported (String) plus (String).", + ], }, ArrayTest { name: "minus-passed", @@ -149,7 +151,7 @@ fn test_arithmetic_series() { Series::new(vec![3.0f64, 1.0, -1.0, -3.0]), ], error: vec![ - "Code: 10, displayText = DataValue Error: Unsupported (Utf8) minus (Utf8).", + "Code: 10, displayText = DataValue Error: Unsupported (String) minus (String).", ], }, ArrayTest { @@ -189,7 +191,7 @@ fn test_arithmetic_series() { Series::new(vec![4.0f64, 6.0, 6.0, 4.0]), ], error: vec![ - "Code: 10, displayText = DataValue Error: Unsupported (Utf8) multiply (Utf8).", + "Code: 10, displayText = DataValue Error: Unsupported (String) multiply (String).", ], }, ArrayTest { @@ -253,7 +255,7 @@ fn test_arithmetic_series() { Series::new(vec![4.0, 1.5, 0.6666666666666666, 0.25]), ], error: vec![ - "Code: 10, displayText = DataValue Error: Unsupported (Utf8) divide (Utf8).", + "Code: 10, displayText = DataValue Error: Unsupported (String) divide (String).", ], }, ArrayTest { @@ -274,7 +276,7 @@ fn test_arithmetic_series() { Series::new(vec![0i64, 0, 0, 0]), ], error: vec![ - "Code: 10, displayText = DataValue Error: Unsupported (Utf8) modulo (Utf8).", + "Code: 10, displayText = DataValue Error: Unsupported (String) modulo (String).", ], }, ]; diff --git a/common/datavalues/src/series/series_impl.rs 
b/common/datavalues/src/series/series_impl.rs index 254bec6ebbd66..f80e047d4e96b 100644 --- a/common/datavalues/src/series/series_impl.rs +++ b/common/datavalues/src/series/series_impl.rs @@ -214,6 +214,18 @@ macro_rules! impl_from { }; } +impl<'a, T: AsRef<[&'a str]>> SeriesFrom for Series { + fn new(v: T) -> Self { + DFStringArray::new_from_slice(v.as_ref()).into_series() + } +} + +impl<'a, T: AsRef<[Option<&'a str>]>> SeriesFrom]> for Series { + fn new(v: T) -> Self { + DFStringArray::new_from_opt_slice(v.as_ref()).into_series() + } +} + impl<'a, T: AsRef<[&'a [u8]]>> SeriesFrom for Series { fn new(v: T) -> Self { DFStringArray::new_from_slice(v.as_ref()).into_series() diff --git a/common/functions/src/scalars/expressions/cast_test.rs b/common/functions/src/scalars/expressions/cast_test.rs index 75c6d0e2d0606..362321c1d1b80 100644 --- a/common/functions/src/scalars/expressions/cast_test.rs +++ b/common/functions/src/scalars/expressions/cast_test.rs @@ -88,7 +88,7 @@ fn test_cast_function() -> Result<()> { }, ]; - let dummy = DataField::new("dummy", DataType::Utf8, false); + let dummy = DataField::new("dummy", DataType::String, false); for t in tests { let rows = t.columns[0].len(); diff --git a/common/functions/src/scalars/strings/substring_test.rs b/common/functions/src/scalars/strings/substring_test.rs index b1ea07abdd1c0..4e8ea3333c642 100644 --- a/common/functions/src/scalars/strings/substring_test.rs +++ b/common/functions/src/scalars/strings/substring_test.rs @@ -34,7 +34,7 @@ fn test_substring_function() -> Result<()> { } let schema = DataSchemaRefExt::create(vec![ - DataField::new("a", DataType::Utf8, false), + DataField::new("a", DataType::String, false), DataField::new("b", DataType::Int64, false), DataField::new("c", DataType::UInt64, false), ]); diff --git a/common/functions/src/scalars/udfs/database_test.rs b/common/functions/src/scalars/udfs/database_test.rs index 16862302baf06..aa84cdd8c366f 100644 --- 
a/common/functions/src/scalars/udfs/database_test.rs +++ b/common/functions/src/scalars/udfs/database_test.rs @@ -30,7 +30,7 @@ fn test_database_function() -> Result<()> { error: &'static str, func: Box, } - let dummy = DataField::new("dummy", DataType::Utf8, false); + let dummy = DataField::new("dummy", DataType::String, false); let tests = vec![Test { name: "database-function-passed", diff --git a/common/functions/src/scalars/udfs/version_test.rs b/common/functions/src/scalars/udfs/version_test.rs index 5856b90b22119..18a081acd3f57 100644 --- a/common/functions/src/scalars/udfs/version_test.rs +++ b/common/functions/src/scalars/udfs/version_test.rs @@ -47,7 +47,7 @@ fn test_version_function() -> Result<()> { error: "", }]; - let dummy = DataField::new("dummy", DataType::Utf8, false); + let dummy = DataField::new("dummy", DataType::String, false); for t in tests { let rows = t.columns[0].len(); let func = t.func; diff --git a/common/indexing/src/index_min_max_test.rs b/common/indexing/src/index_min_max_test.rs index 53ff1e644990a..7cf33792d990d 100644 --- a/common/indexing/src/index_min_max_test.rs +++ b/common/indexing/src/index_min_max_test.rs @@ -31,7 +31,7 @@ use crate::MinMaxIndex; #[test] fn test_min_max_index() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ - DataField::new("name", DataType::Utf8, true), + DataField::new("name", DataType::String, true), DataField::new("age", DataType::Int32, false), ]); @@ -48,8 +48,8 @@ fn test_min_max_index() -> Result<()> { let idx_slice = vec![ MinMaxIndex { col: "name".to_string(), - min: DataValue::Utf8(Some("jack".to_string())), - max: DataValue::Utf8(Some("xbohu".to_string())), + min: DataValue::String(Some("jack".as_bytes().to_vec())), + max: DataValue::String(Some("xbohu".as_bytes().to_vec())), version: IndexSchemaVersion::V1, }, MinMaxIndex { diff --git a/common/indexing/src/index_partition_test.rs b/common/indexing/src/index_partition_test.rs index c9d6d7051c942..eadb091d53f32 100644 --- 
a/common/indexing/src/index_partition_test.rs +++ b/common/indexing/src/index_partition_test.rs @@ -25,8 +25,8 @@ use crate::PartitionIndex; fn test_partition_index() -> Result<()> { // Apply index. { - let partition_value = DataValue::Utf8(Some("datafuse".to_string())); - let expr = col("name").eq(lit("bohu")); + let partition_value = DataValue::String(Some("datafuse".as_bytes().to_vec())); + let expr = col("name").eq(lit("bohu".as_bytes())); let actual = PartitionIndex::apply_index(partition_value, &expr)?; let expected = true; assert_eq!(actual, expected); diff --git a/common/indexing/src/index_sparse_test.rs b/common/indexing/src/index_sparse_test.rs index 1c86b1dc5e13f..4c71fb9702196 100644 --- a/common/indexing/src/index_sparse_test.rs +++ b/common/indexing/src/index_sparse_test.rs @@ -32,7 +32,7 @@ use crate::SparseIndexValue; #[test] fn test_sparse_index() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ - DataField::new("name", DataType::Utf8, true), + DataField::new("name", DataType::String, true), DataField::new("age", DataType::Int32, false), ]); @@ -51,13 +51,13 @@ fn test_sparse_index() -> Result<()> { col: "name".to_string(), values: vec![ SparseIndexValue { - min: DataValue::Utf8(Some("jack".to_string())), - max: DataValue::Utf8(Some("bohu".to_string())), + min: DataValue::String(Some("jack".as_bytes().to_vec())), + max: DataValue::String(Some("bohu".as_bytes().to_vec())), page_no: 0, }, SparseIndexValue { - min: DataValue::Utf8(Some("xjack".to_string())), - max: DataValue::Utf8(Some("xbohu".to_string())), + min: DataValue::String(Some("xjack".as_bytes().to_vec())), + max: DataValue::String(Some("xbohu".as_bytes().to_vec())), page_no: 1, }, ], diff --git a/common/planners/src/plan_describe_table_test.rs b/common/planners/src/plan_describe_table_test.rs index 8a5592945e25c..814dc2ed1ac40 100644 --- a/common/planners/src/plan_describe_table_test.rs +++ b/common/planners/src/plan_describe_table_test.rs @@ -23,9 +23,9 @@ use crate::*; #[test] 
fn test_describe_table_plan() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ - DataField::new("Field", DataType::Utf8, false), - DataField::new("Type", DataType::Utf8, false), - DataField::new("Null", DataType::Utf8, false), + DataField::new("Field", DataType::String, false), + DataField::new("Type", DataType::String, false), + DataField::new("Null", DataType::String, false), ]); let describe = PlanNode::DescribeTable(DescribeTablePlan { @@ -36,9 +36,9 @@ fn test_describe_table_plan() -> Result<()> { let expect = "\ DataSchema { fields: [\ - DataField { name: \"Field\", data_type: Utf8, nullable: false }, \ - DataField { name: \"Type\", data_type: Utf8, nullable: false }, \ - DataField { name: \"Null\", data_type: Utf8, nullable: false }\ + DataField { name: \"Field\", data_type: String, nullable: false }, \ + DataField { name: \"Type\", data_type: String, nullable: false }, \ + DataField { name: \"Null\", data_type: String, nullable: false }\ ] }"; let actual = format!("{:?}", describe.schema()); assert_eq!(expect, actual); diff --git a/common/planners/src/plan_explain_test.rs b/common/planners/src/plan_explain_test.rs index fda6357462a52..fd4bca8c53bf9 100644 --- a/common/planners/src/plan_explain_test.rs +++ b/common/planners/src/plan_explain_test.rs @@ -43,7 +43,7 @@ fn test_explain_plan() -> Result<()> { assert_eq!(expect, actual); assert_eq!(explain.schema().fields().clone(), vec![DataField::new( "explain", - DataType::Utf8, + DataType::String, false )]); diff --git a/common/planners/src/plan_expression_test.rs b/common/planners/src/plan_expression_test.rs index 531e171472435..b358e3b394a3d 100644 --- a/common/planners/src/plan_expression_test.rs +++ b/common/planners/src/plan_expression_test.rs @@ -25,7 +25,7 @@ use crate::*; fn test_expression_plan_format() -> Result<()> { use pretty_assertions::assert_eq; - let schema = DataSchemaRefExt::create(vec![DataField::new("a", DataType::Utf8, false)]); + let schema = 
DataSchemaRefExt::create(vec![DataField::new("a", DataType::String, false)]); let empty_plan = EmptyPlan::create_with_schema(schema.clone()); let expression = PlanNode::Expression(ExpressionPlan { @@ -35,7 +35,7 @@ fn test_expression_plan_format() -> Result<()> { desc: "".to_string(), }); let _ = expression.schema(); - let expect = "Expression: a:Utf8 ()"; + let expect = "Expression: a:String ()"; let actual = format!("{:?}", expression); assert_eq!(expect, actual); Ok(()) diff --git a/common/planners/src/plan_projection_test.rs b/common/planners/src/plan_projection_test.rs index 8463e8d39029a..d7c59b0aea849 100644 --- a/common/planners/src/plan_projection_test.rs +++ b/common/planners/src/plan_projection_test.rs @@ -23,16 +23,16 @@ use crate::*; fn test_projection_plan() -> Result<()> { use pretty_assertions::assert_eq; - let schema = DataSchemaRefExt::create(vec![DataField::new("a", DataType::Utf8, false)]); + let schema = DataSchemaRefExt::create(vec![DataField::new("a", DataType::String, false)]); let empty_plan = EmptyPlan::create_with_schema(schema.clone()); let projection = PlanNode::Projection(ProjectionPlan { expr: vec![col("a")], - schema: DataSchemaRefExt::create(vec![DataField::new("a", DataType::Utf8, false)]), + schema: DataSchemaRefExt::create(vec![DataField::new("a", DataType::String, false)]), input: Arc::from(PlanBuilder::from(&PlanNode::Empty(empty_plan)).build()?), }); let _ = projection.schema(); - let expect = "Projection: a:Utf8"; + let expect = "Projection: a:String"; let actual = format!("{:?}", projection); assert_eq!(expect, actual); Ok(()) diff --git a/common/planners/src/plan_scan_test.rs b/common/planners/src/plan_scan_test.rs index 4b6bb1ece894e..b74f891bb92be 100644 --- a/common/planners/src/plan_scan_test.rs +++ b/common/planners/src/plan_scan_test.rs @@ -25,11 +25,11 @@ fn test_scan_plan() -> Result<()> { schema_name: "scan_test".to_string(), table_id: 0, table_version: None, - table_schema: 
DataSchemaRefExt::create(vec![DataField::new("a", DataType::Utf8, false)]), + table_schema: DataSchemaRefExt::create(vec![DataField::new("a", DataType::String, false)]), table_args: None, projected_schema: DataSchemaRefExt::create(vec![DataField::new( "a", - DataType::Utf8, + DataType::String, false, )]), push_downs: Extras::default(), diff --git a/common/planners/src/plan_select_test.rs b/common/planners/src/plan_select_test.rs index 9acb5695e4330..9e59fad82d73f 100644 --- a/common/planners/src/plan_select_test.rs +++ b/common/planners/src/plan_select_test.rs @@ -23,12 +23,12 @@ use crate::*; fn test_select_wildcard_plan() -> Result<()> { use pretty_assertions::assert_eq; - let schema = DataSchemaRefExt::create(vec![DataField::new("a", DataType::Utf8, false)]); + let schema = DataSchemaRefExt::create(vec![DataField::new("a", DataType::String, false)]); let plan = PlanBuilder::create(schema).project(&[col("a")])?.build()?; let select = PlanNode::Select(SelectPlan { input: Arc::new(plan), }); - let expect = "Projection: a:Utf8"; + let expect = "Projection: a:String"; let actual = format!("{:?}", select); assert_eq!(expect, actual); diff --git a/common/streams/src/sources/source_test.rs b/common/streams/src/sources/source_test.rs index 3e081b55d5e77..c97af785a566b 100644 --- a/common/streams/src/sources/source_test.rs +++ b/common/streams/src/sources/source_test.rs @@ -28,7 +28,7 @@ fn test_parse_values() { let schema = DataSchemaRefExt::create(vec![ DataField::new("a", DataType::Int8, false), - DataField::new("b", DataType::Utf8, false), + DataField::new("b", DataType::String, false), DataField::new("c", DataType::Float64, false), ]); let mut values_source = ValueSource::new(buffer.as_bytes(), schema, 10); @@ -57,7 +57,7 @@ fn test_parse_csvs() { let schema = DataSchemaRefExt::create(vec![ DataField::new("a", DataType::Int8, false), - DataField::new("b", DataType::Utf8, false), + DataField::new("b", DataType::String, false), DataField::new("c", DataType::Float64, 
false), ]); let mut values_source = CsvSource::new(buffer.as_bytes(), schema, 10); diff --git a/common/streams/src/stream_datablock_test.rs b/common/streams/src/stream_datablock_test.rs index 63e74904e3109..dee165b7e606e 100644 --- a/common/streams/src/stream_datablock_test.rs +++ b/common/streams/src/stream_datablock_test.rs @@ -23,7 +23,7 @@ use crate::*; async fn test_datablock_stream() { let schema = DataSchemaRefExt::create(vec![ DataField::new("name", DataType::Int32, false), - DataField::new("age", DataType::Utf8, false), + DataField::new("age", DataType::String, false), ]); let data_blocks = vec![ diff --git a/common/streams/src/stream_limit_by.rs b/common/streams/src/stream_limit_by.rs index 924c3d0f0978e..61833667dab89 100644 --- a/common/streams/src/stream_limit_by.rs +++ b/common/streams/src/stream_limit_by.rs @@ -19,8 +19,8 @@ use std::task::Poll; use common_arrow::arrow; use common_arrow::arrow::array::BooleanArray; -use common_arrow::arrow::datatypes::DataType as ArrowType; use common_arrow::arrow::bitmap::MutableBitmap; +use common_arrow::arrow::datatypes::DataType as ArrowType; use common_datablocks::DataBlock; use common_datablocks::HashMethod; use common_datablocks::HashMethodSerializer; @@ -71,7 +71,7 @@ impl LimitByStream { } } - let array = BooleanArray::from_data(ArrowType::Boolean,filter_vec.into(), None); + let array = BooleanArray::from_data(ArrowType::Boolean, filter.into(), None); let batch = block.clone().try_into()?; let batch = arrow::compute::filter::filter_record_batch(&batch, &array)?; Some(batch.try_into()).transpose() diff --git a/common/streams/src/stream_limit_by_test.rs b/common/streams/src/stream_limit_by_test.rs index 618edd75482f4..5169d280a7ef9 100644 --- a/common/streams/src/stream_limit_by_test.rs +++ b/common/streams/src/stream_limit_by_test.rs @@ -24,7 +24,7 @@ use crate::*; async fn test_limitby_stream() -> Result<()> { let schema = DataSchemaRefExt::create(vec![ DataField::new("id", DataType::UInt8, false), - 
DataField::new("name", DataType::Utf8, false), + DataField::new("name", DataType::String, false), ]); let ids = vec![2u8, 2, 2, 2, 3, 3, 3]; diff --git a/common/streams/src/stream_skip_test.rs b/common/streams/src/stream_skip_test.rs index 157ed1c88e759..54db094cc8fc7 100644 --- a/common/streams/src/stream_skip_test.rs +++ b/common/streams/src/stream_skip_test.rs @@ -23,22 +23,28 @@ use crate::*; async fn test_skipstream() { let schema = DataSchemaRefExt::create(vec![ DataField::new("id", DataType::Int32, false), - DataField::new("name", DataType::Utf8, false), + DataField::new("name", DataType::String, false), ]); // create a data block with 'id' from 0 to 20 let ids = (0..20).collect::>(); let names = (0..20) - .map(|n| format!("Alice-{}", n)) - .collect::>(); + .map(|n| { + let str = format!("Alice-{}", n); + str.into_bytes() + }) + .collect::>>(); let block0 = DataBlock::create_by_array(schema.clone(), vec![Series::new(ids), Series::new(names)]); // create a data block with 'id' from 20 to 40 let ids = (20..40).collect::>(); let names = (20..40) - .map(|n| format!("Bob-{}", n)) - .collect::>(); + .map(|n| { + let str = format!("Bob-{}", n); + str.into_bytes() + }) + .collect::>>(); let block1 = DataBlock::create_by_array(schema.clone(), vec![Series::new(ids), Series::new(names)]); diff --git a/query/src/interpreters/interpreter_describe_table_test.rs b/query/src/interpreters/interpreter_describe_table_test.rs index 95e98429d1347..52567a8184ccc 100644 --- a/query/src/interpreters/interpreter_describe_table_test.rs +++ b/query/src/interpreters/interpreter_describe_table_test.rs @@ -51,7 +51,7 @@ async fn interpreter_describe_table_test() -> Result<()> { "+-------+--------+------+", "| a | Int64 | NO |", "| b | Int32 | NO |", - "| c | Utf8 | NO |", + "| c | String | NO |", "| d | Int16 | NO |", "| e | Date16 | NO |", "+-------+--------+------+", diff --git a/query/src/interpreters/interpreter_show_create_table_test.rs 
b/query/src/interpreters/interpreter_show_create_table_test.rs index 2a2e4625479eb..15debd3aa1a7e 100644 --- a/query/src/interpreters/interpreter_show_create_table_test.rs +++ b/query/src/interpreters/interpreter_show_create_table_test.rs @@ -51,7 +51,7 @@ async fn interpreter_show_create_table_test() -> Result<()> { "| a | CREATE TABLE `a` ( |", "| | `a` Int64, |", "| | `b` Int32, |", - "| | `c` Utf8, |", + "| | `c` String, |", "| | `d` Int16, |", "| | `e` Date16, |", "| | ) ENGINE=Null |", diff --git a/query/src/interpreters/interpreter_table_create_test.rs b/query/src/interpreters/interpreter_table_create_test.rs index 8df2190f224f3..39e8d95cadf02 100644 --- a/query/src/interpreters/interpreter_table_create_test.rs +++ b/query/src/interpreters/interpreter_table_create_test.rs @@ -34,7 +34,7 @@ async fn test_create_table_interpreter() -> Result<()> { assert_eq!(plan.schema().field_with_name("a")?.data_type(), &DataType::Int64); assert_eq!(plan.schema().field_with_name("b")?.data_type(), &DataType::Int32); - assert_eq!(plan.schema().field_with_name("c")?.data_type(), &DataType::Utf8); + assert_eq!(plan.schema().field_with_name("c")?.data_type(), &DataType::String); assert_eq!(plan.schema().field_with_name("d")?.data_type(), &DataType::Int16); assert_eq!(plan.schema().field_with_name("e")?.data_type(), &DataType::Date16); diff --git a/query/src/optimizers/optimizer_constant_folding_test.rs b/query/src/optimizers/optimizer_constant_folding_test.rs index 087e5ed614959..54cc42778c1c4 100644 --- a/query/src/optimizers/optimizer_constant_folding_test.rs +++ b/query/src/optimizers/optimizer_constant_folding_test.rs @@ -81,7 +81,7 @@ mod tests { query: "SELECT sipHash('test_string')", expect: "\ Projection: sipHash('test_string'):UInt64\ - \n Expression: 17123704338732264132:UInt64 (Before Projection)\ + \n Expression: 15735157695654173841:UInt64 (Before Projection)\ \n ReadDataSource: scan partitions: [1], scan schema: [dummy:UInt8], statistics: [read_rows: 1, 
read_bytes: 1]", }, Test { @@ -96,16 +96,16 @@ mod tests { name: "Projection strings const recursion", query: "SELECT SUBSTRING('1234567890' FROM 3 FOR 3)", expect: "\ - Projection: substring('1234567890', 3, 3):Utf8\ - \n Expression: 345:Utf8 (Before Projection)\ + Projection: substring('1234567890', 3, 3):String\ + \n Expression: 345:String (Before Projection)\ \n ReadDataSource: scan partitions: [1], scan schema: [dummy:UInt8], statistics: [read_rows: 1, read_bytes: 1]", }, Test { name: "Projection to type name const recursion", query: "SELECT toTypeName('1234567890')", expect: "\ - Projection: toTypeName('1234567890'):Utf8\ - \n Expression: Utf8:Utf8 (Before Projection)\ + Projection: toTypeName('1234567890'):String\ + \n Expression: String:String (Before Projection)\ \n ReadDataSource: scan partitions: [1], scan schema: [dummy:UInt8], statistics: [read_rows: 1, read_bytes: 1]", }, ]; diff --git a/query/src/optimizers/optimizer_projection_push_down_test.rs b/query/src/optimizers/optimizer_projection_push_down_test.rs index 49968d00bca09..92acb70fe8263 100644 --- a/query/src/optimizers/optimizer_projection_push_down_test.rs +++ b/query/src/optimizers/optimizer_projection_push_down_test.rs @@ -29,16 +29,16 @@ fn test_projection_push_down_optimizer_1() -> Result<()> { let ctx = crate::tests::try_create_context()?; let schema = DataSchemaRefExt::create(vec![ - DataField::new("a", DataType::Utf8, false), - DataField::new("b", DataType::Utf8, false), - DataField::new("c", DataType::Utf8, false), - DataField::new("d", DataType::Utf8, false), + DataField::new("a", DataType::String, false), + DataField::new("b", DataType::String, false), + DataField::new("c", DataType::String, false), + DataField::new("d", DataType::String, false), ]); let output_schema = DataSchemaRefExt::create(vec![ - DataField::new("a", DataType::Utf8, false), - DataField::new("b", DataType::Utf8, false), - DataField::new("c", DataType::Utf8, false), + DataField::new("a", DataType::String, false), + 
DataField::new("b", DataType::String, false), + DataField::new("c", DataType::String, false), ]); let plan = PlanNode::Projection(ProjectionPlan { @@ -53,7 +53,7 @@ fn test_projection_push_down_optimizer_1() -> Result<()> { let optimized = projection_push_down.optimize(&plan)?; let expect = "\ - Projection: a:Utf8, b:Utf8, c:Utf8"; + Projection: a:String, b:String, c:String"; let actual = format!("{:?}", optimized); assert_eq!(expect, actual); @@ -72,10 +72,10 @@ fn test_projection_push_down_optimizer_group_by() -> Result<()> { let optimized = project_push_down.optimize(&plan)?; let expect = "\ - Projection: max(value) as c1:Utf8, name as c2:Utf8\ + Projection: max(value) as c1:String, name as c2:String\ \n AggregatorFinal: groupBy=[[name]], aggr=[[max(value)]]\ \n AggregatorPartial: groupBy=[[name]], aggr=[[max(value)]]\ - \n ReadDataSource: scan partitions: [1], scan schema: [name:Utf8, value:Utf8], statistics: [read_rows: 0, read_bytes: 0]"; + \n ReadDataSource: scan partitions: [1], scan schema: [name:String, value:String], statistics: [read_rows: 0, read_bytes: 0]"; let actual = format!("{:?}", optimized); assert_eq!(expect, actual); @@ -96,9 +96,9 @@ fn test_projection_push_down_optimizer_2() -> Result<()> { table_id: 0, table_version: None, schema: DataSchemaRefExt::create(vec![ - DataField::new("a", DataType::Utf8, false), - DataField::new("b", DataType::Utf8, false), - DataField::new("c", DataType::Utf8, false), + DataField::new("a", DataType::String, false), + DataField::new("b", DataType::String, false), + DataField::new("c", DataType::String, false), ]), parts: generate_partitions(8, total as u64), statistics: statistics.clone(), @@ -118,7 +118,7 @@ fn test_projection_push_down_optimizer_2() -> Result<()> { let plan = PlanNode::Projection(ProjectionPlan { expr: vec![col("a")], - schema: DataSchemaRefExt::create(vec![DataField::new("a", DataType::Utf8, false)]), + schema: DataSchemaRefExt::create(vec![DataField::new("a", DataType::String, false)]), 
input: Arc::from(filter_plan), }); @@ -126,9 +126,9 @@ fn test_projection_push_down_optimizer_2() -> Result<()> { let optimized = projection_push_down.optimize(&plan)?; let expect = "\ - Projection: a:Utf8\ + Projection: a:String\ \n Filter: ((a > 6) and (b <= 10))\ - \n ReadDataSource: scan partitions: [8], scan schema: [a:Utf8, b:Utf8], statistics: [read_rows: 10000, read_bytes: 80000]"; + \n ReadDataSource: scan partitions: [8], scan schema: [a:String, b:String], statistics: [read_rows: 10000, read_bytes: 80000]"; let actual = format!("{:?}", optimized); assert_eq!(expect, actual); @@ -149,13 +149,13 @@ fn test_projection_push_down_optimizer_3() -> Result<()> { table_id: 0, table_version: None, schema: DataSchemaRefExt::create(vec![ - DataField::new("a", DataType::Utf8, false), - DataField::new("b", DataType::Utf8, false), - DataField::new("c", DataType::Utf8, false), - DataField::new("d", DataType::Utf8, false), - DataField::new("e", DataType::Utf8, false), - DataField::new("f", DataType::Utf8, false), - DataField::new("g", DataType::Utf8, false), + DataField::new("a", DataType::String, false), + DataField::new("b", DataType::String, false), + DataField::new("c", DataType::String, false), + DataField::new("d", DataType::String, false), + DataField::new("e", DataType::String, false), + DataField::new("f", DataType::String, false), + DataField::new("g", DataType::String, false), ]), parts: generate_partitions(8, total as u64), statistics: statistics.clone(), @@ -186,14 +186,14 @@ fn test_projection_push_down_optimizer_3() -> Result<()> { let optimized = projection_push_down.optimize(&plan)?; let expect = "\ - Projection: a:Utf8\ + Projection: a:String\ \n Limit: 10\ - \n Sort: c:Utf8\ + \n Sort: c:String\ \n Having: (a < 10)\ \n AggregatorFinal: groupBy=[[a, c]], aggr=[[]]\ \n AggregatorPartial: groupBy=[[a, c]], aggr=[[]]\ \n Filter: (b = 10)\ - \n ReadDataSource: scan partitions: [8], scan schema: [a:Utf8, b:Utf8, c:Utf8], statistics: [read_rows: 10000, 
read_bytes: 80000]"; + \n ReadDataSource: scan partitions: [8], scan schema: [a:String, b:String, c:String], statistics: [read_rows: 10000, read_bytes: 80000]"; let actual = format!("{:?}", optimized); assert_eq!(expect, actual); @@ -211,9 +211,9 @@ fn test_projection_push_down_optimizer_4() -> Result<()> { let mut project_push_down = ProjectionPushDownOptimizer::create(ctx); let optimized = project_push_down.optimize(&plan)?; - let expect = "Projection: substring(value, 1, 3) as c1:Utf8\ - \n Expression: substring(value, 1, 3):Utf8 (Before Projection)\ - \n ReadDataSource: scan partitions: [1], scan schema: [value:Utf8], statistics: [read_rows: 0, read_bytes: 0]"; + let expect = "Projection: substring(value, 1, 3) as c1:String\ + \n Expression: substring(value, 1, 3):String (Before Projection)\ + \n ReadDataSource: scan partitions: [1], scan schema: [value:String], statistics: [read_rows: 0, read_bytes: 0]"; let actual = format!("{:?}", optimized); assert_eq!(expect, actual); diff --git a/query/src/optimizers/optimizer_statistics_exact_test.rs b/query/src/optimizers/optimizer_statistics_exact_test.rs index b5a62d2ca5a68..4df90f723fdf6 100644 --- a/query/src/optimizers/optimizer_statistics_exact_test.rs +++ b/query/src/optimizers/optimizer_statistics_exact_test.rs @@ -39,9 +39,9 @@ mod tests { table_id: 0, table_version: None, schema: DataSchemaRefExt::create(vec![ - DataField::new("a", DataType::Utf8, false), - DataField::new("b", DataType::Utf8, false), - DataField::new("c", DataType::Utf8, false), + DataField::new("a", DataType::String, false), + DataField::new("b", DataType::String, false), + DataField::new("c", DataType::String, false), ]), parts: generate_partitions(8, total as u64), statistics: statistics.clone(), diff --git a/query/src/pipelines/transforms/transform_group_by_partial_test.rs b/query/src/pipelines/transforms/transform_group_by_partial_test.rs index 39fdb90fcba73..554012e40574c 100644 --- 
a/query/src/pipelines/transforms/transform_group_by_partial_test.rs +++ b/query/src/pipelines/transforms/transform_group_by_partial_test.rs @@ -61,15 +61,15 @@ async fn test_transform_partial_group_by() -> Result<()> { // SELECT SUM(number), AVG(number), number ... GROUP BY number; // binary-state let expected = vec![ - "+--------------------+----------------------------------+---------------+", - "| sum(number) | avg(number) | _group_by_key |", - "+--------------------+----------------------------------+---------------+", - "| 010000000000000000 | 00000000000000000100000000000000 | 0 |", - "| 010100000000000000 | 01000000000000000100000000000000 | 1 |", - "| 010200000000000000 | 02000000000000000100000000000000 | 2 |", - "| 010300000000000000 | 03000000000000000100000000000000 | 3 |", - "| 010400000000000000 | 04000000000000000100000000000000 | 4 |", - "+--------------------+----------------------------------+---------------+", + "+-------------+-------------+---------------+", + "| sum(number) | avg(number) | _group_by_key |", + "+-------------+-------------+---------------+", + "| \u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | \u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | 0 |", + "| \u{1}\u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | \u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | 1 |", + "| \u{1}\u{2}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | \u{2}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | 2 |", + "| \u{1}\u{3}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | \u{3}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | 3 |", + "| \u{1}\u{4}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | \u{4}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{1}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0} | 4 |", + "+-------------+-------------+---------------+", ]; common_datablocks::assert_blocks_sorted_eq(expected, result.as_slice()); diff --git a/query/src/sessions/settings.rs 
b/query/src/sessions/settings.rs index 53e5d00e54abf..c75d321d2a416 100644 --- a/query/src/sessions/settings.rs +++ b/query/src/sessions/settings.rs @@ -27,11 +27,11 @@ pub struct Settings { impl Settings { apply_macros! { apply_getter_setter_settings, apply_initial_settings, apply_update_settings, - ("max_block_size", u64, 10000, "Maximum block size for reading".as_bytes().to_vec()), - ("max_threads", u64, 16, "The maximum number of threads to execute the request. By default, it is determined automatically.".as_bytes().to_vec()), - ("flight_client_timeout", u64, 60, "Max duration the flight client request is allowed to take in seconds. By default, it is 60 seconds".as_bytes().to_vec()), - ("min_distributed_rows", u64, 100000000, "Minimum distributed read rows. In cluster mode, when read rows exceeds this value, the local table converted to distributed query.".as_bytes().to_vec()), - ("min_distributed_bytes", u64, 500 * 1024 * 1024, "Minimum distributed read bytes. In cluster mode, when read bytes exceeds this value, the local table converted to distributed query.".as_bytes().to_vec()) + ("max_block_size", u64, 10000, "Maximum block size for reading"), + ("max_threads", u64, 16, "The maximum number of threads to execute the request. By default, it is determined automatically."), + ("flight_client_timeout", u64, 60, "Max duration the flight client request is allowed to take in seconds. By default, it is 60 seconds"), + ("min_distributed_rows", u64, 100000000, "Minimum distributed read rows. In cluster mode, when read rows exceeds this value, the local table converted to distributed query."), + ("min_distributed_bytes", u64, 500 * 1024 * 1024, "Minimum distributed read bytes. 
In cluster mode, when read bytes exceeds this value, the local table converted to distributed query.") } pub fn try_create() -> Result> { @@ -68,12 +68,12 @@ impl SettingsBase { // TODO, to use macro generate this codes #[allow(unused)] - pub fn try_set_u64(&self, key: &'static str, val: u64, desc: Vec) -> Result<()> { + pub fn try_set_u64(&self, key: &'static str, val: u64, desc: &str) -> Result<()> { let mut settings = self.settings.write(); let setting_val = DataValue::Struct(vec![ DataValue::UInt64(Some(val)), DataValue::UInt64(Some(val)), - DataValue::String(Some(desc)), + DataValue::String(Some(desc.as_bytes().to_vec())), ]); settings.insert(key, setting_val); Ok(()) @@ -117,12 +117,12 @@ impl SettingsBase { } #[allow(unused)] - pub fn try_set_i64(&self, key: &'static str, val: i64, desc: Vec) -> Result<()> { + pub fn try_set_i64(&self, key: &'static str, val: i64, desc: &str) -> Result<()> { let mut settings = self.settings.write(); let setting_val = DataValue::Struct(vec![ DataValue::Int64(Some(val)), DataValue::Int64(Some(val)), - DataValue::String(Some(desc)), + DataValue::String(Some(desc.as_bytes().to_vec())), ]); settings.insert(key, setting_val); Ok(()) @@ -166,12 +166,12 @@ impl SettingsBase { } #[allow(unused)] - pub fn try_set_f64(&self, key: &'static str, val: f64, desc: Vec) -> Result<()> { + pub fn try_set_f64(&self, key: &'static str, val: f64, desc: &str) -> Result<()> { let mut settings = self.settings.write(); let setting_val = DataValue::Struct(vec![ DataValue::Float64(Some(val)), DataValue::Float64(Some(val)), - DataValue::String(Some(desc)), + DataValue::String(Some(desc.as_bytes().to_vec())), ]); settings.insert(key, setting_val); Ok(()) @@ -215,20 +215,20 @@ impl SettingsBase { } #[allow(unused)] - pub fn try_set_string(&self, key: &'static str, val: Vec, desc: Vec) -> Result<()> { + pub fn try_set_string(&self, key: &'static str, val: &str, desc: &str) -> Result<()> { let mut settings = self.settings.write(); - let default_value = 
val.clone(); + let default_value = val; let setting_val = DataValue::Struct(vec![ - DataValue::String(Some(val)), - DataValue::String(Some(default_value)), - DataValue::String(Some(desc)), + DataValue::String(Some(val.as_bytes().to_vec())), + DataValue::String(Some(default_value.as_bytes().to_vec())), + DataValue::String(Some(desc.as_bytes().to_vec())), ]); settings.insert(key, setting_val); Ok(()) } #[allow(unused)] - pub fn try_update_string(&self, key: &'static str, val: Vec) -> Result<()> { + pub fn try_update_string(&self, key: &'static str, val: &str) -> Result<()> { let mut settings = self.settings.write(); let setting_val = settings .get(key) @@ -236,7 +236,7 @@ impl SettingsBase { if let DataValue::Struct(values) = setting_val { let v = DataValue::Struct(vec![ - DataValue::String(Some(val)), + DataValue::String(Some(val.as_bytes().to_vec())), values[1].clone(), values[2].clone(), ]); diff --git a/query/src/sql/plan_parser_test.rs b/query/src/sql/plan_parser_test.rs index ab5c07f1a0527..de8ce735567b0 100644 --- a/query/src/sql/plan_parser_test.rs +++ b/query/src/sql/plan_parser_test.rs @@ -54,13 +54,13 @@ fn test_plan_parser() -> Result<()> { Test { name: "create-table-passed", sql: "CREATE TABLE t(c1 int, c2 bigint, c3 varchar(255) ) ENGINE = Parquet location = 'foo.parquet' ", - expect: "Create table default.t DataField { name: \"c1\", data_type: Int32, nullable: false }, DataField { name: \"c2\", data_type: Int64, nullable: false }, DataField { name: \"c3\", data_type: Utf8, nullable: false }, engine: Parquet, if_not_exists:false, option: {\"location\": \"foo.parquet\"}", + expect: "Create table default.t DataField { name: \"c1\", data_type: Int32, nullable: false }, DataField { name: \"c2\", data_type: Int64, nullable: false }, DataField { name: \"c3\", data_type: String, nullable: false }, engine: Parquet, if_not_exists:false, option: {\"location\": \"foo.parquet\"}", error: "", }, Test { name: "create-table-if-not-exists-passed", sql: "CREATE TABLE IF 
NOT EXISTS t(c1 int, c2 bigint, c3 varchar(255) ) ENGINE = Parquet location = 'foo.parquet' ", - expect: "Create table default.t DataField { name: \"c1\", data_type: Int32, nullable: false }, DataField { name: \"c2\", data_type: Int64, nullable: false }, DataField { name: \"c3\", data_type: Utf8, nullable: false }, engine: Parquet, if_not_exists:true, option: {\"location\": \"foo.parquet\"}", + expect: "Create table default.t DataField { name: \"c1\", data_type: Int32, nullable: false }, DataField { name: \"c2\", data_type: Int64, nullable: false }, DataField { name: \"c3\", data_type: String, nullable: false }, engine: Parquet, if_not_exists:true, option: {\"location\": \"foo.parquet\"}", error: "", }, Test { @@ -108,7 +108,7 @@ fn test_plan_parser() -> Result<()> { Test { name: "database-passed", sql: "select database()", - expect: "Projection: database():Utf8\n Expression: database(default):Utf8 (Before Projection)\n ReadDataSource: scan partitions: [1], scan schema: [dummy:UInt8], statistics: [read_rows: 1, read_bytes: 1]", + expect: "Projection: database():String\n Expression: database(default):String (Before Projection)\n ReadDataSource: scan partitions: [1], scan schema: [dummy:UInt8], statistics: [read_rows: 1, read_bytes: 1]", error: "", }, Test { diff --git a/store/src/api/rpc/flight_service_test.rs b/store/src/api/rpc/flight_service_test.rs index 89dd64cfd3eb6..00447056354cf 100644 --- a/store/src/api/rpc/flight_service_test.rs +++ b/store/src/api/rpc/flight_service_test.rs @@ -474,7 +474,7 @@ async fn test_do_append() -> anyhow::Result<()> { let schema = Arc::new(DataSchema::new(vec![ DataField::new("col_i", DataType::Int64, false), - DataField::new("col_s", DataType::Utf8, false), + DataField::new("col_s", DataType::String, false), ])); let db_name = "test_db"; let tbl_name = "test_tbl"; @@ -546,7 +546,7 @@ async fn test_scan_partition() -> anyhow::Result<()> { let schema = Arc::new(DataSchema::new(vec![ DataField::new("col_i", DataType::Int64, false), 
- DataField::new("col_s", DataType::Utf8, false), + DataField::new("col_s", DataType::String, false), ])); let db_name = "test_db"; let tbl_name = "test_tbl"; diff --git a/store/src/data_part/appender_test.rs b/store/src/data_part/appender_test.rs index 4bdf6282a718c..288231741680a 100644 --- a/store/src/data_part/appender_test.rs +++ b/store/src/data_part/appender_test.rs @@ -37,7 +37,7 @@ mod test { fn test_in_memory_write() -> anyhow::Result<()> { let schema = Arc::new(DataSchema::new(vec![ DataField::new("col_i", DataType::Int64, false), - DataField::new("col_s", DataType::Utf8, false), + DataField::new("col_s", DataType::String, false), ])); let col0 = Series::new(vec![0 as i64, 1, 2]); @@ -64,7 +64,7 @@ mod test { #[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_append() -> anyhow::Result<()> { let col0: ArrayRef = Arc::new(Int64Array::from_values(vec![0, 1, 2])); - let col1: ArrayRef = Arc::new(LargeUtf8Array::from_iter_values( + let col1: ArrayRef = Arc::new(LargeBinaryArray::from_iter_values( vec!["str1", "str2", "str3"].iter(), )); diff --git a/tests/suites/0_stateless/02_0009_function_siphash64.result b/tests/suites/0_stateless/02_0009_function_siphash64.result index cf934d83124d4..30c1fe855859c 100644 --- a/tests/suites/0_stateless/02_0009_function_siphash64.result +++ b/tests/suites/0_stateless/02_0009_function_siphash64.result @@ -1,6 +1,6 @@ -9027491583908826579 +5091324831805182738 4952851536318644461 2854037594257667269 -9027491583908826579 +5091324831805182738 4952851536318644461 2854037594257667269 diff --git a/tests/suites/0_stateless/08_0000_optimizer_cluster.result b/tests/suites/0_stateless/08_0000_optimizer_cluster.result index 7d84397f96d74..28e145b3ff07e 100644 --- a/tests/suites/0_stateless/08_0000_optimizer_cluster.result +++ b/tests/suites/0_stateless/08_0000_optimizer_cluster.result @@ -8,6 +8,6 @@ RedistributeStage[expr: 0] Expression: ((number % 3) + 1):UInt16, (number + 1):UInt64 (Before GroupBy) 
ReadDataSource: scan partitions: [16], scan schema: [number:UInt64], statistics: [read_rows: 10000, read_bytes: 80000] projection push down: push (name and value) to read datasource -Projection: name:Utf8 +Projection: name:String Filter: (value > 10) - ReadDataSource: scan partitions: [1], scan schema: [name:Utf8, value:Utf8], statistics: [read_rows: 0, read_bytes: 0] + ReadDataSource: scan partitions: [1], scan schema: [name:String, value:String], statistics: [read_rows: 0, read_bytes: 0] diff --git a/website/datafuse/docs/sqlstatement/conversion-functions/cast.md b/website/datafuse/docs/sqlstatement/conversion-functions/cast.md index e0f7026d6567b..c7808783a7845 100644 --- a/website/datafuse/docs/sqlstatement/conversion-functions/cast.md +++ b/website/datafuse/docs/sqlstatement/conversion-functions/cast.md @@ -26,11 +26,11 @@ Converted value. ``` mysql> SELECT CAST(1 AS VARCHAR); -+-----------------+ -| cast(1 as Utf8) | -+-----------------+ -| 1 | -+-----------------+ ++-------------------+ +| cast(1 as String) | ++-------------------+ +| 1 | ++-------------------+ mysql> SELECT CAST(1 AS UInt64); +-------------------+