chore(query): support domain contains in string type (databendlabs#15023)

* feat(query): support domain contains in string type
* feat(query): support domain contains in string type
* add random tests
yufan022 · Apr 16, 2024 · 2711d85 · 2711d85
1 parent e582031
commit 2711d85
Show file tree

Hide file tree

Showing 4 changed files with 113 additions and 8 deletions.
diff --git a/src/query/expression/src/property.rs b/src/query/expression/src/property.rs
@@ -421,6 +421,7 @@ pub trait SimpleDomainCmp {
     fn domain_gte(&self, other: &Self) -> FunctionDomain<BooleanType>;
     fn domain_lt(&self, other: &Self) -> FunctionDomain<BooleanType>;
     fn domain_lte(&self, other: &Self) -> FunctionDomain<BooleanType>;
+    fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType>;
 }
 
 const ALL_TRUE_DOMAIN: BooleanDomain = BooleanDomain {
@@ -489,6 +490,14 @@ impl<T: Ord + PartialOrd> SimpleDomainCmp for SimpleDomain<T> {
             FunctionDomain::Full
         }
     }
+
+    fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType> {
+        if self.min > other.max || self.max < other.min {
+            FunctionDomain::Domain(ALL_FALSE_DOMAIN)
+        } else {
+            FunctionDomain::Full
+        }
+    }
 }
 
 impl SimpleDomainCmp for StringDomain {
@@ -521,6 +530,11 @@ impl SimpleDomainCmp for StringDomain {
         let (d1, d2) = unify_string(self, other);
         d1.domain_lte(&d2)
     }
+
+    fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType> {
+        let (d1, d2) = unify_string(self, other);
+        d1.domain_contains(&d2)
+    }
 }
 
 fn unify_string(

diff --git a/src/query/functions/src/scalars/array.rs b/src/query/functions/src/scalars/array.rs
@@ -57,6 +57,7 @@ use databend_common_expression::FunctionRegistry;
 use databend_common_expression::FunctionSignature;
 use databend_common_expression::Scalar;
 use databend_common_expression::ScalarRef;
+use databend_common_expression::SimpleDomainCmp;
 use databend_common_expression::SortColumnDescription;
 use databend_common_expression::Value;
 use databend_common_expression::ValueRef;
@@ -481,11 +482,9 @@ pub fn register(registry: &mut FunctionRegistry) {
                 registry.register_passthrough_nullable_2_arg::<ArrayType<NumberType<NUM_TYPE>>, NumberType<NUM_TYPE>, BooleanType, _, _>(
                     "contains",
                     |_, lhs, rhs| {
-                        let has_true = lhs.is_some_and(|lhs| !(lhs.min > rhs.max || lhs.max < rhs.min));
-                        FunctionDomain::Domain(BooleanDomain {
-                            has_false: true,
-                            has_true,
-                        })
+                        lhs.as_ref().map(|lhs| {
+                            lhs.domain_contains(rhs)
+                        }).unwrap_or(FunctionDomain::Full)
                     },
                     |lhs, rhs, _| eval_contains::<NumberType<NUM_TYPE>>(lhs, rhs)
                 );
@@ -495,9 +494,11 @@ pub fn register(registry: &mut FunctionRegistry) {
 
     registry.register_passthrough_nullable_2_arg::<ArrayType<StringType>, StringType, BooleanType, _, _>(
         "contains",
-        |_, _, _| {
-            FunctionDomain::Full
-        },
+         |_, lhs, rhs| {
+                        lhs.as_ref().map(|lhs| {
+                            lhs.domain_contains(rhs)
+                        }).unwrap_or(FunctionDomain::Full)
+                    },
         |lhs, rhs, _| {
             match lhs {
                 ValueRef::Scalar(array) => {

diff --git a/src/query/functions/tests/it/scalars/array.rs b/src/query/functions/tests/it/scalars/array.rs
@@ -155,6 +155,10 @@ fn test_contains(file: &mut impl Write) {
 
     let columns = [
         ("int8_col", Int8Type::from_data(vec![1i8, 2, 7, 8])),
+        (
+            "string_col",
+            StringType::from_data(vec![r#"1"#, r#"2"#, r#"5"#, r#"1234"#]),
+        ),
         (
             "nullable_col",
             Int64Type::from_data_with_validity(vec![9i64, 10, 11, 12], vec![
@@ -164,6 +168,20 @@ fn test_contains(file: &mut impl Write) {
     ];
 
     run_ast(file, "int8_col not in (1, 2, 3, 4, 5, null)", &columns);
+    run_ast(
+        file,
+        "contains(['5000', '6000', '7000'], string_col)",
+        &columns,
+    );
+
+    run_ast(file, "contains(['1', '5'], string_col)", &columns);
+
+    run_ast(
+        file,
+        "contains(['15000', '6000', '7000'], string_col)",
+        &columns,
+    );
+
     run_ast(file, "contains([1,2,null], nullable_col)", &columns);
     run_ast(
         file,

diff --git a/src/query/functions/tests/it/scalars/testdata/array.txt b/src/query/functions/tests/it/scalars/testdata/array.txt
@@ -439,6 +439,78 @@ evaluation (internal):
 +----------+-----------------------+
 
 
+ast            : contains(['5000', '6000', '7000'], string_col)
+raw expr       : contains(array('5000', '6000', '7000'), string_col::String)
+checked expr   : contains<Array(String), String>(array<T0=String><T0, T0, T0>("5000", "6000", "7000"), string_col)
+optimized expr : false
+evaluation:
++--------+-------------+---------+
+|        | string_col  | Output  |
++--------+-------------+---------+
+| Type   | String      | Boolean |
+| Domain | {"1"..="5"} | {FALSE} |
+| Row 0  | '1'         | false   |
+| Row 1  | '2'         | false   |
+| Row 2  | '5'         | false   |
+| Row 3  | '1234'      | false   |
++--------+-------------+---------+
+evaluation (internal):
++------------+-------------------------------------------------------------------+
+| Column     | Data                                                              |
++------------+-------------------------------------------------------------------+
+| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
+| Output     | Boolean([0b____0000])                                             |
++------------+-------------------------------------------------------------------+
+
+
+ast            : contains(['1', '5'], string_col)
+raw expr       : contains(array('1', '5'), string_col::String)
+checked expr   : contains<Array(String), String>(array<T0=String><T0, T0>("1", "5"), string_col)
+optimized expr : contains<Array(String), String>(['1', '5'], string_col)
+evaluation:
++--------+-------------+---------------+
+|        | string_col  | Output        |
++--------+-------------+---------------+
+| Type   | String      | Boolean       |
+| Domain | {"1"..="5"} | {FALSE, TRUE} |
+| Row 0  | '1'         | true          |
+| Row 1  | '2'         | false         |
+| Row 2  | '5'         | true          |
+| Row 3  | '1234'      | false         |
++--------+-------------+---------------+
+evaluation (internal):
++------------+-------------------------------------------------------------------+
+| Column     | Data                                                              |
++------------+-------------------------------------------------------------------+
+| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
+| Output     | Boolean([0b____0101])                                             |
++------------+-------------------------------------------------------------------+
+
+
+ast            : contains(['15000', '6000', '7000'], string_col)
+raw expr       : contains(array('15000', '6000', '7000'), string_col::String)
+checked expr   : contains<Array(String), String>(array<T0=String><T0, T0, T0>("15000", "6000", "7000"), string_col)
+optimized expr : contains<Array(String), String>(['15000', '6000', '7000'], string_col)
+evaluation:
++--------+-------------+---------------+
+|        | string_col  | Output        |
++--------+-------------+---------------+
+| Type   | String      | Boolean       |
+| Domain | {"1"..="5"} | {FALSE, TRUE} |
+| Row 0  | '1'         | false         |
+| Row 1  | '2'         | false         |
+| Row 2  | '5'         | false         |
+| Row 3  | '1234'      | false         |
++--------+-------------+---------------+
+evaluation (internal):
++------------+-------------------------------------------------------------------+
+| Column     | Data                                                              |
++------------+-------------------------------------------------------------------+
+| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
+| Output     | Boolean([0b____0000])                                             |
++------------+-------------------------------------------------------------------+
+
+
 ast            : contains([1,2,null], nullable_col)
 raw expr       : contains(array(1, 2, NULL), nullable_col::Int64 NULL)
 checked expr   : contains<T0=Int64 NULL><Array(T0), T0>(CAST(array<T0=UInt8 NULL><T0, T0, T0>(CAST(1_u8 AS UInt8 NULL), CAST(2_u8 AS UInt8 NULL), CAST(NULL AS UInt8 NULL)) AS Array(Int64 NULL)), nullable_col)