Skip to content

Commit

Permalink
chore(query): support domain contains in string type (databendlabs#15023)
Browse files Browse the repository at this point in the history

* feat(query): support domain contains in string type

* feat(query): support domain contains in string type

* add random tests
  • Loading branch information
sundy-li authored and yufan022 committed Apr 16, 2024
1 parent e582031 commit 2711d85
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 8 deletions.
14 changes: 14 additions & 0 deletions src/query/expression/src/property.rs
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,7 @@ pub trait SimpleDomainCmp {
fn domain_gte(&self, other: &Self) -> FunctionDomain<BooleanType>;
fn domain_lt(&self, other: &Self) -> FunctionDomain<BooleanType>;
fn domain_lte(&self, other: &Self) -> FunctionDomain<BooleanType>;
fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType>;
}

const ALL_TRUE_DOMAIN: BooleanDomain = BooleanDomain {
Expand Down Expand Up @@ -489,6 +490,14 @@ impl<T: Ord + PartialOrd> SimpleDomainCmp for SimpleDomain<T> {
FunctionDomain::Full
}
}

fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType> {
if self.min > other.max || self.max < other.min {
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
} else {
FunctionDomain::Full
}
}
}

impl SimpleDomainCmp for StringDomain {
Expand Down Expand Up @@ -521,6 +530,11 @@ impl SimpleDomainCmp for StringDomain {
let (d1, d2) = unify_string(self, other);
d1.domain_lte(&d2)
}

fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType> {
let (d1, d2) = unify_string(self, other);
d1.domain_contains(&d2)
}
}

fn unify_string(
Expand Down
17 changes: 9 additions & 8 deletions src/query/functions/src/scalars/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ use databend_common_expression::FunctionRegistry;
use databend_common_expression::FunctionSignature;
use databend_common_expression::Scalar;
use databend_common_expression::ScalarRef;
use databend_common_expression::SimpleDomainCmp;
use databend_common_expression::SortColumnDescription;
use databend_common_expression::Value;
use databend_common_expression::ValueRef;
Expand Down Expand Up @@ -481,11 +482,9 @@ pub fn register(registry: &mut FunctionRegistry) {
registry.register_passthrough_nullable_2_arg::<ArrayType<NumberType<NUM_TYPE>>, NumberType<NUM_TYPE>, BooleanType, _, _>(
"contains",
|_, lhs, rhs| {
let has_true = lhs.is_some_and(|lhs| !(lhs.min > rhs.max || lhs.max < rhs.min));
FunctionDomain::Domain(BooleanDomain {
has_false: true,
has_true,
})
lhs.as_ref().map(|lhs| {
lhs.domain_contains(rhs)
}).unwrap_or(FunctionDomain::Full)
},
|lhs, rhs, _| eval_contains::<NumberType<NUM_TYPE>>(lhs, rhs)
);
Expand All @@ -495,9 +494,11 @@ pub fn register(registry: &mut FunctionRegistry) {

registry.register_passthrough_nullable_2_arg::<ArrayType<StringType>, StringType, BooleanType, _, _>(
"contains",
|_, _, _| {
FunctionDomain::Full
},
|_, lhs, rhs| {
lhs.as_ref().map(|lhs| {
lhs.domain_contains(rhs)
}).unwrap_or(FunctionDomain::Full)
},
|lhs, rhs, _| {
match lhs {
ValueRef::Scalar(array) => {
Expand Down
18 changes: 18 additions & 0 deletions src/query/functions/tests/it/scalars/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ fn test_contains(file: &mut impl Write) {

let columns = [
("int8_col", Int8Type::from_data(vec![1i8, 2, 7, 8])),
(
"string_col",
StringType::from_data(vec![r#"1"#, r#"2"#, r#"5"#, r#"1234"#]),
),
(
"nullable_col",
Int64Type::from_data_with_validity(vec![9i64, 10, 11, 12], vec![
Expand All @@ -164,6 +168,20 @@ fn test_contains(file: &mut impl Write) {
];

run_ast(file, "int8_col not in (1, 2, 3, 4, 5, null)", &columns);
run_ast(
file,
"contains(['5000', '6000', '7000'], string_col)",
&columns,
);

run_ast(file, "contains(['1', '5'], string_col)", &columns);

run_ast(
file,
"contains(['15000', '6000', '7000'], string_col)",
&columns,
);

run_ast(file, "contains([1,2,null], nullable_col)", &columns);
run_ast(
file,
Expand Down
72 changes: 72 additions & 0 deletions src/query/functions/tests/it/scalars/testdata/array.txt
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,78 @@ evaluation (internal):
+----------+-----------------------+


ast : contains(['5000', '6000', '7000'], string_col)
raw expr : contains(array('5000', '6000', '7000'), string_col::String)
checked expr : contains<Array(String), String>(array<T0=String><T0, T0, T0>("5000", "6000", "7000"), string_col)
optimized expr : false
evaluation:
+--------+-------------+---------+
| | string_col | Output |
+--------+-------------+---------+
| Type | String | Boolean |
| Domain | {"1"..="5"} | {FALSE} |
| Row 0 | '1' | false |
| Row 1 | '2' | false |
| Row 2 | '5' | false |
| Row 3 | '1234' | false |
+--------+-------------+---------+
evaluation (internal):
+------------+-------------------------------------------------------------------+
| Column | Data |
+------------+-------------------------------------------------------------------+
| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
| Output | Boolean([0b____0000]) |
+------------+-------------------------------------------------------------------+


ast : contains(['1', '5'], string_col)
raw expr : contains(array('1', '5'), string_col::String)
checked expr : contains<Array(String), String>(array<T0=String><T0, T0>("1", "5"), string_col)
optimized expr : contains<Array(String), String>(['1', '5'], string_col)
evaluation:
+--------+-------------+---------------+
| | string_col | Output |
+--------+-------------+---------------+
| Type | String | Boolean |
| Domain | {"1"..="5"} | {FALSE, TRUE} |
| Row 0 | '1' | true |
| Row 1 | '2' | false |
| Row 2 | '5' | true |
| Row 3 | '1234' | false |
+--------+-------------+---------------+
evaluation (internal):
+------------+-------------------------------------------------------------------+
| Column | Data |
+------------+-------------------------------------------------------------------+
| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
| Output | Boolean([0b____0101]) |
+------------+-------------------------------------------------------------------+


ast : contains(['15000', '6000', '7000'], string_col)
raw expr : contains(array('15000', '6000', '7000'), string_col::String)
checked expr : contains<Array(String), String>(array<T0=String><T0, T0, T0>("15000", "6000", "7000"), string_col)
optimized expr : contains<Array(String), String>(['15000', '6000', '7000'], string_col)
evaluation:
+--------+-------------+---------------+
| | string_col | Output |
+--------+-------------+---------------+
| Type | String | Boolean |
| Domain | {"1"..="5"} | {FALSE, TRUE} |
| Row 0 | '1' | false |
| Row 1 | '2' | false |
| Row 2 | '5' | false |
| Row 3 | '1234' | false |
+--------+-------------+---------------+
evaluation (internal):
+------------+-------------------------------------------------------------------+
| Column | Data |
+------------+-------------------------------------------------------------------+
| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
| Output | Boolean([0b____0000]) |
+------------+-------------------------------------------------------------------+


ast : contains([1,2,null], nullable_col)
raw expr : contains(array(1, 2, NULL), nullable_col::Int64 NULL)
checked expr : contains<T0=Int64 NULL><Array(T0), T0>(CAST(array<T0=UInt8 NULL><T0, T0, T0>(CAST(1_u8 AS UInt8 NULL), CAST(2_u8 AS UInt8 NULL), CAST(NULL AS UInt8 NULL)) AS Array(Int64 NULL)), nullable_col)
Expand Down

0 comments on commit 2711d85

Please sign in to comment.