Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implement is_not_null selectivity based on null count in stats #16730

Merged
merged 2 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,13 @@ pub async fn filter_selectivity_sample(
if let Some(number_scalar) = count.index(0) {
// Compute and return selectivity
let selectivity = number_scalar.to_f64().to_f64().unwrap() / sample_size;
let mut statistics = child_rel_expr.derive_cardinality()?.statistics.clone();
let mut sb = SelectivityEstimator::new(&mut statistics, HashSet::new());
let stat_info = child_rel_expr.derive_cardinality()?;
let mut statistics = stat_info.statistics.clone();
let mut sb = SelectivityEstimator::new(
&mut statistics,
stat_info.cardinality,
HashSet::new(),
);
sb.update_other_statistic_by_selectivity(selectivity);
let stat_info = Arc::new(StatInfo {
cardinality: (selectivity * num_rows as f64).ceil(),
Expand Down
34 changes: 33 additions & 1 deletion src/query/sql/src/planner/optimizer/property/selectivity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,19 @@ const ANY_CHAR_SEL: f64 = 0.9; // not 1, since it won't match end-of-string
const FULL_WILDCARD_SEL: f64 = 2.0;

/// Estimates the selectivity of filter predicates from column statistics.
///
/// Holds a mutable borrow of the input `Statistics` so that estimates for
/// later predicates can account for updates applied by earlier ones
/// (see `update_other_statistic_by_selectivity`).
pub struct SelectivityEstimator<'a> {
/// Input cardinality (row count) of the relation the predicates filter;
/// used e.g. to turn a column's null count into an `is_not_null` fraction.
pub cardinality: f64,
/// Column-level statistics of the input relation; mutated in place as
/// selectivities are applied.
pub input_stat: &'a mut Statistics,
/// Indexes of columns whose statistics have already been updated by a
/// previous predicate — presumably to avoid double-applying updates;
/// TODO(review): confirm against update_other_statistic_by_selectivity.
pub updated_column_indexes: HashSet<IndexType>,
}

impl<'a> SelectivityEstimator<'a> {
pub fn new(input_stat: &'a mut Statistics, updated_column_indexes: HashSet<IndexType>) -> Self {
pub fn new(
input_stat: &'a mut Statistics,
cardinality: f64,
updated_column_indexes: HashSet<IndexType>,
) -> Self {
Self {
cardinality,
input_stat,
updated_column_indexes,
}
Expand Down Expand Up @@ -102,6 +108,9 @@ impl<'a> SelectivityEstimator<'a> {
if func.func_name.eq("like") {
return self.compute_like_selectivity(func);
}
if func.func_name.eq("is_not_null") {
return self.compute_is_not_null_selectivity(&func.arguments[0]);
}
if let Some(op) = ComparisonOp::try_from_func_name(&func.func_name) {
return self.compute_selectivity_comparison_expr(
op,
Expand Down Expand Up @@ -159,6 +168,29 @@ impl<'a> SelectivityEstimator<'a> {
}
}

/// Estimate the selectivity of an `is_not_null(column)` predicate as the
/// fraction of non-null rows: `(cardinality - null_count) / cardinality`.
///
/// Falls back to `DEFAULT_SELECTIVITY` when the argument is not a plain
/// column reference or when no statistics exist for the column.
fn compute_is_not_null_selectivity(&mut self, expr: &ScalarExpr) -> Result<f64> {
    match expr {
        ScalarExpr::BoundColumnRef(column_ref) => {
            // An empty input produces an empty output regardless of stats;
            // also guards the division below against 0/0.
            if self.cardinality == 0.0 {
                return Ok(0.0);
            }
            // The column statistic is only read here, so a shared borrow
            // (`get`) is sufficient — no need for `get_mut`.
            let column_stat = match self
                .input_stat
                .column_stats
                .get(&column_ref.column.index)
            {
                Some(stat) => stat,
                None => return Ok(DEFAULT_SELECTIVITY),
            };
            let selectivity =
                (self.cardinality - column_stat.null_count as f64) / self.cardinality;
            // Clamp: stale or inconsistent statistics may report a null
            // count larger than the derived cardinality, which would
            // otherwise yield a negative selectivity and corrupt downstream
            // cardinality estimates.
            Ok(selectivity.clamp(0.0, 1.0))
        }
        _ => Ok(DEFAULT_SELECTIVITY),
    }
}

fn compute_selectivity_comparison_expr(
&mut self,
mut op: ComparisonOp,
Expand Down
2 changes: 1 addition & 1 deletion src/query/sql/src/planner/plans/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ impl Operator for Filter {
let (input_cardinality, mut statistics) =
(stat_info.cardinality, stat_info.statistics.clone());
// Derive cardinality
let mut sb = SelectivityEstimator::new(&mut statistics, HashSet::new());
let mut sb = SelectivityEstimator::new(&mut statistics, input_cardinality, HashSet::new());
let mut selectivity = MAX_SELECTIVITY;
for pred in self.predicates.iter() {
// Compute selectivity for each conjunction
Expand Down
6 changes: 5 additions & 1 deletion src/query/sql/src/planner/plans/scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,11 @@ impl Operator for Scan {
column_stats,
};
// Derive cardinality
let mut sb = SelectivityEstimator::new(&mut statistics, HashSet::new());
let mut sb = SelectivityEstimator::new(
&mut statistics,
precise_cardinality as f64,
HashSet::new(),
);
let mut selectivity = MAX_SELECTIVITY;
for pred in prewhere.predicates.iter() {
// Compute selectivity for each conjunction
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ HashJoin
├── build keys: [t1.a (#1)]
├── probe keys: [t.a (#0)]
├── filters: []
├── estimated rows: 2.00
├── estimated rows: 10.00
├── Filter(Build)
│ ├── output columns: [t1.a (#1)]
│ ├── filters: [is_not_null(t1.a (#1))]
│ ├── estimated rows: 2.00
│ ├── estimated rows: 10.00
│ └── TableScan
│ ├── table: default.eliminate_outer_join.t
│ ├── output columns: [a (#1)]
Expand All @@ -130,7 +130,7 @@ HashJoin
└── Filter(Probe)
├── output columns: [t.a (#0)]
├── filters: [is_not_null(t.a (#0))]
├── estimated rows: 2.00
├── estimated rows: 10.00
└── TableScan
├── table: default.eliminate_outer_join.t
├── output columns: [a (#0)]
Expand All @@ -151,11 +151,11 @@ HashJoin
├── build keys: [t1.a (#1)]
├── probe keys: [t.a (#0)]
├── filters: []
├── estimated rows: 2.00
├── estimated rows: 10.00
├── Filter(Build)
│ ├── output columns: [t1.a (#1)]
│ ├── filters: [is_not_null(t1.a (#1))]
│ ├── estimated rows: 2.00
│ ├── estimated rows: 10.00
│ └── TableScan
│ ├── table: default.eliminate_outer_join.t
│ ├── output columns: [a (#1)]
Expand All @@ -169,7 +169,7 @@ HashJoin
└── Filter(Probe)
├── output columns: [t.a (#0)]
├── filters: [is_not_null(t.a (#0))]
├── estimated rows: 2.00
├── estimated rows: 10.00
└── TableScan
├── table: default.eliminate_outer_join.t
├── output columns: [a (#0)]
Expand All @@ -190,11 +190,11 @@ HashJoin
├── build keys: [t1.a (#1)]
├── probe keys: [t.a (#0)]
├── filters: []
├── estimated rows: 2.00
├── estimated rows: 10.00
├── Filter(Build)
│ ├── output columns: [t1.a (#1)]
│ ├── filters: [is_not_null(t1.a (#1))]
│ ├── estimated rows: 2.00
│ ├── estimated rows: 10.00
│ └── TableScan
│ ├── table: default.eliminate_outer_join.t
│ ├── output columns: [a (#1)]
Expand All @@ -208,7 +208,7 @@ HashJoin
└── Filter(Probe)
├── output columns: [t.a (#0)]
├── filters: [is_not_null(t.a (#0))]
├── estimated rows: 2.00
├── estimated rows: 10.00
└── TableScan
├── table: default.eliminate_outer_join.t
├── output columns: [a (#0)]
Expand All @@ -229,11 +229,11 @@ HashJoin
├── build keys: [t.a (#0)]
├── probe keys: [t1.a (#1)]
├── filters: []
├── estimated rows: 2.00
├── estimated rows: 10.00
├── Filter(Build)
│ ├── output columns: [t.a (#0)]
│ ├── filters: [is_not_null(t.a (#0))]
│ ├── estimated rows: 2.00
│ ├── estimated rows: 10.00
│ └── TableScan
│ ├── table: default.eliminate_outer_join.t
│ ├── output columns: [a (#0)]
Expand All @@ -247,7 +247,7 @@ HashJoin
└── Filter(Probe)
├── output columns: [t1.a (#1)]
├── filters: [is_not_null(t1.a (#1))]
├── estimated rows: 2.00
├── estimated rows: 10.00
└── TableScan
├── table: default.eliminate_outer_join.t
├── output columns: [a (#1)]
Expand All @@ -268,11 +268,11 @@ HashJoin
├── build keys: [t1.a (#1)]
├── probe keys: [t.a (#0)]
├── filters: []
├── estimated rows: 2.00
├── estimated rows: 10.00
├── Filter(Build)
│ ├── output columns: [t1.a (#1)]
│ ├── filters: [is_not_null(t1.a (#1))]
│ ├── estimated rows: 2.00
│ ├── estimated rows: 10.00
│ └── TableScan
│ ├── table: default.eliminate_outer_join.t
│ ├── output columns: [a (#1)]
Expand All @@ -286,7 +286,7 @@ HashJoin
└── Filter(Probe)
├── output columns: [t.a (#0)]
├── filters: [is_not_null(t.a (#0))]
├── estimated rows: 2.00
├── estimated rows: 10.00
└── TableScan
├── table: default.eliminate_outer_join.t
├── output columns: [a (#0)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ explain select * from t_nullable_prune where a is not null
Filter
├── output columns: [t_nullable_prune.a (#0)]
├── filters: [is_not_null(t_nullable_prune.a (#0))]
├── estimated rows: 1.20
├── estimated rows: 3.00
└── TableScan
├── table: default.default.t_nullable_prune
├── output columns: [a (#0)]
Expand All @@ -51,7 +51,7 @@ explain select * from t_nullable_prune where a is null
Filter
├── output columns: [t_nullable_prune.a (#0)]
├── filters: [NOT is_not_null(t_nullable_prune.a (#0))]
├── estimated rows: 4.80
├── estimated rows: 3.00
└── TableScan
├── table: default.default.t_nullable_prune
├── output columns: [a (#0)]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
statement ok
CREATE OR REPLACE TABLE twocolumn (x INT NULL, y INT NULL);

statement ok
INSERT INTO twocolumn(x, y) VALUES (44,51), (NULL,52), (42,53), (45,45);

query T
explain select * from twocolumn where x is not NULL;
----
Filter
├── output columns: [twocolumn.x (#0), twocolumn.y (#1)]
├── filters: [is_not_null(twocolumn.x (#0))]
├── estimated rows: 3.00
└── TableScan
├── table: default.default.twocolumn
├── output columns: [x (#0), y (#1)]
├── read rows: 4
├── read size: < 1 KiB
├── partitions total: 1
├── partitions scanned: 1
├── pruning stats: [segments: <range pruning: 1 to 1>, blocks: <range pruning: 1 to 1>]
├── push downs: [filters: [is_not_null(twocolumn.x (#0))], limit: NONE]
└── estimated rows: 4.00

statement ok
DROP TABLE twocolumn;
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ TableScan
├── partitions scanned: 1
├── pruning stats: [segments: <range pruning: 2 to 1>, blocks: <range pruning: 1 to 1>]
├── push downs: [filters: [is_not_null(t_nullable_prune.a (#0))], limit: NONE]
└── estimated rows: 1.20
└── estimated rows: 3.00

query T
explain select * from t_nullable_prune where a is null
Expand All @@ -53,7 +53,7 @@ TableScan
├── partitions scanned: 1
├── pruning stats: [segments: <range pruning: 2 to 1>, blocks: <range pruning: 1 to 1>]
├── push downs: [filters: [NOT is_not_null(t_nullable_prune.a (#0))], limit: NONE]
└── estimated rows: 4.80
└── estimated rows: 3.00

statement ok
DROP TABLE default.default.t_nullable_prune
Loading