diff --git a/Cargo.lock b/Cargo.lock index 09d99a4e4fdca..24a45026bb26c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3345,7 +3345,6 @@ dependencies = [ "serde_json", "tantivy", "thiserror", - "tokio", "tonic", ] diff --git a/src/common/exception/Cargo.toml b/src/common/exception/Cargo.toml index cc723cc592948..033999b74d0d2 100644 --- a/src/common/exception/Cargo.toml +++ b/src/common/exception/Cargo.toml @@ -30,7 +30,6 @@ serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } thiserror = { workspace = true } -tokio = "1.39.2" tonic = { workspace = true } [package.metadata.cargo-machete] diff --git a/src/query/sql/Cargo.toml b/src/query/sql/Cargo.toml index 93fbfd52b75db..84596e0ed17d2 100644 --- a/src/query/sql/Cargo.toml +++ b/src/query/sql/Cargo.toml @@ -73,8 +73,8 @@ serde = { workspace = true } sha2 = { workspace = true } simsearch = "0.2" time = "0.3.14" -url = "2.3.1" tokio = "1.39.2" +url = "2.3.1" [lints] workspace = true diff --git a/src/query/sql/src/planner/optimizer/dynamic_sample/dynamic_sample.rs b/src/query/sql/src/planner/optimizer/dynamic_sample/dynamic_sample.rs index 994683c5c0ef3..07e8abf91402b 100644 --- a/src/query/sql/src/planner/optimizer/dynamic_sample/dynamic_sample.rs +++ b/src/query/sql/src/planner/optimizer/dynamic_sample/dynamic_sample.rs @@ -17,7 +17,6 @@ use std::time::Duration; use databend_common_base::base::tokio::time::Instant; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use crate::optimizer::dynamic_sample::filter_selectivity_sample::filter_selectivity_sample; @@ -80,9 +79,10 @@ pub async fn dynamic_sample( join_selectivity_sample(ctx, metadata, s_expr, sample_executor).await } RelOperator::Scan(_) => s_expr.plan().derive_stats(&RelExpr::with_s_expr(s_expr)), - _ => Err(ErrorCode::Unimplemented(format!( - "derive_cardinality_by_sample for {:?} is not supported yet", - s_expr.plan() - ))), + // 
Todo: add more operators here, and support more query patterns. + _ => { + let rel_expr = RelExpr::with_s_expr(s_expr); + rel_expr.derive_cardinality() + } } } diff --git a/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs b/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs index 4c9f5fbfa16d6..262bf06900f22 100644 --- a/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs +++ b/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs @@ -60,17 +60,19 @@ pub async fn filter_selectivity_sample( // Calculate sample size (0.2% of total data) let sample_size = (num_rows as f64 * 0.002).ceil(); - - scan.sample = Some(Sample { - sample_level: SampleLevel::ROW, - sample_conf: SampleConfig::RowsNum(sample_size), - }); - - let new_child = SExpr::create_leaf(Arc::new(RelOperator::Scan(scan))); - let mut new_s_expr = s_expr.replace_children(vec![Arc::new(new_child)]); - let collect_statistics_optimizer = - CollectStatisticsOptimizer::new(ctx.clone(), metadata.clone()); - new_s_expr = collect_statistics_optimizer.run(&new_s_expr).await?; + let mut new_s_expr = s_expr.clone(); + // If the table is too small, we don't need to sample. 
+ if sample_size >= 10.0 { + scan.sample = Some(Sample { + sample_level: SampleLevel::ROW, + sample_conf: SampleConfig::RowsNum(sample_size), + }); + let new_child = SExpr::create_leaf(Arc::new(RelOperator::Scan(scan))); + new_s_expr = s_expr.replace_children(vec![Arc::new(new_child)]); + let collect_statistics_optimizer = + CollectStatisticsOptimizer::new(ctx.clone(), metadata.clone()); + new_s_expr = collect_statistics_optimizer.run(&new_s_expr).await?; + } new_s_expr = SExpr::create_unary( Arc::new(create_count_aggregate(AggregateMode::Partial).into()), @@ -91,12 +93,12 @@ pub async fn filter_selectivity_sample( if let Some(count) = block.get_last_column().as_number() { if let Some(number_scalar) = count.index(0) { // Compute and return selectivity - let selectivity = number_scalar.to_f64().to_f64().unwrap() / sample_size as f64; + let selectivity = number_scalar.to_f64().to_f64().unwrap() / sample_size; let mut statistics = child_rel_expr.derive_cardinality()?.statistics.clone(); let mut sb = SelectivityEstimator::new(&mut statistics, HashSet::new()); sb.update_other_statistic_by_selectivity(selectivity); let stat_info = Arc::new(StatInfo { - cardinality: selectivity * num_rows as f64, + cardinality: (selectivity * num_rows as f64).ceil(), statistics, }); *s_expr.stat_info.lock().unwrap() = Some(stat_info.clone()); diff --git a/src/query/sql/src/planner/optimizer/dynamic_sample/mod.rs b/src/query/sql/src/planner/optimizer/dynamic_sample/mod.rs index be5955ead66e3..0998554242d58 100644 --- a/src/query/sql/src/planner/optimizer/dynamic_sample/mod.rs +++ b/src/query/sql/src/planner/optimizer/dynamic_sample/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#[allow(clippy::module_inception)] mod dynamic_sample; mod filter_selectivity_sample; mod join_selectivity_sample; diff --git a/src/query/sql/src/planner/optimizer/join/single_to_inner.rs b/src/query/sql/src/planner/optimizer/join/single_to_inner.rs index 9eb465a36010d..6ae2cd0f34dd1 100644 --- a/src/query/sql/src/planner/optimizer/join/single_to_inner.rs +++ b/src/query/sql/src/planner/optimizer/join/single_to_inner.rs @@ -46,7 +46,7 @@ impl SingleToInnerOptimizer { let mut children = Vec::with_capacity(s_expr.arity()); for child in s_expr.children() { let new_child = Self::single_to_inner(child)?; - if !new_child.eq(&child) { + if !new_child.eq(child) { children_changed = true; } children.push(Arc::new(new_child)); diff --git a/tests/sqllogictests/suites/tpch/sample.test b/tests/sqllogictests/suites/tpch/sample.test new file mode 100644 index 0000000000000..3d1fbafaabc43 --- /dev/null +++ b/tests/sqllogictests/suites/tpch/sample.test @@ -0,0 +1,407 @@ +statement ok +set sandbox_tenant = 'test_tenant'; + +statement ok +use tpch_test; + +# To make the test stable, we set the dynamic_sample_time_budget_ms to a large value +statement ok +set dynamic_sample_time_budget_ms = 10000; + +# cbo will remove the `stat_info` computed by sample, so we need to disable cbo to see the estimate row info in explain +statement ok +set enable_cbo = 0; + +statement ok +set random_function_seed = 1; + +# estimated rows and output rows are similar for filter +# the test is flaky, so only put it there as a reference.
+onlyif todo +query ok +EXPLAIN ANALYZE PARTIAL +SELECT + * +FROM + orders, + lineitem +WHERE + o_orderkey = l_orderkey + AND l_shipmode LIKE '%MAIL%'; +---- +HashJoin +├── estimated rows: 66953.00 +├── output rows: 85.95 thousand +├── Filter +│ ├── filters: [like(lineitem.l_shipmode (#23), '%MAIL%')] +│ ├── estimated rows: 66953.00 +│ ├── output rows: 85.95 thousand +│ └── TableScan +│ ├── table: default.tpch_test.lineitem +│ ├── estimated rows: 600572.00 +│ └── output rows: 600.57 thousand +└── TableScan + ├── table: default.tpch_test.orders + ├── estimated rows: 150000.00 + └── output rows: 150 thousand + +statement ok +set enable_cbo = 1; + +# use `join.test` to test dynamic sample framework without error +query I +select + c_custkey, count(o_orderkey) as c_count +from + customer + full outer join + orders + on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 +group by + c_custkey +order by c_custkey + limit 20; +---- +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 +10 0 +11 0 +12 0 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 + + +query I +select + c_custkey +from + customer + inner join + orders + on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 order by c_custkey limit 20; +---- +101 +101 +101 +101 +101 +101 +101 +101 +101 +101 +101 +101 +103 +103 +103 +103 +103 +103 +103 +103 + +query I +select + c_custkey, count(o_orderkey) as c_count +from + customer + left join + orders + on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 +group by + c_custkey +order by c_custkey + limit 20; +---- +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 +10 0 +11 0 +12 0 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 + + +query I +select + c_custkey, count(o_orderkey) as c_count +from + customer + right join + orders + on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 
100 and c_custkey < 120 +group by + c_custkey +order by c_custkey +limit 20; +---- +101 12 +103 18 +104 7 +106 18 +107 12 +109 25 +110 9 +112 19 +113 17 +115 28 +116 4 +118 18 +119 10 +NULL 149803 + +query I +select + c_custkey +from + customer + left semi join + orders + on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 +order by c_custkey + limit 20; +---- +101 +103 +104 +106 +107 +109 +110 +112 +113 +115 +116 +118 +119 + +query I +select + o_custkey +from + customer + right semi join + orders +on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 +order by o_custkey + limit 20; +---- +101 +101 +101 +101 +101 +101 +101 +101 +101 +101 +101 +101 +103 +103 +103 +103 +103 +103 +103 +103 + +query I +select + c_custkey +from + customer + left anti join + orders +on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 +order by c_custkey + limit 20; +---- +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 + +query I +select + o_custkey +from + customer + right anti join + orders +on c_custkey = o_custkey + and o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 +order by o_custkey + limit 20; +---- +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 + +query I +select + o_comment +from + customer + cross join + orders +where o_comment not like '%pending%deposits%' and c_custkey > 100 and c_custkey < 120 +order by o_comment + limit 20; +---- + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the 
blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias about the blithely ironic a + Tiresias above the carefully ironic packages nag about the pend + +statement ok +set max_block_size = 1024; + + +# Test iejoin with large dataset +query I +select l_orderkey from (select * from lineitem order by l_orderkey limit 5000) as l, (select * from orders order by o_orderkey limit 5000) as o where l.l_orderkey > o.o_orderkey and l.l_partkey < o.o_custkey order by l_orderkey limit 10; +---- +3 +3 +3 +3 +3 +3 +3 +4 +5 +5 + +statement ok +set max_block_size = 65536; + +query I +select l_orderkey from (select * from lineitem order by l_orderkey limit 5000) as l, (select * from orders order by o_orderkey limit 5000) as o where l.l_orderkey > o.o_orderkey order by l_orderkey limit 10; +---- +2 +3 +3 +3 +3 +3 +3 +3 +3 +3 + +# LEFT OUTER / LEFT SINGLE / FULL +query I +select l_orderkey, o_orderdate, o_shippriority from lineitem left join orders on l_orderkey = o_orderkey and o_orderdate < to_date('1995-03-15') order by o_orderdate, l_orderkey limit 5; +---- +3271 1992-01-01 0 +3271 1992-01-01 0 +3271 1992-01-01 0 +3271 1992-01-01 0 +5607 1992-01-01 0 + +# LEFT ANTI +query I +select o_custkey from orders where not exists (select * from customer where substring(c_phone from 1 for 2) in ('13', '31', '23', '29', '30', '18', '17') and o_custkey = c_custkey) order by o_custkey limit 10; +---- +1 +1 +1 +1 +1 +1 +1 +1 +1 +4 + + +statement ok +set random_function_seed = 0; + +statement ok +set dynamic_sample_time_budget_ms = 0;