From d14f7a5ed1b7ca521ec28aa2085fa1a028c2dec9 Mon Sep 17 00:00:00 2001
From: baishen
Date: Tue, 20 Aug 2024 23:03:47 +0800
Subject: [PATCH] feat(query): Inverted index search function support options
 (#16256)

* feat(query): Inverted index search function support options

* add tests

* create query in pruner

* add tests

* fix
---
 src/query/catalog/src/plan/pushdown.rs        |  20 ++-
 .../tests/it/inverted_index/index_refresh.rs  | 105 ++++++-------
 .../ee/tests/it/inverted_index/pruning.rs     |  15 ++
 .../sql/src/planner/semantic/type_check.rs    | 144 +++++++++++++++---
 .../inverted_index/inverted_index_reader.rs   |  39 ++---
 .../fuse/src/pruning/inverted_index_pruner.rs | 125 ++++++++++-----
 src/query/storages/fuse/src/pruning/mod.rs    |   1 +
 .../04_0000_inverted_index_base.test          |  76 +++++++--
 8 files changed, 358 insertions(+), 167 deletions(-)

diff --git a/src/query/catalog/src/plan/pushdown.rs b/src/query/catalog/src/plan/pushdown.rs
index a256da657da56..2d934983968c6 100644
--- a/src/query/catalog/src/plan/pushdown.rs
+++ b/src/query/catalog/src/plan/pushdown.rs
@@ -72,6 +72,22 @@ pub struct PrewhereInfo {
     pub virtual_columns: Option>,
 }
 
+/// Inverted index options for additional search function configuration.
+#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
+pub struct InvertedIndexOption {
+    /// Fuzzy query: match terms within the given Levenshtein distance.
+    /// https://en.wikipedia.org/wiki/Levenshtein_distance
+    /// For example: if fuzziness is 1 and the query text is `fox`,
+    /// the term `box` will also be matched.
+    pub fuzziness: Option<u8>,
+    /// Operator: true is AND, false is OR; the default is OR.
+    /// For example: the query text `happy tax payer` is equivalent to `happy OR tax OR payer`,
+    /// but if operator is true, it is equivalent to `happy AND tax AND payer`.
+    pub operator: bool,
+    /// Parse the query leniently and ignore invalid query syntax; the default is false.
+    pub lenient: bool,
+}
+
 /// Information about inverted index.
 #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
 pub struct InvertedIndexInfo {
@@ -91,8 +107,10 @@ pub struct InvertedIndexInfo {
     pub query_fields: Vec<(String, Option<F32>)>,
     /// The search query text with query syntax.
     pub query_text: String,
-    /// whether search with score function
+    /// Whether to search with the score function.
     pub has_score: bool,
+    /// Optional search configuration options, such as fuzziness, operator and lenient.
+    pub inverted_index_option: Option<InvertedIndexOption>,
 }
 
 /// Extras is a wrapper for push down items.
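For reference, the options string accepted by the search functions (for example 'fuzziness=1;operator=AND;lenient=true', as exercised in the new test cases below) is parsed into an InvertedIndexOption by resolve_search_option in type_check.rs. The following minimal standalone sketch mirrors that mapping under the same rules (fuzziness only accepts 1 or 2, operator accepts AND/OR, lenient accepts true/false, keys and values are case-insensitive); it is simplified (it does not reject repeated keys), and the parse_search_option helper and its error type are illustrative only, not part of this patch:

// Illustrative sketch: map a `key=value;key=value` options string onto the
// InvertedIndexOption introduced above, following the same rules as the
// parser added in resolve_search_option.
#[derive(Debug, Default)]
struct InvertedIndexOption {
    fuzziness: Option<u8>, // Levenshtein distance; only 1 and 2 are supported
    operator: bool,        // true = AND, false = OR (default)
    lenient: bool,         // true = ignore invalid query syntax
}

fn parse_search_option(option_text: &str) -> Result<InvertedIndexOption, String> {
    let mut opt = InvertedIndexOption::default();
    for item in option_text.split(';') {
        if item.trim().is_empty() {
            continue;
        }
        let kv: Vec<&str> = item.split('=').collect();
        if kv.len() != 2 {
            return Err(format!("option `{item}` must be of the form key=value"));
        }
        let key = kv[0].trim().to_lowercase();
        let val = kv[1].trim().to_lowercase();
        match (key.as_str(), val.as_str()) {
            ("fuzziness", "1") => opt.fuzziness = Some(1),
            ("fuzziness", "2") => opt.fuzziness = Some(2),
            ("operator", "and") => opt.operator = true,
            ("operator", "or") => opt.operator = false,
            ("lenient", "true") => opt.lenient = true,
            ("lenient", "false") => opt.lenient = false,
            _ => return Err(format!("unsupported option `{item}`")),
        }
    }
    Ok(opt)
}

fn main() {
    let opt = parse_search_option("fuzziness=1;operator=AND").unwrap();
    assert_eq!(opt.fuzziness, Some(1));
    assert!(opt.operator && !opt.lenient);
}

The corresponding SQL usage shape, taken from the new tests in this patch, is MATCH(content, 'box', 'fuzziness=1') and QUERY('title:pyth', 'fuzziness=2').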
diff --git a/src/query/ee/tests/it/inverted_index/index_refresh.rs b/src/query/ee/tests/it/inverted_index/index_refresh.rs index 630806df74f58..e401a642a1fa9 100644 --- a/src/query/ee/tests/it/inverted_index/index_refresh.rs +++ b/src/query/ee/tests/it/inverted_index/index_refresh.rs @@ -15,15 +15,20 @@ use std::collections::BTreeMap; use databend_common_base::base::tokio; +use databend_common_catalog::plan::InvertedIndexInfo; use databend_common_catalog::table::Table; use databend_common_catalog::table::TableExt; use databend_common_exception::Result; +use databend_common_expression::types::DataType; +use databend_common_expression::DataField; +use databend_common_expression::DataSchema; use databend_common_meta_app::schema::CreateOption; use databend_common_meta_app::schema::CreateTableIndexReq; use databend_common_sql::plans::RefreshTableIndexPlan; use databend_common_storages_fuse::io::read::InvertedIndexReader; use databend_common_storages_fuse::io::MetaReaders; use databend_common_storages_fuse::io::TableMetaLocationGenerator; +use databend_common_storages_fuse::pruning::create_inverted_index_query; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::TableContext; use databend_enterprise_inverted_index::get_inverted_index_handler; @@ -33,11 +38,6 @@ use databend_query::interpreters::RefreshTableIndexInterpreter; use databend_query::test_kits::append_string_sample_data; use databend_query::test_kits::*; use databend_storages_common_cache::LoadParams; -use tantivy::schema::Field; -use tantivy::tokenizer::LowerCaser; -use tantivy::tokenizer::SimpleTokenizer; -use tantivy::tokenizer::TextAnalyzer; -use tantivy::tokenizer::TokenizerManager; #[tokio::test(flavor = "multi_thread")] async fn test_fuse_do_refresh_inverted_index() -> Result<()> { @@ -74,7 +74,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { name: index_name.clone(), column_ids: vec![0, 1], sync_creation: false, - options, + options: options.clone(), }; let res = handler.do_create_table_index(catalog.clone(), req).await; @@ -127,7 +127,11 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { let block_meta = &block_metas[0]; let dal = new_fuse_table.get_operator_ref(); - let fields = ["title".to_string(), "content".to_string()]; + let query_fields = vec![("title".to_string(), None), ("content".to_string(), None)]; + let index_schema = DataSchema::new(vec![ + DataField::new("title", DataType::String), + DataField::new("content", DataType::String), + ]); let index_loc = TableMetaLocationGenerator::gen_inverted_index_location_from_block_location( &block_meta.location.0, @@ -135,61 +139,46 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { &index_version, ); - let field_nums = fields.len(); + let field_nums = query_fields.len(); let has_score = true; let need_position = false; - let mut query_fields = Vec::with_capacity(fields.len()); - let query_field_boosts = Vec::new(); - for i in 0..fields.len() { - let field = Field::from_field_id(i as u32); - query_fields.push(field); + let index_reader = + InvertedIndexReader::try_create(dal.clone(), field_nums, need_position, &index_loc).await?; + + let queries = vec![ + ("rust".to_string(), vec![0, 1]), + ("java".to_string(), vec![2]), + ("data".to_string(), vec![4, 1, 5]), + ]; + + for (query_text, ids) in queries.into_iter() { + let inverted_index_info = InvertedIndexInfo { + index_name: index_name.clone(), + index_version: index_version.clone(), + index_options: options.clone(), + index_schema: 
index_schema.clone(), + query_fields: query_fields.clone(), + query_text, + has_score, + inverted_index_option: None, + }; + + let (query, tokenizer_manager) = create_inverted_index_query(&inverted_index_info)?; + + let matched_rows = index_reader.clone().do_filter( + has_score, + &query, + tokenizer_manager, + block_meta.row_count, + )?; + assert!(matched_rows.is_some()); + let matched_rows = matched_rows.unwrap(); + assert_eq!(matched_rows.len(), ids.len()); + for (matched_row, id) in matched_rows.iter().zip(ids.iter()) { + assert_eq!(matched_row.0, *id); + } } - let tokenizer_manager = TokenizerManager::new(); - let english_analyzer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(LowerCaser) - .build(); - tokenizer_manager.register("english", english_analyzer); - - let index_reader = InvertedIndexReader::try_create( - dal.clone(), - field_nums, - has_score, - need_position, - query_fields, - query_field_boosts, - tokenizer_manager, - &index_loc, - ) - .await?; - - let query = "rust"; - let matched_rows = index_reader - .clone() - .do_filter(query, block_meta.row_count)?; - assert!(matched_rows.is_some()); - let matched_rows = matched_rows.unwrap(); - assert_eq!(matched_rows.len(), 2); - assert_eq!(matched_rows[0].0, 0); - assert_eq!(matched_rows[1].0, 1); - - let query = "java"; - let matched_rows = index_reader - .clone() - .do_filter(query, block_meta.row_count)?; - assert!(matched_rows.is_some()); - let matched_rows = matched_rows.unwrap(); - assert_eq!(matched_rows.len(), 1); - assert_eq!(matched_rows[0].0, 2); - - let query = "data"; - let matched_rows = index_reader.do_filter(query, block_meta.row_count)?; - assert!(matched_rows.is_some()); - let matched_rows = matched_rows.unwrap(); - assert_eq!(matched_rows.len(), 3); - assert_eq!(matched_rows[0].0, 4); - assert_eq!(matched_rows[1].0, 1); - assert_eq!(matched_rows[2].0, 5); Ok(()) } diff --git a/src/query/ee/tests/it/inverted_index/pruning.rs b/src/query/ee/tests/it/inverted_index/pruning.rs index 03d7045d33bb1..66809f4b761c7 100644 --- a/src/query/ee/tests/it/inverted_index/pruning.rs +++ b/src/query/ee/tests/it/inverted_index/pruning.rs @@ -552,6 +552,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("idiom".to_string(), None)], query_text: "test".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -564,6 +565,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("idiom".to_string(), None)], query_text: "save".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -576,6 +578,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("idiom".to_string(), None)], query_text: "one".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -588,6 +591,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("idiom".to_string(), None)], query_text: "the".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -600,6 +604,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("idiom".to_string(), None)], query_text: "光阴".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -612,6 +617,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("idiom".to_string(), None)], query_text: "人生".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -624,6 +630,7 @@ async fn test_block_pruner() -> Result<()> { 
query_fields: vec![("meaning".to_string(), None)], query_text: "people".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -636,6 +643,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("meaning".to_string(), None)], query_text: "bad".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -648,6 +656,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("meaning".to_string(), None)], query_text: "黄金".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -660,6 +669,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("meaning".to_string(), None)], query_text: "时间".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -675,6 +685,7 @@ async fn test_block_pruner() -> Result<()> { ], query_text: "you".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -690,6 +701,7 @@ async fn test_block_pruner() -> Result<()> { ], query_text: "光阴".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -702,6 +714,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("extras".to_string(), None)], query_text: "extras.title:Blockchain".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -714,6 +727,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("extras".to_string(), None)], query_text: "extras.metadata.author:David".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; @@ -726,6 +740,7 @@ async fn test_block_pruner() -> Result<()> { query_fields: vec![("extras".to_string(), None)], query_text: "extras.metadata.tags:技术".to_string(), has_score: false, + inverted_index_option: None, }), ..Default::default() }; diff --git a/src/query/sql/src/planner/semantic/type_check.rs b/src/query/sql/src/planner/semantic/type_check.rs index c8bde2adf2f23..aeeb8a1d9507a 100644 --- a/src/query/sql/src/planner/semantic/type_check.rs +++ b/src/query/sql/src/planner/semantic/type_check.rs @@ -51,6 +51,7 @@ use databend_common_catalog::catalog::CatalogManager; use databend_common_catalog::plan::InternalColumn; use databend_common_catalog::plan::InternalColumnType; use databend_common_catalog::plan::InvertedIndexInfo; +use databend_common_catalog::plan::InvertedIndexOption; use databend_common_catalog::table_context::TableContext; use databend_common_compress::CompressAlgorithm; use databend_common_compress::DecompressDecoder; @@ -2107,10 +2108,10 @@ impl<'a> TypeChecker<'a> { .set_span(span)); } - // TODO: support options field - if args.len() != 2 { + // The optional third argument is additional configuration option. 
+ if args.len() != 2 && args.len() != 3 { return Err(ErrorCode::SemanticError(format!( - "invalid arguments for search function, {} expects 2 arguments, but got {}", + "invalid arguments for search function, {} expects 2 or 3 arguments, but got {}", func_name, args.len() )) @@ -2119,6 +2120,7 @@ impl<'a> TypeChecker<'a> { let field_arg = args[0]; let query_arg = args[1]; + let option_arg = if args.len() == 3 { Some(args[2]) } else { None }; let box (field_scalar, _) = self.resolve(field_arg)?; let column_refs = match field_scalar { @@ -2133,7 +2135,7 @@ impl<'a> TypeChecker<'a> { "invalid arguments for search function, field must be a column or constant string, but got {}", constant_expr.value )) - .set_span(constant_expr.span)); + .set_span(constant_expr.span)); }; // fields are separated by commas and boost is separated by ^ @@ -2146,7 +2148,7 @@ impl<'a> TypeChecker<'a> { "invalid arguments for search function, field string must have only one boost, but got {}", constant_field )) - .set_span(constant_expr.span)); + .set_span(constant_expr.span)); } let column_expr = Expr::ColumnRef { span: constant_expr.span, @@ -2175,7 +2177,7 @@ impl<'a> TypeChecker<'a> { "invalid arguments for search function, boost must be a float value, but got {}", field_boosts[1] )) - .set_span(constant_expr.span)); + .set_span(constant_expr.span)); } } } else { @@ -2189,7 +2191,7 @@ impl<'a> TypeChecker<'a> { return Err(ErrorCode::SemanticError( "invalid arguments for search function, field must be a column or constant string".to_string(), ) - .set_span(span)); + .set_span(span)); } }; @@ -2199,27 +2201,19 @@ impl<'a> TypeChecker<'a> { "invalid arguments for search function, query text must be a constant string, but got {}", query_arg )) - .set_span(query_scalar.span())); + .set_span(query_scalar.span())); }; let Some(query_text) = query_expr.value.as_string() else { return Err(ErrorCode::SemanticError(format!( "invalid arguments for search function, query text must be a constant string, but got {}", query_arg )) - .set_span(query_scalar.span())); + .set_span(query_scalar.span())); }; - // match function didn't support query syntax, - // convert query text to lowercase and remove punctuation characters, - // so that tantivy query parser can parse the query text as plain text - // without syntax - let formatted_query_text: String = query_text - .to_lowercase() - .chars() - .map(|v| if v.is_ascii_punctuation() { ' ' } else { v }) - .collect(); - - self.resolve_search_function(span, column_refs, &formatted_query_text) + let inverted_index_option = self.resolve_search_option(option_arg)?; + + self.resolve_search_function(span, column_refs, query_text, inverted_index_option) } /// Resolve query search function. @@ -2244,8 +2238,8 @@ impl<'a> TypeChecker<'a> { .set_span(span)); } - // TODO: support options field - if args.len() != 1 { + // The optional second argument is additional configuration option. 
+ if args.len() != 1 && args.len() != 2 { return Err(ErrorCode::SemanticError(format!( "invalid arguments for search function, {} expects 1 argument, but got {}", func_name, @@ -2255,6 +2249,7 @@ impl<'a> TypeChecker<'a> { } let query_arg = args[0]; + let option_arg = if args.len() == 2 { Some(args[1]) } else { None }; let box (query_scalar, _) = self.resolve(query_arg)?; let Ok(query_expr) = ConstantExpr::try_from(query_scalar.clone()) else { @@ -2262,14 +2257,14 @@ impl<'a> TypeChecker<'a> { "invalid arguments for search function, query text must be a constant string, but got {}", query_arg )) - .set_span(query_scalar.span())); + .set_span(query_scalar.span())); }; let Some(query_text) = query_expr.value.as_string() else { return Err(ErrorCode::SemanticError(format!( "invalid arguments for search function, query text must be a constant string, but got {}", query_arg )) - .set_span(query_scalar.span())); + .set_span(query_scalar.span())); }; let field_strs: Vec<&str> = query_text.split(' ').collect(); @@ -2305,8 +2300,105 @@ impl<'a> TypeChecker<'a> { }; column_refs.push((column_ref, None)); } + let inverted_index_option = self.resolve_search_option(option_arg)?; + + self.resolve_search_function(span, column_refs, query_text, inverted_index_option) + } + + fn resolve_search_option( + &mut self, + option_arg: Option<&Expr>, + ) -> Result> { + if let Some(option_arg) = option_arg { + let box (option_scalar, _) = self.resolve(option_arg)?; + let Ok(option_expr) = ConstantExpr::try_from(option_scalar.clone()) else { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, option must be a constant string, but got {}", + option_arg + )) + .set_span(option_scalar.span())); + }; + let Some(option_text) = option_expr.value.as_string() else { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, option text must be a constant string, but got {}", + option_arg + )) + .set_span(option_scalar.span())); + }; + + let mut lenient = None; + let mut operator = None; + let mut fuzziness = None; + + // additional configuration options are separated by semicolon `;` + let option_strs: Vec<&str> = option_text.split(';').collect(); + for option_str in option_strs { + if option_str.trim().is_empty() { + continue; + } + let option_vals: Vec<&str> = option_str.split('=').collect(); + if option_vals.len() != 2 { + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, each option must have key and value joined by equal sign, but got {}", + option_arg + )) + .set_span(option_scalar.span())); + } + let option_key = option_vals[0].trim().to_lowercase(); + let option_val = option_vals[1].trim().to_lowercase(); + match option_key.as_str() { + "fuzziness" => { + // fuzziness is only support 1 and 2 currently. 
+ if fuzziness.is_none() { + if option_val == "1" { + fuzziness = Some(1); + continue; + } else if option_val == "2" { + fuzziness = Some(2); + continue; + } + } + } + "operator" => { + if operator.is_none() { + if option_val == "or" { + operator = Some(false); + continue; + } else if option_val == "and" { + operator = Some(true); + continue; + } + } + } + "lenient" => { + if lenient.is_none() { + if option_val == "false" { + lenient = Some(false); + continue; + } else if option_val == "true" { + lenient = Some(true); + continue; + } + } + } + _ => {} + } + return Err(ErrorCode::SemanticError(format!( + "invalid arguments for search function, unsupported option: {}", + option_arg + )) + .set_span(option_scalar.span())); + } - self.resolve_search_function(span, column_refs, query_text) + let inverted_index_option = InvertedIndexOption { + lenient: lenient.unwrap_or_default(), + operator: operator.unwrap_or_default(), + fuzziness, + }; + + return Ok(Some(inverted_index_option)); + } + Ok(None) } fn resolve_search_function( @@ -2314,6 +2406,7 @@ impl<'a> TypeChecker<'a> { span: Span, column_refs: Vec<(BoundColumnRef, Option)>, query_text: &String, + inverted_index_option: Option, ) -> Result> { if column_refs.is_empty() { return Err(ErrorCode::SemanticError( @@ -2400,6 +2493,7 @@ impl<'a> TypeChecker<'a> { query_fields, query_text: query_text.to_string(), has_score: false, + inverted_index_option, }; self.bind_context diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index 69f451ce0a389..20b8950f27dec 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -21,46 +21,30 @@ use databend_storages_common_index::InvertedIndexDirectory; use opendal::Operator; use tantivy::collector::DocSetCollector; use tantivy::collector::TopDocs; -use tantivy::query::QueryParser; -use tantivy::schema::Field; +use tantivy::query::Query; use tantivy::tokenizer::TokenizerManager; use tantivy::Index; -use tantivy::Score; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_directory; use crate::io::read::inverted_index::inverted_index_loader::InvertedIndexFileReader; #[derive(Clone)] pub struct InvertedIndexReader { - has_score: bool, - query_fields: Vec, - query_field_boosts: Vec<(Field, Score)>, directory: InvertedIndexDirectory, - tokenizer_manager: TokenizerManager, } impl InvertedIndexReader { pub async fn try_create( dal: Operator, field_nums: usize, - has_score: bool, need_position: bool, - query_fields: Vec, - query_field_boosts: Vec<(Field, Score)>, - tokenizer_manager: TokenizerManager, index_loc: &str, ) -> Result { let directory = load_inverted_index_directory(dal.clone(), need_position, field_nums, index_loc) .await?; - Ok(Self { - has_score, - query_fields, - query_field_boosts, - directory, - tokenizer_manager, - }) + Ok(Self { directory }) } // Filter the rows and scores in the block that can match the query text, @@ -68,25 +52,20 @@ impl InvertedIndexReader { #[allow(clippy::type_complexity)] pub fn do_filter( self, - query: &str, + has_score: bool, + query: &dyn Query, + tokenizer_manager: TokenizerManager, row_count: u64, ) -> Result)>>> { let start = Instant::now(); let mut index = Index::open(self.directory)?; - index.set_tokenizers(self.tokenizer_manager); + index.set_tokenizers(tokenizer_manager); let reader = index.reader()?; let 
searcher = reader.searcher(); - let mut query_parser = QueryParser::for_index(&index, self.query_fields); - // set optional boost value for the field - for (field, boost) in &self.query_field_boosts { - query_parser.set_field_boost(*field, *boost); - } - let query = query_parser.parse_query(query)?; - - let matched_rows = if self.has_score { + let matched_rows = if has_score { let collector = TopDocs::with_limit(row_count as usize); - let docs = searcher.search(&query, &collector)?; + let docs = searcher.search(query, &collector)?; let mut matched_rows = Vec::with_capacity(docs.len()); for (score, doc_addr) in docs { @@ -97,7 +76,7 @@ impl InvertedIndexReader { matched_rows } else { let collector = DocSetCollector; - let docs = searcher.search(&query, &collector)?; + let docs = searcher.search(query, &collector)?; let mut matched_rows = Vec::with_capacity(docs.len()); for doc_addr in docs { diff --git a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs index def030cb4cef6..5240f851becce 100644 --- a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs +++ b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs @@ -19,10 +19,10 @@ use databend_common_catalog::plan::PushDownInfo; use databend_common_exception::Result; use databend_common_expression::types::F32; use opendal::Operator; +use tantivy::query::Query; use tantivy::query::QueryParser; use tantivy::schema::Field; use tantivy::tokenizer::TokenizerManager; -use tantivy::Score; use crate::io::create_index_schema; use crate::io::create_tokenizer_manager; @@ -49,13 +49,13 @@ use crate::io::TableMetaLocationGenerator; // pub struct InvertedIndexPruner { dal: Operator, - field_nums: usize, has_score: bool, + field_nums: usize, + index_name: String, + index_version: String, need_position: bool, - query_fields: Vec, - query_field_boosts: Vec<(Field, Score)>, tokenizer_manager: TokenizerManager, - inverted_index_info: InvertedIndexInfo, + query: Box, } impl InvertedIndexPruner { @@ -65,27 +65,8 @@ impl InvertedIndexPruner { ) -> Result>> { let inverted_index_info = push_down.as_ref().and_then(|p| p.inverted_index.as_ref()); if let Some(inverted_index_info) = inverted_index_info { - // collect query fields and optional boosts - let mut query_fields = Vec::with_capacity(inverted_index_info.query_fields.len()); - let mut query_field_boosts = Vec::with_capacity(inverted_index_info.query_fields.len()); - for (field_name, boost) in &inverted_index_info.query_fields { - let i = inverted_index_info.index_schema.index_of(field_name)?; - let field = Field::from_field_id(i as u32); - query_fields.push(field); - if let Some(boost) = boost { - query_field_boosts.push((field, boost.0)); - } - } + let (query, tokenizer_manager) = create_inverted_index_query(inverted_index_info)?; - // parse query text to check whether has phrase terms need position file. 
- let (index_schema, index_fields) = create_index_schema( - Arc::new(inverted_index_info.index_schema.clone()), - &inverted_index_info.index_options, - )?; - let tokenizer_manager = create_tokenizer_manager(&inverted_index_info.index_options); - let query_parser = - QueryParser::new(index_schema, index_fields, tokenizer_manager.clone()); - let query = query_parser.parse_query(&inverted_index_info.query_text)?; let mut need_position = false; query.query_terms(&mut |_, pos| { if pos { @@ -95,16 +76,18 @@ impl InvertedIndexPruner { // whether need to generate score internl column let has_score = inverted_index_info.has_score; let field_nums = inverted_index_info.index_schema.num_fields(); + let index_name = inverted_index_info.index_name.clone(); + let index_version = inverted_index_info.index_version.clone(); return Ok(Some(Arc::new(InvertedIndexPruner { dal, - field_nums, has_score, + field_nums, + index_name, + index_version, need_position, - query_fields, - query_field_boosts, tokenizer_manager, - inverted_index_info: inverted_index_info.clone(), + query, }))); } Ok(None) @@ -118,25 +101,93 @@ impl InvertedIndexPruner { ) -> Result)>>> { let index_loc = TableMetaLocationGenerator::gen_inverted_index_location_from_block_location( block_loc, - &self.inverted_index_info.index_name, - &self.inverted_index_info.index_version, + &self.index_name, + &self.index_version, ); let inverted_index_reader = InvertedIndexReader::try_create( self.dal.clone(), self.field_nums, - self.has_score, self.need_position, - self.query_fields.clone(), - self.query_field_boosts.clone(), - self.tokenizer_manager.clone(), &index_loc, ) .await?; - let matched_rows = - inverted_index_reader.do_filter(&self.inverted_index_info.query_text, row_count)?; + let matched_rows = inverted_index_reader.do_filter( + self.has_score, + &self.query, + self.tokenizer_manager.clone(), + row_count, + )?; Ok(matched_rows) } } + +// create tantivy query for inverted index. +pub fn create_inverted_index_query( + inverted_index_info: &InvertedIndexInfo, +) -> Result<(Box, TokenizerManager)> { + // collect query fields and optional boosts + let mut query_fields = Vec::with_capacity(inverted_index_info.query_fields.len()); + let mut query_field_boosts = Vec::with_capacity(inverted_index_info.query_fields.len()); + for (field_name, boost) in &inverted_index_info.query_fields { + let i = inverted_index_info.index_schema.index_of(field_name)?; + let field = Field::from_field_id(i as u32); + query_fields.push(field); + if let Some(boost) = boost { + query_field_boosts.push((field, boost.0)); + } + } + + // parse query text to check whether has phrase terms need position file. + let (index_schema, _) = create_index_schema( + Arc::new(inverted_index_info.index_schema.clone()), + &inverted_index_info.index_options, + )?; + let tokenizer_manager = create_tokenizer_manager(&inverted_index_info.index_options); + let mut query_parser = QueryParser::new( + index_schema, + query_fields.clone(), + tokenizer_manager.clone(), + ); + + // set optional boost value for the field + for (field, boost) in query_field_boosts { + query_parser.set_field_boost(field, boost); + } + let fuzziness = inverted_index_info + .inverted_index_option + .as_ref() + .and_then(|o| o.fuzziness.as_ref()); + if let Some(fuzziness) = fuzziness { + // Fuzzy query matches rows containing a specific term that is within Levenshtein distance. 
+ for field in query_fields { + query_parser.set_field_fuzzy(field, false, *fuzziness, true); + } + } + let operator = inverted_index_info + .inverted_index_option + .as_ref() + .map(|o| o.operator) + .unwrap_or_default(); + if operator { + // Operator if TRUE means operator is `AND`, + // set compose queries to a conjunction. + query_parser.set_conjunction_by_default(); + } + let lenient = inverted_index_info + .inverted_index_option + .as_ref() + .map(|o| o.lenient) + .unwrap_or_default(); + let query = if lenient { + // If lenient is TRUE, invalid query text will not report an error. + let (query, _) = query_parser.parse_query_lenient(&inverted_index_info.query_text); + query + } else { + query_parser.parse_query(&inverted_index_info.query_text)? + }; + + Ok((query, tokenizer_manager)) +} diff --git a/src/query/storages/fuse/src/pruning/mod.rs b/src/query/storages/fuse/src/pruning/mod.rs index 782c529ce52b6..e619c2b16e24e 100644 --- a/src/query/storages/fuse/src/pruning/mod.rs +++ b/src/query/storages/fuse/src/pruning/mod.rs @@ -25,6 +25,7 @@ pub use bloom_pruner::BloomPruner; pub use bloom_pruner::BloomPrunerCreator; pub use fuse_pruner::FusePruner; pub use fuse_pruner::PruningContext; +pub use inverted_index_pruner::create_inverted_index_query; pub use inverted_index_pruner::InvertedIndexPruner; pub use pruner_location::create_segment_location_vector; pub use pruner_location::SegmentLocation; diff --git a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test index 06637da50e609..0759f20c17135 100644 --- a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test +++ b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test @@ -46,21 +46,42 @@ query SELECT id, score(), content FROM t WHERE match(content, 'test') ---- -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, 'the') ---- -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, 'fly') ---- 5 2.4594712 Time flies like an arrow; fruit flies like a banana -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, 'word') ---- 2 1.5948367 A picture is worth a thousand words 4 1.6550698 Actions speak louder than words +query IFT +SELECT id, score(), content FROM t WHERE match(content, 'box') +---- + +query IFT +SELECT id, score(), content FROM t WHERE match(content, 'box', 'fuzziness=1') +---- +1 1.0 The quick brown fox jumps over the lazy dog + +query IFT +SELECT id, score(), content FROM t WHERE match(content, 'action works', 'fuzziness=1') +---- +2 1.0 A picture is worth a thousand words +3 1.0 The early bird catches the worm +4 2.0 Actions speak louder than words + +query IFT +SELECT id, score(), content FROM t WHERE match(content, 'action works', 'fuzziness=1;operator=AND') +---- +4 2.0 Actions speak louder than words + statement ok INSERT INTO t VALUES (11, '我喜欢在周末的时候去公园散步,感受大自然的美丽。'), @@ -87,7 +108,7 @@ INSERT INTO t VALUES (29, '每个人的人生都是一部独特的传奇,我们需要珍惜每一个瞬间,用心去感受生活的美好。'), (30, '张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。') -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() ---- 21 1.1111465 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 @@ -96,23 +117,29 @@ SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score 12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 -query T +query IFT SELECT id, score(), content FROM t 
WHERE match(content, '北京') ORDER BY score() ---- 30 1.7396812 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 12 1.9475443 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '北京大学') ORDER BY score() ---- 30 5.2190437 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 -query T +query IFT +SELECT id, score(), content FROM t WHERE match(content, '北京 大', 'fuzziness=1;operator=AND') ORDER BY id +---- +12 2.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +30 2.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 + +query IFT SELECT id, score(), content FROM t WHERE match(content, '文化博大精深') ORDER BY score() ---- 28 7.61753 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '文化 博大精深') ORDER BY score() ---- 21 1.1111465 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 @@ -120,25 +147,33 @@ SELECT id, score(), content FROM t WHERE match(content, '文化 博大精深') O 15 2.063777 中国的茶文化源远流长,品茶已经成为一种生活方式。 28 7.61753 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '化博') ORDER BY score() ---- -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '。') ORDER BY score() ---- -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '不存在') ORDER BY score() ---- +statement error 1903 +SELECT id, score(), content FROM t WHERE match(content, '()') + +query IFT +SELECT id, score(), content FROM t WHERE match(content, '()', 'lenient=true') +---- + + statement ok UPDATE t SET content = '科技创新是推动社会进步的重要动力,我们应该积极支持和推动科技创新。' WHERE id=24 -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() ---- 21 1.423108 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 @@ -146,7 +181,7 @@ SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score 15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 28 1.5707673 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '科技') ORDER BY score() ---- 13 2.1947646 随着科技的发展,人们的生活变得越来越便利。 @@ -155,7 +190,7 @@ SELECT id, score(), content FROM t WHERE match(content, '科技') ORDER BY score statement ok DELETE FROM t WHERE id=21 -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() ---- 12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 @@ -169,7 +204,7 @@ CREATE OR REPLACE INVERTED INDEX idx1 ON t(content) tokenizer = 'chinese' index_ statement ok REFRESH INVERTED INDEX idx1 ON t -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, 'the') ---- 1 0.8323383 The quick brown fox jumps over the lazy dog @@ -177,7 +212,7 @@ SELECT id, score(), content FROM t WHERE match(content, 'the') 6 0.8788376 Beauty is in the eye of the beholder 10 0.8788376 An apple a day keeps the doctor away -query T +query IFT SELECT id, score(), content FROM t WHERE match(content, 'fly') ---- @@ -263,6 +298,15 @@ SELECT id, score(), title FROM books WHERE query('title:python') ORDER BY score( 6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) 13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +query IFT +SELECT id, score(), title FROM books WHERE query('title:pyth', 'fuzziness=2') ORDER BY id +---- +2 1.0 Python深度学习(第2版) +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX + query IFT SELECT id, score(), title FROM books WHERE 
query('title:python OR rust') ORDER BY score() DESC ----