Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(query): Inverted index search function support options #16256

Merged
merged 7 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion src/query/catalog/src/plan/pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,22 @@ pub struct PrewhereInfo {
pub virtual_columns: Option<Vec<VirtualColumnInfo>>,
}

/// Inverted index option for additional search functions configuration.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct InvertedIndexOption {
/// Fuzzy query matches terms within the given Levenshtein distance.
/// <https://en.wikipedia.org/wiki/Levenshtein_distance>
/// For example: if fuzziness is 1, and query text is `fox`,
/// the term `box` will be matched.
pub fuzziness: Option<u8>,
/// Operator: true is AND, false is OR, default is OR.
/// For example: query text `happy tax payer` is equal to `happy OR tax OR payer`,
/// but if operator is true, it is equal to `happy AND tax AND payer`.
pub operator: bool,
/// Parse a query leniently, ignore invalid query, default is false.
pub lenient: bool,
}

/// Information about inverted index.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct InvertedIndexInfo {
Expand All @@ -91,8 +107,10 @@ pub struct InvertedIndexInfo {
pub query_fields: Vec<(String, Option<F32>)>,
/// The search query text with query syntax.
pub query_text: String,
/// whether search with score function
/// Whether search with score function.
pub has_score: bool,
/// Optional search configuration option, like fuzziness, lenient, ..
pub inverted_index_option: Option<InvertedIndexOption>,
}

/// Extras is a wrapper for push down items.
Expand Down
105 changes: 47 additions & 58 deletions src/query/ee/tests/it/inverted_index/index_refresh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,20 @@
use std::collections::BTreeMap;

use databend_common_base::base::tokio;
use databend_common_catalog::plan::InvertedIndexInfo;
use databend_common_catalog::table::Table;
use databend_common_catalog::table::TableExt;
use databend_common_exception::Result;
use databend_common_expression::types::DataType;
use databend_common_expression::DataField;
use databend_common_expression::DataSchema;
use databend_common_meta_app::schema::CreateOption;
use databend_common_meta_app::schema::CreateTableIndexReq;
use databend_common_sql::plans::RefreshTableIndexPlan;
use databend_common_storages_fuse::io::read::InvertedIndexReader;
use databend_common_storages_fuse::io::MetaReaders;
use databend_common_storages_fuse::io::TableMetaLocationGenerator;
use databend_common_storages_fuse::pruning::create_inverted_index_query;
use databend_common_storages_fuse::FuseTable;
use databend_common_storages_fuse::TableContext;
use databend_enterprise_inverted_index::get_inverted_index_handler;
Expand All @@ -33,11 +38,6 @@ use databend_query::interpreters::RefreshTableIndexInterpreter;
use databend_query::test_kits::append_string_sample_data;
use databend_query::test_kits::*;
use databend_storages_common_cache::LoadParams;
use tantivy::schema::Field;
use tantivy::tokenizer::LowerCaser;
use tantivy::tokenizer::SimpleTokenizer;
use tantivy::tokenizer::TextAnalyzer;
use tantivy::tokenizer::TokenizerManager;

#[tokio::test(flavor = "multi_thread")]
async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
Expand Down Expand Up @@ -74,7 +74,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
name: index_name.clone(),
column_ids: vec![0, 1],
sync_creation: false,
options,
options: options.clone(),
};

let res = handler.do_create_table_index(catalog.clone(), req).await;
Expand Down Expand Up @@ -127,69 +127,58 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> {
let block_meta = &block_metas[0];

let dal = new_fuse_table.get_operator_ref();
let fields = ["title".to_string(), "content".to_string()];
let query_fields = vec![("title".to_string(), None), ("content".to_string(), None)];
let index_schema = DataSchema::new(vec![
DataField::new("title", DataType::String),
DataField::new("content", DataType::String),
]);

let index_loc = TableMetaLocationGenerator::gen_inverted_index_location_from_block_location(
&block_meta.location.0,
&index_name,
&index_version,
);

let field_nums = fields.len();
let field_nums = query_fields.len();
let has_score = true;
let need_position = false;

let mut query_fields = Vec::with_capacity(fields.len());
let query_field_boosts = Vec::new();
for i in 0..fields.len() {
let field = Field::from_field_id(i as u32);
query_fields.push(field);
let index_reader =
InvertedIndexReader::try_create(dal.clone(), field_nums, need_position, &index_loc).await?;

let queries = vec![
("rust".to_string(), vec![0, 1]),
("java".to_string(), vec![2]),
("data".to_string(), vec![4, 1, 5]),
];

for (query_text, ids) in queries.into_iter() {
let inverted_index_info = InvertedIndexInfo {
index_name: index_name.clone(),
index_version: index_version.clone(),
index_options: options.clone(),
index_schema: index_schema.clone(),
query_fields: query_fields.clone(),
query_text,
has_score,
inverted_index_option: None,
};

let (query, tokenizer_manager) = create_inverted_index_query(&inverted_index_info)?;

let matched_rows = index_reader.clone().do_filter(
has_score,
&query,
tokenizer_manager,
block_meta.row_count,
)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), ids.len());
for (matched_row, id) in matched_rows.iter().zip(ids.iter()) {
assert_eq!(matched_row.0, *id);
}
}
let tokenizer_manager = TokenizerManager::new();
let english_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(LowerCaser)
.build();
tokenizer_manager.register("english", english_analyzer);

let index_reader = InvertedIndexReader::try_create(
dal.clone(),
field_nums,
has_score,
need_position,
query_fields,
query_field_boosts,
tokenizer_manager,
&index_loc,
)
.await?;

let query = "rust";
let matched_rows = index_reader
.clone()
.do_filter(query, block_meta.row_count)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), 2);
assert_eq!(matched_rows[0].0, 0);
assert_eq!(matched_rows[1].0, 1);

let query = "java";
let matched_rows = index_reader
.clone()
.do_filter(query, block_meta.row_count)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), 1);
assert_eq!(matched_rows[0].0, 2);

let query = "data";
let matched_rows = index_reader.do_filter(query, block_meta.row_count)?;
assert!(matched_rows.is_some());
let matched_rows = matched_rows.unwrap();
assert_eq!(matched_rows.len(), 3);
assert_eq!(matched_rows[0].0, 4);
assert_eq!(matched_rows[1].0, 1);
assert_eq!(matched_rows[2].0, 5);

Ok(())
}
15 changes: 15 additions & 0 deletions src/query/ee/tests/it/inverted_index/pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "test".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -564,6 +565,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "save".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -576,6 +578,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "one".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -588,6 +591,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "the".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -600,6 +604,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "光阴".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -612,6 +617,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("idiom".to_string(), None)],
query_text: "人生".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -624,6 +630,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "people".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -636,6 +643,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "bad".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -648,6 +656,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "黄金".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -660,6 +669,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("meaning".to_string(), None)],
query_text: "时间".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -675,6 +685,7 @@ async fn test_block_pruner() -> Result<()> {
],
query_text: "you".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -690,6 +701,7 @@ async fn test_block_pruner() -> Result<()> {
],
query_text: "光阴".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -702,6 +714,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("extras".to_string(), None)],
query_text: "extras.title:Blockchain".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -714,6 +727,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("extras".to_string(), None)],
query_text: "extras.metadata.author:David".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand All @@ -726,6 +740,7 @@ async fn test_block_pruner() -> Result<()> {
query_fields: vec![("extras".to_string(), None)],
query_text: "extras.metadata.tags:技术".to_string(),
has_score: false,
inverted_index_option: None,
}),
..Default::default()
};
Expand Down
Loading
Loading