Skip to content

Commit

Permalink
Merge #311
Browse files Browse the repository at this point in the history
311: fix: Segment number into word instead of chars (#271) r=ManyTheFish a=dqkqd

# Pull Request

## Related issue
Fixes #271 

## What does this PR do?
- Try to return `Match` if a string can be represented as a number in `AhoSegmentedStrIter`
- Add numbers to existing testcases
- Add 2 more test cases for segmenting and tokenizing numbers: `segmenter_segment_number` and `tokenize_number`

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Khanh Duong Quoc <[email protected]>
  • Loading branch information
meili-bors[bot] and dqkqd authored Oct 14, 2024
2 parents d06a9aa + 588439c commit 1b48ada
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 19 deletions.
10 changes: 9 additions & 1 deletion charabia/src/segmenter/arabic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ mod test {
use crate::segmenter::test::test_segmenter;

// Original version of the text.
const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال)";
const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال) 123 456";

// Segmented version of the text.
const SEGMENTED: &[&str] = &[
Expand All @@ -61,6 +61,10 @@ mod test {
"ٱل",
"أحوال",
")",
" ",
"123",
" ",
"456",
];

// Segmented and normalized version of the text.
Expand Down Expand Up @@ -88,6 +92,10 @@ mod test {
"ال",
"احوال",
")",
" ",
"123",
" ",
"456",
];

// Macro that run several tests on the Segmenter.
Expand Down
14 changes: 13 additions & 1 deletion charabia/src/segmenter/chinese.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ mod test {

// Original version of the text.
const TEXT: &str =
"人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。";
"人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待 123 456。";

// Segmented version of the text.
const SEGMENTED: &[&str] = &[
Expand Down Expand Up @@ -60,6 +60,10 @@ mod test {
"互相",
"對",
"待",
" ",
"123",
" ",
"456",
"。",
];

Expand Down Expand Up @@ -97,6 +101,10 @@ mod test {
"hùxiāng",
"duì",
"dài",
" ",
"123",
" ",
"456",
"。",
];

Expand Down Expand Up @@ -133,6 +141,10 @@ mod test {
"互相",
"對",
"待",
" ",
"123",
" ",
"456",
"。",
];

Expand Down
10 changes: 9 additions & 1 deletion charabia/src/segmenter/german.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str =
"Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg.";
"Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg 123 456.";

const SEGMENTED: &[&str] = &[
"Der",
Expand All @@ -52,6 +52,10 @@ mod test {
"brücke",
" ",
"Magdeburg",
" ",
"123",
" ",
"456",
".",
];

Expand All @@ -78,6 +82,10 @@ mod test {
"brucke",
" ",
"magdeburg",
" ",
"123",
" ",
"456",
".",
];

Expand Down
18 changes: 17 additions & 1 deletion charabia/src/segmenter/japanese.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ impl Segmenter for JapaneseSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";
const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち 123 456";

const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
Expand All @@ -57,6 +57,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
Expand All @@ -74,6 +78,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else {
&[]
Expand All @@ -98,6 +106,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
Expand All @@ -122,6 +134,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else {
&[]
Expand Down
6 changes: 3 additions & 3 deletions charabia/src/segmenter/khmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,13 @@ mod test {
use crate::segmenter::test::test_segmenter;

// Original version of the text.
const TEXT: &str = "សួស្តីពិភពលោក";
const TEXT: &str = "សួស្តីពិភពលោក 123 456";

// Segmented version of the text.
const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក"];
const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"];

// Segmented and normalized version of the text.
const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក"];
const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"];

// Macro that run several tests on the Segmenter.
test_segmenter!(KhmerSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Khmer, Language::Khm);
Expand Down
40 changes: 35 additions & 5 deletions charabia/src/segmenter/korean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,43 @@ impl Segmenter for KoreanSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str = "한국어의형태해석을실시할수있습니다.";
const TEXT: &str = "한국어의형태해석을실시할수있습니다 123 456.";

const SEGMENTED: &[&str] =
&["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."];
const SEGMENTED: &[&str] = &[
"한국어",
"의",
"형태",
"해석",
"을",
"실시",
"할",
"수",
"있",
"습니다",
" ",
"123",
" ",
"456",
".",
];

const TOKENIZED: &[&str] =
&["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."];
const TOKENIZED: &[&str] = &[
"한국어",
"의",
"형태",
"해석",
"을",
"실시",
"할",
"수",
"있",
"습니다",
" ",
"123",
" ",
"456",
".",
];

// Macro that run several tests on the Segmenter.
test_segmenter!(KoreanSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Hangul, Language::Kor);
Expand Down
10 changes: 5 additions & 5 deletions charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str =
"The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case";
"The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case 123 456";

#[rustfmt::skip]
#[cfg(feature = "latin-camelcase")]
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
Expand All @@ -43,7 +43,7 @@ mod test {
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camel", "case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
Expand All @@ -52,7 +52,7 @@ mod test {
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
Expand All @@ -61,7 +61,7 @@ mod test {
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
Expand Down
55 changes: 54 additions & 1 deletion charabia/src/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,12 @@ impl<'o, 'aho> Iterator for AhoSegmentedStrIter<'o, 'aho> {
};

if start < end {
Some((&self.text[start..end], match_type))
let text = &self.text[start..end];
if maybe_number(text) {
Some((text, MatchType::Match))
} else {
Some((text, match_type))
}
} else if end < self.text.len() {
self.next()
} else {
Expand All @@ -254,6 +259,10 @@ impl<'o, 'aho> Iterator for AhoSegmentedStrIter<'o, 'aho> {
}
}

/// Returns `true` if `text` plausibly represents a number: it contains at
/// least one numeric character, and every character is either numeric or
/// ASCII punctuation (covering signs, decimal points, separators, e.g.
/// `"-12.3"` or `"+123"`).
///
/// NOTE: `char::is_numeric` matches any Unicode numeric character, not only
/// ASCII digits, so non-Latin digits also qualify.
fn maybe_number(text: &str) -> bool {
    // Require at least one digit so pure-punctuation strings ("...", "-")
    // are not misclassified as numbers and still go through the regular
    // language segmenter.
    text.chars().any(char::is_numeric)
        && text.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation())
}

enum MatchType {
Interleave,
Match,
Expand Down Expand Up @@ -395,10 +404,22 @@ impl<'o> Segment<'o> for &'o str {
mod test {
macro_rules! test_segmenter {
($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => {
use aho_corasick::{AhoCorasick, MatchKind};
use once_cell::sync::Lazy;
use crate::{Token, Language, Script};
use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO};
use super::*;

const NUMBER_SEPARATOR: &[&str] = &[" "];
const TEXT_NUMBER: &str = "123 -123 +123 12.3 -12.3 +12.3";
const SEGMENTED_NUMBER: &[&str] =
&["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"];
const TOKENIZED_NUMBER: &[&str] =
&["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"];
static NUMBER_SEPARATOR_AHO: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::builder().match_kind(MatchKind::LeftmostLongest).build(NUMBER_SEPARATOR).unwrap()
});

#[test]
fn segmenter_segment_str() {

Expand Down Expand Up @@ -453,6 +474,38 @@ Make sure that normalized text is valid or change the trigger condition of the n
fn segmentor_not_panic_for_random_input(text: String) {
let _ = $segmenter.segment_str(&text).collect::<Vec<_>>();
}

#[test]
fn segmenter_segment_number() {

let segmented_text: Vec<_> = AhoSegmentedStrIter::new(TEXT_NUMBER, &NUMBER_SEPARATOR_AHO).flat_map(|m| match m {
(text, MatchType::Match) => Box::new(Some(text).into_iter()),
(text, MatchType::Interleave) => $segmenter.segment_str(text),
}).collect();
assert_eq!(&segmented_text[..], SEGMENTED_NUMBER, r#"
Segmenter {} didn't segment the text as expected.
help: the `segmented` text provided to `test_segmenter!` does not corresponds to the output of the tested segmenter, it's probably due to a bug in the segmenter or a mistake in the provided segmented text.
"#, stringify!($segmenter));
}

#[test]
fn tokenize_number() {

let mut builder = crate::TokenizerBuilder::default();
builder.separators(NUMBER_SEPARATOR);
let tokenizer = builder.build();
let tokens: Vec<_> = tokenizer.tokenize_with_allow_list(TEXT_NUMBER, Some(&[$language])).collect();
let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect();

assert_eq!(&tokenized_text[..], TOKENIZED_NUMBER, r#"
Global tokenize() function didn't tokenize the text as expected.
help: The normalized version of the segmented text is probably wrong, the used normalizers make unexpeted changes to the provided text.
Make sure that normalized text is valid or change the trigger condition of the noisy normalizers by updating `should_normalize`.
"#);
}

}
}
pub(crate) use test_segmenter;
Expand Down
10 changes: 9 additions & 1 deletion charabia/src/segmenter/thai.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ impl Segmenter for ThaiSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม";
const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม 123 456";

const SEGMENTED: &[&str] = &[
"ภาษาไทย",
Expand All @@ -47,6 +47,10 @@ mod test {
"ด้วย",
"น้ำยา",
"สระผม",
" ",
"123",
" ",
"456",
];

const TOKENIZED: &[&str] = &[
Expand All @@ -67,6 +71,10 @@ mod test {
"ดวย",
"นายา",
"สระผม",
" ",
"123",
" ",
"456",
];
// Macro that run several tests on the Segmenter.
test_segmenter!(ThaiSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Thai, Language::Tha);
Expand Down

0 comments on commit 1b48ada

Please sign in to comment.