Skip to content

Commit

Permalink
Merge #311
Browse files Browse the repository at this point in the history
311: fix: Segment number into word instead of chars (#271) r=ManyTheFish a=dqkqd

# Pull Request

## Related issue
Fixes #271 

## What does this PR do?
- Try to return `Match` if a string can be represented as a number in `AhoSegmentedStrIter`
- Add numbers to existing testcases
- Add 2 more test cases for segmenting and tokenizing numbers: `segmenter_segment_number` and `tokenize_number`

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Khanh Duong Quoc <[email protected]>
  • Loading branch information
meili-bors[bot] and dqkqd authored Oct 14, 2024
2 parents d06a9aa + 588439c commit 1b48ada
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 19 deletions.
10 changes: 9 additions & 1 deletion charabia/src/segmenter/arabic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ mod test {
use crate::segmenter::test::test_segmenter;

// Original version of the text.
const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال)";
const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال) 123 456";

// Segmented version of the text.
const SEGMENTED: &[&str] = &[
Expand All @@ -61,6 +61,10 @@ mod test {
"ٱل",
"أحوال",
")",
" ",
"123",
" ",
"456",
];

// Segmented and normalized version of the text.
Expand Down Expand Up @@ -88,6 +92,10 @@ mod test {
"ال",
"احوال",
")",
" ",
"123",
" ",
"456",
];

// Macro that run several tests on the Segmenter.
Expand Down
14 changes: 13 additions & 1 deletion charabia/src/segmenter/chinese.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ mod test {

// Original version of the text.
const TEXT: &str =
"人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。";
"人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待 123 456。";

// Segmented version of the text.
const SEGMENTED: &[&str] = &[
Expand Down Expand Up @@ -60,6 +60,10 @@ mod test {
"互相",
"對",
"待",
" ",
"123",
" ",
"456",
"。",
];

Expand Down Expand Up @@ -97,6 +101,10 @@ mod test {
"hùxiāng",
"duì",
"dài",
" ",
"123",
" ",
"456",
"。",
];

Expand Down Expand Up @@ -133,6 +141,10 @@ mod test {
"互相",
"對",
"待",
" ",
"123",
" ",
"456",
"。",
];

Expand Down
10 changes: 9 additions & 1 deletion charabia/src/segmenter/german.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str =
"Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg.";
"Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg 123 456.";

const SEGMENTED: &[&str] = &[
"Der",
Expand All @@ -52,6 +52,10 @@ mod test {
"brücke",
" ",
"Magdeburg",
" ",
"123",
" ",
"456",
".",
];

Expand All @@ -78,6 +82,10 @@ mod test {
"brucke",
" ",
"magdeburg",
" ",
"123",
" ",
"456",
".",
];

Expand Down
18 changes: 17 additions & 1 deletion charabia/src/segmenter/japanese.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ impl Segmenter for JapaneseSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";
const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち 123 456";

const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
Expand All @@ -57,6 +57,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
Expand All @@ -74,6 +78,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else {
&[]
Expand All @@ -98,6 +106,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
Expand All @@ -122,6 +134,10 @@ mod test {
"もも",
"の",
"うち",
" ",
"123",
" ",
"456",
]
} else {
&[]
Expand Down
6 changes: 3 additions & 3 deletions charabia/src/segmenter/khmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,13 @@ mod test {
use crate::segmenter::test::test_segmenter;

// Original version of the text.
const TEXT: &str = "សួស្តីពិភពលោក";
const TEXT: &str = "សួស្តីពិភពលោក 123 456";

// Segmented version of the text.
const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក"];
const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"];

// Segmented and normalized version of the text.
const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក"];
const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"];

// Macro that run several tests on the Segmenter.
test_segmenter!(KhmerSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Khmer, Language::Khm);
Expand Down
40 changes: 35 additions & 5 deletions charabia/src/segmenter/korean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,43 @@ impl Segmenter for KoreanSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str = "한국어의형태해석을실시할수있습니다.";
const TEXT: &str = "한국어의형태해석을실시할수있습니다 123 456.";

const SEGMENTED: &[&str] =
&["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."];
const SEGMENTED: &[&str] = &[
"한국어",
"의",
"형태",
"해석",
"을",
"실시",
"할",
"수",
"있",
"습니다",
" ",
"123",
" ",
"456",
".",
];

const TOKENIZED: &[&str] =
&["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."];
const TOKENIZED: &[&str] = &[
"한국어",
"의",
"형태",
"해석",
"을",
"실시",
"할",
"수",
"있",
"습니다",
" ",
"123",
" ",
"456",
".",
];

// Macro that run several tests on the Segmenter.
test_segmenter!(KoreanSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Hangul, Language::Kor);
Expand Down
10 changes: 5 additions & 5 deletions charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str =
"The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case";
"The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case 123 456";

#[rustfmt::skip]
#[cfg(feature = "latin-camelcase")]
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
Expand All @@ -43,7 +43,7 @@ mod test {
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camel", "case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
Expand All @@ -52,7 +52,7 @@ mod test {
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
Expand All @@ -61,7 +61,7 @@ mod test {
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
"snake", "_", "case", " ", "123", " ", "456",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
Expand Down
55 changes: 54 additions & 1 deletion charabia/src/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,12 @@ impl<'o, 'aho> Iterator for AhoSegmentedStrIter<'o, 'aho> {
};

if start < end {
Some((&self.text[start..end], match_type))
let text = &self.text[start..end];
if maybe_number(text) {
Some((text, MatchType::Match))
} else {
Some((text, match_type))
}
} else if end < self.text.len() {
self.next()
} else {
Expand All @@ -254,6 +259,10 @@ impl<'o, 'aho> Iterator for AhoSegmentedStrIter<'o, 'aho> {
}
}

/// Returns `true` if `text` plausibly represents a number: it contains at
/// least one numeric character, and every character is either numeric or
/// ASCII punctuation (covering signs, decimal points, separators, e.g.
/// `"-12.3"` or `"+123"`).
///
/// NOTE: `char::is_numeric` matches any Unicode numeric character, not only
/// ASCII digits, so non-Latin digits also qualify.
fn maybe_number(text: &str) -> bool {
    // Require at least one digit so pure-punctuation strings ("...", "-")
    // are not misclassified as numbers and still go through the regular
    // language segmenter.
    text.chars().any(char::is_numeric)
        && text.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation())
}

enum MatchType {
Interleave,
Match,
Expand Down Expand Up @@ -395,10 +404,22 @@ impl<'o> Segment<'o> for &'o str {
mod test {
macro_rules! test_segmenter {
($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => {
use aho_corasick::{AhoCorasick, MatchKind};
use once_cell::sync::Lazy;
use crate::{Token, Language, Script};
use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO};
use super::*;

const NUMBER_SEPARATOR: &[&str] = &[" "];
const TEXT_NUMBER: &str = "123 -123 +123 12.3 -12.3 +12.3";
const SEGMENTED_NUMBER: &[&str] =
&["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"];
const TOKENIZED_NUMBER: &[&str] =
&["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"];
static NUMBER_SEPARATOR_AHO: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::builder().match_kind(MatchKind::LeftmostLongest).build(NUMBER_SEPARATOR).unwrap()
});

#[test]
fn segmenter_segment_str() {

Expand Down Expand Up @@ -453,6 +474,38 @@ Make sure that normalized text is valid or change the trigger condition of the n
fn segmentor_not_panic_for_random_input(text: String) {
let _ = $segmenter.segment_str(&text).collect::<Vec<_>>();
}

#[test]
fn segmenter_segment_number() {

let segmented_text: Vec<_> = AhoSegmentedStrIter::new(TEXT_NUMBER, &NUMBER_SEPARATOR_AHO).flat_map(|m| match m {
(text, MatchType::Match) => Box::new(Some(text).into_iter()),
(text, MatchType::Interleave) => $segmenter.segment_str(text),
}).collect();
assert_eq!(&segmented_text[..], SEGMENTED_NUMBER, r#"
Segmenter {} didn't segment the text as expected.
help: the `segmented` text provided to `test_segmenter!` does not corresponds to the output of the tested segmenter, it's probably due to a bug in the segmenter or a mistake in the provided segmented text.
"#, stringify!($segmenter));
}

#[test]
fn tokenize_number() {

let mut builder = crate::TokenizerBuilder::default();
builder.separators(NUMBER_SEPARATOR);
let tokenizer = builder.build();
let tokens: Vec<_> = tokenizer.tokenize_with_allow_list(TEXT_NUMBER, Some(&[$language])).collect();
let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect();

assert_eq!(&tokenized_text[..], TOKENIZED_NUMBER, r#"
Global tokenize() function didn't tokenize the text as expected.
help: The normalized version of the segmented text is probably wrong, the used normalizers make unexpeted changes to the provided text.
Make sure that normalized text is valid or change the trigger condition of the noisy normalizers by updating `should_normalize`.
"#);
}

}
}
pub(crate) use test_segmenter;
Expand Down
10 changes: 9 additions & 1 deletion charabia/src/segmenter/thai.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ impl Segmenter for ThaiSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม";
const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม 123 456";

const SEGMENTED: &[&str] = &[
"ภาษาไทย",
Expand All @@ -47,6 +47,10 @@ mod test {
"ด้วย",
"น้ำยา",
"สระผม",
" ",
"123",
" ",
"456",
];

const TOKENIZED: &[&str] = &[
Expand All @@ -67,6 +71,10 @@ mod test {
"ดวย",
"นายา",
"สระผม",
" ",
"123",
" ",
"456",
];
// Macro that run several tests on the Segmenter.
test_segmenter!(ThaiSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Thai, Language::Tha);
Expand Down

0 comments on commit 1b48ada

Please sign in to comment.