diff --git a/charabia/src/segmenter/arabic.rs b/charabia/src/segmenter/arabic.rs index c521e0d..1cf567b 100644 --- a/charabia/src/segmenter/arabic.rs +++ b/charabia/src/segmenter/arabic.rs @@ -34,7 +34,7 @@ mod test { use crate::segmenter::test::test_segmenter; // Original version of the text. - const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال)"; + const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال) 123 456"; // Segmented version of the text. const SEGMENTED: &[&str] = &[ @@ -61,6 +61,10 @@ mod test { "ٱل", "أحوال", ")", + " ", + "123", + " ", + "456", ]; // Segmented and normalized version of the text. @@ -88,6 +92,10 @@ mod test { "ال", "احوال", ")", + " ", + "123", + " ", + "456", ]; // Macro that run several tests on the Segmenter. diff --git a/charabia/src/segmenter/chinese.rs b/charabia/src/segmenter/chinese.rs index 9af1ad1..6b7ebf2 100644 --- a/charabia/src/segmenter/chinese.rs +++ b/charabia/src/segmenter/chinese.rs @@ -25,7 +25,7 @@ mod test { // Original version of the text. const TEXT: &str = - "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。"; + "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待 123 456。"; // Segmented version of the text. 
const SEGMENTED: &[&str] = &[ @@ -60,6 +60,10 @@ mod test { "互相", "對", "待", + " ", + "123", + " ", + "456", "。", ]; @@ -97,6 +101,10 @@ mod test { "hùxiāng", "duì", "dài", + " ", + "123", + " ", + "456", "。", ]; @@ -133,6 +141,10 @@ mod test { "互相", "對", "待", + " ", + "123", + " ", + "456", "。", ]; diff --git a/charabia/src/segmenter/german.rs b/charabia/src/segmenter/german.rs index 9ea40e7..8414790 100644 --- a/charabia/src/segmenter/german.rs +++ b/charabia/src/segmenter/german.rs @@ -27,7 +27,7 @@ mod test { use crate::segmenter::test::test_segmenter; const TEXT: &str = - "Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg."; + "Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg 123 456."; const SEGMENTED: &[&str] = &[ "Der", @@ -52,6 +52,10 @@ mod test { "brücke", " ", "Magdeburg", + " ", + "123", + " ", + "456", ".", ]; @@ -78,6 +82,10 @@ mod test { "brucke", " ", "magdeburg", + " ", + "123", + " ", + "456", ".", ]; diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index e19c4e9..a130373 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -40,7 +40,7 @@ impl Segmenter for JapaneseSegmenter { mod test { use crate::segmenter::test::test_segmenter; - const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち"; + const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち 123 456"; const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") { &[ @@ -57,6 +57,10 @@ mod test { "もも", "の", "うち", + " ", + "123", + " ", + "456", ] } else if cfg!(feature = "japanese-segmentation-unidic") { &[ @@ -74,6 +78,10 @@ mod test { "もも", "の", "うち", + " ", + "123", + " ", + "456", ] } else { &[] @@ -98,6 +106,10 @@ mod test { "もも", "の", "うち", + " ", + "123", + " ", + "456", ] } else if cfg!(feature = "japanese-segmentation-unidic") { &[ @@ -122,6 +134,10 @@ mod test { "もも", "の", "うち", + " ", + "123", + " ", + "456", ] } else { &[] diff --git 
a/charabia/src/segmenter/khmer.rs b/charabia/src/segmenter/khmer.rs index 6f0ff02..6eb6e23 100644 --- a/charabia/src/segmenter/khmer.rs +++ b/charabia/src/segmenter/khmer.rs @@ -49,13 +49,13 @@ mod test { use crate::segmenter::test::test_segmenter; // Original version of the text. - const TEXT: &str = "សួស្តីពិភពលោក"; + const TEXT: &str = "សួស្តីពិភពលោក 123 456"; // Segmented version of the text. - const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក"]; + const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"]; // Segmented and normalized version of the text. - const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក"]; + const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"]; // Macro that run several tests on the Segmenter. test_segmenter!(KhmerSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Khmer, Language::Khm); diff --git a/charabia/src/segmenter/korean.rs b/charabia/src/segmenter/korean.rs index 3160492..76b6086 100644 --- a/charabia/src/segmenter/korean.rs +++ b/charabia/src/segmenter/korean.rs @@ -28,13 +28,43 @@ impl Segmenter for KoreanSegmenter { mod test { use crate::segmenter::test::test_segmenter; - const TEXT: &str = "한국어의형태해석을실시할수있습니다."; + const TEXT: &str = "한국어의형태해석을실시할수있습니다 123 456."; - const SEGMENTED: &[&str] = - &["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."]; + const SEGMENTED: &[&str] = &[ + "한국어", + "의", + "형태", + "해석", + "을", + "실시", + "할", + "수", + "있", + "습니다", + " ", + "123", + " ", + "456", + ".", + ]; - const TOKENIZED: &[&str] = - &["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."]; + const TOKENIZED: &[&str] = &[ + "한국어", + "의", + "형태", + "해석", + "을", + "실시", + "할", + "수", + "있", + "습니다", + " ", + "123", + " ", + "456", + ".", + ]; // Macro that run several tests on the Segmenter. 
test_segmenter!(KoreanSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Hangul, Language::Kor); diff --git a/charabia/src/segmenter/latin/mod.rs b/charabia/src/segmenter/latin/mod.rs index b7b7855..982d677 100644 --- a/charabia/src/segmenter/latin/mod.rs +++ b/charabia/src/segmenter/latin/mod.rs @@ -26,7 +26,7 @@ mod test { use crate::segmenter::test::test_segmenter; const TEXT: &str = - "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case"; + "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case 123 456"; #[rustfmt::skip] #[cfg(feature = "latin-camelcase")] @@ -34,7 +34,7 @@ mod test { "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t", " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it", "'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ", - "snake", "_", "case", + "snake", "_", "case", " ", "123", " ", "456", ]; #[rustfmt::skip] @@ -43,7 +43,7 @@ mod test { "the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t", " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it", "'", "s", " ", "29", ".", "3°f", "!", " ", "camel", "case", " ", "kebab", "-", "case", " ", - "snake", "_", "case", + "snake", "_", "case", " ", "123", " ", "456", ]; #[rustfmt::skip] @@ -52,7 +52,7 @@ mod test { "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t", " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it", "'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ", - "snake", "_", "case", + "snake", "_", "case", " ", "123", " ", "456", ]; #[rustfmt::skip] @@ -61,7 +61,7 @@ mod test { "the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t", " ", "jump", " ", "32", ".", 
"3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it", "'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ", - "snake", "_", "case", + "snake", "_", "case", " ", "123", " ", "456", ]; test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng); diff --git a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs index 628c7bc..653afe5 100644 --- a/charabia/src/segmenter/mod.rs +++ b/charabia/src/segmenter/mod.rs @@ -245,7 +245,12 @@ impl<'o, 'aho> Iterator for AhoSegmentedStrIter<'o, 'aho> { }; if start < end { - Some((&self.text[start..end], match_type)) + let text = &self.text[start..end]; + if maybe_number(text) { + Some((text, MatchType::Match)) + } else { + Some((text, match_type)) + } } else if end < self.text.len() { self.next() } else { @@ -254,6 +259,10 @@ impl<'o, 'aho> Iterator for AhoSegmentedStrIter<'o, 'aho> { } } +fn maybe_number(text: &str) -> bool { + text.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation()) +} + enum MatchType { Interleave, Match, @@ -395,10 +404,22 @@ impl<'o> Segment<'o> for &'o str { mod test { macro_rules! 
test_segmenter { ($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => { + use aho_corasick::{AhoCorasick, MatchKind}; + use once_cell::sync::Lazy; use crate::{Token, Language, Script}; use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO}; use super::*; + const NUMBER_SEPARATOR: &[&str] = &[" "]; + const TEXT_NUMBER: &str = "123 -123 +123 12.3 -12.3 +12.3"; + const SEGMENTED_NUMBER: &[&str] = + &["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"]; + const TOKENIZED_NUMBER: &[&str] = + &["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"]; + static NUMBER_SEPARATOR_AHO: Lazy<AhoCorasick> = Lazy::new(|| { + AhoCorasick::builder().match_kind(MatchKind::LeftmostLongest).build(NUMBER_SEPARATOR).unwrap() + }); + #[test] fn segmenter_segment_str() { @@ -453,6 +474,38 @@ Make sure that normalized text is valid or change the trigger condition of the n fn segmentor_not_panic_for_random_input(text: String) { let _ = $segmenter.segment_str(&text).collect::<Vec<_>>(); } + + #[test] + fn segmenter_segment_number() { + + let segmented_text: Vec<_> = AhoSegmentedStrIter::new(TEXT_NUMBER, &NUMBER_SEPARATOR_AHO).flat_map(|m| match m { + (text, MatchType::Match) => Box::new(Some(text).into_iter()), + (text, MatchType::Interleave) => $segmenter.segment_str(text), + }).collect(); + assert_eq!(&segmented_text[..], SEGMENTED_NUMBER, r#" +Segmenter {} didn't segment the text as expected. + +help: the `segmented` text provided to `test_segmenter!` does not corresponds to the output of the tested segmenter, it's probably due to a bug in the segmenter or a mistake in the provided segmented text. 
+"#, stringify!($segmenter)); + } + + #[test] + fn tokenize_number() { + + let mut builder = crate::TokenizerBuilder::default(); + builder.separators(NUMBER_SEPARATOR); + let tokenizer = builder.build(); + let tokens: Vec<_> = tokenizer.tokenize_with_allow_list(TEXT_NUMBER, Some(&[$language])).collect(); + let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect(); + + assert_eq!(&tokenized_text[..], TOKENIZED_NUMBER, r#" +Global tokenize() function didn't tokenize the text as expected. + +help: The normalized version of the segmented text is probably wrong, the used normalizers make unexpeted changes to the provided text. +Make sure that normalized text is valid or change the trigger condition of the noisy normalizers by updating `should_normalize`. +"#); + } + } } pub(crate) use test_segmenter; diff --git a/charabia/src/segmenter/thai.rs b/charabia/src/segmenter/thai.rs index a9488e8..5d72928 100644 --- a/charabia/src/segmenter/thai.rs +++ b/charabia/src/segmenter/thai.rs @@ -27,7 +27,7 @@ impl Segmenter for ThaiSegmenter { mod test { use crate::segmenter::test::test_segmenter; - const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม"; + const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม 123 456"; const SEGMENTED: &[&str] = &[ "ภาษาไทย", @@ -47,6 +47,10 @@ mod test { "ด้วย", "น้ำยา", "สระผม", + " ", + "123", + " ", + "456", ]; const TOKENIZED: &[&str] = &[ @@ -67,6 +71,10 @@ mod test { "ดวย", "นายา", "สระผม", + " ", + "123", + " ", + "456", ]; // Macro that run several tests on the Segmenter. test_segmenter!(ThaiSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Thai, Language::Tha);