fix: Segment number into word instead of chars (#271) #311

Merged · 1 commit · Oct 14, 2024
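This PR makes the tokenizer treat digit runs as whole words: a span that looks numeric is now emitted as a single match instead of being handed to the script-specific segmenter, which could previously split a number such as 123 into the characters 1, 2, 3 (the behavior reported in #271). A minimal sketch of the intended result, assuming charabia's public `Segment` trait as it appears in the charabia/src/segmenter/mod.rs diff below; the sample sentence is illustrative only:

    use charabia::Segment;

    fn main() {
        // With this fix, digit runs embedded in CJK text come back as
        // whole segments rather than one segment per character.
        let segments: Vec<&str> = "他們賦有理性 123 456".segment_str().collect();
        assert!(segments.contains(&"123"));
        assert!(segments.contains(&"456"));
    }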
10 changes: 9 additions & 1 deletion charabia/src/segmenter/arabic.rs
@@ -34,7 +34,7 @@ mod test {
use crate::segmenter::test::test_segmenter;

// Original version of the text.
- const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال)";
+ const TEXT: &str = "السلام عليكم، كيف حالكم؟ (أتمنى أن تكونوا بأفضل ٱلأحوال) 123 456";

// Segmented version of the text.
const SEGMENTED: &[&str] = &[
@@ -61,6 +61,10 @@
"ٱل",
"أحوال",
")",
+ " ",
+ "123",
+ " ",
+ "456",
];

// Segmented and normalized version of the text.
@@ -88,6 +92,10 @@
"ال",
"احوال",
")",
+ " ",
+ "123",
+ " ",
+ "456",
];

// Macro that run several tests on the Segmenter.
14 changes: 13 additions & 1 deletion charabia/src/segmenter/chinese.rs
@@ -25,7 +25,7 @@ mod test {

// Original version of the text.
const TEXT: &str =
-     "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。";
+     "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待 123 456。";

// Segmented version of the text.
const SEGMENTED: &[&str] = &[
@@ -60,6 +60,10 @@
"互相",
"對",
"待",
+ " ",
+ "123",
+ " ",
+ "456",
"。",
];

@@ -97,6 +101,10 @@
"hùxiāng",
"duì",
"dài",
+ " ",
+ "123",
+ " ",
+ "456",
"。",
];

@@ -133,6 +141,10 @@
"互相",
"對",
"待",
+ " ",
+ "123",
+ " ",
+ "456",
"。",
];

10 changes: 9 additions & 1 deletion charabia/src/segmenter/german.rs
@@ -27,7 +27,7 @@ mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str =
-     "Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg.";
+     "Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg 123 456.";

const SEGMENTED: &[&str] = &[
"Der",
@@ -52,6 +52,10 @@
"brücke",
" ",
"Magdeburg",
+ " ",
+ "123",
+ " ",
+ "456",
".",
];

@@ -78,6 +82,10 @@
"brucke",
" ",
"magdeburg",
+ " ",
+ "123",
+ " ",
+ "456",
".",
];

18 changes: 17 additions & 1 deletion charabia/src/segmenter/japanese.rs
@@ -40,7 +40,7 @@ impl Segmenter for JapaneseSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

- const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";
+ const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち 123 456";

const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
@@ -57,6 +57,10 @@
"もも",
"の",
"うち",
+ " ",
+ "123",
+ " ",
+ "456",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
@@ -74,6 +78,10 @@
"もも",
"の",
"うち",
+ " ",
+ "123",
+ " ",
+ "456",
]
} else {
&[]
@@ -98,6 +106,10 @@
"もも",
"の",
"うち",
+ " ",
+ "123",
+ " ",
+ "456",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
@@ -122,6 +134,10 @@
"もも",
"の",
"うち",
+ " ",
+ "123",
+ " ",
+ "456",
]
} else {
&[]
6 changes: 3 additions & 3 deletions charabia/src/segmenter/khmer.rs
@@ -49,13 +49,13 @@ mod test {
use crate::segmenter::test::test_segmenter;

// Original version of the text.
- const TEXT: &str = "សួស្តីពិភពលោក";
+ const TEXT: &str = "សួស្តីពិភពលោក 123 456";

// Segmented version of the text.
- const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក"];
+ const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"];

// Segmented and normalized version of the text.
- const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក"];
+ const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក", " ", "123", " ", "456"];

// Macro that run several tests on the Segmenter.
test_segmenter!(KhmerSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Khmer, Language::Khm);
40 changes: 35 additions & 5 deletions charabia/src/segmenter/korean.rs
@@ -28,13 +28,43 @@ impl Segmenter for KoreanSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

- const TEXT: &str = "한국어의형태해석을실시할수있습니다.";
+ const TEXT: &str = "한국어의형태해석을실시할수있습니다 123 456.";

- const SEGMENTED: &[&str] =
-     &["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."];
+ const SEGMENTED: &[&str] = &[
+     "한국어",
+     "의",
+     "형태",
+     "해석",
+     "을",
+     "실시",
+     "할",
+     "수",
+     "있",
+     "습니다",
+     " ",
+     "123",
+     " ",
+     "456",
+     ".",
+ ];

- const TOKENIZED: &[&str] =
-     &["한국어", "의", "형태", "해석", "을", "실시", "할", "수", "있", "습니다", "."];
+ const TOKENIZED: &[&str] = &[
+     "한국어",
+     "의",
+     "형태",
+     "해석",
+     "을",
+     "실시",
+     "할",
+     "수",
+     "있",
+     "습니다",
+     " ",
+     "123",
+     " ",
+     "456",
+     ".",
+ ];

// Macro that run several tests on the Segmenter.
test_segmenter!(KoreanSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Hangul, Language::Kor);
10 changes: 5 additions & 5 deletions charabia/src/segmenter/latin/mod.rs
@@ -26,15 +26,15 @@ mod test {
use crate::segmenter::test::test_segmenter;

const TEXT: &str =
-     "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case";
+     "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case 123 456";

#[rustfmt::skip]
#[cfg(feature = "latin-camelcase")]
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ",
- "snake", "_", "case",
+ "snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
@@ -43,7 +43,7 @@ mod test {
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camel", "case", " ", "kebab", "-", "case", " ",
- "snake", "_", "case",
+ "snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
@@ -52,7 +52,7 @@ mod test {
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ",
- "snake", "_", "case",
+ "snake", "_", "case", " ", "123", " ", "456",
];

#[rustfmt::skip]
@@ -61,7 +61,7 @@ mod test {
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ",
- "snake", "_", "case",
+ "snake", "_", "case", " ", "123", " ", "456",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
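A detail worth noting from the Latin fixtures above: under the default separator list, 32.3 still segments as "32", ".", "3", apparently because the dot is itself a separator and splits the span before the numeric check can see it. The dedicated number tests in mod.rs below only get whole tokens like "12.3" because they narrow the separators to a single space.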
55 changes: 54 additions & 1 deletion charabia/src/segmenter/mod.rs
@@ -245,7 +245,12 @@ impl<'o, 'aho> Iterator for AhoSegmentedStrIter<'o, 'aho> {
};

if start < end {
-     Some((&self.text[start..end], match_type))
+     let text = &self.text[start..end];
+     if maybe_number(text) {
+         Some((text, MatchType::Match))
+     } else {
+         Some((text, match_type))
+     }
} else if end < self.text.len() {
self.next()
} else {
@@ -254,6 +259,10 @@
}
}

+ fn maybe_number(text: &str) -> bool {
+     text.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation())
+ }

enum MatchType {
Interleave,
Match,
@@ -395,10 +404,22 @@ impl<'o> Segment<'o> for &'o str {
mod test {
macro_rules! test_segmenter {
($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => {
+ use aho_corasick::{AhoCorasick, MatchKind};
+ use once_cell::sync::Lazy;
use crate::{Token, Language, Script};
use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO};
use super::*;

+ const NUMBER_SEPARATOR: &[&str] = &[" "];
+ const TEXT_NUMBER: &str = "123 -123 +123 12.3 -12.3 +12.3";
+ const SEGMENTED_NUMBER: &[&str] =
+     &["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"];
+ const TOKENIZED_NUMBER: &[&str] =
+     &["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"];
+ static NUMBER_SEPARATOR_AHO: Lazy<AhoCorasick> = Lazy::new(|| {
+     AhoCorasick::builder().match_kind(MatchKind::LeftmostLongest).build(NUMBER_SEPARATOR).unwrap()
+ });

#[test]
fn segmenter_segment_str() {

@@ -453,6 +474,38 @@
fn segmentor_not_panic_for_random_input(text: String) {
let _ = $segmenter.segment_str(&text).collect::<Vec<_>>();
}

+ #[test]
+ fn segmenter_segment_number() {
+     let segmented_text: Vec<_> = AhoSegmentedStrIter::new(TEXT_NUMBER, &NUMBER_SEPARATOR_AHO).flat_map(|m| match m {
+         (text, MatchType::Match) => Box::new(Some(text).into_iter()),
+         (text, MatchType::Interleave) => $segmenter.segment_str(text),
+     }).collect();
+     assert_eq!(&segmented_text[..], SEGMENTED_NUMBER, r#"
+ Segmenter {} didn't segment the text as expected.
+
+ help: the `segmented` text provided to `test_segmenter!` does not correspond to the output of the tested segmenter; this is probably due to a bug in the segmenter or a mistake in the provided segmented text.
+ "#, stringify!($segmenter));
+ }
+
+ #[test]
+ fn tokenize_number() {
+     let mut builder = crate::TokenizerBuilder::default();
+     builder.separators(NUMBER_SEPARATOR);
+     let tokenizer = builder.build();
+     let tokens: Vec<_> = tokenizer.tokenize_with_allow_list(TEXT_NUMBER, Some(&[$language])).collect();
+     let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect();
+
+     assert_eq!(&tokenized_text[..], TOKENIZED_NUMBER, r#"
+ Global tokenize() function didn't tokenize the text as expected.
+
+ help: the normalized version of the segmented text is probably wrong; the normalizers in use make unexpected changes to the provided text.
+ Make sure that normalized text is valid or change the trigger condition of the noisy normalizers by updating `should_normalize`.
+ "#);
+ }

}
}
pub(crate) use test_segmenter;
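The heart of the change is the `maybe_number` predicate shown above: a span sitting between separator matches is emitted as one `Match` when every character is numeric or ASCII punctuation, which keeps signed and decimal forms such as -12.3 intact. A standalone sketch of what the predicate accepts, mirroring the function in the diff:

    fn maybe_number(text: &str) -> bool {
        text.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation())
    }

    fn main() {
        assert!(maybe_number("123"));
        assert!(maybe_number("-12.3"));   // '-' and '.' are ASCII punctuation
        assert!(maybe_number("+123"));
        assert!(!maybe_number("abc123")); // any letter disqualifies the span
        assert!(maybe_number("--"));      // pure punctuation also passes, hence "maybe"
    }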
10 changes: 9 additions & 1 deletion charabia/src/segmenter/thai.rs
@@ -27,7 +27,7 @@ impl Segmenter for ThaiSegmenter {
mod test {
use crate::segmenter::test::test_segmenter;

- const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม";
+ const TEXT: &str = "ภาษาไทยง่ายนิดเดียว ไก่ขันตอนเช้าบนขันน้ำ ฉันสระผมที่สระน้ำด้วยน้ำยาสระผม 123 456";

const SEGMENTED: &[&str] = &[
"ภาษาไทย",
@@ -47,6 +47,10 @@
"ด้วย",
"น้ำยา",
"สระผม",
+ " ",
+ "123",
+ " ",
+ "456",
];

const TOKENIZED: &[&str] = &[
@@ -67,6 +71,10 @@
"ดวย",
"นายา",
"สระผม",
+ " ",
+ "123",
+ " ",
+ "456",
];
// Macro that run several tests on the Segmenter.
test_segmenter!(ThaiSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Thai, Language::Tha);
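For reference, an end-to-end sketch of the contract the new tokenize_number test pins down, written against the public TokenizerBuilder API with the same space-only separator override; the expected output is taken from TOKENIZED_NUMBER above:

    use charabia::TokenizerBuilder;

    fn main() {
        let mut builder = TokenizerBuilder::default();
        // Same override as the new test: only a plain space separates
        // tokens, so signs and decimal points stay inside the number.
        builder.separators(&[" "]);
        let tokenizer = builder.build();

        let lemmas: Vec<String> = tokenizer
            .tokenize("123 -123 +123 12.3 -12.3 +12.3")
            .map(|t| t.lemma().to_string())
            .collect();

        // Expected: ["123", " ", "-123", " ", "+123", " ", "12.3", " ", "-12.3", " ", "+12.3"]
        println!("{lemmas:?}");
    }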