Merge #203

203: add khmer segmenter r=ManyTheFish a=xshadowlegendx # Pull Request ## Related issue Fixes #200 ## What does this PR do? - add segmenter for `khmer` language ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: xshadowlegendx <[email protected]>
meilisearch · Oct 18, 2023 · 6364941 · 6364941
2 parents 86b827a + fc3c2cd
commit 6364941
Show file tree

Hide file tree

Showing 5 changed files with 108 additions and 1 deletion.
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
@@ -33,9 +33,14 @@ pinyin = { version = "0.9", default-features = false, features = [
 wana_kana = { version = "3.0.0", optional = true }
 unicode-normalization = "0.1.22"
 irg-kvariants = "0.1.0"
+litemap = "0.6.1"
+zerovec = "0.9.3"
+icu = { version = "1.3.0", features = ["serde"] , optional = true }
+icu_provider_blob = { version = "1.3.0", optional = true }
+icu_provider = { version = "1.3.0", features = ["sync"], optional = true }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]
 
 # allow chinese specialized tokenization
 chinese = ["dep:pinyin", "dep:jieba-rs"]
@@ -61,6 +66,8 @@ greek = []
 # allow splitting camelCase latin words
 latin-camelcase = ["dep:finl_unicode"]
 
+khmer = ["dep:icu", "dep:icu_provider_blob", "dep:icu_provider"]
+
 # allow splitting snake_case latin words
 latin-snakecase = ["dep:finl_unicode"]
 

diff --git a/charabia/benches/bench.rs b/charabia/benches/bench.rs
@@ -14,6 +14,7 @@ static DATA_SET: &[((usize, Script, Language), &str)] = &[
     ((132, Script::Thai, Language::Tha), "ไก่จิกเด็กตายเด็กตายบนปากโอ่งไก่อะไรวะโหดจัง"),
     ((132, Script::Hangul, Language::Kor), "제119조 ① 대한민국의 경제질서는 개인과 기업의 경제상의 자유와 창의를 존중함을 기본으로 한다."),
     ((130, Script::Greek, Language::Ell), "Οι θερμοκρασίες είναι σπάνια υπερβολικές στις παραθαλάσσιες περιοχές."),
+    ((132, Script::Khmer, Language::Khm), "ធ្វេីមនុស្សត្រូវចេះស្រលាញ់នឹងជួយគ្នាទៅវិញទៅមក ព្រោះពិភពលោកនេះមានទុកច្រេីនហេីយគួយតែមានអំពេីល្អច្រេីនមិនថាជួយបាន១រឺ២នាក់ច្រេីនរឺតិចទេ៕"),
     ((132, Script::Arabic, Language::Ara), "اللُّغَةُ العربية هي أكثر اللغات السامية تحدثا، ومن أكثر اللغات انتشارا"),
     // long texts (~365 bytes)
     ((363, Script::Cj, Language::Cmn), "距今60万年-2万年的时间内，北京地区处于旧石器时代，在周口店发现了旧石器时代早期北京直立人、中期新洞人和晚期山顶洞人的典型遗址。北京地区在不晚于1万年前已经开始进入新石器时代。当时该地区人类定居生活固定化，逐渐从山洞中迁徙出来，到平原地区定居[12]。"),
@@ -24,6 +25,7 @@ static DATA_SET: &[((usize, Script, Language), &str)] = &[
     ((366, Script::Thai, Language::Tha), "เราจะทำตามสัญญาขอเวลาอีกไม่นานแล้วแผ่นดินที่งดงามจะคืนกลับมาเราจะทำอย่างซื่อตรงขอแค่เธอจงไว้ใจและศรัทธาแผ่นดินจะดีในไม่ช้า"),
     ((364, Script::Hangul, Language::Kor), "제30조 타인의 범죄행위로 인하여 생명·신체에 대한 피해를 받은 국민은 법률이 정하는 바에 의하여 국가로부터 구조를 받을 수 있다. ② 명령·규칙 또는 처분이 헌법이나 법률에 위반되는 여부가 재판의 전제가 된 경우에는 대법원은 이를 최종적으로 심사할 권한을 가진다."),
     ((364, Script::Greek, Language::Ell), "Η άνοιξη έχει μικρή διάρκεια, διότι ο μεν χειμώνας είναι όψιμος, το δε καλοκαίρι αρχίζει πρώιμα. Το φθινόπωρο είναι μακρύ και θερμό και πολλές φορές παρατείνεται στη νότια Ελλάδα και τα νησιά μέχρι τα"),
+    ((327, Script::Khmer, Language::Khm), "រឿងពីរដែលមនុស្សហាមចិត្តខ្លួនឯងមិនបានគឺ សើច និង ស្រឡាញ់។ តែសម្រាប់ខ្ញុំ ប្រាក់ ចន្ទធីតា រឿងមួយទៀតដែលខ្ញុំហាមចិត្តខ្លួនឯងមិនបាននោះ គឺញ៉ាំ គេគ្រប់គ្នាពេលខូចចិត្តបាយទឹកមិនបានទេ តែខ្ញុំពេលខូចចិត្តដឹងតែឃ្លាន ញ៉ាំច្រើនឬតិចក៏អាស្រ័យលើថាទំហំនៃការខូចចិត្តខ្លាំងឬខ្សោយ។"),
     ((366, Script::Arabic, Language::Ara), "العربية لغةٌ رسمية في كل دول الوطن العربي (إضافة إلى كونها لغة رسمية في تشاد وإريتريا). وهي إحدى اللغات الرسمية الست في منظمة الأمم المتحدة، ويُحتفل بالعربية في 18 ديسمبر كذكرى اعتمادها في الأمم المتحدة."),
     ];
 

diff --git a/charabia/dictionaries/bin/icu4x-khmer-keys b/charabia/dictionaries/bin/icu4x-khmer-keys
diff --git a/charabia/src/segmenter/khmer.rs b/charabia/src/segmenter/khmer.rs
@@ -0,0 +1,91 @@
+use std::vec;
+
+use icu::segmenter::WordSegmenter;
+
+// Import `Segmenter` trait.
+use crate::segmenter::Segmenter;
+
+extern crate alloc; // required as my-data-mod is written for #[no_std]
+use icu_provider_blob::BlobDataProvider;
+//TIP: Some segmentation libraries need to initialize an instance of the Segmenter.
+//     This initialization could be time-consuming and shouldn't be done at each call of `segment_str`.
+//     In this case, you may want to store the initialized instance in a lazy static like below and call it in `segment_str`.
+//     Otherwise, just remove below lines.
+//
+// Put this import at the top of the file.
+use once_cell::sync::Lazy;
+//
+/// Lazily-initialized ICU [`WordSegmenter`] built from the `icu4x-khmer-keys` blob that is
+/// embedded into the binary with `include_bytes!`.
+///
+/// Initialization panics with a descriptive message if the blob cannot be loaded or the
+/// dictionary segmenter cannot be constructed from it.
+static SEGMENTER: Lazy<WordSegmenter> = Lazy::new(|| {
+    let khmer_data = BlobDataProvider::try_new_from_static_blob(include_bytes!(
+        "../../dictionaries/bin/icu4x-khmer-keys"
+    ))
+    .expect("failed to load khmer keys");
+
+    let dictionary_segmenter =
+        WordSegmenter::try_new_dictionary_with_buffer_provider(&khmer_data);
+    dictionary_segmenter.expect("failed to initialize khmer word segmenter")
+});
+
+// Make a small documentation of the specialized Segmenter like below.
+/// Khmer specialized [`Segmenter`].
+///
+/// This Segmenter uses [`WordSegmenter`] (from the `icu` crate) internally to segment the provided text.
+/// The segmenter data is embedded from `dictionaries/bin/icu4x-khmer-keys` and loaded lazily on first use.
+//
+//TIP: Name the Segmenter with its purpose and not its internal behavior:
+//     prefer JapaneseSegmenter (based on the Language) instead of LinderaSegmenter (based on the used Library).
+//     Same for the filename, prefer `japanese.rs` instead of `lindera.rs`.
+pub struct KhmerSegmenter;
+
+// All specialized segmenters only need to implement the method `segment_str` of the `Segmenter` trait.
+impl Segmenter for KhmerSegmenter {
+    /// Splits `to_segment` into consecutive sub-slices delimited by the breakpoints
+    /// reported by the shared ICU word segmenter.
+    ///
+    /// Each pair of adjacent breakpoints `(start, end)` yields `&to_segment[start..end]`;
+    /// when fewer than two breakpoints are reported, the returned iterator is empty.
+    /// The segments are computed eagerly and handed back as a boxed iterator.
+    fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
+        let mut breakpoints = SEGMENTER.segment_str(to_segment);
+        let mut segments = vec![];
+
+        // The first breakpoint only opens the first segment; every following one closes
+        // the current segment and opens the next.
+        if let Some(mut start) = breakpoints.next() {
+            for end in breakpoints {
+                segments.push(&to_segment[start..end]);
+                start = end;
+            }
+        }
+
+        // Return the created iterator wrapping it in a Box.
+        Box::new(segments.into_iter())
+    }
+}
+
+// Publish the newly implemented Segmenter:
+//     - import module by adding `mod khmer;` (filename) in `segmenter/mod.rs`
+//     - publish Segmenter by adding `pub use khmer::KhmerSegmenter;` in `segmenter/mod.rs`
+//     - running `cargo doc --open` you should see your Segmenter in the segmenter module
+
+// Test the segmenter:
+#[cfg(test)]
+mod test {
+    use crate::segmenter::test::test_segmenter;
+
+    // Unsegmented Khmer input fed to the segmenter.
+    const TEXT: &str = "សួស្តីពិភពលោក";
+
+    // Expected segments of `TEXT`, in order.
+    const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក"];
+
+    // Expected segmented and normalized tokens; identical to `SEGMENTED` for this sample.
+    const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក"];
+
+    // Shared macro that generates the segmenter tests for the Khmer script/language pair.
+    test_segmenter!(KhmerSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Khmer, Language::Khm);
+}
+
+// Include the newly implemented Segmenter in the tokenization pipeline:
+//     - assign Segmenter to a Script and a Language by adding it in `SEGMENTERS` in `segmenter/mod.rs`
+//     - check that it didn't break any test or benchmark
+
+// Your Segmenter will now be used on texts of the assigned Script and Language. Thank you for your contribution, and congratulations! 🎉
diff --git a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs
@@ -16,6 +16,9 @@ use slice_group_by::StrGroupBy;
 #[cfg(feature = "thai")]
 pub use thai::ThaiSegmenter;
 
+#[cfg(feature = "khmer")]
+pub use khmer::KhmerSegmenter;
+
 use crate::detection::{Detect, Language, Script, StrDetection};
 use crate::separators::DEFAULT_SEPARATORS;
 use crate::token::Token;
@@ -25,6 +28,8 @@ mod arabic;
 mod chinese;
 #[cfg(feature = "japanese")]
 mod japanese;
+#[cfg(feature = "khmer")]
+mod khmer;
 #[cfg(feature = "korean")]
 mod korean;
 mod latin;
@@ -59,6 +64,8 @@ pub static SEGMENTERS: Lazy<HashMap<(Script, Language), Box<dyn Segmenter>>> = L
         // thai segmenter
         #[cfg(feature = "thai")]
         ((Script::Thai, Language::Tha), Box::new(ThaiSegmenter) as Box<dyn Segmenter>),
+        #[cfg(feature = "khmer")]
+        ((Script::Khmer, Language::Khm), Box::new(KhmerSegmenter) as Box<dyn Segmenter>),
         // arabic segmenter
         ((Script::Arabic, Language::Ara), Box::new(ArabicSegmenter) as Box<dyn Segmenter>),
     ]