-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
203: add khmer segmenter r=ManyTheFish a=xshadowlegendx # Pull Request ## Related issue Fixes #200 ## What does this PR do? - add segmenter for `khmer` language ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: xshadowlegendx <[email protected]>
- Loading branch information
Showing
5 changed files
with
108 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
use std::vec; | ||
|
||
use icu::segmenter::WordSegmenter; | ||
|
||
// Import `Segmenter` trait. | ||
use crate::segmenter::Segmenter; | ||
|
||
extern crate alloc; // required as my-data-mod is written for #[no_std] | ||
use icu_provider_blob::BlobDataProvider; | ||
//TIP: Some segmentation Libraries need to initialize a instance of the Segmenter. | ||
// This initialization could be time-consuming and shouldn't be done at each call of `segment_str`. | ||
// In this case, you may want to store the initialized instance in a lazy static like below and call it in `segment_str`. | ||
// Otherwise, just remove below lines. | ||
// | ||
// Put this import at the top of the file. | ||
use once_cell::sync::Lazy; | ||
// | ||
static SEGMENTER: Lazy<WordSegmenter> = Lazy::new(|| { | ||
let blob = include_bytes!("../../dictionaries/bin/icu4x-khmer-keys"); | ||
|
||
let buffer_provider: BlobDataProvider = | ||
BlobDataProvider::try_new_from_static_blob(blob).expect("failed to load khmer keys"); | ||
|
||
WordSegmenter::try_new_dictionary_with_buffer_provider(&buffer_provider) | ||
.expect("failed to initialize khmer word segmenter") | ||
}); | ||
|
||
// Make a small documentation of the specialized Segmenter like below. | ||
/// <Script/Language> specialized [`Segmenter`]. | ||
/// | ||
/// This Segmenter uses [`<UsedLibraryToSegment>`] internally to segment the provided text. | ||
/// <OptionalAdditionnalExplanations> | ||
// | ||
//TIP: Name the Segmenter with its purpose and not its internal behavior: | ||
// prefer JapaneseSegmenter (based on the Language) instead of LinderaSegmenter (based on the used Library). | ||
// Same for the filename, prefer `japanese.rs` instead of `lindera.rs`. | ||
pub struct KhmerSegmenter; | ||
|
||
// All specialized segmenters only need to implement the method `segment_str` of the `Segmenter` trait. | ||
impl Segmenter for KhmerSegmenter { | ||
fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> { | ||
let (_, positions) = | ||
SEGMENTER.segment_str(to_segment).fold((None, vec![]), |mut acc, elem| { | ||
if acc.0.is_some() { | ||
acc.1.push((acc.0.unwrap(), elem)); | ||
} | ||
|
||
acc.0 = Some(elem); | ||
|
||
acc | ||
}); | ||
|
||
// Return the created iterator wrapping it in a Box. | ||
Box::new( | ||
positions | ||
.iter() | ||
.map(|(start, end)| &to_segment[*start..*end]) | ||
.collect::<Vec<&str>>() | ||
.into_iter(), | ||
) | ||
} | ||
} | ||
|
||
// Publish the newly implemented Segmenter: | ||
// - import module by adding `mod dummy;` (filename) in `segmenter/mod.rs` | ||
// - publish Segmenter by adding `pub use dummy::KhmerSegmenter;` in `segmenter/mod.rs` | ||
// - running `cargo doc --open` you should see your Segmenter in the segmenter module | ||
|
||
// Test the segmenter: | ||
#[cfg(test)] | ||
mod test { | ||
use crate::segmenter::test::test_segmenter; | ||
|
||
// Original version of the text. | ||
const TEXT: &str = "សួស្តីពិភពលោក"; | ||
|
||
// Segmented version of the text. | ||
const SEGMENTED: &[&str] = &["សួស្តី", "ពិភពលោក"]; | ||
|
||
// Segmented and normalized version of the text. | ||
const TOKENIZED: &[&str] = &["សួស្តី", "ពិភពលោក"]; | ||
|
||
// Macro that run several tests on the Segmenter. | ||
test_segmenter!(KhmerSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Khmer, Language::Khm); | ||
} | ||
|
||
// Include the newly implemented Segmenter in the tokenization pipeline: | ||
// - assign Segmenter to a Script and a Language by adding it in `SEGMENTERS` in `segmenter/mod.rs` | ||
// - check if it didn't break any test or benhchmark | ||
|
||
// Your Segmenter will now be used on texts of the assigned Script and Language. Thank you for your contribution, and congratulation! 🎉 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters