Skip to content

Commit

Permalink
Merge #218
Browse files Browse the repository at this point in the history
218: Add UniDic implementation r=ManyTheFish a=mosuka

# Pull Request

## What does this PR do?
- Add UniDic implementation to allow consistent tokenization for searching and indexing.
- Please see [discussion comment](https://github.com/meilisearch/product/discussions/532#discussioncomment-5895057)

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Minoru Osuka <[email protected]>
  • Loading branch information
meili-bors[bot] and mosuka authored Jun 20, 2023
2 parents 91e368a + 2b10d0c commit 366417d
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 39 deletions.
10 changes: 6 additions & 4 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ serde = "1.0"
slice-group-by = "0.3.0"
unicode-segmentation = "1.10.1"
whatlang = "0.16.2"
lindera-core = "=0.24.0"
lindera-dictionary = "=0.24.0"
lindera-tokenizer = { version = "=0.24.0", default-features = false, optional = true }
lindera-core = "=0.25.0"
lindera-dictionary = "=0.25.0"
lindera-tokenizer = { version = "=0.25.0", default-features = false, optional = true }
pinyin = { version = "0.9", default-features = false, features = [
"with_tone",
], optional = true }
Expand All @@ -43,7 +43,9 @@ chinese = ["dep:pinyin", "dep:jieba-rs"]
hebrew = []

# allow japanese specialized tokenization
japanese = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese = ["japanese-segmentation-unidic"]
japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
Expand Down
134 changes: 99 additions & 35 deletions charabia/src/segmenter/japanese.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use lindera_core::mode::{Mode, Penalty};
use lindera_core::mode::Mode;
#[cfg(feature = "japanese-segmentation-ipadic")]
use lindera_core::mode::Penalty;
use lindera_dictionary::{DictionaryConfig, DictionaryKind};
use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;
Expand All @@ -11,11 +13,21 @@ use crate::segmenter::Segmenter;
pub struct JapaneseSegmenter;

// Process-wide Lindera tokenizer, built lazily on first use.
// The dictionary (IPADIC vs. UniDic) and the tokenization mode are selected
// at compile time via cargo features; exactly one segmentation feature must
// be enabled.
static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
// Guard: both features define their own `config` below, so enabling both
// would be ambiguous — fail the build with an explicit message instead.
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");

// IPADIC dictionary, with compound-word decomposition (Mode::Decompose).
#[cfg(feature = "japanese-segmentation-ipadic")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None },
mode: Mode::Decompose(Penalty::default()),
..TokenizerConfig::default()
};
// UniDic dictionary, plain Mode::Normal (no decomposition penalty).
#[cfg(feature = "japanese-segmentation-unidic")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None },
mode: Mode::Normal,
..TokenizerConfig::default()
};
// NOTE(review): if neither segmentation feature is enabled, `config` is
// never defined and this fails to compile — presumably the `japanese`
// feature always pulls one in (Cargo.toml maps it to unidic); confirm.
Tokenizer::from_config(config).unwrap()
});

Expand All @@ -32,41 +44,93 @@ mod test {

const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";

// NOTE(review): this is the pre-change (removed) side of the diff — the
// unconditional IPADIC expectation superseded by the cfg!-gated SEGMENTED
// added in this commit.
// Expected segmentation of TEXT under the IPADIC dictionary.
const SEGMENTED: &[&str] = &[
"関西",
"国際",
"空港",
"限定",
"トートバッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
];
// Expected segmentation of TEXT, branching on the active dictionary feature.
// The two dictionaries split the compound katakana word differently:
// IPADIC keeps "トートバッグ" ("tote bag") as one segment, while UniDic
// splits it into "トート" + "バッグ". Falls back to an empty slice when
// neither feature is enabled (that configuration does not build anyway —
// see the compile_error! guard in this file).
const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
"関西",
"国際",
"空港",
"限定",
"トートバッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
"関西",
"国際",
"空港",
"限定",
// UniDic yields shorter units: the compound is split in two.
"トート",
"バッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else {
// Unreachable in practice: one segmentation feature is always required.
&[]
};

// Expected normalized tokens for TEXT, branching on the active dictionary
// feature, with a nested cfg on "japanese-transliteration" (katakana→hiragana
// conversion). The \u{3099} code points are combining voiced sound marks
// (dakuten) in NFD form.
const TOKENIZED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
"関西",
"国際",
"空港",
"限定",
// Use "とうとばっぐ" instead when the "japanese-transliteration" feature
// is enabled or becomes the default.
#[cfg(feature = "japanese-transliteration")]
"とうとは\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"トートハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
"関西",
"国際",
"空港",
"限定",
// Same tokens as the IPADIC branch, but the compound word is split in
// two by UniDic; each half has a transliterated and a raw variant.
#[cfg(feature = "japanese-transliteration")]
"とうと",
#[cfg(not(feature = "japanese-transliteration"))]
"トート",
#[cfg(feature = "japanese-transliteration")]
"は\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"ハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else {
// Unreachable in practice: one segmentation feature is always required.
&[]
};

// NOTE(review): this is the pre-change (removed) side of the diff — the
// unconditional IPADIC expectation superseded by the cfg!-gated TOKENIZED
// added in this commit.
// Expected normalized tokens of TEXT under the IPADIC dictionary.
const TOKENIZED: &[&str] = &[
"関西",
"国際",
"空港",
"限定",
// Use "とうとばっぐ" instead when the "japanese-transliteration" feature
// is enabled or becomes the default.
#[cfg(feature = "japanese-transliteration")]
"とうとは\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"トートハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
];
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");

// Macro that run several tests on the Segmenter.
test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn);
Expand Down

0 comments on commit 366417d

Please sign in to comment.