From a9fb9680bf4364c79cfb8dd3603fac8c120d90fc Mon Sep 17 00:00:00 2001
From: Minoru Osuka <minoru.osuka@gmail.com>
Date: Sat, 17 Jun 2023 21:51:01 +0900
Subject: [PATCH 1/2] Add UniDic implementation

---
 charabia/Cargo.toml                | 10 +++---
 charabia/src/segmenter/japanese.rs | 54 +++++++++++++++++++++++++++++-
 2 files changed, 59 insertions(+), 5 deletions(-)
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index daf21558..2aeeae25 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -23,9 +23,9 @@ serde = "1.0"
 slice-group-by = "0.3.0"
 unicode-segmentation = "1.10.1"
 whatlang = "0.16.2"
-lindera-core = "=0.24.0"
-lindera-dictionary = "=0.24.0"
-lindera-tokenizer = { version = "=0.24.0", default-features = false, optional = true }
+lindera-core = "=0.25.0"
+lindera-dictionary = "=0.25.0"
+lindera-tokenizer = { version = "=0.25.0", default-features = false, optional = true }
 pinyin = { version = "0.9", default-features = false, features = [
   "with_tone",
 ], optional = true }
@@ -43,7 +43,9 @@ chinese = ["dep:pinyin", "dep:jieba-rs"]
 hebrew = []
 
 # allow japanese specialized tokenization
-japanese = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
+japanese = ["japanese-segmentation-unidic"]
+japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
+japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
 japanese-transliteration = ["dep:wana_kana"]
 
 # allow korean specialized tokenization
diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs
index 044d6a33..1e9c42fe 100644
--- a/charabia/src/segmenter/japanese.rs
+++ b/charabia/src/segmenter/japanese.rs
@@ -1,4 +1,6 @@
-use lindera_core::mode::{Mode, Penalty};
+use lindera_core::mode::Mode;
+#[cfg(feature = "japanese-segmentation-ipadic")]
+use lindera_core::mode::Penalty;
 use lindera_dictionary::{DictionaryConfig, DictionaryKind};
 use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
 use once_cell::sync::Lazy;
@@ -11,11 +13,18 @@ use crate::segmenter::Segmenter;
 pub struct JapaneseSegmenter;
 
 static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
+    #[cfg(feature = "japanese-segmentation-ipadic")]
     let config = TokenizerConfig {
         dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None },
         mode: Mode::Decompose(Penalty::default()),
         ..TokenizerConfig::default()
     };
+    #[cfg(feature = "japanese-segmentation-unidic")]
+    let config = TokenizerConfig {
+        dictionary: DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None },
+        mode: Mode::Normal,
+        ..TokenizerConfig::default()
+    };
     Tokenizer::from_config(config).unwrap()
 });
 
@@ -32,6 +41,7 @@ mod test {
 
     const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";
 
+    #[cfg(feature = "japanese-segmentation-ipadic")]
     const SEGMENTED: &[&str] = &[
         "関西",
         "国際",
@@ -47,7 +57,25 @@ mod test {
         "の",
         "うち",
     ];
+    #[cfg(feature = "japanese-segmentation-unidic")]
+    const SEGMENTED: &[&str] = &[
+        "関西",
+        "国際",
+        "空港",
+        "限定",
+        "トート",
+        "バッグ",
+        " ",
+        "すもも",
+        "も",
+        "もも",
+        "も",
+        "もも",
+        "の",
+        "うち",
+    ];
 
+    #[cfg(feature = "japanese-segmentation-ipadic")]
     const TOKENIZED: &[&str] = &[
         "関西",
         "国際",
@@ -67,6 +95,30 @@ mod test {
         "の",
         "うち",
     ];
+    #[cfg(feature = "japanese-segmentation-unidic")]
+    const TOKENIZED: &[&str] = &[
+        "関西",
+        "国際",
+        "空港",
+        "限定",
+        // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
+        #[cfg(feature = "japanese-transliteration")]
+        "とうと",
+        #[cfg(not(feature = "japanese-transliteration"))]
+        "トート",
+        #[cfg(feature = "japanese-transliteration")]
+        "は\u{3099}っく\u{3099}",
+        #[cfg(not(feature = "japanese-transliteration"))]
+        "ハ\u{3099}ック\u{3099}",
+        " ",
+        "すもも",
+        "も",
+        "もも",
+        "も",
+        "もも",
+        "の",
+        "うち",
+    ];
 
     // Macro that run several tests on the Segmenter.
     test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn);

From 2b10d0cb86e0336f222849499137e80da8051930 Mon Sep 17 00:00:00 2001
From: Minoru Osuka <minoru.osuka@gmail.com>
Date: Mon, 19 Jun 2023 21:03:34 +0900
Subject: [PATCH 2/2] Add a compile error when IPADIC and UniDic are both
 activated

---
 charabia/src/segmenter/japanese.rs | 166 ++++++++++++++++-------------
 1 file changed, 89 insertions(+), 77 deletions(-)

diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs
index 1e9c42fe..da256718 100644
--- a/charabia/src/segmenter/japanese.rs
+++ b/charabia/src/segmenter/japanese.rs
@@ -13,6 +13,9 @@ use crate::segmenter::Segmenter;
 pub struct JapaneseSegmenter;
 
 static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
+    #[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
+    compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");
+
     #[cfg(feature = "japanese-segmentation-ipadic")]
     let config = TokenizerConfig {
         dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None },
@@ -41,84 +44,93 @@ mod test {
 
     const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";
 
-    #[cfg(feature = "japanese-segmentation-ipadic")]
-    const SEGMENTED: &[&str] = &[
-        "関西",
-        "国際",
-        "空港",
-        "限定",
-        "トートバッグ",
-        " ",
-        "すもも",
-        "も",
-        "もも",
-        "も",
-        "もも",
-        "の",
-        "うち",
-    ];
-    #[cfg(feature = "japanese-segmentation-unidic")]
-    const SEGMENTED: &[&str] = &[
-        "関西",
-        "国際",
-        "空港",
-        "限定",
-        "トート",
-        "バッグ",
-        " ",
-        "すもも",
-        "も",
-        "もも",
-        "も",
-        "もも",
-        "の",
-        "うち",
-    ];
+    const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            "トートバッグ",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else if cfg!(feature = "japanese-segmentation-unidic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            "トート",
+            "バッグ",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else {
+        &[]
+    };
 
-    #[cfg(feature = "japanese-segmentation-ipadic")]
-    const TOKENIZED: &[&str] = &[
-        "関西",
-        "国際",
-        "空港",
-        "限定",
-        // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
-        #[cfg(feature = "japanese-transliteration")]
-        "とうとは\u{3099}っく\u{3099}",
-        #[cfg(not(feature = "japanese-transliteration"))]
-        "トートハ\u{3099}ック\u{3099}",
-        " ",
-        "すもも",
-        "も",
-        "もも",
-        "も",
-        "もも",
-        "の",
-        "うち",
-    ];
-    #[cfg(feature = "japanese-segmentation-unidic")]
-    const TOKENIZED: &[&str] = &[
-        "関西",
-        "国際",
-        "空港",
-        "限定",
-        // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
-        #[cfg(feature = "japanese-transliteration")]
-        "とうと",
-        #[cfg(not(feature = "japanese-transliteration"))]
-        "トート",
-        #[cfg(feature = "japanese-transliteration")]
-        "は\u{3099}っく\u{3099}",
-        #[cfg(not(feature = "japanese-transliteration"))]
-        "ハ\u{3099}ック\u{3099}",
-        " ",
-        "すもも",
-        "も",
-        "もも",
-        "も",
-        "もも",
-        "の",
-        "うち",
-    ];
+    const TOKENIZED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
+            #[cfg(feature = "japanese-transliteration")]
+            "とうとは\u{3099}っく\u{3099}",
+            #[cfg(not(feature = "japanese-transliteration"))]
+            "トートハ\u{3099}ック\u{3099}",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else if cfg!(feature = "japanese-segmentation-unidic") {
+        &[
+            "関西",
+            "国際",
+            "空港",
+            "限定",
+            // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
+            #[cfg(feature = "japanese-transliteration")]
+            "とうと",
+            #[cfg(not(feature = "japanese-transliteration"))]
+            "トート",
+            #[cfg(feature = "japanese-transliteration")]
+            "は\u{3099}っく\u{3099}",
+            #[cfg(not(feature = "japanese-transliteration"))]
+            "ハ\u{3099}ック\u{3099}",
+            " ",
+            "すもも",
+            "も",
+            "もも",
+            "も",
+            "もも",
+            "の",
+            "うち",
+        ]
+    } else {
+        &[]
+    };
+
+    #[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
+    compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");
 
     // Macro that run several tests on the Segmenter.
     test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn);