From a9fb9680bf4364c79cfb8dd3603fac8c120d90fc Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Sat, 17 Jun 2023 21:51:01 +0900 Subject: [PATCH 1/2] Add UniDic implementation --- charabia/Cargo.toml | 10 +++--- charabia/src/segmenter/japanese.rs | 54 +++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index daf21558..2aeeae25 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -23,9 +23,9 @@ serde = "1.0" slice-group-by = "0.3.0" unicode-segmentation = "1.10.1" whatlang = "0.16.2" -lindera-core = "=0.24.0" -lindera-dictionary = "=0.24.0" -lindera-tokenizer = { version = "=0.24.0", default-features = false, optional = true } +lindera-core = "=0.25.0" +lindera-dictionary = "=0.25.0" +lindera-tokenizer = { version = "=0.25.0", default-features = false, optional = true } pinyin = { version = "0.9", default-features = false, features = [ "with_tone", ], optional = true } @@ -43,7 +43,9 @@ chinese = ["dep:pinyin", "dep:jieba-rs"] hebrew = [] # allow japanese specialized tokenization -japanese = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"] +japanese = ["japanese-segmentation-unidic"] +japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"] +japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"] japanese-transliteration = ["dep:wana_kana"] # allow korean specialized tokenization diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index 044d6a33..1e9c42fe 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -1,4 +1,6 @@ -use lindera_core::mode::{Mode, Penalty}; +use lindera_core::mode::Mode; +#[cfg(feature = "japanese-segmentation-ipadic")] +use lindera_core::mode::Penalty; use lindera_dictionary::{DictionaryConfig, DictionaryKind}; use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig}; use 
once_cell::sync::Lazy; @@ -11,11 +13,18 @@ use crate::segmenter::Segmenter; pub struct JapaneseSegmenter; static LINDERA: Lazy = Lazy::new(|| { + #[cfg(feature = "japanese-segmentation-ipadic")] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None }, mode: Mode::Decompose(Penalty::default()), ..TokenizerConfig::default() }; + #[cfg(feature = "japanese-segmentation-unidic")] + let config = TokenizerConfig { + dictionary: DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None }, + mode: Mode::Normal, + ..TokenizerConfig::default() + }; Tokenizer::from_config(config).unwrap() }); @@ -32,6 +41,7 @@ mod test { const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち"; + #[cfg(feature = "japanese-segmentation-ipadic")] const SEGMENTED: &[&str] = &[ "関西", "国際", @@ -47,7 +57,25 @@ mod test { "の", "うち", ]; + #[cfg(feature = "japanese-segmentation-unidic")] + const SEGMENTED: &[&str] = &[ + "関西", + "国際", + "空港", + "限定", + "トート", + "バッグ", + " ", + "すもも", + "も", + "もも", + "も", + "もも", + "の", + "うち", + ]; + #[cfg(feature = "japanese-segmentation-ipadic")] const TOKENIZED: &[&str] = &[ "関西", "国際", @@ -67,6 +95,30 @@ mod test { "の", "うち", ]; + #[cfg(feature = "japanese-segmentation-unidic")] + const TOKENIZED: &[&str] = &[ + "関西", + "国際", + "空港", + "限定", + // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default + #[cfg(feature = "japanese-transliteration")] + "とうと", + #[cfg(not(feature = "japanese-transliteration"))] + "トート", + #[cfg(feature = "japanese-transliteration")] + "は\u{3099}っく\u{3099}", + #[cfg(not(feature = "japanese-transliteration"))] + "ハ\u{3099}ック\u{3099}", + " ", + "すもも", + "も", + "もも", + "も", + "もも", + "の", + "うち", + ]; // Macro that run several tests on the Segmenter. 
test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn); From 2b10d0cb86e0336f222849499137e80da8051930 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Mon, 19 Jun 2023 21:03:34 +0900 Subject: [PATCH 2/2] Add a compiling error in case IPADIC and UniDic are both activated --- charabia/src/segmenter/japanese.rs | 166 ++++++++++++++++------------- 1 file changed, 89 insertions(+), 77 deletions(-) diff --git a/charabia/src/segmenter/japanese.rs b/charabia/src/segmenter/japanese.rs index 1e9c42fe..da256718 100644 --- a/charabia/src/segmenter/japanese.rs +++ b/charabia/src/segmenter/japanese.rs @@ -13,6 +13,9 @@ use crate::segmenter::Segmenter; pub struct JapaneseSegmenter; static LINDERA: Lazy = Lazy::new(|| { + #[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))] + compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together"); + #[cfg(feature = "japanese-segmentation-ipadic")] let config = TokenizerConfig { dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None }, @@ -41,84 +44,93 @@ mod test { const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち"; - #[cfg(feature = "japanese-segmentation-ipadic")] - const SEGMENTED: &[&str] = &[ - "関西", - "国際", - "空港", - "限定", - "トートバッグ", - " ", - "すもも", - "も", - "もも", - "も", - "もも", - "の", - "うち", - ]; - #[cfg(feature = "japanese-segmentation-unidic")] - const SEGMENTED: &[&str] = &[ - "関西", - "国際", - "空港", - "限定", - "トート", - "バッグ", - " ", - "すもも", - "も", - "もも", - "も", - "もも", - "の", - "うち", - ]; + const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") { + &[ + "関西", + "国際", + "空港", + "限定", + "トートバッグ", + " ", + "すもも", + "も", + "もも", + "も", + "もも", + "の", + "うち", + ] + } else if cfg!(feature = "japanese-segmentation-unidic") { + &[ + "関西", + "国際", + "空港", + "限定", + "トート", + "バッグ", + " ", + "すもも", + "も", + "もも", + "も", + "もも", + "の", + 
"うち", + ] + } else { + &[] + }; - #[cfg(feature = "japanese-segmentation-ipadic")] - const TOKENIZED: &[&str] = &[ - "関西", - "国際", - "空港", - "限定", - // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default - #[cfg(feature = "japanese-transliteration")] - "とうとは\u{3099}っく\u{3099}", - #[cfg(not(feature = "japanese-transliteration"))] - "トートハ\u{3099}ック\u{3099}", - " ", - "すもも", - "も", - "もも", - "も", - "もも", - "の", - "うち", - ]; - #[cfg(feature = "japanese-segmentation-unidic")] - const TOKENIZED: &[&str] = &[ - "関西", - "国際", - "空港", - "限定", - // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default - #[cfg(feature = "japanese-transliteration")] - "とうと", - #[cfg(not(feature = "japanese-transliteration"))] - "トート", - #[cfg(feature = "japanese-transliteration")] - "は\u{3099}っく\u{3099}", - #[cfg(not(feature = "japanese-transliteration"))] - "ハ\u{3099}ック\u{3099}", - " ", - "すもも", - "も", - "もも", - "も", - "もも", - "の", - "うち", - ]; + const TOKENIZED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") { + &[ + "関西", + "国際", + "空港", + "限定", + // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default + #[cfg(feature = "japanese-transliteration")] + "とうとは\u{3099}っく\u{3099}", + #[cfg(not(feature = "japanese-transliteration"))] + "トートハ\u{3099}ック\u{3099}", + " ", + "すもも", + "も", + "もも", + "も", + "もも", + "の", + "うち", + ] + } else if cfg!(feature = "japanese-segmentation-unidic") { + &[ + "関西", + "国際", + "空港", + "限定", + // Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default + #[cfg(feature = "japanese-transliteration")] + "とうと", + #[cfg(not(feature = "japanese-transliteration"))] + "トート", + #[cfg(feature = "japanese-transliteration")] + "は\u{3099}っく\u{3099}", + #[cfg(not(feature = "japanese-transliteration"))] + "ハ\u{3099}ック\u{3099}", + " ", + "すもも", + "も", + "もも", + "も", + "もも", + "の", + "うち", + ] + } else { + &[] + }; + + 
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))] + compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together"); // Macro that run several tests on the Segmenter. test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn);