Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UniDic implementation #218

Merged
merged 2 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ serde = "1.0"
slice-group-by = "0.3.0"
unicode-segmentation = "1.10.1"
whatlang = "0.16.2"
lindera-core = "=0.24.0"
lindera-dictionary = "=0.24.0"
lindera-tokenizer = { version = "=0.24.0", default-features = false, optional = true }
lindera-core = "=0.25.0"
lindera-dictionary = "=0.25.0"
lindera-tokenizer = { version = "=0.25.0", default-features = false, optional = true }
pinyin = { version = "0.9", default-features = false, features = [
"with_tone",
], optional = true }
Expand All @@ -43,7 +43,9 @@ chinese = ["dep:pinyin", "dep:jieba-rs"]
hebrew = []

# allow japanese specialized tokenization
japanese = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese = ["japanese-segmentation-unidic"]
japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
Expand Down
134 changes: 99 additions & 35 deletions charabia/src/segmenter/japanese.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use lindera_core::mode::{Mode, Penalty};
use lindera_core::mode::Mode;
#[cfg(feature = "japanese-segmentation-ipadic")]
use lindera_core::mode::Penalty;
use lindera_dictionary::{DictionaryConfig, DictionaryKind};
use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;
Expand All @@ -11,11 +13,21 @@ use crate::segmenter::Segmenter;
/// Japanese specialized [`Segmenter`], backed by the shared `LINDERA` tokenizer below.
pub struct JapaneseSegmenter;

// Lazily-initialized Lindera tokenizer shared by all Japanese segmentation calls.
// Exactly one of the `japanese-segmentation-ipadic` / `japanese-segmentation-unidic`
// features must be enabled: enabling both fires the `compile_error!` below, and
// enabling neither leaves `config` unbound (which also fails to compile).
static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");

// IPADIC dictionary: decompose compound words, using Lindera's default penalty.
#[cfg(feature = "japanese-segmentation-ipadic")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None },
mode: Mode::Decompose(Penalty::default()),
..TokenizerConfig::default()
};
// UniDic dictionary: plain `Normal` mode (no decomposition penalty is involved).
#[cfg(feature = "japanese-segmentation-unidic")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None },
mode: Mode::Normal,
..TokenizerConfig::default()
};
// Panics at first use if the embedded dictionary cannot be loaded; acceptable
// here because the dictionary is compiled in via the cargo features above.
Tokenizer::from_config(config).unwrap()
});

Expand All @@ -32,41 +44,93 @@ mod test {

// Sample input shared by all tests below: a compound noun phrase in kanji/katakana,
// an ideographic space, then a kana-only phrase (a well-known tongue-twister).
const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";

const SEGMENTED: &[&str] = &[
"関西",
"国際",
"空港",
"限定",
"トートバッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
];
// Expected segmentation of `TEXT`, depending on the compiled-in dictionary.
// The two dictionaries disagree on one word only: IPADIC keeps "トートバッグ"
// as a single token, while UniDic splits it into "トート" + "バッグ".
const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
"関西",
"国際",
"空港",
"限定",
"トートバッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
"関西",
"国際",
"空港",
"限定",
"トート",
"バッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else {
// Neither segmentation feature enabled: nothing to check.
&[]
};

// Expected normalized tokens for `TEXT`. Entries vary on two axes: the
// dictionary feature (token boundaries, as in `SEGMENTED`) and the
// `japanese-transliteration` feature (katakana rendered as hiragana, with
// voiced-sound marks kept as the combining character U+3099).
const TOKENIZED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
"関西",
"国際",
"空港",
"限定",
// Use "とうとばっぐ" instead when the "japanese-transliteration" feature is enabled or becomes the default
#[cfg(feature = "japanese-transliteration")]
"とうとは\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"トートハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
"関西",
"国際",
"空港",
"限定",
// Use "とうとばっぐ" instead when the "japanese-transliteration" feature is enabled or becomes the default
#[cfg(feature = "japanese-transliteration")]
"とうと",
#[cfg(not(feature = "japanese-transliteration"))]
"トート",
#[cfg(feature = "japanese-transliteration")]
"は\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"ハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else {
// Neither segmentation feature enabled: nothing to check.
&[]
};

const TOKENIZED: &[&str] = &[
"関西",
"国際",
"空港",
"限定",
// Use "とうとばっぐ" instead when feature "japanese-transliteration" is enabled or become default
#[cfg(feature = "japanese-transliteration")]
"とうとは\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"トートハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
];
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");

// Macro that runs several tests on the Segmenter using the fixtures above.
test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn);
Expand Down