Skip to content

Commit

Permalink
Merge #218
Browse files Browse the repository at this point in the history
218: Add UniDic implementation r=ManyTheFish a=mosuka

# Pull Request

## What does this PR do?
- Add UniDic implementation to allow consistent tokenization for searching and indexing.
- Please see [discussion comment](https://github.com/meilisearch/product/discussions/532#discussioncomment-5895057)

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Minoru Osuka <[email protected]>
  • Loading branch information
meili-bors[bot] and mosuka authored Jun 20, 2023
2 parents 91e368a + 2b10d0c commit 366417d
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 39 deletions.
10 changes: 6 additions & 4 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ serde = "1.0"
slice-group-by = "0.3.0"
unicode-segmentation = "1.10.1"
whatlang = "0.16.2"
lindera-core = "=0.24.0"
lindera-dictionary = "=0.24.0"
lindera-tokenizer = { version = "=0.24.0", default-features = false, optional = true }
lindera-core = "=0.25.0"
lindera-dictionary = "=0.25.0"
lindera-tokenizer = { version = "=0.25.0", default-features = false, optional = true }
pinyin = { version = "0.9", default-features = false, features = [
"with_tone",
], optional = true }
Expand All @@ -43,7 +43,9 @@ chinese = ["dep:pinyin", "dep:jieba-rs"]
hebrew = []

# allow japanese specialized tokenization
japanese = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese = ["japanese-segmentation-unidic"]
japanese-segmentation-ipadic = ["lindera-tokenizer/ipadic", "lindera-tokenizer/ipadic-compress"]
japanese-segmentation-unidic = ["lindera-tokenizer/unidic", "lindera-tokenizer/unidic-compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
Expand Down
134 changes: 99 additions & 35 deletions charabia/src/segmenter/japanese.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use lindera_core::mode::{Mode, Penalty};
use lindera_core::mode::Mode;
#[cfg(feature = "japanese-segmentation-ipadic")]
use lindera_core::mode::Penalty;
use lindera_dictionary::{DictionaryConfig, DictionaryKind};
use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig};
use once_cell::sync::Lazy;
Expand All @@ -11,11 +13,21 @@ use crate::segmenter::Segmenter;
pub struct JapaneseSegmenter;

// Process-wide Lindera tokenizer, built lazily on first use.
// The dictionary (IPADIC vs. UniDic) and the tokenization mode are selected
// at compile time via cargo features; exactly one segmentation feature must
// be enabled.
static LINDERA: Lazy<Tokenizer> = Lazy::new(|| {
// Guard: both features define their own `config` below, so enabling both
// would be ambiguous — fail the build with an explicit message instead.
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");

// IPADIC dictionary, with compound-word decomposition (Mode::Decompose).
#[cfg(feature = "japanese-segmentation-ipadic")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: Some(DictionaryKind::IPADIC), path: None },
mode: Mode::Decompose(Penalty::default()),
..TokenizerConfig::default()
};
// UniDic dictionary, plain Mode::Normal (no decomposition penalty).
#[cfg(feature = "japanese-segmentation-unidic")]
let config = TokenizerConfig {
dictionary: DictionaryConfig { kind: Some(DictionaryKind::UniDic), path: None },
mode: Mode::Normal,
..TokenizerConfig::default()
};
// NOTE(review): if neither segmentation feature is enabled, `config` is
// never defined and this fails to compile — presumably the `japanese`
// feature always pulls one in (Cargo.toml maps it to unidic); confirm.
Tokenizer::from_config(config).unwrap()
});

Expand All @@ -32,41 +44,93 @@ mod test {

const TEXT: &str = "関西国際空港限定トートバッグ すもももももももものうち";

// NOTE(review): this is the pre-change (removed) side of the diff — the
// unconditional IPADIC expectation superseded by the cfg!-gated SEGMENTED
// added in this commit.
// Expected segmentation of TEXT under the IPADIC dictionary.
const SEGMENTED: &[&str] = &[
"関西",
"国際",
"空港",
"限定",
"トートバッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
];
// Expected segmentation of TEXT, branching on the active dictionary feature.
// The two dictionaries split the compound katakana word differently:
// IPADIC keeps "トートバッグ" ("tote bag") as one segment, while UniDic
// splits it into "トート" + "バッグ". Falls back to an empty slice when
// neither feature is enabled (that configuration does not build anyway —
// see the compile_error! guard in this file).
const SEGMENTED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
"関西",
"国際",
"空港",
"限定",
"トートバッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
"関西",
"国際",
"空港",
"限定",
// UniDic yields shorter units: the compound is split in two.
"トート",
"バッグ",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else {
// Unreachable in practice: one segmentation feature is always required.
&[]
};

// Expected normalized tokens for TEXT, branching on the active dictionary
// feature, with a nested cfg on "japanese-transliteration" (katakana→hiragana
// conversion). The \u{3099} code points are combining voiced sound marks
// (dakuten) in NFD form.
const TOKENIZED: &[&str] = if cfg!(feature = "japanese-segmentation-ipadic") {
&[
"関西",
"国際",
"空港",
"限定",
// Use "とうとばっぐ" instead when the "japanese-transliteration" feature
// is enabled or becomes the default.
#[cfg(feature = "japanese-transliteration")]
"とうとは\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"トートハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else if cfg!(feature = "japanese-segmentation-unidic") {
&[
"関西",
"国際",
"空港",
"限定",
// Same tokens as the IPADIC branch, but the compound word is split in
// two by UniDic; each half has a transliterated and a raw variant.
#[cfg(feature = "japanese-transliteration")]
"とうと",
#[cfg(not(feature = "japanese-transliteration"))]
"トート",
#[cfg(feature = "japanese-transliteration")]
"は\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"ハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
]
} else {
// Unreachable in practice: one segmentation feature is always required.
&[]
};

// NOTE(review): this is the pre-change (removed) side of the diff — the
// unconditional IPADIC expectation superseded by the cfg!-gated TOKENIZED
// added in this commit.
// Expected normalized tokens of TEXT under the IPADIC dictionary.
const TOKENIZED: &[&str] = &[
"関西",
"国際",
"空港",
"限定",
// Use "とうとばっぐ" instead when the "japanese-transliteration" feature
// is enabled or becomes the default.
#[cfg(feature = "japanese-transliteration")]
"とうとは\u{3099}っく\u{3099}",
#[cfg(not(feature = "japanese-transliteration"))]
"トートハ\u{3099}ック\u{3099}",
" ",
"すもも",
"も",
"もも",
"も",
"もも",
"の",
"うち",
];
#[cfg(all(feature = "japanese-segmentation-ipadic", feature = "japanese-segmentation-unidic"))]
compile_error!("Feature japanese-segmentation-ipadic and japanese-segmentation-unidic are mutually exclusive and cannot be enabled together");

// Macro that run several tests on the Segmenter.
test_segmenter!(JapaneseSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Cj, Language::Jpn);
Expand Down

0 comments on commit 366417d

Please sign in to comment.