Skip to content

Commit

Permalink
Merge branch 'master' into ar
Browse files Browse the repository at this point in the history
  • Loading branch information
mattico authored May 21, 2022
2 parents 1103047 + c1e6cf7 commit 5399fde
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 11 deletions.
11 changes: 6 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
authors = ["Matt Ickstadt <[email protected]>"]
license = "MIT/Apache-2.0"
name = "elasticlunr-rs"
version = "2.3.13"
version = "2.3.14"
description = "A partial port of elasticlunr.js to Rust for generating static document search indexes"
documentation = "https://docs.rs/elasticlunr-rs"
repository = "https://github.com/mattico/elasticlunr-rs"
Expand All @@ -28,8 +28,9 @@ serde_derive = "1.0.34" # First version to support #[serde(flatten)]
serde_json = "1"
strum = "0.21"
strum_macros = "0.21"
jieba-rs = { version = "0.5.0", optional = true }
lindera = { version = "0.3.4", optional = true }
jieba-rs = { version = "0.6", optional = true }
lindera = { version = "0.8", optional = true }
lindera-core = { version = "0.8", optional = true }

[features]
default = ["languages"]
Expand All @@ -51,5 +52,5 @@ ru = ["rust-stemmers"]
sv = ["rust-stemmers"]
tr = ["rust-stemmers"]
zh = ["jieba-rs"]
ja = ["lindera"]
ar = []
ja = ["lindera", "lindera-core"]
ar = []
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ extern crate maplit;
extern crate jieba_rs;
#[cfg(feature = "ja")]
extern crate lindera;
#[cfg(feature = "ja")]
extern crate lindera_core;

/// The version of elasticlunr.js this library was designed for.
pub const ELASTICLUNR_VERSION: &str = "0.9.5";
Expand Down
16 changes: 10 additions & 6 deletions src/pipeline.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
//! Defines the pipeline which processes text for inclusion in the index. Most users do not need
//! to use this module directly.
#[cfg(feature = "zh")]
use jieba_rs::Jieba;
#[cfg(feature = "ja")]
use lindera::tokenizer::Tokenizer;
use serde::ser::{Serialize, SerializeSeq, Serializer};

/// Splits a text string into a vector of individual tokens.
Expand All @@ -17,7 +13,7 @@ pub fn tokenize(text: &str) -> Vec<String> {

#[cfg(feature = "zh")]
pub fn tokenize_chinese(text: &str) -> Vec<String> {
let jieba = Jieba::new();
let jieba = jieba_rs::Jieba::new();

jieba
.cut_for_search(text.as_ref(), false)
Expand All @@ -28,9 +24,17 @@ pub fn tokenize_chinese(text: &str) -> Vec<String> {

#[cfg(feature = "ja")]
pub fn tokenize_japanese(text: &str) -> Vec<String> {
let mut tokenizer = Tokenizer::new("decompose", "");
use lindera_core::viterbi::Mode;
use lindera::tokenizer::{Tokenizer, TokenizerConfig};
let config = TokenizerConfig {
mode: Mode::Decompose(Default::default()),
..Default::default()
};
// NB: unwrap() is okay since the errors are only related to user-supplied dictionaries.
let mut tokenizer = Tokenizer::with_config(config).unwrap();
tokenizer
.tokenize(text)
.unwrap()
.into_iter()
.filter_map(|tok| match tok.detail.get(0).map(|d| d.as_str()) {
Some("助詞") | Some("助動詞") | Some("記号") | Some("UNK") => None,
Expand Down

0 comments on commit 5399fde

Please sign in to comment.