Skip to content

Commit

Permalink
Add jieba-macros for HMM data generation and remove build.rs file (#117)
Browse files Browse the repository at this point in the history
* feat: add jieba-macros for HMM data generation and remove fs operation in build.rs

* fix: update CodSpeed action to v3 and improve error handling in HMM data generation

* refactor: simplify lifetime annotations
  • Loading branch information
sepcnt authored Dec 25, 2024
1 parent 7f3b273 commit 7216263
Show file tree
Hide file tree
Showing 8 changed files with 74 additions and 68 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
run: cargo codspeed build --features tfidf,textrank

- name: Run the benchmarks
uses: CodSpeedHQ/action@v2
uses: CodSpeedHQ/action@v3
with:
run: cargo codspeed run
token: ${{ secrets.CODSPEED_TOKEN }}
6 changes: 2 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ harness = false
required-features = ["tfidf", "textrank"]

[dependencies]
jieba-macros = { path = "./jieba-macros" }
cedarwood = "0.4"
derive_builder = { version = "0.20.0", optional = true }
fxhash = "0.2.1"
Expand All @@ -36,14 +37,11 @@ ordered-float = { version = "4.0", optional = true }
phf = "0.11"
regex = "1.0"

[build-dependencies]
phf_codegen = "0.11"

[features]
default = ["default-dict"]
default-dict = []
tfidf = ["dep:ordered-float", "dep:derive_builder"]
textrank = ["dep:ordered-float", "dep:derive_builder"]

[workspace]
members = [".", "capi", "examples/weicheng"]
members = [".", "capi", "jieba-macros", "examples/weicheng"]
54 changes: 0 additions & 54 deletions build.rs

This file was deleted.

10 changes: 10 additions & 0 deletions jieba-macros/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "jieba-macros"
version = "0.7.0"
edition = "2021"

[lib]
proc-macro = true

[dependencies]
phf_codegen = "0.11"
53 changes: 53 additions & 0 deletions jieba-macros/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
use proc_macro::TokenStream;

/// Expands to the static HMM probability tables parsed from `src/data/hmm.model`.
///
/// The model file is embedded into this crate at compile time with
/// `include_str!`; the macro's input tokens are ignored. Lines starting with
/// `#` are treated as comments. The remaining lines are consumed in order:
///
/// 1. One line of space-separated initial probabilities, emitted as
///    `INITIAL_PROBS: [f64; 4]`.
/// 2. The following run of non-comment lines (up to the next comment line),
///    each a row of space-separated transition probabilities, emitted as
///    `TRANS_PROBS: [[f64; 4]; 4]`.
/// 3. Every remaining non-comment line, each a comma-separated list of
///    `word:prob` pairs, emitted as `EMIT_PROB_<n>: phf::Map<&'static str, f64>`
///    where `<n>` is the zero-based position of the line.
///
/// Finally `EMIT_PROBS` is emitted as an array of references to
/// `EMIT_PROB_0` through `EMIT_PROB_3`.
///
/// # Panics
///
/// Panics (which surfaces as a compile error at the macro call site) when the
/// initial-probability line is missing, when an emission entry has no
/// `:prob` part, when fewer than four emission lines are present, or when the
/// generated source cannot be tokenized.
#[proc_macro]
pub fn generate_hmm_data(_input: TokenStream) -> TokenStream {
    // Number of emission tables that `EMIT_PROBS` refers to by name below.
    const EXPECTED_EMIT_TABLES: usize = 4;

    let hmm_data = include_str!("../../src/data/hmm.model");
    let mut output = String::new();
    let mut lines = hmm_data.lines().skip_while(|x| x.starts_with('#'));

    // Initial probabilities
    let init_probs = lines
        .next()
        .expect("Failed to read initial probabilities from hmm.model");

    // The emitted items are machine-generated source text, so clippy's style
    // lints are silenced on each of them.
    output.push_str("#[allow(clippy::style)]\n");
    output.push_str("pub static INITIAL_PROBS: [f64; 4] = [");
    output.push_str(&init_probs.replace(' ', ", "));
    output.push_str("];\n\n");

    // Transition probabilities
    output.push_str("#[allow(clippy::style)]\n");
    output.push_str("pub static TRANS_PROBS: [[f64; 4]; 4] = [");
    // `by_ref` keeps `lines` usable afterwards. Note that `take_while` also
    // consumes the comment line that terminates the transition section.
    for line in lines
        .by_ref()
        .skip_while(|x| x.starts_with('#'))
        .take_while(|x| !x.starts_with('#'))
    {
        output.push('[');
        output.push_str(&line.replace(' ', ", "));
        output.push_str("],\n");
    }
    output.push_str("];\n\n");

    // Emission probabilities
    let mut emit_tables = 0;
    for (i, line) in lines.filter(|x| !x.starts_with('#')).enumerate() {
        output.push_str("#[allow(clippy::style)]\n");
        output.push_str(&format!("pub static EMIT_PROB_{}: phf::Map<&'static str, f64> = ", i));

        let mut map = phf_codegen::Map::new();
        for word_prob in line.split(',') {
            let mut parts = word_prob.split(':');
            let word = parts
                .next()
                .expect("`str::split` always yields at least one item");
            let prob = parts.next().unwrap_or_else(|| {
                panic!(
                    "Malformed emission entry {:?} for state {} in hmm.model: expected `word:prob`",
                    word_prob, i
                )
            });
            map.entry(word, prob);
        }
        output.push_str(&map.build().to_string());
        output.push_str(";\n\n");
        emit_tables = i + 1;
    }

    // `EMIT_PROBS` below names EMIT_PROB_0..=3; fail here with a clear message
    // rather than with an unresolved-name error in the calling crate.
    assert!(
        emit_tables >= EXPECTED_EMIT_TABLES,
        "hmm.model provides {} emission probability line(s), expected at least {}",
        emit_tables,
        EXPECTED_EMIT_TABLES
    );

    output.push_str("#[allow(clippy::style)]\n");
    output.push_str("pub static EMIT_PROBS: [&'static phf::Map<&'static str, f64>; 4] = [&EMIT_PROB_0, &EMIT_PROB_1, &EMIT_PROB_2, &EMIT_PROB_3];\n\n");

    output
        .parse()
        .expect("Generated HMM data is not valid Rust source")
}
11 changes: 5 additions & 6 deletions src/hmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use lazy_static::lazy_static;
use regex::Regex;

use crate::SplitMatches;
use jieba_macros::generate_hmm_data;

lazy_static! {
static ref RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap();
Expand All @@ -12,8 +13,6 @@ lazy_static! {

pub const NUM_STATES: usize = 4;

pub type StateSet = [f64; NUM_STATES];

/// Result of hmm is a labeling of each Unicode Scalar Value in the input
/// string with Begin, Middle, End, or Single. These denote the proposed
/// segments. A segment is one of the following two patterns.
Expand All @@ -26,9 +25,9 @@ pub type StateSet = [f64; NUM_STATES];
/// to that state.
///
/// WARNING: The data file format for hmm.model comments imply one can
/// reassign the index values of each state at the top but `build.rs`
/// currently ignores the mapping. Do not reassign these indicies without
/// verifying how it interacts with `build.rs`. These indicies must also
/// reassign the index values of each state at the top but `jieba-macros`
/// currently ignores the mapping. Do not reassign these indices without
/// verifying how it interacts with `jieba-macros`. These indices must also
/// match the order of ALLOWED_PREV_STATUS.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Clone, Copy)]
pub enum State {
Expand All @@ -52,7 +51,7 @@ static ALLOWED_PREV_STATUS: [[State; 2]; NUM_STATES] = [
[State::Single, State::End],
];

include!(concat!(env!("OUT_DIR"), "/hmm_prob.rs"));
generate_hmm_data!();

const MIN_FLOAT: f64 = -3.14e100;

Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ impl<'t> SplitState<'t> {
}
}

impl<'r, 't> Iterator for SplitMatches<'r, 't> {
impl<'t> Iterator for SplitMatches<'_, 't> {
type Item = SplitState<'t>;

fn next(&mut self) -> Option<SplitState<'t>> {
Expand Down Expand Up @@ -768,7 +768,7 @@ impl Jieba {
/// `sentence`: input text
///
/// `hmm`: enable HMM or not
pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag> {
pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
let words = self.cut(sentence, hmm);
words
.into_iter()
Expand Down
2 changes: 1 addition & 1 deletion src/sparse_dag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pub struct EdgeIter<'a> {
cursor: usize,
}

impl<'a> Iterator for EdgeIter<'a> {
impl Iterator for EdgeIter<'_> {
type Item = usize;

fn size_hint(&self) -> (usize, Option<usize>) {
Expand Down

0 comments on commit 7216263

Please sign in to comment.