Commit 0bccf7b (1 parent: c983b9f), showing 4 changed files with 168 additions and 1 deletion.
@@ -0,0 +1,134 @@
use std::borrow::Cow;

use aho_corasick::AhoCorasick;
use once_cell::sync::Lazy;

use super::{Normalizer, NormalizerOption};
use crate::{Script, Token};

pub struct RussianNormalizer;

// Matches the decomposed forms of Ё/ё: Е/е followed by U+0308 COMBINING DIAERESIS.
static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
    AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap()
});

impl Normalizer for RussianNormalizer {
    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
        match token.char_map.take() {
            Some(mut char_map) => {
                // if a char_map already exists, iterate over it to reconstruct sub-strings.
                let mut lemma = String::new();
                let mut tail = token.lemma.as_ref();
                let mut normalized = String::new();
                for (_, normalized_len) in char_map.iter_mut() {
                    let (head, t) = tail.split_at(*normalized_len as usize);
                    tail = t;
                    normalized.clear();
                    // then normalize each sub-string, recomputing its size in the char_map.
                    let mut peekable = head.chars().peekable();
                    while let Some(c) = peekable.next() {
                        let (c, peek_consumed) = normalize_russian(c, peekable.peek());

                        // the combining mark was folded into `c`, so skip it.
                        if peek_consumed {
                            peekable.next();
                        }

                        normalized.push(c);
                    }

                    *normalized_len = normalized.len() as u8;
                    lemma.push_str(normalized.as_ref());
                }

                token.lemma = Cow::Owned(lemma);
                token.char_map = Some(char_map);
            }
            None => {
                // if no char_map exists, iterate over the lemma recomposing characters.
                let mut char_map = Vec::new();
                let mut lemma = String::new();
                let mut peekable = token.lemma.chars().peekable();
                while let Some(c) = peekable.next() {
                    let (normalized, peek_consumed) = normalize_russian(c, peekable.peek());

                    // when the combining mark was folded into `normalized`, consume it
                    // and count its bytes in the original character's length, so the
                    // char_map keeps mapping back to valid byte offsets in the source.
                    let mut original_len = c.len_utf8();
                    if peek_consumed {
                        if let Some(consumed) = peekable.next() {
                            original_len += consumed.len_utf8();
                        }
                    }

                    if options.create_char_map {
                        char_map.push((original_len as u8, normalized.len_utf8() as u8));
                    }
                    lemma.push(normalized);
                }
                token.lemma = Cow::Owned(lemma);
                if options.create_char_map {
                    token.char_map = Some(char_map);
                }
            }
        }

        token
    }

    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Cyrillic && MATCHING_STR.is_match(token.lemma())
    }
}

// https://en.wikipedia.org/wiki/Russian_alphabet
// Only decomposed forms are considered, as compatibility decomposition already
// takes care of the 1-codepoint forms.
fn normalize_russian(current: char, next: Option<&char>) -> (char, bool) {
    match (current, next) {
        // Ё -> Е and ё -> е: dropping the diaeresis is grammatically
        // permissible and common in writing.
        ('Е', Some('\u{308}')) => ('Е', true),
        ('е', Some('\u{308}')) => ('е', true),

        (c, _) => (c, false),
    }
}

#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use super::RussianNormalizer;
    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::Normalizer;
    use crate::token::TokenKind;
    use crate::{Script, Token};

    // base tokens to normalize.
    fn tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("Ёё".to_string()),
            char_end: 2,
            byte_end: 2,
            script: Script::Cyrillic,
            ..Default::default()
        }]
    }

    // expected result of the current Normalizer.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("Ёё".to_string()),
            char_end: 2,
            byte_end: 2,
            script: Script::Cyrillic,
            char_map: None,
            ..Default::default()
        }]
    }

    // expected result of the complete Normalizer pipeline.
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("ее".to_string()),
            char_end: 2,
            byte_end: 2,
            script: Script::Cyrillic,
            char_map: Some(vec![(2, 2), (2, 2)]),
            kind: TokenKind::Word,
            ..Default::default()
        }]
    }

    test_normalizer!(RussianNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
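
Why should_normalize searches for the decomposed sequences rather than the precomposed Ё/ё: as the comment above normalize_russian notes, compatibility decomposition has already split the 1-codepoint forms by the time this normalizer runs, so only the Е/е + U+0308 pairs can appear. A minimal standalone sketch of the matching behavior, using only the aho-corasick crate (the matcher binding is illustrative, not part of this commit):

use aho_corasick::AhoCorasick;

fn main() {
    // Same patterns as MATCHING_STR: Е/е followed by U+0308 COMBINING DIAERESIS.
    let matcher = AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap();

    // The precomposed code points (U+0401, U+0451) don't contain the
    // decomposed byte sequences, so they don't match...
    assert!(!matcher.is_match("Ёё"));

    // ...while the decomposed forms produced upstream in the pipeline do.
    assert!(matcher.is_match("Е\u{308}е\u{308}"));
}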
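
And a sketch of the normalizer's end-to-end behavior on an already-decomposed token, built only from the types and fields visible in this diff. It assumes crate-internal access, a RussianNormalizer re-export path (the mod.rs wiring is in another file of this commit), and that NormalizerOption implements Default; all three are assumptions, not shown here:

use std::borrow::Cow;

// Hypothetical in-crate usage; the re-export path is assumed.
use crate::normalizer::{Normalizer, NormalizerOption, RussianNormalizer};
use crate::{Script, Token};

fn demo() {
    // "Ё" after compatibility decomposition: Е + U+0308 (4 bytes, 2 chars).
    let token = Token {
        lemma: Cow::Owned("Е\u{308}".to_string()),
        script: Script::Cyrillic,
        ..Default::default()
    };

    // Assumption: NormalizerOption implements Default; only create_char_map
    // is read by this normalizer.
    let options = NormalizerOption { create_char_map: true, ..Default::default() };

    assert!(RussianNormalizer.should_normalize(&token));
    let normalized = RussianNormalizer.normalize(token, &options);

    // The diaeresis is dropped: the 4-byte decomposed pair maps to the
    // 2-byte "Е", recorded as a (4, 2) entry in the char_map.
    assert_eq!(normalized.lemma(), "Е");
    assert_eq!(normalized.char_map, Some(vec![(4, 2)]));
}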