meilisearch · bors · Oct 12, 2022 · Oct 5, 2022 · Oct 6, 2022 · Oct 6, 2022
diff --git a/src/detection/mod.rs b/src/detection/mod.rs
@@ -1,19 +1,22 @@
 pub use script_language::{Language, Script};
+use std::collections::HashMap;
+use whatlang::Detector;
 
 // file copy pasted from whatlang.
 #[allow(dead_code)]
 mod chars;
 mod script_language;
 
-pub struct StrDetection<'a> {
-    inner: &'a str,
+pub struct StrDetection<'o, 'al> {
+    inner: &'o str,
     pub script: Option<Script>,
     pub language: Option<Language>,
+    allow_list : Option<&'al HashMap<Script,Vec<Language>>>,
 }
 
-impl<'a> StrDetection<'a> {
-    pub fn new(inner: &'a str) -> Self {
-        Self { inner, script: None, language: None }
+impl<'o, 'al> StrDetection<'o, 'al> {
+    pub fn new(inner: &'o str, allow_list: Option<&'al HashMap<Script, Vec<Language>>>) -> Self {
+        Self { inner, script: None, language: None, allow_list } // script/language are detected lazily
     }
 
     pub fn script(&mut self) -> Script {
@@ -23,7 +26,8 @@ impl<'a> StrDetection<'a> {
 
     pub fn language(&mut self) -> Language {
         let inner = self.inner;
-        *self.language.get_or_insert_with(|| Self::detect_lang(inner))
+        let (script, allow_list) = (self.script(), self.allow_list); // copied out so the closure never borrows `self`
+        *self.language.get_or_insert_with(|| Self::detect_lang(inner, script, allow_list))
     }
 
     /// detect script with whatlang,
@@ -34,17 +38,24 @@ impl<'a> StrDetection<'a> {
 
     /// detect lang with whatlang
     /// if no language is detected, return Language::Other
-    fn detect_lang(text: &str) -> Language {
-        whatlang::detect_lang(text).map(Language::from).unwrap_or_default()
+    fn detect_lang(text: &str, script: Script, allow_list: Option<&HashMap<Script, Vec<Language>>>) -> Language {
+        // Restrict whatlang to the languages allowed for `script`; with no entry, use the default detector.
+        let detector = allow_list
+            .and_then(|allow_list| allow_list.get(&script))
+            .map(|langs| Detector::with_allowlist(langs.iter().map(|&lang| lang.into()).collect()))
+            .unwrap_or_default();
+
+        detector.detect_lang(text).map(Language::from).unwrap_or_default()
     }
 }
 
-pub trait Detect {
-    fn detect(&self) -> StrDetection;
+pub trait Detect<'o, 'al> {
+    fn detect(&'o self, allow_list: Option<&'al HashMap<Script,Vec<Language>>>) -> StrDetection<'o, 'al>;
 }
 
-impl Detect for &str {
-    fn detect(&self) -> StrDetection {
-        StrDetection::new(self)
+impl<'o, 'al> Detect<'o, 'al> for &str {
+    fn detect(&'o self, allow_list: Option<&'al HashMap<Script,Vec<Language>>>) -> StrDetection<'o, 'al> 
+    {
+        StrDetection::new(self,allow_list)
     }
 }
diff --git a/src/detection/script_language.rs b/src/detection/script_language.rs
@@ -15,6 +15,15 @@ macro_rules! make_language {
             }
         }
 
+        impl From<Language> for whatlang::Lang {
+            fn from(other: Language) -> whatlang::Lang {
+                match other {
+                    $(Language::$language => whatlang::Lang::$language), +,
+                    _ => whatlang::Lang::Eng, // lossy: any variant without a generated arm falls back to English
+                }
+            }
+        }
+
         impl Default for Language {
             fn default() -> Self {
                 Self::Other

diff --git a/src/segmenter/mod.rs b/src/segmenter/mod.rs
@@ -185,7 +185,10 @@ pub trait Segment<'o> {
     /// ```
     fn segment(&self) -> SegmentedTokenIter<'o>;
 
-    /// Segments the provided text creating an Iterator over `&str`.
+    /// Segments the provided text, creating an Iterator over Tokens; `allow_list` optionally restricts the languages considered for each script.
+    fn segment_with_allowlist(&self, allow_list: Option<&'o HashMap<Script,Vec<Language>>>) -> SegmentedTokenIter<'o>;
+
+    /// Segments the provided text creating an Iterator over `&str`.
     ///
     /// # Example
     ///
@@ -201,10 +204,43 @@ pub trait Segment<'o> {
     /// assert_eq!(segments.next(), Some("quick"));
     /// ```
     fn segment_str(&self) -> Box<dyn Iterator<Item = &'o str> + 'o>;
+
+    /// Segments the provided text, creating an Iterator over `&str`; `allow_list` optionally restricts the languages considered for each script.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use charabia::Segment;
+    /// use charabia::{Language, Script};
+    /// use std::collections::HashMap;
+    ///
+    /// let orig = "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!";
+    ///
+    /// let script_language_map = [
+    ///     (Script::Latin, vec![Language::Eng]),
+    /// ].into_iter().collect();
+    /// let allow_list: Option<&HashMap<Script,Vec<Language>>> = Some(&script_language_map);
+    /// let mut segments = orig.segment_str_with_allowlist(allow_list);
+    ///
+    /// assert_eq!(segments.next(), Some("The"));
+    /// assert_eq!(segments.next(), Some(" "));
+    /// assert_eq!(segments.next(), Some("quick"));
+    /// ```
+    fn segment_str_with_allowlist(&self, allow_list: Option<&'o HashMap<Script,Vec<Language>>>) -> Box<dyn Iterator<Item = &'o str> + 'o>;
+
 }
 
 impl<'o> Segment<'o> for &'o str {
     fn segment(&self) -> SegmentedTokenIter<'o> {
+        self.segment_with_allowlist(None)
+    }
+
+    fn segment_str(&self) -> Box<dyn Iterator<Item = &'o str> + 'o> {
+        self.segment_str_with_allowlist(None)
+    }
+
+    fn segment_with_allowlist(&self, allow_list: Option<&'o HashMap<Script,Vec<Language>>>) -> SegmentedTokenIter<'o>
+    {
         let mut current_script = Script::Other;
         let inner = self
             .linear_group_by_key(move |c| {
@@ -214,20 +250,20 @@ impl<'o> Segment<'o> for &'o str {
                 }
                 current_script
             })
-            .map(|s| {
-                let mut detector = s.detect();
+            .map(move |s| {
+                let mut detector = s.detect(allow_list);
                 let segmenter = segmenter(&mut detector);
                 let script = detector.script();
                 let language = detector.language;
                 InnerSegmentedTokenIter { inner: segmenter.segment_str(s), script, language }
             })
             .flatten();
 
-        SegmentedTokenIter { inner: Box::new(inner), char_index: 0, byte_index: 0 }
+        SegmentedTokenIter::<'o> { inner: Box::new(inner), char_index: 0, byte_index: 0 }
     }
 
-    fn segment_str(&self) -> Box<dyn Iterator<Item = &'o str> + 'o> {
-        let mut detector = self.detect();
+    fn segment_str_with_allowlist(&self, allow_list: Option<&'o HashMap<Script,Vec<Language>>>) -> Box<dyn Iterator<Item = &'o str> + 'o> {
+        let mut detector = self.detect(allow_list);
         let segmenter = segmenter(&mut detector);
 
         segmenter.segment_str(self)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -4,6 +4,8 @@ use crate::classifier::{ClassifiedTokenIter, Classify};
 use crate::normalizer::{Normalize, NormalizerOption};
 use crate::segmenter::{Segment, SegmentedTokenIter};
 use crate::Token;
+use crate::detection::{Language,Script};
+use std::collections::HashMap;
 
 /// Iterator over tuples of [`&str`] (part of the original text) and [`Token`].
 pub struct ReconstructedTokenIter<'o, 'sw, A: AsRef<[u8]>> {
@@ -93,15 +95,17 @@ impl<'o> Tokenize<'o, Vec<u8>> for &'o str {
 /// Structure used to tokenize a text with custom configurations.
 ///
 /// See [`TokenizerBuilder`] to know how to build a [`Tokenizer`].
-pub struct Tokenizer<'sw, A> {
+pub struct Tokenizer<'sw, 'al, A> {
     stop_words: Option<&'sw Set<A>>,
     normalizer_option: NormalizerOption,
+    allow_list : Option<&'al HashMap<Script,Vec<Language>>>,
 }
 
-impl<'o, A: AsRef<[u8]>> Tokenizer<'_, A> {
+impl<'o, A: AsRef<[u8]>> Tokenizer<'_, 'o, A>
+{
     pub fn tokenize(&self, original: &'o str) -> ClassifiedTokenIter<'o, '_, A> {
         original
-            .segment()
+            .segment_with_allowlist(self.allow_list)
             .normalize(self.normalizer_option)
             .classify_with_stop_words(self.stop_words)
     }
@@ -111,11 +115,11 @@ impl<'o, A: AsRef<[u8]>> Tokenizer<'_, A> {
     }
 
     pub fn segment(&self, original: &'o str) -> SegmentedTokenIter<'o> {
-        original.segment()
+        original.segment_with_allowlist(self.allow_list)
     }
 
     pub fn segment_str(&self, original: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
-        original.segment_str()
+        original.segment_str_with_allowlist(self.allow_list)
     }
 }
 
@@ -146,21 +150,22 @@ impl<'o, A: AsRef<[u8]>> Tokenizer<'_, A> {
 /// let tokenizer = builder.build();
 /// ```
 ///
-pub struct TokenizerBuilder<'sw, A> {
+pub struct TokenizerBuilder<'sw, 'al, A> {
     stop_words: Option<&'sw Set<A>>,
     normalizer_option: NormalizerOption,
+    allow_list : Option< &'al HashMap<Script,Vec<Language>>>,
 }
 
-impl<'sw, A> TokenizerBuilder<'sw, A> {
+impl<'sw,'al, A> TokenizerBuilder<'sw, 'al, A> {
     /// Create a `TokenizerBuilder` with default settings,
     ///
     /// if you don't plan to set stop_words, prefer use [`TokenizerBuilder::default`]
-    pub fn new() -> TokenizerBuilder<'sw, A> {
-        Self { stop_words: None, normalizer_option: NormalizerOption::default() }
+    pub fn new() -> TokenizerBuilder<'sw, 'al, A> {
+        Self { stop_words: None, normalizer_option: NormalizerOption::default(), allow_list: None }
     }
 }
 
-impl<'sw, A> TokenizerBuilder<'sw, A> {
+impl<'sw, 'al, A> TokenizerBuilder<'sw, 'al, A> {
     /// Configure the words that will be classified as `TokenKind::StopWord`.
     ///
     /// # Arguments
@@ -180,14 +185,24 @@ impl<'sw, A> TokenizerBuilder<'sw, A> {
         self.normalizer_option.create_char_map = create_char_map;
         self
     }
+
+    /// Configure which languages can be used for which script
+    ///
+    /// # Arguments
+    ///
+    /// * `allow_list` - a `HashMap` associating each script with the languages that autodetection is limited to for that script.
+    pub fn allow_list(&mut self, allow_list: &'al HashMap<Script,Vec<Language>>) -> &mut Self {
+        self.allow_list = Some(allow_list);
+        self
+    }
 
     /// Build the configurated `Tokenizer`.
-    pub fn build<'o>(&self) -> Tokenizer<'sw, A> {
-        Tokenizer { stop_words: self.stop_words, normalizer_option: self.normalizer_option }
+    pub fn build<'o>(&self) -> Tokenizer<'sw, 'al, A> {
+        Tokenizer { stop_words: self.stop_words, normalizer_option: self.normalizer_option, allow_list: self.allow_list }
     }
 }
 
-impl<'sw> Default for TokenizerBuilder<'sw, Vec<u8>> {
+impl<'sw, 'al> Default for TokenizerBuilder<'sw, 'al, Vec<u8>> {
     fn default() -> Self {
         Self::new()
     }