merge branch next
yannvgn committed Nov 1, 2019
2 parents e5f9012 + 0121919 commit 543d364
Showing 6 changed files with 54 additions and 35 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -35,8 +35,14 @@ pip install laserembeddings
To install laserembeddings with extra dependencies:

```
# if you need Chinese support:
pip install laserembeddings[zh]
# if you need Japanese support:
pip install laserembeddings[ja]
# or both:
pip install laserembeddings[zh,ja]
```
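
As a quick sanity check once an extra is installed and the models are downloaded (see the next section), a minimal sketch along these lines (the sentences are just placeholders; the `Laser` class is introduced further down in this README) should return one 1024-dimension vector per sentence:

```python
from laserembeddings import Laser

laser = Laser()

# Chinese needs the 'zh' extra (jieba), Japanese the 'ja' extra (mecab-python3)
zh_embeddings = laser.embed_sentences(['你好，世界！'], lang='zh')
ja_embeddings = laser.embed_sentences(['こんにちは、世界！'], lang='ja')

print(zh_embeddings.shape, ja_embeddings.shape)  # (1, 1024) (1, 1024)
```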

### Downloading the pre-trained models
@@ -104,10 +110,6 @@ Here's a summary of the differences:
| Normalization / tokenization | [Moses](https://github.com/moses-smt/mosesdecoder) | [Sacremoses](https://github.com/alvations/sacremoses) | Moses is implemented in Perl |
| BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code |
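
For illustration, here is a rough, standalone sketch of what the pure-Python replacements look like (the codes file path and the sentence are placeholders; laserembeddings wires these pieces together in `laserembeddings/preprocessing.py`, and uses a small `BPECodesAdapter` to read the fastBPE-formatted LASER codes):

```python
from sacremoses import MosesPunctNormalizer, MosesTokenizer
from subword_nmt.apply_bpe import BPE

# Moses-style punctuation normalization and tokenization, in pure Python
normalizer = MosesPunctNormalizer(lang='en')
tokenizer = MosesTokenizer(lang='en')
tokens = tokenizer.tokenize(normalizer.normalize('Hello, world!'),
                            return_str=True,
                            escape=False)

# BPE encoding with subword-nmt instead of fastBPE
# ('bpe.codes' stands for a codes file in subword-nmt format)
with open('bpe.codes', encoding='utf-8') as f_codes:
    bpe = BPE(f_codes)
print(bpe.process_line(tokens.lower()))
```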

The following features have not been implemented yet:
- romanize, needed to process Greek (el)
- Chinese text segmentation, needed to process Chinese (zh, cmn, wuu and yue)

## Will I get the exact same embeddings?

**For most languages, in most of the cases, yes.**
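
One way to quantify "same" is to compare the two sets of embeddings row by row with cosine similarity; a self-contained sketch (with random arrays standing in for real embeddings) could look like this:

```python
import numpy as np

def rowwise_cosine_similarity(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # cosine similarity between matching rows of two embedding matrices
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return (a * b).sum(axis=1)

# placeholders: in practice, compare laserembeddings output with
# embeddings produced by the original LASER for the same sentences
ours = np.random.rand(10, 1024).astype(np.float32)
reference = np.random.rand(10, 1024).astype(np.float32)
print(rowwise_cosine_similarity(ours, reference).mean())
```
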
@@ -150,9 +152,9 @@ First, download the test data.
python -m laserembeddings download-test-data
```

Install extra dependencies (Japanese support):
Install extra dependencies (Chinese and Japanese support):
```
poetry install -E ja
poetry install -E zh -E ja
```
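
With the test data and extras in place, the suite can then be run through Poetry, e.g.:

```
poetry run pytest
```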

👉 If you want to know more about the contents and the generation of the test data, check out the [laserembeddings-test-data](https://github.com/yannvgn/laserembeddings-test-data) repository.
39 changes: 25 additions & 14 deletions laserembeddings/preprocessing.py
@@ -1,12 +1,19 @@
from typing import TextIO, Union
from typing import TextIO, Union, Optional

from sacremoses import MosesPunctNormalizer, MosesTokenizer
from sacremoses.util import xml_unescape
from subword_nmt.apply_bpe import BPE as subword_nmt_bpe, read_vocabulary
from transliterate import translit

from .utils import BPECodesAdapter

# Extras
try:
    import jieba
    jieba.setLogLevel(60)
except ImportError:
    jieba = None

try:
    import MeCab
except ImportError:
@@ -29,16 +36,16 @@ class Tokenizer:
        lang (str): the language code (ISO 639-1) of the texts to tokenize
        lower_case (bool, optional): if True, the texts are lower-cased before being tokenized.
            Defaults to True.
        romanize (bool, optional): if True, the texts are romanized before being tokenized.
            Defaults to False. Should be True for "el" language.
        romanize (bool or None, optional): if True, the texts are romanized.
            Defaults to None (romanization enabled based on input language).
        descape (bool, optional): if True, the XML-escaped symbols get de-escaped.
            Defaults to False.
    """

    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: bool = False,
                 romanize: Optional[bool] = None,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

@@ -47,18 +54,18 @@ def __init__(self,
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh':
            raise NotImplementedError('jieba is not yet implemented')
        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja' and MeCab is None:
            raise ModuleNotFoundError(
                '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
            )
        if romanize:
            raise NotImplementedError('romanize is not yet implemented')

        self.lang = lang
        self.lower_case = lower_case
        self.romanize = romanize
        self.romanize = romanize if romanize is not None else lang == 'el'
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
@@ -68,8 +75,6 @@ def __init__(self,

    def tokenize(self, text: str) -> str:
        """Tokenizes a text and returns the tokens as a string"""
        if self.lower_case:
            text = text.lower()

        # REM_NON_PRINT_CHAR
        # not implemented
@@ -82,20 +87,26 @@ def tokenize(self, text: str) -> str:
            text = xml_unescape(text)

        # MOSES_TOKENIZER

        # see: https://github.com/facebookresearch/LASER/issues/55#issuecomment-480881573
        text = self.tokenizer.tokenize(text,
                                       return_str=True,
                                       escape=False,
                                       aggressive_dash_splits=False)

        # jieba
        if self.lang == 'zh':
            text = ' '.join(jieba.cut(text.rstrip('\r\n')))

        # MECAB
        if self.lang == 'ja':
            text = self.mecab_tokenizer.parse(text).rstrip('\r\n')

        # jieba
        # ROMAN_LC
        # not implemented
        if self.romanize:
            text = translit(text, self.lang, reversed=True)

        if self.lower_case:
            text = text.lower()

        return text
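
To make the effect of these changes concrete, a rough usage sketch of the updated `Tokenizer` (the sample sentences are arbitrary; exact tokens depend on the installed jieba and transliterate versions):

```python
from laserembeddings.preprocessing import Tokenizer

# Chinese segmentation now goes through jieba (requires the 'zh' extra)
print(Tokenizer('zh').tokenize('我爱北京天安门'))

# Greek is romanized by default: romanize=None enables transliteration for 'el'
print(Tokenizer('el').tokenize('Καλημέρα κόσμε'))

# romanization can still be forced off (or on) explicitly
print(Tokenizer('el', romanize=False).tokenize('Καλημέρα κόσμε'))
```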

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -14,14 +14,17 @@ torch = "^1.0.1.post2"
subword-nmt = "^0.3.6"
numpy = "^1.15.4"
sacremoses = "0.0.35"
transliterate = "1.10.2"
mecab-python3 = { version = "^0.996.2", optional = true }
jieba = { version = "0.39", optional = true }

[tool.poetry.dev-dependencies]
pytest = "^4.6"
yapf = "^0.27.0"
pylint = "^2.3"

[tool.poetry.extras]
zh = ["jieba"]
ja = ["mecab-python3"]

[build-system]
10 changes: 7 additions & 3 deletions tests/report/comparison-with-LASER.md
@@ -20,13 +20,15 @@
||cbk|1.00000|1.00000|
||ceb|1.00000|1.00000|
||ch|1.00000|1.00000|
||cmn|1.00000|1.00000|
||cs|1.00000|1.00000|
||csb|1.00000|1.00000|
||cy|1.00000|1.00000|
||da|1.00000|1.00000|
||de|1.00000|1.00000|
||dsb|1.00000|1.00000|
||dtp|1.00000|1.00000|
||el|1.00000|1.00000|
||en|1.00000|1.00000|
||eo|1.00000|1.00000|
||es|1.00000|1.00000|
@@ -75,7 +77,7 @@
||nb|1.00000|1.00000|
||nds|1.00000|1.00000|
||nl|1.00000|1.00000|
|⚠️|nn|0.99986|0.99229|
||nn|1.00000|1.00000|
||nov|1.00000|1.00000|
||oc|1.00000|1.00000|
||orv|1.00000|1.00000|
@@ -90,7 +92,7 @@
||sl|1.00000|1.00000|
||sq|1.00000|1.00000|
||sr|1.00000|1.00000|
|⚠️|sv|0.99766|0.76591|
||sv|1.00000|1.00000|
||swg|1.00000|1.00000|
||swh|1.00000|1.00000|
||ta|1.00000|1.00000|
@@ -99,14 +101,16 @@
||tk|1.00000|1.00000|
||tl|1.00000|1.00000|
||tr|1.00000|1.00000|
|⚠️|tt|0.99904|0.90426|
||tt|1.00000|1.00000|
||tzl|1.00000|1.00000|
||ug|1.00000|1.00000|
||uk|1.00000|1.00000|
||ur|1.00000|1.00000|
||uz|1.00000|1.00000|
||vi|1.00000|1.00000|
||war|1.00000|1.00000|
||wuu|1.00000|1.00000|
||xh|1.00000|1.00000|
|⚠️|yi|0.99958|0.96916|
||yue|1.00000|1.00000|
||zsm|1.00000|1.00000|
4 changes: 0 additions & 4 deletions tests/test_laser.py
@@ -46,10 +46,6 @@ def test_similarity(test_data):

    for lang in test_data['langs']:

        if lang in ('cmn', 'wuu', 'yue', 'zh', 'el'):
            # language not supported, ignoring
            continue

        sents = test_data[f'{lang}_sentences']
        orig_embeddings = test_data[f'{lang}_embeddings']
        embeddings = laser.embed_sentences(sents, lang)
19 changes: 11 additions & 8 deletions tests/test_preprocessing.py
@@ -3,19 +3,22 @@
from laserembeddings import Laser
from laserembeddings.preprocessing import Tokenizer, BPE

from laserembeddings.utils import sre_performance_patch


def test_tokenizer():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"
with sre_performance_patch():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"

with pytest.raises(NotImplementedError):
Tokenizer(romanize=True)
assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"

assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"
with pytest.raises(AssertionError):
Tokenizer(lower_case=False)

with pytest.raises(AssertionError):
Tokenizer(lower_case=False)
assert not Tokenizer('en').romanize
assert Tokenizer('el').romanize


def test_bpe():
