Merge pull request #13 from yannvgn/next
improve language support
yannvgn authored Dec 19, 2019
2 parents 1df0cc2 + bcf6097 commit 6934ded
Showing 8 changed files with 111 additions and 42 deletions.
37 changes: 31 additions & 6 deletions README.md
@@ -32,6 +32,19 @@ You'll need Python 3.6 or higher.
pip install laserembeddings
```

To install laserembeddings with extra dependencies:

```
# if you need Chinese support:
pip install laserembeddings[zh]
# if you need Japanese support:
pip install laserembeddings[ja]
# or both:
pip install laserembeddings[zh,ja]
```

### Downloading the pre-trained models

```
@@ -47,14 +60,25 @@ from laserembeddings import Laser

laser = Laser()

# if all sentences are in the same language:

embeddings = laser.embed_sentences(
['let your neural network be polyglot',
'use multilingual embeddings!'],
lang='en') # lang is used for tokenization
lang='en') # lang is only used for tokenization

# embeddings is a N*1024 (N = number of sentences) NumPy array
```

If the sentences are not in the same language, you can pass a list of language codes, one per sentence:
```python
embeddings = laser.embed_sentences(
['I love pasta.',
"J'adore les pâtes.",
'Ich liebe Pasta.'],
lang=['en', 'fr', 'de'])
```

If you downloaded the models into a specific directory:

```python
@@ -96,11 +120,7 @@ Here's a summary of the differences:
|----------------------|-------------------------------------|----------------------------------------|--------|
| Normalization / tokenization | [Moses](https://github.com/moses-smt/mosesdecoder) | [Sacremoses](https://github.com/alvations/sacremoses) | Moses is implemented in Perl |
| BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code |

The following features have not been implemented yet:
- romanize, needed to process Greek (el)
- Chinese text segmentation, needed to process Chinese (zh, cmn, wuu and yue)
- Japanese text segmentation, needed to process Japanese (ja, jpn)
| Japanese segmentation (optional) | [MeCab](https://github.com/taku910/mecab) / [JapaneseTokenizer](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers) | [mecab-python3](https://github.com/SamuraiT/mecab-python3) | mecab-python3 comes with wheels for major platforms (no compilation needed) |

## Will I get the exact same embeddings?

@@ -144,6 +164,11 @@ First, download the test data.
python -m laserembeddings download-test-data
```

Install extra dependencies (Chinese and Japanese support):
```
poetry install -E zh -E ja
```

👉 If you want to know more about the contents and the generation of the test data, check out the [laserembeddings-test-data](https://github.com/yannvgn/laserembeddings-test-data) repository.

Then, run the test with the `SIMILARITY_TEST` environment variable set to `1`.
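The README additions above cover Chinese and Japanese support end to end. Here is a minimal usage sketch of that workflow, assuming the pre-trained models have been downloaded and the extras installed; the example sentences are illustrative and not taken from the repository:

```python
from laserembeddings import Laser

laser = Laser()

# Chinese and Japanese sentences can be embedded once the 'zh' and 'ja'
# extras (jieba, mecab-python3) are installed.
embeddings = laser.embed_sentences(
    ['我喜欢吃意大利面。',     # Chinese, needs the 'zh' extra
     'パスタが大好きです。'],   # Japanese, needs the 'ja' extra
    lang=['zh', 'ja'])

# one 1024-dimensional vector per sentence
assert embeddings.shape == (2, 1024)
```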
4 changes: 2 additions & 2 deletions laserembeddings/__main__.py
@@ -73,7 +73,7 @@ def download_and_extract_test_data(output_dir):
print('')

download_file(
'https://github.com/yannvgn/laserembeddings-test-data/releases/download/v1.0.0/laserembeddings-test-data.tar.gz',
'https://github.com/yannvgn/laserembeddings-test-data/releases/download/v1.0.1/laserembeddings-test-data.tar.gz',
os.path.join(output_dir, 'laserembeddings-test-data.tar.gz'))

extract_tar(os.path.join(output_dir, 'laserembeddings-test-data.tar.gz'),
@@ -106,7 +106,7 @@ def main():
repository_root = os.path.dirname(
os.path.dirname(os.path.realpath(__file__)))

if os.path.basename(repository_root) != 'laserembeddings':
if not os.path.isfile(os.path.join(repository_root, 'pyproject.toml')):
print(
f"{CONSOLE_ERROR} Looks like you're not running laserembeddings from its source code"
)
12 changes: 8 additions & 4 deletions laserembeddings/laser.py
@@ -88,21 +88,25 @@ def _get_tokenizer(self, lang: str) -> Tokenizer:

return self.tokenizers[lang]

def embed_sentences(self, sentences: List[str], lang: str) -> np.ndarray:
def embed_sentences(self, sentences: Union[List[str], str],
lang: Union[str, List[str]]) -> np.ndarray:
"""
Computes the LASER embeddings of provided sentences using the tokenizer for the specified language.
Args:
sentences (List[str]): the sentences to compute the embeddings from.
lang (str): the language code (ISO 639-1) used to tokenize the sentences.
lang (str or List[str]): the language code(s) (ISO 639-1) used to tokenize the sentences
(either as a string - same code for every sentence - or as a list of strings - one code per sentence).
Returns:
np.ndarray: A N * 1024 NumPy array containing the embeddings, N being the number of sentences provided.
"""
sentences = [sentences] if isinstance(sentences, str) else sentences
lang = [lang] * len(sentences) if isinstance(lang, str) else lang
with sre_performance_patch(): # see https://bugs.python.org/issue37723
sentence_tokens = [
self._get_tokenizer(lang).tokenize(sentence)
for sentence in sentences
self._get_tokenizer(sentence_lang).tokenize(sentence)
for sentence, sentence_lang in zip(sentences, lang)
]
bpe_encoded = [
self.bpe.encode_tokens(tokens) for tokens in sentence_tokens
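The new `embed_sentences` signature accepts either a single string or a list of sentences, and either one language code for the whole batch or one code per sentence. A short sketch of the accepted input combinations, mirroring the updated tests (assumes the models are available locally):

```python
from laserembeddings import Laser

laser = Laser()

# a single string is wrapped into a one-element list
single = laser.embed_sentences('let your neural network be polyglot', lang='en')
assert single.shape == (1, 1024)

# one language code is broadcast to every sentence
same_lang = laser.embed_sentences(['hello world!', 'how are you?'], lang='en')

# or one code is given per sentence, and each sentence is tokenized accordingly
mixed = laser.embed_sentences(['hello world!', 'bonjour le monde !'],
                              lang=['en', 'fr'])

assert same_lang.shape == mixed.shape == (2, 1024)
```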
55 changes: 40 additions & 15 deletions laserembeddings/preprocessing.py
@@ -1,11 +1,24 @@
from typing import TextIO, Union
from typing import TextIO, Union, Optional

from sacremoses import MosesPunctNormalizer, MosesTokenizer
from sacremoses.util import xml_unescape
from subword_nmt.apply_bpe import BPE as subword_nmt_bpe, read_vocabulary
from transliterate import translit

from .utils import BPECodesAdapter

# Extras
try:
import jieba
jieba.setLogLevel(60)
except ImportError:
jieba = None

try:
import MeCab
except ImportError:
MeCab = None

__all__ = ['Tokenizer', 'BPE']

###############################################################################
@@ -23,16 +36,16 @@ class Tokenizer:
lang (str): the language code (ISO 639-1) of the texts to tokenize
lower_case (bool, optional): if True, the texts are lower-cased before being tokenized.
Defaults to True.
romanize (bool, optional): if True, the texts are romanized before being tokenized.
Defaults to False. Should be True for "el" language.
romanize (bool or None, optional): if True, the texts are romanized.
Defaults to None (romanization enabled based on input language).
descape (bool, optional): if True, the XML-escaped symbols get de-escaped.
Default to False.
"""

def __init__(self,
lang: str = 'en',
lower_case: bool = True,
romanize: bool = False,
romanize: Optional[bool] = None,
descape: bool = False):
assert lower_case, 'lower case is needed by all the models'

@@ -41,24 +54,27 @@ def __init__(self,
if lang == 'jpn':
lang = 'ja'

if lang == 'zh':
raise NotImplementedError('jieba is not yet implemented')
if lang == 'ja':
raise NotImplementedError('mecab is not yet implemented')
if romanize:
raise NotImplementedError('romanize is not yet implemented')
if lang == 'zh' and jieba is None:
raise ModuleNotFoundError(
'''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
)
if lang == 'ja' and MeCab is None:
raise ModuleNotFoundError(
'''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
)

self.lang = lang
self.lower_case = lower_case
self.romanize = romanize
self.romanize = romanize if romanize is not None else lang == 'el'
self.descape = descape

self.normalizer = MosesPunctNormalizer(lang=lang)
self.tokenizer = MosesTokenizer(lang=lang)
self.mecab_tokenizer = MeCab.Tagger(
"-O wakati -b 50000") if MeCab is not None else None

def tokenize(self, text: str) -> str:
"""Tokenizes a text and returns the tokens as a string"""
if self.lower_case:
text = text.lower()

# REM_NON_PRINT_CHAR
# not implemented
@@ -71,17 +87,26 @@ def tokenize(self, text: str) -> str:
text = xml_unescape(text)

# MOSES_TOKENIZER

# see: https://github.com/facebookresearch/LASER/issues/55#issuecomment-480881573
text = self.tokenizer.tokenize(text,
return_str=True,
escape=False,
aggressive_dash_splits=False)

# jieba
if self.lang == 'zh':
text = ' '.join(jieba.cut(text.rstrip('\r\n')))

# MECAB
if self.lang == 'ja':
text = self.mecab_tokenizer.parse(text).rstrip('\r\n')

# ROMAN_LC
# not implemented
if self.romanize:
text = translit(text, self.lang, reversed=True)

if self.lower_case:
text = text.lower()

return text

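To summarize the `Tokenizer` changes: romanization now defaults based on the input language (enabled for Greek), and Chinese or Japanese input requires the optional jieba / MeCab dependencies. A small sketch of the resulting behaviour, assuming an environment where the extras are not installed:

```python
from laserembeddings.preprocessing import Tokenizer

# romanization is now enabled automatically for Greek and disabled otherwise
assert Tokenizer('el').romanize
assert not Tokenizer('en').romanize

# without the 'zh' / 'ja' extras, constructing the tokenizer now fails with a
# hint on how to install them, instead of raising NotImplementedError
try:
    Tokenizer('zh')
except ModuleNotFoundError as error:
    print(error)  # suggests: pip install laserembeddings[zh]
```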
7 changes: 7 additions & 0 deletions pyproject.toml
@@ -14,12 +14,19 @@ torch = "^1.0.1.post2"
subword-nmt = "^0.3.6"
numpy = "^1.15.4"
sacremoses = "0.0.35"
transliterate = "1.10.2"
mecab-python3 = { version = "^0.996.2", optional = true }
jieba = { version = "0.39", optional = true }

[tool.poetry.dev-dependencies]
pytest = "^4.6"
yapf = "^0.27.0"
pylint = "^2.3"

[tool.poetry.extras]
zh = ["jieba"]
ja = ["mecab-python3"]

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
11 changes: 8 additions & 3 deletions tests/report/comparison-with-LASER.md
@@ -20,13 +20,15 @@
||cbk|1.00000|1.00000|
||ceb|1.00000|1.00000|
||ch|1.00000|1.00000|
||cmn|1.00000|1.00000|
||cs|1.00000|1.00000|
||csb|1.00000|1.00000|
||cy|1.00000|1.00000|
||da|1.00000|1.00000|
||de|1.00000|1.00000|
||dsb|1.00000|1.00000|
||dtp|1.00000|1.00000|
||el|1.00000|1.00000|
||en|1.00000|1.00000|
||eo|1.00000|1.00000|
||es|1.00000|1.00000|
@@ -52,6 +54,7 @@
||io|1.00000|1.00000|
||is|1.00000|1.00000|
||it|1.00000|1.00000|
||ja|1.00000|1.00000|
|⚠️|jv|0.99987|0.98719|
|⚠️|ka|0.99739|0.73893|
||kab|1.00000|1.00000|
@@ -74,7 +77,7 @@
||nb|1.00000|1.00000|
||nds|1.00000|1.00000|
||nl|1.00000|1.00000|
|⚠️|nn|0.99986|0.99229|
||nn|1.00000|1.00000|
||nov|1.00000|1.00000|
||oc|1.00000|1.00000|
||orv|1.00000|1.00000|
@@ -89,7 +92,7 @@
||sl|1.00000|1.00000|
||sq|1.00000|1.00000|
||sr|1.00000|1.00000|
|⚠️|sv|0.99766|0.76591|
||sv|1.00000|1.00000|
||swg|1.00000|1.00000|
||swh|1.00000|1.00000|
||ta|1.00000|1.00000|
@@ -98,14 +101,16 @@
||tk|1.00000|1.00000|
||tl|1.00000|1.00000|
||tr|1.00000|1.00000|
|⚠️|tt|0.99904|0.90426|
||tt|1.00000|1.00000|
||tzl|1.00000|1.00000|
||ug|1.00000|1.00000|
||uk|1.00000|1.00000|
||ur|1.00000|1.00000|
||uz|1.00000|1.00000|
||vi|1.00000|1.00000|
||war|1.00000|1.00000|
||wuu|1.00000|1.00000|
||xh|1.00000|1.00000|
|⚠️|yi|0.99958|0.96916|
||yue|1.00000|1.00000|
||zsm|1.00000|1.00000|
8 changes: 4 additions & 4 deletions tests/test_laser.py
@@ -18,6 +18,10 @@ def test_laser():
assert laser.embed_sentences(
['hello world!', 'i hope the tests are passing'],
lang='en').shape == (2, 1024)
assert laser.embed_sentences(['hello world!', "j'aime les pâtes"],
lang=['en', 'fr']).shape == (2, 1024)
assert laser.embed_sentences('hello world!',
lang='en').shape == (1, 1024)


def test_similarity(test_data):
@@ -46,10 +50,6 @@ def test_similarity(test_data):

for lang in test_data['langs']:

if lang in ('cmn', 'wuu', 'yue', 'zh', 'jpn', 'ja', 'el'):
# language not supported, ignoring
continue

sents = test_data[f'{lang}_sentences']
orig_embeddings = test_data[f'{lang}_embeddings']
embeddings = laser.embed_sentences(sents, lang)
19 changes: 11 additions & 8 deletions tests/test_preprocessing.py
@@ -3,19 +3,22 @@
from laserembeddings import Laser
from laserembeddings.preprocessing import Tokenizer, BPE

from laserembeddings.utils import sre_performance_patch


def test_tokenizer():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"
with sre_performance_patch():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"

with pytest.raises(NotImplementedError):
Tokenizer(romanize=True)
assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"

assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"
with pytest.raises(AssertionError):
Tokenizer(lower_case=False)

with pytest.raises(AssertionError):
Tokenizer(lower_case=False)
assert not Tokenizer('en').romanize
assert Tokenizer('el').romanize


def test_bpe():
