Merge pull request #13 from yannvgn/next
improve language support
yannvgn authored Dec 19, 2019
2 parents 1df0cc2 + bcf6097 commit 6934ded
Showing 8 changed files with 111 additions and 42 deletions.
37 changes: 31 additions & 6 deletions README.md
@@ -32,6 +32,19 @@ You'll need Python 3.6 or higher.
pip install laserembeddings
```

To install laserembeddings with extra dependencies:

```
# if you need Chinese support:
pip install laserembeddings[zh]
# if you need Japanese support:
pip install laserembeddings[ja]
# or both:
pip install laserembeddings[zh,ja]
```

### Downloading the pre-trained models

```
@@ -47,14 +60,25 @@ from laserembeddings import Laser

laser = Laser()

# if all sentences are in the same language:

embeddings = laser.embed_sentences(
['let your neural network be polyglot',
'use multilingual embeddings!'],
lang='en') # lang is used for tokenization
lang='en') # lang is only used for tokenization

# embeddings is a N*1024 (N = number of sentences) NumPy array
```

If the sentences are not in the same language, you can pass a list of language codes, one per sentence:
```python
embeddings = laser.embed_sentences(
['I love pasta.',
"J'adore les pâtes.",
'Ich liebe Pasta.'],
lang=['en', 'fr', 'de'])
```

If you downloaded the models into a specific directory:

```python
@@ -96,11 +120,7 @@ Here's a summary of the differences:
|----------------------|-------------------------------------|----------------------------------------|--------|
| Normalization / tokenization | [Moses](https://github.com/moses-smt/mosesdecoder) | [Sacremoses](https://github.com/alvations/sacremoses) | Moses is implemented in Perl |
| BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code |

The following features have not been implemented yet:
- romanize, needed to process Greek (el)
- Chinese text segmentation, needed to process Chinese (zh, cmn, wuu and yue)
- Japanese text segmentation, needed to process Japanese (ja, jpn)
| Japanese segmentation (optional) | [MeCab](https://github.com/taku910/mecab) / [JapaneseTokenizer](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers) | [mecab-python3](https://github.com/SamuraiT/mecab-python3) | mecab-python3 comes with wheels for major platforms (no compilation needed) |

## Will I get the exact same embeddings?

@@ -144,6 +164,11 @@ First, download the test data.
python -m laserembeddings download-test-data
```

Install extra dependencies (Chinese and Japanese support):
```
poetry install -E zh -E ja
```

👉 If you want to know more about the contents and the generation of the test data, check out the [laserembeddings-test-data](https://github.com/yannvgn/laserembeddings-test-data) repository.

Then, run the test with the `SIMILARITY_TEST` environment variable set to `1`.
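The README additions above cover Chinese and Japanese support end to end. Here is a minimal usage sketch of that workflow, assuming the pre-trained models have been downloaded and the extras installed; the example sentences are illustrative and not taken from the repository:

```python
from laserembeddings import Laser

laser = Laser()

# Chinese and Japanese sentences can be embedded once the 'zh' and 'ja'
# extras (jieba, mecab-python3) are installed.
embeddings = laser.embed_sentences(
    ['我喜欢吃意大利面。',     # Chinese, needs the 'zh' extra
     'パスタが大好きです。'],   # Japanese, needs the 'ja' extra
    lang=['zh', 'ja'])

# one 1024-dimensional vector per sentence
assert embeddings.shape == (2, 1024)
```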
4 changes: 2 additions & 2 deletions laserembeddings/__main__.py
@@ -73,7 +73,7 @@ def download_and_extract_test_data(output_dir):
print('')

download_file(
'https://github.com/yannvgn/laserembeddings-test-data/releases/download/v1.0.0/laserembeddings-test-data.tar.gz',
'https://github.com/yannvgn/laserembeddings-test-data/releases/download/v1.0.1/laserembeddings-test-data.tar.gz',
os.path.join(output_dir, 'laserembeddings-test-data.tar.gz'))

extract_tar(os.path.join(output_dir, 'laserembeddings-test-data.tar.gz'),
@@ -106,7 +106,7 @@ def main():
repository_root = os.path.dirname(
os.path.dirname(os.path.realpath(__file__)))

if os.path.basename(repository_root) != 'laserembeddings':
if not os.path.isfile(os.path.join(repository_root, 'pyproject.toml')):
print(
f"{CONSOLE_ERROR} Looks like you're not running laserembeddings from its source code"
)
12 changes: 8 additions & 4 deletions laserembeddings/laser.py
@@ -88,21 +88,25 @@ def _get_tokenizer(self, lang: str) -> Tokenizer:

return self.tokenizers[lang]

def embed_sentences(self, sentences: List[str], lang: str) -> np.ndarray:
def embed_sentences(self, sentences: Union[List[str], str],
lang: Union[str, List[str]]) -> np.ndarray:
"""
Computes the LASER embeddings of provided sentences using the tokenizer for the specified language.
Args:
sentences (List[str]): the sentences to compute the embeddings from.
lang (str): the language code (ISO 639-1) used to tokenize the sentences.
lang (str or List[str]): the language code(s) (ISO 639-1) used to tokenize the sentences
(either as a string - same code for every sentence - or as a list of strings - one code per sentence).
Returns:
np.ndarray: A N * 1024 NumPy array containing the embeddings, N being the number of sentences provided.
"""
sentences = [sentences] if isinstance(sentences, str) else sentences
lang = [lang] * len(sentences) if isinstance(lang, str) else lang
with sre_performance_patch(): # see https://bugs.python.org/issue37723
sentence_tokens = [
self._get_tokenizer(lang).tokenize(sentence)
for sentence in sentences
self._get_tokenizer(sentence_lang).tokenize(sentence)
for sentence, sentence_lang in zip(sentences, lang)
]
bpe_encoded = [
self.bpe.encode_tokens(tokens) for tokens in sentence_tokens
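The new `embed_sentences` signature accepts either a single string or a list of sentences, and either one language code for the whole batch or one code per sentence. A short sketch of the accepted input combinations, mirroring the updated tests (assumes the models are available locally):

```python
from laserembeddings import Laser

laser = Laser()

# a single string is wrapped into a one-element list
single = laser.embed_sentences('let your neural network be polyglot', lang='en')
assert single.shape == (1, 1024)

# one language code is broadcast to every sentence
same_lang = laser.embed_sentences(['hello world!', 'how are you?'], lang='en')

# or one code is given per sentence, and each sentence is tokenized accordingly
mixed = laser.embed_sentences(['hello world!', 'bonjour le monde !'],
                              lang=['en', 'fr'])

assert same_lang.shape == mixed.shape == (2, 1024)
```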
55 changes: 40 additions & 15 deletions laserembeddings/preprocessing.py
@@ -1,11 +1,24 @@
from typing import TextIO, Union
from typing import TextIO, Union, Optional

from sacremoses import MosesPunctNormalizer, MosesTokenizer
from sacremoses.util import xml_unescape
from subword_nmt.apply_bpe import BPE as subword_nmt_bpe, read_vocabulary
from transliterate import translit

from .utils import BPECodesAdapter

# Extras
try:
import jieba
jieba.setLogLevel(60)
except ImportError:
jieba = None

try:
import MeCab
except ImportError:
MeCab = None

__all__ = ['Tokenizer', 'BPE']

###############################################################################
@@ -23,16 +36,16 @@ class Tokenizer:
lang (str): the language code (ISO 639-1) of the texts to tokenize
lower_case (bool, optional): if True, the texts are lower-cased before being tokenized.
Defaults to True.
romanize (bool, optional): if True, the texts are romanized before being tokenized.
Defaults to False. Should be True for "el" language.
romanize (bool or None, optional): if True, the texts are romanized.
Defaults to None (romanization enabled based on input language).
descape (bool, optional): if True, the XML-escaped symbols get de-escaped.
Default to False.
"""

def __init__(self,
lang: str = 'en',
lower_case: bool = True,
romanize: bool = False,
romanize: Optional[bool] = None,
descape: bool = False):
assert lower_case, 'lower case is needed by all the models'

@@ -41,24 +54,27 @@ def __init__(self,
if lang == 'jpn':
lang = 'ja'

if lang == 'zh':
raise NotImplementedError('jieba is not yet implemented')
if lang == 'ja':
raise NotImplementedError('mecab is not yet implemented')
if romanize:
raise NotImplementedError('romanize is not yet implemented')
if lang == 'zh' and jieba is None:
raise ModuleNotFoundError(
'''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
)
if lang == 'ja' and MeCab is None:
raise ModuleNotFoundError(
'''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
)

self.lang = lang
self.lower_case = lower_case
self.romanize = romanize
self.romanize = romanize if romanize is not None else lang == 'el'
self.descape = descape

self.normalizer = MosesPunctNormalizer(lang=lang)
self.tokenizer = MosesTokenizer(lang=lang)
self.mecab_tokenizer = MeCab.Tagger(
"-O wakati -b 50000") if MeCab is not None else None

def tokenize(self, text: str) -> str:
"""Tokenizes a text and returns the tokens as a string"""
if self.lower_case:
text = text.lower()

# REM_NON_PRINT_CHAR
# not implemented
@@ -71,17 +87,26 @@ def tokenize(self, text: str) -> str:
text = xml_unescape(text)

# MOSES_TOKENIZER

# see: https://github.com/facebookresearch/LASER/issues/55#issuecomment-480881573
text = self.tokenizer.tokenize(text,
return_str=True,
escape=False,
aggressive_dash_splits=False)

# jieba
if self.lang == 'zh':
text = ' '.join(jieba.cut(text.rstrip('\r\n')))

# MECAB
if self.lang == 'ja':
text = self.mecab_tokenizer.parse(text).rstrip('\r\n')

# ROMAN_LC
# not implemented
if self.romanize:
text = translit(text, self.lang, reversed=True)

if self.lower_case:
text = text.lower()

return text

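To summarize the `Tokenizer` changes: romanization now defaults based on the input language (enabled for Greek), and Chinese or Japanese input requires the optional jieba / MeCab dependencies. A small sketch of the resulting behaviour, assuming an environment where the extras are not installed:

```python
from laserembeddings.preprocessing import Tokenizer

# romanization is now enabled automatically for Greek and disabled otherwise
assert Tokenizer('el').romanize
assert not Tokenizer('en').romanize

# without the 'zh' / 'ja' extras, constructing the tokenizer now fails with a
# hint on how to install them, instead of raising NotImplementedError
try:
    Tokenizer('zh')
except ModuleNotFoundError as error:
    print(error)  # suggests: pip install laserembeddings[zh]
```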
7 changes: 7 additions & 0 deletions pyproject.toml
@@ -14,12 +14,19 @@ torch = "^1.0.1.post2"
subword-nmt = "^0.3.6"
numpy = "^1.15.4"
sacremoses = "0.0.35"
transliterate = "1.10.2"
mecab-python3 = { version = "^0.996.2", optional = true }
jieba = { version = "0.39", optional = true }

[tool.poetry.dev-dependencies]
pytest = "^4.6"
yapf = "^0.27.0"
pylint = "^2.3"

[tool.poetry.extras]
zh = ["jieba"]
ja = ["mecab-python3"]

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
11 changes: 8 additions & 3 deletions tests/report/comparison-with-LASER.md
@@ -20,13 +20,15 @@
||cbk|1.00000|1.00000|
||ceb|1.00000|1.00000|
||ch|1.00000|1.00000|
||cmn|1.00000|1.00000|
||cs|1.00000|1.00000|
||csb|1.00000|1.00000|
||cy|1.00000|1.00000|
||da|1.00000|1.00000|
||de|1.00000|1.00000|
||dsb|1.00000|1.00000|
||dtp|1.00000|1.00000|
||el|1.00000|1.00000|
||en|1.00000|1.00000|
||eo|1.00000|1.00000|
||es|1.00000|1.00000|
@@ -52,6 +54,7 @@
||io|1.00000|1.00000|
||is|1.00000|1.00000|
||it|1.00000|1.00000|
||ja|1.00000|1.00000|
|⚠️|jv|0.99987|0.98719|
|⚠️|ka|0.99739|0.73893|
||kab|1.00000|1.00000|
@@ -74,7 +77,7 @@
||nb|1.00000|1.00000|
||nds|1.00000|1.00000|
||nl|1.00000|1.00000|
|⚠️|nn|0.99986|0.99229|
||nn|1.00000|1.00000|
||nov|1.00000|1.00000|
||oc|1.00000|1.00000|
||orv|1.00000|1.00000|
@@ -89,7 +92,7 @@
||sl|1.00000|1.00000|
||sq|1.00000|1.00000|
||sr|1.00000|1.00000|
|⚠️|sv|0.99766|0.76591|
||sv|1.00000|1.00000|
||swg|1.00000|1.00000|
||swh|1.00000|1.00000|
||ta|1.00000|1.00000|
@@ -98,14 +101,16 @@
||tk|1.00000|1.00000|
||tl|1.00000|1.00000|
||tr|1.00000|1.00000|
|⚠️|tt|0.99904|0.90426|
||tt|1.00000|1.00000|
||tzl|1.00000|1.00000|
||ug|1.00000|1.00000|
||uk|1.00000|1.00000|
||ur|1.00000|1.00000|
||uz|1.00000|1.00000|
||vi|1.00000|1.00000|
||war|1.00000|1.00000|
||wuu|1.00000|1.00000|
||xh|1.00000|1.00000|
|⚠️|yi|0.99958|0.96916|
||yue|1.00000|1.00000|
||zsm|1.00000|1.00000|
8 changes: 4 additions & 4 deletions tests/test_laser.py
@@ -18,6 +18,10 @@ def test_laser():
assert laser.embed_sentences(
['hello world!', 'i hope the tests are passing'],
lang='en').shape == (2, 1024)
assert laser.embed_sentences(['hello world!', "j'aime les pâtes"],
lang=['en', 'fr']).shape == (2, 1024)
assert laser.embed_sentences('hello world!',
lang='en').shape == (1, 1024)


def test_similarity(test_data):
@@ -46,10 +50,6 @@ def test_similarity(test_data):

for lang in test_data['langs']:

if lang in ('cmn', 'wuu', 'yue', 'zh', 'jpn', 'ja', 'el'):
# language not supported, ignoring
continue

sents = test_data[f'{lang}_sentences']
orig_embeddings = test_data[f'{lang}_embeddings']
embeddings = laser.embed_sentences(sents, lang)
19 changes: 11 additions & 8 deletions tests/test_preprocessing.py
@@ -3,19 +3,22 @@
from laserembeddings import Laser
from laserembeddings.preprocessing import Tokenizer, BPE

from laserembeddings.utils import sre_performance_patch


def test_tokenizer():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"
with sre_performance_patch():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"

with pytest.raises(NotImplementedError):
Tokenizer(romanize=True)
assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"

assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"
with pytest.raises(AssertionError):
Tokenizer(lower_case=False)

with pytest.raises(AssertionError):
Tokenizer(lower_case=False)
assert not Tokenizer('en').romanize
assert Tokenizer('el').romanize


def test_bpe():
