merge branch next
yannvgn committed Nov 1, 2019
2 parents e5f9012 + 0121919 commit 543d364
Showing 6 changed files with 54 additions and 35 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -35,8 +35,14 @@ pip install laserembeddings
To install laserembeddings with extra dependencies:

```
# if you need Chinese support:
pip install laserembeddings[zh]
# if you need Japanese support:
pip install laserembeddings[ja]
# or both:
pip install laserembeddings[zh,ja]
```
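
As a quick sanity check once an extra is installed and the models are downloaded (see the next section), a minimal sketch along these lines (the sentences are just placeholders; the `Laser` class is introduced further down in this README) should return one 1024-dimension vector per sentence:

```python
from laserembeddings import Laser

laser = Laser()

# Chinese needs the 'zh' extra (jieba), Japanese the 'ja' extra (mecab-python3)
zh_embeddings = laser.embed_sentences(['你好，世界！'], lang='zh')
ja_embeddings = laser.embed_sentences(['こんにちは、世界！'], lang='ja')

print(zh_embeddings.shape, ja_embeddings.shape)  # (1, 1024) (1, 1024)
```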

### Downloading the pre-trained models
@@ -104,10 +110,6 @@ Here's a summary of the differences:
| Normalization / tokenization | [Moses](https://github.com/moses-smt/mosesdecoder) | [Sacremoses](https://github.com/alvations/sacremoses) | Moses is implemented in Perl |
| BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code |
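
For illustration, here is a rough, standalone sketch of what the pure-Python replacements look like (the codes file path and the sentence are placeholders; laserembeddings wires these pieces together in `laserembeddings/preprocessing.py`, and uses a small `BPECodesAdapter` to read the fastBPE-formatted LASER codes):

```python
from sacremoses import MosesPunctNormalizer, MosesTokenizer
from subword_nmt.apply_bpe import BPE

# Moses-style punctuation normalization and tokenization, in pure Python
normalizer = MosesPunctNormalizer(lang='en')
tokenizer = MosesTokenizer(lang='en')
tokens = tokenizer.tokenize(normalizer.normalize('Hello, world!'),
                            return_str=True,
                            escape=False)

# BPE encoding with subword-nmt instead of fastBPE
# ('bpe.codes' stands for a codes file in subword-nmt format)
with open('bpe.codes', encoding='utf-8') as f_codes:
    bpe = BPE(f_codes)
print(bpe.process_line(tokens.lower()))
```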

The following features have not been implemented yet:
- romanize, needed to process Greek (el)
- Chinese text segmentation, needed to process Chinese (zh, cmn, wuu and yue)

## Will I get the exact same embeddings?

**For most languages, in most of the cases, yes.**
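
One way to quantify "same" is to compare the two sets of embeddings row by row with cosine similarity; a self-contained sketch (with random arrays standing in for real embeddings) could look like this:

```python
import numpy as np

def rowwise_cosine_similarity(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # cosine similarity between matching rows of two embedding matrices
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return (a * b).sum(axis=1)

# placeholders: in practice, compare laserembeddings output with
# embeddings produced by the original LASER for the same sentences
ours = np.random.rand(10, 1024).astype(np.float32)
reference = np.random.rand(10, 1024).astype(np.float32)
print(rowwise_cosine_similarity(ours, reference).mean())
```
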
@@ -150,9 +152,9 @@ First, download the test data.
python -m laserembeddings download-test-data
```

Install extra dependencies (Japanese support):
Install extra dependencies (Chinese and Japanese support):
```
poetry install -E ja
poetry install -E zh -E ja
```
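
With the test data and extras in place, the suite can then be run through Poetry, e.g.:

```
poetry run pytest
```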

👉 If you want to know more about the contents and the generation of the test data, check out the [laserembeddings-test-data](https://github.com/yannvgn/laserembeddings-test-data) repository.
39 changes: 25 additions & 14 deletions laserembeddings/preprocessing.py
@@ -1,12 +1,19 @@
from typing import TextIO, Union
from typing import TextIO, Union, Optional

from sacremoses import MosesPunctNormalizer, MosesTokenizer
from sacremoses.util import xml_unescape
from subword_nmt.apply_bpe import BPE as subword_nmt_bpe, read_vocabulary
from transliterate import translit

from .utils import BPECodesAdapter

# Extras
try:
    import jieba
    jieba.setLogLevel(60)
except ImportError:
    jieba = None

try:
    import MeCab
except ImportError:
@@ -29,16 +36,16 @@ class Tokenizer:
        lang (str): the language code (ISO 639-1) of the texts to tokenize
        lower_case (bool, optional): if True, the texts are lower-cased before being tokenized.
            Defaults to True.
        romanize (bool, optional): if True, the texts are romanized before being tokenized.
            Defaults to False. Should be True for "el" language.
        romanize (bool or None, optional): if True, the texts are romanized.
            Defaults to None (romanization enabled based on input language).
        descape (bool, optional): if True, the XML-escaped symbols get de-escaped.
            Defaults to False.
    """

    def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: bool = False,
                 romanize: Optional[bool] = None,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

@@ -47,18 +54,18 @@ def __init__(self,
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh':
            raise NotImplementedError('jieba is not yet implemented')
        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja' and MeCab is None:
            raise ModuleNotFoundError(
                '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
            )
        if romanize:
            raise NotImplementedError('romanize is not yet implemented')

        self.lang = lang
        self.lower_case = lower_case
        self.romanize = romanize
        self.romanize = romanize if romanize is not None else lang == 'el'
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
@@ -68,8 +75,6 @@ def __init__(self,

    def tokenize(self, text: str) -> str:
        """Tokenizes a text and returns the tokens as a string"""
        if self.lower_case:
            text = text.lower()

        # REM_NON_PRINT_CHAR
        # not implemented
@@ -82,20 +87,26 @@ def tokenize(self, text: str) -> str:
            text = xml_unescape(text)

        # MOSES_TOKENIZER

        # see: https://github.com/facebookresearch/LASER/issues/55#issuecomment-480881573
        text = self.tokenizer.tokenize(text,
                                       return_str=True,
                                       escape=False,
                                       aggressive_dash_splits=False)

        # jieba
        if self.lang == 'zh':
            text = ' '.join(jieba.cut(text.rstrip('\r\n')))

        # MECAB
        if self.lang == 'ja':
            text = self.mecab_tokenizer.parse(text).rstrip('\r\n')

        # jieba
        # ROMAN_LC
        # not implemented
        if self.romanize:
            text = translit(text, self.lang, reversed=True)

        if self.lower_case:
            text = text.lower()

        return text
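
To make the effect of these changes concrete, a rough usage sketch of the updated `Tokenizer` (the sample sentences are arbitrary; exact tokens depend on the installed jieba and transliterate versions):

```python
from laserembeddings.preprocessing import Tokenizer

# Chinese segmentation now goes through jieba (requires the 'zh' extra)
print(Tokenizer('zh').tokenize('我爱北京天安门'))

# Greek is romanized by default: romanize=None enables transliteration for 'el'
print(Tokenizer('el').tokenize('Καλημέρα κόσμε'))

# romanization can still be forced off (or on) explicitly
print(Tokenizer('el', romanize=False).tokenize('Καλημέρα κόσμε'))
```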

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -14,14 +14,17 @@ torch = "^1.0.1.post2"
subword-nmt = "^0.3.6"
numpy = "^1.15.4"
sacremoses = "0.0.35"
transliterate = "1.10.2"
mecab-python3 = { version = "^0.996.2", optional = true }
jieba = { version = "0.39", optional = true }

[tool.poetry.dev-dependencies]
pytest = "^4.6"
yapf = "^0.27.0"
pylint = "^2.3"

[tool.poetry.extras]
zh = ["jieba"]
ja = ["mecab-python3"]

[build-system]
10 changes: 7 additions & 3 deletions tests/report/comparison-with-LASER.md
@@ -20,13 +20,15 @@
||cbk|1.00000|1.00000|
||ceb|1.00000|1.00000|
||ch|1.00000|1.00000|
||cmn|1.00000|1.00000|
||cs|1.00000|1.00000|
||csb|1.00000|1.00000|
||cy|1.00000|1.00000|
||da|1.00000|1.00000|
||de|1.00000|1.00000|
||dsb|1.00000|1.00000|
||dtp|1.00000|1.00000|
||el|1.00000|1.00000|
||en|1.00000|1.00000|
||eo|1.00000|1.00000|
||es|1.00000|1.00000|
@@ -75,7 +77,7 @@
||nb|1.00000|1.00000|
||nds|1.00000|1.00000|
||nl|1.00000|1.00000|
|⚠️|nn|0.99986|0.99229|
||nn|1.00000|1.00000|
||nov|1.00000|1.00000|
||oc|1.00000|1.00000|
||orv|1.00000|1.00000|
@@ -90,7 +92,7 @@
||sl|1.00000|1.00000|
||sq|1.00000|1.00000|
||sr|1.00000|1.00000|
|⚠️|sv|0.99766|0.76591|
||sv|1.00000|1.00000|
||swg|1.00000|1.00000|
||swh|1.00000|1.00000|
||ta|1.00000|1.00000|
@@ -99,14 +101,16 @@
||tk|1.00000|1.00000|
||tl|1.00000|1.00000|
||tr|1.00000|1.00000|
|⚠️|tt|0.99904|0.90426|
||tt|1.00000|1.00000|
||tzl|1.00000|1.00000|
||ug|1.00000|1.00000|
||uk|1.00000|1.00000|
||ur|1.00000|1.00000|
||uz|1.00000|1.00000|
||vi|1.00000|1.00000|
||war|1.00000|1.00000|
||wuu|1.00000|1.00000|
||xh|1.00000|1.00000|
|⚠️|yi|0.99958|0.96916|
||yue|1.00000|1.00000|
||zsm|1.00000|1.00000|
4 changes: 0 additions & 4 deletions tests/test_laser.py
@@ -46,10 +46,6 @@ def test_similarity(test_data):

    for lang in test_data['langs']:

        if lang in ('cmn', 'wuu', 'yue', 'zh', 'el'):
            # language not supported, ignoring
            continue

        sents = test_data[f'{lang}_sentences']
        orig_embeddings = test_data[f'{lang}_embeddings']
        embeddings = laser.embed_sentences(sents, lang)
19 changes: 11 additions & 8 deletions tests/test_preprocessing.py
@@ -3,19 +3,22 @@
from laserembeddings import Laser
from laserembeddings.preprocessing import Tokenizer, BPE

from laserembeddings.utils import sre_performance_patch


def test_tokenizer():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"
with sre_performance_patch():
assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !"

with pytest.raises(NotImplementedError):
Tokenizer(romanize=True)
assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"

assert Tokenizer(
'en', descape=True).tokenize("Let's do it & pass that test!"
) == "let 's do it & pass that test !"
with pytest.raises(AssertionError):
Tokenizer(lower_case=False)

with pytest.raises(AssertionError):
Tokenizer(lower_case=False)
assert not Tokenizer('en').romanize
assert Tokenizer('el').romanize


def test_bpe():
