Skip to content

Commit

Permalink
revert fastBPE switch
Browse files Browse the repository at this point in the history
  • Loading branch information
yannvgn committed Dec 5, 2019
1 parent 3c3f297 commit 4c090f2
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 28 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,4 @@ dmypy.json
.pyre/

# PyCharm files
.idea/*
.idea/*
14 changes: 4 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Here's a summary of the differences:
| Part of the pipeline | LASER dependency (original project) | laserembeddings dependency (this package) | Reason |
|----------------------|-------------------------------------|----------------------------------------|--------|
| Normalization / tokenization | [Moses](https://github.com/moses-smt/mosesdecoder) | [Sacremoses](https://github.com/alvations/sacremoses) | Moses is implemented in Perl |

| BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code |

The following features have not been implemented yet:
- romanize, needed to process Greek (el)
Expand Down Expand Up @@ -124,20 +124,14 @@ A big thanks to the creators of [Sacremoses](https://github.com/alvations/sacrem

## Testing

First you'll need to checkout this repository and install it (in a virtual environment if you want).

Install [Poetry](https://github.com/sdispater/poetry)
```
pip install poetry
```
The first thing you'll need is [Poetry](https://github.com/sdispater/poetry). Please refer to the [installation guidelines](https://poetry.eustace.io/docs/#installation).

Once Poetry is installed, run
Clone this repository and install the project:
```
poetry install
```

Then, to run the tests:

To run the tests:
```
poetry run pytest
```
Expand Down
29 changes: 15 additions & 14 deletions laserembeddings/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import tempfile
from typing import TextIO, Union

import fastBPE
from sacremoses import MosesPunctNormalizer, MosesTokenizer
from sacremoses.util import xml_unescape
from subword_nmt.apply_bpe import BPE as subword_nmt_bpe, read_vocabulary

__all__ = ['Tokenizer', 'BPE']
from .utils import BPECodesAdapter

__all__ = ['Tokenizer', 'BPE']

###############################################################################
#
Expand Down Expand Up @@ -111,17 +111,18 @@ def __init__(self, bpe_codes: Union[str, TextIO],
f_bpe_vocab = None

try:
if not isinstance(bpe_codes, str):
f_bpe_codes = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8')
f_bpe_codes.write(bpe_codes.read())
bpe_codes = f_bpe_codes.name

if not isinstance(bpe_vocab, str):
f_bpe_vocab = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8')
f_bpe_vocab.write(bpe_vocab.read())
bpe_vocab = f_bpe_vocab.name
if isinstance(bpe_codes, str):
f_bpe_codes = open(bpe_codes, 'r', encoding='utf-8')
if isinstance(bpe_vocab, str):
f_bpe_vocab = open(bpe_vocab, 'r', encoding='utf-8')

self.bpe = subword_nmt_bpe(codes=BPECodesAdapter(f_bpe_codes
or bpe_codes),
vocab=read_vocabulary(f_bpe_vocab
or bpe_vocab,
threshold=None))
self.bpe.version = (0, 2)

self.bpe = fastBPE.fastBPE(bpe_codes, bpe_vocab)
finally:
if f_bpe_codes:
f_bpe_codes.close()
Expand All @@ -130,4 +131,4 @@ def __init__(self, bpe_codes: Union[str, TextIO],

def encode_tokens(self, sentence_tokens: str) -> str:
"""Returns the BPE-encoded sentence from a tokenized sentence"""
return self.bpe.apply([sentence_tokens])[0]
return self.bpe.process_line(sentence_tokens)
33 changes: 32 additions & 1 deletion laserembeddings/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,35 @@
__all__ = ['sre_performance_patch']
from typing import TextIO

__all__ = ['BPECodesAdapter', 'sre_performance_patch']


class BPECodesAdapter:
"""
A file object kind-of wrapper converting fastBPE codes to subword_nmt BPE codes.
Args:
bpe_codes_f (TextIO): the text-mode file object of fastBPE codes
"""

def __init__(self, bpe_codes_f: TextIO):
self.bpe_codes_f = bpe_codes_f

def seek(self, offset: int, whence: int = 0) -> int:
return self.bpe_codes_f.seek(offset, whence)

def readline(self, limit: int = -1) -> str:
return self._adapt_line(self.bpe_codes_f.readline(limit))

def __iter__(self):
return self

def __next__(self):
return self._adapt_line(next(self.bpe_codes_f))

@staticmethod
def _adapt_line(line: str) -> str:
parts = line.strip('\r\n ').split(' ')
return ' '.join(parts[:-1]) + '\n' if len(parts) == 3 else line


class sre_performance_patch:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.6"
torch = "^1.0.1.post2"
fastBPE = "0.1.0"
subword-nmt = "^0.3.6"
numpy = "^1.15.4"
sacremoses = "0.0.35"

Expand Down
24 changes: 23 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,26 @@
from laserembeddings.utils import sre_performance_patch
from io import StringIO

from laserembeddings.utils import BPECodesAdapter, sre_performance_patch


def test_bpe_codes_adapter():
test_f = StringIO(
'#version:2.0\ne n 52708119\ne r 51024442\ne n</w> 47209692')

adapted = BPECodesAdapter(test_f)

assert adapted.readline() == '#version:2.0\n'
assert adapted.readline() == 'e n\n'
assert adapted.readline() == 'e r\n'

for line in adapted:
assert line == 'e n</w>\n'

adapted.seek(0)

for line in adapted:
assert line == '#version:2.0\n'
break


def test_sre_performance_patch():
Expand Down

0 comments on commit 4c090f2

Please sign in to comment.