Merge pull request #8 from yannvgn/zh-support
add Chinese language support
yannvgn authored Nov 1, 2019
2 parents 80479a5 + 0fb3719 commit 0121919
Showing 5 changed files with 34 additions and 4 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -32,6 +32,13 @@ You'll need Python 3.6 or higher.
pip install laserembeddings
```

To install laserembeddings with extra dependencies:

```
# if you need Chinese support:
pip install laserembeddings[zh]
```

### Downloading the pre-trained models

```
@@ -98,7 +105,6 @@ Here's a summary of the differences:
| BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code |

The following features have not been implemented yet:
- Chinese text segmentation, needed to process Chinese (zh, cmn, wuu and yue)
- Japanese text segmentation, needed to process Japanese (ja, jpn)

## Will I get the exact same embeddings?
@@ -143,6 +149,11 @@ First, download the test data.
python -m laserembeddings download-test-data
```

Install extra dependencies (Chinese support):
```
poetry install -E zh
```

👉 If you want to know more about the contents and the generation of the test data, check out the [laserembeddings-test-data](https://github.com/yannvgn/laserembeddings-test-data) repository.

Then, run the test with the `SIMILARITY_TEST` environment variable set to `1`.
16 changes: 14 additions & 2 deletions laserembeddings/preprocessing.py
@@ -7,6 +7,13 @@

from .utils import BPECodesAdapter

# Extras
try:
    import jieba
    jieba.setLogLevel(60)
except ImportError:
    jieba = None

__all__ = ['Tokenizer', 'BPE']

###############################################################################
@@ -42,8 +49,10 @@ def __init__(self,
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh':
            raise NotImplementedError('jieba is not yet implemented')
        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja':
            raise NotImplementedError('mecab is not yet implemented')

@@ -76,6 +85,9 @@ def tokenize(self, text: str) -> str:
aggressive_dash_splits=False)

        # jieba
        if self.lang == 'zh':
            text = ' '.join(jieba.cut(text.rstrip('\r\n')))

        # MECAB
        # not implemented

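The preprocessing.py change above uses a deferred optional-dependency pattern: the extra is imported at module load, and the hard error is only raised when Chinese tokenization is actually requested. A minimal, self-contained sketch of that pattern (`segment_chinese` is a hypothetical helper name, not part of the library):

```python
# Optional-dependency pattern from preprocessing.py: try the import once,
# fall back to None, and fail loudly only when the feature is used.
try:
    import jieba  # Chinese word segmentation, installed via the 'zh' extra
    jieba.setLogLevel(60)  # silence jieba's startup logging
except ImportError:
    jieba = None


def segment_chinese(text: str) -> str:
    """Whitespace-join jieba's segments, as Tokenizer.tokenize does for zh."""
    if jieba is None:
        raise ModuleNotFoundError(
            "No module named 'jieba'. Install laserembeddings with the "
            "'zh' extra to fix that: pip install laserembeddings[zh]")
    return ' '.join(jieba.cut(text.rstrip('\r\n')))
```

This keeps `import laserembeddings` working for users who never touch Chinese text, while giving an actionable error message to those who do.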
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -15,12 +15,16 @@ subword-nmt = "^0.3.6"
numpy = "^1.15.4"
sacremoses = "0.0.35"
transliterate = "1.10.2"
jieba = { version = "0.39", optional = true }

[tool.poetry.dev-dependencies]
pytest = "^4.6"
yapf = "^0.27.0"
pylint = "^2.3"

[tool.poetry.extras]
zh = ["jieba"]

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
3 changes: 3 additions & 0 deletions tests/report/comparison-with-LASER.md
@@ -20,6 +20,7 @@
||cbk|1.00000|1.00000|
||ceb|1.00000|1.00000|
||ch|1.00000|1.00000|
||cmn|1.00000|1.00000|
||cs|1.00000|1.00000|
||csb|1.00000|1.00000|
||cy|1.00000|1.00000|
@@ -107,6 +108,8 @@
||uz|1.00000|1.00000|
||vi|1.00000|1.00000|
||war|1.00000|1.00000|
||wuu|1.00000|1.00000|
||xh|1.00000|1.00000|
|⚠️|yi|0.99958|0.96916|
||yue|1.00000|1.00000|
||zsm|1.00000|1.00000|
2 changes: 1 addition & 1 deletion tests/test_laser.py
@@ -46,7 +46,7 @@ def test_similarity(test_data):

for lang in test_data['langs']:

if lang in ('cmn', 'wuu', 'yue', 'zh', 'jpn', 'ja'):
if lang in ('jpn', 'ja'):
# language not supported, ignoring
continue

