Skip to content
This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

Permalink
Merge pull request #174 from gnes-ai/fix-dict-indexer
Browse files Browse the repository at this point in the history
test(indexer): add unit test for dict indexer as service
  • Loading branch information
mergify[bot] authored Aug 29, 2019
2 parents 135dfa5 + 4efea72 commit 21d88e4
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 75 deletions.
73 changes: 7 additions & 66 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -14,94 +14,35 @@ A copy of the Apache License, Version 2.0 is included in this file.

Open Source Software Licensed Under the Apache License, Version 2.0:
-------------------------------------------------------------------------
1. TensorFlow 1.10.1
Copyright 2018 The TensorFlow Authors. All rights reserved.

2. aiohttp 3.5.4
1. aiohttp 3.5.4
Copyright 2013-2019 Nikolay Kim and Andrew Svetlov.

3. grpc 1.21.2
2. grpc 1.21.2
Copyright 2014 gRPC authors.


Open Source Software Licensed Under the BSD 2-Clause License:
-------------------------------------------------------------------------

1. imagehash 4.0
Copyright (c) 2013-2016, Johannes Buchner

Open Source Software Licensed Under the BSD 3-Clause License:
-------------------------------------------------------------------------
1. pandas 0.24.2
Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc.
and PyData Development Team
All rights reserved.

2. numpy 1.16.2
1. numpy 1.16.2
Copyright © 2005-2019, NumPy Developers.
All rights reserved.

3. psutil 5.6.2
Copyright (c) 2009, Jay Loden, Dave Daeschler, Giampaolo Rodola'
All rights reserved.

4. memory-profiler 0.55.0
Copyright (c) 2007–2014 Fabian Pedregosa.
All rights reserved.

5. faiss 0.1
Copyright (c) 2016-present, Facebook, Inc.
All rights reserved.

6. PyZMQ 18.0.1
2. PyZMQ 18.0.1
Copyright (c) 2009-2012, Brian Granger, Min Ragan-Kelley
All rights reserved.

7. PyTorch 1.0.1.post2
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio,
Johnny Mariethoz)

8. protobuf 3.7.1
3. protobuf 3.7.1
Copyright 2008 Google Inc.
All rights reserved.

9. torchvision 0.3.0
Copyright (c) Soumith Chintala 2016,
All rights reserved.

Open Source Software Licensed Under the MIT License:
-------------------------------------------------------------------------
1. bert-as-service 1.8.9
Copyright (c) 2018 Han Xiao.

2. termcolor 1.1.0
1. termcolor 1.1.0
Copyright (c) 2008-2011 Konstantin Lepa.

3. GPUtil 1.4.0
Copyright (c) 2017 anderskm.

4. flair 0.4.1
Flair is licensed under the following MIT License (MIT) Copyright © 2018 Zalando
SE, https://tech.zalando.com

5. ruamel.yaml 0.15.94
2. ruamel.yaml 0.15.94
Copyright (c) 2014-2019 Anthon van der Neut, Ruamel bvba

6. jieba 0.39
Copyright (c) 2013 Sun Junyi

7. opencv-python 4.0.0
Copyright (c) 2016-2018 Olli-Pekka Heinisuo and contributors




Expand Down
9 changes: 6 additions & 3 deletions gnes/preprocessor/text/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@


class SentSplitPreprocessor(BaseTextPreprocessor):
def __init__(self, max_sent_len: int = 256,
def __init__(self,
min_sent_len: int = 1,
max_sent_len: int = 256,
deliminator: str = '.!?。!?',
is_json: bool= False,
is_json: bool = False,
*args, **kwargs):
super().__init__(*args, **kwargs)
self.min_sent_len = min_sent_len
self.max_sent_len = max_sent_len
self.deliminator = deliminator
self.is_json = is_json
Expand All @@ -46,7 +49,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
for ci, (r, s, e) in enumerate(ret):
f = ''.join(filter(lambda x: x in string.printable, r))
f = re.sub('\n+', ' ', f).strip()
if f:
if len(f) > self.min_sent_len:
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.text = f[:self.max_sent_len]
Expand Down
3 changes: 3 additions & 0 deletions gnes/service/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
embeds = None

for d in docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)

for c in d.chunks:
chunks.append(c)
if d.doc_type == gnes_pb2.Document.TEXT:
Expand Down
16 changes: 16 additions & 0 deletions gnes/service/frontend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Tencent is pleased to support the open source community by making GNES available.
#
# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import threading
from concurrent.futures import ThreadPoolExecutor

Expand Down
10 changes: 5 additions & 5 deletions gnes/service/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
for d in msg.request.index.docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)
else:
vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)

Expand Down
52 changes: 51 additions & 1 deletion tests/test_dict_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
import unittest
from shutil import rmtree

import grpc

from gnes.cli.parser import set_frontend_parser, set_preprocessor_parser, set_indexer_parser
from gnes.indexer.base import BaseIndexer
from gnes.indexer.doc.filesys import DirectoryIndexer
from gnes.preprocessor.base import BasePreprocessor
from gnes.proto import gnes_pb2
from gnes.proto import gnes_pb2, gnes_pb2_grpc, RequestGenerator
from gnes.service.base import SocketType, ServiceManager
from gnes.service.frontend import FrontendService
from gnes.service.indexer import IndexerService
from gnes.service.preprocessor import PreprocessorService


class TestDictIndexer(unittest.TestCase):
Expand All @@ -22,9 +30,51 @@ def setUp(self):

self.init_db()

def test_pymode(self):
    """End-to-end test of DictIndexer as a service.

    Wires Frontend -> Preprocessor -> Indexer over ZMQ sockets, streams a
    fixture file through the pipeline via gRPC, then loads the persisted
    index from disk and checks it contains all 26 documents.
    """
    # Clear proxy env vars so the local gRPC channel below is not routed
    # through an HTTP(S) proxy.
    os.unsetenv('http_proxy')
    os.unsetenv('https_proxy')
    args = set_frontend_parser().parse_args([])

    # Preprocessor: pulls from the frontend's out port, pushes on 5531.
    p_args = set_preprocessor_parser().parse_args([
        '--port_in', str(args.port_out),
        '--port_out', '5531',
        '--socket_in', str(SocketType.PULL_CONNECT),
        '--socket_out', str(SocketType.PUSH_BIND),
        '--yaml_path', 'SentSplitPreprocessor'
    ])

    # Indexer: pulls from the preprocessor, pushes back to the frontend's
    # in port, closing the loop. The inline YAML names the model
    # 'dummy_dict_indexer', which determines the dump file checked below.
    e_args = set_indexer_parser().parse_args([
        '--port_in', str(p_args.port_out),
        '--port_out', str(args.port_in),
        '--socket_in', str(SocketType.PULL_CONNECT),
        '--socket_out', str(SocketType.PUSH_CONNECT),
        '--yaml_path', '!DictIndexer {gnes_config: {name: dummy_dict_indexer}}',
    ])

    # Start all three services; raise the gRPC message-size caps to 70 MB
    # so large index payloads are not rejected.
    with ServiceManager(IndexerService, e_args), \
            ServiceManager(PreprocessorService, p_args), \
            FrontendService(args), \
            grpc.insecure_channel('%s:%s' % (args.grpc_host, args.grpc_port),
                                  options=[('grpc.max_send_message_length', 70 * 1024 * 1024),
                                           ('grpc.max_receive_message_length', 70 * 1024 * 1024)]) as channel:
        stub = gnes_pb2_grpc.GnesRPCStub(channel)
        all_bytes = []
        # NOTE(review): assumes self.dirname points at the test fixture
        # directory (presumably set in setUp, not visible here — confirm).
        with open(os.path.join(self.dirname, '26-doc-chinese.txt'), 'r', encoding='utf8') as fp:
            for v in fp:
                if v.strip():  # skip blank lines; one document per line
                    all_bytes.append(v.encode())
        # Stream all documents through the pipeline as index requests.
        for r in stub.StreamCall(RequestGenerator.index(all_bytes)):
            print(r)

    # Services have shut down; the indexer dumped its state on exit.
    bi = BaseIndexer.load('dummy_dict_indexer.bin')
    self.assertEqual(bi.size, 26)
    print(bi.query([0]))

def tearDown(self):
    """Clean up artifacts left behind by the tests: the on-disk data
    directory and the serialized dummy indexer dump, if present."""
    data_dir = self.data_path
    if os.path.exists(data_dir):
        rmtree(data_dir)
    index_dump = 'dummy_dict_indexer.bin'
    if os.path.exists(index_dump):
        os.remove(index_dump)

def init_db(self):
self.db = DirectoryIndexer(self.data_path)
Expand Down

0 comments on commit 21d88e4

Please sign in to comment.