diff --git a/LICENSE b/LICENSE
index 1999bd63..921ba47c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -14,94 +14,35 @@ A copy of the Apache License, Version 2.0 is included in this file.
 Open Source Software Licensed Under the Apache License, Version 2.0:
 -------------------------------------------------------------------------
-1. TensorFlow 1.10.1
-Copyright 2018 The TensorFlow Authors. All rights reserved.
-
-2. aiohttp 3.5.4
+1. aiohttp 3.5.4
 Copyright 2013-2019 Nikolay Kim and Andrew Svetlov.
 
-3. grpc 1.21.2
+2. grpc 1.21.2
 Copyright 2014 gRPC authors.
 
-Open Source Software Licensed Under the BSD 2-Clause License:
--------------------------------------------------------------------------
-
-1. imagehash 4.0
-Copyright (c) 2013-2016, Johannes Buchner
-
 Open Source Software Licensed Under the BSD 3-Clause License:
 -------------------------------------------------------------------------
-1. pandas 0.24.2
-Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc.
-and PyData Development Team
-All rights reserved.
-
-2. numpy 1.16.2
+1. numpy 1.16.2
 Copyright © 2005-2019, NumPy Developers.
 All rights reserved.
 
-3. psutil 5.6.2
-Copyright (c) 2009, Jay Loden, Dave Daeschler, Giampaolo Rodola'
-All rights reserved.
-
-4. memory-profiler 0.55.0
-Copyright (c) 2007-2014 Fabian Pedregosa.
-All rights reserved.
-
-5. faiss 0.1
-Copyright (c) 2016-present, Facebook, Inc.
-All rights reserved.
-
-6. PyZMQ 18.0.1
+2. PyZMQ 18.0.1
 Copyright (c) 2009-2012, Brian Granger, Min Ragan-Kelley
 All rights reserved.
 
-7. PyTorch 1.0.1.post2
-Copyright (c) 2016- Facebook, Inc (Adam Paszke)
-Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
-Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
-Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-Copyright (c) 2011-2013 NYU (Clement Farabet)
-Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
-Iain Melvin, Jason Weston)
-Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
-Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio,
-Johnny Mariethoz)
-
-8. protobuf 3.7.1
+3. protobuf 3.7.1
 Copyright 2008 Google Inc. All rights reserved.
 
-9. torchvision 0.3.0
-Copyright (c) Soumith Chintala 2016,
-All rights reserved.
-
 Open Source Software Licensed Under the MIT License:
 -------------------------------------------------------------------------
-1. bert-as-service 1.8.9
-Copyright (c) 2018 Han Xiao.
-
-2. termcolor 1.1.0
+1. termcolor 1.1.0
 Copyright (c) 2008-2011 Konstantin Lepa.
 
-3. GPUtil 1.4.0
-Copyright (c) 2017 anderskm.
-
-4. flair 0.4.1
-Flair is licensed under the following MIT License (MIT) Copyright © 2018 Zalando
-SE, https://tech.zalando.com
-
-5. ruamel.yaml 0.15.94
+2. ruamel.yaml 0.15.94
 Copyright (c) 2014-2019 Anthon van der Neut, Ruamel bvba
 
-6. jieba 0.39
-Copyright (c) 2013 Sun Junyi
-
-7. opencv-python 4.0.0
-Copyright (c) 2016-2018 Olli-Pekka Heinisuo and contributors
-
diff --git a/gnes/preprocessor/text/split.py b/gnes/preprocessor/text/split.py
index 1b10b9bd..5aa2b7a6 100644
--- a/gnes/preprocessor/text/split.py
+++ b/gnes/preprocessor/text/split.py
@@ -22,11 +22,14 @@ class SentSplitPreprocessor(BaseTextPreprocessor):
 
-    def __init__(self, max_sent_len: int = 256,
+    def __init__(self,
+                 min_sent_len: int = 1,
+                 max_sent_len: int = 256,
                  deliminator: str = '.!?。!?',
-                 is_json: bool= False,
+                 is_json: bool = False,
                  *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.min_sent_len = min_sent_len
         self.max_sent_len = max_sent_len
         self.deliminator = deliminator
         self.is_json = is_json
@@ -46,7 +49,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
         for ci, (r, s, e) in enumerate(ret):
             f = ''.join(filter(lambda x: x in string.printable, r))
             f = re.sub('\n+', ' ', f).strip()
-            if f:
+            if len(f) > self.min_sent_len:
                 c = doc.chunks.add()
                 c.doc_id = doc.doc_id
                 c.text = f[:self.max_sent_len]
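
The split.py change above adds a configurable lower bound on sentence length. Below is a minimal standalone sketch of the new filtering rule (the function name and test strings are illustrative, not part of the patch); note that the strict '>' comparison means a sentence of exactly min_sent_len characters is dropped.

import re
import string


def filter_sentences(sentences, min_sent_len=1, max_sent_len=256):
    # Mimics the body of SentSplitPreprocessor.apply(): keep printable
    # characters, collapse newlines, then apply the new length filter.
    kept = []
    for r in sentences:
        f = ''.join(filter(lambda x: x in string.printable, r))
        f = re.sub('\n+', ' ', f).strip()
        if len(f) > min_sent_len:  # strict '>', exactly as in the patch
            kept.append(f[:max_sent_len])
    return kept


print(filter_sentences(['a', 'ab', 'hello world.\nsecond line']))
# -> ['ab', 'hello world. second line']  ('a' is dropped: 1 > 1 is False)
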
diff --git a/gnes/service/encoder.py b/gnes/service/encoder.py
index ae28af6e..b7c07ed4 100644
--- a/gnes/service/encoder.py
+++ b/gnes/service/encoder.py
@@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
         embeds = None
 
         for d in docs:
+            if not d.chunks:
+                raise ServiceError('document contains no chunks! doc: %s' % d)
+
             for c in d.chunks:
                 chunks.append(c)
                 if d.doc_type == gnes_pb2.Document.TEXT:
diff --git a/gnes/service/frontend.py b/gnes/service/frontend.py
index c8bb9112..e5ded6e7 100644
--- a/gnes/service/frontend.py
+++ b/gnes/service/frontend.py
@@ -1,3 +1,19 @@
+# Tencent is pleased to support the open source community by making GNES available.
+#
+# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 import threading
 from concurrent.futures import ThreadPoolExecutor
 
diff --git a/gnes/service/indexer.py b/gnes/service/indexer.py
index 91306c53..57a0be33 100644
--- a/gnes/service/indexer.py
+++ b/gnes/service/indexer.py
@@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
         for d in msg.request.index.docs:
             if not d.chunks:
                 raise ServiceError('document contains no chunks! doc: %s' % d)
-            else:
-                vecs += [blob2array(c.embedding) for c in d.chunks]
-                doc_ids += [d.doc_id] * len(d.chunks)
-                offsets += [c.offset for c in d.chunks]
-                weights += [c.weight for c in d.chunks]
+
+            vecs += [blob2array(c.embedding) for c in d.chunks]
+            doc_ids += [d.doc_id] * len(d.chunks)
+            offsets += [c.offset for c in d.chunks]
+            weights += [c.weight for c in d.chunks]
 
         self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0),
                         weights)
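
The encoder and indexer changes above apply the same guard-clause pattern: raise as soon as a document without chunks is seen, which lets the accumulation code drop its else: branch and dedent. A hedged sketch of the pattern follows, with plain dicts standing in for the protobuf chunk objects; gather_chunks is an illustrative name, not a GNES function.

class ServiceError(Exception):
    pass


def gather_chunks(docs):
    # docs: iterable of (doc_id, chunks) pairs, where each chunk is a dict
    # with 'offset' and 'weight' keys -- stand-ins for gnes_pb2 objects.
    doc_ids, offsets, weights = [], [], []
    for doc_id, chunks in docs:
        if not chunks:
            # fail fast instead of silently indexing an empty document
            raise ServiceError('document contains no chunks! doc: %s' % doc_id)

        # unconditionally reached for every non-empty document,
        # so no 'else:' branch is needed
        doc_ids += [doc_id] * len(chunks)
        offsets += [c['offset'] for c in chunks]
        weights += [c['weight'] for c in chunks]
    return list(zip(doc_ids, offsets)), weights


keys, w = gather_chunks([(0, [{'offset': 0, 'weight': 1.0}])])
print(keys, w)  # [(0, 0)] [1.0]
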
diff --git a/tests/test_dict_indexer.py b/tests/test_dict_indexer.py
index fbf8ca16..bbda790c 100644
--- a/tests/test_dict_indexer.py
+++ b/tests/test_dict_indexer.py
@@ -2,9 +2,17 @@
 import unittest
 from shutil import rmtree
 
+import grpc
+
+from gnes.cli.parser import set_frontend_parser, set_preprocessor_parser, set_indexer_parser
+from gnes.indexer.base import BaseIndexer
 from gnes.indexer.doc.filesys import DirectoryIndexer
 from gnes.preprocessor.base import BasePreprocessor
-from gnes.proto import gnes_pb2
+from gnes.proto import gnes_pb2, gnes_pb2_grpc, RequestGenerator
+from gnes.service.base import SocketType, ServiceManager
+from gnes.service.frontend import FrontendService
+from gnes.service.indexer import IndexerService
+from gnes.service.preprocessor import PreprocessorService
 
 
 class TestDictIndexer(unittest.TestCase):
@@ -22,9 +30,51 @@ def setUp(self):
 
         self.init_db()
 
+    def test_pymode(self):
+        os.unsetenv('http_proxy')
+        os.unsetenv('https_proxy')
+        args = set_frontend_parser().parse_args([])
+
+        p_args = set_preprocessor_parser().parse_args([
+            '--port_in', str(args.port_out),
+            '--port_out', '5531',
+            '--socket_in', str(SocketType.PULL_CONNECT),
+            '--socket_out', str(SocketType.PUSH_BIND),
+            '--yaml_path', 'SentSplitPreprocessor'
+        ])
+
+        e_args = set_indexer_parser().parse_args([
+            '--port_in', str(p_args.port_out),
+            '--port_out', str(args.port_in),
+            '--socket_in', str(SocketType.PULL_CONNECT),
+            '--socket_out', str(SocketType.PUSH_CONNECT),
+            '--yaml_path', '!DictIndexer {gnes_config: {name: dummy_dict_indexer}}',
+        ])
+
+        with ServiceManager(IndexerService, e_args), \
+                ServiceManager(PreprocessorService, p_args), \
+                FrontendService(args), \
+                grpc.insecure_channel('%s:%s' % (args.grpc_host, args.grpc_port),
+                                      options=[('grpc.max_send_message_length', 70 * 1024 * 1024),
+                                               ('grpc.max_receive_message_length', 70 * 1024 * 1024)]) as channel:
+            stub = gnes_pb2_grpc.GnesRPCStub(channel)
+            all_bytes = []
+            with open(os.path.join(self.dirname, '26-doc-chinese.txt'), 'r', encoding='utf8') as fp:
+                for v in fp:
+                    if v.strip():
+                        all_bytes.append(v.encode())
+            for r in stub.StreamCall(RequestGenerator.index(all_bytes)):
+                print(r)
+
+        bi = BaseIndexer.load('dummy_dict_indexer.bin')
+        self.assertEqual(bi.size, 26)
+        print(bi.query([0]))
+
     def tearDown(self):
         if os.path.exists(self.data_path):
             rmtree(self.data_path)
+        if os.path.exists('dummy_dict_indexer.bin'):
+            os.remove('dummy_dict_indexer.bin')
 
     def init_db(self):
         self.db = DirectoryIndexer(self.data_path)
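
The new test_pymode drives a full frontend -> preprocessor -> indexer pipeline over gRPC. For reference, a hedged sketch of the same client flow against an already-running GNES frontend; the host, port, and document bytes are placeholders, so adjust them to wherever your FrontendService listens.

import grpc

from gnes.proto import gnes_pb2_grpc, RequestGenerator

# 'localhost:8800' is a placeholder address, not a value from the patch.
with grpc.insecure_channel('localhost:8800') as channel:
    stub = gnes_pb2_grpc.GnesRPCStub(channel)
    docs = [b'the first document.', b'the second document.']
    # stream index requests built from raw bytes and print each response
    for resp in stub.StreamCall(RequestGenerator.index(docs)):
        print(resp)
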