Skip to content
This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

Permalink
Merge pull request #174 from gnes-ai/fix-dict-indexer
Browse files Browse the repository at this point in the history
test(indexer): add unit test for dict indexer as service
  • Loading branch information
mergify[bot] authored Aug 29, 2019
2 parents 135dfa5 + 4efea72 commit 21d88e4
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 75 deletions.
73 changes: 7 additions & 66 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -14,94 +14,35 @@ A copy of the Apache License, Version 2.0 is included in this file.

Open Source Software Licensed Under the Apache License, Version 2.0:
-------------------------------------------------------------------------
1. TensorFlow 1.10.1
Copyright 2018 The TensorFlow Authors. All rights reserved.

2. aiohttp 3.5.4
1. aiohttp 3.5.4
Copyright 2013-2019 Nikolay Kim and Andrew Svetlov.

3. grpc 1.21.2
2. grpc 1.21.2
Copyright 2014 gRPC authors.


Open Source Software Licensed Under the BSD 2-Clause License:
-------------------------------------------------------------------------

1. imagehash 4.0
Copyright (c) 2013-2016, Johannes Buchner

Open Source Software Licensed Under the BSD 3-Clause License:
-------------------------------------------------------------------------
1. pandas 0.24.2
Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc.
and PyData Development Team
All rights reserved.

2. numpy 1.16.2
1. numpy 1.16.2
Copyright © 2005-2019, NumPy Developers.
All rights reserved.

3. psutil 5.6.2
Copyright (c) 2009, Jay Loden, Dave Daeschler, Giampaolo Rodola'
All rights reserved.

4. memory-profiler 0.55.0
Copyright (c) 2007–2014 Fabian Pedregosa.
All rights reserved.

5. faiss 0.1
Copyright (c) 2016-present, Facebook, Inc.
All rights reserved.

6. PyZMQ 18.0.1
2. PyZMQ 18.0.1
Copyright (c) 2009-2012, Brian Granger, Min Ragan-Kelley
All rights reserved.

7. PyTorch 1.0.1.post2
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio,
Johnny Mariethoz)

8. protobuf 3.7.1
3. protobuf 3.7.1
Copyright 2008 Google Inc.
All rights reserved.

9. torchvision 0.3.0
Copyright (c) Soumith Chintala 2016,
All rights reserved.

Open Source Software Licensed Under the MIT License:
-------------------------------------------------------------------------
1. bert-as-service 1.8.9
Copyright (c) 2018 Han Xiao.

2. termcolor 1.1.0
1. termcolor 1.1.0
Copyright (c) 2008-2011 Konstantin Lepa.

3. GPUtil 1.4.0
Copyright (c) 2017 anderskm.

4. flair 0.4.1
Flair is licensed under the following MIT License (MIT) Copyright © 2018 Zalando
SE, https://tech.zalando.com

5. ruamel.yaml 0.15.94
2. ruamel.yaml 0.15.94
Copyright (c) 2014-2019 Anthon van der Neut, Ruamel bvba

6. jieba 0.39
Copyright (c) 2013 Sun Junyi

7. opencv-python 4.0.0
Copyright (c) 2016-2018 Olli-Pekka Heinisuo and contributors




Expand Down
9 changes: 6 additions & 3 deletions gnes/preprocessor/text/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@


class SentSplitPreprocessor(BaseTextPreprocessor):
def __init__(self, max_sent_len: int = 256,
def __init__(self,
min_sent_len: int = 1,
max_sent_len: int = 256,
deliminator: str = '.!?。!?',
is_json: bool= False,
is_json: bool = False,
*args, **kwargs):
super().__init__(*args, **kwargs)
self.min_sent_len = min_sent_len
self.max_sent_len = max_sent_len
self.deliminator = deliminator
self.is_json = is_json
Expand All @@ -46,7 +49,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
for ci, (r, s, e) in enumerate(ret):
f = ''.join(filter(lambda x: x in string.printable, r))
f = re.sub('\n+', ' ', f).strip()
if f:
if len(f) > self.min_sent_len:
c = doc.chunks.add()
c.doc_id = doc.doc_id
c.text = f[:self.max_sent_len]
Expand Down
3 changes: 3 additions & 0 deletions gnes/service/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def embed_chunks_in_docs(self, docs: Union[List['gnes_pb2.Document'], 'gnes_pb2.
embeds = None

for d in docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)

for c in d.chunks:
chunks.append(c)
if d.doc_type == gnes_pb2.Document.TEXT:
Expand Down
16 changes: 16 additions & 0 deletions gnes/service/frontend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Tencent is pleased to support the open source community by making GNES available.
#
# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import threading
from concurrent.futures import ThreadPoolExecutor

Expand Down
10 changes: 5 additions & 5 deletions gnes/service/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ def _handler_chunk_index(self, msg: 'gnes_pb2.Message'):
for d in msg.request.index.docs:
if not d.chunks:
raise ServiceError('document contains no chunks! doc: %s' % d)
else:
vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

vecs += [blob2array(c.embedding) for c in d.chunks]
doc_ids += [d.doc_id] * len(d.chunks)
offsets += [c.offset for c in d.chunks]
weights += [c.weight for c in d.chunks]

self._model.add(list(zip(doc_ids, offsets)), np.concatenate(vecs, 0), weights)

Expand Down
52 changes: 51 additions & 1 deletion tests/test_dict_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
import unittest
from shutil import rmtree

import grpc

from gnes.cli.parser import set_frontend_parser, set_preprocessor_parser, set_indexer_parser
from gnes.indexer.base import BaseIndexer
from gnes.indexer.doc.filesys import DirectoryIndexer
from gnes.preprocessor.base import BasePreprocessor
from gnes.proto import gnes_pb2
from gnes.proto import gnes_pb2, gnes_pb2_grpc, RequestGenerator
from gnes.service.base import SocketType, ServiceManager
from gnes.service.frontend import FrontendService
from gnes.service.indexer import IndexerService
from gnes.service.preprocessor import PreprocessorService


class TestDictIndexer(unittest.TestCase):
Expand All @@ -22,9 +30,51 @@ def setUp(self):

self.init_db()

def test_pymode(self):
    """End-to-end test of DictIndexer as a service.

    Wires Frontend -> Preprocessor -> Indexer over ZMQ sockets, streams a
    fixture file through the pipeline via gRPC, then loads the persisted
    index from disk and checks it contains all 26 documents.
    """
    # Clear proxy env vars so the local gRPC channel below is not routed
    # through an HTTP(S) proxy.
    os.unsetenv('http_proxy')
    os.unsetenv('https_proxy')
    args = set_frontend_parser().parse_args([])

    # Preprocessor: pulls from the frontend's out port, pushes on 5531.
    p_args = set_preprocessor_parser().parse_args([
        '--port_in', str(args.port_out),
        '--port_out', '5531',
        '--socket_in', str(SocketType.PULL_CONNECT),
        '--socket_out', str(SocketType.PUSH_BIND),
        '--yaml_path', 'SentSplitPreprocessor'
    ])

    # Indexer: pulls from the preprocessor, pushes back to the frontend's
    # in port, closing the loop. The inline YAML names the model
    # 'dummy_dict_indexer', which determines the dump file checked below.
    e_args = set_indexer_parser().parse_args([
        '--port_in', str(p_args.port_out),
        '--port_out', str(args.port_in),
        '--socket_in', str(SocketType.PULL_CONNECT),
        '--socket_out', str(SocketType.PUSH_CONNECT),
        '--yaml_path', '!DictIndexer {gnes_config: {name: dummy_dict_indexer}}',
    ])

    # Start all three services; raise the gRPC message-size caps to 70 MB
    # so large index payloads are not rejected.
    with ServiceManager(IndexerService, e_args), \
            ServiceManager(PreprocessorService, p_args), \
            FrontendService(args), \
            grpc.insecure_channel('%s:%s' % (args.grpc_host, args.grpc_port),
                                  options=[('grpc.max_send_message_length', 70 * 1024 * 1024),
                                           ('grpc.max_receive_message_length', 70 * 1024 * 1024)]) as channel:
        stub = gnes_pb2_grpc.GnesRPCStub(channel)
        all_bytes = []
        # NOTE(review): assumes self.dirname points at the test fixture
        # directory (presumably set in setUp, not visible here — confirm).
        with open(os.path.join(self.dirname, '26-doc-chinese.txt'), 'r', encoding='utf8') as fp:
            for v in fp:
                if v.strip():  # skip blank lines; one document per line
                    all_bytes.append(v.encode())
        # Stream all documents through the pipeline as index requests.
        for r in stub.StreamCall(RequestGenerator.index(all_bytes)):
            print(r)

    # Services have shut down; the indexer dumped its state on exit.
    bi = BaseIndexer.load('dummy_dict_indexer.bin')
    self.assertEqual(bi.size, 26)
    print(bi.query([0]))

def tearDown(self):
    """Clean up artifacts left behind by the tests: the on-disk data
    directory and the serialized dummy indexer dump, if present."""
    data_dir = self.data_path
    if os.path.exists(data_dir):
        rmtree(data_dir)
    index_dump = 'dummy_dict_indexer.bin'
    if os.path.exists(index_dump):
        os.remove(index_dump)

def init_db(self):
self.db = DirectoryIndexer(self.data_path)
Expand Down

0 comments on commit 21d88e4

Please sign in to comment.