diff --git a/README.md b/README.md index fb2c8eca..bcbc1c53 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,7 @@ Now let's see what the YAML config says. First impression, it is pretty intuitiv Preprocessor config: text-prep.yml (click to expand...) ```yaml -!TextPreprocessor +!PunctSplitPreprocessor parameters: start_doc_id: 0 random_doc_id: True diff --git a/gnes/preprocessor/__init__.py b/gnes/preprocessor/__init__.py index c3a75c8c..689d251d 100644 --- a/gnes/preprocessor/__init__.py +++ b/gnes/preprocessor/__init__.py @@ -33,7 +33,7 @@ 'FFmpegVideoSegmentor': 'video.ffmpeg', 'ShotDetectPreprocessor': 'video.shotdetect', 'AudioVanilla': 'audio.audio_vanilla', - 'BaseAudioPreprocessor': 'base' + 'BaseAudioPreprocessor': 'base', 'RawChunkPreprocessor': 'base' } diff --git a/tests/contrib/fake_faiss.py b/tests/contrib/fake_faiss.py index 06108c24..7429d37e 100644 --- a/tests/contrib/fake_faiss.py +++ b/tests/contrib/fake_faiss.py @@ -8,4 +8,3 @@ def __init__(self, bar: int, *args, **kwargs): self.is_trained = True self.bar = bar self.logger.info('look at me, I override the original GNES faiss indexer') - diff --git a/tests/contrib/fake_faiss2.py b/tests/contrib/fake_faiss2.py index 68003011..4a19a493 100644 --- a/tests/contrib/fake_faiss2.py +++ b/tests/contrib/fake_faiss2.py @@ -8,4 +8,3 @@ def __init__(self, bar: int, *args, **kwargs): self.is_trained = True self.bar = bar self.logger.info('look at me, I override the overrided faiss indexer!!!') - diff --git a/tests/test_annoyindexer.py b/tests/test_annoyindexer.py index 014cadf7..17252ec1 100644 --- a/tests/test_annoyindexer.py +++ b/tests/test_annoyindexer.py @@ -1,10 +1,8 @@ import os -import shutil import unittest import numpy as np -from gnes.helper import touch_dir from gnes.indexer.vector.annoy import AnnoyIndexer diff --git a/tests/test_audio_preprocessor.py b/tests/test_audio_preprocessor.py index f941ac1f..8bde495f 100644 --- a/tests/test_audio_preprocessor.py +++ b/tests/test_audio_preprocessor.py @@ -43,4 +43,4 @@ def test_video_preprocessor_service_realdata(self): self.assertGreater(len(d.chunks), 0) for _ in range(len(d.chunks)): shape = blob2array(d.chunks[_].blob).shape - self.assertEqual(len(shape), 1) \ No newline at end of file + self.assertEqual(len(shape), 1) diff --git a/tests/test_bindexer.py b/tests/test_bindexer.py index def921e0..a62c8722 100644 --- a/tests/test_bindexer.py +++ b/tests/test_bindexer.py @@ -22,7 +22,7 @@ def setUp(self): [3, 2, 1, 2]]).astype(np.uint8) self.toy_exp = [[(234, 0, 1., 1,), (123, 1, 1., 1)], [(432, 0, 1., 1), (1, 0, 1., 1)], - [(234, 0, 1., 0.75), (123, 1, 1., 0.75)]] + [(234, 0, 1., 0.75), (123, 1, 1., 0.75)]] self.weights = [1.] * len(self.toy_label) dirname = os.path.dirname(__file__) diff --git a/tests/test_dict_indexer.py b/tests/test_dict_indexer.py index 851a4d68..457156bc 100644 --- a/tests/test_dict_indexer.py +++ b/tests/test_dict_indexer.py @@ -48,5 +48,3 @@ def test_query_docs(self): res = self.db.query(query_list) num_non_empty = sum(1 for d in res if d) self.assertEqual(num_non_empty, 1) - - diff --git a/tests/test_gif.py b/tests/test_gif.py index 9617be61..9267de5e 100644 --- a/tests/test_gif.py +++ b/tests/test_gif.py @@ -1,10 +1,10 @@ +import copy import os import unittest from gnes.preprocessor.base import BasePreprocessor from gnes.preprocessor.video.ffmpeg import FFmpegVideoSegmentor from gnes.proto import gnes_pb2 -import copy class TestPartition(unittest.TestCase): diff --git a/tests/test_gpt_encoder.py b/tests/test_gpt_encoder.py index 26031dbc..76d96380 100644 --- a/tests/test_gpt_encoder.py +++ b/tests/test_gpt_encoder.py @@ -39,4 +39,4 @@ def test_dump_load(self): def tearDown(self): if os.path.exists(self.dump_path): - os.remove(self.dump_path) \ No newline at end of file + os.remove(self.dump_path) diff --git a/tests/test_hash_encoder.py b/tests/test_hash_encoder.py index 86a36cec..36e0e2ee 100644 --- a/tests/test_hash_encoder.py +++ b/tests/test_hash_encoder.py @@ -31,7 +31,7 @@ def test_train_pred(self): out = m.encode(self.test_data) self.assertEqual(self.x, out.shape[0]) - self.assertEqual(self.num_idx+self.num_bytes, out.shape[1]) + self.assertEqual(self.num_idx + self.num_bytes, out.shape[1]) self.assertEqual(np.uint32, out.dtype) def test_yaml_load(self): @@ -39,4 +39,4 @@ def test_yaml_load(self): pca_hash.train(self.test_data) out = pca_hash.encode(self.test_data) self.assertEqual(self.x, out.shape[0]) - self.assertEqual(self.num_idx+self.num_bytes, out.shape[1]) + self.assertEqual(self.num_idx + self.num_bytes, out.shape[1]) diff --git a/tests/test_hash_indexer.py b/tests/test_hash_indexer.py index efb36308..6c4ec711 100644 --- a/tests/test_hash_indexer.py +++ b/tests/test_hash_indexer.py @@ -1,8 +1,10 @@ import os import unittest + import numpy as np + from gnes.indexer.vector.hbindexer import HBIndexer -import shutil + class TestMHIndexer(unittest.TestCase): @@ -13,7 +15,7 @@ def setUp(self): self.n = 100 self.test_label = [(_, 1) for _ in range(self.n)] - t = np.random.randint(0, 100, size=[self.n, self.n_idx+self.num_bytes]) + t = np.random.randint(0, 100, size=[self.n, self.n_idx + self.num_bytes]) self.test_data = t.astype(np.uint32) self.weights = [1.] * len(self.test_label) self.data_path = 'test_path' diff --git a/tests/test_image_encoder.py b/tests/test_image_encoder.py index 07f60501..d1013ac8 100644 --- a/tests/test_image_encoder.py +++ b/tests/test_image_encoder.py @@ -3,7 +3,7 @@ import unittest import zipfile -from gnes.encoder.image.base import BasePytorchEncoder +from gnes.encoder.base import BaseEncoder from gnes.preprocessor.base import UnaryPreprocessor, PipelinePreprocessor from gnes.preprocessor.image.resize import ResizeChunkPreprocessor from gnes.preprocessor.image.sliding_window import VanillaSlidingPreprocessor @@ -45,7 +45,7 @@ def setUp(self): self.mobilenet_yaml = os.path.join(dirname, 'yaml', 'mobilenet-encoder.yml') def test_vgg_encoding(self): - self.encoder = BasePytorchEncoder.load_yaml(self.vgg_yaml) + self.encoder = BaseEncoder.load_yaml(self.vgg_yaml) for test_img in self.test_img: vec = self.encoder.encode(test_img) print("the length of data now is:", len(test_img)) @@ -53,7 +53,7 @@ def test_vgg_encoding(self): self.assertEqual(vec.shape[1], 4096) def test_resnet_encoding(self): - self.encoder = BasePytorchEncoder.load_yaml(self.res_yaml) + self.encoder = BaseEncoder.load_yaml(self.res_yaml) for test_img in self.test_img: vec = self.encoder.encode(test_img) print("the length of data now is:", len(test_img)) @@ -61,7 +61,7 @@ def test_resnet_encoding(self): self.assertEqual(vec.shape[1], 2048) def test_inception_encoding(self): - self.encoder = BasePytorchEncoder.load_yaml(self.inception_yaml) + self.encoder = BaseEncoder.load_yaml(self.inception_yaml) for test_img in self.test_img: vec = self.encoder.encode(test_img) print("the length of data now is:", len(test_img)) @@ -69,7 +69,7 @@ def test_inception_encoding(self): self.assertEqual(vec.shape[1], 2048) def test_mobilenet_encoding(self): - self.encoder = BasePytorchEncoder.load_yaml(self.mobilenet_yaml) + self.encoder = BaseEncoder.load_yaml(self.mobilenet_yaml) for test_img in self.test_img: vec = self.encoder.encode(test_img) print("the length of data now is:", len(test_img)) @@ -77,11 +77,11 @@ def test_mobilenet_encoding(self): self.assertEqual(vec.shape[1], 1280) def test_dump_load(self): - self.encoder = BasePytorchEncoder.load_yaml(self.vgg_yaml) + self.encoder = BaseEncoder.load_yaml(self.vgg_yaml) self.encoder.dump(self.dump_path) - vgg_encoder2 = BasePytorchEncoder.load(self.dump_path) + vgg_encoder2 = BaseEncoder.load(self.dump_path) for test_img in self.test_img: vec = vgg_encoder2.encode(test_img) diff --git a/tests/test_mfcc_encoder.py b/tests/test_mfcc_encoder.py index 6cb97222..7210a147 100644 --- a/tests/test_mfcc_encoder.py +++ b/tests/test_mfcc_encoder.py @@ -31,4 +31,4 @@ def test_mfcc_encoding(self): vec = self.encoder.encode(self.audios) self.assertEqual(len(vec.shape), 2) self.assertEqual(vec.shape[0], len(self.audios)) - self.assertEqual(vec.shape[1] % self.encoder.n_mfcc, 0) \ No newline at end of file + self.assertEqual(vec.shape[1] % self.encoder.n_mfcc, 0) diff --git a/tests/test_onnx_image_encoder.py b/tests/test_onnx_image_encoder.py index fe501a70..f78d1eef 100644 --- a/tests/test_onnx_image_encoder.py +++ b/tests/test_onnx_image_encoder.py @@ -9,6 +9,7 @@ from gnes.preprocessor.image.sliding_window import VanillaSlidingPreprocessor from gnes.proto import gnes_pb2, blob2array + def img_process_for_test(dirname): zipfile_ = zipfile.ZipFile(os.path.join(dirname, 'imgs/test.zip')) all_bytes = [zipfile_.open(v).read() for v in zipfile_.namelist()] @@ -31,6 +32,7 @@ def img_process_for_test(dirname): for img in test_img_copy for chunk in img.chunks]) return test_img_all_preprocessor + class TestONNXImageEncoder(unittest.TestCase): def setUp(self): diff --git a/tests/test_pytorch_transformers_encoder.py b/tests/test_pytorch_transformers_encoder.py index 271654fa..59c37b11 100644 --- a/tests/test_pytorch_transformers_encoder.py +++ b/tests/test_pytorch_transformers_encoder.py @@ -3,6 +3,7 @@ from gnes.encoder.text.torch_transformers import TorchTransformersEncoder + class TestTorchTransformersEncoder(unittest.TestCase): def setUp(self): diff --git a/tests/test_router.py b/tests/test_router.py index 3c1421d1..dbb3d823 100644 --- a/tests/test_router.py +++ b/tests/test_router.py @@ -17,7 +17,7 @@ def setUp(self): self.publish_router_yaml = '!PublishRouter {parameters: {num_part: 2}}' self.batch_router_yaml = '!DocBatchRouter {gnes_config: {batch_size: 2}}' self.reduce_router_yaml = 'BaseReduceRouter' - self.chunk_router_yaml = 'ChunkToDocumentRouter' + self.chunk_router_yaml = 'ChunkToDocRouter' self.chunk_sum_yaml = 'ChunkSumRouter' self.doc_router_yaml = 'DocFillRouter' self.doc_sum_yaml = 'DocSumRouter' diff --git a/tests/test_video_preprocessor.py b/tests/test_video_preprocessor.py index c53f1a17..bf0c0194 100644 --- a/tests/test_video_preprocessor.py +++ b/tests/test_video_preprocessor.py @@ -65,7 +65,7 @@ def test_video_cut_by_frame(self): r = client.recv_message() for d in r.request.index.docs: self.assertGreater(len(d.chunks), 0) - for _ in range(len(d.chunks)-1): + for _ in range(len(d.chunks) - 1): shape = blob2array(d.chunks[_].blob).shape self.assertEqual(shape, (30, 168, 192, 3)) shape = blob2array(d.chunks[-1].blob).shape diff --git a/tests/yaml/preprocessor1.yml b/tests/yaml/preprocessor1.yml index f510a5e9..e9270bd6 100644 --- a/tests/yaml/preprocessor1.yml +++ b/tests/yaml/preprocessor1.yml @@ -1,4 +1,4 @@ -!TextPreprocessor +!PunctSplitPreprocessor parameters: start_doc_id: 0 random_doc_id: True diff --git a/tests/yaml/router-chunk-reduce.yml b/tests/yaml/router-chunk-reduce.yml index dd69d91a..f3eb4f3e 100644 --- a/tests/yaml/router-chunk-reduce.yml +++ b/tests/yaml/router-chunk-reduce.yml @@ -1 +1 @@ -!ChunkToDocumentRouter {} \ No newline at end of file +!ChunkToDocRouter {} \ No newline at end of file diff --git a/tests/yaml/test-preprocessor.yml b/tests/yaml/test-preprocessor.yml index f510a5e9..e9270bd6 100644 --- a/tests/yaml/test-preprocessor.yml +++ b/tests/yaml/test-preprocessor.yml @@ -1,4 +1,4 @@ -!TextPreprocessor +!PunctSplitPreprocessor parameters: start_doc_id: 0 random_doc_id: True diff --git a/tutorials/component-yaml-spec.md b/tutorials/component-yaml-spec.md index b43e8a29..b6e656d4 100644 --- a/tutorials/component-yaml-spec.md +++ b/tutorials/component-yaml-spec.md @@ -65,10 +65,9 @@ In this example, we define a `BasePytorchEncoder` that loads a pretrained VGG16 |`!CLS`| Component Type | |---|---| |`!BasePreprocessor`|Preprocessor| -|`!TextPreprocessor`|Preprocessor| +|`!PunctSplitPreprocessor`|Preprocessor| |`!BaseImagePreprocessor`|Preprocessor| |`!BaseTextPreprocessor`|Preprocessor| -|`!BaseSlidingPreprocessor`|Preprocessor| |`!VanillaSlidingPreprocessor`|Preprocessor| |`!WeightedSlidingPreprocessor`|Preprocessor| |`!SegmentPreprocessor`|Preprocessor| @@ -110,7 +109,7 @@ In this example, we define a `BasePytorchEncoder` that loads a pretrained VGG16 |`!BaseRouter`|Router| |`!BaseMapRouter`|Router| |`!BaseReduceRouter`|Router| -|`!ChunkToDocumentRouter`|Router| +|`!ChunkToDocRouter`|Router| |`!DocFillRouter`|Router| |`!ConcatEmbedRouter`|Router| |`!PublishRouter`|Router| @@ -216,7 +215,7 @@ Note that how we defines a map under `kwargs` to describe the arguments, they wi The examples above are all about encoder. In fact, every component including encoder, preprocessor, router, indexer can all be described with YAML and loaded to GNES. For example, ```yaml -!TextPreprocessor +!PunctSplitPreprocessor parameters: start_doc_id: 0 random_doc_id: True diff --git a/yaml-example/component/preprocessor.yml b/yaml-example/component/preprocessor.yml index f510a5e9..e9270bd6 100644 --- a/yaml-example/component/preprocessor.yml +++ b/yaml-example/component/preprocessor.yml @@ -1,4 +1,4 @@ -!TextPreprocessor +!PunctSplitPreprocessor parameters: start_doc_id: 0 random_doc_id: True