From 463cb491e347d204c7fe0b87f8b036a58b0ddacb Mon Sep 17 00:00:00 2001 From: Jem Date: Thu, 5 Sep 2019 10:37:00 +0800 Subject: [PATCH 1/4] feat(encoder): add quantizer --- gnes/encoder/__init__.py | 3 +- gnes/encoder/numeric/quantizer.py | 87 +++++++++++++++++++++++++++++++ tests/test_quantizer_encoder.py | 19 +++++++ tests/yaml/quantizer_encoder.yml | 7 +++ 4 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 gnes/encoder/numeric/quantizer.py create mode 100644 tests/test_quantizer_encoder.py create mode 100644 tests/yaml/quantizer_encoder.yml diff --git a/gnes/encoder/__init__.py b/gnes/encoder/__init__.py index fa32f137..11e7de52 100644 --- a/gnes/encoder/__init__.py +++ b/gnes/encoder/__init__.py @@ -44,7 +44,8 @@ 'PyTorchTransformers': 'text.transformer', 'VggishEncoder': 'audio.vggish', 'YouTube8MFeatureExtractor': 'video.yt8m_feature_extractor', - 'YouTube8MEncoder': 'video.yt8m_model' + 'YouTube8MEncoder': 'video.yt8m_model', + 'QuantizerEncoder': 'numeric.quantizer' } register_all_class(_cls2file_map, 'encoder') diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py new file mode 100644 index 00000000..efc3d384 --- /dev/null +++ b/gnes/encoder/numeric/quantizer.py @@ -0,0 +1,87 @@ +# Tencent is pleased to support the open source community by making GNES available. +# +# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np +from itertools import product + +from ..base import BaseBinaryEncoder +from ...helper import batching + + +class QuantizerEncoder(BaseBinaryEncoder): + batch_size = 2048 + + def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255, + upper_bound: int = 10000, + lower_bound: int = -10000, + divide_method: str = 'average', + *args, **kwargs): + super().__init__(*args, **kwargs) + assert 1 < cluster_per_byte <= 255, 'cluster number should >1 and <= 255 (0 is reserved for NOP)' + self.dim_per_byte = dim_per_byte + self.num_clusters = cluster_per_byte + self.upper_bound = upper_bound + self.lower_bound = lower_bound + self.divide_method = divide_method + self.centroids = None + + def train(self): + """ + calculate centroids for quantizer + two kinds of divide methods are supported now: average, random + average: split the space averagely and centroids of clusters lie on the corner of sub-space + random: randomly pick points and treat them as centroids of clusters + Variable Explaination: + num_sample_per_dim: number of points to be sample on each dimension + """ + + self.centroids = [] + num_sample_per_dim = np.ceil(pow(self.num_clusters, 1 / self.dim_per_byte)).astype(np.uint8) + axis_point = [] + if self.divide_method == 'average': + interval = (self.upper_bound - self.lower_bound) / (num_sample_per_dim + 1) + for i in range(1, num_sample_per_dim + 1): + axis_point.append(self.lower_bound + i * interval) + coordinates = [axis_point for _ in range(self.dim_per_byte)] + elif self.divide_method == 'random': + for i in range(self.dim_per_byte): + axis_point.append(np.random.randint(self.lower_bound, self.upper_bound, + size=[num_sample_per_dim])) + coordinates = axis_point + else: + raise NotImplementedError + + for item in product(*coordinates): + self.centroids.append(list(item)) + self.centroids = self.centroids[:self.num_clusters] + + @batching + def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray: + num_bytes = self._get_num_bytes(vecs) + + x = np.reshape(vecs, [vecs.shape[0], num_bytes, 1, self.dim_per_byte]) + x = np.sum(np.square(x - self.centroids), -1) + # start from 1 + x = np.argmax(-x, 2) + 1 + + return np.array(x, dtype=np.uint8) + + def _get_num_bytes(self, vecs: np.ndarray): + num_dim = vecs.shape[1] + assert num_dim % self.dim_per_byte == 0 and num_dim >= (num_dim % self.dim_per_byte), \ + 'input dimension (=%d) should be divided by dim_per_byte (=%d)!' % ( + num_dim, self.dim_per_byte) + return int(num_dim / self.dim_per_byte) diff --git a/tests/test_quantizer_encoder.py b/tests/test_quantizer_encoder.py new file mode 100644 index 00000000..c5253939 --- /dev/null +++ b/tests/test_quantizer_encoder.py @@ -0,0 +1,19 @@ +import os +import unittest +import numpy as np + +from gnes.encoder.base import BaseNumericEncoder + + +class TestQuantizerEncoder(unittest.TestCase): + def setUp(self): + self.vecs = np.random.randint(-1500, 1500, size=[1000, 160]).astype('float32') + dirname = os.path.dirname(__file__) + self.vanilla_quantizer_yaml = os.path.join(dirname, 'yaml', 'quantizer_encoder.yml') + + def test_vanilla_quantizer(self): + encoder = BaseNumericEncoder.load_yaml(self.vanilla_quantizer_yaml) + encoder.train() + out = encoder.encode(self.vecs) + print(out.shape) + diff --git a/tests/yaml/quantizer_encoder.yml b/tests/yaml/quantizer_encoder.yml new file mode 100644 index 00000000..779b428a --- /dev/null +++ b/tests/yaml/quantizer_encoder.yml @@ -0,0 +1,7 @@ +!QuantizerEncoder +parameters: + upper_bound: 2000 + lower_bound: -2000 + divide_method: 'random' + cluster_per_byte: 255 + dim_per_byte: 8 From bbf4283a6cf941e5577b032c0992eb11ddc3c5ae Mon Sep 17 00:00:00 2001 From: Jem Date: Thu, 5 Sep 2019 12:22:52 +0800 Subject: [PATCH 2/4] feat(encoder): add quantizer --- gnes/encoder/numeric/quantizer.py | 47 ++++++++++++++++++++++--------- tests/test_quantizer_encoder.py | 2 +- tests/yaml/quantizer_encoder.yml | 8 +++--- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py index efc3d384..a5a656a1 100644 --- a/gnes/encoder/numeric/quantizer.py +++ b/gnes/encoder/numeric/quantizer.py @@ -27,7 +27,7 @@ class QuantizerEncoder(BaseBinaryEncoder): def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255, upper_bound: int = 10000, lower_bound: int = -10000, - divide_method: str = 'average', + partition_method: str = 'average', *args, **kwargs): super().__init__(*args, **kwargs) assert 1 < cluster_per_byte <= 255, 'cluster number should >1 and <= 255 (0 is reserved for NOP)' @@ -35,10 +35,11 @@ def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255, self.num_clusters = cluster_per_byte self.upper_bound = upper_bound self.lower_bound = lower_bound - self.divide_method = divide_method + self.partition_method = partition_method self.centroids = None + self._get_centroids() - def train(self): + def _get_centroids(self): """ calculate centroids for quantizer two kinds of divide methods are supported now: average, random @@ -48,19 +49,18 @@ def train(self): num_sample_per_dim: number of points to be sample on each dimension """ + if self.upper_bound < self.lower_bound: + raise ValueError("upper bound is smaller than lower bound") + self.centroids = [] num_sample_per_dim = np.ceil(pow(self.num_clusters, 1 / self.dim_per_byte)).astype(np.uint8) - axis_point = [] - if self.divide_method == 'average': - interval = (self.upper_bound - self.lower_bound) / (num_sample_per_dim + 1) - for i in range(1, num_sample_per_dim + 1): - axis_point.append(self.lower_bound + i * interval) - coordinates = [axis_point for _ in range(self.dim_per_byte)] - elif self.divide_method == 'random': - for i in range(self.dim_per_byte): - axis_point.append(np.random.randint(self.lower_bound, self.upper_bound, - size=[num_sample_per_dim])) - coordinates = axis_point + if self.partition_method == 'average': + axis_point = np.linspace(self.lower_bound, self.upper_bound, num=num_sample_per_dim+1, + endpoint=False, retstep=False, dtype=None)[1:] + coordinates = np.tile(axis_point, (self.dim_per_byte, 1)) + elif self.partition_method == 'random': + coordinates = np.random.randint(self.lower_bound, self.upper_bound, + size=[self.dim_per_byte, num_sample_per_dim]) else: raise NotImplementedError @@ -71,6 +71,9 @@ def train(self): @batching def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray: num_bytes = self._get_num_bytes(vecs) + max_value, min_value = self._get_max_min_value(vecs) + + self._check_bound(max_value, min_value) x = np.reshape(vecs, [vecs.shape[0], num_bytes, 1, self.dim_per_byte]) x = np.sum(np.square(x - self.centroids), -1) @@ -85,3 +88,19 @@ def _get_num_bytes(self, vecs: np.ndarray): 'input dimension (=%d) should be divided by dim_per_byte (=%d)!' % ( num_dim, self.dim_per_byte) return int(num_dim / self.dim_per_byte) + + @staticmethod + def _get_max_min_value(vecs): + return np.amax(vecs, axis=None), np.amin(vecs, axis=None) + + def _check_bound(self, max_value, min_value): + if self.upper_bound < max_value: + self.logger.warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose" + "a bigger value for upper bound" % (self.upper_bound, max_value)) + if self.lower_bound > min_value: + self.logger.warning("lower bound (=%.3f) is bigger than min value of input data (=%.3f), you should choose" + "a smaller value for lower bound" % (self.lower_bound, min_value)) + if (self.upper_bound-self.lower_bound) >= 10*(max_value - min_value): + self.logger.warning("(upper bound - lower_bound) (=%.3f) is 10 times larger than (max value - min value) " + "(=%.3f) of data, maybe you should choose a suitable bound" % + ((self.upper_bound-self.lower_bound), (max_value - min_value))) diff --git a/tests/test_quantizer_encoder.py b/tests/test_quantizer_encoder.py index c5253939..e3227dba 100644 --- a/tests/test_quantizer_encoder.py +++ b/tests/test_quantizer_encoder.py @@ -7,7 +7,7 @@ class TestQuantizerEncoder(unittest.TestCase): def setUp(self): - self.vecs = np.random.randint(-1500, 1500, size=[1000, 160]).astype('float32') + self.vecs = np.random.randint(-150, 150, size=[1000, 160]).astype('float32') dirname = os.path.dirname(__file__) self.vanilla_quantizer_yaml = os.path.join(dirname, 'yaml', 'quantizer_encoder.yml') diff --git a/tests/yaml/quantizer_encoder.yml b/tests/yaml/quantizer_encoder.yml index 779b428a..13e914c2 100644 --- a/tests/yaml/quantizer_encoder.yml +++ b/tests/yaml/quantizer_encoder.yml @@ -1,7 +1,7 @@ !QuantizerEncoder parameters: - upper_bound: 2000 - lower_bound: -2000 - divide_method: 'random' + upper_bound: 1000000 + lower_bound: -100 + partition_method: 'random' cluster_per_byte: 255 - dim_per_byte: 8 + dim_per_byte: 10 From 57cc95ffc80c3e582e4017c94fdbe85690d05b5a Mon Sep 17 00:00:00 2001 From: Jem Date: Thu, 5 Sep 2019 12:35:11 +0800 Subject: [PATCH 3/4] feat(encoder): add quantizer --- gnes/encoder/numeric/quantizer.py | 8 ++++---- tests/test_quantizer_encoder.py | 15 ++++++++++++--- tests/yaml/quantizer_encoder.yml | 4 ++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py index a5a656a1..ebfeb395 100644 --- a/gnes/encoder/numeric/quantizer.py +++ b/gnes/encoder/numeric/quantizer.py @@ -59,7 +59,7 @@ def _get_centroids(self): endpoint=False, retstep=False, dtype=None)[1:] coordinates = np.tile(axis_point, (self.dim_per_byte, 1)) elif self.partition_method == 'random': - coordinates = np.random.randint(self.lower_bound, self.upper_bound, + coordinates = np.random.uniform(self.lower_bound, self.upper_bound, size=[self.dim_per_byte, num_sample_per_dim]) else: raise NotImplementedError @@ -95,12 +95,12 @@ def _get_max_min_value(vecs): def _check_bound(self, max_value, min_value): if self.upper_bound < max_value: - self.logger.warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose" + raise Warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose" "a bigger value for upper bound" % (self.upper_bound, max_value)) if self.lower_bound > min_value: - self.logger.warning("lower bound (=%.3f) is bigger than min value of input data (=%.3f), you should choose" + raise Warning("lower bound (=%.3f) is bigger than min value of input data (=%.3f), you should choose" "a smaller value for lower bound" % (self.lower_bound, min_value)) if (self.upper_bound-self.lower_bound) >= 10*(max_value - min_value): - self.logger.warning("(upper bound - lower_bound) (=%.3f) is 10 times larger than (max value - min value) " + raise Warning("(upper bound - lower_bound) (=%.3f) is 10 times larger than (max value - min value) " "(=%.3f) of data, maybe you should choose a suitable bound" % ((self.upper_bound-self.lower_bound), (max_value - min_value))) diff --git a/tests/test_quantizer_encoder.py b/tests/test_quantizer_encoder.py index e3227dba..309ac14a 100644 --- a/tests/test_quantizer_encoder.py +++ b/tests/test_quantizer_encoder.py @@ -7,13 +7,22 @@ class TestQuantizerEncoder(unittest.TestCase): def setUp(self): - self.vecs = np.random.randint(-150, 150, size=[1000, 160]).astype('float32') dirname = os.path.dirname(__file__) self.vanilla_quantizer_yaml = os.path.join(dirname, 'yaml', 'quantizer_encoder.yml') def test_vanilla_quantizer(self): encoder = BaseNumericEncoder.load_yaml(self.vanilla_quantizer_yaml) encoder.train() - out = encoder.encode(self.vecs) - print(out.shape) + + vecs_1 = np.random.uniform(-150, 150, size=[1000, 160]).astype('float32') + out = encoder.encode(vecs_1) + self.assertEqual(len(out.shape), 2) + self.assertEqual(out.shape[0], 1000) + self.assertEqual(out.shape[1], 16) + + vecs_2 = np.random.uniform(-1, 1, size=[1000, 160]).astype('float32') + self.assertRaises(Warning, encoder.encode, vecs_2) + + vecs_3 = np.random.uniform(-1, 1000, size=[1000, 160]).astype('float32') + self.assertRaises(Warning, encoder.encode, vecs_3) diff --git a/tests/yaml/quantizer_encoder.yml b/tests/yaml/quantizer_encoder.yml index 13e914c2..975253d4 100644 --- a/tests/yaml/quantizer_encoder.yml +++ b/tests/yaml/quantizer_encoder.yml @@ -1,7 +1,7 @@ !QuantizerEncoder parameters: - upper_bound: 1000000 - lower_bound: -100 + upper_bound: 500 + lower_bound: -200 partition_method: 'random' cluster_per_byte: 255 dim_per_byte: 10 From 2fd8dab48397579adbb09d74cebff704d575dfe9 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 5 Sep 2019 13:05:56 +0800 Subject: [PATCH 4/4] style: minor fix on the styling --- gnes/encoder/numeric/quantizer.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py index ebfeb395..3be5b8f1 100644 --- a/gnes/encoder/numeric/quantizer.py +++ b/gnes/encoder/numeric/quantizer.py @@ -36,8 +36,7 @@ def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255, self.upper_bound = upper_bound self.lower_bound = lower_bound self.partition_method = partition_method - self.centroids = None - self._get_centroids() + self.centroids = self._get_centroids() def _get_centroids(self): """ @@ -52,7 +51,7 @@ def _get_centroids(self): if self.upper_bound < self.lower_bound: raise ValueError("upper bound is smaller than lower bound") - self.centroids = [] + centroids = [] num_sample_per_dim = np.ceil(pow(self.num_clusters, 1 / self.dim_per_byte)).astype(np.uint8) if self.partition_method == 'average': axis_point = np.linspace(self.lower_bound, self.upper_bound, num=num_sample_per_dim+1, @@ -65,16 +64,13 @@ def _get_centroids(self): raise NotImplementedError for item in product(*coordinates): - self.centroids.append(list(item)) - self.centroids = self.centroids[:self.num_clusters] + centroids.append(list(item)) + return centroids[:self.num_clusters] @batching def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray: + self._check_bound(vecs) num_bytes = self._get_num_bytes(vecs) - max_value, min_value = self._get_max_min_value(vecs) - - self._check_bound(max_value, min_value) - x = np.reshape(vecs, [vecs.shape[0], num_bytes, 1, self.dim_per_byte]) x = np.sum(np.square(x - self.centroids), -1) # start from 1 @@ -93,7 +89,8 @@ def _get_num_bytes(self, vecs: np.ndarray): def _get_max_min_value(vecs): return np.amax(vecs, axis=None), np.amin(vecs, axis=None) - def _check_bound(self, max_value, min_value): + def _check_bound(self, vecs): + max_value, min_value = self._get_max_min_value(vecs) if self.upper_bound < max_value: raise Warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose" "a bigger value for upper bound" % (self.upper_bound, max_value))