From 463cb491e347d204c7fe0b87f8b036a58b0ddacb Mon Sep 17 00:00:00 2001
From: Jem <jif0729@gmail.com>
Date: Thu, 5 Sep 2019 10:37:00 +0800
Subject: [PATCH 1/4] feat(encoder): add quantizer

---
 gnes/encoder/__init__.py          |  3 +-
 gnes/encoder/numeric/quantizer.py | 87 +++++++++++++++++++++++++++++++
 tests/test_quantizer_encoder.py   | 19 +++++++
 tests/yaml/quantizer_encoder.yml  |  7 +++
 4 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 gnes/encoder/numeric/quantizer.py
 create mode 100644 tests/test_quantizer_encoder.py
 create mode 100644 tests/yaml/quantizer_encoder.yml

diff --git a/gnes/encoder/__init__.py b/gnes/encoder/__init__.py
index fa32f137..11e7de52 100644
--- a/gnes/encoder/__init__.py
+++ b/gnes/encoder/__init__.py
@@ -44,7 +44,8 @@
     'PyTorchTransformers': 'text.transformer',
     'VggishEncoder': 'audio.vggish',
     'YouTube8MFeatureExtractor': 'video.yt8m_feature_extractor',
-    'YouTube8MEncoder': 'video.yt8m_model'
+    'YouTube8MEncoder': 'video.yt8m_model',
+    'QuantizerEncoder': 'numeric.quantizer'
 }
 
 register_all_class(_cls2file_map, 'encoder')
diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py
new file mode 100644
index 00000000..efc3d384
--- /dev/null
+++ b/gnes/encoder/numeric/quantizer.py
@@ -0,0 +1,87 @@
+#  Tencent is pleased to support the open source community by making GNES available.
+#
+#  Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+import numpy as np
+from itertools import product
+
+from ..base import BaseBinaryEncoder
+from ...helper import batching
+
+
+class QuantizerEncoder(BaseBinaryEncoder):
+    batch_size = 2048
+
+    def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255,
+                 upper_bound: int = 10000,
+                 lower_bound: int = -10000,
+                 divide_method: str = 'average',
+                 *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert 1 < cluster_per_byte <= 255, 'cluster number should >1 and <= 255 (0 is reserved for NOP)'
+        self.dim_per_byte = dim_per_byte
+        self.num_clusters = cluster_per_byte
+        self.upper_bound = upper_bound
+        self.lower_bound = lower_bound
+        self.divide_method = divide_method
+        self.centroids = None
+
+    def train(self):
+        """
+        calculate centroids for quantizer
+        two kinds of divide methods are supported now: average, random
+        average: split the space evenly, with cluster centroids lying on the corners of the sub-spaces
+        random: randomly pick points and treat them as centroids of clusters
+        Variable Explanation:
+            num_sample_per_dim: number of points to be sample on each dimension
+        """
+
+        self.centroids = []
+        num_sample_per_dim = np.ceil(pow(self.num_clusters, 1 / self.dim_per_byte)).astype(np.uint8)
+        axis_point = []
+        if self.divide_method == 'average':
+            interval = (self.upper_bound - self.lower_bound) / (num_sample_per_dim + 1)
+            for i in range(1, num_sample_per_dim + 1):
+                axis_point.append(self.lower_bound + i * interval)
+            coordinates = [axis_point for _ in range(self.dim_per_byte)]
+        elif self.divide_method == 'random':
+            for i in range(self.dim_per_byte):
+                axis_point.append(np.random.randint(self.lower_bound, self.upper_bound,
+                                                    size=[num_sample_per_dim]))
+            coordinates = axis_point
+        else:
+            raise NotImplementedError
+
+        for item in product(*coordinates):
+            self.centroids.append(list(item))
+        self.centroids = self.centroids[:self.num_clusters]
+
+    @batching
+    def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
+        num_bytes = self._get_num_bytes(vecs)
+
+        x = np.reshape(vecs, [vecs.shape[0], num_bytes, 1, self.dim_per_byte])
+        x = np.sum(np.square(x - self.centroids), -1)
+        # start from 1
+        x = np.argmax(-x, 2) + 1
+
+        return np.array(x, dtype=np.uint8)
+
+    def _get_num_bytes(self, vecs: np.ndarray):
+        num_dim = vecs.shape[1]
+        assert num_dim % self.dim_per_byte == 0 and num_dim >= (num_dim % self.dim_per_byte), \
+            'input dimension (=%d) should be divided by dim_per_byte (=%d)!' % (
+                num_dim, self.dim_per_byte)
+        return int(num_dim / self.dim_per_byte)
diff --git a/tests/test_quantizer_encoder.py b/tests/test_quantizer_encoder.py
new file mode 100644
index 00000000..c5253939
--- /dev/null
+++ b/tests/test_quantizer_encoder.py
@@ -0,0 +1,19 @@
+import os
+import unittest
+import numpy as np
+
+from gnes.encoder.base import BaseNumericEncoder
+
+
+class TestQuantizerEncoder(unittest.TestCase):
+    def setUp(self):
+        self.vecs = np.random.randint(-1500, 1500, size=[1000, 160]).astype('float32')
+        dirname = os.path.dirname(__file__)
+        self.vanilla_quantizer_yaml = os.path.join(dirname, 'yaml', 'quantizer_encoder.yml')
+
+    def test_vanilla_quantizer(self):
+        encoder = BaseNumericEncoder.load_yaml(self.vanilla_quantizer_yaml)
+        encoder.train()
+        out = encoder.encode(self.vecs)
+        print(out.shape)
+
diff --git a/tests/yaml/quantizer_encoder.yml b/tests/yaml/quantizer_encoder.yml
new file mode 100644
index 00000000..779b428a
--- /dev/null
+++ b/tests/yaml/quantizer_encoder.yml
@@ -0,0 +1,7 @@
+!QuantizerEncoder
+parameters:
+  upper_bound: 2000
+  lower_bound: -2000
+  divide_method: 'random'
+  cluster_per_byte: 255
+  dim_per_byte: 8

From bbf4283a6cf941e5577b032c0992eb11ddc3c5ae Mon Sep 17 00:00:00 2001
From: Jem <jif0729@gmail.com>
Date: Thu, 5 Sep 2019 12:22:52 +0800
Subject: [PATCH 2/4] feat(encoder): add quantizer

---
 gnes/encoder/numeric/quantizer.py | 47 ++++++++++++++++++++++---------
 tests/test_quantizer_encoder.py   |  2 +-
 tests/yaml/quantizer_encoder.yml  |  8 +++---
 3 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py
index efc3d384..a5a656a1 100644
--- a/gnes/encoder/numeric/quantizer.py
+++ b/gnes/encoder/numeric/quantizer.py
@@ -27,7 +27,7 @@ class QuantizerEncoder(BaseBinaryEncoder):
     def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255,
                  upper_bound: int = 10000,
                  lower_bound: int = -10000,
-                 divide_method: str = 'average',
+                 partition_method: str = 'average',
                  *args, **kwargs):
         super().__init__(*args, **kwargs)
         assert 1 < cluster_per_byte <= 255, 'cluster number should >1 and <= 255 (0 is reserved for NOP)'
@@ -35,10 +35,11 @@ def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255,
         self.num_clusters = cluster_per_byte
         self.upper_bound = upper_bound
         self.lower_bound = lower_bound
-        self.divide_method = divide_method
+        self.partition_method = partition_method
         self.centroids = None
+        self._get_centroids()
 
-    def train(self):
+    def _get_centroids(self):
         """
         calculate centroids for quantizer
         two kinds of divide methods are supported now: average, random
@@ -48,19 +49,18 @@ def train(self):
             num_sample_per_dim: number of points to be sample on each dimension
         """
 
+        if self.upper_bound < self.lower_bound:
+            raise ValueError("upper bound is smaller than lower bound")
+
         self.centroids = []
         num_sample_per_dim = np.ceil(pow(self.num_clusters, 1 / self.dim_per_byte)).astype(np.uint8)
-        axis_point = []
-        if self.divide_method == 'average':
-            interval = (self.upper_bound - self.lower_bound) / (num_sample_per_dim + 1)
-            for i in range(1, num_sample_per_dim + 1):
-                axis_point.append(self.lower_bound + i * interval)
-            coordinates = [axis_point for _ in range(self.dim_per_byte)]
-        elif self.divide_method == 'random':
-            for i in range(self.dim_per_byte):
-                axis_point.append(np.random.randint(self.lower_bound, self.upper_bound,
-                                                    size=[num_sample_per_dim]))
-            coordinates = axis_point
+        if self.partition_method == 'average':
+            axis_point = np.linspace(self.lower_bound, self.upper_bound, num=num_sample_per_dim+1,
+                                     endpoint=False, retstep=False, dtype=None)[1:]
+            coordinates = np.tile(axis_point, (self.dim_per_byte, 1))
+        elif self.partition_method == 'random':
+            coordinates = np.random.randint(self.lower_bound, self.upper_bound,
+                                                    size=[self.dim_per_byte, num_sample_per_dim])
         else:
             raise NotImplementedError
 
@@ -71,6 +71,9 @@ def train(self):
     @batching
     def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
         num_bytes = self._get_num_bytes(vecs)
+        max_value, min_value = self._get_max_min_value(vecs)
+
+        self._check_bound(max_value, min_value)
 
         x = np.reshape(vecs, [vecs.shape[0], num_bytes, 1, self.dim_per_byte])
         x = np.sum(np.square(x - self.centroids), -1)
@@ -85,3 +88,19 @@ def _get_num_bytes(self, vecs: np.ndarray):
             'input dimension (=%d) should be divided by dim_per_byte (=%d)!' % (
                 num_dim, self.dim_per_byte)
         return int(num_dim / self.dim_per_byte)
+
+    @staticmethod
+    def _get_max_min_value(vecs):
+        return np.amax(vecs, axis=None), np.amin(vecs, axis=None)
+
+    def _check_bound(self, max_value, min_value):
+        if self.upper_bound < max_value:
+            self.logger.warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose"
+                                "a bigger value for upper bound" % (self.upper_bound, max_value))
+        if self.lower_bound > min_value:
+            self.logger.warning("lower bound (=%.3f) is bigger than min value of input data (=%.3f), you should choose"
+                                "a smaller value for lower bound" % (self.lower_bound, min_value))
+        if (self.upper_bound-self.lower_bound) >= 10*(max_value - min_value):
+            self.logger.warning("(upper bound - lower_bound) (=%.3f) is 10 times larger than (max value - min value) "
+                                "(=%.3f) of data, maybe you should choose a suitable bound" %
+                                ((self.upper_bound-self.lower_bound), (max_value - min_value)))
diff --git a/tests/test_quantizer_encoder.py b/tests/test_quantizer_encoder.py
index c5253939..e3227dba 100644
--- a/tests/test_quantizer_encoder.py
+++ b/tests/test_quantizer_encoder.py
@@ -7,7 +7,7 @@
 
 class TestQuantizerEncoder(unittest.TestCase):
     def setUp(self):
-        self.vecs = np.random.randint(-1500, 1500, size=[1000, 160]).astype('float32')
+        self.vecs = np.random.randint(-150, 150, size=[1000, 160]).astype('float32')
         dirname = os.path.dirname(__file__)
         self.vanilla_quantizer_yaml = os.path.join(dirname, 'yaml', 'quantizer_encoder.yml')
 
diff --git a/tests/yaml/quantizer_encoder.yml b/tests/yaml/quantizer_encoder.yml
index 779b428a..13e914c2 100644
--- a/tests/yaml/quantizer_encoder.yml
+++ b/tests/yaml/quantizer_encoder.yml
@@ -1,7 +1,7 @@
 !QuantizerEncoder
 parameters:
-  upper_bound: 2000
-  lower_bound: -2000
-  divide_method: 'random'
+  upper_bound: 1000000
+  lower_bound: -100
+  partition_method: 'random'
   cluster_per_byte: 255
-  dim_per_byte: 8
+  dim_per_byte: 10

From 57cc95ffc80c3e582e4017c94fdbe85690d05b5a Mon Sep 17 00:00:00 2001
From: Jem <jif0729@gmail.com>
Date: Thu, 5 Sep 2019 12:35:11 +0800
Subject: [PATCH 3/4] feat(encoder): add quantizer

---
 gnes/encoder/numeric/quantizer.py |  8 ++++----
 tests/test_quantizer_encoder.py   | 15 ++++++++++++---
 tests/yaml/quantizer_encoder.yml  |  4 ++--
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py
index a5a656a1..ebfeb395 100644
--- a/gnes/encoder/numeric/quantizer.py
+++ b/gnes/encoder/numeric/quantizer.py
@@ -59,7 +59,7 @@ def _get_centroids(self):
                                      endpoint=False, retstep=False, dtype=None)[1:]
             coordinates = np.tile(axis_point, (self.dim_per_byte, 1))
         elif self.partition_method == 'random':
-            coordinates = np.random.randint(self.lower_bound, self.upper_bound,
+            coordinates = np.random.uniform(self.lower_bound, self.upper_bound,
                                                     size=[self.dim_per_byte, num_sample_per_dim])
         else:
             raise NotImplementedError
@@ -95,12 +95,12 @@ def _get_max_min_value(vecs):
 
     def _check_bound(self, max_value, min_value):
         if self.upper_bound < max_value:
-            self.logger.warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose"
+            raise Warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose"
                                 "a bigger value for upper bound" % (self.upper_bound, max_value))
         if self.lower_bound > min_value:
-            self.logger.warning("lower bound (=%.3f) is bigger than min value of input data (=%.3f), you should choose"
+            raise Warning("lower bound (=%.3f) is bigger than min value of input data (=%.3f), you should choose"
                                 "a smaller value for lower bound" % (self.lower_bound, min_value))
         if (self.upper_bound-self.lower_bound) >= 10*(max_value - min_value):
-            self.logger.warning("(upper bound - lower_bound) (=%.3f) is 10 times larger than (max value - min value) "
+            raise Warning("(upper bound - lower_bound) (=%.3f) is 10 times larger than (max value - min value) "
                                 "(=%.3f) of data, maybe you should choose a suitable bound" %
                                 ((self.upper_bound-self.lower_bound), (max_value - min_value)))
diff --git a/tests/test_quantizer_encoder.py b/tests/test_quantizer_encoder.py
index e3227dba..309ac14a 100644
--- a/tests/test_quantizer_encoder.py
+++ b/tests/test_quantizer_encoder.py
@@ -7,13 +7,22 @@
 
 class TestQuantizerEncoder(unittest.TestCase):
     def setUp(self):
-        self.vecs = np.random.randint(-150, 150, size=[1000, 160]).astype('float32')
         dirname = os.path.dirname(__file__)
         self.vanilla_quantizer_yaml = os.path.join(dirname, 'yaml', 'quantizer_encoder.yml')
 
     def test_vanilla_quantizer(self):
         encoder = BaseNumericEncoder.load_yaml(self.vanilla_quantizer_yaml)
         encoder.train()
-        out = encoder.encode(self.vecs)
-        print(out.shape)
+
+        vecs_1 = np.random.uniform(-150, 150, size=[1000, 160]).astype('float32')
+        out = encoder.encode(vecs_1)
+        self.assertEqual(len(out.shape), 2)
+        self.assertEqual(out.shape[0], 1000)
+        self.assertEqual(out.shape[1], 16)
+
+        vecs_2 = np.random.uniform(-1, 1, size=[1000, 160]).astype('float32')
+        self.assertRaises(Warning, encoder.encode, vecs_2)
+
+        vecs_3 = np.random.uniform(-1, 1000, size=[1000, 160]).astype('float32')
+        self.assertRaises(Warning, encoder.encode, vecs_3)
 
diff --git a/tests/yaml/quantizer_encoder.yml b/tests/yaml/quantizer_encoder.yml
index 13e914c2..975253d4 100644
--- a/tests/yaml/quantizer_encoder.yml
+++ b/tests/yaml/quantizer_encoder.yml
@@ -1,7 +1,7 @@
 !QuantizerEncoder
 parameters:
-  upper_bound: 1000000
-  lower_bound: -100
+  upper_bound: 500
+  lower_bound: -200
   partition_method: 'random'
   cluster_per_byte: 255
   dim_per_byte: 10

From 2fd8dab48397579adbb09d74cebff704d575dfe9 Mon Sep 17 00:00:00 2001
From: Han Xiao <hanhxiao@tencent.com>
Date: Thu, 5 Sep 2019 13:05:56 +0800
Subject: [PATCH 4/4] style: minor fix on the styling

---
 gnes/encoder/numeric/quantizer.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/gnes/encoder/numeric/quantizer.py b/gnes/encoder/numeric/quantizer.py
index ebfeb395..3be5b8f1 100644
--- a/gnes/encoder/numeric/quantizer.py
+++ b/gnes/encoder/numeric/quantizer.py
@@ -36,8 +36,7 @@ def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255,
         self.upper_bound = upper_bound
         self.lower_bound = lower_bound
         self.partition_method = partition_method
-        self.centroids = None
-        self._get_centroids()
+        self.centroids = self._get_centroids()
 
     def _get_centroids(self):
         """
@@ -52,7 +51,7 @@ def _get_centroids(self):
         if self.upper_bound < self.lower_bound:
             raise ValueError("upper bound is smaller than lower bound")
 
-        self.centroids = []
+        centroids = []
         num_sample_per_dim = np.ceil(pow(self.num_clusters, 1 / self.dim_per_byte)).astype(np.uint8)
         if self.partition_method == 'average':
             axis_point = np.linspace(self.lower_bound, self.upper_bound, num=num_sample_per_dim+1,
@@ -65,16 +64,13 @@ def _get_centroids(self):
             raise NotImplementedError
 
         for item in product(*coordinates):
-            self.centroids.append(list(item))
-        self.centroids = self.centroids[:self.num_clusters]
+            centroids.append(list(item))
+        return centroids[:self.num_clusters]
 
     @batching
     def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
+        self._check_bound(vecs)
         num_bytes = self._get_num_bytes(vecs)
-        max_value, min_value = self._get_max_min_value(vecs)
-
-        self._check_bound(max_value, min_value)
-
         x = np.reshape(vecs, [vecs.shape[0], num_bytes, 1, self.dim_per_byte])
         x = np.sum(np.square(x - self.centroids), -1)
         # start from 1
@@ -93,7 +89,8 @@ def _get_num_bytes(self, vecs: np.ndarray):
     def _get_max_min_value(vecs):
         return np.amax(vecs, axis=None), np.amin(vecs, axis=None)
 
-    def _check_bound(self, max_value, min_value):
+    def _check_bound(self, vecs):
+        max_value, min_value = self._get_max_min_value(vecs)
         if self.upper_bound < max_value:
             raise Warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose"
                                 "a bigger value for upper bound" % (self.upper_bound, max_value))