Skip to content
This repository has been archived by the owner on Feb 22, 2020. It is now read-only.

Commit

Permalink
Merge pull request #213 from gnes-ai/quantizer
Browse files Browse the repository at this point in the history
feat(encoder): add quantizer
  • Loading branch information
mergify[bot] authored Sep 5, 2019
2 parents aafacda + 308b2aa commit 242dc5c
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 1 deletion.
3 changes: 2 additions & 1 deletion gnes/encoder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@
'PyTorchTransformers': 'text.transformer',
'VggishEncoder': 'audio.vggish',
'YouTube8MFeatureExtractor': 'video.yt8m_feature_extractor',
'YouTube8MEncoder': 'video.yt8m_model'
'YouTube8MEncoder': 'video.yt8m_model',
'QuantizerEncoder': 'numeric.quantizer'
}

register_all_class(_cls2file_map, 'encoder')
103 changes: 103 additions & 0 deletions gnes/encoder/numeric/quantizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Tencent is pleased to support the open source community by making GNES available.
#
# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import numpy as np
from itertools import product

from ..base import BaseBinaryEncoder
from ...helper import batching


class QuantizerEncoder(BaseBinaryEncoder):
batch_size = 2048

def __init__(self, dim_per_byte: int, cluster_per_byte: int = 255,
upper_bound: int = 10000,
lower_bound: int = -10000,
partition_method: str = 'average',
*args, **kwargs):
super().__init__(*args, **kwargs)
assert 1 < cluster_per_byte <= 255, 'cluster number should >1 and <= 255 (0 is reserved for NOP)'
self.dim_per_byte = dim_per_byte
self.num_clusters = cluster_per_byte
self.upper_bound = upper_bound
self.lower_bound = lower_bound
self.partition_method = partition_method
self.centroids = self._get_centroids()

def _get_centroids(self):
"""
calculate centroids for quantizer
two kinds of divide methods are supported now: average, random
average: split the space averagely and centroids of clusters lie on the corner of sub-space
random: randomly pick points and treat them as centroids of clusters
Variable Explaination:
num_sample_per_dim: number of points to be sample on each dimension
"""

if self.upper_bound < self.lower_bound:
raise ValueError("upper bound is smaller than lower bound")

centroids = []
num_sample_per_dim = np.ceil(pow(self.num_clusters, 1 / self.dim_per_byte)).astype(np.uint8)
if self.partition_method == 'average':
axis_point = np.linspace(self.lower_bound, self.upper_bound, num=num_sample_per_dim+1,
endpoint=False, retstep=False, dtype=None)[1:]
coordinates = np.tile(axis_point, (self.dim_per_byte, 1))
elif self.partition_method == 'random':
coordinates = np.random.uniform(self.lower_bound, self.upper_bound,
size=[self.dim_per_byte, num_sample_per_dim])
else:
raise NotImplementedError

for item in product(*coordinates):
centroids.append(list(item))
return centroids[:self.num_clusters]

@batching
def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
self._check_bound(vecs)
num_bytes = self._get_num_bytes(vecs)
x = np.reshape(vecs, [vecs.shape[0], num_bytes, 1, self.dim_per_byte])
x = np.sum(np.square(x - self.centroids), -1)
# start from 1
x = np.argmax(-x, 2) + 1

return np.array(x, dtype=np.uint8)

def _get_num_bytes(self, vecs: np.ndarray):
num_dim = vecs.shape[1]
assert num_dim % self.dim_per_byte == 0 and num_dim >= (num_dim % self.dim_per_byte), \
'input dimension (=%d) should be divided by dim_per_byte (=%d)!' % (
num_dim, self.dim_per_byte)
return int(num_dim / self.dim_per_byte)

@staticmethod
def _get_max_min_value(vecs):
return np.amax(vecs, axis=None), np.amin(vecs, axis=None)

def _check_bound(self, vecs):
max_value, min_value = self._get_max_min_value(vecs)
if self.upper_bound < max_value:
raise Warning("upper bound (=%.3f) is smaller than max value of input data (=%.3f), you should choose"
"a bigger value for upper bound" % (self.upper_bound, max_value))
if self.lower_bound > min_value:
raise Warning("lower bound (=%.3f) is bigger than min value of input data (=%.3f), you should choose"
"a smaller value for lower bound" % (self.lower_bound, min_value))
if (self.upper_bound-self.lower_bound) >= 10*(max_value - min_value):
raise Warning("(upper bound - lower_bound) (=%.3f) is 10 times larger than (max value - min value) "
"(=%.3f) of data, maybe you should choose a suitable bound" %
((self.upper_bound-self.lower_bound), (max_value - min_value)))
28 changes: 28 additions & 0 deletions tests/test_quantizer_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
import unittest
import numpy as np

from gnes.encoder.base import BaseNumericEncoder


class TestQuantizerEncoder(unittest.TestCase):
def setUp(self):
dirname = os.path.dirname(__file__)
self.vanilla_quantizer_yaml = os.path.join(dirname, 'yaml', 'quantizer_encoder.yml')

def test_vanilla_quantizer(self):
encoder = BaseNumericEncoder.load_yaml(self.vanilla_quantizer_yaml)
encoder.train()

vecs_1 = np.random.uniform(-150, 150, size=[1000, 160]).astype('float32')
out = encoder.encode(vecs_1)
self.assertEqual(len(out.shape), 2)
self.assertEqual(out.shape[0], 1000)
self.assertEqual(out.shape[1], 16)

vecs_2 = np.random.uniform(-1, 1, size=[1000, 160]).astype('float32')
self.assertRaises(Warning, encoder.encode, vecs_2)

vecs_3 = np.random.uniform(-1, 1000, size=[1000, 160]).astype('float32')
self.assertRaises(Warning, encoder.encode, vecs_3)

7 changes: 7 additions & 0 deletions tests/yaml/quantizer_encoder.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
!QuantizerEncoder
parameters:
upper_bound: 500
lower_bound: -200
partition_method: 'random'
cluster_per_byte: 255
dim_per_byte: 10

0 comments on commit 242dc5c

Please sign in to comment.