diff --git a/gnes/encoder/numeric/pca.py b/gnes/encoder/numeric/pca.py
index 14b893d9..88dec6b5 100644
--- a/gnes/encoder/numeric/pca.py
+++ b/gnes/encoder/numeric/pca.py
@@ -23,16 +23,19 @@ class PCAEncoder(BaseNumericEncoder):
     batch_size = 2048
 
-    def __init__(self, output_dim: int, *args, **kwargs):
+    def __init__(self, output_dim: int, whiten: bool = False, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.output_dim = output_dim
+        self.whiten = whiten
         self.pca_components = None
         self.mean = None
+
     def post_init(self):
         from sklearn.decomposition import IncrementalPCA
         self.pca = IncrementalPCA(n_components=self.output_dim)
 
+    @batching
     def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
         num_samples, num_dim = vecs.shape
@@ -49,11 +52,16 @@ def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
         self.pca_components = np.transpose(self.pca.components_)
         self.mean = self.pca.mean_.astype('float32')
+        self.explained_variance = self.pca.explained_variance_.astype('float32')
+
     @train_required
     @batching
     def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
-        return np.matmul(vecs - self.mean, self.pca_components)
+        X_transformed = np.matmul(vecs - self.mean, self.pca_components)
+        if self.whiten:
+            X_transformed /= np.sqrt(self.explained_variance)
+        return X_transformed
 
 
 class PCALocalEncoder(BaseNumericEncoder):
diff --git a/gnes/encoder/numeric/standarder.py b/gnes/encoder/numeric/standarder.py
new file mode 100644
index 00000000..4eb7d604
--- /dev/null
+++ b/gnes/encoder/numeric/standarder.py
@@ -0,0 +1,45 @@
+# Tencent is pleased to support the open source community by making GNES available.
+#
+# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+
+from ..base import BaseNumericEncoder
+from ...helper import batching, train_required
+
+
+class StandarderEncoder(BaseNumericEncoder):
+    batch_size = 2048
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.mean = None
+        self.scale = None
+
+    def post_init(self):
+        from sklearn.preprocessing import StandardScaler
+        self.standarder = StandardScaler()
+
+    @batching
+    def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
+        self.standarder.partial_fit(vecs)
+
+        self.mean = self.standarder.mean_.astype('float32')
+        self.scale = self.standarder.scale_.astype('float32')
+
+    @train_required
+    @batching
+    def encode(self, vecs: np.ndarray, *args, **kwargs) -> np.ndarray:
+        return (vecs - self.mean) / self.scale
\ No newline at end of file
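
For context, a minimal sketch of what the new whiten option computes, written directly against sklearn's IncrementalPCA rather than the GNES encoder; the array shapes and names (demo_vecs, output_dim) are illustrative assumptions, not part of the change. Whitening divides each projected component by the square root of its explained variance so every output dimension has roughly unit variance.

# Sketch of the whitening step added to PCAEncoder.encode, assuming a fitted
# sklearn IncrementalPCA; variable names here are illustrative only.
import numpy as np
from sklearn.decomposition import IncrementalPCA

output_dim = 8
demo_vecs = np.random.random((2048, 128)).astype('float32')

pca = IncrementalPCA(n_components=output_dim)
pca.partial_fit(demo_vecs)

mean = pca.mean_.astype('float32')
components = np.transpose(pca.components_)            # shape (num_dim, output_dim)
explained_variance = pca.explained_variance_.astype('float32')

projected = np.matmul(demo_vecs - mean, components)   # plain PCA projection
whitened = projected / np.sqrt(explained_variance)    # whitened: ~unit variance per component

With this change, the encoder can be constructed as PCAEncoder(output_dim=8, whiten=True) to get the whitened projection; leaving whiten at its default False preserves the previous behaviour.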
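
Similarly, the new StandarderEncoder wraps sklearn's StandardScaler: train() learns a per-dimension mean and scale incrementally via partial_fit, and encode() z-scores the input. A small sketch of the equivalent computation follows, again with illustrative names and data; it is not the encoder itself.

# Sketch of the z-scoring performed by StandarderEncoder.encode, assuming a
# StandardScaler fitted incrementally with partial_fit; names are illustrative.
import numpy as np
from sklearn.preprocessing import StandardScaler

demo_vecs = np.random.random((2048, 128)).astype('float32')

scaler = StandardScaler()
scaler.partial_fit(demo_vecs)

mean = scaler.mean_.astype('float32')
scale = scaler.scale_.astype('float32')

encoded = (demo_vecs - mean) / scale   # matches scaler.transform(demo_vecs) up to float32 rounding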