Implement LAD and Huber losses in the gradient boosting method
wwyf committed Oct 10, 2018
1 parent 516afe2 commit 1c365b9
Showing 7 changed files with 92 additions and 19 deletions.
File renamed without changes.
File renamed without changes.
73 changes: 58 additions & 15 deletions emsemble/regression.py → ensemble/regression.py
@@ -1,44 +1,86 @@
 from pyml.tree.regression import DecisionTreeRegressor
 from pyml.metrics.pairwise import euclidean_distance
+from pyml.metrics.pairwise import absolute_distance
+import numpy as np
+from pyml.logger import logger
 
 # TODO: squared error, absolute-value error, or Huber loss?
 # TODO: how to pass parameters to the base regressor
 
 class GradientBoostingRegression():
     def __init__(self,
+                 loss='ls',
                  learning_rate=0.1,
                  base_estimator=DecisionTreeRegressor,
-                 n_estimators=500,
-                 random_state=None,
-                 max_tree_node_size=10
+                 n_estimators=100,
+                 max_tree_node_size=10,
+                 delta=0.5,
+                 random_state=None
                  ):
+        """
+        Parameters
+        ------------
+        loss : loss function to be optimized
+            - 'ls' least squares
+            - 'lad' least absolute deviation
+            - 'huber' Huber loss (squared error within delta, absolute error beyond)
+        """
+        self.loss = loss
         self.estimators = []
         self.n_estimators = n_estimators
         self.base_estimator = base_estimator
         self.learning_rate = learning_rate
         self.max_tree_node_size = max_tree_node_size
+        self.delta = delta
+        # key='f' : a list of estimator
+        # key='lr' : a list of learning_rate
         self.parameters = {
             'f' : [],
             'lr' : []
         }
-        # key='f' : a list of estimator
-        # key='lr' : a list of learning_rate
 
     def optimizer(self, X, Y, watch=False):
         """
        Run one training iteration.
         """
+        logger.debug('X : \n{}\nY : {}'.format(X, Y))
         cur_Y_pred = self.predict(X)
-        # print('cur_Y_pred : ', cur_Y_pred)
-
-        # compute the cost
-        cost = euclidean_distance(cur_Y_pred, Y)
-
-        # compute the residual, i.e. the gradient
-        d_fx = cur_Y_pred - Y
-        # print('d_fx : ', d_fx)
-
-        # negate the gradient
-        d_fx = - d_fx
+        logger.debug('cur_Y_pred : {}'.format(cur_Y_pred))
+
+        if self.loss == 'ls':
+            # squared-error cost: sum of squares divided by 2 (to match the gradient below)
+            cost = np.square(cur_Y_pred - Y).sum() / 2
+            # compute the residual, i.e. the gradient
+            d_fx = cur_Y_pred - Y
+            logger.debug('d_fx : {}'.format(d_fx))
+            # negate the gradient
+            d_fx = - d_fx
+        elif self.loss == 'lad':
+            cost = absolute_distance(cur_Y_pred, Y)
+            d_fx = np.sign(cur_Y_pred - Y)
+            d_fx = - d_fx
+        elif self.loss == 'huber':
+            # compute the cost
+            deviation = cur_Y_pred - Y
+            logger.debug('deviation : {}'.format(deviation))
+            abs_deviation = np.abs(deviation)
+            logger.debug('abs_deviation : {}'.format(abs_deviation))
+            small_part_index = abs_deviation <= self.delta
+            big_part_index = abs_deviation > self.delta
+            # quadratic part: residuals with |deviation| <= delta,
+            # halved so the cost stays continuous at |deviation| == delta
+            cost = np.square(abs_deviation[small_part_index]).sum() / 2
+            logger.debug('cost : {}'.format(cost))
+            # linear part: residuals with |deviation| > delta
+            cost += self.delta * (abs_deviation[big_part_index] - self.delta / 2).sum()
+            logger.debug('cost : {}'.format(cost))
+            d_fx = np.zeros(Y.shape)
+            d_fx[small_part_index] = deviation[small_part_index]
+            logger.debug('d_fx : {}'.format(d_fx))
+            d_fx[big_part_index] = self.delta * np.sign(deviation[big_part_index])
+            logger.debug('d_fx : {}'.format(d_fx))
+            d_fx = -d_fx
+        else:
+            raise NotImplementedError
 
         # learning rate: currently just the value passed at initialization
         lr = self.learning_rate
@@ -51,6 +93,7 @@ def optimizer(self, X, Y, watch=False):
         return cost
 
     def fit(self, X, Y, watch=False):
+        logger.debug('X : \n{} Y : {}'.format(X, Y))
         init_estimator = self.base_estimator(max_node_size=self.max_tree_node_size)
         init_estimator.fit(X,Y)
         self.parameters['f'].append(init_estimator)
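For reference, the 'huber' branch above implements the standard piecewise loss: quadratic (r**2 / 2) while |r| <= delta, linear (delta * (|r| - delta/2)) beyond it, so the two pieces meet at |r| = delta and the negative gradient reduces to a clip. A minimal standalone sketch, numpy only; the helper name is hypothetical, not part of the repository:

import numpy as np

def huber_cost_and_neg_grad(y_pred, y, delta=0.5):
    # piecewise Huber cost, matching the 'huber' branch of optimizer()
    r = y_pred - y
    small = np.abs(r) <= delta
    cost = (np.square(r[small]).sum() / 2
            + delta * (np.abs(r[~small]) - delta / 2).sum())
    # the gradient is r where |r| <= delta and delta*sign(r) elsewhere,
    # so the negative gradient collapses to a single clip
    neg_grad = -np.clip(r, -delta, delta)
    return cost, neg_grad

print(huber_cost_and_neg_grad(np.array([1.0, 0.2]), np.array([0.0, 0.0])))
# cost = 0.2**2/2 + 0.5*(1.0 - 0.25) ~= 0.395, neg_grad = [-0.5, -0.2]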
@@ -1,6 +1,7 @@
 import numpy as np
-from pyml.emsemble.regression import GradientBoostingRegression
+from pyml.ensemble.regression import GradientBoostingRegression
 from pyml.tree.regression import DecisionTreeRegressor
+from pyml.logger import logger
 
 
 if __name__ == '__main__':
@@ -23,7 +24,7 @@
     mini_standard_out_Y = np.array([
         2.5,4.5
     ])
-    # rgs = GradientBoostingRegression()
-    rgs = GradientBoostingRegression(learning_rate=0.1, base_estimator=DecisionTreeRegressor, max_tree_node_size=2, n_estimators=200)
+    # logger.setLevel(10)
+    rgs = GradientBoostingRegression(loss='huber', learning_rate=0.05, base_estimator=DecisionTreeRegressor, max_tree_node_size=2, n_estimators=500)
     rgs.fit(mini_train_X,mini_train_Y)
     print(rgs.predict(mini_test_X))
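The regression.py diff above truncates fit() and predict(), which this test drives as a black box. Given the data layout in __init__ ('f' holds the fitted estimators, 'lr' the matching learning rates), a plausible sketch of the additive prediction F(x) = sum_m lr_m * f_m(x) that consumes the negative gradients computed in optimizer(); the function name is hypothetical and this is an assumption about the elided code, not the repository's exact method:

import numpy as np

def boosted_predict(parameters, X):
    # sum the scaled contribution of every fitted base estimator
    pred = np.zeros(len(X))
    for f_m, lr_m in zip(parameters['f'], parameters['lr']):
        pred += lr_m * np.asarray(f_m.predict(X))
    return pred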
2 changes: 1 addition & 1 deletion logger/__init__.py
@@ -4,7 +4,7 @@
 logger = logging.getLogger('simple_example')
 formatter = logging.Formatter('[%(levelname)8s] - [%(module)10s] - [%(lineno)3d] - [%(funcName)10s] \n%(message)s\n')
 
-logger.setLevel(logging.INFO)
+logger.setLevel(logging.WARNING)
 ch = logging.StreamHandler()
 # ch.setLevel(logging.DEBUG)
 ch.setFormatter(formatter)
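A note on the numeric levels in play: the stdlib logging module defines DEBUG=10, INFO=20, WARNING=30, so with the logger raised to WARNING the logger.debug(...) calls added in regression.py stay silent unless the level is lowered, and the commented-out logger.setLevel(10) in the test above is equivalent to setLevel(logging.DEBUG). Shown for reference; these are standard library constants:

import logging

assert logging.DEBUG == 10
assert logging.INFO == 20
assert logging.WARNING == 30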
23 changes: 23 additions & 0 deletions metrics/pairwise.py
@@ -75,6 +75,29 @@ def euclidean_distance(vec1, vec2):
     assert(vec1.shape[1] == vec2.shape[1])
     return np.linalg.norm(vec1-vec2)
 
+def absolute_distance(vec1, vec2):
+    """the absolute (L1) distance of two vectors
+    Parameters
+    -----------
+    vec1: shape (1, n_features) or shape (n_features,)
+    vec2: shape (1, n_features) or shape (n_features,)
+    Returns
+    -------
+    distance : scalar, the sum of absolute coordinate differences
+    """
+    vec1 = vec1.reshape((1,-1))
+    vec2 = vec2.reshape((1,-1))
+    assert(vec1.shape[0] == 1)
+    assert(vec2.shape[0] == 1)
+    assert(vec1.shape[1] == vec2.shape[1])
+    return np.abs(vec1-vec2).sum()
+
 def l_p_distance(vec1, vec2, p=1):
     """ calculate the p(default 1) norm of two vectors
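The new absolute_distance is the L1 (Manhattan) metric. Judging by the docstring of the neighboring l_p_distance, the two should agree at its default p=1; that equivalence is an assumption about l_p_distance's behavior, not something this diff shows:

import numpy as np
from pyml.metrics.pairwise import absolute_distance, l_p_distance

v1 = np.array([[2, 4]])
v2 = np.array([[1, 6]])
assert absolute_distance(v1, v2) == 3   # |2-1| + |4-6| = 3
assert l_p_distance(v1, v2, p=1) == 3   # assumed equal at p=1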
6 changes: 6 additions & 0 deletions metrics/tests/test_pairwise.py
@@ -3,6 +3,7 @@
 from math import sqrt
 
 from pyml.metrics.pairwise import euclidean_distance
+from pyml.metrics.pairwise import absolute_distance
 from pyml.metrics.pairwise import l_p_distance
 from pyml.metrics.pairwise import cosine_similarity
 from pyml.metrics.pairwise import cosine_distance
@@ -38,6 +39,11 @@ def test_euclidean_distance(self):
         v1 = np.array([[1,1]])
         v2 = np.array([[0,0]])
         self.assertAlmostEqual(euclidean_distance(v1,v2),1.41421356)
 
+    def test_absolute_distance(self):
+        v1 = np.array([[2,4]])
+        v2 = np.array([[1,6]])
+        self.assertAlmostEqual(absolute_distance(v1, v2), 3)
 
 if __name__ == '__main__':
     unittest.main()
