add layerwise learning rate for adamw #35569
@@ -18,6 +18,7 @@
 from ..fluid import framework
 from ..fluid.framework import Variable
 from ..fluid.dygraph import base as imperative_base
+from collections import Callable
 import paddle

 _C_ops = core.ops
@@ -63,6 +64,10 @@ class AdamW(Adam):
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
         weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
+        lr_ratio (function|None, optional): If it is not None,
+            the learning rate will be updated with layerwise learning rate ratio.
+            Otherwise, the learning rate is the original.
+            Default: None.
         apply_decay_param_fun (function|None, optional): If it is not None,
             only tensors that makes apply_decay_param_fun(Tensor.name)==True
             will be updated with weight decay. It only works when we want to specify tensors.
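To make the documented behaviour concrete, here is a hedged usage sketch: the callable passed as `lr_ratio` receives a parameter tensor and returns the multiplier applied to that parameter's learning rate. The name check below relies on Paddle's default dygraph naming (`linear_1.*` for the second `Linear` layer created), which is an assumption of this example.

```python
import paddle

linear1 = paddle.nn.Linear(10, 10)
linear2 = paddle.nn.Linear(10, 10)

def lr_ratio_fn(param):
    # Per-parameter multiplier on the base learning rate:
    # the second linear layer trains at 10% of the base rate.
    return 0.1 if param.name.startswith("linear_1") else 1.0

opt = paddle.optimizer.AdamW(
    learning_rate=0.01,
    parameters=linear1.parameters() + linear2.parameters(),
    weight_decay=0.01,
    lr_ratio=lr_ratio_fn)
```

Nothing else changes in the training loop; the optimizer applies the scaled rate per parameter when stepping.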
@@ -140,6 +145,7 @@ def __init__(self,
                  epsilon=1e-8,
                  parameters=None,
                  weight_decay=0.01,
+                 lr_ratio=None,
                  apply_decay_param_fun=None,
                  grad_clip=None,
                  lazy_mode=False,
@@ -163,6 +169,12 @@
         self._apply_decay_param_fun = apply_decay_param_fun
         self._coeff = coeff
         self._lr_to_coeff = dict()
+        if lr_ratio is not None:
+            assert isinstance(lr_ratio, Callable)
+            if core.is_compiled_with_xpu() or core.is_compiled_with_npu():
+                raise NotImplementedError(
+                    "'lr_ratio' is unimplemented in XPU and NPU")
+        self._lr_ratio = lr_ratio

         super(AdamW, self).__init__(
             learning_rate=learning_rate,

Review comment: should add an explanation for the new lr_ratio argument, which should follow the explanation for "apply_decay_param_fun".
Reply: done.

Review comment: you should think about how many kernels will be affected by "lr_ratio".
Reply: done.
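Back in the diff: the new assert only requires `lr_ratio` to be callable, so any per-parameter policy works. A common one is layerwise decay by depth when fine-tuning a deep encoder; a minimal sketch, assuming parameter names embed an index like `encoder_layer_3` (the naming pattern is hypothetical):

```python
import re

def depth_decay_ratio(param, decay_rate=0.8, n_layers=12):
    # Hypothetical naming scheme: encoder parameters are named "...encoder_layer_<k>...".
    match = re.search(r"encoder_layer_(\d+)", param.name)
    if match is None:
        return 1.0  # embeddings, task heads, etc. keep the base learning rate
    depth = int(match.group(1))
    # Lower layers (small k) get a smaller fraction of the base rate.
    return decay_rate ** (n_layers - depth - 1)
```

Such a function would be passed as `lr_ratio=lambda p: depth_decay_ratio(p)` when constructing the optimizer.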
@@ -278,6 +290,8 @@ def _append_optimize_op(self, block, param_and_grad):

         # create the adamw optimize op
         if framework.in_dygraph_mode():
+            lr_ratio_ = 1. if self._lr_ratio is None else self._lr_ratio(
+                param_and_grad[0])

             _beta1 = self._beta1 if not isinstance(
                 self._beta1, Variable) else self._beta1.numpy().item(0)
@@ -288,7 +302,8 @@ def _append_optimize_op(self, block, param_and_grad):
                 beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
                 moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
                 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
-                1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff)
+                1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff,
+                "lr_ratio", lr_ratio_)

             return None

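In the dygraph branch, the computed ratio is forwarded to the C++ kernel as the `lr_ratio` attribute. As a rough sketch of the resulting per-parameter update, assuming the kernel scales the learning rate before both the Adam step and the decoupled weight decay (bias correction omitted for brevity):

```python
def adamw_update(param, grad, m1, m2, lr, beta1, beta2, eps, coeff, lr_ratio):
    # Simplified element-wise AdamW step for one parameter; works on floats or NumPy arrays.
    lr = lr * lr_ratio                             # per-parameter learning rate
    m1 = beta1 * m1 + (1 - beta1) * grad           # first moment estimate
    m2 = beta2 * m2 + (1 - beta2) * grad * grad    # second moment estimate
    param = param * (1 - lr * coeff)               # decoupled weight decay
    param = param - lr * m1 / (m2 ** 0.5 + eps)    # Adam step (bias correction omitted)
    return param, m1, m2
```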
@@ -321,6 +336,8 @@ def _append_optimize_op(self, block, param_and_grad):
             "multi_precision": find_master,
             "with_decay": with_decay,
             "coeff": self._coeff,
+            "lr_ratio": 1.
+            if self._lr_ratio is None else self._lr_ratio(param_and_grad[0])
         }

         if isinstance(self._beta1, Variable):
Review comment: Why add this argument to Adam? Is it because AdamW and Adam share the same .cc file? In that case, AdamW should have its own .cc file.
Reply: AdamWOpMaker inherits from AdamOpMaker, and both use the InferShape function of AdamOp. In this case, 'lr_ratio' has no effect on Adam.
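As a rough Python analogue of the point made in the reply (the classes below are illustrative, not Paddle code): an argument introduced only in a subclass leaves the base class's interface and behaviour untouched.

```python
class BaseOptimizer:
    def __init__(self, learning_rate=0.001):
        self.learning_rate = learning_rate

class DecoupledOptimizer(BaseOptimizer):
    def __init__(self, learning_rate=0.001, lr_ratio=None):
        super().__init__(learning_rate=learning_rate)
        self.lr_ratio = lr_ratio  # only the subclass knows about lr_ratio

BaseOptimizer()                             # unaffected by the new argument
DecoupledOptimizer(lr_ratio=lambda p: 1.0)  # subclass accepts and stores it
```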