-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdeepspeed.py
390 lines (308 loc) · 18.3 KB
/
deepspeed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Integration with Deepspeed
"""
import importlib.util #Python的importlib.util模块提供了一系列用于加载、导入、和查找Python模块的工具。
import weakref #weakref模块用于创建弱引用,允许引用对象但不会增加其引用计数。
from functools import partialmethod #functools模块中导入partialmethod函数,这个函数用于创建偏函数,可以固定一部分参数。
from .dependency_versions_check import dep_version_check #从当前包的dependency_versions_check模块中导入dep_version_check函数,这个函数用于检查依赖项的版本。
from .utils import is_accelerate_available, is_torch_available, logging #从当前包的utils模块中导入is_accelerate_available和is_torch_available函数,以及logging模块。
if is_torch_available(): #如果PyTorch库可用,就导入它。
import torch
logger = logging.get_logger(__name__) #使用logging模块的get_logger函数创建一个日志记录器。
def is_deepspeed_available(): #定义一个函数is_deepspeed_available,用于检查DeepSpeed库是否可用。
return importlib.util.find_spec("deepspeed") is not None
if is_accelerate_available() and is_deepspeed_available(): #如果Accelerate库和DeepSpeed库都可用,就从Accelerate的utils.deepspeed模块中导入
from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig #HfDeepSpeedConfig类,并将其重命名为DeepSpeedConfig。
else:
# Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file.
# Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available.
from builtins import object as DeepSpeedConfig #如果Accelerate库和DeepSpeed库有任何一个不可用,就从Python内置的builtins模块中导入object类,并将其重命名为DeepSpeedConfig。
class HfDeepSpeedConfig(DeepSpeedConfig): #定义一个新的类HfDeepSpeedConfig,这个类继承自DeepSpeedConfig。
"""
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
it's important that this object remains alive while the program is still running.
[`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
the DeepSpeed configuration is not modified in any way.
Args:
config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
"""
def __init__(self, config_file_or_dict): #定义这个类的构造方法,它接受一个参数,可以是DeepSpeed配置文件的路径,也可以是配置的字典。
# set global weakref object
set_hf_deepspeed_config(self) #调用set_hf_deepspeed_config函数,将当前的HfDeepSpeedConfig实例设置为全局的弱引用对象。
dep_version_check("accelerate") #检查Accelerate和DeepSpeed的版本。
dep_version_check("deepspeed")
super().__init__(config_file_or_dict) #调用父类DeepSpeedConfig的构造方法,传入配置文件的路径或配置的字典。
#总的来说,这段代码主要定义了一个新的类HfDeepSpeedConfig,用于处理和查询DeepSpeed的配置。它继承自Accelerate库中的HfDeepSpeedConfig类,或者在Accelerate和DeepSpeed库都不可用时,继承自Python内置的object类。
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
same lifespan as the latter.
"""
def __init__(self, config_file_or_dict):
super().__init__(config_file_or_dict)
self._dtype = None
self.mismatches = []
def dtype(self):
if self._dtype is None:
raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
return self._dtype
def is_auto(self, ds_key_long):
val = self.get_value(ds_key_long)
if val is None:
return False
else:
return val == "auto"
def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
"""
A utility method that massages the config file and can optionally verify that the values match.
1. Replace "auto" values with `TrainingArguments` value.
2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
config values and if mismatched add the entry to `self.mismatched` - will assert during
`trainer_config_finalize` for one or more mismatches.
"""
config, ds_key = self.find_config_node(ds_key_long)
if config is None:
return
if config.get(ds_key) == "auto":
config[ds_key] = hf_val
return
if not must_match:
return
ds_val = config.get(ds_key)
if ds_val is not None and ds_val != hf_val:
self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}")
fill_only = partialmethod(fill_match, must_match=False)
def trainer_config_process(self, args):
"""
Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
"""
# DeepSpeed does:
# train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
self.fill_match(
"train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size"
)
self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps")
self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)")
self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm")
self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate")
self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2")
self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon")
self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay")
self.fill_only("scheduler.params.warmup_min_lr", 0) # not a trainer arg
self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate")
# total_num_steps - will get set in trainer_config_finalize
# fp16
if args.fp16 or args.fp16_full_eval:
fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
else:
fp16_backend = None
if args.save_on_each_node:
# deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True
self.config["checkpoint"] = self.config.get("checkpoint", {})
self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node
# amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
# any here unless the user did the work
self.fill_match(
"fp16.enabled",
((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
"fp16|fp16_full_eval+fp16_backend(amp)",
)
# apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
# ZeRO features
self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")
self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")
# deepspeed's default mode is fp16 unless there is a config that says differently
if self.is_true("bf16.enabled"):
self._dtype = torch.bfloat16
elif self.is_false("fp16.enabled"):
self._dtype = torch.float32
else:
self._dtype = torch.float16
def trainer_config_finalize(self, args, model, num_training_steps):
"""
This stage is run after we have the model and know num_training_steps.
Now we can complete the configuration process.
"""
# zero
# deal with config keys that use `auto` value and rely on model's hidden_size
hidden_size_based_keys = [
"zero_optimization.reduce_bucket_size",
"zero_optimization.stage3_prefetch_bucket_size",
"zero_optimization.stage3_param_persistence_threshold",
]
hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)]
if len(hidden_size_auto_keys) > 0:
if hasattr(model.config, "hidden_size"):
hidden_size = model.config.hidden_size
elif hasattr(model.config, "hidden_sizes"):
# if there are many hidden sizes pick the largest one
hidden_size = max(model.config.hidden_sizes)
else:
raise ValueError(
"The model's config file has neither `hidden_size` nor `hidden_sizes` entry, "
"therefore it's not possible to automatically fill out the following `auto` entries "
f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing "
"`auto` values for these keys with an integer value of your choice."
)
self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size)
if self.is_zero3():
# automatically assign the optimal config values based on model config
self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size)
self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size)
# scheduler
self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)")
self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps")
if len(self.mismatches) > 0:
mismatches = "\n".join(self.mismatches)
raise ValueError(
"Please correct the following DeepSpeed config values that mismatch TrainingArguments"
f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
)
# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle
_hf_deepspeed_config_weak_ref = None
def set_hf_deepspeed_config(hf_deepspeed_config_obj):
# this is a special weakref global object to allow us to get to Deepspeed config from APIs
# that don't have an easy way to get to the Deepspeed config outside of the Trainer domain.
global _hf_deepspeed_config_weak_ref
# will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed)
_hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj)
def unset_hf_deepspeed_config():
# useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method
global _hf_deepspeed_config_weak_ref
_hf_deepspeed_config_weak_ref = None
def is_deepspeed_zero3_enabled():
if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
return _hf_deepspeed_config_weak_ref().is_zero3()
else:
return False
def deepspeed_config():
if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
return _hf_deepspeed_config_weak_ref().config
else:
return None
def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters):
"""
A convenience wrapper that deals with optimizer and lr scheduler configuration.
"""
from accelerate.utils import DummyOptim, DummyScheduler
config = hf_deepspeed_config.config
# Optimizer + Scheduler
# Currently supported combos:
# 1. DS scheduler + DS optimizer: Yes
# 2. HF scheduler + HF optimizer: Yes
# 3. DS scheduler + HF optimizer: Yes
# 4. HF scheduler + DS optimizer: No
#
# Unless Offload is enabled in which case it's:
# 1. DS scheduler + DS optimizer: Yes
# 2. HF scheduler + HF optimizer: Mostly*
# 3. DS scheduler + HF optimizer: Mostly*
# 4. HF scheduler + DS optimizer: No
#
# Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB)
optimizer = None
if "optimizer" in config:
if args.adafactor:
raise ValueError(
"--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. "
"Only one optimizer can be configured."
)
optimizer = DummyOptim(params=model_parameters)
else:
if hf_deepspeed_config.is_offload():
logger.info(
"Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the"
" custom optimizer has both CPU and GPU implementation (except LAMB)"
)
# ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
# But trainer uses AdamW by default.
optimizer = trainer.create_optimizer()
# To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer`
config["zero_allow_untested_optimizer"] = True
lr_scheduler = None
if "scheduler" in config:
lr_scheduler = DummyScheduler(optimizer)
else:
if isinstance(optimizer, DummyOptim):
raise ValueError(
"Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. "
"Please configure a scheduler in the DeepSpeed config."
)
lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
return optimizer, lr_scheduler
def deepspeed_init(trainer, num_training_steps, inference=False):
"""
Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
Args:
trainer: Trainer object
num_training_steps: per single gpu
resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
inference: launch in inference mode (no optimizer and no lr scheduler)
Returns: optimizer, lr_scheduler
We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612
"""
from deepspeed.utils import logger as ds_logger
model = trainer.model
args = trainer.args
hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config
# resume config update - some bits like `model` and `num_training_steps` only become available during train
hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)
# set the Deepspeed log level consistent with the Trainer
ds_logger.setLevel(args.get_process_log_level())
if inference:
# only Z3 makes sense for the inference
if not hf_deepspeed_config.is_zero3():
raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")
# in case the training config is re-used for inference
hf_deepspeed_config.del_config_sub_tree("optimizer")
hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
optimizer, lr_scheduler = None, None
model_parameters = None
else:
trainer.optimizer = None # important for when deepspeed_init is used as re-init
model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer, lr_scheduler = deepspeed_optim_sched(
trainer, hf_deepspeed_config, args, num_training_steps, model_parameters
)
# keep for quick debug:
# from pprint import pprint; pprint(config)
return optimizer, lr_scheduler
def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path):
# it's possible that the user is trying to resume from model_path, which doesn't necessarily
# contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's
# a resume from a checkpoint and not just a local pretrained weight. So we check here if the
# path contains what looks like a deepspeed checkpoint
import glob
deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*"))
if len(deepspeed_checkpoint_dirs) > 0:
logger.info(f"Attempting to resume from {checkpoint_path}")
# this magically updates self.optimizer and self.lr_scheduler
load_path, _ = deepspeed_engine.load_checkpoint(
checkpoint_path, load_optimizer_states=True, load_lr_scheduler_states=True
)
if load_path is None:
raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}")
else:
raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}")