-
Notifications
You must be signed in to change notification settings - Fork 100
/
Copy pathmeg_gpt2_base_n4.slurm
135 lines (114 loc) · 4.67 KB
/
meg_gpt2_base_n4.slurm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash
#SBATCH --job-name=meg_gpt2_base_n4_dp1_tp4_pp4
#SBATCH --constraint=v100-32g
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=40 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:4 # number of gpus
#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --error=%x-%j.out # error file name (same to watch just one file)
#SBATCH --account=six@gpu
# Trace every command (-x) and stop at the first failing one (-e).
set -xe

# Pull in the shared environment script from the work area.
# NOTE(review): presumably this activates the production python/conda
# environment - confirm against start-prod itself.
. ${six_ALL_CCFRWORK}/start-prod

# Record the GPU state of the launching node in the job log.
nvidia-smi

# Work from the Megatron-LM checkout; CMD later locates pretrain_gpt.py
# relative to the current directory.
cd ${six_ALL_CCFRWORK}/code/megatron-lm/

# Released 345M checkpoint directory; only used here as the location of the
# tokenizer vocabulary and merges files.
CHECKPOINT_PATH=${six_ALL_CCFRWORK}/models-custom/megatron-gpt2/megatron_lm_345m_v0.0/release
VOCAB_FILE=${CHECKPOINT_PATH}/gpt2-vocab.json
MERGE_FILE=${CHECKPOINT_PATH}/gpt2-merges.txt

# Training data prefix and the directory used for both --save and --load.
DATA_PATH=${six_ALL_CCFRWORK}/datasets-custom/openwebtext-10k/meg-gpt2_text_document
SAVE_CHECKPOINT_PATH=${six_ALL_CCFRSCRATCH}/checkpoints/gpt2-1-node

# Rendezvous endpoint: the first hostname of the allocation and a fixed port.
MASTER_PORT=6000
MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
# adjust depending on the number of the nodes
NNODES=4
PP_SIZE=4 # pipeline-parallel size; NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=1
PP_CHUNKS=4 # factor of GLOBAL_BATCH_SIZE (micro batches per global batch)
# Model-size preset key; must be one of the keys handled by set_model_dims.
# NOTE(review): the keys look like approximate sizes in billions of
# parameters - confirm.
MSIZE=18

#######################################
# Map a model-size preset onto the hidden size and the number of layers.
# Globals:   NHIDDEN, NLAYERS (written)
# Arguments: $1 - preset key (7, 14, 18, 25, ...)
# Outputs:   error message on stderr for an unknown key
# Returns:   0 on success, 1 if the key is not a known preset
#######################################
set_model_dims() {
  case "$1" in
    7)   NHIDDEN=4096;  NLAYERS=36 ;;
    14)  NHIDDEN=6144;  NLAYERS=32 ;;
    18)  NHIDDEN=6144;  NLAYERS=40 ;;
    25)  NHIDDEN=7168;  NLAYERS=40 ;;
    30)  NHIDDEN=7168;  NLAYERS=48 ;;
    39)  NHIDDEN=8192;  NLAYERS=48 ;;
    52)  NHIDDEN=8192;  NLAYERS=64 ;;
    65)  NHIDDEN=9216;  NLAYERS=64 ;;
    81)  NHIDDEN=10240; NLAYERS=64 ;;
    97)  NHIDDEN=11264; NLAYERS=64 ;;
    116) NHIDDEN=12288; NLAYERS=64 ;;
    136) NHIDDEN=13312; NLAYERS=64 ;;
    158) NHIDDEN=14336; NLAYERS=64 ;;
    181) NHIDDEN=15360; NLAYERS=64 ;;
    206) NHIDDEN=16384; NLAYERS=64 ;;
    *)
      echo "invalid MSIZE: $1" >&2
      return 1
      ;;
  esac
}

# Abort on an unknown preset instead of continuing with NHIDDEN/NLAYERS unset,
# which would only surface later as a confusing failure inside the launch.
set_model_dims "$MSIZE" || exit 1

# Enforce the constraint stated next to PP_SIZE above.
if (( NLAYERS % PP_SIZE != 0 )); then
  echo "NLAYERS ($NLAYERS) must be a multiple of PP_SIZE ($PP_SIZE)" >&2
  exit 1
fi
# Fixed model / tokenizer dimensions.
NHEADS=32
SEQ_LEN=1024
VOCAB_SIZE=50257

# Parallel layout: GPUs per node plus data- and tensor-parallel degrees.
GPUS_PER_NODE=4
DP_SIZE=1
TP_SIZE=4

# Derived values. GLOBAL_BATCH_SIZE is the product of the micro batch size,
# the number of pipeline chunks and the data-parallel degree; WORLD_SIZE is
# the GPU count over all nodes.
GLOBAL_BATCH_SIZE=$(( ${MICRO_BATCH_SIZE} * ${PP_CHUNKS} * ${DP_SIZE} ))
WORLD_SIZE=$(( ${GPUS_PER_NODE} * ${NNODES} ))
# Model, batch, optimizer, tokenizer and precision flags for pretrain_gpt.py.
# This stays a single flat string (not an array) because it is spliced into
# the exported CMD string, which is word-split by the bash that srun starts.
# Every line inside the quotes must end with a backslash so that the value is
# one line; a missing one embeds a newline into the flag list.
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--train-iters 1000 \
--lr-decay-iters 800 \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--fp16 \
--checkpoint-activations \
"
# Logging, checkpoint-saving and evaluation cadence flags for pretrain_gpt.py,
# assembled piece by piece into one flat, space-separated string.
OUTPUT_ARGS=" --log-interval 1"
OUTPUT_ARGS+=" --save-interval 500"
OUTPUT_ARGS+=" --eval-interval 100"
OUTPUT_ARGS+=" --eval-iters 10 "
# Per-node launcher command line. It is exported because it is expanded
# inside the bash process that srun starts, not in this script.
export LAUNCHER="python -u -m torch.distributed.launch"
LAUNCHER+=" --nproc_per_node $GPUS_PER_NODE"
LAUNCHER+=" --nnodes $NNODES"
LAUNCHER+=" --master_addr $MASTER_ADDR"
LAUNCHER+=" --master_port $MASTER_PORT "

# Training script (resolved against the current directory) followed by all of
# its arguments; exported for the same reason as LAUNCHER.
export CMD=" $(pwd)/pretrain_gpt.py"
CMD+=" --tensor-model-parallel-size $TP_SIZE"
CMD+=" --pipeline-model-parallel-size $PP_SIZE"
CMD+=" $GPT_ARGS"
CMD+=" $OUTPUT_ARGS"
# The same directory is used for writing and for resuming checkpoints.
CMD+=" --save $SAVE_CHECKPOINT_PATH"
CMD+=" --load $SAVE_CHECKPOINT_PATH"
CMD+=" --data-path $DATA_PATH"
CMD+=" --data-impl mmap"
CMD+=" --split 949,50,1"
CMD+=" --distributed-backend nccl "
# clear old checkpoint as it'd mismatch while we sort things out
# The directory removed must be the one handed to --save/--load
# (SAVE_CHECKPOINT_PATH); deleting any other path leaves the stale checkpoint
# in place, where --load would pick it up.
# ${...:?} aborts the script rather than expanding to an empty path if the
# variable is unset or empty.
rm -rf -- "${SAVE_CHECKPOINT_PATH:?}"
# model size
# Prints an estimate of the parameter count, in billions, from the hidden
# size (h), number of layers (l), sequence length (s) and vocabulary size (v)
# using the inline formula below. The shell variables are substituted into
# the python source text before python runs.
python -c "h=$NHIDDEN; l=$NLAYERS; s=$SEQ_LEN; v=$VOCAB_SIZE; print(f'Model size: {(l * (12*h**2 + 13*h) + (v * h) + (s * h) ) / 10**9 :.0f}B')"
# to debug - add echo (it exits and prints what it would have launched)
# The single quotes are deliberate: $LAUNCHER, $SLURM_PROCID and $CMD are
# expanded by the bash that srun starts for each task, not by this script.
# LAUNCHER and CMD are visible there because they are exported. With one task
# per node (see --ntasks-per-node above) SLURM_PROCID serves as the node rank.
srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD'
# Sample log output from a run of this configuration, kept for reference:
# iteration 190/ 1000 | consumed samples: 760 | elapsed time per iteration (ms): 1381.7 | learning
# rate: 1.359E-04 | global batch size: 4 | lm loss: 7.416655E+00 | loss scale: 16384.0 | grad norm:
# 2.521 | number of skipped iterations: 0 | number of nan iterations: 0 | time (ms) |
# forward-compute: 175.98 | forward-recv: 126.42 | backward-compute: 515.29 | backward-send: 0.67 |
# backward-send-forward-recv: 4.75 | backward-params-all-reduce: 23.18 |
# backward-embedding-all-reduce: 419.14 | optimizer-copy-to-main-grad: 11.09 |
# optimizer-unscale-and-check-inf: 25.63 | optimizer-clip-main-grad: 19.49 |
# optimizer-copy-main-to-model-params: 11.34 | optimizer: 115.19 | batch-generator: 2.54