Commit

updated aquila to fit torch2.0
Signed-off-by: ftgreat <[email protected]>
ftgreat committed Aug 7, 2023
1 parent ec3d8cd commit 5939fab
Showing 14 changed files with 30 additions and 25 deletions.
6 changes: 3 additions & 3 deletions examples/Aquila/Aquila-chat/Aquila-chat-lora.yaml
@@ -1,4 +1,4 @@
-batch_size: 16
+batch_size: 1
epochs: 5
gradient_accumulation_steps: 1
lr: 4.0e-5
@@ -16,5 +16,5 @@ enable_flash_attn_models: False
eps: 1.0e-8
lora: True

-enable_sft_dataset_dir: '/data2/yzd/FlagAI/convo_v2/'
-enable_sft_dataset_file: 'sft_train_english_alpaca.jsonl'
+enable_sft_dataset_dir: './data/'
+enable_sft_dataset_file: 'convo_samples.jsonl'
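
For reference, the two relocated dataset keys resolve to the SFT file the trainer reads; a minimal sketch of reading them back, assuming PyYAML (already pinned in setup.py) and a repo-relative working directory:

import os
import yaml  # PyYAML, listed in setup.py

# Read the LoRA fine-tuning config and resolve the new SFT dataset location.
with open("examples/Aquila/Aquila-chat/Aquila-chat-lora.yaml") as f:
    cfg = yaml.safe_load(f)

sft_file = os.path.join(cfg["enable_sft_dataset_dir"], cfg["enable_sft_dataset_file"])
print(cfg["batch_size"], sft_file)  # expected: 1 ./data/convo_samples.jsonl
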
7 changes: 6 additions & 1 deletion examples/Aquila/Aquila-chat/generate_chat_lora.py
@@ -20,7 +20,7 @@
use_cache=True,
fp16=True,
device='cuda',
-adapter_dir='/data2/yzd/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquila_experiment75new/2023070515/') # eg: /mnt/yzd/git/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquila_experiment/2023062909
+adapter_dir='/data2/yzd/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquila_experiment/2023080216/') # Directory to adapter_model.bin and adapter_config.json
model = loader.get_model()

tokenizer = loader.get_tokenizer()
@@ -32,6 +32,11 @@

texts = [
"Find the product of the numbers: 5 and 8",
"Create a list of potential topics for a company newsletter",
"Explain the theory of relativity in simple terms.",
"Write a short story about a dragon and a knight.",
"翻译成英文: '我饿了想吃饭'",
"write a fairy tale for me",
]

for text in texts:
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-chat/hostfile
@@ -1 +1 @@
-192.168.21.7 slots=2
+192.168.20.3 slots=1
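
The hostfile uses the usual 'IP slots=N' launcher convention. A hedged Python sketch of parsing such a file (the helper is illustrative, not part of FlagAI):

def parse_hostfile(path="examples/Aquila/Aquila-chat/hostfile"):
    """Return (host, slot_count) pairs from an 'IP slots=N' style hostfile."""
    hosts = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blanks and comments
            host, slots = line.split()
            hosts.append((host, int(slots.split("=")[1])))
    return hosts

print(parse_hostfile())  # expected: [('192.168.20.3', 1)]
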
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-code/aquila_code_pretrain.py
@@ -77,7 +77,7 @@
# avoid sync loading models in case of Mem OOM
if env_args.bmt_async_load:
import time
-time.sleep(10 * 60 * (trainer.local_rank % 4))
+time.sleep(10 * 60 * (os.environ['LOCAL_RANK'] % 4))

config_file = os.path.join(cache_dir, 'config.json')
model = AQUILAModel.init_from_json(config_file=config_file)
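
One detail worth keeping in mind when reading the new line: torchrun exposes LOCAL_RANK as a string environment variable, so a standalone version of this staggered-loading trick would cast it before the arithmetic. A minimal sketch under that assumption (not the repository's code):

import os
import time

# Stagger checkpoint loading across local ranks so that several workers on
# one host do not read the model into memory at the same moment.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # set by torchrun
time.sleep(10 * 60 * (local_rank % 4))
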
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-code/bmtrain_mgpu.sh
@@ -80,7 +80,7 @@ OPTS=" --batch_size $BATCH_SIZE \
## Trigger job on Each Node when bmt or ddp.

mkdir -p $PRE_LOAD_DIR
-python -m torch.distributed.launch \
+torchrun \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
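
torchrun is the launcher recommended for torch 2.0, replacing the deprecated python -m torch.distributed.launch; workers then read their rank from environment variables instead of a --local_rank argument. A generic, hedged sketch of the worker-side setup (not code from these scripts):

import os
import torch
import torch.distributed as dist

# torchrun sets RANK, LOCAL_RANK and WORLD_SIZE for every spawned process.
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])

dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(local_rank)
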
5 changes: 2 additions & 3 deletions examples/Aquila/Aquila-pretrain/aquila_pretrain.py
@@ -5,7 +5,6 @@
import torch
from torch.utils.data import Dataset
import gc
-
gc.collect()
torch.cuda.empty_cache()
from flagai.auto_model.auto_loader import AutoLoader
@@ -62,7 +61,7 @@
import sys
sys.exit(0)

print(f"Trainer effective env_args={env_args} local_rank={trainer.local_rank}",
print(f"Trainer effective env_args={env_args} local_rank={os.environ['LOCAL_RANK']}",
flush=True)
checkpoints = env_args.pre_load_dir
model_name = env_args.model_name
@@ -77,7 +76,7 @@
# avoid sync loading models in case of Mem OOM
if env_args.bmt_async_load:
import time
-time.sleep(10 * 60 * (trainer.local_rank % 4))
+time.sleep(10 * 60 * (os.environ['LOCAL_RANK'] % 4))

config_file = os.path.join(cache_dir, 'config.json')
model = AQUILAModel.init_from_json(config_file=config_file)
2 changes: 1 addition & 1 deletion examples/Aquila/Aquila-pretrain/bmtrain_mgpu.sh
@@ -80,7 +80,7 @@ OPTS=" --batch_size $BATCH_SIZE \
## Trigger job on Each Node when bmt or ddp.

mkdir -p $PRE_LOAD_DIR
-python -m torch.distributed.launch \
+torchrun \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
4 changes: 2 additions & 2 deletions examples/Aquila/aquila_pretrain.py
@@ -62,7 +62,7 @@
import sys
sys.exit(0)

print(f"Trainer effective env_args={env_args} local_rank={trainer.local_rank}",
print(f"Trainer effective env_args={env_args} local_rank={os.environ['LOCAL_RANK']}",
flush=True)
checkpoints = env_args.pre_load_dir
model_name = env_args.model_name
@@ -77,7 +77,7 @@
# avoid sync loading models in case of Mem OOM
if env_args.bmt_async_load:
import time
-time.sleep(10 * 60 * (trainer.local_rank % 4))
+time.sleep(10 * 60 * (os.environ['LOCAL_RANK'] % 4))

config_file = os.path.join(cache_dir, 'config.json')
model = AQUILAModel.init_from_json(config_file=config_file)
2 changes: 1 addition & 1 deletion examples/Aquila/bmtrain_mgpu.sh
@@ -80,7 +80,7 @@ OPTS=" --batch_size $BATCH_SIZE \
## Trigger job on Each Node when bmt or ddp.

mkdir -p $PRE_LOAD_DIR
-python -m torch.distributed.launch \
+torchrun \
--nproc_per_node $GPU_NUM_PER_NODE \
--nnodes $NODES_NUM \
--node_rank $RANK \
2 changes: 1 addition & 1 deletion examples/Aquila/hostfile
@@ -1 +1 @@
-192.168.21.2 slots=4
+192.168.20.3 slots=2
2 changes: 1 addition & 1 deletion examples/swinv2/inference_swinv2.py
100644 → 100755
@@ -6,7 +6,7 @@
from tqdm import tqdm
from flagai.auto_model.auto_loader import AutoLoader

data_path = "./imagenet2012/"
data_path = "/data2/yzd/FlagAI/examples/swinv2/imagenet2012/"

# swinv2 model_name support:
# 1. swinv2-base-patch4-window16-256,
3 changes: 1 addition & 2 deletions flagai/model/predictor/aquila.py
@@ -29,8 +29,7 @@ def aquila_generate(

total_len = min(2048, max_gen_len + max_prompt_size)

-# tokens = torch.full((bsz, total_len), 0).cuda().long()
-tokens = torch.full((bsz, total_len), 0).to("cuda:5").long()
+tokens = torch.full((bsz, total_len), 0).cuda().long()
for k, t in enumerate(prompt_tokens):
tokens[k, : len(t)] = t.clone().detach().long()
input_text_mask = tokens != 0
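
For context, the line restored to a plain .cuda() call is the generator's prompt-packing step: a zero-filled (batch, total_len) buffer that each prompt is copied into left-aligned, with the non-zero positions forming the input-text mask. A small CPU-only sketch of the same idea (toy values, not the predictor's code):

import torch

# Two toy prompts of different lengths, packed into one left-aligned buffer.
prompt_tokens = [torch.tensor([11, 12, 13]), torch.tensor([21, 22])]
bsz, total_len = len(prompt_tokens), 8

tokens = torch.full((bsz, total_len), 0).long()
for k, t in enumerate(prompt_tokens):
    tokens[k, : len(t)] = t.clone().detach().long()

input_text_mask = tokens != 0  # True where real prompt tokens sit
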
7 changes: 4 additions & 3 deletions flagai/model/vision/swinv2.py
@@ -159,7 +159,7 @@ def forward(self, x, mask=None):

# cosine attention
attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
-logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01))).exp()
+logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01)).to(self.logit_scale.device)).exp()
attn = attn * logit_scale

relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
@@ -518,7 +518,8 @@ def __init__(self, img_size=224,
patch_norm=True,
pretrained_window_sizes=[0, 0, 0, 0],
checkpoint_activations=False,
-num_classes=1000):
+num_classes=1000,
+**kwargs):
self.num_classes = num_classes
self.img_size = img_size
self.patch_size = patch_size
@@ -566,7 +567,7 @@ class SwinTransformerV2(BaseModel):

def __init__(self, config, num_classes=1000, **kwargs):
super().__init__(config, **kwargs)
-swin_config = SwinTransformerConfig(**config)
+swin_config = SwinTransformerConfig(**config.json_config)

embed_dim = swin_config.embed_dim
img_size = swin_config.img_size
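
The logit_scale change fixes a device mismatch: torch.log(torch.tensor(1. / 0.01)) builds the clamp ceiling on CPU, and clamping a CUDA parameter against a CPU tensor fails under recent torch, so the ceiling is moved onto the parameter's device first. A hedged CPU-only illustration of the pattern (not the module's code):

import torch

# A cosine-attention style temperature parameter, as in SwinV2.
logit_scale = torch.nn.Parameter(torch.log(10 * torch.ones(1)))

# Build the ceiling on the same device as the parameter before clamping.
ceiling = torch.log(torch.tensor(1. / 0.01)).to(logit_scale.device)
scale = torch.clamp(logit_scale, max=ceiling).exp()
print(scale)  # bounded above by 1 / 0.01 = 100
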
9 changes: 5 additions & 4 deletions setup.py
@@ -5,7 +5,7 @@

setup(
name="flagai",
version="v1.7.3",
version="v1.7.4",
description="FlagAI aims to help researchers and developers to freely train and test large-scale models for NLP/CV/VL tasks.",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
@@ -21,8 +21,8 @@
'sentencepiece>=0.1.96',
'boto3==1.17.32',
'pandas>=1.3.5',
-'jieba==0.42.1',
-'scikit-learn==1.0.2',
+'jieba>=0.42.1',
+'scikit-learn>=1.0.2',
'tensorboard>=2.9.0',
'transformers>=4.20.1',
'datasets>=2.0.0',
@@ -32,13 +32,14 @@
'Pillow>=9.3.0',
'einops>=0.3.0',
'diffusers==0.7.2',
-'pytorch-lightning==1.6.5',
+'pytorch-lightning>=1.6.5',
'taming-transformers-rom1504==0.0.6',
'rouge-score',
'sacrebleu>=2.3.1',
'jsonlines',
'accelerate',
'PyYAML==5.4.1',
+'safetensors',
'timm',
]
)
