
Merge pull request #4691 from codemayq/feature-suppot-eval-dataset
add eval dataset support
hiyouga authored Jul 14, 2024
2 parents a4ae3ab + cba673f commit 15b399a
Showing 18 changed files with 247 additions and 139 deletions.
3 changes: 2 additions & 1 deletion data/README.md
@@ -11,8 +11,9 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
"formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
"ranking": "whether the dataset is a preference dataset or not. (default: False)",
"subset": "the name of the subset. (optional, default: None)",
"split": "the name of dataset split to be used. (optional, default: train)",
"folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
"num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
"num_samples": "the number of samples in the dataset to be used. (optional, default: None)",
"columns (optional)": {
"prompt": "the column name in the dataset containing the prompts. (default: instruction)",
"query": "the column name in the dataset containing the queries. (default: input)",
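Taken together, `split` and `num_samples` let one Hugging Face repository back several dataset registrations. A minimal sketch of an entry combining the two fields (the dataset name and repository URL here are hypothetical, not part of this commit):

```json
{
  "my_dataset_val": {
    "hf_hub_url": "someuser/some-dataset",
    "split": "validation",
    "num_samples": 1000,
    "columns": {
      "prompt": "instruction",
      "response": "output"
    }
  }
}
```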
1 change: 1 addition & 0 deletions data/README_zh.md
@@ -13,6 +13,7 @@
"subset": "数据集子集的名称(可选,默认:None)",
"folder": "Hugging Face 仓库的文件夹名称(可选,默认:None)",
"num_samples": "该数据集中用于训练的样本数量。(可选,默认:None)",
"split": "数据集中的要使用的训练测试集切分(可选,默认:train)",
"columns(可选)": {
"prompt": "数据集代表提示词的表头名称(默认:instruction)",
"query": "数据集代表请求的表头名称(默认:input)",
12 changes: 11 additions & 1 deletion data/dataset_info.json
@@ -172,9 +172,19 @@
"deepctrl": {
"ms_hub_url": "deepctrl/deepctrl-sft-data"
},
"adgen": {
"adgen_train": {
"hf_hub_url": "HasturOfficial/adgen",
"ms_hub_url": "AI-ModelScope/adgen",
"split": "train",
"columns": {
"prompt": "content",
"response": "summary"
}
},
"adgen_val": {
"hf_hub_url": "HasturOfficial/adgen",
"ms_hub_url": "AI-ModelScope/adgen",
"split": "validation",
"columns": {
"prompt": "content",
"response": "summary"
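The `split` value selects which Hugging Face split is loaded, so `adgen_train` and `adgen_val` point at the same repository but different slices; presumably the validation entry is what the evaluation-dataset support added by this PR consumes. A sketch of the equivalent direct calls with the `datasets` library (LLaMA-Factory resolves these through `dataset_info.json` rather than calling `load_dataset` like this):

```python
from datasets import load_dataset

# Same repository, different "split" values, mirroring adgen_train / adgen_val.
train_set = load_dataset("HasturOfficial/adgen", split="train")
val_set = load_dataset("HasturOfficial/adgen", split="validation")
print(len(train_set), len(val_set))
```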
4 changes: 2 additions & 2 deletions scripts/cal_lr.py
@@ -65,15 +65,15 @@ def calculate_lr(
     )
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
-    trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
+    dataset_module = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
     if stage == "pt":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif stage == "sft":
         data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
     else:
         raise NotImplementedError("Stage does not supported: {}.".format(stage))

-    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
+    dataloader = DataLoader(dataset_module["eval_dataset"], batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
     valid_tokens, total_tokens = 0, 0
     for batch in tqdm(dataloader):
         valid_tokens += torch.sum(batch["labels"] != IGNORE_INDEX).item()
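For reference, the `valid_tokens` accounting above counts only label positions that differ from `IGNORE_INDEX`. A self-contained sketch of that step, assuming the usual `-100` sentinel for masked labels:

```python
import torch

IGNORE_INDEX = -100  # assumed sentinel for prompt/padding positions

# Dummy label batch: IGNORE_INDEX marks tokens excluded from the loss.
labels = torch.tensor([[101, 5, IGNORE_INDEX, IGNORE_INDEX], [7, IGNORE_INDEX, 22, 3]])
valid_tokens = torch.sum(labels != IGNORE_INDEX).item()  # 5 supervised tokens
total_tokens = labels.numel()                            # 8 tokens in total
print(f"{valid_tokens}/{total_tokens} tokens contribute to the loss")
```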
4 changes: 2 additions & 2 deletions scripts/cal_ppl.py
@@ -87,7 +87,7 @@ def cal_ppl(
     )
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
-    trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
+    dataset_module = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)
     if stage == "pt":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
@@ -100,7 +100,7 @@
     else:
         raise NotImplementedError("Stage does not supported: {}.".format(stage))

-    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
+    dataloader = DataLoader(dataset_module["eval_dataset"], batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
     criterion = torch.nn.CrossEntropyLoss(reduction="none")
     total_ppl = 0
     perplexities = []
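The `reduction="none"` criterion keeps one loss value per token, which is what makes a per-sample perplexity possible. A minimal sketch of the idea with dummy tensors (the exact shifting and masking in `cal_ppl.py` is not shown in this excerpt):

```python
import torch

IGNORE_INDEX = -100
criterion = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=IGNORE_INDEX)

logits = torch.randn(2, 5, 100)            # (batch, seq_len, vocab_size)
labels = torch.randint(0, 100, (2, 5))
labels[0, :2] = IGNORE_INDEX               # mask the first sample's prompt tokens

token_loss = criterion(logits.transpose(1, 2), labels)  # (batch, seq_len)
mask = labels != IGNORE_INDEX
ppl = torch.exp((token_loss * mask).sum(dim=-1) / mask.sum(dim=-1))
print(ppl)  # one perplexity value per sample
```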
6 changes: 3 additions & 3 deletions scripts/length_cdf.py
@@ -47,10 +47,10 @@ def length_cdf(
         )
     )
     tokenizer_module = load_tokenizer(model_args)
-    trainset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
-    total_num = len(trainset)
+    dataset_module = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    total_num = len(dataset_module["eval_dataset"])
     length_dict = defaultdict(int)
-    for sample in tqdm(trainset["input_ids"]):
+    for sample in tqdm(dataset_module["eval_dataset"]["input_ids"]):
         length_dict[len(sample) // interval * interval] += 1

     length_tuples = list(length_dict.items())
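The bucketing line floors each sequence length to a multiple of `interval`; accumulating the sorted bucket counts then yields the cumulative distribution. A standalone sketch with dummy lengths:

```python
from collections import defaultdict

interval = 1000
lengths = [120, 950, 1400, 2100, 2300]  # dummy tokenized-sample lengths

length_dict = defaultdict(int)
for n in lengths:
    length_dict[n // interval * interval] += 1  # floor to the bucket start

count = 0
for bucket, num in sorted(length_dict.items()):
    count += num
    print(f"{count / len(lengths):.2%} of samples are shorter than {bucket + interval} tokens")
```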
51 changes: 25 additions & 26 deletions src/llamafactory/data/data_utils.py
@@ -13,16 +13,15 @@
 # limitations under the License.

 from enum import Enum, unique
-from typing import TYPE_CHECKING, Dict, List, Sequence, Set, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Set, TypedDict, Union

-from datasets import concatenate_datasets, interleave_datasets
+from datasets import DatasetDict, concatenate_datasets, interleave_datasets

 from ..extras.logging import get_logger


 if TYPE_CHECKING:
     from datasets import Dataset, IterableDataset
-    from transformers import Seq2SeqTrainingArguments

     from ..hparams import DataArguments
@@ -42,47 +41,47 @@ class Role(str, Enum):
     OBSERVATION = "observation"


+class DatasetModule(TypedDict):
+    train_dataset: Optional[Union["Dataset", "IterableDataset"]]
+    eval_dataset: Optional[Union["Dataset", "IterableDataset"]]
+
+
 def merge_dataset(
-    all_datasets: List[Union["Dataset", "IterableDataset"]],
-    data_args: "DataArguments",
-    training_args: "Seq2SeqTrainingArguments",
+    all_datasets: List[Union["Dataset", "IterableDataset"]], data_args: "DataArguments", seed: int
 ) -> Union["Dataset", "IterableDataset"]:
     if len(all_datasets) == 1:
         return all_datasets[0]
     elif data_args.mix_strategy == "concat":
         if data_args.streaming:
             logger.warning("The samples between different datasets will not be mixed in streaming mode.")

         return concatenate_datasets(all_datasets)
     elif data_args.mix_strategy.startswith("interleave"):
         if not data_args.streaming:
             logger.warning("We recommend using `mix_strategy=concat` in non-streaming mode.")

         return interleave_datasets(
             datasets=all_datasets,
             probabilities=data_args.interleave_probs,
-            seed=training_args.seed,
+            seed=seed,
             stopping_strategy="first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted",
         )
     else:
         raise ValueError("Unknown mixing strategy.")


 def split_dataset(
-    dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments"
-) -> Dict[str, "Dataset"]:
-    if training_args.do_train:
-        if data_args.val_size > 1e-6:  # Split the dataset
-            if data_args.streaming:
-                dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
-                val_set = dataset.take(int(data_args.val_size))
-                train_set = dataset.skip(int(data_args.val_size))
-                return {"train_dataset": train_set, "eval_dataset": val_set}
-            else:
-                val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
-                dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed)
-                return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
-        else:
-            if data_args.streaming:
-                dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
-            return {"train_dataset": dataset}
-    else:  # do_eval or do_predict
-        return {"eval_dataset": dataset}
+    dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", seed: int
+) -> "DatasetDict":
+    r"""
+    Splits the dataset and returns a dataset dict containing train set (required) and validation set (optional).
+    """
+    if data_args.streaming:
+        dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=seed)
+        val_set = dataset.take(int(data_args.val_size))
+        train_set = dataset.skip(int(data_args.val_size))
+        return DatasetDict({"train": train_set, "validation": val_set})
+    else:
+        val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
+        dataset = dataset.train_test_split(test_size=val_size, seed=seed)
+        return DatasetDict({"train": dataset["train"], "validation": dataset["test"]})
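The `val_size` convention is unchanged by the rewrite: a value above 1 is treated as an absolute number of validation samples, a fraction as a proportion. A sketch of the non-streaming branch on a toy dataset:

```python
from datasets import Dataset, DatasetDict

dataset = Dataset.from_dict({"text": [f"sample {i}" for i in range(10)]})

val_size = 0.2  # a value such as 2 would mean "2 validation samples" instead
split = dataset.train_test_split(test_size=val_size, seed=42)
dataset_dict = DatasetDict({"train": split["train"], "validation": split["test"]})
print({name: len(ds) for name, ds in dataset_dict.items()})  # {'train': 8, 'validation': 2}
```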
(Diffs for the remaining changed files are not rendered in this capture.)
