[Feature] Add P-MMEval (#1714)
* Update with PMMEval

* Update

* Update __init__.py

* Fix Bugs

* Delete .pre-commit-config.yaml

* Pull merge

---------

Co-authored-by: liushz <[email protected]>
wanyu2018umac and liushz authored Nov 27, 2024
1 parent f7dbe6b commit 90efcf2
Showing 38 changed files with 2,200 additions and 1 deletion.
32 changes: 32 additions & 0 deletions configs/eval_PMMEval.py
@@ -0,0 +1,32 @@
from mmengine.config import read_base

from opencompass.models import HuggingFacewithChatTemplate


with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models

    # from opencompass.configs.datasets.PMMEval.flores_gen import PMMEval_flores_datasets
    # from opencompass.configs.datasets.PMMEval.humanevalxl_gen import PMMEval_HumanEvalXL_datasets
    # from opencompass.configs.datasets.PMMEval.mgsm_gen import PMMEval_MGSM_datasets
    # from opencompass.configs.datasets.PMMEval.mhellaswag_gen import PMMEval_MHellaswag_datasets
    # from opencompass.configs.datasets.PMMEval.mifeval_gen import PMMEval_MIFEval_datasets
    # from opencompass.configs.datasets.PMMEval.mlogiqa_gen import PMMEval_MLogiQA_datasets
    # from opencompass.configs.datasets.PMMEval.mmmlu_gen import PMMEval_MMMLU_datasets
    # from opencompass.configs.datasets.PMMEval.xnli_gen import PMMEval_XNLI_datasets

    from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets

    from opencompass.configs.summarizers.PMMEval import summarizer


# datasets = PMMEval_flores_datasets
# datasets = PMMEval_HumanEvalXL_datasets
# datasets = PMMEval_MGSM_datasets
# datasets = PMMEval_MHellaswag_datasets
# datasets = PMMEval_MIFEval_datasets
# datasets = PMMEval_MLogiQA_datasets
# datasets = PMMEval_MMMLU_datasets
# datasets = PMMEval_XNLI_datasets

datasets = PMMEval_datasets
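
As a quick sanity check, the assembled config can be inspected with mmengine before launching a run. This is a minimal sketch, assuming an OpenCompass checkout with these files in place; run.py is OpenCompass's standard entry point.

from mmengine.config import Config

# Resolve the config, including the read_base() imports above.
cfg = Config.fromfile('configs/eval_PMMEval.py')

# One entry per task/language pair, e.g. 'flores-Chinese', 'mgsm-en', ...
print(len(cfg.datasets))
print([d['abbr'] for d in cfg.datasets][:5])

# A full run would then be launched with:
#     python run.py configs/eval_PMMEval.py
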
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/flores_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .flores_gen_2697d7 import PMMEval_flores_datasets
65 changes: 65 additions & 0 deletions opencompass/configs/datasets/PMMEval/flores_gen_2697d7.py
@@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalFloresDataset, PMMEvalFloresEvaluator, pmmeval_flores_postprocess

NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']

# Each entry asks, in the target language, for a translation of the English
# source sentence {src} into that language.
PROMPT = {
    "Chinese": "将这个句子从英语翻译成中文。\n\n{src}",
    "Arabic": "ترجم هذه الجملة من الإنجليزية إلى العربية.\n\n{src}",
    "Spanish": "Traduce esta oración del inglés al español.\n\n{src}",
    "Japanese": "この文を英語から日本語に翻訳してください。\n\n{src}",
    "Korean": "이 문장을 영어에서 한국어로 번역하세요.\n\n{src}",
    "Thai": "แปลประโยคนี้จากภาษาอังกฤษเป็นภาษาไทย.\n\n{src}",
    "French": "Traduisez cette phrase de l'anglais en français.\n\n{src}",
    "Portuguese": "Traduza esta frase do inglês para o português.\n\n{src}",
    "Vietnamese": "Dịch câu này từ tiếng Anh sang tiếng Việt.\n\n{src}"
}

PMMEval_flores_datasets = list()

# Add flores_200

PMMEval_flores_reader_cfg = dict(
    input_columns=['src'],
    output_column='tgt',
    test_split='test'
)


for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES:
    PMMEval_flores_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=PROMPT[lang_fullname]
                    )
                ]
            )
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    PMMEval_flores_eval_cfg = dict(
        evaluator=dict(type=PMMEvalFloresEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_flores_postprocess, lang_fullname=lang_fullname)
    )

    PMMEval_flores_datasets.append(
        dict(
            abbr=f'flores-{lang_fullname}',
            type=PMMEvalFloresDataset,
            path='P-MMEval',
            lang_fullname=lang_fullname,
            reader_cfg=PMMEval_flores_reader_cfg,
            infer_cfg=PMMEval_flores_infer_cfg,
            eval_cfg=PMMEval_flores_eval_cfg)
    )
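
The loop above fans the single reader config out into nine translation tasks, one per target language. A short sketch, using only names defined in this file, shows the naming pattern:

# Task abbreviations produced by the loop above.
abbrs = [f'flores-{name}' for name in NATURAL_LANGUAGE_FULLNAMES_FLORES]
assert len(abbrs) == 9
assert abbrs[0] == 'flores-Chinese'
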
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/humanevalxl_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humanevalxl_gen_bdec92 import PMMEval_HumanEvalXL_datasets
49 changes: 49 additions & 0 deletions opencompass/configs/datasets/PMMEval/humanevalxl_gen_bdec92.py
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalHumanEvalXLDataset, PMMEvalHumanEvalXLEvaluator

NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']

PMMEval_HumanEvalXL_datasets = list()

PMMEval_HumanEvalXL_reader_cfg = dict(
    input_columns=['task_id', 'prompt', 'entry_point', 'test', 'language', 'description', 'natural_language'],
    output_column='declaration',
    test_split='test'
)

PMMEval_HumanEvalXL_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


for lang_fullname in NATURAL_LANGUAGE_FULLNAMES:
    for program_lang in ['python', 'java', 'javascript']:

        PMMEval_HumanEvalXL_eval_cfg = dict(
            evaluator=dict(
                type=PMMEvalHumanEvalXLEvaluator,
                language=program_lang,
                text_language=lang_fullname,
                ip_address='localhost',
                port=5001),
            pred_role='BOT')

        PMMEval_HumanEvalXL_datasets.append(
            dict(
                abbr=f'humanevalxl-{program_lang}-{lang_fullname}',
                type=PMMEvalHumanEvalXLDataset,
                path='P-MMEval',
                lang=lang_fullname,
                program_lang=program_lang,
                reader_cfg=PMMEval_HumanEvalXL_reader_cfg,
                infer_cfg=PMMEval_HumanEvalXL_infer_cfg,
                eval_cfg=PMMEval_HumanEvalXL_eval_cfg)
        )
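
Note that PMMEvalHumanEvalXLEvaluator is wired to an execution service via ip_address and port, so scoring assumes a code-evaluation backend is listening on localhost:5001. The nested loops build the full cross product of natural and programming languages; a sketch using only names defined above:

# 10 natural languages x 3 programming languages = 30 tasks.
abbrs = [f'humanevalxl-{pl}-{nl}'
         for nl in NATURAL_LANGUAGE_FULLNAMES
         for pl in ['python', 'java', 'javascript']]
assert len(abbrs) == 30
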
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mgsm_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mgsm_gen_679720 import PMMEval_MGSM_datasets
62 changes: 62 additions & 0 deletions opencompass/configs/datasets/PMMEval/mgsm_gen_679720.py
@@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMGSMDataset, PMMEvalMGSMEvaluator

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

LANG_TO_INSTRUCTIONS = {
    "en": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"The answer is \". Do not add anything other than the integer answer after \"The answer is \".\n\n{question}",
    "es": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La respuesta es \". Do not add anything other than the integer answer after \"La respuesta es \".\n\n{question}",
    "fr": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La réponse est \". Do not add anything other than the integer answer after \"La réponse est \".\n\n{question}",
    "zh": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答案是 \". Do not add anything other than the integer answer after \"答案是 \".\n\n{question}",
    "ja": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答えは \". Do not add anything other than the integer answer after \"答えは \".\n\n{question}",
    "th": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"คำตอบคือ \". Do not add anything other than the integer answer after \"คำตอบคือ \".\n\n{question}",
    "ko": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"답변은 \". Do not add anything other than the integer answer after \"답변은 \".\n\n{question}",
    "pt": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"A resposta é \". Do not add anything other than the integer answer after \"A resposta é \".\n\n{question}",
    "vi": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"Câu trả lời là \". Do not add anything other than the integer answer after \"Câu trả lời là \".\n\n{question}",
    "ar": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"الجواب هو \". Do not add anything other than the integer answer after \"الجواب هو \".\n\n{question}"
}

PMMEval_MGSM_datasets = list()

# Add MGSM

PMMEval_MGSM_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    test_split='test'
)

PMMEval_MGSM_eval_cfg = dict(
    evaluator=dict(type=PMMEvalMGSMEvaluator),
    pred_role='BOT')


for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MGSM_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=LANG_TO_INSTRUCTIONS[lang_code]
                    )
                ]
            )
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    PMMEval_MGSM_datasets.append(
        dict(
            abbr=f'mgsm-{lang_code}',
            type=PMMEvalMGSMDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MGSM_reader_cfg,
            infer_cfg=PMMEval_MGSM_infer_cfg,
            eval_cfg=PMMEval_MGSM_eval_cfg)
    )
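
Every MGSM instruction pins the final line to a language-specific answer prefix, which reduces answer extraction to a suffix parse of the last matching line. The real logic lives in PMMEvalMGSMEvaluator, which this diff does not show; the sketch below is illustrative only, and LANG_TO_ANSWER_PREFIX is a hypothetical mapping read off the prompts above.

# Hypothetical prefix table, inferred from LANG_TO_INSTRUCTIONS (truncated).
LANG_TO_ANSWER_PREFIX = {'en': 'The answer is', 'fr': 'La réponse est', 'zh': '答案是'}

def extract_answer(text: str, lang_code: str) -> str:
    # Scan from the last line upwards for the pinned prefix.
    prefix = LANG_TO_ANSWER_PREFIX[lang_code]
    for line in reversed(text.splitlines()):
        if prefix in line:
            return line.split(prefix, 1)[-1].strip().rstrip('.')
    return ''
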
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mhellaswag_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets
54 changes: 54 additions & 0 deletions opencompass/configs/datasets/PMMEval/mhellaswag_gen_1a6b73.py
@@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMHellaswagDataset, PMMEvalMHellaswagEvaluator, pmmeval_mhellaswag_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MHELLASWAG_TEMPLATE = "Input: {ctx}\nOptions: \nA. {option_1}\nB. {option_2}\nC. {option_3}\nD. {option_4}\nPick the correct ending for the sentence from A, B, C, and D, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C or D."

PMMEval_MHellaswag_datasets = list()

PMMEval_MHellaswag_reader_cfg = dict(
    input_columns=['ctx', 'option_1', 'option_2', 'option_3', 'option_4'],
    output_column='label',
    test_split='test'
)

PMMEval_MHellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MHELLASWAG_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MHellaswag_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMHellaswagEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mhellaswag_postprocess, lang_code=lang_code)
    )

    PMMEval_MHellaswag_datasets.append(
        dict(
            abbr=f'mhellaswag-{lang_code}',
            type=PMMEvalMHellaswagDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MHellaswag_reader_cfg,
            infer_cfg=PMMEval_MHellaswag_infer_cfg,
            eval_cfg=PMMEval_MHellaswag_eval_cfg)
    )
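
The template asks the model to reply with a small JSON object, so scoring hinges on recovering that object from free-form output. The shipped pmmeval_mhellaswag_postprocess is not part of this diff; the following is only an illustrative sketch of such a parse.

import json
import re

def parse_choice(pred: str) -> str:
    # Pull the first {...} object out of the reply and read its 'answer' field.
    match = re.search(r'\{.*?\}', pred, re.S)
    if match is None:
        return ''
    try:
        return str(json.loads(match.group(0)).get('answer', '')).strip()
    except (json.JSONDecodeError, AttributeError):
        return ''
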
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mifeval_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets
51 changes: 51 additions & 0 deletions opencompass/configs/datasets/PMMEval/mifeval_gen_79f8fb.py
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMIFEvalDataset, PMMEvalMIFEvalEvaluator, pmmeval_mifeval_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MIFEVAL_TEMPLATE = "{prompt}"

PMMEval_MIFEval_datasets = list()

PMMEval_MIFEval_reader_cfg = dict(
    input_columns=['prompt', 'instruction_id_list', 'kwargs'],
    output_column=None,
    test_split='test'
)


PMMEval_MIFEval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MIFEVAL_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MIFEval_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMIFEvalEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mifeval_postprocess, lang_code=lang_code)
    )

    PMMEval_MIFEval_datasets.append(
        dict(
            abbr=f'mifeval-{lang_code}',
            type=PMMEvalMIFEvalDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MIFEval_reader_cfg,
            infer_cfg=PMMEval_MIFEval_infer_cfg,
            eval_cfg=PMMEval_MIFEval_eval_cfg)
    )
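
MIFEval rows carry no gold answer (output_column=None); instead each prompt ships an instruction_id_list plus kwargs, and responses are verified programmatically. The verifiers live in PMMEvalMIFEvalEvaluator, not in this diff; the sketch below only shows the general shape of one such check, with a hypothetical instruction id.

# Illustrative only: one programmatic check, keyed by a hypothetical
# instruction id such as 'length_constraints:max_words'.
def check_max_words(response: str, max_words: int) -> bool:
    return len(response.split()) <= max_words
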
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mlogiqa_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets
50 changes: 50 additions & 0 deletions opencompass/configs/datasets/PMMEval/mlogiqa_gen_36c4f9.py
@@ -0,0 +1,50 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMLogiQADataset, PMMEvalMLogiQAEvaluator, pmmeval_mlogiqa_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MLOGIQA_TEMPLATE = "Passage: {context}\nQuestion: {question}\nChoices:\nA.{option_1}\nB.{option_2}\nC.{option_3}\nD.{option_4}\nPlease choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{'answer': '[choice]'}\nwhere [choice] must be one of A, B, C and D."

PMMEval_MLogiQA_datasets = []


PMMEval_MLogiQA_reader_cfg = dict(
    input_columns=['context', 'question', 'option_1', 'option_2', 'option_3', 'option_4'],
    output_column='answer',
    test_split='test')

PMMEval_MLogiQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MLOGIQA_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MLogiQA_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMLogiQAEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mlogiqa_postprocess, lang_code=lang_code))

    PMMEval_MLogiQA_datasets.append(
        dict(
            abbr=f'mlogiqa-{lang_code}',
            type=PMMEvalMLogiQADataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MLogiQA_reader_cfg,
            infer_cfg=PMMEval_MLogiQA_infer_cfg,
            eval_cfg=PMMEval_MLogiQA_eval_cfg)
    )
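
One caveat on the template above: it says "JSON format" but shows single-quoted keys ({'answer': '[choice]'}), which json.loads rejects, so a model may echo either quoting style. The shipped pmmeval_mlogiqa_postprocess is not shown in this diff; a tolerant parse might look like the sketch below.

import ast
import re

def parse_mlogiqa_choice(pred: str) -> str:
    # ast.literal_eval accepts both {'answer': 'A'} and {"answer": "A"}.
    match = re.search(r'\{.*?\}', pred, re.S)
    if match is None:
        return ''
    try:
        return str(ast.literal_eval(match.group(0)).get('answer', '')).strip()
    except (ValueError, SyntaxError, AttributeError):
        return ''
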