[Feature] Add P-MMEval (#1714)
* Update with PMMEval

* Update

* Update __init__.py

* Fix Bugs

* Delete .pre-commit-config.yaml

* Pull merge

---------

Co-authored-by: liushz <[email protected]>
wanyu2018umac and liushz authored Nov 27, 2024
1 parent f7dbe6b commit 90efcf2
Showing 38 changed files with 2,200 additions and 1 deletion.
32 changes: 32 additions & 0 deletions configs/eval_PMMEval.py
@@ -0,0 +1,32 @@
from mmengine.config import read_base

from opencompass.models import HuggingFacewithChatTemplate


with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models

    # from opencompass.configs.datasets.PMMEval.flores_gen import PMMEval_flores_datasets
    # from opencompass.configs.datasets.PMMEval.humanevalxl_gen import PMMEval_HumanEvalXL_datasets
    # from opencompass.configs.datasets.PMMEval.mgsm_gen import PMMEval_MGSM_datasets
    # from opencompass.configs.datasets.PMMEval.mhellaswag_gen import PMMEval_MHellaswag_datasets
    # from opencompass.configs.datasets.PMMEval.mifeval_gen import PMMEval_MIFEval_datasets
    # from opencompass.configs.datasets.PMMEval.mlogiqa_gen import PMMEval_MLogiQA_datasets
    # from opencompass.configs.datasets.PMMEval.mmmlu_gen import PMMEval_MMMLU_datasets
    # from opencompass.configs.datasets.PMMEval.xnli_gen import PMMEval_XNLI_datasets

    from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets

    from opencompass.configs.summarizers.PMMEval import summarizer


# datasets = PMMEval_flores_datasets
# datasets = PMMEval_HumanEvalXL_datasets
# datasets = PMMEval_MGSM_datasets
# datasets = PMMEval_MHellaswag_datasets
# datasets = PMMEval_MIFEval_datasets
# datasets = PMMEval_MLogiQA_datasets
# datasets = PMMEval_MMMLU_datasets
# datasets = PMMEval_XNLI_datasets

datasets = PMMEval_datasets
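
As a quick sanity check, the assembled config can be inspected with mmengine before launching a run. This is a minimal sketch, assuming an OpenCompass checkout with these files in place; run.py is OpenCompass's standard entry point.

from mmengine.config import Config

# Resolve the config, including the read_base() imports above.
cfg = Config.fromfile('configs/eval_PMMEval.py')

# One entry per task/language pair, e.g. 'flores-Chinese', 'mgsm-en', ...
print(len(cfg.datasets))
print([d['abbr'] for d in cfg.datasets][:5])

# A full run would then be launched with:
#     python run.py configs/eval_PMMEval.py
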
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/flores_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .flores_gen_2697d7 import PMMEval_flores_datasets
65 changes: 65 additions & 0 deletions opencompass/configs/datasets/PMMEval/flores_gen_2697d7.py
@@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalFloresDataset, PMMEvalFloresEvaluator, pmmeval_flores_postprocess

NATURAL_LANGUAGE_FULLNAMES_FLORES = ['Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']

# Each entry asks, in the target language, for a translation of the English
# source sentence {src} into that language.
PROMPT = {
    "Chinese": "将这个句子从英语翻译成中文。\n\n{src}",
    "Arabic": "ترجم هذه الجملة من الإنجليزية إلى العربية.\n\n{src}",
    "Spanish": "Traduce esta oración del inglés al español.\n\n{src}",
    "Japanese": "この文を英語から日本語に翻訳してください。\n\n{src}",
    "Korean": "이 문장을 영어에서 한국어로 번역하세요.\n\n{src}",
    "Thai": "แปลประโยคนี้จากภาษาอังกฤษเป็นภาษาไทย.\n\n{src}",
    "French": "Traduisez cette phrase de l'anglais en français.\n\n{src}",
    "Portuguese": "Traduza esta frase do inglês para o português.\n\n{src}",
    "Vietnamese": "Dịch câu này từ tiếng Anh sang tiếng Việt.\n\n{src}"
}

PMMEval_flores_datasets = list()

# Add flores_200

PMMEval_flores_reader_cfg = dict(
    input_columns=['src'],
    output_column='tgt',
    test_split='test'
)


for lang_fullname in NATURAL_LANGUAGE_FULLNAMES_FLORES:
    PMMEval_flores_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=PROMPT[lang_fullname]
                    )
                ]
            )
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    PMMEval_flores_eval_cfg = dict(
        evaluator=dict(type=PMMEvalFloresEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_flores_postprocess, lang_fullname=lang_fullname)
    )

    PMMEval_flores_datasets.append(
        dict(
            abbr=f'flores-{lang_fullname}',
            type=PMMEvalFloresDataset,
            path='P-MMEval',
            lang_fullname=lang_fullname,
            reader_cfg=PMMEval_flores_reader_cfg,
            infer_cfg=PMMEval_flores_infer_cfg,
            eval_cfg=PMMEval_flores_eval_cfg)
    )
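
The loop above fans the single reader config out into nine translation tasks, one per target language. A short sketch, using only names defined in this file, shows the naming pattern:

# Task abbreviations produced by the loop above.
abbrs = [f'flores-{name}' for name in NATURAL_LANGUAGE_FULLNAMES_FLORES]
assert len(abbrs) == 9
assert abbrs[0] == 'flores-Chinese'
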
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/humanevalxl_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humanevalxl_gen_bdec92 import PMMEval_HumanEvalXL_datasets
49 changes: 49 additions & 0 deletions opencompass/configs/datasets/PMMEval/humanevalxl_gen_bdec92.py
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalHumanEvalXLDataset, PMMEvalHumanEvalXLEvaluator

NATURAL_LANGUAGE_FULLNAMES = ['English', 'Chinese', 'Arabic', 'Spanish', 'French', 'Japanese', 'Korean', 'Portuguese', 'Thai', 'Vietnamese']

PMMEval_HumanEvalXL_datasets = list()

PMMEval_HumanEvalXL_reader_cfg = dict(
    input_columns=['task_id', 'prompt', 'entry_point', 'test', 'language', 'description', 'natural_language'],
    output_column='declaration',
    test_split='test'
)

PMMEval_HumanEvalXL_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


for lang_fullname in NATURAL_LANGUAGE_FULLNAMES:
    for program_lang in ['python', 'java', 'javascript']:

        PMMEval_HumanEvalXL_eval_cfg = dict(
            evaluator=dict(
                type=PMMEvalHumanEvalXLEvaluator,
                language=program_lang,
                text_language=lang_fullname,
                ip_address='localhost',
                port=5001),
            pred_role='BOT')

        PMMEval_HumanEvalXL_datasets.append(
            dict(
                abbr=f'humanevalxl-{program_lang}-{lang_fullname}',
                type=PMMEvalHumanEvalXLDataset,
                path='P-MMEval',
                lang=lang_fullname,
                program_lang=program_lang,
                reader_cfg=PMMEval_HumanEvalXL_reader_cfg,
                infer_cfg=PMMEval_HumanEvalXL_infer_cfg,
                eval_cfg=PMMEval_HumanEvalXL_eval_cfg)
        )
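
Note that PMMEvalHumanEvalXLEvaluator is wired to an execution service via ip_address and port, so scoring assumes a code-evaluation backend is listening on localhost:5001. The nested loops build the full cross product of natural and programming languages; a sketch using only names defined above:

# 10 natural languages x 3 programming languages = 30 tasks.
abbrs = [f'humanevalxl-{pl}-{nl}'
         for nl in NATURAL_LANGUAGE_FULLNAMES
         for pl in ['python', 'java', 'javascript']]
assert len(abbrs) == 30
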
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mgsm_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mgsm_gen_679720 import PMMEval_MGSM_datasets
62 changes: 62 additions & 0 deletions opencompass/configs/datasets/PMMEval/mgsm_gen_679720.py
@@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMGSMDataset, PMMEvalMGSMEvaluator

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

LANG_TO_INSTRUCTIONS = {
    "en": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"The answer is \". Do not add anything other than the integer answer after \"The answer is \".\n\n{question}",
    "es": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La respuesta es \". Do not add anything other than the integer answer after \"La respuesta es \".\n\n{question}",
    "fr": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"La réponse est \". Do not add anything other than the integer answer after \"La réponse est \".\n\n{question}",
    "zh": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答案是 \". Do not add anything other than the integer answer after \"答案是 \".\n\n{question}",
    "ja": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"答えは \". Do not add anything other than the integer answer after \"答えは \".\n\n{question}",
    "th": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"คำตอบคือ \". Do not add anything other than the integer answer after \"คำตอบคือ \".\n\n{question}",
    "ko": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"답변은 \". Do not add anything other than the integer answer after \"답변은 \".\n\n{question}",
    "pt": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"A resposta é \". Do not add anything other than the integer answer after \"A resposta é \".\n\n{question}",
    "vi": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"Câu trả lời là \". Do not add anything other than the integer answer after \"Câu trả lời là \".\n\n{question}",
    "ar": "Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of \"الجواب هو \". Do not add anything other than the integer answer after \"الجواب هو \".\n\n{question}"
}

PMMEval_MGSM_datasets = list()

# Add MGSM

PMMEval_MGSM_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    test_split='test'
)

PMMEval_MGSM_eval_cfg = dict(
    evaluator=dict(type=PMMEvalMGSMEvaluator),
    pred_role='BOT')


for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MGSM_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(
                        role='HUMAN',
                        prompt=LANG_TO_INSTRUCTIONS[lang_code]
                    )
                ]
            )
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    PMMEval_MGSM_datasets.append(
        dict(
            abbr=f'mgsm-{lang_code}',
            type=PMMEvalMGSMDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MGSM_reader_cfg,
            infer_cfg=PMMEval_MGSM_infer_cfg,
            eval_cfg=PMMEval_MGSM_eval_cfg)
    )
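
Every MGSM instruction pins the final line to a language-specific answer prefix, which reduces answer extraction to a suffix parse of the last matching line. The real logic lives in PMMEvalMGSMEvaluator, which this diff does not show; the sketch below is illustrative only, and LANG_TO_ANSWER_PREFIX is a hypothetical mapping read off the prompts above.

# Hypothetical prefix table, inferred from LANG_TO_INSTRUCTIONS (truncated).
LANG_TO_ANSWER_PREFIX = {'en': 'The answer is', 'fr': 'La réponse est', 'zh': '答案是'}

def extract_answer(text: str, lang_code: str) -> str:
    # Scan from the last line upwards for the pinned prefix.
    prefix = LANG_TO_ANSWER_PREFIX[lang_code]
    for line in reversed(text.splitlines()):
        if prefix in line:
            return line.split(prefix, 1)[-1].strip().rstrip('.')
    return ''
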
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mhellaswag_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mhellaswag_gen_1a6b73 import PMMEval_MHellaswag_datasets
54 changes: 54 additions & 0 deletions opencompass/configs/datasets/PMMEval/mhellaswag_gen_1a6b73.py
@@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMHellaswagDataset, PMMEvalMHellaswagEvaluator, pmmeval_mhellaswag_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MHELLASWAG_TEMPLATE = "Input: {ctx}\nOptions: \nA. {option_1}\nB. {option_2}\nC. {option_3}\nD. {option_4}\nPick the correct ending for the sentence from A, B, C, and D, and return it in the following JSON format:\n{\"answer\": \"[choice]\"}\nwhere [choice] must be one of A, B, C or D."

PMMEval_MHellaswag_datasets = list()

PMMEval_MHellaswag_reader_cfg = dict(
    input_columns=['ctx', 'option_1', 'option_2', 'option_3', 'option_4'],
    output_column='label',
    test_split='test'
)

PMMEval_MHellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MHELLASWAG_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MHellaswag_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMHellaswagEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mhellaswag_postprocess, lang_code=lang_code)
    )

    PMMEval_MHellaswag_datasets.append(
        dict(
            abbr=f'mhellaswag-{lang_code}',
            type=PMMEvalMHellaswagDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MHellaswag_reader_cfg,
            infer_cfg=PMMEval_MHellaswag_infer_cfg,
            eval_cfg=PMMEval_MHellaswag_eval_cfg)
    )
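
The template asks the model to reply with a small JSON object, so scoring hinges on recovering that object from free-form output. The shipped pmmeval_mhellaswag_postprocess is not part of this diff; the following is only an illustrative sketch of such a parse.

import json
import re

def parse_choice(pred: str) -> str:
    # Pull the first {...} object out of the reply and read its 'answer' field.
    match = re.search(r'\{.*?\}', pred, re.S)
    if match is None:
        return ''
    try:
        return str(json.loads(match.group(0)).get('answer', '')).strip()
    except (json.JSONDecodeError, AttributeError):
        return ''
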
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mifeval_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mifeval_gen_79f8fb import PMMEval_MIFEval_datasets
51 changes: 51 additions & 0 deletions opencompass/configs/datasets/PMMEval/mifeval_gen_79f8fb.py
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMIFEvalDataset, PMMEvalMIFEvalEvaluator, pmmeval_mifeval_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MIFEVAL_TEMPLATE = "{prompt}"

PMMEval_MIFEval_datasets = list()

PMMEval_MIFEval_reader_cfg = dict(
    input_columns=['prompt', 'instruction_id_list', 'kwargs'],
    output_column=None,
    test_split='test'
)


PMMEval_MIFEval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MIFEVAL_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MIFEval_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMIFEvalEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mifeval_postprocess, lang_code=lang_code)
    )

    PMMEval_MIFEval_datasets.append(
        dict(
            abbr=f'mifeval-{lang_code}',
            type=PMMEvalMIFEvalDataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MIFEval_reader_cfg,
            infer_cfg=PMMEval_MIFEval_infer_cfg,
            eval_cfg=PMMEval_MIFEval_eval_cfg)
    )
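
MIFEval rows carry no gold answer (output_column=None); instead each prompt ships an instruction_id_list plus kwargs, and responses are verified programmatically. The verifiers live in PMMEvalMIFEvalEvaluator, not in this diff; the sketch below only shows the general shape of one such check, with a hypothetical instruction id.

# Illustrative only: one programmatic check, keyed by a hypothetical
# instruction id such as 'length_constraints:max_words'.
def check_max_words(response: str, max_words: int) -> bool:
    return len(response.split()) <= max_words
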
4 changes: 4 additions & 0 deletions opencompass/configs/datasets/PMMEval/mlogiqa_gen.py
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mlogiqa_gen_36c4f9 import PMMEval_MLogiQA_datasets
50 changes: 50 additions & 0 deletions opencompass/configs/datasets/PMMEval/mlogiqa_gen_36c4f9.py
@@ -0,0 +1,50 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.PMMEval import PMMEvalMLogiQADataset, PMMEvalMLogiQAEvaluator, pmmeval_mlogiqa_postprocess

NATURAL_LANGUAGE_CODES = ['en', 'zh', 'ar', 'es', 'fr', 'ja', 'ko', 'pt', 'th', 'vi']

PMMEVAL_MLOGIQA_TEMPLATE = "Passage: {context}\nQuestion: {question}\nChoices:\nA.{option_1}\nB.{option_2}\nC.{option_3}\nD.{option_4}\nPlease choose the most suitable one among A, B, C and D as the answer to this question, and return it in the following JSON format:\n{'answer': '[choice]'}\nwhere [choice] must be one of A, B, C and D."

PMMEval_MLogiQA_datasets = []


PMMEval_MLogiQA_reader_cfg = dict(
    input_columns=['context', 'question', 'option_1', 'option_2', 'option_3', 'option_4'],
    output_column='answer',
    test_split='test')

PMMEval_MLogiQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=PMMEVAL_MLOGIQA_TEMPLATE
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


for lang_code in NATURAL_LANGUAGE_CODES:
    PMMEval_MLogiQA_eval_cfg = dict(
        evaluator=dict(type=PMMEvalMLogiQAEvaluator),
        pred_role='BOT',
        pred_postprocessor=dict(type=pmmeval_mlogiqa_postprocess, lang_code=lang_code))

    PMMEval_MLogiQA_datasets.append(
        dict(
            abbr=f'mlogiqa-{lang_code}',
            type=PMMEvalMLogiQADataset,
            path='P-MMEval',
            lang=lang_code,
            reader_cfg=PMMEval_MLogiQA_reader_cfg,
            infer_cfg=PMMEval_MLogiQA_infer_cfg,
            eval_cfg=PMMEval_MLogiQA_eval_cfg)
    )
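
One caveat on the template above: it says "JSON format" but shows single-quoted keys ({'answer': '[choice]'}), which json.loads rejects, so a model may echo either quoting style. The shipped pmmeval_mlogiqa_postprocess is not shown in this diff; a tolerant parse might look like the sketch below.

import ast
import re

def parse_mlogiqa_choice(pred: str) -> str:
    # ast.literal_eval accepts both {'answer': 'A'} and {"answer": "A"}.
    match = re.search(r'\{.*?\}', pred, re.S)
    if match is None:
        return ''
    try:
        return str(ast.literal_eval(match.group(0)).get('answer', '')).strip()
    except (ValueError, SyntaxError, AttributeError):
        return ''
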