diff --git a/configs/eval_chinese_simpleqa.py b/configs/eval_chinese_simpleqa.py
new file mode 100644
index 000000000..e5ecdfbb7
--- /dev/null
+++ b/configs/eval_chinese_simpleqa.py
@@ -0,0 +1,73 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import csimpleqa_datasets
+
+from opencompass.models.openai_api import OpenAI
+from opencompass.runners import LocalRunner
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.models import HuggingFacewithChatTemplate
+from opencompass.summarizers import DefaultSubjectiveSummarizer
+
+# -------------Inference Stage ----------------------------------------
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='Qwen2.5-1.5B-Instruct',
+        path='Qwen/Qwen2.5-1.5B-Instruct',
+        model_kwargs=dict(
+            device_map='auto',
+            trust_remote_code=True,
+        ),
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        generation_kwargs=dict(
+            do_sample=True,
+        ),
+        max_out_len=200,
+        max_seq_len=4096,
+        batch_size=8,
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
+
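+# Gather every *_datasets list pulled in via read_base() into one `datasets` list.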
+datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
+summarizer = dict(type=DefaultSubjectiveSummarizer)
+
+# -------------Evaluation Stage ----------------------------------------
+
+## ------------- JudgeLLM Configuration
+
+api_meta_template = dict(
+    round=[
+        dict(role='SYSTEM', api_role='SYSTEM'),
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+judge_models = [
+    dict(
+        # GPT-4o
+        abbr='gpt-4o-0513-global',
+        type=OpenAI,
+        path='gpt-4o-0513-global',
+        key='xxx',  # provide your OPENAI_API_KEY
+        meta_template=api_meta_template,
+        query_per_second=16,
+        max_out_len=1000,
+        batch_size=8,
+        retry=3)
+]
+
+## ------------- Evaluation Configuration
+eval = dict(
+    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models),
+    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
+)
+
+work_dir = 'outputs/chinese_simpleqa/'
diff --git a/opencompass/configs/datasets/chinese_simpleqa/README.md b/opencompass/configs/datasets/chinese_simpleqa/README.md
new file mode 100644
index 000000000..91106690d
--- /dev/null
+++ b/opencompass/configs/datasets/chinese_simpleqa/README.md
@@ -0,0 +1,108 @@
+# Overview
+
+
+**Chinese SimpleQA** is the first comprehensive Chinese benchmark for evaluating the factuality of language models when answering short questions. It has five main properties (i.e., Chinese, diverse, high-quality, static, and easy-to-evaluate), and it covers **6 major topics** with **99 diverse subtopics**.
+
+Please visit our [website](https://openstellarteam.github.io/ChineseSimpleQA/) or check our [paper](https://arxiv.org/abs/2411.07140) for more details.
+
+
+
+## 💫 Introduction
+
+* How to mitigate the generative hallucination of models remains an open problem in artificial intelligence (AI). To measure the factual correctness of language models, OpenAI recently released and open-sourced a test set called SimpleQA. We have also been following the field of factuality, which currently suffers from outdated data, inaccurate evaluation, and incomplete coverage; for example, the knowledge evaluation sets still in wide use, such as CommonSenseQA, CMMLU, and C-Eval, are multiple-choice benchmarks. **To further promote research by the Chinese community on the factual correctness of models, we propose Chinese SimpleQA**, which consists of 3000 high-quality questions spanning 6 major topics, ranging from the humanities to science and engineering. Specifically, the distinct features of our proposed Chinese SimpleQA dataset are as follows:
+ * 🀄**Chinese:** Our Chinese SimpleQA focuses on the Chinese language, which provides a comprehensive evaluation of the factuality abilities of existing LLMs in Chinese.
+ * 🍀**Diverse:** Chinese SimpleQA covers 6 topics (i.e., “Chinese Culture”, “Humanities”, “Engineering, Technology, and Applied Sciences”, “Life, Art, and Culture”, “Society”, and “Natural Science”), and these topics include 99 fine-grained subtopics in total, demonstrating the diversity of Chinese SimpleQA.
+ * ⚡**High-quality:** We conduct a comprehensive and rigorous quality control process to ensure the quality and accuracy of our Chinese SimpleQA.
+ * 💡**Static:** Following SimpleQA, to preserve the evergreen property of Chinese SimpleQA, all reference answers do not change over time.
+ * 🗂️**Easy-to-evaluate:** Following SimpleQA, as the questions and answers are very short, the grading procedure is fast to run via existing LLMs (e.g., OpenAI API).
+
+- Based on Chinese SimpleQA, we have conducted a comprehensive evaluation of the factual capabilities of existing LLMs, and we maintain a comprehensive leaderboard.
+- In short, we hope that Chinese SimpleQA helps developers gain a deeper understanding of the factual correctness of their models in Chinese, provides an important cornerstone for their algorithm research, and jointly promotes the growth of Chinese foundation models.
+
+## 📊 Leaderboard
+
+See the [📊 leaderboard](http://47.109.32.164/).
+
+## ⚖️ Evals
+
+We provide three evaluation methods.
+
+(1) The first method is based on simple-evals. The startup command is as follows:
+
+ ```bash
+ python -m simple-evals.demo
+ ```
+ This will launch evaluations through the OpenAI API.
+
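+ Note: since this goes through the OpenAI API, make sure your key is available first. A minimal sketch, assuming the client reads the standard `OPENAI_API_KEY` environment variable:
+
+ ```bash
+ export OPENAI_API_KEY="sk-your-key"  # placeholder; replace with your own key
+ python -m simple-evals.demo
+ ```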
+
+
+(2) The second is a simple standalone evaluation script that we wrote from scratch. The startup commands are as follows:
+
+- Step 1: set your OpenAI key in scripts/chinese_simpleqa_easy.py:
+
+  ```python
+  os.environ["OPENAI_API_KEY"] = "replace your key here"
+  ```
+
+- Step 2: run the eval script:
+
+  ```bash
+  python scripts/chinese_simpleqa_easy.py
+  ```
+
+- Step 3: we also provide a unified processing script for results from multiple models. After running it, you get a complete leaderboard:
+
+  ```bash
+  python scripts/get_leaderboard.py
+  ```
+
+(3) We also integrated our Chinese SimpleQA benchmark into [OpenCompass](https://github.com/open-compass/opencompass). You can refer to the OpenCompass configuration script for evaluation.
+- Step 1: clone OpenCompass:
+  ```shell
+  cd ~
+  git clone git@github.com:open-compass/opencompass.git
+  cd opencompass
+  ```
+- Step 2: download the Chinese SimpleQA data from [Hugging Face](https://huggingface.co/datasets/OpenStellarTeam/Chinese-SimpleQA) and put it under `OPENCOMPASS_PATH/data/chinese_simpleqa`, so that the layout looks like this:
+  ```
+  ~/opencompass/data/
+  └── chinese_simpleqa
+      └── chinese_simpleqa.jsonl
+  ```
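+  For example, one way to fetch the data (a sketch, assuming `huggingface-cli` is installed; you may need to rearrange files to match the layout above):
+  ```shell
+  huggingface-cli download OpenStellarTeam/Chinese-SimpleQA --repo-type dataset --local-dir ~/opencompass/data/chinese_simpleqa
+  ```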
+
+
+- Step 3: configure your run in configs/eval_chinese_simpleqa.py: set the models to be evaluated, set your judge model (we recommend GPT-4o), and launch it!
+  ```bash
+  python run.py configs/eval_chinese_simpleqa.py
+  ```
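+  If you prefer not to hardcode the judge key in the config, a minimal sketch (assuming your key is exported as the `OPENAI_API_KEY` environment variable) is to read it in configs/eval_chinese_simpleqa.py after `judge_models` is defined:
+  ```python
+  import os
+
+  judge_models[0]['key'] = os.environ['OPENAI_API_KEY']  # overrides key='xxx'
+  ```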
+
+
+## Citation
+
+Please cite our paper if you use our dataset.
+
+```bibtex
+@misc{he2024chinesesimpleqachinesefactuality,
+ title={Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language Models},
+ author={Yancheng He and Shilong Li and Jiaheng Liu and Yingshui Tan and Weixun Wang and Hui Huang and Xingyuan Bu and Hangyu Guo and Chengwei Hu and Boren Zheng and Zhuoran Lin and Xuepeng Liu and Dekai Sun and Shirong Lin and Zhicheng Zheng and Xiaoyong Zhu and Wenbo Su and Bo Zheng},
+ year={2024},
+ eprint={2411.07140},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2411.07140},
+}
+```
+
diff --git a/opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py b/opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
new file mode 100644
index 000000000..97c855101
--- /dev/null
+++ b/opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
@@ -0,0 +1,58 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import CsimpleqaDataset, csimpleqa_postprocess
+
+subjective_reader_cfg = dict(
+    input_columns=[
+        'primary_category', 'question', 'gold_ans', 'messages',
+        'system_prompt', 'prompt_template'
+    ],
+    output_column='judge')
+
+subjective_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='{question}'
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=200),
+)
+
+subjective_eval_cfg = dict(
+    evaluator=dict(
+        type=LMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        prompt='{system_prompt}')
+                ],
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt='{prompt_template}'
+                    ),
+                ]
+            ),
+        ),
+        dict_postprocessor=dict(type=csimpleqa_postprocess),
+    ),
+    pred_role='BOT',
+)
+
+csimpleqa_datasets = [
+    dict(
+        abbr='chinese_simpleqa',
+        type=CsimpleqaDataset,
+        name='chinese_simpleqa',
+        path='opencompass/chinese_simpleqa',
+        reader_cfg=subjective_reader_cfg,
+        infer_cfg=subjective_infer_cfg,
+        eval_cfg=subjective_eval_cfg,
+        mode='singlescore',
+    )
+]
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 6e5d138f4..e535ddc29 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -20,6 +20,7 @@
from .charm import * # noqa: F401, F403
from .chembench import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
+from .chinese_simpleqa import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .circular import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403
diff --git a/opencompass/datasets/chinese_simpleqa.py b/opencompass/datasets/chinese_simpleqa.py
new file mode 100644
index 000000000..a39bb3627
--- /dev/null
+++ b/opencompass/datasets/chinese_simpleqa.py
@@ -0,0 +1,205 @@
+import json
+import os.path as osp
+import re
+
+from datasets import Dataset
+
+from opencompass.registry import (DICT_POSTPROCESSORS, LOAD_DATASET,
+                                  TEXT_POSTPROCESSORS)
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+csimpleqa_judge_prompt_new = """
+请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。您的任务是将结果评定为:【正确】、【错误】或【未尝试】。
+
+首先,我们将列出每个评定类别的示例,然后请您对新问题的预测答案进行评定。
+以下是【正确】的答复示例:
+```
+问题:贝拉克·奥巴马的孩子叫什么名字?
+标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+模型预测1:Malia Obama and Sasha Obama
+模型预测2:玛丽亚和萨沙
+模型预测3:大多数人会说是玛丽亚和萨莎,但我不确定,需要再确认
+模型预测4:巴拉克·奥巴马有两个女儿,她们分别是玛丽亚·安和娜塔莎·玛丽安,但通常称作玛丽亚·奥巴马和萨莎·奥巴马。玛丽亚出生于1998年7月4日,萨莎出生于2001年6月10日。
+```
+这些答复均为【正确】,因为:
+ - 完整地包含了标准答案中的重要信息。
+ - 不包含任何与标准答案矛盾的信息。
+ - 只关注语义内容,中英文,大小写、标点、语法和顺序不重要。
+ - 答复中出现模糊语句或猜测是可以接受的,前提是包含了标准答案且不含有不正确信息或矛盾。
+
+以下是【错误】的答复示例:
+```
+问题:巴拉克·奥巴马的孩子叫什么名字?
+标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+模型预测1:玛丽亚
+模型预测2:玛丽亚、萨莎和苏珊
+模型预测3:巴拉克·奥巴马没有孩子
+模型预测4:我认为是玛丽亚和萨莎。或者是玛丽亚和杰基。或者是乔伊和玛丽亚。
+模型预测5:虽然我不知道他们的确切名字,但能说出巴拉克·奥巴马有三个孩子。
+模型预测6:你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗?
+```
+这些答复均为【错误】,因为:
+ - 答复中包含与标准答案矛盾的事实陈述。即使在陈述中略带保留(例如:“可能是”,“虽然我不确定,但我认为”),也视为错误。
+
+以下是【未尝试】的答复示例:
+```
+问题:巴拉克·奥巴马的孩子叫什么名字?
+标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+模型预测1:我不知道。
+模型预测2:我需要更多关于您所指奥巴马的上下文。
+模型预测3:不查阅网络我无法回答这个问题,不过我知道巴拉克·奥巴马有两个孩子。
+模型预测4:巴拉克·奥巴马有两个孩子。我知道其中一个叫玛丽亚,但我不确定另一个的名字。
+```
+这些答复均为【未尝试】,因为:
+ - 没有包含标准答案中的重要信息。
+ - 回复中没有与标准答案矛盾的陈述。
+
+另外注意以下几点:
+- 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题“金山铁路黄浦江特大桥的全长是多少米?”,标准答案为“3518.17”:
+ - 预测答案“3518”、“3518.1”、“3518.17”均为【正确】。
+ - 预测答案“3520”和“3600”均为【错误】。
+ - 预测答案“大约3500米”和“超过3000米”被视为【未尝试】,因为它们既不确认也不与标准答案矛盾。
+- 如果标准答案包含比问题更多的信息,预测答案只需包含问题中提到的信息。
+ - 例如,考虑问题“菱镁矿的主要化学成分是什么?”标准答案为“碳酸镁(MgCO3)”。“碳酸镁”或“MgCO3”均视为【正确】答案。
+- 如果从问题中明显可以推断出预测答案省略的信息,那么算作正确。
+ - 例如,问题“巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?”标准答案为“意大利撒丁岛”,预测答案“撒丁岛”被视为【正确】。
+- 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。
+ - 例如,如果标准答案是“Robinson”,那么回答鲁滨逊或者鲁滨孙均正确。
+
+下面是一个新的问题示例。请只回复A、B、C之一,不要道歉或纠正自己的错误,只需要评估该回答。
+```
+问题: {question}
+正确答案: {target}
+预测答案: {predicted_answer}
+```
+
+将此新问题的预测答案评定为以下之一:
+A:【正确】
+B:【错误】
+C:【未尝试】
+
+只返回字母"A"、"B"或"C",无须添加其他文本。
+""".strip() # noqa E501
+
+
+@TEXT_POSTPROCESSORS.register_module('chinese_simpleqa_preprocess')
+def chinese_simpleqa_preprocess(text: str) -> str:
+    text = text.split('问题:')[0].strip()
+    return text
+
+
+@LOAD_DATASET.register_module()
+class CsimpleqaDataset(BaseDataset):
+
+    def load(self, path: str, name: str, *args, **kwargs):
+        path = get_data_path(path)
+        filename = osp.join(path, f'{name}.jsonl')
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+        for line in lines:
+            data = json.loads(line)
+            question = data['question']
+            cur_system_prompt = '你是一个智能助手。'
+            messages = [{
+                'role': 'system',
+                'content': cur_system_prompt
+            }, {
+                'role': 'user',
+                'content': question
+            }]
+            judge_system_prompt = '你是一个智能助手,请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。'
+            csimpleqa_judge_prompt_f = csimpleqa_judge_prompt_new.format(
+                question=question,
+                target=data['answer'],
+                predicted_answer='{prediction}')
+            raw_data.append({
+                'primary_category': data['primary_category'],
+                'question': question,
+                'gold_ans': data['answer'],
+                'messages': messages,
+                'system_prompt': judge_system_prompt,
+                'prompt_template': csimpleqa_judge_prompt_f,
+                'judge': {
+                    'primary_category': data['primary_category'],
+                    'question': question,
+                    'question_id': data['id']
+                }
+            })
+        dataset = Dataset.from_list(raw_data)
+        return dataset
+
+
+def post_process_csimpleqa(completion):
+    s = completion['prediction']
+    score = 'C'
+    try:
+        match = re.search(r'(A|B|C)', s)
+        score = match.group(0) if match else 'C'
+    except Exception:
+        score = 'C'
+    return score
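+
+
+# For example (illustrative): post_process_csimpleqa({'prediction': '评定结果:B'})
+# returns 'B'; judge outputs containing no A/B/C letter fall back to 'C'
+# (not attempted).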
+
+
+def get_judgeanswer_and_reference(result, filename, post_process):
+    judged_answers = []
+    for v in result.values():
+        processed_judge = post_process(v)
+        if processed_judge is not None:
+            judged_answers.append(processed_judge)
+    if len(judged_answers) <= 0.95 * len(result):
+        print('*' * 100)
+        print(f'For your {filename} judge, among {len(result)} judgements, '
+              f'successfully extracted {len(judged_answers)} judgements, '
+              'please check!')
+        print('*' * 100)
+    return judged_answers
+
+
+def calculate_metrics(judged_answers):
+    # judged_answers is a list like ["A", "B", "C", ...]
+
+    total_questions = len(judged_answers)
+    total_correct = judged_answers.count('A')
+    total_incorrect = judged_answers.count('B')
+    total_not_attempted = judged_answers.count('C')
+
+    total_correct_accuracy = total_correct / total_questions \
+        if total_questions > 0 else 0
+    total_incorrect_accuracy = total_incorrect / total_questions \
+        if total_questions > 0 else 0
+    total_not_attempted_accuracy = total_not_attempted / total_questions \
+        if total_questions > 0 else 0
+
+    total_given_attempted_accuracy = total_correct / (
+        total_correct + total_incorrect) if (total_correct +
+                                             total_incorrect) > 0 else 0
+
+    f1 = 2 * total_given_attempted_accuracy * total_correct_accuracy / (
+        total_given_attempted_accuracy + total_correct_accuracy) if (
+            total_given_attempted_accuracy + total_correct_accuracy) > 0 else 0
+
+    return {
+        'correct': total_correct_accuracy,
+        'incorrect': total_incorrect_accuracy,
+        'not_attempted': total_not_attempted_accuracy,
+        'given_attempted_accuracy': total_given_attempted_accuracy,
+        'F1': f1
+    }
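+
+
+# Worked example (illustrative): for judged_answers = ['A', 'A', 'B', 'C'],
+# correct = 0.5, given_attempted_accuracy = 2 / 3 ≈ 0.667, and
+# F1 = 2 * (2/3) * 0.5 / ((2/3) + 0.5) ≈ 0.571.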
+
+
+def get_results(judged_answers):
+    results = calculate_metrics(judged_answers)
+    return results
+
+
+@DICT_POSTPROCESSORS.register_module('csimpleqa')
+def csimpleqa_postprocess(output: dict, output_path: str) -> dict:
+    judged_answers = get_judgeanswer_and_reference(output, output_path,
+                                                   post_process_csimpleqa)
+    results = get_results(judged_answers)
+    results['details'] = output
+    return results
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index caba0462f..6d994fb33 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -358,6 +358,11 @@
"hf_id": "",
"local": "./data/simpleqa/simple_qa_test_set.csv",
},
+ "opencompass/chinese_simpleqa": {
+ "ms_id": "",
+ "hf_id": "",
+ "local": "./data/chinese_simpleqa",
+ },
"opencompass/LiveMathBench202412": {
"ms_id": "",
"hf_id": "",
@@ -392,6 +397,10 @@
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip",
"md5": "1d83fc2e15798d39cb265c9a3cb5195a",
},
+ "/chinese_simpleqa": {
+ "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/chinese_simpleqa.zip",
+ "md5": "4bdf854b291fc0ee29da57dc47ac47b5",
+ },
"/gpqa/": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip",
"md5": "2e9657959030a765916f1f2aca29140d",