Add Chinese SimpleQA config #1697

Merged · 13 commits · Dec 11, 2024
73 changes: 73 additions & 0 deletions configs/eval_chinese_simpleqa.py
@@ -0,0 +1,73 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import csimpleqa_datasets

from opencompass.models.openai_api import OpenAI
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.summarizers import DefaultSubjectiveSummarizer

# -------------Inference Stage ----------------------------------------
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='Qwen2.5-1.5B-Instruct',
        path='Qwen/Qwen2.5-1.5B-Instruct',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        max_out_len=200,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

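# Gather every *_datasets list imported via read_base() into a single `datasets` list.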
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)

# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration

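# Map prompt roles onto the API roles expected by the OpenAI-based judge model.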
api_meta_template = dict(
    round=[
        dict(role='SYSTEM', api_role='SYSTEM'),
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
judge_models = [
    dict(
        # GPT4o
        abbr='gpt-4o-0513-global',
        type=OpenAI,
        # gpt-4o
        path='gpt-4o-0513-global',
        key='xxx',  # provide OPENAI_API_KEY
        meta_template=api_meta_template,
        query_per_second=16,
        max_out_len=1000,
        batch_size=8,
        retry=3)
]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)

work_dir = 'outputs/chinese_simpleqa/'
108 changes: 108 additions & 0 deletions opencompass/configs/datasets/chinese_simpleqa/README.md
@@ -0,0 +1,108 @@



# Overview
<p align="center">
🌐 <a href="https://openstellarteam.github.io/ChineseSimpleQA/" target="_blank">Website</a> • 🤗 <a href="https://huggingface.co/datasets/OpenStellarTeam/Chinese-SimpleQA" target="_blank">Hugging Face</a> • ⏬ <a href="#data" target="_blank">Data</a> • 📃 <a href="https://huggingface.co/datasets/OpenStellarTeam/Chinese-SimpleQA" target="_blank">Paper</a> • 📊 <a href="http://47.109.32.164/" target="_blank">Leaderboard</a> <br> <a href="https://github.com/OpenStellarTeam/ChineseSimpleQA/blob/master/README_zh.md"> 中文</a> | <a href="https://github.com/OpenStellarTeam/ChineseSimpleQA/blob/master/README.md">English
</p>

**Chinese SimpleQA** is the first comprehensive Chinese benchmark for evaluating the factuality of language models when answering short questions. It has five main properties (Chinese, diverse, high-quality, static, and easy-to-evaluate), and it covers **6 major topics** with **99 diverse subtopics**.

Please visit our [website](https://openstellarteam.github.io/ChineseSimpleQA/) or check our [paper](https://arxiv.org/abs/2411.07140) for more details.



## 💫 Introduction

* How to mitigate generative hallucination remains an unsolved problem in artificial intelligence (AI). To measure the factual correctness of language models, OpenAI recently released and open-sourced a test set called SimpleQA. We have also been focusing on factuality, a field that currently suffers from outdated data, inaccurate evaluation, and incomplete coverage; for example, the knowledge evaluation sets still in wide use, such as CommonSenseQA, CMMLU, and C-Eval, are multiple-choice benchmarks. **To further promote research by the Chinese community on the factual correctness of models, we propose Chinese SimpleQA**, which consists of 3000 high-quality questions spanning 6 major topics, from the humanities to science and engineering. The distinct features of the Chinese SimpleQA dataset are as follows:
* 🀄**Chinese:** Chinese SimpleQA focuses on the Chinese language and provides a comprehensive evaluation of the factuality of existing LLMs in Chinese.
* 🍀**Diverse:** Chinese SimpleQA covers 6 topics (i.e., “Chinese Culture”, “Humanities”, “Engineering, Technology, and Applied Sciences”, “Life, Art, and Culture”, “Society”, and “Natural Science”), and these topics include 99 fine-grained subtopics in total, which demonstrates the diversity of Chinese SimpleQA.
* ⚡**High-quality:** We conduct a comprehensive and rigorous quality-control process to ensure the quality and accuracy of Chinese SimpleQA.
* 💡**Static:** Following SimpleQA, to preserve the evergreen property of Chinese SimpleQA, all reference answers will not change over time.
* 🗂️**Easy-to-evaluate:** Following SimpleQA, as the questions and answers are very short, the grading procedure is fast to run via existing LLMs (e.g., the OpenAI API); a minimal grading sketch is shown after this introduction.

- Based on Chinese SimpleQA, we have conducted a comprehensive evaluation of the factual capabilities of existing LLMs, and we maintain a comprehensive leaderboard.
- In short, we hope that Chinese SimpleQA helps developers gain a deeper understanding of the factual correctness of their models in Chinese, provides an important cornerstone for their algorithm research, and jointly promotes the growth of Chinese foundation models.
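
Because each item pairs a short question with a single short reference answer, grading reduces to asking a judge model for a one-letter verdict. The sketch below illustrates the idea with the OpenAI Python client; the `gpt-4o` model name, the prompt wording, and the A/B/C verdict labels are illustrative assumptions rather than the exact grader shipped with Chinese SimpleQA.

```python
# Minimal sketch of LLM-based grading for one short QA pair.
# Assumptions: the `openai` Python package (>= 1.0) is installed and
# OPENAI_API_KEY is set; prompt wording and verdict labels are illustrative only.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

GRADER_PROMPT = (
    "Question: {question}\n"
    "Reference answer: {gold}\n"
    "Model answer: {pred}\n"
    "Reply with exactly one letter: A (correct), B (incorrect), or C (not attempted)."
)

def grade(question: str, gold: str, pred: str) -> str:
    """Return the judge's one-letter verdict for a single prediction."""
    resp = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": GRADER_PROMPT.format(
            question=question, gold=gold, pred=pred)}],
        max_tokens=1,
    )
    return resp.choices[0].message.content.strip()

print(grade("中国的首都是哪座城市?", "北京", "北京"))  # expected verdict: "A"
```

In practice, the grading prompts that ship with the dataset (the `system_prompt` and `prompt_template` fields used by the OpenCompass config in this PR) should be used instead of the ad-hoc prompt above.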





## 📊 Leaderboard

See: [📊 Leaderboard](http://47.109.32.164/)



## ⚖️ Evals

We provide three evaluation methods.

(1) The first method is based on the simple-evals evaluation framework. The startup command is as follows:

```bash
python -m simple-evals.demo
```
This will launch evaluations through the OpenAI API.
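Note that the demo grades answers through the OpenAI API, so an OpenAI key presumably needs to be available in the environment (e.g., `OPENAI_API_KEY`, the same variable set explicitly in the second method below).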



(2) The second is a simple standalone evaluation script that we wrote from scratch. Run it as follows:

- Step 1: set your OpenAI key in scripts/chinese_simpleqa_easy.py:

```python
os.environ["OPENAI_API_KEY"] = "replace your key here"
```

- Step 2: run the evaluation script:

```bash
python scripts/chinese_simpleqa_easy.py
```

- Step 3: we also provide a unified script that aggregates the results of multiple models. After running it, you get a complete leaderboard:

```bash
python scripts/get_leaderboard.py
```



(3) We have also integrated the Chinese SimpleQA benchmark into our fork of [OpenCompass](https://github.com/open-compass/opencompass). You can refer to the OpenCompass configuration script below for evaluation.
- Step 1: clone OpenCompass:
```shell
cd ~
git clone [email protected]:open-compass/opencompass.git
cd opencompass
```
- Step 2: download the Chinese SimpleQA data from [Hugging Face](https://huggingface.co/datasets/OpenStellarTeam/Chinese-SimpleQA) and put it under `OPENCOMPASS_PATH/data/chinese_simpleqa`, so that the directory looks like this:
```
~/opencompass/data/
└── chinese_simpleqa
    └── chinese_simpleqa.jsonl
```


- Step 3: configure your run in configs/eval_chinese_simpleqa.py: set the models to be evaluated, set your judge model (we recommend GPT-4o), and launch it:
```bash
python run.py configs/eval_chinese_simpleqa.py
```
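
Depending on your OpenCompass version, `run.py` may also accept options such as `-w` to override the output directory and `--debug` for sequential, verbose execution; run `python run.py --help` to confirm which flags your checkout supports.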


## Citation

Please cite our paper if you use our dataset.

```bibtex
@misc{he2024chinesesimpleqachinesefactuality,
title={Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language Models},
author={Yancheng He and Shilong Li and Jiaheng Liu and Yingshui Tan and Weixun Wang and Hui Huang and Xingyuan Bu and Hangyu Guo and Chengwei Hu and Boren Zheng and Zhuoran Lin and Xuepeng Liu and Dekai Sun and Shirong Lin and Zhicheng Zheng and Xiaoyong Zhu and Wenbo Su and Bo Zheng},
year={2024},
eprint={2411.07140},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2411.07140},
}
```

@@ -0,0 +1,58 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CsimpleqaDataset, csimpleqa_postprocess

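# Columns exposed to the templates below; {system_prompt} and {prompt_template} carry the judge's grading prompt.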
subjective_reader_cfg = dict(
    input_columns=['primary_category', 'question', 'gold_ans', 'messages', 'system_prompt', 'prompt_template'],
    output_column='judge')

subjective_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{question}'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=200),
)

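# Judge stage: LMEvaluator builds the grading prompt from the fields above and
# csimpleqa_postprocess converts the judge's verdicts into the final metrics.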
subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        prompt='{system_prompt}')
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt='{prompt_template}'
                    ),
                ]
            ),
        ),
        dict_postprocessor=dict(type=csimpleqa_postprocess),
    ),
    pred_role='BOT',
)

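# Dataset entry consumed by configs/eval_chinese_simpleqa.py via `csimpleqa_datasets`.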
csimpleqa_datasets = [
    dict(
        abbr='chinese_simpleqa',
        type=CsimpleqaDataset,
        name='chinese_simpleqa',
        path='opencompass/chinese_simpleqa',
        reader_cfg=subjective_reader_cfg,
        infer_cfg=subjective_infer_cfg,
        eval_cfg=subjective_eval_cfg,
        mode='singlescore',
    )
]
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -20,6 +20,7 @@
from .charm import * # noqa: F401, F403
from .chembench import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .chinese_simpleqa import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .circular import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403