diff --git a/configs/datasets/livecodebench/livecodebench_gen.py b/configs/datasets/livecodebench/livecodebench_gen.py
index a82ef82e1..f663df068 100644
--- a/configs/datasets/livecodebench/livecodebench_gen.py
+++ b/configs/datasets/livecodebench/livecodebench_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .livecodebench_gen_b2b0fd import LCB_datasets  # noqa: F401, F403
+    from .livecodebench_gen_6966bc import LCB_datasets  # noqa: F401, F403
diff --git a/configs/datasets/livecodebench/livecodebench_gen_6966bc.py b/configs/datasets/livecodebench/livecodebench_gen_6966bc.py
new file mode 100644
index 000000000..6f2da11ea
--- /dev/null
+++ b/configs/datasets/livecodebench/livecodebench_gen_6966bc.py
@@ -0,0 +1,164 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# TestOutput Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    LCBCodeExecution_dataset,
+    LCBTestOutput_dataset,
+]
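Note: the config above wires three LiveCodeBench tasks (code generation, code execution, test-output prediction) into a single LCB_datasets list. An entry config would pull that list in together with a model via read_base. The sketch below is illustrative only and not part of this diff; the file name eval_livecodebench.py and the relative import paths are assumptions based on the read_base pattern in livecodebench_gen.py above.

# configs/eval_livecodebench.py -- hypothetical entry config, not part of this diff
from mmengine.config import read_base

with read_base():
    # LCB_datasets as exported by livecodebench_gen.py above
    from .datasets.livecodebench.livecodebench_gen import LCB_datasets
    # `models` list from one of the TurboMind configs added later in this diff
    from .models.qwen2_5.lmdeploy_qwen2_5_14b import models

datasets = LCB_datasets

Such an entry config would typically be launched with something like: python run.py configs/eval_livecodebench.py. The two dataset files above are mirrored verbatim below under opencompass/configs/, the copy bundled with the installed package.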
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py
index a82ef82e1..f663df068 100644
--- a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .livecodebench_gen_b2b0fd import LCB_datasets  # noqa: F401, F403
+    from .livecodebench_gen_6966bc import LCB_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py b/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py
new file mode 100644
index 000000000..6f2da11ea
--- /dev/null
+++ b/opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py
@@ -0,0 +1,164 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    LCBCodeGenerationDataset,
+    LCBCodeExecutionDataset,
+    LCBTestOutputPredictionDataset,
+    LCBCodeGenerationEvaluator,
+    LCBCodeExecutionEvaluator,
+    LCBTestOutputEvaluator
+)
+from opencompass.datasets.livecodebench import TestOutputPromptConstants
+
+
+lcb_code_generation_reader_cfg = dict(
+    input_columns=[
+        'question_content',
+        'format_prompt',
+    ],
+    # output_column='evaluation_sample',
+    output_column='question_id',
+)
+
+SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
+    '### Answer: (use the provided format with backticks)\n\n'
+
+
+# Code Generation Tasks
+lcb_code_generation_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=prompt_template
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_generation_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeGenerationEvaluator,
+        num_process_evaluate=4,
+        timeout=6,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeGeneration_dataset = dict(
+    type=LCBCodeGenerationDataset,
+    abbr='lcb_code_generation',
+    path='opencompass/code_generation_lite',
+    reader_cfg=lcb_code_generation_reader_cfg,
+    infer_cfg=lcb_code_generation_infer_cfg,
+    eval_cfg=lcb_code_generation_eval_cfg
+)
+
+# Code Execution Dataset
+lcb_code_execution_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+lcb_code_execution_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
+                ),
+            ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_code_execution_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBCodeExecutionEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBCodeExecution_dataset = dict(
+    type=LCBCodeExecutionDataset,
+    abbr='lcb_code_execution',
+    path='opencompass/execution-v2',
+    reader_cfg=lcb_code_execution_reader_cfg,
+    infer_cfg=lcb_code_execution_infer_cfg,
+    eval_cfg=lcb_code_execution_eval_cfg,
+)
+
+# TestOutput Dataset
+lcb_test_output_reader_cfg = dict(
+    input_columns=[
+        'prompt',
+    ],
+    output_column='evaluation_sample',
+)
+
+system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'
+
+lcb_test_output_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            # begin=[
+            #     dict(
+            #         role='SYSTEM',
+            #         prompt=system_prompt
+            #     ),
+            # ],
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{prompt}'
+                )
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024)
+)
+
+lcb_test_output_eval_cfg = dict(
+    evaluator=dict(
+        type=LCBTestOutputEvaluator,
+    ),
+    pred_role='BOT',
+)
+
+LCBTestOutput_dataset = dict(
+    type=LCBTestOutputPredictionDataset,
+    abbr='lcb_test_output',
+    path='opencompass/test_generation',
+    reader_cfg=lcb_test_output_reader_cfg,
+    infer_cfg=lcb_test_output_infer_cfg,
+    eval_cfg=lcb_test_output_eval_cfg,
+)
+
+LCB_datasets = [
+    LCBCodeGeneration_dataset,
+    LCBCodeExecution_dataset,
+    LCBTestOutput_dataset,
+]
diff --git a/opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py b/opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py
new file mode 100644
index 000000000..e9e2d3940
--- /dev/null
+++ b/opencompass/configs/models/chatglm/lmdeploy_glm4_9b.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='glm-4-9b-turbomind',
+        path='THUDM/glm-4-9b',
+        engine_config=dict(max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
+        max_seq_len=8192,
+        max_out_len=2048,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py
new file mode 100644
index 000000000..a5f63e543
--- /dev/null
+++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_14b.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='qwen2.5-14b-turbomind',
+        path='Qwen/Qwen2.5-14B',
+        engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
+        max_seq_len=7168,
+        max_out_len=1024,
+        batch_size=16,
+        run_cfg=dict(num_gpus=2),
+    )
+]
diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py
new file mode 100644
index 000000000..bf0c0c15f
--- /dev/null
+++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_32b.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='qwen2.5-32b-turbomind',
+        path='Qwen/Qwen2.5-32B',
+        engine_config=dict(session_len=7168, max_batch_size=16, tp=2),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
+        max_seq_len=7168,
+        max_out_len=1024,
+        batch_size=16,
+        run_cfg=dict(num_gpus=2),
+    )
+]
diff --git a/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py
new file mode 100644
index 000000000..0bee0557e
--- /dev/null
+++ b/opencompass/configs/models/qwen2_5/lmdeploy_qwen2_5_72b.py
@@ -0,0 +1,17 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='qwen2.5-72b-turbomind',
+        path='Qwen/Qwen2.5-72B',
+        engine_config=dict(session_len=7168, max_batch_size=16, tp=4),
+        gen_config=dict(
+            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024
+        ),
+        max_seq_len=7168,
+        max_out_len=1024,
+        batch_size=16,
+        run_cfg=dict(num_gpus=4),
+    )
+]
diff --git a/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py b/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py
new file mode 100644
index 000000000..5780ec2e3
--- /dev/null
+++ b/opencompass/configs/models/yi/lmdeploy_yi_1_5_9b.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModel
+
+models = [
+    dict(
+        type=TurboMindModel,
+        abbr='yi-1.5-9b-turbomind',
+        path='01-ai/Yi-1.5-9B',
+        engine_config=dict(session_len=4096, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
+        max_seq_len=4096,
+        max_out_len=2048,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
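Note: all five TurboMind model configs above pin top_k=1 with a near-zero temperature, which amounts to deterministic (effectively greedy) decoding with a fixed generation budget. As a rough illustration of what the engine_config/gen_config dicts correspond to at the lmdeploy level, here is a sketch assuming lmdeploy's public pipeline API; it is not part of this diff, and TurboMindModel's actual plumbing lives in opencompass/models.

# Sketch: the qwen2.5-14b config expressed directly against lmdeploy
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

# engine_config -> TurbomindEngineConfig
pipe = pipeline(
    'Qwen/Qwen2.5-14B',
    backend_config=TurbomindEngineConfig(session_len=7168, max_batch_size=16, tp=2),
)
# gen_config -> GenerationConfig; top_k=1 already forces the argmax token,
# the 1e-6 temperature is a guard for samplers that apply temperature first
outputs = pipe(
    ['Write a Python function that reverses a string.'],  # illustrative prompt
    gen_config=GenerationConfig(top_k=1, temperature=1e-6, top_p=0.9,
                                max_new_tokens=1024),
)
print(outputs[0].text)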
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index ab7ab304f..4e5d22462 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -526,7 +526,7 @@ def __init__(
 
     def _generate(self, input: PromptList | str, max_out_len: int,
                   temperature: float) -> str:
-        from openai import BadRequestError
+        from openai import APIStatusError, BadRequestError
         assert isinstance(input, (str, PromptList))
 
         # max num token for gpt-3.5-turbo is 4097
@@ -616,7 +616,7 @@ def _generate(self, input: PromptList | str, max_out_len: int,
                                 from the API provider.')
                 return responses.choices[0].message.content
 
-            except BadRequestError as e:
+            except (BadRequestError, APIStatusError) as e:
                 # Handle BadRequest status
                 # You can specify self.status_code_mappings to bypass \
                 # API sensitivity blocks
diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py
index cf5b880b6..79e6e5562 100644
--- a/opencompass/models/turbomind_with_tf_above_v4_33.py
+++ b/opencompass/models/turbomind_with_tf_above_v4_33.py
@@ -87,6 +87,7 @@ def _get_potential_stop_words(self, path: Optional[str]):
     def generate(self,
                  inputs: List[str],
                  max_out_len: int,
+                 min_out_len: Optional[int] = None,
                  stopping_criteria: List[str] = [],
                  do_sample: Optional[bool] = None,
                  temperature: float = 1.0,
@@ -123,6 +124,10 @@ def generate(self,
         gen_config = copy.deepcopy(DEFAULT_GEN_CONFIG)
         gen_config.update(self.gen_config)
+        if max_out_len is not None:
+            gen_config['max_new_tokens'] = max_out_len
+        if min_out_len is not None:
+            gen_config['min_new_tokens'] = min_out_len
         if do_sample or ('do_sample' in self.gen_config
                          and self.gen_config['do_sample']):
             gen_config['top_k'] = 40
             gen_config['temperature'] = temperature
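Two behavioural notes on the model-layer changes. In openai_api.py, catching APIStatusError alongside BadRequestError lets status_code_mappings absorb any non-2xx provider response (for example, regional sensitivity blocks), not just 400s. A condensed sketch of the pattern follows, with a hypothetical mapping; the real handling lives in the _generate method shown in the diff above.

from openai import APIStatusError, BadRequestError, OpenAI

client = OpenAI()
status_code_mappings = {400: '', 451: 'blocked by provider'}  # hypothetical example
try:
    rsp = client.chat.completions.create(
        model='gpt-4o-mini',  # any chat model; illustrative only
        messages=[{'role': 'user', 'content': 'hello'}],
    )
    print(rsp.choices[0].message.content)
except (BadRequestError, APIStatusError) as e:
    # BadRequestError is itself an APIStatusError subclass; catching the tuple
    # mirrors the diff and keeps the original 400 path explicit
    if e.status_code in status_code_mappings:
        print(status_code_mappings[e.status_code])
    else:
        raise

In turbomind_with_tf_above_v4_33.py, generate() now writes the per-call max_out_len into gen_config['max_new_tokens'], so it takes precedence over the static gen_config, and the new optional min_out_len sets gen_config['min_new_tokens'], giving callers a per-request floor as well as a ceiling on generation length.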