diff --git a/opencompass/configs/models/hf_llama/hf_llama3_2_3b_instruct.py b/opencompass/configs/models/hf_llama/hf_llama3_2_3b_instruct.py
new file mode 100644
index 000000000..2197b6ce1
--- /dev/null
+++ b/opencompass/configs/models/hf_llama/hf_llama3_2_3b_instruct.py
@@ -0,0 +1,13 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='llama-3_2-3b-instruct-hf',
+        path='meta-llama/Llama-3.2-3B-Instruct',
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=1),
+        stop_words=['<|end_of_text|>', '<|eot_id|>'],
+    )
+]
diff --git a/opencompass/configs/models/hf_llama/lmdeploy_llama3_2_3b_instruct.py b/opencompass/configs/models/hf_llama/lmdeploy_llama3_2_3b_instruct.py
new file mode 100644
index 000000000..611746dcf
--- /dev/null
+++ b/opencompass/configs/models/hf_llama/lmdeploy_llama3_2_3b_instruct.py
@@ -0,0 +1,16 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='llama-3_2-3b-instruct-turbomind',
+        path='meta-llama/Llama-3.2-3B-Instruct',
+        engine_config=dict(max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        max_seq_len=16384,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+        stop_words=['<|end_of_text|>', '<|eot_id|>'],
+    )
+]
diff --git a/opencompass/configs/models/mistral/hf_mistral_nemo_instruct_2407.py b/opencompass/configs/models/mistral/hf_mistral_nemo_instruct_2407.py
new file mode 100644
index 000000000..6c90769e3
--- /dev/null
+++ b/opencompass/configs/models/mistral/hf_mistral_nemo_instruct_2407.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='mistral-nemo-instruct-2407-hf',
+        path='mistralai/Mistral-Nemo-Instruct-2407',
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=1),
+    )
+]
diff --git a/opencompass/configs/models/mistral/hf_mistral_small_instruct_2409.py b/opencompass/configs/models/mistral/hf_mistral_small_instruct_2409.py
new file mode 100644
index 000000000..b9810c3e2
--- /dev/null
+++ b/opencompass/configs/models/mistral/hf_mistral_small_instruct_2409.py
@@ -0,0 +1,12 @@
+from opencompass.models import HuggingFacewithChatTemplate
+
+models = [
+    dict(
+        type=HuggingFacewithChatTemplate,
+        abbr='mistral-small-instruct-2409-hf',
+        path='mistralai/Mistral-Small-Instruct-2409',
+        max_out_len=1024,
+        batch_size=8,
+        run_cfg=dict(num_gpus=2),
+    )
+]
diff --git a/opencompass/configs/models/mistral/lmdeploy_mistral_nemo_instruct_2407.py b/opencompass/configs/models/mistral/lmdeploy_mistral_nemo_instruct_2407.py
new file mode 100644
index 000000000..5e3c27f47
--- /dev/null
+++ b/opencompass/configs/models/mistral/lmdeploy_mistral_nemo_instruct_2407.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='mistral-nemo-instruct-2407-turbomind',
+        path='mistralai/Mistral-Nemo-Instruct-2407',
+        engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        max_seq_len=32768,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    )
+]
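For orientation (not part of the patch): these config fragments are consumed through OpenCompass's `read_base()` mechanism. A minimal sketch of an experiment config that runs the new Llama 3.2 TurboMind model; the dataset import is an illustrative pick, any existing dataset config works:

```python
# eval_llama3_2.py -- hypothetical experiment config, for illustration only.
from mmengine.config import read_base

with read_base():
    # Model list added by this PR.
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
        models as lmdeploy_llama3_2_3b_instruct_model
    # Any dataset config shipped with OpenCompass works here; the demo
    # GSM8K config is just a small, fast choice.
    from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
        gsm8k_datasets

models = lmdeploy_llama3_2_3b_instruct_model
datasets = gsm8k_datasets
```

Launched as usual with `opencompass eval_llama3_2.py`.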
diff --git a/opencompass/configs/models/mistral/lmdeploy_mistral_small_instruct_2409.py b/opencompass/configs/models/mistral/lmdeploy_mistral_small_instruct_2409.py
new file mode 100644
index 000000000..1b5ac0106
--- /dev/null
+++ b/opencompass/configs/models/mistral/lmdeploy_mistral_small_instruct_2409.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='mistral-small-instruct-2409-turbomind',
+        path='mistralai/Mistral-Small-Instruct-2409',
+        engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        max_seq_len=32768,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=2),
+    )
+]
diff --git a/opencompass/datasets/compassbench_obj.py b/opencompass/datasets/compassbench_obj.py
index 000b18dd0..044b20d9a 100644
--- a/opencompass/datasets/compassbench_obj.py
+++ b/opencompass/datasets/compassbench_obj.py
@@ -26,7 +26,7 @@ def load(path: str, name: str):
         circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']

         data = []
-        with open(path, 'r') as infile:
+        with open(path, 'r', encoding='utf-8', errors='ignore') as infile:
             for id, line in enumerate(infile):
                 entry = json.loads(line)
                 if 'cloze' in name:
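A note on the `compassbench_obj.py` change (context, not part of the patch): a bare `open(path, 'r')` decodes with the locale's preferred encoding, so UTF-8 JSONL files raise `UnicodeDecodeError` on hosts with a non-UTF-8 locale. A minimal sketch of the fixed loading pattern; the file name is hypothetical:

```python
import json

# Hypothetical JSONL file with Chinese text, like the CompassBench data.
path = 'compassbench_sample.jsonl'

# Pinning encoding='utf-8' makes the load independent of the host locale;
# errors='ignore' silently drops any undecodable bytes instead of aborting.
with open(path, 'r', encoding='utf-8', errors='ignore') as infile:
    data = [json.loads(line) for line in infile]
```

`errors='replace'` would be the alternative if silent byte loss should stay visible in the loaded text.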
diff --git a/opencompass/models/bailing_api_oc.py b/opencompass/models/bailing_api_oc.py
index 8e107556c..0b721bff7 100644
--- a/opencompass/models/bailing_api_oc.py
+++ b/opencompass/models/bailing_api_oc.py
@@ -81,8 +81,8 @@ def __init__(
         self._headers = {'Authorization': f'Bearer {token}'}
         self._headers['Content-Type'] = 'application/json'
-        self._url = url if url else \
-            'https://bailingchat.alipay.com/chat/completions'
+        self._url = (url if url else
+                     'https://bailingchat.alipay.com/chat/completions')
         self._model = path
         self._sessions = []
         self._num = (int(os.environ.get('BAILING_API_PARALLEL_NUM'))
@@ -136,9 +136,9 @@ def generate(
                     results.append('')
                 else:
                     if (result.get('choices')
-                            and result['choices'][0].get('message')
-                            and result['choices'][0]['message'].get(
-                                'content')):
+                            and result['choices'][0].get('message') and
+                            result['choices'][0]['message'].get('content')
+                            is not None):
                         results.append(
                             result['choices'][0]['message']['content'])
                     else:
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index aff2579a6..ab7ab304f 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -466,25 +466,28 @@ def bin_trim(self, prompt: str, num_token: int) -> str:

 class OpenAISDK(OpenAI):

-    def __init__(self,
-                 path: str = 'gpt-3.5-turbo',
-                 max_seq_len: int = 4096,
-                 query_per_second: int = 1,
-                 rpm_verbose: bool = False,
-                 retry: int = 2,
-                 key: str | List[str] = 'ENV',
-                 org: str | List[str] | None = None,
-                 meta_template: Dict | None = None,
-                 openai_api_base: str = OPENAI_API_BASE,
-                 openai_proxy_url: Optional[str] = None,
-                 mode: str = 'none',
-                 logprobs: bool | None = False,
-                 top_logprobs: int | None = None,
-                 temperature: float | None = None,
-                 tokenizer_path: str | None = None,
-                 extra_body: Dict | None = None,
-                 max_completion_tokens: int = 16384,
-                 verbose: bool = False):
+    def __init__(
+        self,
+        path: str = 'gpt-3.5-turbo',
+        max_seq_len: int = 4096,
+        query_per_second: int = 1,
+        rpm_verbose: bool = False,
+        retry: int = 2,
+        key: str | List[str] = 'ENV',
+        org: str | List[str] | None = None,
+        meta_template: Dict | None = None,
+        openai_api_base: str = OPENAI_API_BASE,
+        openai_proxy_url: Optional[str] = None,
+        mode: str = 'none',
+        logprobs: bool | None = False,
+        top_logprobs: int | None = None,
+        temperature: float | None = None,
+        tokenizer_path: str | None = None,
+        extra_body: Dict | None = None,
+        max_completion_tokens: int = 16384,
+        verbose: bool = False,
+        status_code_mappings: dict = {},
+    ):
         super().__init__(path,
                          max_seq_len,
                          query_per_second,
@@ -519,9 +522,11 @@ def __init__(self,
                 http_client=httpx.Client(proxies=proxies))
             if self.verbose:
                 self.logger.info(f'Used openai_client: {self.openai_client}')
+        self.status_code_mappings = status_code_mappings

     def _generate(self, input: PromptList | str, max_out_len: int,
                   temperature: float) -> str:
+        from openai import BadRequestError
         assert isinstance(input, (str, PromptList))

         # max num token for gpt-3.5-turbo is 4097
@@ -605,7 +610,30 @@ def _generate(self, input: PromptList | str, max_out_len: int,
                         self.logger.info(responses)
                 except Exception as e:  # noqa F841
                     pass
+                if not responses.choices:
+                    self.logger.error(
+                        'Response is empty, it is an internal server error '
+                        'from the API provider.')
                 return responses.choices[0].message.content
+
+            except BadRequestError as e:
+                # Handle BadRequest status.
+                # You can specify self.status_code_mappings to bypass
+                # API sensitivity blocks.
+                # For example: status_code_mappings={400: 'Input data
+                # may contain inappropriate content.'}
+                status_code = e.status_code
+                if (status_code is not None
+                        and status_code in self.status_code_mappings):
+                    original_error_message = e.body.get('message')
+                    error_message = self.status_code_mappings[status_code]
+                    self.logger.info(
+                        f'Status Code: {status_code}, '
+                        f'Original Error Message: {original_error_message}, '
+                        f'Return Message: {error_message}')
+                    return error_message
+                else:
+                    self.logger.error(e)
             except Exception as e:
                 self.logger.error(e)
                 num_retries += 1
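Usage of the new `status_code_mappings` hook, grounded in the inline comment above: mapping an HTTP 400 (e.g. a provider-side content filter) to a canned string makes the request degrade gracefully instead of exhausting retries. A sketch; the abbr and model name are placeholders:

```python
from opencompass.models import OpenAISDK

models = [
    dict(
        type=OpenAISDK,
        abbr='gpt-4o-judge',  # placeholder abbr
        path='gpt-4o',        # placeholder model name
        key='ENV',            # read the API key from the environment
        max_completion_tokens=16384,
        # A 400 response now returns this string as the 'generation'
        # instead of retrying until the budget is spent.
        status_code_mappings={
            400: 'Input data may contain inappropriate content.'
        },
    )
]
```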
diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py
index 7ffdfdbe6..67c012433 100644
--- a/opencompass/summarizers/subjective/compassbench.py
+++ b/opencompass/summarizers/subjective/compassbench.py
@@ -29,13 +29,46 @@ def post_process_wildbench_pair(judgement: str):
     else:
         return None

-MAP = {'language':['总分','中文总分','英文总分','自然语言处理_cn','创作_cn','对话_cn','NLP_en','creation_en','chat_en'],
-       'instruct':['总分','中文总分','英文总分',],
-       'reasoning':['总分','中文总分','英文总分','Common Sense Reasoning_cn','Social Reasoning_cn','Humanities (History, Finance, etc.) Professional Reasoning_cn', 'Science and Engineering Professional Reasoning_cn',
-                    'Common Sense Reasoning_en','Social Reasoning_en','Humanities (History, Finance, etc.) Professional Reasoning_en', 'Science and Engineering Professional Reasoning_en',],
-       'coding':['总分','中文总分','英文总分',]}
-
-MAP = {'instruct':['总分','中文总分','英文总分',]}
+MAP = {
+    'instruct': [
+        '总分',
+        '中文总分',
+        '英文总分',
+        'instruct/compassbenchv1_4_IF_en_fofo_sub',
+        'instruct/compassbenchv1_4_IF_zh_fofo_sub',
+    ],
+    'language': [
+        '总分',
+        '中文总分',
+        '英文总分',
+        'language/compassbenchv1_4_language_zh_chat_sub',
+        'language/compassbenchv1_4_language_zh_creation_sub',
+        'language/compassbenchv1_4_language_zh_NLP_sub',
+        'language/compassbenchv1_4_language_en_chat_sub',
+        'language/compassbenchv1_4_language_en_creation_sub',
+        'language/compassbenchv1_4_language_en_NLP_sub',
+    ],
+    'reasoning': [
+        '总分',
+        '中文总分',
+        '英文总分',
+        'reasoning/compassbenchv1_4_reasoning_en_CommonSenseSense_sub',
+        'reasoning/compassbenchv1_4_reasoning_en_Humanities_sub',
+        'reasoning/compassbenchv1_4_reasoning_en_ScienceEngineering_sub',
+        'reasoning/compassbenchv1_4_reasoning_en_Social_sub',
+        'reasoning/compassbenchv1_4_reasoning_zh_CommonSenseSense_sub',
+        'reasoning/compassbenchv1_4_reasoning_zh_Humanities_sub',
+        'reasoning/compassbenchv1_4_reasoning_zh_ScienceEngineering_sub',
+        'reasoning/compassbenchv1_4_reasoning_zh_Social_sub',
+    ],
+    'coding': [
+        '总分',
+        '中文总分',
+        '英文总分',
+        'coding/compassbenchv1_4_coding_en_sub',
+        'coding/compassbenchv1_4_coding_zh_sub',
+    ],
+}


 class CompassBenchSummarizer:
@@ -52,15 +85,18 @@ def __init__(self, config: ConfigDict, check_pos_bias=False) -> None:
         self.base_models = self.cfg['datasets'][0]['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['models']
         self.judge_models = self.cfg.get('judge_models', None)
-        self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
+        self.meta_judge_model = self.cfg.eval.partitioner.get(
+            'meta_judge_model', None)
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_function = post_process_wildbench_pair
         self.check_pos_bias = check_pos_bias

     def get_score(self, time_str):
         output_dir, results_folder = get_outdir(self.cfg, time_str)
-        model_combinations = list(product(self.base_models, self.compare_models))
-        unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])

         if self.meta_judge_model is not None:
             self.judge_models.append(self.meta_judge_model)
@@ -71,33 +107,47 @@ def get_score(self, time_str):
             scores[judge_model] = {}
             for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
-                dataset_root, dataset_detail = dataset_abbr.split('/')[0], dataset_abbr.split('/')[1]
+                dataset_root, dataset_detail = (
+                    dataset_abbr.split('/')[0],
+                    dataset_abbr.split('/')[1],
+                )
                 scores[judge_model][dataset_abbr] = {}
                 for model_pair in unique_combinations:
                     base_model = model_pair[0]['abbr']
                     compare_model = model_pair[1]['abbr']
                     if idx == len(self.judge_models):
-                        subdir = base_model + '_' + compare_model + '_summarized-by--' + judge_model
+                        subdir = (base_model + '_' + compare_model +
+                                  '_summarized-by--' + judge_model)
                     else:
-                        subdir = base_model + '_' + compare_model + '_judged-by--' + judge_model
+                        subdir = (base_model + '_' + compare_model +
+                                  '_judged-by--' + judge_model)
                     subdir_path = os.path.join(results_folder, subdir)
                     if not os.path.isdir(subdir_path):
                         print(subdir_path + ' is not exist! please check!')
                         scores[judge_model][dataset_abbr][compare_model] = None
                         continue
-                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
+                    judged_answers, references = get_judgeanswer_and_reference(
+                        dataset, subdir_path, self.judge_function)
                     win_base_model = defaultdict(float)
                     win_compare_model = defaultdict(float)
-                    score_mapping = {'A++': 1, 'A+': 0.5, 'A=B': 0, 'B+': -0.5, 'B++': -1}
+                    score_mapping = {
+                        'A++': 1,
+                        'A+': 0.5,
+                        'A=B': 0,
+                        'B+': -0.5,
+                        'B++': -1,
+                    }
                     cnt = defaultdict(float)
-                    for judged_answer, reference in zip(judged_answers, references):
+                    for judged_answer, reference in zip(
+                            judged_answers, references):
                         if judged_answer not in score_mapping:
                             continue
                         else:
-                            flag = 1 if reference['answer1'] == base_model else -1
-                            score_1 = score_mapping[judged_answer]*flag
+                            flag = (1 if reference['answer1'] == base_model
+                                    else -1)
+                            score_1 = score_mapping[judged_answer] * flag
                             score_2 = -score_1

                             cnt[dataset_abbr] += 1
@@ -107,10 +157,13 @@ def get_score(self, time_str):
                     for key, value in cnt.items():
                         win_base_model[key] = win_base_model[key] / value * 100
                         win_base_model[key] = round(win_base_model[key], 2)
-                        win_compare_model[key] = win_compare_model[key] / value * 100
-                        win_compare_model[key ] = round(win_compare_model[key], 2)
+                        win_compare_model[key] = (win_compare_model[key] /
+                                                  value * 100)
+                        win_compare_model[key] = round(win_compare_model[key],
+                                                       2)

-                    scores[judge_model][dataset_abbr][compare_model] = win_compare_model
+                    scores[judge_model][dataset_abbr][
+                        compare_model] = win_compare_model

         return scores
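An aside on the arithmetic in `get_score` (not part of the patch): each verdict maps to a signed score from the perspective of the model shown as `answer1`, the sign flips when that model is not the base model, and the totals are averaged into percentages. A self-contained sketch with fabricated verdicts:

```python
# Verdict -> signed score from the perspective of the model shown as answer1.
score_mapping = {'A++': 1, 'A+': 0.5, 'A=B': 0, 'B+': -0.5, 'B++': -1}

base_model = 'base-model-abbr'  # placeholder abbr
# (verdict, model shown as answer1) pairs, fabricated for illustration.
judgements = [
    ('A++', base_model),            # answer1 strongly preferred
    ('A=B', base_model),            # tie
    ('B++', 'compare-model-abbr'),  # answer2 preferred, and answer2 is the base model
]

win_base, win_compare, cnt = 0.0, 0.0, 0
for verdict, answer1 in judgements:
    if verdict not in score_mapping:  # skip unparsable judgements
        continue
    flag = 1 if answer1 == base_model else -1
    score_base = score_mapping[verdict] * flag
    cnt += 1
    win_base += score_base
    win_compare -= score_base

print(round(win_base / cnt * 100, 2))     # 66.67: base model clearly ahead
print(round(win_compare / cnt * 100, 2))  # -66.67
```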
@@ -131,7 +184,10 @@ def summarize(
         for judge_abbr, judge_scores in scores.items():
             new_score = {}
             for dataset_name, model_scores in judge_scores.items():
-                dataset_root, dataset_detail = dataset_name.split('/')[0], dataset_name.split('/')[1]
+                dataset_root, dataset_detail = (
+                    dataset_name.split('/')[0],
+                    dataset_name.split('/')[1],
+                )
                 if dataset_root not in new_score:
                     new_score[dataset_root] = {}
                 if '_en_' in dataset_detail:
@@ -141,8 +197,10 @@ def summarize(
                     for model_name, cate_score in model_scores.items():
                         if model_name not in new_score[dataset_root]:
                             new_score[dataset_root][model_name] = {}
                         if len(cate_score) == 0:
                             new_score[dataset_root][model_name]['英文总分'] = None
                         else:
-                            new_score[dataset_root][model_name].update(cate_score)
-                            new_score[dataset_root][model_name]['英文总分'] = sum(cate_score.values()) / len(cate_score)
+                            new_score[dataset_root][model_name].update(
+                                cate_score)
+                            new_score[dataset_root][model_name]['英文总分'] = (
+                                sum(cate_score.values()) / len(cate_score))
@@ -150,17 +208,19 @@ def summarize(
                 elif '_cn_' in dataset_detail or '_zh_' in dataset_detail:
                     for model_name, cate_score in model_scores.items():
                         if model_name not in new_score[dataset_root]:
                             new_score[dataset_root][model_name] = {}
                         if len(cate_score) == 0:
                             new_score[dataset_root][model_name]['中文总分'] = None
                         else:
-                            new_score[dataset_root][model_name].update(cate_score)
-                            new_score[dataset_root][model_name]['中文总分'] = sum(cate_score.values()) / len(cate_score)
+                            new_score[dataset_root][model_name].update(
+                                cate_score)
+                            new_score[dataset_root][model_name]['中文总分'] = (
+                                sum(cate_score.values()) / len(cate_score))

         for dataset, models in new_score.items():
             for model, details in models.items():
-                if details['英文总分'] is not None and details['中文总分'] is not None:
+                if (details['英文总分'] is not None
+                        and details['中文总分'] is not None):
                     average_score = (details['英文总分'] + details['中文总分']) / 2
                 else:
                     average_score = None
                 details['总分'] = average_score
-
         df = pd.DataFrame()
         # Iterate over the MAP and new_score to populate the DataFrame
         for category, headers in MAP.items():
@@ -173,15 +233,17 @@ def summarize(
                 category_data.append(row_data)

             # Create a DataFrame for the category and concatenate with the main DataFrame
-            new_headers = [category+'_'+item for item in headers]
-            category_df = pd.DataFrame(category_data, columns=[category] + new_headers)
+            new_headers = [category + '_' + item for item in headers]
+            category_df = pd.DataFrame(category_data,
+                                       columns=[category] + new_headers)
             df = pd.concat([df, category_df.set_index(category)], axis=1)

         df_transposed = df.T
-
-        output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + '-report.csv')
-
+        output_filename = osp.join(
+            output_dir,
+            'summarized-by--' + judge_abbr + '-' + '-report.csv',
+        )
         transposed_csv_file_path = output_filename
         df_transposed.to_csv(transposed_csv_file_path)
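To make the report layout concrete (a toy sketch, not the patch itself; the per-model loop elided by the diff context is paraphrased and all scores are invented): each `MAP` category contributes a block of `category_header` columns indexed by model, and the final transpose turns those headers into CSV rows with one column per model.

```python
import pandas as pd

# Trimmed MAP and fabricated scores, only to show the CSV shape.
MAP = {'coding': ['总分', '中文总分', '英文总分']}
new_score = {
    'coding': {
        'model-a': {'总分': 50.0, '中文总分': 40.0, '英文总分': 60.0},
        'model-b': {'总分': 30.0, '中文总分': 20.0, '英文总分': 40.0},
    }
}

df = pd.DataFrame()
for category, headers in MAP.items():
    # One row per model: [model, score_for_header_1, score_for_header_2, ...]
    category_data = [[model] + [details.get(h) for h in headers]
                     for model, details in new_score[category].items()]
    new_headers = [category + '_' + h for h in headers]
    category_df = pd.DataFrame(category_data, columns=[category] + new_headers)
    df = pd.concat([df, category_df.set_index(category)], axis=1)

# Rows: coding_总分, coding_中文总分, ...; columns: model-a, model-b.
print(df.T)
```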
diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py
index aeb4a0e55..e86030db6 100644
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@@ -71,6 +71,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
         f'答案应该?是\s*([{options}])',
         f'答案应该?选\s*([{options}])',
         f'答案选项为?\s*:\s*([{options}])',
+        f'答案选项为?\s+\(?\*?\*?([{options}])\*?\*?\)?',
         f'答案选项是?\s*:\s*([{options}])',
         f'答案为\s*([{options}])',
         f'答案选\s*([{options}])',
@@ -100,6 +101,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
         f'答案为\s?(\S+)(?:。|$)',
         f'(?i)ANSWER\s*:\s*([{options}])',
         f'[Tt]he answer is:?\s+\(?([{options}])\)?',
+        f'[Tt]he answer is:?\s+\(?\*?\*?([{options}])\*?\*?\)?',
         f'[Tt]he answer is option:?\s+\(?([{options}])\)?',
         f'[Tt]he correct answer is:?\s+\(?([{options}])\)?',
         f'[Tt]he correct answer is option:?\s+\(?([{options}])\)?',
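The two added patterns cover answers wrapped in markdown bold, e.g. `The answer is **B**.`, which the existing parenthesis-only variants do not match. A standalone check mirroring the second added pattern:

```python
import re

options = 'ABCD'
# Mirrors the added pattern: optional '(' plus up to two '*' on each side
# of the option letter.
pattern = rf'[Tt]he answer is:?\s+\(?\*?\*?([{options}])\*?\*?\)?'

for text in ['The answer is **B**.', 'The answer is (C)', 'the answer is A']:
    match = re.search(pattern, text)
    print(match.group(1) if match else None)  # B, C, A
```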