Skip to content

Commit

Permalink
Change 'sub_em' to 'acc'
Browse files Browse the repository at this point in the history
  • Loading branch information
ignorejjj committed Jul 4, 2024
1 parent b439c45 commit 15de563
Show file tree
Hide file tree
Showing 7 changed files with 10 additions and 10 deletions.
4 changes: 2 additions & 2 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ use_fid: False # whether to use FID, only valid in encoder-decoder model

# -------------------------------------------------Evaluation Settings------------------------------------------------#
# Metrics to evaluate the result
metrics: ['em','f1','sub_em','precision','recall']
metrics: ['em','f1','acc','precision','recall']
# Specify setting for metric, will be called within certain metrics
metric_setting:
retrieval_recall_topk: 5
Expand Down Expand Up @@ -301,7 +301,7 @@ This section sets various settings used during evaluation. If you use a custom e

```yaml
# Metrics to evaluate the result
metrics: ['em','f1','sub_em','precision','recall']
metrics: ['em','f1','acc','precision','recall']
# Specify setting for metric, will be called within certain metrics
metric_setting:
retrieval_recall_topk: 5
Expand Down
4 changes: 2 additions & 2 deletions docs/introduction_for_beginners_en.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ config_dict = {
'model2path': {'e5': <retriever_path>, 'llama2-7B-chat': <generator_path>},
'generator_model': 'llama2-7B-chat',
'retrieval_method': 'e5',
'metrics': ['em', 'f1', 'sub_em'],
'metrics': ['em', 'f1', 'acc'],
'retrieval_topk': 1,
'save_intermediate_data': True
}
Expand Down Expand Up @@ -142,7 +142,7 @@ config_dict = {
'model2path': {'e5': <retriever_path>, 'llama2-7B-chat': <generator_path>},
'generator_model': 'llama2-7B-chat',
'retrieval_method': 'e5',
'metrics': ['em','f1','sub_em'],
'metrics': ['em','f1','acc'],
'retrieval_topk': 1,
'save_intermediate_data': True
}
Expand Down
2 changes: 1 addition & 1 deletion docs/introduction_for_beginners_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ config_dict = {
'model2path': {'e5': <retriever_path>, 'llama2-7B-chat': <generator_path>},
'generator_model': 'llama2-7B-chat',
'retrieval_method': 'e5',
'metrics': ['em', 'f1', 'sub_em'],
'metrics': ['em', 'f1', 'acc'],
'retrieval_topk': 1,
'save_intermediate_data': True
}
Expand Down
2 changes: 1 addition & 1 deletion examples/methods/my_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ sc_config:

# -------------------------------------------------Evaluation Settings------------------------------------------------#
# Metrics to evaluate the result
metrics: [ 'em','f1','sub_em','precision','recall']
metrics: [ 'em','f1','acc','precision','recall']
# Specify setting for metric, will be called within certain metrics
metric_setting:
retrieval_recall_topk: 5
Expand Down
2 changes: 1 addition & 1 deletion examples/quick_start/simple_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
'model2path': {'e5': args.retriever_path, 'llama3-8B-instruct': args.model_path},
'generator_model': 'llama3-8B-instruct',
'retrieval_method': 'e5',
'metrics': ['em','f1','sub_em'],
'metrics': ['em','f1','acc'],
'retrieval_topk': 1,
'save_intermediate_data': True
}
Expand Down
2 changes: 1 addition & 1 deletion flashrag/config/basic_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ gpu_memory_utilization: 0.85 # ratio of gpu's memory usage for generator

# -------------------------------------------------Evaluation Settings------------------------------------------------#
# Metrics to evaluate the result
metrics: ['em','f1','sub_em','precision','recall','input_tokens']
metrics: ['em','f1','acc','precision','recall','input_tokens']
# Specify setting for metric, will be called within certain metrics
metric_setting:
retrieval_recall_topk: 5
Expand Down
4 changes: 2 additions & 2 deletions flashrag/evaluator/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def calculate_metric(self, data):
class Sub_ExactMatch(BaseMetric):
r"""Sub-Exact match measure whether the predicted answer contains the standard answer.
"""
metric_name = "sub_em"
metric_name = "acc"

def __init__(self, config):
super().__init__(config)
Expand Down Expand Up @@ -172,7 +172,7 @@ def calculate_metric(self, data):
metric_score_list = [self.calculate_sub_em(pred, golden_answers) for pred, golden_answers in zip(pred_list, golden_answers_list)]
sub_em_score = sum(metric_score_list) / len(metric_score_list)

return {"sub_em": sub_em_score}, metric_score_list
return {"acc": sub_em_score}, metric_score_list

class Retrieval_Recall(BaseMetric):
r"""The recall of the top-k retrieved passages, we measure if any of the passage contain the answer string. """
Expand Down

0 comments on commit 15de563

Please sign in to comment.