
Commit

fix lint
bittersweet1999 committed Sep 10, 2024
1 parent b900f93 commit f53663a
Showing 2 changed files with 56 additions and 31 deletions.
34 changes: 26 additions & 8 deletions opencompass/datasets/subjective/followbench.py
@@ -2,19 +2,25 @@
import json
import os.path as osp
import re

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset


def check_match(target, generation):
pattern_str = target.replace('{{', '{').replace('}}', '}').replace('{answer}', '.*')
pattern_str = target.replace('{{',
'{').replace('}}',
'}').replace('{answer}', '.*')
pattern_str = re.escape(pattern_str).replace('\\.\\*', '.*')
match = re.fullmatch(pattern_str, generation)
return bool(match)


@LOAD_DATASET.register_module()
class FollowBenchDataset(BaseDataset):

@@ -28,11 +34,19 @@ def load(self, path: str, name: str, cate: str, *args, **kwargs):
data = json.load(f)
for item in data:
if cate == 'llm':
raw_data.append({'instruction': item['instruction'], 'judge_prompt': item['judge_prompt'], 'judge': item})
raw_data.append({
'instruction': item['instruction'],
'judge_prompt': item['judge_prompt'],
'judge': item
})
elif cate == 'rule':
raw_data.append({'instruction': item['instruction'], 'judge': item})
raw_data.append({
'instruction': item['instruction'],
'judge': item
})
else:
raise NotImplementedError(f"Category '{cate}' is not implemented.")
raise NotImplementedError(
f"Category '{cate}' is not implemented.")

dataset = Dataset.from_list(raw_data)
return dataset
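
For orientation, here is a minimal sketch of the two record shapes the loader builds; the concrete item is invented for illustration. Only the 'llm' category keeps a judge_prompt for LLM-based judging, while 'rule' items are scored later by the evaluator's rule-based matching.

```python
# Invented example item; real FollowBench entries carry more fields.
item = {
    'instruction': 'Name three prime numbers, separated by commas.',
    'judge_prompt': 'Check whether every constraint in the instruction is satisfied.',
    'level': 1,
}

# cate == 'llm': keep the judge prompt alongside the full item for the LLM judge.
llm_record = {
    'instruction': item['instruction'],
    'judge_prompt': item['judge_prompt'],
    'judge': item,
}

# cate == 'rule': no judge prompt; rule-based scoring happens in the evaluator.
rule_record = {'instruction': item['instruction'], 'judge': item}

print(sorted(llm_record.keys()), sorted(rule_record.keys()))
```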
@@ -126,20 +140,24 @@ def chdir_return(cwd, return_value):
return chdir_return(cwd, pass_flag)

def score(self, predictions, references):
results = {'example':{'accuracy':[0,0,0,0,0], 'num':0}}
results = {'example': {'accuracy': [0, 0, 0, 0, 0], 'num': 0}}
for prediction, reference in zip(predictions, references):
if reference['category'] == 'example':
results['example']['num'] += 1
template = reference['target'].replace('{instruction}\n', '')
match_result = check_match(template, prediction)
print(match_result)
results['example']['accuracy'][reference['level'] - 1] += match_result
results['example']['accuracy'] = [round(acc / (results['example']['num'] // 5), 2) for acc in results['example']['accuracy']]
results['example']['accuracy'][reference['level'] -
1] += match_result
results['example']['accuracy'] = [
round(acc / (results['example']['num'] // 5), 2)
for acc in results['example']['accuracy']
]
######## Still not finished for rule-based evaluation

# Each process changes cwd, need to use multi-processing
with ProcessPoolExecutor(self.num_workers) as executor:
passed = sum(
list(executor.map(self.score_single, predictions, references)))

return {'accuracy': passed / total}
return {'accuracy': passed / total}
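
As a quick illustration of the rule-based path above, the following standalone sketch mirrors check_match: the {answer} placeholder in the reference template becomes a .* wildcard, everything else is escaped literally, and the prediction must match the whole pattern. The template and generations below are made up for the example.

```python
import re


def check_match(target, generation):
    # Unescape doubled braces and turn the answer slot into a wildcard.
    pattern_str = target.replace('{{', '{').replace('}}', '}').replace('{answer}', '.*')
    # Escape everything else literally, then restore the wildcard.
    pattern_str = re.escape(pattern_str).replace('\\.\\*', '.*')
    return bool(re.fullmatch(pattern_str, generation))


# Hypothetical level-1 template and two candidate generations.
template = 'The answer is {answer}.'
print(check_match(template, 'The answer is 42.'))  # True: the wildcard absorbs "42"
print(check_match(template, 'Maybe it is 42.'))    # False: surrounding text must match exactly
```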
53 changes: 30 additions & 23 deletions opencompass/summarizers/subjective/followbench.py
@@ -3,9 +3,10 @@
import os
import os.path as osp
import re
import statistics
from collections import defaultdict
from datetime import datetime
import statistics

import numpy as np
from mmengine import ConfigDict

@@ -19,32 +20,36 @@
from .subjective_post_process import post_process_autoj, post_process_judgelm
from .utils import get_judgeanswer_and_reference_update, get_outdir


def post_process_followbench(item):
generation, level = item['prediction'], item['gold']['level']
try:
satisify = generation.strip("```").strip().split('\n')[-1]
satisfy = generation.strip('```').strip().split('\n')[-1]

if level == 1:
if 'YES' in satisify:
if 'YES' in satisfy:
return 1, 1
elif 'NO' in satisify:
elif 'NO' in satisfy:
return 0, 0
else:
raise Exception('Invalid evaluation for level 1.')
else:
satisify_list = re.search(r'\[.*\]', satisify)
if satisify_list:
satisify_list = eval(satisify_list.group())
if len(satisify_list) == level:
satisfy_list = re.search(r'\[.*\]', satisfy)
if satisfy_list:
satisfy_list = eval(satisfy_list.group())
if len(satisfy_list) == level:
num_true = 0
for i in satisify_list:
for i in satisfy_list:
if i == 'YES' or i == 'True':
num_true += 1
elif i in ['NO', 'False', 'PARTIAL', 'MAYBE', 'UNKNOWN', 'N/A']:
elif i in [
'NO', 'False', 'PARTIAL', 'MAYBE', 'UNKNOWN',
'N/A'
]:
num_true += 0
else:
raise Exception('Invalid element in the list.')
return int(num_true==level), num_true/level
return int(num_true == level), num_true / level
else:
raise Exception('Invalid number of elements in the list.')
else:
Expand All @@ -53,33 +58,37 @@ def post_process_followbench(item):
except Exception as e:
return -1, -1


def get_scores(judged_answers, references):
results = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
n_group = len(judged_answers) // 5
n_groups = [n_group]*5
n_groups = [n_group] * 5

for judged_answer, reference in zip(judged_answers, references):
if judged_answer[0] == -1:
n_groups[reference['level']-1] -= 1
n_groups[reference['level'] - 1] -= 1
else:
results[0][reference['level']-1] += judged_answer[0]
results[1][reference['level']-1] += judged_answer[1]
results[0][reference['level'] - 1] += judged_answer[0]
results[1][reference['level'] - 1] += judged_answer[1]

for i in range(len(results)):
for j in range(len(results[i])):
if n_groups[j] != 0:
results[i][j] = results[i][j] / n_groups[j]
else:
results[i][j] = 0
temp_dict = {"HSR_AVG": statistics.mean(results[0]), "SSR_AVG": statistics.mean(results[1])}
temp_dict = {
'HSR_AVG': statistics.mean(results[0]),
'SSR_AVG': statistics.mean(results[1])
}
for idx, s in enumerate(results[0]):
temp_dict[f'HSR_L{idx+1}'] = s
for idx, s in enumerate(results[1]):
temp_dict[f'SSR_L{idx+1}'] = s

return temp_dict


class FollowBenchSummarizer:
"""Do the subjectivity analyze based on evaluation results.
@@ -99,7 +108,6 @@ def __init__(self, config: ConfigDict) -> None:

self.judge_function = post_process_followbench


def summarize(self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
@@ -118,9 +126,8 @@ def summarize(self,
dataset = dataset_cfgs[0] # Alignbench has only one subfile
output_dir, results_folder = get_outdir(self.cfg, time_str)

fout = osp.join(
output_dir,
'followbench-judged-by--' + judge_abbr + '.csv')
fout = osp.join(output_dir,
'followbench-judged-by--' + judge_abbr + '.csv')

for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + judge_abbr
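
To make the judge-output parsing in this file concrete, here is a small self-contained sketch of what post_process_followbench does for a multi-constraint (level > 1) reply: the last line of the reply is scanned for a bracketed verdict list, the hard score is 1 only when every constraint is satisfied, and the soft score is the satisfied fraction. The reply text and level are invented; the real helper also handles level 1 and returns (-1, -1) on malformed output, which get_scores then excludes from the per-level averages.

```python
import re

# Hypothetical judge reply for a level-3 (three-constraint) sample.
reply = "Constraint-by-constraint reasoning...\n['YES', 'NO', 'YES']"
level = 3

# Take the last line of the reply and look for the bracketed verdict list.
last_line = reply.strip('`').strip().split('\n')[-1]
verdicts = eval(re.search(r'\[.*\]', last_line).group())  # ['YES', 'NO', 'YES']

num_true = sum(1 for v in verdicts if v in ('YES', 'True'))
hard = int(num_true == level)  # HSR contribution: 1 only if all constraints pass
soft = num_true / level        # SSR contribution: fraction of constraints passed
print(hard, soft)              # -> 0 0.666...
```

These per-sample pairs are what get_scores averages into the HSR_L1..L5 / SSR_L1..L5 entries and the HSR_AVG / SSR_AVG values written to the summarizer's CSV.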

