From 4535bb8e50ccd3e9703436e8d0437bb21ce6d7d6 Mon Sep 17 00:00:00 2001
From: Zirui <91374685+ZiruiSongBest@users.noreply.github.com>
Date: Wed, 22 Nov 2023 18:22:14 +1100
Subject: [PATCH] evaluate

---
 evaluate/gpt_evaluation_script.py          | 129 ++++++++++++++++++++
 evaluate/gpt_evaluation_script_AD.py       | 135 +++++++++++++++++++++
 evaluate/gpt_evaluation_script_RS.py       | 127 +++++++++++++++++++
 evaluate/gpt_evaluation_script_infrard.py  | 128 +++++++++++++++++++
 evaluate/gpt_evaluation_script_original.py | 127 +++++++++++++++++++
 evaluate/gpt_evaluation_script_style.py    | 126 +++++++++++++++++++
 evaluate/gpt_evaluation_script_xray.py     | 122 +++++++++++++++++++
 7 files changed, 894 insertions(+)
 create mode 100644 evaluate/gpt_evaluation_script.py
 create mode 100644 evaluate/gpt_evaluation_script_AD.py
 create mode 100644 evaluate/gpt_evaluation_script_RS.py
 create mode 100644 evaluate/gpt_evaluation_script_infrard.py
 create mode 100644 evaluate/gpt_evaluation_script_original.py
 create mode 100644 evaluate/gpt_evaluation_script_style.py
 create mode 100644 evaluate/gpt_evaluation_script_xray.py

diff --git a/evaluate/gpt_evaluation_script.py b/evaluate/gpt_evaluation_script.py
new file mode 100644
index 0000000..07b793b
--- /dev/null
+++ b/evaluate/gpt_evaluation_script.py
@@ -0,0 +1,129 @@
+import re
+import json
+import openai  # Assuming openai is installed and properly set up
+import time
+from tqdm import tqdm
+import argparse
+
+from openai import OpenAI
+
+client = OpenAI()
+
+
+def load_jsonl(file_path):
+    with open(file_path, 'r') as file:
+        return [json.loads(line) for line in file]
+
+
+def get_gpt_scores(prediction_jsonl_path, ground_truth_jsonl_path, output_jsonl_path, gpt_model):
+    # Load the ground truths
+    ground_truths = load_jsonl(ground_truth_jsonl_path)
+
+    # Create a dictionary for easy access to ground truths
+    gt_dict = {item['question_id']: item for item in ground_truths}
+
+    # Process each prediction
+    predictions = load_jsonl(prediction_jsonl_path)
+
+    with open(output_jsonl_path, 'w') as out_file:
+        for item in tqdm(predictions, desc='Evaluating (press Ctrl+C if stuck)', dynamic_ncols=True):
+            question_id = item['question_id']
+            prediction_text = item.get('model_output', "")
+
+            gt_item = gt_dict.get(question_id, {})
+            gt_answer = gt_item.get('answer', "")
+
+            prediction_text = str(prediction_text)
+            gt_answer = str(gt_answer)
+
+            gt_question = gt_item.get('prompt')
+
+            print(f"question_id: {question_id}, prediction_text: {prediction_text}, gt_answer: {gt_answer}")
+            if not prediction_text or not gt_answer:
+                print(f"Skipping question_id {question_id} due to empty prediction_text or gt_answer.")
+                continue
+
+            retries = 0
+            max_retries = 3
+            while retries < max_retries:
+                # Build the scoring prompt for the GPT judge
+                question = f"""Compare the ground truth and prediction from AI models to give a correctness score for the prediction. Ignore case and singular/plural grammar differences, and consider whether the meaning is similar. If the meaning is similar, it deserves full marks. A '/' in the ground truth indicates that there are multiple valid responses to the question; any one of them deserves full marks. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
+                Example:
+                Question | Ground truth | Prediction | Correctness
+                --- | --- | --- | ---
+                How many apples are here? | 10 | 7 | 0.0
+                How many apples are here? | 10 | 10 | 1.0
+                What are keeping the elephants in their area? | bars / fence / fences / cage | fence | 1.0
+                What are keeping the elephants in their area? | bars / fence / fences / cage | They are stuck in the cage. | 1.0
+                Identify the relevant traffic signal for the ego-vehicle's current path | None | Green | 0.0
+                Identify the relevant traffic signal for the ego-vehicle's current path | Green Light | Red | 0.0
+                Identify the relevant traffic signal for the ego-vehicle's current path | Green Light | Green | 1.0
+                What can the organ with black color in this image be used for? | breathe | Breathing. | 1.0
+
+                Here is the QA pair you need to compare and score:
+                Question: {gt_question}
+                Ground Truth: {gt_answer}
+                Prediction: {prediction_text}
+                Score:
+
+                Provide only the numerical correctness score as the output.
+                """
+
+                try:
+                    response = client.chat.completions.create(
+                        model=gpt_model,
+                        max_tokens=64,
+                        messages=[{"role": "user", "content": question}],
+                        timeout=10,
+                    )
+                    # print("response: ", response)
+                except Exception as e:
+                    # Back off briefly before the next attempt if the API call fails.
+                    print(f"API call failed ({e}); sleeping 30s before retrying")
+                    time.sleep(30)
+                else:
+                    # Parse the numerical score from the model's reply and write it to the output file
+                    model_response = response.choices[0].message.content
+                    print(f"model_response: {model_response}")
+                    try:
+                        score_matches = re.findall(r"(\d+(\.\d+)?)", model_response)
+                        if score_matches:
+                            if len(score_matches) > 1:
+                                raise ValueError(f"Multiple numbers detected: {model_response}")
+
+                            score = float(score_matches[0][0])
+                            # print(f"model_response: {model_response}")
+                            print(f"score: {score}")
+                            if 0 <= score <= 1:
+                                result = {
+                                    'question_id': question_id,
+                                    'image': gt_item.get('image', ''),
+                                    'model_response': score
+                                }
+                                out_file.write(json.dumps(result) + '\n')
+                                break
+                            else:
+                                raise ValueError(f"Invalid response format: {model_response}")
+                    except ValueError:
+                        pass
+
+                retries += 1
+                if retries == max_retries:
+                    print(f"Failed to get a valid score after {max_retries} attempts for question_id {question_id}.")
+
+
+# Example invocation:
+# get_gpt_scores("/workspace/LLaVA/Zirui/Results/llava_1.5/llava_1.5_13B_orignal.jsonl", "/workspace/LLaVA/Zirui/jsonl/llava/Benckmark_LLaVA_style.jsonl", "/workspace/LLaVA/Zirui/evaluate/score/oringal_score_LLaVA_1.5_13B.jsonl", "gpt-4-0613")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Evaluate predictions using GPT.')
+    parser.add_argument('--prediction_jsonl_path', type=str, required=True, help='Path to the prediction JSONL file.')
+    parser.add_argument('--ground_truth_jsonl_path', type=str, required=True, help='Path to the ground truth JSONL file.')
+    parser.add_argument('--output_jsonl_path', type=str, required=True, help='Path to save the output JSONL file.')
+    parser.add_argument('--gpt_model', type=str, required=True, help='GPT model to use for evaluation.')
+
+    args = parser.parse_args()
+    get_gpt_scores(args.prediction_jsonl_path, args.ground_truth_jsonl_path, args.output_jsonl_path, args.gpt_model)
+
+
+if __name__ == '__main__':
+    main()
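Note: all seven scripts read and write newline-delimited JSON. Based on the keys the code above reads ('question_id' and 'model_output' from predictions; 'question_id', 'prompt', 'answer', and 'image' from ground truths) and writes ('question_id', 'image', 'model_response'), hypothetical records could look like the following; the field values are made up for illustration only.

    # prediction JSONL (one record per line)
    {"question_id": 42, "model_output": "There are 10 apples."}
    # ground-truth JSONL
    {"question_id": 42, "prompt": "How many apples are here?", "answer": "10", "image": "apples.jpg"}
    # score JSONL written by get_gpt_scores
    {"question_id": 42, "image": "apples.jpg", "model_response": 1.0}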
diff --git a/evaluate/gpt_evaluation_script_AD.py b/evaluate/gpt_evaluation_script_AD.py
new file mode 100644
index 0000000..a38c7cc
--- /dev/null
+++ b/evaluate/gpt_evaluation_script_AD.py
@@ -0,0 +1,135 @@
+import re
+import json
+import openai  # Assuming openai is installed and properly set up
+import time
+from tqdm import tqdm
+import argparse
+
+from openai import OpenAI
+
+client = OpenAI()
+
+
+def load_jsonl(file_path):
+    with open(file_path, 'r') as file:
+        return [json.loads(line) for line in file]
+
+
+def get_gpt_scores(prediction_jsonl_path, ground_truth_jsonl_path, output_jsonl_path, gpt_model):
+    # Load the ground truths
+    ground_truths = load_jsonl(ground_truth_jsonl_path)
+
+    # Create a dictionary for easy access to ground truths
+    gt_dict = {item['question_id']: item for item in ground_truths}
+
+    # Process each prediction
+    predictions = load_jsonl(prediction_jsonl_path)
+
+    with open(output_jsonl_path, 'w') as out_file:
+        for item in tqdm(predictions, desc='Evaluating (press Ctrl+C if stuck)', dynamic_ncols=True):
+            question_id = item['question_id']
+            prediction_text = item.get('model_output', "")
+
+            gt_item = gt_dict.get(question_id, {})
+            gt_answer = gt_item.get('answer', "")
+
+            prediction_text = str(prediction_text)
+            gt_answer = str(gt_answer)
+
+            gt_question = gt_item.get('prompt')
+
+            print(f"question_id: {question_id}, prediction_text: {prediction_text}, gt_answer: {gt_answer}")
+            if not prediction_text or not gt_answer:
+                print(f"Skipping question_id {question_id} due to empty prediction_text or gt_answer.")
+                continue
+
+            retries = 0
+            max_retries = 3
+            while retries < max_retries:
+                # Build the scoring prompt for the GPT judge
+                question = f"""Compare the ground truth and prediction from AI models to give a correctness score for the prediction. Ignore case and singular/plural grammar differences, and consider whether the meaning is similar. If the meaning is similar, it deserves full marks. A '/' in the ground truth indicates that there are multiple valid responses to the question; any one of them deserves full marks. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
+                Example:
+                Question | Ground truth | Prediction | Correctness
+                --- | --- | --- | ---
+                Identify the relevant traffic signal for the ego-vehicle's current path\nAnswer the question using a single word or phrase. | Green | None | 0.0
+                Identify the relevant traffic signal for the ego-vehicle's current path\nAnswer the question using a single word or phrase. | green. | Green Light | 1.0
+                Identify the relevant traffic signal for the ego-vehicle's current path\nAnswer the question using a single word or phrase. | Green Light | Green | 1.0
+                Identify the relevant traffic signal for the ego-vehicle's current path\nAnswer the question using a single word or phrase. | green. | Green | 1.0
+                Considering the objects such as car and traffic light from the visual language dataset categories, what is the intended behavior or action for the main vehicle in an autonomous driving scenario?\nAnswer the question using a single word or phrase. | stop | Stop | 1.0
+                How many apples are here? | 10 | 10 | 1.0
+                What are keeping the elephants in their area? | bars / fence / fences / cage | fence | 1.0
+                What are keeping the elephants in their area? | bars / fence / fences / cage | They are stuck in the cage. | 1.0
+                Identify the relevant traffic signal for the ego-vehicle's current path | None | Green | 0.0
+                Identify the relevant traffic signal for the ego-vehicle's current path | Green Light | Red | 0.0
+                Identify the relevant traffic signal for the ego-vehicle's current path | Green Light | Green | 1.0
+                What can the organ with black color in this image be used for? | breathe | Breathing. | 1.0
+
+                Here is the QA pair you need to compare and score:
+                Question: {gt_question}
+                Ground Truth: {gt_answer}
+                Prediction: {prediction_text}
+                Score:
+
+                Provide only the numerical correctness score as the output.
+                """
+
+                try:
+                    response = client.chat.completions.create(
+                        model=gpt_model,
+                        max_tokens=64,
+                        messages=[{"role": "user", "content": question}],
+                        timeout=10,
+                    )
+                    # print("response: ", response)
+                except Exception as e:
+                    # Back off briefly and retry; without this, `response` would be undefined below.
+                    print(f"API call failed ({e}); sleeping 30s before retrying")
+                    time.sleep(30)
+                    retries += 1
+                    continue
+
+                # Parse the numerical score from the model's reply and write it to the output file
+                model_response = response.choices[0].message.content
+                print(f"model_response: {model_response}")
+                try:
+                    score_matches = re.findall(r"(\d+(\.\d+)?)", model_response)
+                    if score_matches:
+                        if len(score_matches) > 1:
+                            raise ValueError(f"Multiple numbers detected: {model_response}")
+
+                        score = float(score_matches[0][0])
+                        # print(f"model_response: {model_response}")
+                        print(f"score: {score}")
+                        if 0 <= score <= 1:
+                            result = {
+                                'question_id': question_id,
+                                'image': gt_item.get('image', ''),
+                                'model_response': score
+                            }
+                            out_file.write(json.dumps(result) + '\n')
+                            break
+                        else:
+                            raise ValueError(f"Invalid response format: {model_response}")
+                except ValueError:
+                    pass
+
+                retries += 1
+                if retries == max_retries:
+                    print(f"Failed to get a valid score after {max_retries} attempts for question_id {question_id}.")
+
+
+# Example invocation:
+# get_gpt_scores("/workspace/LLaVA/Zirui/Results/llava_1.5/llava_1.5_13B_orignal.jsonl", "/workspace/LLaVA/Zirui/jsonl/llava/Benckmark_LLaVA_style.jsonl", "/workspace/LLaVA/Zirui/evaluate/score/oringal_score_LLaVA_1.5_13B.jsonl", "gpt-4-0613")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Evaluate predictions using GPT.')
+    parser.add_argument('--prediction_jsonl_path', type=str, required=True, help='Path to the prediction JSONL file.')
+    parser.add_argument('--ground_truth_jsonl_path', type=str, required=True, help='Path to the ground truth JSONL file.')
+    parser.add_argument('--output_jsonl_path', type=str, required=True, help='Path to save the output JSONL file.')
+    parser.add_argument('--gpt_model', type=str, required=True, help='GPT model to use for evaluation.')
+
+    args = parser.parse_args()
+    get_gpt_scores(args.prediction_jsonl_path, args.ground_truth_jsonl_path, args.output_jsonl_path, args.gpt_model)
+
+
+if __name__ == '__main__':
+    main()
{gt_answer}") + if not prediction_text or not gt_answer: + print(f"Skipping question_id {question_id} due to empty prediction_text or gt_answer.") + continue + + retries = 0 + max_retries = 3 + while retries < max_retries: + # Create a question for the GPT model and other processing here... + question = f"""Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. Ignore case, single and plural grammar problems, and consider whether the meaning is similar. If the meaning is similar, it deserves full marks. A '/' in ground truth indicates that there are multiple responses to the question, with full marks for any one answer. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). + Example: + Question | Ground truth | Prediction | Correctness + --- | --- | --- | --- + How many apples are here? | 10 | 7 | 0.0 + How many apples are here? | 10 | ten | 1.0 + How many apples are here? | 10 | There are 10 apples. | 1.0 + Is there a brige in the picture? | yes | Yes, there is a bridge in the picture| 1.0 + How many vehicles are there in the picture? | 8546 | There are several cars in the picture | 0.0 + What can the organ with black color in this image be used for?| breathe | Breathing. | 1.0 + + Here is the QA you need to compare and score + Question: {gt_question} + Ground Truth: {gt_answer} + Prediction: {prediction_text} + Score : + + Provide only the numerical correctness score as the output. + """ + + + + try: + response = client.chat.completions.create( + model=gpt_model, + max_tokens=64, + messages=[{"role": "user", "content": question}], + timeout = 10, + ) + # print("response: ",response) + except: + print("sleep 30s") + time.sleep(30) + + # Example of how you might write results to the output file + model_response = response.choices[0].message.content + print(f"model_response: {model_response}") + try: + score_matches = re.findall(r"(\d+(\.\d+)?)", model_response) + if score_matches: + if len(score_matches) > 1: + raise ValueError(f"Multiple numbers detected: {model_response}") + + score = float(score_matches[0][0]) + # print(f"model_response: {model_response}") + print(f"score: {score}") + if 0 <= score <= 1: + result = { + 'question_id': question_id, + 'image': gt_item.get('image', ''), + 'model_response': score + } + out_file.write(json.dumps(result) + '\n') + break + else: + raise ValueError(f"Invalid response format: {model_response}") + except ValueError: + pass + + + retries += 1 + if retries == max_retries: + print(f"Failed to get a valid score after {max_retries} attempts for question_id {question_id}.") + + +# 调用函数 +#get_gpt_scores("/workspace/LLaVA/Zirui/Results/llava_1.5/llava_1.5_13B_orignal.jsonl", "/workspace/LLaVA/Zirui/jsonl/llava/Benckmark_LLaVA_style.jsonl", "/workspace/LLaVA/Zirui/evaluate/score/oringal_score_LLaVA_1.5_13B.jsonl", "gpt-4-0613") + +def main(): + parser = argparse.ArgumentParser(description='Evaluate predictions using GPT.') + parser.add_argument('--prediction_jsonl_path', type=str, required=True,help='Path to the prediction JSONL file.') + parser.add_argument('--ground_truth_jsonl_path', type=str, required=True,help='Path to the ground truth JSONL file.') + parser.add_argument('--output_jsonl_path', type=str, required=True,help='Path to save the output JSONL file.') + parser.add_argument('--gpt_model', type=str, required=True, help='GPT model to use for evaluation.') + + args = parser.parse_args() + get_gpt_scores(args.prediction_jsonl_path, 
args.ground_truth_jsonl_path, args.output_jsonl_path, args.gpt_model) + +if __name__ == '__main__': + main() diff --git a/evaluate/gpt_evaluation_script_infrard.py b/evaluate/gpt_evaluation_script_infrard.py new file mode 100644 index 0000000..58bd056 --- /dev/null +++ b/evaluate/gpt_evaluation_script_infrard.py @@ -0,0 +1,128 @@ +import re +import json +import openai # Assuming openai is installed and properly set up +import time +from tqdm import tqdm +import argparse + +from openai import OpenAI +client = OpenAI() + +def load_jsonl(file_path): + with open(file_path, 'r') as file: + return [json.loads(line) for line in file] + +def get_gpt_scores(prediction_jsonl_path, ground_truth_jsonl_path, output_jsonl_path, gpt_model): + # Load the ground truths + ground_truths = load_jsonl(ground_truth_jsonl_path) + + # Create a dictionary for easy access to ground truths + gt_dict = {item['question_id']: item for item in ground_truths} + + # Process each prediction + predictions = load_jsonl(prediction_jsonl_path) + + with open(output_jsonl_path, 'w') as out_file: + for item in tqdm(predictions,desc='Evaluating, If stuck, please Ctrl + C .', dynamic_ncols=True): + question_id = item['question_id'] + prediction_text = item.get('model_output',"") + + gt_item = gt_dict.get(question_id, {}) + gt_answer = gt_item.get('answer',"") + + prediction_text=str(prediction_text) + gt_answer=str(gt_answer) + + + gt_question = gt_item.get('prompt') + + print(f"question_id: {question_id}, prediction_text: {prediction_text}, gt_answer: {gt_answer}") + if not prediction_text or not gt_answer: + print(f"Skipping question_id {question_id} due to empty prediction_text or gt_answer.") + continue + + retries = 0 + max_retries = 3 + while retries < max_retries: + # Create a question for the GPT model and other processing here... + question = f"""Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. Ignore case, single and plural grammar problems, and consider whether the meaning is similar. If the meaning is similar, it deserves full marks. A '/' in ground truth indicates that there are multiple responses to the question, with full marks for any one answer. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). + Example: + Question | Ground truth | Prediction | Correctness + --- | --- | --- | --- + Are there any pedestrians or animals near the road?\nAnswer the question using a single word or phrase. | yes | Yes, there are pedestrians and a dog near the road. | 1.0 + What is the hottest content in the image?\nAnswer the question using a single word or phrase. | person | The hottest content in the image is a group of people walking down a street at night | 1.0 + What is the hottest content in the image?\nAnswer the question using a single word or phrase. | car |The hottest content in the image is a car. | 1.0 | + What is the hottest content in the image?\nAnswer the question using a single word or phrase. | car | T | 1.0 + What is the hottest content in the image?\nAnswer the question using a single word or phrase. | car / people / person |What is the hottest content in the image?\nAnswer the question using a single word or phrase. | 1.0 + + Here is the QA you need to compare and score + Question: {gt_question} + Ground Truth: {gt_answer} + Prediction: {prediction_text} + Score : + + Provide only the numerical correctness score as the output. 
+ """ + + + + try: + response = client.chat.completions.create( + model=gpt_model, + max_tokens=64, + messages=[{"role": "user", "content": question}], + timeout = 10, + ) + # print("response: ",response) + except: + print("sleep 30s") + time.sleep(30) + + # Example of how you might write results to the output file + model_response = response.choices[0].message.content + print(f"model_response: {model_response}") + try: + score_matches = re.findall(r"(\d+(\.\d+)?)", model_response) + if score_matches: + if len(score_matches) > 1: + raise ValueError(f"Multiple numbers detected: {model_response}") + + score = float(score_matches[0][0]) + # print(f"model_response: {model_response}") + print(f"score: {score}") + if 0 <= score <= 1: + result = { + 'question_id': question_id, + 'image': gt_item.get('image', ''), + 'model_response': score + } + out_file.write(json.dumps(result) + '\n') + break + else: + raise ValueError(f"Invalid response format: {model_response}") + except ValueError: + pass + + + retries += 1 + if retries == max_retries: + print(f"Failed to get a valid score after {max_retries} attempts for question_id {question_id}.") + + +# 调用函数 +#get_gpt_scores("/workspace/LLaVA/Zirui/Results/llava_1.5/llava_1.5_13B_orignal.jsonl", "/workspace/LLaVA/Zirui/jsonl/llava/Benckmark_LLaVA_style.jsonl", "/workspace/LLaVA/Zirui/evaluate/score/oringal_score_LLaVA_1.5_13B.jsonl", "gpt-4-0613") + +def main(): + parser = argparse.ArgumentParser(description='Evaluate predictions using GPT.') + parser.add_argument('--prediction_jsonl_path', type=str, required=True,help='Path to the prediction JSONL file.') + parser.add_argument('--ground_truth_jsonl_path', type=str, required=True,help='Path to the ground truth JSONL file.') + parser.add_argument('--output_jsonl_path', type=str, required=True,help='Path to save the output JSONL file.') + parser.add_argument('--gpt_model', type=str, required=True, help='GPT model to use for evaluation.') + + + + args = parser.parse_args() + get_gpt_scores(args.prediction_jsonl_path, args.ground_truth_jsonl_path, args.output_jsonl_path, args.gpt_model) + +if __name__ == '__main__': + main() diff --git a/evaluate/gpt_evaluation_script_original.py b/evaluate/gpt_evaluation_script_original.py new file mode 100644 index 0000000..97db5d8 --- /dev/null +++ b/evaluate/gpt_evaluation_script_original.py @@ -0,0 +1,127 @@ +import re +import json +import openai # Assuming openai is installed and properly set up +import time +from tqdm import tqdm + +import argparse +from openai import OpenAI +client = OpenAI() +def load_jsonl(file_path): + with open(file_path, 'r') as file: + return [json.loads(line) for line in file] + +def get_gpt_scores(prediction_jsonl_path, ground_truth_jsonl_path, output_jsonl_path, gpt_model): + # Load the ground truths + ground_truths = load_jsonl(ground_truth_jsonl_path) + + # Create a dictionary for easy access to ground truths + gt_dict = {item['question_id']: item for item in ground_truths} + + # Process each prediction + predictions = load_jsonl(prediction_jsonl_path) + + with open(output_jsonl_path, 'w') as out_file: + for item in tqdm(predictions,desc='Evaluating, If stuck, please Ctrl + C .', dynamic_ncols=True): + question_id = item['question_id'] + prediction_text = item.get('model_output',"") + + gt_item = gt_dict.get(question_id, {}) + gt_answer = gt_item.get('answer',"") + + prediction_text=str(prediction_text) + gt_answer=str(gt_answer) + + + gt_question = gt_item.get('prompt') + + print(f"question_id: {question_id}, prediction_text: 
diff --git a/evaluate/gpt_evaluation_script_original.py b/evaluate/gpt_evaluation_script_original.py
new file mode 100644
index 0000000..97db5d8
--- /dev/null
+++ b/evaluate/gpt_evaluation_script_original.py
@@ -0,0 +1,127 @@
+import re
+import json
+import openai  # Assuming openai is installed and properly set up
+import time
+from tqdm import tqdm
+import argparse
+
+from openai import OpenAI
+
+client = OpenAI()
+
+
+def load_jsonl(file_path):
+    with open(file_path, 'r') as file:
+        return [json.loads(line) for line in file]
+
+
+def get_gpt_scores(prediction_jsonl_path, ground_truth_jsonl_path, output_jsonl_path, gpt_model):
+    # Load the ground truths
+    ground_truths = load_jsonl(ground_truth_jsonl_path)
+
+    # Create a dictionary for easy access to ground truths
+    gt_dict = {item['question_id']: item for item in ground_truths}
+
+    # Process each prediction
+    predictions = load_jsonl(prediction_jsonl_path)
+
+    with open(output_jsonl_path, 'w') as out_file:
+        for item in tqdm(predictions, desc='Evaluating (press Ctrl+C if stuck)', dynamic_ncols=True):
+            question_id = item['question_id']
+            prediction_text = item.get('model_output', "")
+
+            gt_item = gt_dict.get(question_id, {})
+            gt_answer = gt_item.get('answer', "")
+
+            prediction_text = str(prediction_text)
+            gt_answer = str(gt_answer)
+
+            gt_question = gt_item.get('prompt')
+
+            print(f"question_id: {question_id}, prediction_text: {prediction_text}, gt_answer: {gt_answer}")
+            if not prediction_text or not gt_answer:
+                print(f"Skipping question_id {question_id} due to empty prediction_text or gt_answer.")
+                continue
+
+            retries = 0
+            max_retries = 3
+            while retries < max_retries:
+                # Build the scoring prompt for the GPT judge
+                question = f"""Compare the ground truth and prediction from AI models to give a correctness score for the prediction. Ignore case and singular/plural grammar differences, and consider whether the meaning is similar. If the meaning is similar, it deserves full marks. A '/' in the ground truth indicates that there are multiple valid responses to the question; any one of them deserves full marks. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
+                Example:
+                Question | Ground truth | Prediction | Correctness
+                --- | --- | --- | ---
+                What is the fruit? | Apple | The fruit is apples. | 1.0
+                Is the boat listing? | Yes | yes. | 1.0
+                Is the boat listing? | Yes | no. | 0.0
+                Is the fork dirty or clean? | Dirty | The fork is dirty. | 1.0
+                How many apples are here? | 10 | 7 | 0.0
+                How many apples are here? | 10 | 10 | 1.0
+
+                Here is the QA pair you need to compare and score:
+                Question: {gt_question}
+                Ground Truth: {gt_answer}
+                Prediction: {prediction_text}
+                Score:
+
+                Provide only the numerical correctness score as the output.
+                """
+
+                try:
+                    response = client.chat.completions.create(
+                        model=gpt_model,
+                        max_tokens=64,
+                        messages=[{"role": "user", "content": question}],
+                        timeout=10,
+                    )
+                    # print("response: ", response)
+                except Exception as e:
+                    # Back off briefly before the next attempt if the API call fails.
+                    print(f"API call failed ({e}); sleeping 30s before retrying")
+                    time.sleep(30)
+                else:
+                    # Parse the numerical score from the model's reply and write it to the output file
+                    model_response = response.choices[0].message.content
+                    print(f"model_response: {model_response}")
+                    try:
+                        score_matches = re.findall(r"(\d+(\.\d+)?)", model_response)
+                        if score_matches:
+                            if len(score_matches) > 1:
+                                raise ValueError(f"Multiple numbers detected: {model_response}")
+
+                            score = float(score_matches[0][0])
+                            # print(f"model_response: {model_response}")
+                            print(f"score: {score}")
+                            if 0 <= score <= 1:
+                                result = {
+                                    'question_id': question_id,
+                                    'image': gt_item.get('image', ''),
+                                    'model_response': score
+                                }
+                                out_file.write(json.dumps(result) + '\n')
+                                break
+                            else:
+                                raise ValueError(f"Invalid response format: {model_response}")
+                    except ValueError:
+                        pass
+
+                retries += 1
+                if retries == max_retries:
+                    print(f"Failed to get a valid score after {max_retries} attempts for question_id {question_id}.")
+
+
+# Example invocation:
+# get_gpt_scores("/workspace/LLaVA/Zirui/Results/llava_1.5/llava_1.5_13B_orignal.jsonl", "/workspace/LLaVA/Zirui/jsonl/llava/Benckmark_LLaVA_style.jsonl", "/workspace/LLaVA/Zirui/evaluate/score/oringal_score_LLaVA_1.5_13B.jsonl", "gpt-4-0613")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Evaluate predictions using GPT.')
+    parser.add_argument('--prediction_jsonl_path', type=str, required=True, help='Path to the prediction JSONL file.')
+    parser.add_argument('--ground_truth_jsonl_path', type=str, required=True, help='Path to the ground truth JSONL file.')
+    parser.add_argument('--output_jsonl_path', type=str, required=True, help='Path to save the output JSONL file.')
+    parser.add_argument('--gpt_model', type=str, required=True, help='GPT model to use for evaluation.')
+
+    args = parser.parse_args()
+    get_gpt_scores(args.prediction_jsonl_path, args.ground_truth_jsonl_path, args.output_jsonl_path, args.gpt_model)
+
+
+if __name__ == '__main__':
+    main()
+ """ + + + + try: + response = client.chat.completions.create( + model=gpt_model, + max_tokens=64, + messages=[{"role": "user", "content": question}], + timeout = 10, + ) + # print("response: ",response) + except: + print("sleep 30s") + time.sleep(30) + + # Example of how you might write results to the output file + model_response = response.choices[0].message.content + print(f"model_response: {model_response}") + try: + score_matches = re.findall(r"(\d+(\.\d+)?)", model_response) + if score_matches: + if len(score_matches) > 1: + raise ValueError(f"Multiple numbers detected: {model_response}") + + score = float(score_matches[0][0]) + # print(f"model_response: {model_response}") + print(f"score: {score}") + if 0 <= score <= 1: + result = { + 'question_id': question_id, + 'image': gt_item.get('image', ''), + 'model_response': score + } + out_file.write(json.dumps(result) + '\n') + break + else: + raise ValueError(f"Invalid response format: {model_response}") + except ValueError: + pass + + + retries += 1 + if retries == max_retries: + print(f"Failed to get a valid score after {max_retries} attempts for question_id {question_id}.") + + +# 调用函数 +#get_gpt_scores("/workspace/LLaVA/Zirui/Results/llava_1.5/llava_1.5_13B_orignal.jsonl", "/workspace/LLaVA/Zirui/jsonl/llava/Benckmark_LLaVA_style.jsonl", "/workspace/LLaVA/Zirui/evaluate/score/oringal_score_LLaVA_1.5_13B.jsonl", "gpt-4-0613") + +def main(): + parser = argparse.ArgumentParser(description='Evaluate predictions using GPT.') + parser.add_argument('--prediction_jsonl_path', type=str, required=True,help='Path to the prediction JSONL file.') + parser.add_argument('--ground_truth_jsonl_path', type=str, required=True,help='Path to the ground truth JSONL file.') + parser.add_argument('--output_jsonl_path', type=str, required=True,help='Path to save the output JSONL file.') + parser.add_argument('--gpt_model', type=str, required=True, help='GPT model to use for evaluation.') + + args = parser.parse_args() + get_gpt_scores(args.prediction_jsonl_path, args.ground_truth_jsonl_path, args.output_jsonl_path, args.gpt_model) + +if __name__ == '__main__': + main() diff --git a/evaluate/gpt_evaluation_script_xray.py b/evaluate/gpt_evaluation_script_xray.py new file mode 100644 index 0000000..feee118 --- /dev/null +++ b/evaluate/gpt_evaluation_script_xray.py @@ -0,0 +1,122 @@ +import re +import json +import openai # Assuming openai is installed and properly set up +import time +from tqdm import tqdm +import argparse +from openai import OpenAI +client = OpenAI() +def load_jsonl(file_path): + with open(file_path, 'r') as file: + return [json.loads(line) for line in file] + +def get_gpt_scores(prediction_jsonl_path, ground_truth_jsonl_path, output_jsonl_path, gpt_model): + # Load the ground truths + ground_truths = load_jsonl(ground_truth_jsonl_path) + + # Create a dictionary for easy access to ground truths + gt_dict = {item['question_id']: item for item in ground_truths} + + # Process each prediction + predictions = load_jsonl(prediction_jsonl_path) + + with open(output_jsonl_path, 'w') as out_file: + for item in tqdm(predictions,desc='Evaluating, If stuck, please Ctrl + C .', dynamic_ncols=True): + question_id = item['question_id'] + prediction_text = item.get('model_output',"") + + gt_item = gt_dict.get(question_id, {}) + gt_answer = gt_item.get('answer',"") + + prediction_text=str(prediction_text) + gt_answer=str(gt_answer) + + + gt_question = gt_item.get('prompt') + + print(f"question_id: {question_id}, prediction_text: {prediction_text}, gt_answer: 
{gt_answer}") + if not prediction_text or not gt_answer: + print(f"Skipping question_id {question_id} due to empty prediction_text or gt_answer.") + continue + + retries = 0 + max_retries = 3 + while retries < max_retries: + # Create a question for the GPT model and other processing here... + question = f"""Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. Ignore case, single and plural grammar problems, and consider whether the meaning is similar. If the meaning is similar, it deserves full marks. A '/' in ground truth indicates that there are multiple responses to the question, with full marks for any one answer. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). + Example: + Question | Ground truth | Prediction | Correctness + --- | --- | --- | --- + How many prohibited object in the luggage? | 1 |In the X-ray image, the luggage contains one prohibited object, which is a knife. | 1.0 + Are there prohibited objects int the luggage? | yes | Yes, there is a knife and a gun in the luggage. | 1.0 + What can the organ with black color in this image be used for?| breathe | Breathing. | 1.0 + + Here is the QA you need to compare and score + Question: {gt_question} + Ground Truth: {gt_answer} + Prediction: {prediction_text} + Score : + + Provide only the numerical correctness score as the output. + """ + + + + try: + response = client.chat.completions.create( + model=gpt_model, + max_tokens=64, + messages=[{"role": "user", "content": question}], + timeout = 10, + ) + # print("response: ",response) + except: + print("sleep 30s") + time.sleep(30) + + # Example of how you might write results to the output file + model_response = response.choices[0].message.content + print(f"model_response: {model_response}") + try: + score_matches = re.findall(r"(\d+(\.\d+)?)", model_response) + if score_matches: + if len(score_matches) > 1: + raise ValueError(f"Multiple numbers detected: {model_response}") + + score = float(score_matches[0][0]) + # print(f"model_response: {model_response}") + print(f"score: {score}") + if 0 <= score <= 1: + result = { + 'question_id': question_id, + 'image': gt_item.get('image', ''), + 'model_response': score + } + out_file.write(json.dumps(result) + '\n') + break + else: + raise ValueError(f"Invalid response format: {model_response}") + except ValueError: + pass + + + retries += 1 + if retries == max_retries: + print(f"Failed to get a valid score after {max_retries} attempts for question_id {question_id}.") + + +# 调用函数 +#get_gpt_scores("/workspace/LLaVA/Zirui/Results/llava_1.5/llava_1.5_13B_orignal.jsonl", "/workspace/LLaVA/Zirui/jsonl/llava/Benckmark_LLaVA_style.jsonl", "/workspace/LLaVA/Zirui/evaluate/score/oringal_score_LLaVA_1.5_13B.jsonl", "gpt-4-0613") + +def main(): + parser = argparse.ArgumentParser(description='Evaluate predictions using GPT.') + parser.add_argument('--prediction_jsonl_path', type=str, required=True,help='Path to the prediction JSONL file.') + parser.add_argument('--ground_truth_jsonl_path', type=str, required=True,help='Path to the ground truth JSONL file.') + parser.add_argument('--output_jsonl_path', type=str, required=True,help='Path to save the output JSONL file.') + parser.add_argument('--gpt_model', type=str, required=True, help='GPT model to use for evaluation.') + + args = parser.parse_args() + get_gpt_scores(args.prediction_jsonl_path, args.ground_truth_jsonl_path, args.output_jsonl_path, args.gpt_model) + +if __name__ == 
'__main__': + main()
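Note: the scripts above write one score per question but do not aggregate them. A minimal sketch of how the output JSONL could be summarized into a single mean score; this helper (here called summarize_scores.py) is hypothetical and not part of this patch, and it assumes the 'model_response' field written by get_gpt_scores.

    import json
    import sys

    def mean_score(score_jsonl_path):
        # Average the per-question 'model_response' scores written by get_gpt_scores.
        scores = []
        with open(score_jsonl_path, 'r') as f:
            for line in f:
                record = json.loads(line)
                scores.append(float(record['model_response']))
        return sum(scores) / len(scores) if scores else 0.0

    if __name__ == '__main__':
        # Usage: python summarize_scores.py scores/llava_AD_scores.jsonl
        print(f"mean score: {mean_score(sys.argv[1]):.3f}")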