From 54a2039af1e06f872165f36be67b7a7d35271eac Mon Sep 17 00:00:00 2001
From: RashmikaReddy
Date: Fri, 15 Dec 2023 09:00:23 -0800
Subject: [PATCH 01/13] Pushing changes made for adding metrics

---
 azureml/conda.yml                |  1 +
 docs/requirements.txt            |  1 +
 src/autora/doc/pipelines/main.py | 31 +++++++++++++++++++++++++++++++
 tests/test_main.py               | 13 ++++++++++++-
 4 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/azureml/conda.yml b/azureml/conda.yml
index f772397..69674c7 100644
--- a/azureml/conda.yml
+++ b/azureml/conda.yml
@@ -14,5 +14,6 @@ dependencies:
   - transformers>=4.35.2
   - xformers
   - scipy
+  - nltk
   # This works, while installing from pytorch and cuda from conda does not
   - torch==2.0.1
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 62972b9..25ac169 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -7,3 +7,4 @@ jupytext
 jupyter
 matplotlib
 numpy
+nltk
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 5afc6bf..65e7018 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -1,7 +1,10 @@
 import itertools
 import logging
+import nltk
 from timeit import default_timer as timer
 from typing import List
+from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+from nltk.translate.meteor_score import single_meteor_score
 
 import torch
 import typer
@@ -15,6 +18,25 @@
     format="%(asctime)s %(levelname)s %(module)s.%(funcName)s(): %(message)s",
 )
 logger = logging.getLogger(__name__)
+nltk.download('wordnet')
+
+def evaluate_documentation(predictions, references):
+    # Tokenize predictions and references
+    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
+    tokenized_references = [[ref.split()] for ref in references]
+
+    # Calculate BLEU score
+    bleu = corpus_bleu(tokenized_references, tokenized_predictions,
+                       smoothing_function=SmoothingFunction().method1)
+
+    # Calculate METEOR scores
+    meteor_scores = [single_meteor_score(ref[0], tokenized_pred)
+                     for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)]
+    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+
+    return (bleu, meteor)
+
+
 
 
 @app.command(help="Evaluate model on a data file")
@@ -55,6 +77,11 @@ def eval(
     pred = Predictor(model_path)
     timer_start = timer()
     predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
+    print(predictions)
+    print("len of predictions ", len(predictions))
+    print("len of predictions index 0", len(predictions[0]))
+
+    bleu, meteor = evaluate_documentation(predictions, labels)
     timer_end = timer()
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
@@ -63,6 +90,8 @@ def eval(
         mlflow.log_text(inputs[i], f"input_{i}.py")
         for j in range(len(predictions[i])):
             mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
+    mlflow.log_text("bleu_score is ", str(bleu))
+    mlflow.log_text("meteor_score is ", str(meteor))
 
     # flatten predictions for counting tokens
     predictions_flat = list(itertools.chain.from_iterable(predictions))
@@ -70,6 +99,8 @@ def eval(
     total_tokens = sum([len(token) for token in tokens])
     mlflow.log_metric("total_tokens", total_tokens)
     mlflow.log_metric("tokens/sec", total_tokens / pred_time)
+    mlflow.log_metric("bleu_score", round(bleu,5))
+    mlflow.log_metric("meteor_score", round(meteor,5))
     return predictions
 
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 097e8c7..2b0d1e3 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,6 +1,7 @@
+import jsonlines
 from pathlib import Path
 
-from autora.doc.pipelines.main import eval, generate
+from autora.doc.pipelines.main import eval, generate, evaluate_documentation
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
 
 # dummy HF model for testing
@@ -14,6 +15,16 @@ def test_predict() -> None:
     for output in outputs:
         assert len(output[0]) > 0, "Expected non-empty output"
 
+def test_evaluation():
+    # Test Case: Valid Scores in the range of 0 and 1
+    data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
+    with jsonlines.open(data) as reader:
+        items = [item for item in reader]
+        labels = [item["output"] for item in items]
+
+    bleu, meteor = evaluate_documentation(labels, labels)
+    assert bleu >= 0 and bleu <= 1, "BLEU score should be between 0 and 1"
+    assert meteor >= 0 and meteor <= 1, "METEOR score should be between 0 and 1"
 
 def test_generate() -> None:
     python_file = __file__

From 0bb0aaf9d9f0b17e017f9b4d59e4ab77ac0101a0 Mon Sep 17 00:00:00 2001
From: RashmikaReddy
Date: Fri, 15 Dec 2023 09:03:00 -0800
Subject: [PATCH 02/13] updating main.py

---
 src/autora/doc/pipelines/main.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 65e7018..2c077b2 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -77,10 +77,6 @@ def eval(
     pred = Predictor(model_path)
     timer_start = timer()
     predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
-    print(predictions)
-    print("len of predictions ", len(predictions))
-    print("len of predictions index 0", len(predictions[0]))
-
     bleu, meteor = evaluate_documentation(predictions, labels)
     timer_end = timer()
     pred_time = timer_end - timer_start

From 4c5e472636557c9a1c1746f64e80eac50dc78f91 Mon Sep 17 00:00:00 2001
From: Rashmika Reddy Vookanti
Date: Fri, 15 Dec 2023 10:08:46 -0800
Subject: [PATCH 03/13] Update main.py

---
 src/autora/doc/pipelines/main.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 2c077b2..984b81a 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -1,13 +1,14 @@
 import itertools
 import logging
-import nltk
+
 from timeit import default_timer as timer
 from typing import List
-from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
-from nltk.translate.meteor_score import single_meteor_score
 
+import nltk
 import torch
 import typer
+from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+from nltk.translate.meteor_score import single_meteor_score
 
 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
@@ -20,7 +21,7 @@
 logger = logging.getLogger(__name__)
 nltk.download('wordnet')
 
-def evaluate_documentation(predictions, references):
+def evaluate_documentation(predictions, references) -> None:
     # Tokenize predictions and references
     tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
     tokenized_references = [[ref.split()] for ref in references]

From 1f7b43e89288c74986d86a995dd5e63154535720 Mon Sep 17 00:00:00 2001
From: Rashmika Reddy Vookanti
Date: Fri, 15 Dec 2023 10:13:05 -0800
Subject: [PATCH 04/13] Update test_main.py

---
 tests/test_main.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_main.py b/tests/test_main.py
index 2b0d1e3..85ba730 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,6 +1,7 @@
-import jsonlines
 from pathlib import Path
 
+import jsonlines
+
 from autora.doc.pipelines.main import eval, generate, evaluate_documentation
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
 
@@ -15,7 +16,7 @@ def test_predict() -> None:
     for output in outputs:
         assert len(output[0]) > 0, "Expected non-empty output"
 
-def test_evaluation():
+def test_evaluation() -> None:
     # Test Case: Valid Scores in the range of 0 and 1
     data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
     with jsonlines.open(data) as reader:

From 1811daeab687be89220ab99128679d4e65635b7c Mon Sep 17 00:00:00 2001
From: RashmikaReddy
Date: Fri, 15 Dec 2023 12:07:11 -0800
Subject: [PATCH 05/13] formatting changes for evaluation metrics

---
 .mypy.ini                        |  3 +++
 .pre-commit-config.yaml          |  2 +-
 src/autora/doc/pipelines/main.py | 30 ++++++++++++++++--------------
 tests/test_main.py               | 15 +++++++++------
 4 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/.mypy.ini b/.mypy.ini
index b2565b1..12e730a 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -7,4 +7,7 @@ explicit_package_bases = True
 ignore_missing_imports = True
 
 [mypy-mlflow.*]
+ignore_missing_imports = True
+
+[mypy-nltk.*]
 ignore_missing_imports = True
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 029a6e6..b225f7b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -66,7 +66,7 @@ repos:
         # supported by your project here, or alternatively use
         # pre-commit's default_language_version, see
         # https://pre-commit.com/#top_level-default_language_version
-        language_version: python3.10
+        language_version: python3
 
 
 
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 2c077b2..2087fb3 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -1,13 +1,13 @@
 import itertools
 import logging
-import nltk
 from timeit import default_timer as timer
-from typing import List
-from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
-from nltk.translate.meteor_score import single_meteor_score
+from typing import List, Tuple
 
+import nltk
 import torch
 import typer
+from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
+from nltk.translate.meteor_score import single_meteor_score
 
 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
@@ -18,27 +18,29 @@
     format="%(asctime)s %(levelname)s %(module)s.%(funcName)s(): %(message)s",
 )
 logger = logging.getLogger(__name__)
-nltk.download('wordnet')
+nltk.download("wordnet")
 
-def evaluate_documentation(predictions, references):
+
+def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
     # Tokenize predictions and references
     tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
     tokenized_references = [[ref.split()] for ref in references]
 
     # Calculate BLEU score
-    bleu = corpus_bleu(tokenized_references, tokenized_predictions,
-                       smoothing_function=SmoothingFunction().method1)
+    bleu = corpus_bleu(
+        tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1
+    )
 
     # Calculate METEOR scores
-    meteor_scores = [single_meteor_score(ref[0], tokenized_pred)
-                     for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)]
+    meteor_scores = [
+        single_meteor_score(ref[0], tokenized_pred)
+        for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
+    ]
     meteor = sum(meteor_scores) / len(predictions) if predictions else 0
 
     return (bleu, meteor)
-
-
 
 
 @app.command(help="Evaluate model on a data file")
 def eval(
     data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
diff --git a/tests/test_main.py b/tests/test_main.py
index 2b0d1e3..02aee63 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,7 +1,8 @@
-import jsonlines
 from pathlib import Path
 
-from autora.doc.pipelines.main import eval, generate, evaluate_documentation
+import jsonlines
+
+from autora.doc.pipelines.main import eval, evaluate_documentation, generate
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
 
 # dummy HF model for testing
@@ -15,17 +16,19 @@ def test_predict() -> None:
     for output in outputs:
         assert len(output[0]) > 0, "Expected non-empty output"
 
-def test_evaluation():
+
+def test_evaluation() -> None:
     # Test Case: Valid Scores in the range of 0 and 1
     data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
     with jsonlines.open(data) as reader:
-        items = [item for item in reader]
-        labels = [item["output"] for item in items]
-
+        items = [item for item in reader]
+        labels = [item["output"] for item in items]
+
     bleu, meteor = evaluate_documentation(labels, labels)
     assert bleu >= 0 and bleu <= 1, "BLEU score should be between 0 and 1"
     assert meteor >= 0 and meteor <= 1, "METEOR score should be between 0 and 1"
 
+
 def test_generate() -> None:
     python_file = __file__
     output = Path("output.txt")

From 368e73c3f49cc2c15cd83d338d340744f49c5e4d Mon Sep 17 00:00:00 2001
From: RashmikaReddy
Date: Fri, 15 Dec 2023 14:09:50 -0800
Subject: [PATCH 06/13] adding dependencies in pyproject.toml

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 422c8ff..3f97b29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
   # This works, while installing from pytorch and cuda from conda does not",
   "torch==2.0.1",
   "transformers>=4.35.2",
+  "nltk",
 ]
 
 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)

From 2376a6d7c3bca0874e4b5cce32092d00d04a35d3 Mon Sep 17 00:00:00 2001
From: RashmikaReddy
Date: Fri, 12 Jan 2024 09:08:40 -0800
Subject: [PATCH 07/13] Modified the test cases

---
 docs/requirements.txt            |  3 +--
 src/autora/doc/pipelines/main.py | 11 +++++++----
 tests/test_main.py               | 10 ++++++----
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 25ac169..2b5c37d 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -6,5 +6,4 @@ ipython
 jupytext
 jupyter
 matplotlib
-numpy
-nltk
+numpy
\ No newline at end of file
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 2087fb3..9ffa311 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -18,15 +18,17 @@
     format="%(asctime)s %(levelname)s %(module)s.%(funcName)s(): %(message)s",
 )
 logger = logging.getLogger(__name__)
-nltk.download("wordnet")
 
 
 def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
-    # Tokenize predictions and references
-    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
+    nltk.download("wordnet")
+
+    # Tokenize references
     tokenized_references = [[ref.split()] for ref in references]
+    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
 
-    # Calculate BLEU score
+    # Calculate BLEU score with smoothing function
+    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
     bleu = corpus_bleu(
         tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1
     )
@@ -80,6 +82,7 @@ def eval(
     timer_start = timer()
     predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
     bleu, meteor = evaluate_documentation(predictions, labels)
+
     timer_end = timer()
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
diff --git a/tests/test_main.py b/tests/test_main.py
index 02aee63..f92acf9 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 import jsonlines
+import pytest
 
 from autora.doc.pipelines.main import eval, evaluate_documentation, generate
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
@@ -18,15 +19,16 @@ def test_predict() -> None:
 
 
 def test_evaluation() -> None:
-    # Test Case: Valid Scores in the range of 0 and 1
+    # Test Case: Meteor and Bleu scores are close to 1
     data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
     with jsonlines.open(data) as reader:
         items = [item for item in reader]
         labels = [item["output"] for item in items]
+        predictions = [[item["output"]] for item in items]
 
-    bleu, meteor = evaluate_documentation(labels, labels)
-    assert bleu >= 0 and bleu <= 1, "BLEU score should be between 0 and 1"
-    assert meteor >= 0 and meteor <= 1, "METEOR score should be between 0 and 1"
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
+    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
 
 
 def test_generate() -> None:

From 413172d9548932d601320ee6c1daf98ab2e3ad84 Mon Sep 17 00:00:00 2001
From: RashmikaReddy
Date: Fri, 12 Jan 2024 11:12:34 -0800
Subject: [PATCH 08/13] Added test cases

---
 src/autora/doc/pipelines/main.py |  3 +++
 tests/test_main.py               | 36 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 9ffa311..4c711fc 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -24,7 +24,9 @@ def evaluate_documentation(predictions: List[List[str]], references: List[str])
     nltk.download("wordnet")
 
     # Tokenize references
+    # To calculate corpus_bleu, we need the references to be in a list[list].
     tokenized_references = [[ref.split()] for ref in references]
+    # Currently there is only 1 prediction for 1 reference, need to avg in future
     tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
 
     # Calculate BLEU score with smoothing function
@@ -34,6 +36,7 @@ def evaluate_documentation(predictions: List[List[str]], references: List[str])
     )
 
     # Calculate METEOR scores
+    # As we have list[list], we take ref[0] to calculate meteor score.
     meteor_scores = [
         single_meteor_score(ref[0], tokenized_pred)
         for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
diff --git a/tests/test_main.py b/tests/test_main.py
index f92acf9..534d714 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -31,6 +31,42 @@ def test_evaluation() -> None:
     assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
 
 
+def test_extra_token_in_prediction() -> None:
+    # Test Case bleu score should be less due to brevity penalty and meteor is robust to small mistakes
+    labels = ["this is a test"]
+    predictions = [["this is a test extra"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
+
+
+def test_missing_token_in_prediction() -> None:
+    # bleu score is less, meteor is higher
+    labels = ["this is a test"]
+    predictions = [["this is a"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
+    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
+
+
+def test_completely_different_tokens() -> None:
+    # both scores are less, as no common tokens
+    labels = ["this is a test"]
+    predictions = [["completely different sentence"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu <= 0.1, f"BLEU Score is {bleu}"
+    assert meteor <= 0.1, f"METEOR Score is {meteor}"
+
+
+def test_partially_matching_tokens() -> None:
+    # As ngrams arent matching because of extra token within, BLEU score is very less. Meteor gives a good score only.
+    labels = ["this is a test"]
+    predictions = [["this is a different test"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
+
+
 def test_generate() -> None:
     python_file = __file__
     output = Path("output.txt")

From 8d5c75e95ae8d912a285bea7a3005c46b0cc8d7d Mon Sep 17 00:00:00 2001
From: Rashmika Reddy Vookanti
Date: Wed, 17 Jan 2024 22:22:50 -0800
Subject: [PATCH 09/13] Made the suggested changes

---
 src/autora/doc/pipelines/main.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 4c711fc..32b01db 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -24,22 +24,23 @@ def evaluate_documentation(predictions: List[List[str]], references: List[str])
     nltk.download("wordnet")
 
     # Tokenize references
-    # To calculate corpus_bleu, we need the references to be in a list[list].
-    tokenized_references = [[ref.split()] for ref in references]
+    tokenized_references = [ref.split() for ref in references]
     # Currently there is only 1 prediction for 1 reference, need to avg in future
     tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
 
     # Calculate BLEU score with smoothing function
     # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
     bleu = corpus_bleu(
-        tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1
+        # Wrap each reference list in another list
+        [[tokenized_ref] for tokenized_ref in tokenized_references],
+        tokenized_predictions,
+        smoothing_function=SmoothingFunction().method1,
     )
 
     # Calculate METEOR scores
-    # As we have list[list], we take ref[0] to calculate meteor score.
     meteor_scores = [
-        single_meteor_score(ref[0], tokenized_pred)
-        for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
+        single_meteor_score(tokenized_ref, tokenized_pred)
+        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
     ]
     meteor = sum(meteor_scores) / len(predictions) if predictions else 0
 

From e5657b731034407c650a9d6ebc5076ed245aac3f Mon Sep 17 00:00:00 2001
From: Rashmika Reddy Vookanti
Date: Wed, 17 Jan 2024 22:59:45 -0800
Subject: [PATCH 10/13] Updating test_main.py with changes related to main

---
 tests/test_main.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_main.py b/tests/test_main.py
index ab912c3..b325c53 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,11 +1,11 @@
 from pathlib import Path
 
-import jsonlines
-import pytest
-
 from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
 
+import jsonlines
+import pytest
+
 # dummy HF model for testing
 TEST_HF_MODEL = "hf-internal-testing/tiny-random-FalconForCausalLM"
 
@@ -20,7 +20,7 @@ def test_predict() -> None:
 
 def test_evaluation() -> None:
     # Test Case: Meteor and Bleu scores are close to 1
-    data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
+    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
     with jsonlines.open(data) as reader:
         items = [item for item in reader]
         labels = [item["output"] for item in items]

From 3e7e5e854e7401770c15fb53fe57a68f46ab15c6 Mon Sep 17 00:00:00 2001
From: Rashmika Reddy Vookanti
Date: Wed, 17 Jan 2024 23:18:09 -0800
Subject: [PATCH 11/13] Update requirements.txt

---
 docs/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 2b5c37d..62972b9 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -6,4 +6,4 @@ ipython
 jupytext
 jupyter
 matplotlib
-numpy
\ No newline at end of file
+numpy

From 3df0c8d66efab8a2c1a8465d0a89879490804d66 Mon Sep 17 00:00:00 2001
From: Rashmika Reddy Vookanti
Date: Wed, 17 Jan 2024 23:18:38 -0800
Subject: [PATCH 12/13] Update .mypy.ini

---
 .mypy.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.mypy.ini b/.mypy.ini
index 12e730a..6e719c2 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -10,4 +10,4 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 
 [mypy-nltk.*]
-ignore_missing_imports = True
\ No newline at end of file
+ignore_missing_imports = True

From 9fcd8ec9bfff87ddd8527a0ab68bfb58761a9227 Mon Sep 17 00:00:00 2001
From: Rashmika Reddy Vookanti
Date: Thu, 18 Jan 2024 11:33:14 -0800
Subject: [PATCH 13/13] Updated test_main.py

---
 tests/test_main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_main.py b/tests/test_main.py
index b325c53..a1eed5f 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,11 +1,11 @@
 from pathlib import Path
 
-from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data
-from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
-
 import jsonlines
 import pytest
 
+from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data
+from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
+
 # dummy HF model for testing
 TEST_HF_MODEL = "hf-internal-testing/tiny-random-FalconForCausalLM"
 