Add metrics calculations to the inference pipeline #23

Merged (16 commits) on Jan 19, 2024
Changes from 7 commits
3 changes: 3 additions & 0 deletions .mypy.ini
@@ -7,4 +7,7 @@ explicit_package_bases = True
ignore_missing_imports = True

[mypy-mlflow.*]
ignore_missing_imports = True

[mypy-nltk.*]
ignore_missing_imports = True
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -66,7 +66,7 @@ repos:
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.10
language_version: python3



1 change: 1 addition & 0 deletions azureml/conda.yml
@@ -14,5 +14,6 @@ dependencies:
- transformers>=4.35.2
- xformers
- scipy
- nltk
# This works, while installing from pytorch and cuda from conda does not
- torch==2.0.1
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -7,3 +7,4 @@ jupytext
jupyter
matplotlib
numpy
nltk
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
# This works, while installing from pytorch and cuda from conda does not",
"torch==2.0.1",
"transformers>=4.35.2",
"nltk",
]

# On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
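nltk is added to the conda environment, docs requirements, and project dependencies above. A minimal sanity check (not part of this PR) that the package and the WordNet corpus used by METEOR are available could look like this:

```python
# Sanity check: nltk is importable and the WordNet corpus (used by METEOR) is present.
import nltk

nltk.download("wordnet", quiet=True)  # no-op if the corpus is already downloaded

from nltk.corpus import wordnet

assert len(wordnet.synsets("documentation")) > 0  # WordNet lookups work
```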
31 changes: 30 additions & 1 deletion src/autora/doc/pipelines/main.py
@@ -1,10 +1,13 @@
import itertools
import logging
from timeit import default_timer as timer
from typing import List
from typing import List, Tuple

import nltk
import torch
import typer
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
@@ -15,6 +18,27 @@
format="%(asctime)s %(levelname)s %(module)s.%(funcName)s(): %(message)s",
)
logger = logging.getLogger(__name__)
nltk.download("wordnet")


def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
    # Tokenize predictions and references
    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
    tokenized_references = [[ref.split()] for ref in references]

    # Calculate BLEU score
    bleu = corpus_bleu(
        tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1
    )

    # Calculate METEOR scores
    meteor_scores = [
        single_meteor_score(ref[0], tokenized_pred)
        for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
    ]
    meteor = sum(meteor_scores) / len(predictions) if predictions else 0

    return (bleu, meteor)


@app.command(help="Evaluate model on a data file")
@@ -55,6 +79,7 @@ def eval(
pred = Predictor(model_path)
timer_start = timer()
predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
bleu, meteor = evaluate_documentation(predictions, labels)
timer_end = timer()
pred_time = timer_end - timer_start
mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
@@ -63,13 +88,17 @@
mlflow.log_text(inputs[i], f"input_{i}.py")
for j in range(len(predictions[i])):
mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
mlflow.log_text("bleu_score is ", str(bleu))
mlflow.log_text("meteor_score is ", str(meteor))

# flatten predictions for counting tokens
predictions_flat = list(itertools.chain.from_iterable(predictions))
tokens = pred.tokenize(predictions_flat)["input_ids"]
total_tokens = sum([len(token) for token in tokens])
mlflow.log_metric("total_tokens", total_tokens)
mlflow.log_metric("tokens/sec", total_tokens / pred_time)
mlflow.log_metric("bleu_score", round(bleu, 5))
mlflow.log_metric("meteor_score", round(meteor, 5))
return predictions


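For reference, here is a standalone sketch of the BLEU/METEOR computation that the new evaluate_documentation function performs, run on made-up strings (the example text and printed values are illustrative, not from the PR):

```python
# Toy example of the BLEU/METEOR computation added in this PR (illustrative strings only).
import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

nltk.download("wordnet")  # METEOR relies on WordNet

references = ["The experiment varies stimulus intensity and records response time."]
predictions = [["The experiment varies stimulus intensity and logs response time."]]

# Same tokenization as evaluate_documentation: whitespace split, first candidate per input.
tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
tokenized_references = [[ref.split()] for ref in references]

bleu = corpus_bleu(
    tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1
)
meteor = sum(
    single_meteor_score(ref[0], pred)
    for ref, pred in zip(tokenized_references, tokenized_predictions)
) / len(predictions)

print(f"BLEU: {bleu:.3f}  METEOR: {meteor:.3f}")  # both scores fall in [0, 1]; higher is better
```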
16 changes: 15 additions & 1 deletion tests/test_main.py
@@ -1,6 +1,8 @@
from pathlib import Path

from autora.doc.pipelines.main import eval, generate
import jsonlines

from autora.doc.pipelines.main import eval, evaluate_documentation, generate
from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts

# dummy HF model for testing
@@ -15,6 +17,18 @@ def test_predict() -> None:
assert len(output[0]) > 0, "Expected non-empty output"


def test_evaluation() -> None:
    # Test Case: Valid Scores in the range of 0 and 1
    data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
    with jsonlines.open(data) as reader:
        items = [item for item in reader]
        labels = [item["output"] for item in items]

    bleu, meteor = evaluate_documentation(labels, labels)
    assert bleu >= 0 and bleu <= 1, "BLEU score should be between 0 and 1"
Collaborator:

Are bleu, meteor==1 when label==prediction?

It would actually be a bit clearer if you hard-code some examples and assert specific values, e.g. all tokens match, extra token in the prediction, missing token in the prediction, etc.

Collaborator (Author):

Added more test cases, and the code works as expected.

(A sketch of such hard-coded cases appears below, after the diff.)
    assert meteor >= 0 and meteor <= 1, "METEOR score should be between 0 and 1"


def test_generate() -> None:
python_file = __file__
output = Path("output.txt")
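In the spirit of the reviewer's suggestion above, a sketch of hard-coded test cases with specific expectations; the example strings and thresholds are assumptions, not part of the merged PR:

```python
# Illustrative hard-coded cases (not from the PR): identical text vs. one extra token.
from autora.doc.pipelines.main import evaluate_documentation


def test_identical_prediction_scores_high() -> None:
    labels = ["this function prints the sum of two numbers"]
    predictions = [[labels[0]]]  # prediction identical to the reference
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert bleu > 0.99, "identical text should give a maximal BLEU score"
    assert meteor > 0.99, "identical text should give a near-maximal METEOR score"


def test_extra_token_lowers_scores() -> None:
    labels = ["this function prints the sum of two numbers"]
    predictions = [["this function prints the sum of two integer numbers"]]  # one extra token
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert 0 < bleu < 1 and 0 < meteor < 1, "a small mismatch should lower, not zero out, the scores"
```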