
Commit

Added test cases
RashmikaReddy committed Jan 12, 2024
1 parent 2376a6d commit 413172d
Showing 2 changed files with 39 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/autora/doc/pipelines/main.py
@@ -24,7 +24,9 @@ def evaluate_documentation(predictions: List[List[str]], references: List[str])
nltk.download("wordnet")

# Tokenize references
# corpus_bleu expects each reference wrapped in its own list (it supports multiple references per prediction).
tokenized_references = [[ref.split()] for ref in references]
# There is currently only one prediction per reference; averaging over multiple predictions can be added later.
tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]

# Calculate BLEU score with smoothing function
@@ -34,6 +36,7 @@ def evaluate_documentation(predictions: List[List[str]], references: List[str])
)

# Calculate METEOR scores
# Since each reference is wrapped in a list, take ref[0] when computing the METEOR score.
meteor_scores = [
single_meteor_score(ref[0], tokenized_pred)
for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
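
For context, here is a minimal, self-contained sketch of the metric computation these hunks touch, assuming nltk's corpus_bleu, SmoothingFunction, and single_meteor_score; the specific smoothing method and the final METEOR aggregation are assumptions, since the diff elides those lines:

from typing import List, Tuple

import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score


def evaluate_documentation_sketch(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
    nltk.download("wordnet", quiet=True)  # METEOR relies on WordNet for stem/synonym matching

    # corpus_bleu expects each reference wrapped in its own list (multiple references per prediction)
    tokenized_references = [[ref.split()] for ref in references]
    # Only the first prediction per reference is scored
    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]

    bleu = corpus_bleu(
        tokenized_references,
        tokenized_predictions,
        smoothing_function=SmoothingFunction().method1,  # assumed smoothing choice
    )

    # single_meteor_score takes a pre-tokenized reference and hypothesis
    meteor_scores = [
        single_meteor_score(ref[0], pred)
        for ref, pred in zip(tokenized_references, tokenized_predictions)
    ]
    meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

    return bleu, meteor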
36 changes: 36 additions & 0 deletions tests/test_main.py
@@ -31,6 +31,42 @@ def test_evaluation() -> None:
assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"


def test_extra_token_in_prediction() -> None:
# The extra token lowers the n-gram precision, so BLEU drops; METEOR is robust to small mistakes.
labels = ["this is a test"]
predictions = [["this is a test extra"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"


def test_missing_token_in_prediction() -> None:
# The missing token triggers BLEU's brevity penalty, so BLEU drops; METEOR stays moderately high.
labels = ["this is a test"]
predictions = [["this is a"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"


def test_completely_different_tokens() -> None:
# With no tokens in common, both scores should be close to zero.
labels = ["this is a test"]
predictions = [["completely different sentence"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert bleu <= 0.1, f"BLEU Score is {bleu}"
assert meteor <= 0.1, f"METEOR Score is {meteor}"


def test_partially_matching_tokens() -> None:
# The inserted token breaks the higher-order n-gram matches, so BLEU drops sharply; METEOR still rewards the unigram matches.
labels = ["this is a test"]
predictions = [["this is a different test"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"


def test_generate() -> None:
python_file = __file__
output = Path("output.txt")
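
The asserted ranges can be sanity-checked by hand. For the missing-token case, BLEU's brevity penalty is exp(1 - r/c) with reference length r = 4 and candidate length c = 3, roughly 0.72, which is an upper bound on the score before smoothing of the missing 4-gram pulls it lower. A quick manual check of one case, using the sketch above (or the real evaluate_documentation); exact values depend on the nltk version and smoothing choice, so only rough magnitudes are assumed:

# Hypothetical manual check; assumes evaluate_documentation_sketch (above) is importable.
labels = ["this is a test"]
predictions = [["this is a"]]
bleu, meteor = evaluate_documentation_sketch(predictions, labels)
print(f"BLEU={bleu:.3f}  METEOR={meteor:.3f}")  # expect BLEU well below 0.72, METEOR moderately high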
