
Commit

Added test cases
RashmikaReddy committed Jan 12, 2024
1 parent 2376a6d commit 413172d
Showing 2 changed files with 39 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/autora/doc/pipelines/main.py
@@ -24,7 +24,9 @@ def evaluate_documentation(predictions: List[List[str]], references: List[str])
nltk.download("wordnet")

# Tokenize references
# corpus_bleu expects each reference wrapped in its own list (it supports multiple references per prediction).
tokenized_references = [[ref.split()] for ref in references]
# There is currently only one prediction per reference; averaging over multiple predictions can be added later.
tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]

# Calculate BLEU score with smoothing function
@@ -34,6 +36,7 @@ def evaluate_documentation(predictions: List[List[str]], references: List[str])
)

# Calculate METEOR scores
# Since each reference is wrapped in a list, take ref[0] when computing the METEOR score.
meteor_scores = [
single_meteor_score(ref[0], tokenized_pred)
for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
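
For context, here is a minimal, self-contained sketch of the metric computation these hunks touch, assuming nltk's corpus_bleu, SmoothingFunction, and single_meteor_score; the specific smoothing method and the final METEOR aggregation are assumptions, since the diff elides those lines:

from typing import List, Tuple

import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score


def evaluate_documentation_sketch(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
    nltk.download("wordnet", quiet=True)  # METEOR relies on WordNet for stem/synonym matching

    # corpus_bleu expects each reference wrapped in its own list (multiple references per prediction)
    tokenized_references = [[ref.split()] for ref in references]
    # Only the first prediction per reference is scored
    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]

    bleu = corpus_bleu(
        tokenized_references,
        tokenized_predictions,
        smoothing_function=SmoothingFunction().method1,  # assumed smoothing choice
    )

    # single_meteor_score takes a pre-tokenized reference and hypothesis
    meteor_scores = [
        single_meteor_score(ref[0], pred)
        for ref, pred in zip(tokenized_references, tokenized_predictions)
    ]
    meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

    return bleu, meteor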
36 changes: 36 additions & 0 deletions tests/test_main.py
@@ -31,6 +31,42 @@ def test_evaluation() -> None:
assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"


def test_extra_token_in_prediction() -> None:
# The extra token lowers the n-gram precision, so BLEU drops; METEOR is robust to small mistakes.
labels = ["this is a test"]
predictions = [["this is a test extra"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"


def test_missing_token_in_prediction() -> None:
# The missing token triggers BLEU's brevity penalty, so BLEU drops; METEOR stays moderately high.
labels = ["this is a test"]
predictions = [["this is a"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"


def test_completely_different_tokens() -> None:
# With no tokens in common, both scores should be close to zero.
labels = ["this is a test"]
predictions = [["completely different sentence"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert bleu <= 0.1, f"BLEU Score is {bleu}"
assert meteor <= 0.1, f"METEOR Score is {meteor}"


def test_partially_matching_tokens() -> None:
# The inserted token breaks the higher-order n-gram matches, so BLEU drops sharply; METEOR still rewards the unigram matches.
labels = ["this is a test"]
predictions = [["this is a different test"]]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"


def test_generate() -> None:
python_file = __file__
output = Path("output.txt")
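
The asserted ranges can be sanity-checked by hand. For the missing-token case, BLEU's brevity penalty is exp(1 - r/c) with reference length r = 4 and candidate length c = 3, roughly 0.72, which is an upper bound on the score before smoothing of the missing 4-gram pulls it lower. A quick manual check of one case, using the sketch above (or the real evaluate_documentation); exact values depend on the nltk version and smoothing choice, so only rough magnitudes are assumed:

# Hypothetical manual check; assumes evaluate_documentation_sketch (above) is importable.
labels = ["this is a test"]
predictions = [["this is a"]]
bleu, meteor = evaluate_documentation_sketch(predictions, labels)
print(f"BLEU={bleu:.3f}  METEOR={meteor:.3f}")  # expect BLEU well below 0.72, METEOR moderately high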
