Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metrics calculations to the inference pipeline #23

Merged
merged 16 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ ignore_missing_imports = True

[mypy-mlflow.*]
ignore_missing_imports = True

[mypy-nltk.*]
ignore_missing_imports = True
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ repos:
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.10
language_version: python3



Expand Down
1 change: 1 addition & 0 deletions azureml/conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ dependencies:
- transformers>=4.35.2
- xformers
- scipy
- nltk
# This works, while installing pytorch and cuda from conda does not
- torch==2.0.1
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies = [
# This works, while installing pytorch and cuda from conda does not
"torch==2.0.1",
"transformers>=4.35.2",
"nltk",
]

# On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
Expand Down
38 changes: 37 additions & 1 deletion src/autora/doc/pipelines/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import itertools
import logging
from timeit import default_timer as timer
from typing import List
from typing import List, Tuple

import nltk
import torch
import typer
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
Expand All @@ -17,6 +20,33 @@
logger = logging.getLogger(__name__)


def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
    """Score generated documentation against reference texts with BLEU and METEOR.

    Texts are tokenized by splitting on whitespace.

    Args:
        predictions: One list of candidate texts per input. Only the first
            candidate of each list is scored; an empty candidate list is
            scored as an empty prediction.
        references: One reference text per input, aligned with ``predictions``.

    Returns:
        A ``(bleu, meteor)`` tuple: the corpus-level BLEU score and the mean of
        the per-item METEOR scores. ``(0.0, 0.0)`` when there is nothing to score.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    # Validate up front: without this, a mismatch is only caught by an assert
    # inside corpus_bleu, and zip() below would otherwise truncate silently.
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length, "
            f"got {len(predictions)} and {len(references)}"
        )

    # Nothing to score. Returning early also keeps corpus_bleu from being
    # called on an empty corpus (it appears to divide by a zero n-gram count).
    if not predictions:
        return (0.0, 0.0)

    # WordNet corpus is fetched before computing METEOR (NLTK's METEOR uses it).
    nltk.download("wordnet")

    # Tokenize references
    tokenized_references = [ref.split() for ref in references]
    # Currently there is only 1 prediction for 1 reference, need to avg in future
    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]

    # Calculate BLEU score with smoothing function
    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
    bleu = corpus_bleu(
        # corpus_bleu expects a list of references per hypothesis, so wrap each one
        [[tokenized_ref] for tokenized_ref in tokenized_references],
        tokenized_predictions,
        smoothing_function=SmoothingFunction().method1,
    )

    # Calculate METEOR scores, one per (reference, prediction) pair
    meteor_scores = [
        single_meteor_score(tokenized_ref, tokenized_pred)
        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
    ]
    meteor = sum(meteor_scores) / len(meteor_scores)

    # Normalize to float so the declared return type always holds.
    return (float(bleu), float(meteor))


@app.command(help="Evaluate model on a data file")
def eval(
data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
Expand Down Expand Up @@ -55,6 +85,8 @@ def eval(
pred = Predictor(model_path)
timer_start = timer()
predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
bleu, meteor = evaluate_documentation(predictions, labels)

timer_end = timer()
pred_time = timer_end - timer_start
mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
Expand All @@ -63,13 +95,17 @@ def eval(
mlflow.log_text(inputs[i], f"input_{i}.py")
for j in range(len(predictions[i])):
mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
mlflow.log_text("bleu_score is ", str(bleu))
mlflow.log_text("meteor_score is ", str(meteor))

# flatten predictions for counting tokens
predictions_flat = list(itertools.chain.from_iterable(predictions))
tokens = pred.tokenize(predictions_flat)["input_ids"]
total_tokens = sum([len(token) for token in tokens])
mlflow.log_metric("total_tokens", total_tokens)
mlflow.log_metric("tokens/sec", total_tokens / pred_time)
mlflow.log_metric("bleu_score", round(bleu, 5))
mlflow.log_metric("meteor_score", round(meteor, 5))
return predictions


Expand Down
54 changes: 53 additions & 1 deletion tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from pathlib import Path

from autora.doc.pipelines.main import eval, generate, import_data
import jsonlines
import pytest

from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data
from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts

# dummy HF model for testing
Expand All @@ -15,6 +18,55 @@ def test_predict() -> None:
assert len(output[0]) > 0, "Expected non-empty output"


def test_evaluation() -> None:
    # Scoring each reference against itself should give BLEU and METEOR close to 1
    data_path = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
    with jsonlines.open(data_path) as reader:
        outputs = [record["output"] for record in reader]

    # Each prediction is a single-candidate list holding the reference text itself
    bleu, meteor = evaluate_documentation([[text] for text in outputs], outputs)

    assert bleu == pytest.approx(1, rel=0.01), f"BLEU Score is {bleu}"
    assert meteor == pytest.approx(1, rel=0.01), f"METEOR Score is {meteor}"


def test_extra_token_in_prediction() -> None:
    # One surplus trailing token: BLEU is expected to drop into 0.6-0.8,
    # while METEOR is expected to stay high (0.8-1)
    reference_texts = ["this is a test"]
    candidate_texts = [["this is a test extra"]]

    bleu, meteor = evaluate_documentation(candidate_texts, reference_texts)

    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"


def test_missing_token_in_prediction() -> None:
    # Prediction is missing the last reference token: BLEU is expected in
    # 0.4-0.6 and METEOR, being higher, in 0.6-0.8
    reference_texts = ["this is a test"]
    candidate_texts = [["this is a"]]

    bleu, meteor = evaluate_documentation(candidate_texts, reference_texts)

    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"


def test_completely_different_tokens() -> None:
    # No tokens in common between prediction and reference: both scores
    # are expected to be at most 0.1
    reference_texts = ["this is a test"]
    candidate_texts = [["completely different sentence"]]

    bleu, meteor = evaluate_documentation(candidate_texts, reference_texts)

    assert bleu <= 0.1, f"BLEU Score is {bleu}"
    assert meteor <= 0.1, f"METEOR Score is {meteor}"


def test_partially_matching_tokens() -> None:
    # An extra token inserted mid-sentence breaks the longer n-gram matches,
    # so BLEU is expected to be low (0.25-0.4) while METEOR is expected to
    # remain good (0.8-0.95)
    reference_texts = ["this is a test"]
    candidate_texts = [["this is a different test"]]

    bleu, meteor = evaluate_documentation(candidate_texts, reference_texts)

    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"


def test_generate() -> None:
python_file = __file__
output = Path("output.txt")
Expand Down