Add metrics calculations to the inference pipeline #23

Merged (16 commits) on Jan 19, 2024
Changes from 7 commits
3 changes: 3 additions & 0 deletions .mypy.ini
@@ -7,4 +7,7 @@ explicit_package_bases = True
ignore_missing_imports = True

[mypy-mlflow.*]
ignore_missing_imports = True

[mypy-nltk.*]
ignore_missing_imports = True
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -66,7 +66,7 @@ repos:
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.10
language_version: python3



1 change: 1 addition & 0 deletions azureml/conda.yml
@@ -14,5 +14,6 @@ dependencies:
- transformers>=4.35.2
- xformers
- scipy
- nltk
# This works, while installing from pytorch and cuda from conda does not
- torch==2.0.1
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -7,3 +7,4 @@ jupytext
jupyter
matplotlib
numpy
nltk
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
# This works, while installing from pytorch and cuda from conda does not",
"torch==2.0.1",
"transformers>=4.35.2",
"nltk",
]

# On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
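nltk is added to the conda environment, docs requirements, and project dependencies above. A minimal sanity check (not part of this PR) that the package and the WordNet corpus used by METEOR are available could look like this:

```python
# Sanity check: nltk is importable and the WordNet corpus (used by METEOR) is present.
import nltk

nltk.download("wordnet", quiet=True)  # no-op if the corpus is already downloaded

from nltk.corpus import wordnet

assert len(wordnet.synsets("documentation")) > 0  # WordNet lookups work
```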
31 changes: 30 additions & 1 deletion src/autora/doc/pipelines/main.py
@@ -1,10 +1,13 @@
import itertools
import logging
from timeit import default_timer as timer
from typing import List
from typing import List, Tuple

import nltk
import torch
import typer
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
@@ -15,6 +18,27 @@
format="%(asctime)s %(levelname)s %(module)s.%(funcName)s(): %(message)s",
)
logger = logging.getLogger(__name__)
nltk.download("wordnet")


def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
    # Tokenize predictions and references
    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
    tokenized_references = [[ref.split()] for ref in references]

    # Calculate BLEU score
    bleu = corpus_bleu(
        tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1
    )

    # Calculate METEOR scores
    meteor_scores = [
        single_meteor_score(ref[0], tokenized_pred)
        for ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
    ]
    meteor = sum(meteor_scores) / len(predictions) if predictions else 0

    return (bleu, meteor)


@app.command(help="Evaluate model on a data file")
@@ -55,6 +79,7 @@ def eval(
pred = Predictor(model_path)
timer_start = timer()
predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
bleu, meteor = evaluate_documentation(predictions, labels)
timer_end = timer()
pred_time = timer_end - timer_start
mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
@@ -63,13 +88,17 @@
mlflow.log_text(inputs[i], f"input_{i}.py")
for j in range(len(predictions[i])):
mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
mlflow.log_text("bleu_score is ", str(bleu))
mlflow.log_text("meteor_score is ", str(meteor))

# flatten predictions for counting tokens
predictions_flat = list(itertools.chain.from_iterable(predictions))
tokens = pred.tokenize(predictions_flat)["input_ids"]
total_tokens = sum([len(token) for token in tokens])
mlflow.log_metric("total_tokens", total_tokens)
mlflow.log_metric("tokens/sec", total_tokens / pred_time)
mlflow.log_metric("bleu_score", round(bleu, 5))
mlflow.log_metric("meteor_score", round(meteor, 5))
return predictions


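For reference, here is a standalone sketch of the BLEU/METEOR computation that the new evaluate_documentation function performs, run on made-up strings (the example text and printed values are illustrative, not from the PR):

```python
# Toy example of the BLEU/METEOR computation added in this PR (illustrative strings only).
import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

nltk.download("wordnet")  # METEOR relies on WordNet

references = ["The experiment varies stimulus intensity and records response time."]
predictions = [["The experiment varies stimulus intensity and logs response time."]]

# Same tokenization as evaluate_documentation: whitespace split, first candidate per input.
tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
tokenized_references = [[ref.split()] for ref in references]

bleu = corpus_bleu(
    tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1
)
meteor = sum(
    single_meteor_score(ref[0], pred)
    for ref, pred in zip(tokenized_references, tokenized_predictions)
) / len(predictions)

print(f"BLEU: {bleu:.3f}  METEOR: {meteor:.3f}")  # both scores fall in [0, 1]; higher is better
```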
16 changes: 15 additions & 1 deletion tests/test_main.py
@@ -1,6 +1,8 @@
from pathlib import Path

from autora.doc.pipelines.main import eval, generate
import jsonlines

from autora.doc.pipelines.main import eval, evaluate_documentation, generate
from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts

# dummy HF model for testing
@@ -15,6 +17,18 @@ def test_predict() -> None:
assert len(output[0]) > 0, "Expected non-empty output"


def test_evaluation() -> None:
    # Test Case: Valid Scores in the range of 0 and 1
    data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
    with jsonlines.open(data) as reader:
        items = [item for item in reader]
        labels = [item["output"] for item in items]

    bleu, meteor = evaluate_documentation(labels, labels)
    assert bleu >= 0 and bleu <= 1, "BLEU score should be between 0 and 1"
Collaborator:

Are bleu, meteor==1 when label==prediction?

It would actually be a bit clearer if you hard-code some examples and assert specific values, e.g. all tokens match, extra token in the prediction, missing token in the prediction, etc.

Collaborator (Author):

Added more test cases, and the code works as expected.

(A sketch of such hard-coded cases appears below, after the diff.)
    assert meteor >= 0 and meteor <= 1, "METEOR score should be between 0 and 1"


def test_generate() -> None:
python_file = __file__
output = Path("output.txt")
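In the spirit of the reviewer's suggestion above, a sketch of hard-coded test cases with specific expectations; the example strings and thresholds are assumptions, not part of the merged PR:

```python
# Illustrative hard-coded cases (not from the PR): identical text vs. one extra token.
from autora.doc.pipelines.main import evaluate_documentation


def test_identical_prediction_scores_high() -> None:
    labels = ["this function prints the sum of two numbers"]
    predictions = [[labels[0]]]  # prediction identical to the reference
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert bleu > 0.99, "identical text should give a maximal BLEU score"
    assert meteor > 0.99, "identical text should give a near-maximal METEOR score"


def test_extra_token_lowers_scores() -> None:
    labels = ["this function prints the sum of two numbers"]
    predictions = [["this function prints the sum of two integer numbers"]]  # one extra token
    bleu, meteor = evaluate_documentation(predictions, labels)
    assert 0 < bleu < 1 and 0 < meteor < 1, "a small mismatch should lower, not zero out, the scores"
```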