From d20c6d9f599068629e28839cbc8b77273974924a Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Tue, 30 Jan 2024 03:10:53 -0800 Subject: [PATCH 1/8] feat: created eval_on_prompts_file() to run and compare multiple prompts on single data file input --- data/autora/prompts/all_prompt.json | 14 ++++++++++++++ src/autora/doc/util.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 data/autora/prompts/all_prompt.json create mode 100644 src/autora/doc/util.py diff --git a/data/autora/prompts/all_prompt.json b/data/autora/prompts/all_prompt.json new file mode 100644 index 0000000..af19692 --- /dev/null +++ b/data/autora/prompts/all_prompt.json @@ -0,0 +1,14 @@ +[ + { + "SYS": "You are a technical documentation writer. You always write clear, concise, and accurate documentation for\nscientific experiments. Your documentation focuses on the experiment's purpose, procedure, and results. Therefore,\ndetails about specific python functions, packages, or libraries are not necessary. Your readers are experimental\nscientists.", + "INSTR": "Please generate high-level one or two paragraph documentation for the following experiment." + }, + { + "SYS": "You are a technical documentation writer. You always write clear, concise, and accurate documentation\nfor scientific experiments. Your documentation focuses on the experiment's procedure. Therefore, details about specific\npython functions, packages, or libraries are NOT necessary. Your readers are experimental scientists.\nFor writing your descriptions, follow these instructions:\n- DO NOT write greetings or preambles\n- Use the Variable 'name' attribute and not the python variable names\n- Use LaTeX for math expressions\n- DO NOT include code or code-like syntax and do not use python function or class names\n- Write in paragraph style, NOT bullet points", + "INSTR": "Generate a one line description of the dependent and independent variables used in the following\npython code: " + }, + { + "SYS": "You are a research scientist. You always write clear, concise, and accurate documentation\nfor scientific experiments from python code. Your documentation focuses on the experiment's procedure. Therefore, details about specific\npython functions, packages, or libraries are NOT necessary. 
Your readers are experimental scientists.\nFor writing your descriptions, follow these instructions:\n- DO NOT write greetings or preambles\n- Use the Variable 'name' attribute and not the python variable names\n- Use LaTeX for math expressions\n- DO NOT include code or code-like syntax and do not use python function or class names\n- Write in paragraph style, NOT bullet points", + "INSTR": "Generate a three line description of the dependent and independent variables used in the following\npython code: " + } +] diff --git a/src/autora/doc/util.py b/src/autora/doc/util.py new file mode 100644 index 0000000..3c83ad2 --- /dev/null +++ b/src/autora/doc/util.py @@ -0,0 +1,29 @@ +import json +from typing import Any, Dict, List, Tuple + +from autora.doc.runtime.prompts import PromptBuilder + + +def load_file(json_file_path: str) -> List[Dict[str, Any]]: + # Read and parse the JSON file + with open(json_file_path, "r") as file: + data: List[Dict[str, Any]] = json.load(file) + return data + + +def get_prompts_from_file(prompts_file: str) -> List[str]: + prompts_data = load_file(prompts_file) + prompts_list = [PromptBuilder(p["SYS"], p["INSTR"]).build() for p in prompts_data] + return prompts_list + + +def get_eval_result_from_prediction( + prediction: Tuple[List[str], float, float], prompt: str +) -> Dict[str, Any]: + eval_result = { + "prediction": prediction[0], + "bleu": prediction[1], + "meteor": prediction[2], + "prompt": prompt, + } + return eval_result From 905ae5f8554638dcafb780b87f62464657a9716c Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Tue, 30 Jan 2024 03:16:48 -0800 Subject: [PATCH 2/8] feat: created eval_on_prompts_file() to run and compare multiple prompts on single data file input --- src/autora/doc/pipelines/main.py | 41 +++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index 2c122c7..f9ae7c8 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -10,7 +10,8 @@ from nltk.translate.meteor_score import single_meteor_score from autora.doc.runtime.predict_hf import Predictor -from autora.doc.runtime.prompts import PROMPTS, PromptIds +from autora.doc.runtime.prompts import PROMPTS, PromptBuilder, PromptIds +from autora.doc.util import get_eval_result_from_prediction, get_prompts_from_file, load_file app = typer.Typer() logging.basicConfig( @@ -47,6 +48,44 @@ def evaluate_documentation(predictions: List[str], references: List[str]) -> Tup return (bleu, meteor) +@app.command(help="Evaluate a model for code-to-documentation generation for all prompts in the prompts_file") +def eval_on_prompts_file( + data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"), + model_path: str = typer.Option("meta-llama/Llama-2-7b-chat-hf", help="Path to HF model"), + prompts_file: str = typer.Argument(..., help="JSON file with a list of dictionary of prompts"), + param: List[str] = typer.Option( + [], help="Additional float parameters to pass to the model as name=float pairs" + ), +) -> List[Dict[str, str]]: + import mlflow + + results_list = [] + + mlflow.autolog() + param_dict = {pair[0]: float(pair[1]) for pair in [pair.split("=") for pair in param]} + run = mlflow.active_run() + + prompts_list = get_prompts_from_file(prompts_file) + + if run is None: + run = mlflow.start_run() + with run: + logger.info(f"Active run_id: {run.info.run_id}") + logger.info(f"running predict with {data_file}") + logger.info(f"model path: {model_path}") + 
mlflow.log_params(param_dict) + mlflow.log_param("model_path", model_path) + mlflow.log_param("data_file", data_file) + predictor = Predictor(model_path) + for i in range(len(prompts_list)): + logger.info(f"Starting to run model on prompt {i}: {prompts_list[i]}") + prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict) + logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}") + eval_result = get_eval_result_from_prediction(prediction_with_scores, prompts_list[i]) + results_list.append(eval_result) + return results_list + + @app.command(help="Evaluate model on a data file") def eval( data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"), From 588b85fd16c0d4aa6f756376f438ff7cd7e20276 Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Wed, 31 Jan 2024 03:30:31 -0800 Subject: [PATCH 3/8] refactor: update function name --- src/autora/doc/pipelines/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index f9ae7c8..5e49fb6 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -49,7 +49,7 @@ def evaluate_documentation(predictions: List[str], references: List[str]) -> Tup @app.command(help="Evaluate a model for code-to-documentation generation for all prompts in the prompts_file") -def eval_on_prompts_file( +def eval_prompts( data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"), model_path: str = typer.Option("meta-llama/Llama-2-7b-chat-hf", help="Path to HF model"), prompts_file: str = typer.Argument(..., help="JSON file with a list of dictionary of prompts"), @@ -76,9 +76,10 @@ def eval_on_prompts_file( mlflow.log_params(param_dict) mlflow.log_param("model_path", model_path) mlflow.log_param("data_file", data_file) + mlflow.log_param("prompts_file", prompts_file) predictor = Predictor(model_path) for i in range(len(prompts_list)): - logger.info(f"Starting to run model on prompt {i}: {prompts_list[i]}") + logger.info(f"Starting to run model on prompt {i}") prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict) logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}") eval_result = get_eval_result_from_prediction(prediction_with_scores, prompts_list[i]) From ec4dbbfb436631f15bc51e79c37efc3d9198d463 Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Wed, 31 Jan 2024 03:33:02 -0800 Subject: [PATCH 4/8] test: add test for multi-prompts prediction --- tests/test_main.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_main.py b/tests/test_main.py index 46a74f5..c5ad7e1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,9 +1,10 @@ from pathlib import Path +from typing import Dict, List import jsonlines import pytest -from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data +from autora.doc.pipelines.main import eval, eval_prompts, evaluate_documentation, generate, import_data from autora.doc.runtime.prompts import PromptIds # dummy HF model for testing @@ -84,3 +85,12 @@ def test_import(tmp_path: Path) -> None: import_data(str(code), str(text), str(data)) new_lines = data.read_text().splitlines() assert len(new_lines) == 1, "Expected one new line" + + +def test_eval_prompts() -> None: + data_file = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve() + prompts_file = 
Path(__file__).parent.joinpath("../data/autora/prompts/all_prompt.json").resolve() + outputs: List[Dict[str, str]] = eval_prompts(str(data_file), TEST_HF_MODEL, str(prompts_file), []) + assert len(outputs) == 3, "Expected 3 outputs" + for output in outputs: + assert len(output) > 0, "Expected non-empty output" From 3a48d6c553a2fbe2c6794daffccd8fd9a7c2e0ad Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Wed, 31 Jan 2024 03:44:58 -0800 Subject: [PATCH 5/8] test: add tests for utility function --- tests/test_util.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/test_util.py diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..f340c83 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,32 @@ +from pathlib import Path + +from autora.doc.util import get_eval_result_from_prediction, get_prompts_from_file, load_file + + +def test_load_file() -> None: + prompts_file_path = Path(__file__).parent.joinpath("../data/autora/prompts/all_prompt.json").resolve() + data = load_file(str(prompts_file_path)) + assert type(data) == list + + +def test_get_prompts_from_file() -> None: + prompts_file_path = Path(__file__).parent.joinpath("../data/autora/prompts/all_prompt.json").resolve() + prompts_list = get_prompts_from_file(str(prompts_file_path)) + + assert len(prompts_list) == 3, "Expected 3 outputs" + for prompt in prompts_list: + assert type(prompt) == str + + +def test_get_eval_result_from_prediction() -> None: + prediction = (["response1", "response2"], 0.8, 0.7) + prompt = "prompt1" + result = get_eval_result_from_prediction(prediction, prompt) + expected_result = { + "prediction": ["response1", "response2"], + "bleu": 0.8, + "meteor": 0.7, + "prompt": "prompt1", + } + assert type(result) == dict # Assert result is a dictionary + assert result == expected_result # Assert specific keys and values From 9d5dc0f08e0263d283b478675dbb70e29cef3130 Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Wed, 31 Jan 2024 14:28:56 -0800 Subject: [PATCH 6/8] refactor: change the return type for eval_prompts() --- src/autora/doc/classes/EvalResult.py | 21 +++++++++++++++++++++ src/autora/doc/pipelines/main.py | 14 ++++++++++---- src/autora/doc/util.py | 12 ------------ tests/test_main.py | 10 ++++++---- tests/test_util.py | 16 +--------------- 5 files changed, 38 insertions(+), 35 deletions(-) create mode 100644 src/autora/doc/classes/EvalResult.py diff --git a/src/autora/doc/classes/EvalResult.py b/src/autora/doc/classes/EvalResult.py new file mode 100644 index 0000000..2b50bd3 --- /dev/null +++ b/src/autora/doc/classes/EvalResult.py @@ -0,0 +1,21 @@ +from typing import List, Optional + + +class EvalResult: + def __init__( + self, + prediction: List[str], + prompt: str, + bleu_score: Optional[float] = None, + meteor_score: Optional[float] = None, + ): + self.prediction = prediction + self.prompt = prompt + self.bleu_score = bleu_score + self.meteor_score = meteor_score + + def __str__(self) -> str: + return ( + f"prediction: {self.prediction}, prompt: {self.prompt}," + f"bleu_score: {self.bleu_score}, meteor_score: {self.meteor_score} )" + ) diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index 5e49fb6..33d6504 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -9,9 +9,10 @@ from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu from nltk.translate.meteor_score import single_meteor_score +from autora.doc.classes.EvalResult import EvalResult from 
autora.doc.runtime.predict_hf import Predictor -from autora.doc.runtime.prompts import PROMPTS, PromptBuilder, PromptIds -from autora.doc.util import get_eval_result_from_prediction, get_prompts_from_file, load_file +from autora.doc.runtime.prompts import PROMPTS, PromptIds +from autora.doc.util import get_prompts_from_file app = typer.Typer() logging.basicConfig( @@ -56,7 +57,7 @@ def eval_prompts( param: List[str] = typer.Option( [], help="Additional float parameters to pass to the model as name=float pairs" ), -) -> List[Dict[str, str]]: +) -> List[EvalResult]: import mlflow results_list = [] @@ -82,7 +83,12 @@ def eval_prompts( logger.info(f"Starting to run model on prompt {i}") prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict) logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}") - eval_result = get_eval_result_from_prediction(prediction_with_scores, prompts_list[i]) + eval_result = EvalResult( + prediction_with_scores[0], + prompts_list[i], + prediction_with_scores[1], + prediction_with_scores[2], + ) results_list.append(eval_result) return results_list diff --git a/src/autora/doc/util.py b/src/autora/doc/util.py index 3c83ad2..6a63fa0 100644 --- a/src/autora/doc/util.py +++ b/src/autora/doc/util.py @@ -15,15 +15,3 @@ def get_prompts_from_file(prompts_file: str) -> List[str]: prompts_data = load_file(prompts_file) prompts_list = [PromptBuilder(p["SYS"], p["INSTR"]).build() for p in prompts_data] return prompts_list - - -def get_eval_result_from_prediction( - prediction: Tuple[List[str], float, float], prompt: str -) -> Dict[str, Any]: - eval_result = { - "prediction": prediction[0], - "bleu": prediction[1], - "meteor": prediction[2], - "prompt": prompt, - } - return eval_result diff --git a/tests/test_main.py b/tests/test_main.py index c5ad7e1..2437be1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -4,6 +4,7 @@ import jsonlines import pytest +from autora.doc.classes.EvalResult import EvalResult from autora.doc.pipelines.main import eval, eval_prompts, evaluate_documentation, generate, import_data from autora.doc.runtime.prompts import PromptIds @@ -90,7 +91,8 @@ def test_import(tmp_path: Path) -> None: def test_eval_prompts() -> None: data_file = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve() prompts_file = Path(__file__).parent.joinpath("../data/autora/prompts/all_prompt.json").resolve() - outputs: List[Dict[str, str]] = eval_prompts(str(data_file), TEST_HF_MODEL, str(prompts_file), []) - assert len(outputs) == 3, "Expected 3 outputs" - for output in outputs: - assert len(output) > 0, "Expected non-empty output" + results: List[EvalResult] = eval_prompts(str(data_file), TEST_HF_MODEL, str(prompts_file), []) + assert len(results) == 3, "Expected 3 outputs" + for result in results: + assert result.prediction is not None, "The prediction should not be None" + assert result.prompt is not None, "The prompt should not be None" diff --git a/tests/test_util.py b/tests/test_util.py index f340c83..42d6db2 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,6 @@ from pathlib import Path -from autora.doc.util import get_eval_result_from_prediction, get_prompts_from_file, load_file +from autora.doc.util import get_prompts_from_file, load_file def test_load_file() -> None: @@ -16,17 +16,3 @@ def test_get_prompts_from_file() -> None: assert len(prompts_list) == 3, "Expected 3 outputs" for prompt in prompts_list: assert type(prompt) == str - - -def test_get_eval_result_from_prediction() -> 
None: - prediction = (["response1", "response2"], 0.8, 0.7) - prompt = "prompt1" - result = get_eval_result_from_prediction(prediction, prompt) - expected_result = { - "prediction": ["response1", "response2"], - "bleu": 0.8, - "meteor": 0.7, - "prompt": "prompt1", - } - assert type(result) == dict # Assert result is a dictionary - assert result == expected_result # Assert specific keys and values From 667e77e0ba9370cb3f17be3838cb674378ab01cc Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Wed, 31 Jan 2024 15:27:53 -0800 Subject: [PATCH 7/8] refactor: use @dataclass annotation --- src/autora/doc/classes/EvalResult.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/src/autora/doc/classes/EvalResult.py b/src/autora/doc/classes/EvalResult.py index 2b50bd3..2cde094 100644 --- a/src/autora/doc/classes/EvalResult.py +++ b/src/autora/doc/classes/EvalResult.py @@ -1,21 +1,12 @@ +from dataclasses import dataclass from typing import List, Optional +@dataclass class EvalResult: - def __init__( - self, - prediction: List[str], - prompt: str, - bleu_score: Optional[float] = None, - meteor_score: Optional[float] = None, - ): - self.prediction = prediction - self.prompt = prompt - self.bleu_score = bleu_score - self.meteor_score = meteor_score + """Class for storing LLM evaluation results""" - def __str__(self) -> str: - return ( - f"prediction: {self.prediction}, prompt: {self.prompt}," - f"bleu_score: {self.bleu_score}, meteor_score: {self.meteor_score} )" - ) + prediction: List[str] + prompt: str + bleu_score: Optional[float] + meteor_score: Optional[float] From 1f8c14a3eae3cd1ad5027cdef9fb4e5666db2f54 Mon Sep 17 00:00:00 2001 From: Anuj Sinha Date: Wed, 31 Jan 2024 15:37:44 -0800 Subject: [PATCH 8/8] refactor: default bleu_score and meteor_Score as None --- src/autora/doc/classes/EvalResult.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autora/doc/classes/EvalResult.py b/src/autora/doc/classes/EvalResult.py index 2cde094..993a848 100644 --- a/src/autora/doc/classes/EvalResult.py +++ b/src/autora/doc/classes/EvalResult.py @@ -8,5 +8,5 @@ class EvalResult: prediction: List[str] prompt: str - bleu_score: Optional[float] - meteor_score: Optional[float] + bleu_score: Optional[float] = None + meteor_score: Optional[float] = None
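
Usage note (a minimal sketch, not part of the patch series): the call below mirrors tests/test_main.py and shows how the eval_prompts command introduced above can be driven programmatically. The data and prompts paths are the repository files referenced in the tests, and the model path is simply the command's documented default — substitute your own values as needed. The command also starts an MLflow run and logs model_path, data_file, and prompts_file as parameters.

from typing import List

from autora.doc.classes.EvalResult import EvalResult
from autora.doc.pipelines.main import eval_prompts

# Evaluate every prompt in the prompts file against the same JSONL data file.
results: List[EvalResult] = eval_prompts(
    "data/sweetpea/data.jsonl",             # JSONL data file to evaluate on
    "meta-llama/Llama-2-7b-chat-hf",        # HF model path (the command's default)
    "data/autora/prompts/all_prompt.json",  # JSON list of {"SYS", "INSTR"} prompt dicts
    [],                                     # optional "name=float" generation parameters
)

# Each EvalResult pairs one prompt with its predictions and BLEU/METEOR scores.
for result in results:
    print(result.prompt[:60], result.bleu_score, result.meteor_score)

Because eval_prompts is registered with @app.command, the same evaluation can also be run through the Typer CLI; the exact command-line syntax depends on how the app is exposed and is not prescribed by this series.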