diff --git a/notebooks/generate.ipynb b/notebooks/generate.ipynb index 89861fd..44de82f 100644 --- a/notebooks/generate.ipynb +++ b/notebooks/generate.ipynb @@ -8,8 +8,19 @@ "source": [ "%load_ext autoreload\n", "%autoreload 2\n", - "from autora.doc.runtime.predict_hf import Predictor\n", - "from autora.doc.runtime.prompts import PROMPTS, PromptIds" + "from autora.doc.runtime.predict_hf import Predictor, preprocess_code\n", + "from autora.doc.runtime.prompts import PROMPTS, PromptIds, PromptBuilder, SYS_GUIDES\n", + "from autora.doc.pipelines.main import evaluate_documentation\n", + "from autora.doc.pipelines.main import eval_prompt, load_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = \"meta-llama/Llama-2-7b-chat-hf\"" ] }, { @@ -18,11 +29,16 @@ "metadata": {}, "outputs": [], "source": [ - "# model = \"../../models\" # if model has been previously downloaded via huggingface-cli\n", - "model = \"meta-llama/Llama-2-7b-chat-hf\"\n", "pred = Predictor(model)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test generation for the variable declararion only" + ] + }, { "cell_type": "code", "execution_count": null, @@ -33,7 +49,8 @@ "iv = Variable(name=\"x\", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))\n", "dv = Variable(name=\"y\", type=ValueType.REAL)\n", "variables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])\n", - "\"\"\"" + "\"\"\"\n", + "LABEL = \"The discovery problem is defined by a single independent variable $x \\in [0, 2 \\pi]$ and dependent variable $y$.\"" ] }, { @@ -42,18 +59,46 @@ "metadata": {}, "outputs": [], "source": [ - "def test(promptid, code):\n", + "def test(promptid, code, label):\n", " output = pred.predict(\n", " PROMPTS[promptid],\n", " [code],\n", " do_sample=0,\n", - " max_length=800,\n", + " max_new_tokens=100,\n", " temperature=0.05,\n", " top_k=10,\n", " num_ret_seq=1,\n", - " )[0]\n", - " for i, o in enumerate(output):\n", - " print(f\"{promptid}\\n******* Output {i} ********\\n{o}\\n*************\\n\")" + " )\n", + " bleu, meteor = evaluate_documentation(output, [label])\n", + " for i, o in enumerate(output[0]):\n", + " print(f\"{promptid}\\n******* Output {i} ********. 
bleu={bleu}, meteor={meteor}\\n{o}\\n*************\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zero shot test\n", + "test(PromptIds.AUTORA_VARS_ZEROSHOT, TEST_VAR_CODE, LABEL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# One shot test\n", + "test(PromptIds.AUTORA_VARS_ONESHOT, TEST_VAR_CODE, LABEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One-shot generation for the complete code sample" ] }, { @@ -62,7 +107,13 @@ "metadata": {}, "outputs": [], "source": [ - "test(PromptIds.AUTORA_VARS_ZEROSHOT, TEST_VAR_CODE)" + "data_file = \"../data/autora/data.jsonl\"\n", + "inputs, labels = load_data(data_file)\n", + "# preprocessing removes comments, import statements and empty lines\n", + "inputs = [preprocess_code(i) for i in inputs]\n", + "INSTR = \"Generate high-level, one or two paragraph documentation for the following experiment.\"\n", + "prompt = PromptBuilder(SYS_GUIDES, INSTR).add_example(f\"{inputs[0]}\", labels[0]).build()\n", + "print(prompt)" ] }, { @@ -71,8 +122,16 @@ "metadata": {}, "outputs": [], "source": [ - "test(PromptIds.AUTORA_VARS_ONESHOT, TEST_VAR_CODE)" + "out, bleu, meteor = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n", + "print(f\"bleu={bleu}, meteor={meteor}\\n{out[0][0]}\\n*************\\n\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index 8c46761..2c122c7 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -1,7 +1,7 @@ import itertools import logging from timeit import default_timer as timer -from typing import List, Tuple +from typing import Dict, List, Tuple import nltk import torch @@ -20,13 +20,13 @@ logger = logging.getLogger(__name__) -def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]: +def evaluate_documentation(predictions: List[str], references: List[str]) -> Tuple[float, float]: nltk.download("wordnet") # Tokenize references tokenized_references = [ref.split() for ref in references] # Currently there is only 1 prediction for 1 reference, need to avg in future - tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions] + tokenized_predictions = [pred.split() if pred else [] for pred in predictions] # Calculate BLEU score with smoothing function # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference. 
@@ -55,16 +55,13 @@ def eval( param: List[str] = typer.Option( [], help="Additional float parameters to pass to the model as name=float pairs" ), -) -> List[List[str]]: - import jsonlines +) -> Tuple[List[str], float, float]: import mlflow mlflow.autolog() - - param_dict = {pair[0]: float(pair[1]) for pair in [pair.split("=") for pair in param]} run = mlflow.active_run() + param_dict = {pair[0]: float(pair[1]) for pair in [pair.split("=") for pair in param]} - prompt = PROMPTS[prompt_id] if run is None: run = mlflow.start_run() with run: @@ -75,36 +72,51 @@ def eval( mlflow.log_param("prompt_id", prompt_id) mlflow.log_param("model_path", model_path) mlflow.log_param("data_file", data_file) + prompt = PROMPTS[prompt_id] + pred = Predictor(model_path) + return eval_prompt(data_file, pred, prompt, param_dict) + + +def load_data(data_file: str) -> Tuple[List[str], List[str]]: + import jsonlines + + with jsonlines.open(data_file) as reader: + items = [item for item in reader] + inputs = [f"{item['instruction']}" for item in items] + labels = [item["output"] for item in items] + return inputs, labels + + +def eval_prompt( + data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float] +) -> Tuple[List[str], float, float]: + import mlflow - with jsonlines.open(data_file) as reader: - items = [item for item in reader] - inputs = [item["instruction"] for item in items] - labels = [item["output"] for item in items] - - pred = Predictor(model_path) - timer_start = timer() - predictions = pred.predict(prompt, inputs, **param_dict) - timer_end = timer() - bleu, meteor = evaluate_documentation(predictions, labels) - pred_time = timer_end - timer_start - mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs))) - for i in range(len(inputs)): - mlflow.log_text(labels[i], f"label_{i}.txt") - mlflow.log_text(inputs[i], f"input_{i}.py") - for j in range(len(predictions[i])): - mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt") - mlflow.log_text("bleu_score is ", str(bleu)) - mlflow.log_text("meteor_score is ", str(meteor)) - - # flatten predictions for counting tokens - predictions_flat = list(itertools.chain.from_iterable(predictions)) - tokens = pred.tokenize(predictions_flat)["input_ids"] - total_tokens = sum([len(token) for token in tokens]) - mlflow.log_metric("total_tokens", total_tokens) - mlflow.log_metric("tokens/sec", total_tokens / pred_time) - mlflow.log_metric("bleu_score", round(bleu, 5)) - mlflow.log_metric("meteor_score", round(meteor, 5)) - return predictions + inputs, labels = load_data(data_file) + + timer_start = timer() + predictions = pred.predict(prompt, inputs, **param_dict) + timer_end = timer() + bleu, meteor = evaluate_documentation(predictions, labels) + pred_time = timer_end - timer_start + mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs))) + for i in range(len(inputs)): + mlflow.log_text(labels[i], f"label_{i}.txt") + mlflow.log_text(inputs[i], f"input_{i}.py") + for j in range(len(predictions[i])): + mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt") + mlflow.log_text("bleu_score is ", str(bleu)) + mlflow.log_text("meteor_score is ", str(meteor)) + + # flatten predictions for counting tokens + predictions_flat = list(itertools.chain.from_iterable(predictions)) + tokens = pred.tokenize(predictions_flat)["input_ids"] + total_tokens = sum([len(token) for token in tokens]) + mlflow.log_metric("total_tokens", total_tokens) + mlflow.log_metric("tokens/sec", total_tokens / pred_time) + mlflow.log_metric("bleu_score", 
round(bleu, 5)) + mlflow.log_metric("meteor_score", round(meteor, 5)) + return predictions, bleu, meteor @app.command() @@ -126,7 +138,7 @@ def generate( prompt = PROMPTS[prompt_id] pred = Predictor(model_path) # grab first result since we only passed one input - predictions = pred.predict(prompt, [input], **param_dict)[0] + predictions = pred.predict(prompt, [input], **param_dict) assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}" logger.info(f"Writing output to {output}") with open(output, "w") as f: diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py index 599ba04..9a5adbd 100644 --- a/src/autora/doc/runtime/predict_hf.py +++ b/src/autora/doc/runtime/predict_hf.py @@ -1,15 +1,25 @@ import logging -from typing import Dict, List +from typing import Dict, Iterable, List import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer -from autora.doc.runtime.prompts import LLAMA2_INST_CLOSE +from autora.doc.runtime.prompts import CODE_PLACEHOLDER, LLAMA2_INST_CLOSE logger = logging.getLogger(__name__) +def preprocess_code(code: str) -> str: + lines: Iterable[str] = code.splitlines() + skip_starts = {"import", "from", "#"} + lines = filter( + lambda line: not (any([line.strip().startswith(skip) for skip in skip_starts]) or line.strip() == ""), + lines, + ) + return "\n".join(lines) + + class Predictor: def __init__(self, model_path: str): config = self.get_config() @@ -35,16 +45,18 @@ def predict( temperature: float = 0.01, top_p: float = 0.95, top_k: float = 1, - max_length: float = 2048, + max_new_tokens: float = 2048, num_ret_seq: float = 1, - ) -> List[List[str]]: + ) -> List[str]: # convert to bool in case it came in as a generate float param from the CLI do_sample = bool(do_sample) logger.info( f"Generating {len(inputs)} predictions. do_sample: {do_sample}, temperature: {temperature}, top_p: {top_p}," - f" top_k: {top_k}, max_length: {max_length}" + f" top_k: {top_k}, max_new_tokens: {max_new_tokens}" ) - prompts = [prompt_template.format(code=input) for input in inputs] + prompts = [ + prompt_template.replace(CODE_PLACEHOLDER, preprocess_code(input).strip("\n")) for input in inputs + ] sequences = self.pipeline( prompts, do_sample=do_sample, @@ -53,12 +65,10 @@ def predict( top_k=int(top_k), num_return_sequences=int(num_ret_seq), eos_token_id=self.tokenizer.eos_token_id, - max_length=int(max_length), + max_new_tokens=int(max_new_tokens), ) - results = [ - [Predictor.trim_prompt(seq["generated_text"]) for seq in sequence] for sequence in sequences - ] + results = [Predictor.trim_prompt(seq["generated_text"]) for sequence in sequences for seq in sequence] logger.info(f"Generated {len(results)} results") return results diff --git a/src/autora/doc/runtime/prompts.py b/src/autora/doc/runtime/prompts.py index d6fef47..5b6dc53 100644 --- a/src/autora/doc/runtime/prompts.py +++ b/src/autora/doc/runtime/prompts.py @@ -1,9 +1,9 @@ from __future__ import annotations -import textwrap from enum import Enum LLAMA2_INST_CLOSE = "[/INST]\n" +CODE_PLACEHOLDER = "__CODE_INPUT__" class PromptBuilder: @@ -15,39 +15,46 @@ class PromptBuilder: def __init__(self, sys: str, instr: str): self.instr = instr # Initialize the prompt with the system prompt - self.prompt_text = f""" - [INST] <> - { sys } - <> - """ + self.prompt_text = PromptBuilder._trim_leading_ws( + f""" + [INST] <> + { sys } + <> + """ + ) def _add_input(self) -> PromptBuilder: # Add the instruction (e.g. 
"Generate a one line descrip...") # and a placeholder for the code - self.prompt_text += f""" - { self.instr } - ----------{{code}}---------- - """ + self.prompt_text += PromptBuilder._trim_leading_ws( + f""" + { self.instr } + ---------- + {CODE_PLACEHOLDER} + ---------- + """ + ) return self def add_example(self, code: str, doc: str) -> PromptBuilder: # This adds an example in the form of instruction+code+doc self._add_input() - self.prompt_text = self.prompt_text.format(code=code) - self.prompt_text += f""" - [/INST] - {doc} - - - [INST] - """ + self.prompt_text = self.prompt_text.replace(CODE_PLACEHOLDER, code) + self.prompt_text += PromptBuilder._trim_leading_ws( + f""" + [/INST] + {doc} + + + [INST] + """ + ) return self def build(self) -> str: # Add a instruction+code placeholder and close the instruction self._add_input() - self.prompt_text = PromptBuilder._trim_leading_ws(self.prompt_text) - self.prompt_text += LLAMA2_INST_CLOSE + self.prompt_text = self.prompt_text + LLAMA2_INST_CLOSE return self.prompt_text @staticmethod @@ -63,12 +70,13 @@ def _trim_leading_ws(s: str) -> str: SYS_GUIDES = """You are a technical documentation writer. You always write clear, concise, and accurate documentation for scientific experiments. Your documentation focuses on the experiment's procedure. Therefore, details about specific -python functions, packages, or libraries are not necessary. Your readers are experimental scientists. Use the following -guidelines for writing your descriptions: -- Do not write greetings or preambles +python functions, packages, or libraries are NOT necessary. Your readers are experimental scientists. +For writing your descriptions, follow these instructions: +- DO NOT write greetings or preambles - Use the Variable 'name' attribute and not the python variable names - Use LaTeX for math expressions -- Do not include code or code-like syntax and do not use python function or class names +- DO NOT include code or code-like syntax and do not use python function or class names +- Write in paragraph style, NOT bullet points """ INSTR_SWEETP_1 = ( diff --git a/tests/test_main.py b/tests/test_main.py index f736ce2..46a74f5 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -12,10 +12,10 @@ def test_predict() -> None: data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve() - outputs = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, []) + outputs, _, _ = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, []) assert len(outputs) == 3, "Expected 3 outputs" for output in outputs: - assert len(output[0]) > 0, "Expected non-empty output" + assert len(output) > 0, "Expected non-empty output" def test_evaluation() -> None: @@ -24,7 +24,7 @@ def test_evaluation() -> None: with jsonlines.open(data) as reader: items = [item for item in reader] labels = [item["output"] for item in items] - predictions = [[item["output"]] for item in items] + predictions = [item["output"] for item in items] bleu, meteor = evaluate_documentation(predictions, labels) assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}" @@ -34,7 +34,7 @@ def test_evaluation() -> None: def test_extra_token_in_prediction() -> None: # Test Case bleu score should be less due to brevity penalty and meteor is robust to small mistakes labels = ["this is a test"] - predictions = [["this is a test extra"]] + predictions = ["this is a test extra"] bleu, meteor = evaluate_documentation(predictions, labels) assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}" assert 0.8 <= meteor <= 1, f"METEOR 
Score is {meteor}" @@ -43,7 +43,7 @@ def test_extra_token_in_prediction() -> None: def test_missing_token_in_prediction() -> None: # bleu score is less, meteor is higher labels = ["this is a test"] - predictions = [["this is a"]] + predictions = ["this is a"] bleu, meteor = evaluate_documentation(predictions, labels) assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}" assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}" @@ -52,7 +52,7 @@ def test_missing_token_in_prediction() -> None: def test_completely_different_tokens() -> None: # both scores are less, as no common tokens labels = ["this is a test"] - predictions = [["completely different sentence"]] + predictions = ["completely different sentence"] bleu, meteor = evaluate_documentation(predictions, labels) assert bleu <= 0.1, f"BLEU Score is {bleu}" assert meteor <= 0.1, f"METEOR Score is {meteor}" @@ -61,7 +61,7 @@ def test_completely_different_tokens() -> None: def test_partially_matching_tokens() -> None: # As ngrams arent matching because of extra token within, BLEU score is very less. Meteor gives a good score only. labels = ["this is a test"] - predictions = [["this is a different test"]] + predictions = ["this is a different test"] bleu, meteor = evaluate_documentation(predictions, labels) assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}" assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
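
---

For reference, the refactored API introduced by this patch can be exercised outside the notebook roughly as follows. This is a minimal sketch based on the signatures shown in the diff (`load_data`, `eval_prompt`, `PromptBuilder`, `preprocess_code`, `Predictor`); the model name and data path are copied from the notebook cells and may need to be adapted to the local environment.

```python
# Minimal sketch of the refactored one-shot evaluation flow (mirrors the notebook cells above).
# Assumes the autora-doc package is installed and the JSONL data file exists at this path.
from autora.doc.pipelines.main import eval_prompt, load_data
from autora.doc.runtime.predict_hf import Predictor, preprocess_code
from autora.doc.runtime.prompts import SYS_GUIDES, PromptBuilder

model = "meta-llama/Llama-2-7b-chat-hf"  # or a local path to a previously downloaded model
data_file = "../data/autora/data.jsonl"

# Load (instruction, output) pairs and strip comments, import statements, and blank lines
# from the code inputs, as done in the notebook.
inputs, labels = load_data(data_file)
inputs = [preprocess_code(i) for i in inputs]

# Build a one-shot prompt: system guidelines + instruction + a single worked example.
instr = "Generate high-level, one or two paragraph documentation for the following experiment."
prompt = PromptBuilder(SYS_GUIDES, instr).add_example(inputs[0], labels[0]).build()

# Run the evaluation; eval_prompt logs metrics to MLflow and now returns the
# predictions together with the BLEU and METEOR scores.
pred = Predictor(model)
predictions, bleu, meteor = eval_prompt(data_file, pred, prompt, {"max_new_tokens": 800.0})
print(f"bleu={bleu}, meteor={meteor}\n{predictions[0]}")
```

Note the design change this reflects: `Predictor.predict` now returns a flat `List[str]` and `eval_prompt`/`eval` return `(predictions, bleu, meteor)`, which is why the tests unpack three values and index predictions directly instead of via nested lists.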