diff --git a/notebooks/generate.ipynb b/notebooks/generate.ipynb index 89861fd..44de82f 100644 --- a/notebooks/generate.ipynb +++ b/notebooks/generate.ipynb @@ -8,8 +8,19 @@ "source": [ "%load_ext autoreload\n", "%autoreload 2\n", - "from autora.doc.runtime.predict_hf import Predictor\n", - "from autora.doc.runtime.prompts import PROMPTS, PromptIds" + "from autora.doc.runtime.predict_hf import Predictor, preprocess_code\n", + "from autora.doc.runtime.prompts import PROMPTS, PromptIds, PromptBuilder, SYS_GUIDES\n", + "from autora.doc.pipelines.main import evaluate_documentation\n", + "from autora.doc.pipelines.main import eval_prompt, load_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = \"meta-llama/Llama-2-7b-chat-hf\"" ] }, { @@ -18,11 +29,16 @@ "metadata": {}, "outputs": [], "source": [ - "# model = \"../../models\" # if model has been previously downloaded via huggingface-cli\n", - "model = \"meta-llama/Llama-2-7b-chat-hf\"\n", "pred = Predictor(model)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test generation for the variable declararion only" + ] + }, { "cell_type": "code", "execution_count": null, @@ -33,7 +49,8 @@ "iv = Variable(name=\"x\", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))\n", "dv = Variable(name=\"y\", type=ValueType.REAL)\n", "variables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])\n", - "\"\"\"" + "\"\"\"\n", + "LABEL = \"The discovery problem is defined by a single independent variable $x \\in [0, 2 \\pi]$ and dependent variable $y$.\"" ] }, { @@ -42,18 +59,46 @@ "metadata": {}, "outputs": [], "source": [ - "def test(promptid, code):\n", + "def test(promptid, code, label):\n", " output = pred.predict(\n", " PROMPTS[promptid],\n", " [code],\n", " do_sample=0,\n", - " max_length=800,\n", + " max_new_tokens=100,\n", " temperature=0.05,\n", " top_k=10,\n", " num_ret_seq=1,\n", - " )[0]\n", - " for i, o in enumerate(output):\n", - " print(f\"{promptid}\\n******* Output {i} ********\\n{o}\\n*************\\n\")" + " )\n", + " bleu, meteor = evaluate_documentation(output, [label])\n", + " for i, o in enumerate(output[0]):\n", + " print(f\"{promptid}\\n******* Output {i} ********. 
bleu={bleu}, meteor={meteor}\\n{o}\\n*************\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zero shot test\n", + "test(PromptIds.AUTORA_VARS_ZEROSHOT, TEST_VAR_CODE, LABEL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# One shot test\n", + "test(PromptIds.AUTORA_VARS_ONESHOT, TEST_VAR_CODE, LABEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One-shot generation for the complete code sample" ] }, { @@ -62,7 +107,13 @@ "metadata": {}, "outputs": [], "source": [ - "test(PromptIds.AUTORA_VARS_ZEROSHOT, TEST_VAR_CODE)" + "data_file = \"../data/autora/data.jsonl\"\n", + "inputs, labels = load_data(data_file)\n", + "# preprocessing removes comments, import statements and empty lines\n", + "inputs = [preprocess_code(i) for i in inputs]\n", + "INSTR = \"Generate high-level, one or two paragraph documentation for the following experiment.\"\n", + "prompt = PromptBuilder(SYS_GUIDES, INSTR).add_example(f\"{inputs[0]}\", labels[0]).build()\n", + "print(prompt)" ] }, { @@ -71,8 +122,16 @@ "metadata": {}, "outputs": [], "source": [ - "test(PromptIds.AUTORA_VARS_ONESHOT, TEST_VAR_CODE)" + "out, bleu, meteor = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n", + "print(f\"bleu={bleu}, meteor={meteor}\\n{out[0][0]}\\n*************\\n\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index 8c46761..2c122c7 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -1,7 +1,7 @@ import itertools import logging from timeit import default_timer as timer -from typing import List, Tuple +from typing import Dict, List, Tuple import nltk import torch @@ -20,13 +20,13 @@ logger = logging.getLogger(__name__) -def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]: +def evaluate_documentation(predictions: List[str], references: List[str]) -> Tuple[float, float]: nltk.download("wordnet") # Tokenize references tokenized_references = [ref.split() for ref in references] # Currently there is only 1 prediction for 1 reference, need to avg in future - tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions] + tokenized_predictions = [pred.split() if pred else [] for pred in predictions] # Calculate BLEU score with smoothing function # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference. 
@@ -55,16 +55,13 @@ def eval( param: List[str] = typer.Option( [], help="Additional float parameters to pass to the model as name=float pairs" ), -) -> List[List[str]]: - import jsonlines +) -> Tuple[List[str], float, float]: import mlflow mlflow.autolog() - - param_dict = {pair[0]: float(pair[1]) for pair in [pair.split("=") for pair in param]} run = mlflow.active_run() + param_dict = {pair[0]: float(pair[1]) for pair in [pair.split("=") for pair in param]} - prompt = PROMPTS[prompt_id] if run is None: run = mlflow.start_run() with run: @@ -75,36 +72,51 @@ def eval( mlflow.log_param("prompt_id", prompt_id) mlflow.log_param("model_path", model_path) mlflow.log_param("data_file", data_file) + prompt = PROMPTS[prompt_id] + pred = Predictor(model_path) + return eval_prompt(data_file, pred, prompt, param_dict) + + +def load_data(data_file: str) -> Tuple[List[str], List[str]]: + import jsonlines + + with jsonlines.open(data_file) as reader: + items = [item for item in reader] + inputs = [f"{item['instruction']}" for item in items] + labels = [item["output"] for item in items] + return inputs, labels + + +def eval_prompt( + data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float] +) -> Tuple[List[str], float, float]: + import mlflow - with jsonlines.open(data_file) as reader: - items = [item for item in reader] - inputs = [item["instruction"] for item in items] - labels = [item["output"] for item in items] - - pred = Predictor(model_path) - timer_start = timer() - predictions = pred.predict(prompt, inputs, **param_dict) - timer_end = timer() - bleu, meteor = evaluate_documentation(predictions, labels) - pred_time = timer_end - timer_start - mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs))) - for i in range(len(inputs)): - mlflow.log_text(labels[i], f"label_{i}.txt") - mlflow.log_text(inputs[i], f"input_{i}.py") - for j in range(len(predictions[i])): - mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt") - mlflow.log_text("bleu_score is ", str(bleu)) - mlflow.log_text("meteor_score is ", str(meteor)) - - # flatten predictions for counting tokens - predictions_flat = list(itertools.chain.from_iterable(predictions)) - tokens = pred.tokenize(predictions_flat)["input_ids"] - total_tokens = sum([len(token) for token in tokens]) - mlflow.log_metric("total_tokens", total_tokens) - mlflow.log_metric("tokens/sec", total_tokens / pred_time) - mlflow.log_metric("bleu_score", round(bleu, 5)) - mlflow.log_metric("meteor_score", round(meteor, 5)) - return predictions + inputs, labels = load_data(data_file) + + timer_start = timer() + predictions = pred.predict(prompt, inputs, **param_dict) + timer_end = timer() + bleu, meteor = evaluate_documentation(predictions, labels) + pred_time = timer_end - timer_start + mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs))) + for i in range(len(inputs)): + mlflow.log_text(labels[i], f"label_{i}.txt") + mlflow.log_text(inputs[i], f"input_{i}.py") + for j in range(len(predictions[i])): + mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt") + mlflow.log_text("bleu_score is ", str(bleu)) + mlflow.log_text("meteor_score is ", str(meteor)) + + # flatten predictions for counting tokens + predictions_flat = list(itertools.chain.from_iterable(predictions)) + tokens = pred.tokenize(predictions_flat)["input_ids"] + total_tokens = sum([len(token) for token in tokens]) + mlflow.log_metric("total_tokens", total_tokens) + mlflow.log_metric("tokens/sec", total_tokens / pred_time) + mlflow.log_metric("bleu_score", 
round(bleu, 5)) + mlflow.log_metric("meteor_score", round(meteor, 5)) + return predictions, bleu, meteor @app.command() @@ -126,7 +138,7 @@ def generate( prompt = PROMPTS[prompt_id] pred = Predictor(model_path) # grab first result since we only passed one input - predictions = pred.predict(prompt, [input], **param_dict)[0] + predictions = pred.predict(prompt, [input], **param_dict) assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}" logger.info(f"Writing output to {output}") with open(output, "w") as f: diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py index 599ba04..9a5adbd 100644 --- a/src/autora/doc/runtime/predict_hf.py +++ b/src/autora/doc/runtime/predict_hf.py @@ -1,15 +1,25 @@ import logging -from typing import Dict, List +from typing import Dict, Iterable, List import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer -from autora.doc.runtime.prompts import LLAMA2_INST_CLOSE +from autora.doc.runtime.prompts import CODE_PLACEHOLDER, LLAMA2_INST_CLOSE logger = logging.getLogger(__name__) +def preprocess_code(code: str) -> str: + lines: Iterable[str] = code.splitlines() + skip_starts = {"import", "from", "#"} + lines = filter( + lambda line: not (any([line.strip().startswith(skip) for skip in skip_starts]) or line.strip() == ""), + lines, + ) + return "\n".join(lines) + + class Predictor: def __init__(self, model_path: str): config = self.get_config() @@ -35,16 +45,18 @@ def predict( temperature: float = 0.01, top_p: float = 0.95, top_k: float = 1, - max_length: float = 2048, + max_new_tokens: float = 2048, num_ret_seq: float = 1, - ) -> List[List[str]]: + ) -> List[str]: # convert to bool in case it came in as a generate float param from the CLI do_sample = bool(do_sample) logger.info( f"Generating {len(inputs)} predictions. do_sample: {do_sample}, temperature: {temperature}, top_p: {top_p}," - f" top_k: {top_k}, max_length: {max_length}" + f" top_k: {top_k}, max_new_tokens: {max_new_tokens}" ) - prompts = [prompt_template.format(code=input) for input in inputs] + prompts = [ + prompt_template.replace(CODE_PLACEHOLDER, preprocess_code(input).strip("\n")) for input in inputs + ] sequences = self.pipeline( prompts, do_sample=do_sample, @@ -53,12 +65,10 @@ def predict( top_k=int(top_k), num_return_sequences=int(num_ret_seq), eos_token_id=self.tokenizer.eos_token_id, - max_length=int(max_length), + max_new_tokens=int(max_new_tokens), ) - results = [ - [Predictor.trim_prompt(seq["generated_text"]) for seq in sequence] for sequence in sequences - ] + results = [Predictor.trim_prompt(seq["generated_text"]) for sequence in sequences for seq in sequence] logger.info(f"Generated {len(results)} results") return results diff --git a/src/autora/doc/runtime/prompts.py b/src/autora/doc/runtime/prompts.py index d6fef47..5b6dc53 100644 --- a/src/autora/doc/runtime/prompts.py +++ b/src/autora/doc/runtime/prompts.py @@ -1,9 +1,9 @@ from __future__ import annotations -import textwrap from enum import Enum LLAMA2_INST_CLOSE = "[/INST]\n" +CODE_PLACEHOLDER = "__CODE_INPUT__" class PromptBuilder: @@ -15,39 +15,46 @@ class PromptBuilder: def __init__(self, sys: str, instr: str): self.instr = instr # Initialize the prompt with the system prompt - self.prompt_text = f""" - [INST] <> - { sys } - <> - """ + self.prompt_text = PromptBuilder._trim_leading_ws( + f""" + [INST] <> + { sys } + <> + """ + ) def _add_input(self) -> PromptBuilder: # Add the instruction (e.g. 
"Generate a one line descrip...") # and a placeholder for the code - self.prompt_text += f""" - { self.instr } - ----------{{code}}---------- - """ + self.prompt_text += PromptBuilder._trim_leading_ws( + f""" + { self.instr } + ---------- + {CODE_PLACEHOLDER} + ---------- + """ + ) return self def add_example(self, code: str, doc: str) -> PromptBuilder: # This adds an example in the form of instruction+code+doc self._add_input() - self.prompt_text = self.prompt_text.format(code=code) - self.prompt_text += f""" - [/INST] - {doc} - - - [INST] - """ + self.prompt_text = self.prompt_text.replace(CODE_PLACEHOLDER, code) + self.prompt_text += PromptBuilder._trim_leading_ws( + f""" + [/INST] + {doc} + + + [INST] + """ + ) return self def build(self) -> str: # Add a instruction+code placeholder and close the instruction self._add_input() - self.prompt_text = PromptBuilder._trim_leading_ws(self.prompt_text) - self.prompt_text += LLAMA2_INST_CLOSE + self.prompt_text = self.prompt_text + LLAMA2_INST_CLOSE return self.prompt_text @staticmethod @@ -63,12 +70,13 @@ def _trim_leading_ws(s: str) -> str: SYS_GUIDES = """You are a technical documentation writer. You always write clear, concise, and accurate documentation for scientific experiments. Your documentation focuses on the experiment's procedure. Therefore, details about specific -python functions, packages, or libraries are not necessary. Your readers are experimental scientists. Use the following -guidelines for writing your descriptions: -- Do not write greetings or preambles +python functions, packages, or libraries are NOT necessary. Your readers are experimental scientists. +For writing your descriptions, follow these instructions: +- DO NOT write greetings or preambles - Use the Variable 'name' attribute and not the python variable names - Use LaTeX for math expressions -- Do not include code or code-like syntax and do not use python function or class names +- DO NOT include code or code-like syntax and do not use python function or class names +- Write in paragraph style, NOT bullet points """ INSTR_SWEETP_1 = ( diff --git a/tests/test_main.py b/tests/test_main.py index f736ce2..46a74f5 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -12,10 +12,10 @@ def test_predict() -> None: data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve() - outputs = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, []) + outputs, _, _ = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, []) assert len(outputs) == 3, "Expected 3 outputs" for output in outputs: - assert len(output[0]) > 0, "Expected non-empty output" + assert len(output) > 0, "Expected non-empty output" def test_evaluation() -> None: @@ -24,7 +24,7 @@ def test_evaluation() -> None: with jsonlines.open(data) as reader: items = [item for item in reader] labels = [item["output"] for item in items] - predictions = [[item["output"]] for item in items] + predictions = [item["output"] for item in items] bleu, meteor = evaluate_documentation(predictions, labels) assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}" @@ -34,7 +34,7 @@ def test_evaluation() -> None: def test_extra_token_in_prediction() -> None: # Test Case bleu score should be less due to brevity penalty and meteor is robust to small mistakes labels = ["this is a test"] - predictions = [["this is a test extra"]] + predictions = ["this is a test extra"] bleu, meteor = evaluate_documentation(predictions, labels) assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}" assert 0.8 <= meteor <= 1, f"METEOR 
Score is {meteor}" @@ -43,7 +43,7 @@ def test_extra_token_in_prediction() -> None: def test_missing_token_in_prediction() -> None: # bleu score is less, meteor is higher labels = ["this is a test"] - predictions = [["this is a"]] + predictions = ["this is a"] bleu, meteor = evaluate_documentation(predictions, labels) assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}" assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}" @@ -52,7 +52,7 @@ def test_missing_token_in_prediction() -> None: def test_completely_different_tokens() -> None: # both scores are less, as no common tokens labels = ["this is a test"] - predictions = [["completely different sentence"]] + predictions = ["completely different sentence"] bleu, meteor = evaluate_documentation(predictions, labels) assert bleu <= 0.1, f"BLEU Score is {bleu}" assert meteor <= 0.1, f"METEOR Score is {meteor}" @@ -61,7 +61,7 @@ def test_completely_different_tokens() -> None: def test_partially_matching_tokens() -> None: # As ngrams arent matching because of extra token within, BLEU score is very less. Meteor gives a good score only. labels = ["this is a test"] - predictions = [["this is a different test"]] + predictions = ["this is a different test"] bleu, meteor = evaluate_documentation(predictions, labels) assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}" assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
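
---

For reference, the refactored API introduced by this patch can be exercised outside the notebook roughly as follows. This is a minimal sketch based on the signatures shown in the diff (`load_data`, `eval_prompt`, `PromptBuilder`, `preprocess_code`, `Predictor`); the model name and data path are copied from the notebook cells and may need to be adapted to the local environment.

```python
# Minimal sketch of the refactored one-shot evaluation flow (mirrors the notebook cells above).
# Assumes the autora-doc package is installed and the JSONL data file exists at this path.
from autora.doc.pipelines.main import eval_prompt, load_data
from autora.doc.runtime.predict_hf import Predictor, preprocess_code
from autora.doc.runtime.prompts import SYS_GUIDES, PromptBuilder

model = "meta-llama/Llama-2-7b-chat-hf"  # or a local path to a previously downloaded model
data_file = "../data/autora/data.jsonl"

# Load (instruction, output) pairs and strip comments, import statements, and blank lines
# from the code inputs, as done in the notebook.
inputs, labels = load_data(data_file)
inputs = [preprocess_code(i) for i in inputs]

# Build a one-shot prompt: system guidelines + instruction + a single worked example.
instr = "Generate high-level, one or two paragraph documentation for the following experiment."
prompt = PromptBuilder(SYS_GUIDES, instr).add_example(inputs[0], labels[0]).build()

# Run the evaluation; eval_prompt logs metrics to MLflow and now returns the
# predictions together with the BLEU and METEOR scores.
pred = Predictor(model)
predictions, bleu, meteor = eval_prompt(data_file, pred, prompt, {"max_new_tokens": 800.0})
print(f"bleu={bleu}, meteor={meteor}\n{predictions[0]}")
```

Note the design change this reflects: `Predictor.predict` now returns a flat `List[str]` and `eval_prompt`/`eval` return `(predictions, bleu, meteor)`, which is why the tests unpack three values and index predictions directly instead of via nested lists.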