feat(evals): add an output_parser to llm_generate (#1736)
* feat(evals): add an output_parser param for structured data extraction

* remove brittle test
mikeldking authored Nov 14, 2023
1 parent f9c0fc2 commit 6408dda
Showing 2 changed files with 62 additions and 32 deletions.
30 changes: 24 additions & 6 deletions src/phoenix/experimental/evals/functions/generate.py
@@ -1,5 +1,5 @@
import logging
from typing import List, Optional, Union
from typing import Any, Callable, Dict, Optional, Union

import pandas as pd

@@ -9,13 +9,18 @@
logger = logging.getLogger(__name__)


def _no_op_parser(response: str) -> Dict[str, str]:
return {"output": response}


def llm_generate(
dataframe: pd.DataFrame,
template: Union[PromptTemplate, str],
model: BaseEvalModel,
system_instruction: Optional[str] = None,
verbose: bool = False,
) -> List[str]:
output_parser: Optional[Callable[[str], Dict[str, Any]]] = None,
) -> pd.DataFrame:
"""
Generates text from a template using an LLM. This function is useful
if you want to generate synthetic data, such as irrelevant responses
@@ -38,16 +43,29 @@ def llm_generate(
verbose (bool, optional): If True, prints detailed information to stdout such as model
invocation parameters and retry info. Default False.
output_parser (Callable[[str], Dict[str, Any]], optional): An optional function
that takes each generated response and parses it to a dictionary. The keys of the dictionary
should correspond to the column names of the output dataframe. If None, the output dataframe
will have a single column named "output". Default None.
Returns:
List[Optional[str]]: A list of strings representing the output of the
model for each record
pandas.DataFrame: A dataframe where each row represents the generated output
"""
output_parser = output_parser or _no_op_parser
with set_verbosity(model, verbose) as verbose_model:
template = normalize_template(template)
logger.info(f"Template: \n{template.text}\n")
logger.info(f"Template variables: {template.variables}")
prompts = map_template(dataframe, template)

responses = verbose_model.generate(prompts.to_list(), system_instruction)
return responses
# For each prompt, generate and parse the response
output = []
for prompt in prompts:
logger.info(f"Prompt: {prompt}")
response = verbose_model(prompt, instruction=system_instruction)
parsed_response = output_parser(response)
output.append(parsed_response)

# Return the data as a dataframe
return pd.DataFrame(output)
64 changes: 38 additions & 26 deletions tests/experimental/evals/functions/test_generate.py
@@ -1,6 +1,9 @@
import json
from typing import Dict
from unittest.mock import patch

import httpx
import numpy as np
import pandas as pd
import pytest
import respx
@@ -54,61 +57,70 @@ def test_llm_generate(monkeypatch: pytest.MonkeyPatch, respx_mock: respx.mock):
model = OpenAIModel()

generated = llm_generate(dataframe=dataframe, template=template, model=model)
assert generated == [
"it's a dialect of french",
"it's a music notation",
"It's a crazy language",
"it's a programming language",
]
assert generated.iloc[:, 0].tolist() == responses


@pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
def test_llm_generate_prints_info_with_verbose_flag(
monkeypatch: pytest.MonkeyPatch, capfd, respx_mock: respx.mock
):
def test_llm_generate_with_output_parser(monkeypatch: pytest.MonkeyPatch, respx_mock: respx.mock):
monkeypatch.setenv(OPENAI_API_KEY_ENVVAR_NAME, "sk-0123456789")
dataframe = pd.DataFrame(
[
{
"query": "What is Python?",
"reference": "Python is a programming language.",
},
{
"query": "What is Python?",
"reference": "Ruby is a programming language.",
},
{
"query": "What is C++?",
"reference": "C++ is a programming language.",
},
{
"query": "What is C++?",
"reference": "irrelevant",
},
{
"query": "gobbledygook",
},
]
)
responses = [
"it's a dialect of french",
"it's a music notation",
"It's a crazy language",
"it's a programming language",
'{ "category": "programming", "language": "Python" }',
'{ "category": "programming", "language": "Python" }',
'{ "category": "programming", "language": "C++" }',
'{ "category": "programming", "language": "C++" }',
"unparsable response",
]
queries = dataframe["query"].tolist()
references = dataframe["reference"].tolist()
for query, reference, response in zip(queries, references, responses):
matcher = M(content__contains=query) & M(content__contains=reference)

for query, response in zip(queries, responses):
matcher = M(content__contains=query) & M(content__contains=query)
respx_mock.route(matcher).mock(
return_value=httpx.Response(200, json={"choices": [{"message": {"content": response}}]})
)

template = (
"Given {query} and a golden answer {reference}, generate an answer that is incorrect."
)
template = "Given {query}, generate output"

with patch.object(OpenAIModel, "_init_tiktoken", return_value=None):
model = OpenAIModel()

llm_generate(dataframe=dataframe, template=template, model=model, verbose=True)
def output_parser(response: str) -> Dict[str, str]:
try:
return json.loads(response)
except json.JSONDecodeError as e:
return {"__error__": str(e)}

generated = llm_generate(
dataframe=dataframe, template=template, model=model, output_parser=output_parser
)
# check the output is parsed correctly
assert generated["category"].tolist() == [
"programming",
"programming",
"programming",
"programming",
np.nan,
]

out, _ = capfd.readouterr()
assert "Generating responses for 4 prompts..." in out, "Response generation should be printed"
# check the unparsable response captures the error
assert generated["__error__"].tolist() == [np.nan] * 4 + [
"Expecting value: line 1 column 1 (char 0)"
]
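
A note on the np.nan assertions above: llm_generate builds its result with pd.DataFrame(output) from one dict per row, and pandas fills columns that a given dict does not provide with NaN. The sketch below (hypothetical values, not from the commit) shows that behavior in isolation.

import pandas as pd

# Rows with different keys, as produced when some responses parse and some fail.
rows = [
    {"category": "programming", "language": "Python"},
    {"__error__": "Expecting value: line 1 column 1 (char 0)"},
]
df = pd.DataFrame(rows)

print(df["category"].tolist())   # ['programming', nan]
print(df["__error__"].tolist())  # [nan, 'Expecting value: line 1 column 1 (char 0)']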
