-
Notifications
You must be signed in to change notification settings - Fork 336
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(traces): evaluation annotations on traces for associating spans …
…with eval metrics (#1693) * feat: initial associations of evaluations to traces * add some documentation * wip: add dataframe utils * Switch to a single evaluation per dataframe * make copy the default * fix doc string * fix name * fix notebook * Add immutability * remove value from being required * fix tutorials formatting * make type a string to see if it fixes tests * fix test to handle un-parsable * Update src/phoenix/trace/trace_eval_dataset.py Co-authored-by: Xander Song <[email protected]> * Update src/phoenix/trace/trace_eval_dataset.py Co-authored-by: Xander Song <[email protected]> * change to trace_evaluations * cleanup * Fix formatting * pr comments * cleanup notebook * make sure columns are dropped * remove unused test --------- Co-authored-by: Xander Song <[email protected]>
- Loading branch information
1 parent
13d019f
commit a218a65
Showing
7 changed files
with
356 additions
and
102 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from .span_evaluations import SpanEvaluations
from .trace_dataset import TraceDataset

# Names re-exported as the public API of this package.
__all__ = ["TraceDataset", "SpanEvaluations"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import pandas as pd

# Name of the index that links each evaluation row to a span.
EVALUATIONS_INDEX_NAME = "context.span_id"
# The only columns retained on the evaluations dataframe.
RESULTS_COLUMN_NAMES = ["score", "label", "explanation"]

# Prefix applied to exported column names (e.g. "eval.toxicity.score").
EVAL_NAME_COLUMN_PREFIX = "eval."


class SpanEvaluations:
    """
    SpanEvaluations is a set of evaluation annotations for a set of spans.

    SpanEvaluations encompasses the evaluation annotations for a single
    evaluation task such as toxicity or hallucinations. SpanEvaluations can
    be appended to TraceDatasets so that the spans and evaluations can be
    joined and analyzed together.

    Parameters
    ----------
    eval_name: str
        the name of the evaluation, e.g. 'toxicity'
    dataframe: pandas.DataFrame
        the pandas dataframe containing the evaluation annotations. Each row
        represents the evaluations on a span. It must be indexed by (or
        contain a column named) 'context.span_id'; any columns other than
        'score', 'label', and 'explanation' are dropped.

    Raises
    ------
    ValueError
        if the dataframe is not indexed by 'context.span_id' and has no
        column of that name that can be promoted to the index.

    Example
    -------
    A DataFrame of evaluations for toxicity may look like:

    | span_id | score | label     | explanation    |
    |---------|-------|-----------|----------------|
    | span_1  | 1     | toxic     | bad language   |
    | span_2  | 0     | non-toxic | no violations  |
    | span_3  | 1     | toxic     | discrimination |
    """

    # Evaluation results, indexed by span id, restricted to
    # RESULTS_COLUMN_NAMES.
    dataframe: pd.DataFrame

    # The name for the evaluation, e.g. 'toxicity'.
    eval_name: str

    def __init__(self, eval_name: str, dataframe: pd.DataFrame) -> None:
        self.eval_name = eval_name

        # If the dataframe carries the span id as a regular column, promote
        # it to the index.
        if EVALUATIONS_INDEX_NAME in dataframe.columns:
            dataframe = dataframe.set_index(EVALUATIONS_INDEX_NAME)

        # Validate that the dataframe is indexed by context.span_id.
        if dataframe.index.name != EVALUATIONS_INDEX_NAME:
            raise ValueError(
                f"The dataframe index must be '{EVALUATIONS_INDEX_NAME}' but was "
                f"'{dataframe.index.name}'"
            )

        # Keep only the result columns. drop() returns a copy, so the
        # caller's dataframe is never mutated and this instance holds its
        # own data.
        extra_column_names = dataframe.columns.difference(RESULTS_COLUMN_NAMES)
        self.dataframe = dataframe.drop(extra_column_names, axis=1)

    def get_dataframe(self, prefix_columns_with_name: bool = True) -> pd.DataFrame:
        """
        Returns a copy of the dataframe with the evaluation annotations.

        Parameters
        ----------
        prefix_columns_with_name: bool
            if True (the default), the columns are prefixed with the eval
            name, e.g. 'eval.toxicity.score'
        """
        if prefix_columns_with_name:
            prefix = f"{EVAL_NAME_COLUMN_PREFIX}{self.eval_name}."
            return self.dataframe.add_prefix(prefix)
        return self.dataframe.copy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import pandas as pd | ||
from phoenix.trace.span_evaluations import SpanEvaluations | ||
|
||
|
||
def test_span_evaluations_construction():
    """Constructing SpanEvaluations keeps only the result columns."""
    total = 5
    values = list(range(total))
    frame = pd.DataFrame(
        {
            "context.span_id": [f"span_{i}" for i in range(total)],
            "label": values,
            "score": values,
            "random_column": values,
        }
    ).set_index("context.span_id")

    eval_ds = SpanEvaluations(eval_name="my_eval", dataframe=frame)

    # make sure the dataframe only has the needed values
    kept_columns = eval_ds.dataframe.columns
    assert "context.span_id" not in kept_columns
    assert "random_column" not in kept_columns
    assert "score" in kept_columns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.