diff --git a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb index 9b23112c..99bb5d6f 100644 --- a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb +++ b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb @@ -12,21 +12,32 @@ "\n", "from documentation.how_tos.example_data import (\n", " DummyAggregationLogic,\n", - " DummyEvaluationLogic,\n", + " DummyEvaluation,\n", " DummyTask,\n", " example_data,\n", ")\n", - "from intelligence_layer.connectors.studio.studio import StudioClient\n", - "from intelligence_layer.evaluation.benchmark.studio_benchmark import (\n", + "from intelligence_layer.connectors import StudioClient\n", + "from intelligence_layer.evaluation import (\n", + " EvaluationLogic,\n", + " Example,\n", " StudioBenchmarkRepository,\n", - ")\n", - "from intelligence_layer.evaluation.dataset.studio_dataset_repository import (\n", " StudioDatasetRepository,\n", + " SuccessfulExampleOutput,\n", ")\n", "\n", "load_dotenv()\n", "my_example_data = example_data()\n", - "examples = my_example_data.examples" + "examples = my_example_data.examples\n", + "\n", + "\n", + "class DummyEvaluationLogic(EvaluationLogic[str, str, str, DummyEvaluation]):\n", + " def do_evaluate(\n", + " self, example: Example[str, str], *output: SuccessfulExampleOutput[str]\n", + " ) -> DummyEvaluation:\n", + " output_str = \"(\" + (\", \".join(o.output for o in output)) + \")\"\n", + " return DummyEvaluation(\n", + " eval=f\"{example.input}, {example.expected_output}, {output_str} -> evaluation\"\n", + " )" ] }, { @@ -89,7 +100,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-LP3DLT23-py3.12", + "display_name": ".venv", "language": "python", "name": "python3" }, diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py index f97a162e..055b155f 100644 --- a/src/intelligence_layer/evaluation/__init__.py 
+++ b/src/intelligence_layer/evaluation/__init__.py @@ -23,6 +23,10 @@ from .aggregation.in_memory_aggregation_repository import ( InMemoryAggregationRepository as InMemoryAggregationRepository, ) +from .benchmark.studio_benchmark import StudioBenchmark as StudioBenchmark +from .benchmark.studio_benchmark import ( + StudioBenchmarkRepository as StudioBenchmarkRepository, +) from .dataset.dataset_repository import DatasetRepository as DatasetRepository from .dataset.domain import Dataset as Dataset from .dataset.domain import Example as Example diff --git a/src/intelligence_layer/evaluation/benchmark/get_code.py b/src/intelligence_layer/evaluation/benchmark/get_code.py new file mode 100644 index 00000000..e5f5fde0 --- /dev/null +++ b/src/intelligence_layer/evaluation/benchmark/get_code.py @@ -0,0 +1,69 @@ +"""Utilities for working with IPython/Jupyter notebooks.""" + +import ast +import inspect +import textwrap + +from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic +from intelligence_layer.evaluation.evaluation.evaluator.evaluator import EvaluationLogic + + +class NotInteractiveEnvironmentError(Exception): ... 
def is_running_interactively() -> bool:
    """Check if the code is running in an interactive environment.

    Returns:
        True when IPython can be imported and ``get_ipython()`` reports an
        active shell, False otherwise.
    """
    try:
        from IPython import get_ipython
    except ImportError:
        # IPython cannot be imported, so there is no interactive shell to query.
        return False
    return get_ipython() is not None


def get_notebook_source() -> str:
    """Get the source code of the running notebook.

    Returns:
        All non-empty entries of the shell's ``In`` history (index 0 excluded),
        joined by blank lines in history order.

    Raises:
        NotInteractiveEnvironmentError: If there is no active IPython shell.
        AttributeError: If the shell does not expose a user namespace.
    """
    from IPython import get_ipython

    shell = get_ipython()
    if shell is None:
        raise NotInteractiveEnvironmentError("No active IPython shell found")

    if not hasattr(shell, "user_ns"):
        raise AttributeError("Cannot access user namespace")

    # This is the list of input cells in the notebook.
    in_list = shell.user_ns["In"]

    # Stitch them back into a single "file"; index 0 is skipped
    # (presumably IPython's empty placeholder entry -- confirm).
    return "\n\n".join(cell for cell in in_list[1:] if cell)


def get_class_source(cls: type) -> str:
    """Get the latest source definition of a class in the notebook.

    Args:
        cls: The class whose definition should be looked up by name.

    Returns:
        The source text of the matching ``class`` statement that appears last
        in the stitched notebook source.

    Raises:
        ValueError: If no class with that name is found in the notebook.
    """
    notebook_source = get_notebook_source()
    tree = ast.parse(notebook_source)
    class_name = cls.__name__

    # ast.walk yields nodes in no specified order, so "the last one visited" is
    # not necessarily the last one in the source. Pick the definition with the
    # highest line number instead: cells are stitched in history order, so that
    # is the most recent version of the class.
    definitions = [
        node
        for node in ast.walk(tree)
        if isinstance(node, ast.ClassDef) and node.name == class_name
    ]
    if definitions:
        latest = max(definitions, key=lambda node: node.lineno)
        segment = ast.get_source_segment(notebook_source, latest)
        if segment is not None:
            return segment

    raise ValueError(f"Class '{class_name}' not found in the notebook")


def get_source_notebook_safe(logic: "EvaluationLogic | AggregationLogic") -> str:
    """Return the dedented source of the class of ``logic``.

    ``inspect.getsource`` is tried first; if it fails with ``OSError`` and an
    interactive shell is active, the class is looked up in the notebook input
    history instead.

    Args:
        logic: Instance whose class source should be retrieved.

    Returns:
        The class source with common leading whitespace removed.

    Raises:
        OSError: If the source is unavailable and no interactive shell is active.
        ValueError: If the class cannot be found in the notebook history.
    """
    # In ipython, we can't use inspect.getsource on classes defined in the notebook
    logic_class = type(logic)
    try:
        src = inspect.getsource(logic_class)
    except OSError:
        if not is_running_interactively():
            # No notebook to fall back to: surface the original error rather
            # than failing later on an unbound local.
            raise
        src = get_class_source(logic_class)
    return textwrap.dedent(src)
a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py +++ b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py @@ -1,4 +1,3 @@ -import inspect from collections.abc import Sequence from datetime import datetime from http import HTTPStatus @@ -29,6 +28,7 @@ Benchmark, BenchmarkRepository, ) +from intelligence_layer.evaluation.benchmark.get_code import get_source_notebook_safe from intelligence_layer.evaluation.dataset.domain import ExpectedOutput from intelligence_layer.evaluation.dataset.studio_dataset_repository import ( StudioDatasetRepository, @@ -269,7 +269,7 @@ def create_evaluation_logic_identifier( evaluation_logic=eval_logic, ) return EvaluationLogicIdentifier( - logic=inspect.getsource(type(eval_logic)), + logic=get_source_notebook_safe(eval_logic), input_schema=type_to_schema(evaluator.input_type()), output_schema=type_to_schema(evaluator.output_type()), expected_output_schema=type_to_schema(evaluator.expected_output_type()), @@ -287,7 +287,7 @@ def create_aggregation_logic_identifier( aggregation_logic=aggregation_logic, ) return AggregationLogicIdentifier( - logic=inspect.getsource(type(aggregation_logic)), + logic=get_source_notebook_safe(aggregation_logic), evaluation_schema=type_to_schema(aggregator.evaluation_type()), aggregation_schema=type_to_schema(aggregator.aggregated_evaluation_type()), ) diff --git a/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py index 4c0f4ffc..e9c91b66 100644 --- a/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py @@ -10,9 +10,9 @@ StudioExample, ) from intelligence_layer.core import Input -from intelligence_layer.evaluation import ( +from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository +from intelligence_layer.evaluation.dataset.domain import ( Dataset, - DatasetRepository, 
Example, ExpectedOutput, )