fix: get source code in jupyter notebook for benchmark logics

Aleph-Alpha · Dec 16, 2024 · 0c9a174 · 0c9a174
1 parent 08331e1
commit 0c9a174
Show file tree

Hide file tree

Showing 5 changed files with 96 additions and 12 deletions.
diff --git a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb
@@ -12,21 +12,32 @@
     "\n",
     "from documentation.how_tos.example_data import (\n",
     "    DummyAggregationLogic,\n",
-    "    DummyEvaluationLogic,\n",
+    "    DummyEvaluation,\n",
     "    DummyTask,\n",
     "    example_data,\n",
     ")\n",
-    "from intelligence_layer.connectors.studio.studio import StudioClient\n",
-    "from intelligence_layer.evaluation.benchmark.studio_benchmark import (\n",
+    "from intelligence_layer.connectors import StudioClient\n",
+    "from intelligence_layer.evaluation import (\n",
+    "    EvaluationLogic,\n",
+    "    Example,\n",
     "    StudioBenchmarkRepository,\n",
-    ")\n",
-    "from intelligence_layer.evaluation.dataset.studio_dataset_repository import (\n",
     "    StudioDatasetRepository,\n",
+    "    SuccessfulExampleOutput,\n",
     ")\n",
     "\n",
     "load_dotenv()\n",
     "my_example_data = example_data()\n",
-    "examples = my_example_data.examples"
+    "examples = my_example_data.examples\n",
+    "\n",
+    "\n",
+    "class DummyEvaluationLogic(EvaluationLogic[str, str, str, DummyEvaluation]):\n",
+    "    def do_evaluate(\n",
+    "        self, example: Example[str, str], *output: SuccessfulExampleOutput[str]\n",
+    "    ) -> DummyEvaluation:\n",
+    "        output_str = \"(\" + (\", \".join(o.output for o in output)) + \")\"\n",
+    "        return DummyEvaluation(\n",
+    "            eval=f\"{example.input}, {example.expected_output}, {output_str} -> evaluation\"\n",
+    "        )"
    ]
   },
   {
@@ -89,7 +100,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "intelligence-layer-LP3DLT23-py3.12",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },

diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py
@@ -23,6 +23,10 @@
 from .aggregation.in_memory_aggregation_repository import (
     InMemoryAggregationRepository as InMemoryAggregationRepository,
 )
+from .benchmark.studio_benchmark import StudioBenchmark as StudioBenchmark
+from .benchmark.studio_benchmark import (
+    StudioBenchmarkRepository as StudioBenchmarkRepository,
+)
 from .dataset.dataset_repository import DatasetRepository as DatasetRepository
 from .dataset.domain import Dataset as Dataset
 from .dataset.domain import Example as Example

diff --git a/src/intelligence_layer/evaluation/benchmark/get_code.py b/src/intelligence_layer/evaluation/benchmark/get_code.py
@@ -0,0 +1,69 @@
+"""Utilities for working with IPython/Jupyter notebooks."""
+
+import ast
+import inspect
+import textwrap
+
+from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
+from intelligence_layer.evaluation.evaluation.evaluator.evaluator import EvaluationLogic
+
+
+# Raised by get_notebook_source() when get_ipython() returns None, i.e. there
+# is no IPython shell whose input cells could be read.
+class NotInteractiveEnvironmentError(Exception): ...
+
+
+def is_running_interactively() -> bool:
+    """Check if the code is running in an interactive environment.
+
+    Returns:
+        True if IPython can be imported and ``get_ipython()`` returns something
+        other than None; False if ``get_ipython()`` returns None or IPython is
+        not installed.
+    """
+    try:
+        # Imported here rather than at module level so that a missing IPython
+        # installation is handled by the except clause below.
+        from IPython import get_ipython
+
+        return get_ipython() is not None
+    except ModuleNotFoundError:
+        return False
+
+
+def get_notebook_source() -> str:
+    """Get the source code of the running notebook.
+
+    Returns:
+        All non-empty entries of the shell's ``In`` list except the first one,
+        joined with blank lines into a single string.
+
+    Raises:
+        NotInteractiveEnvironmentError: If ``get_ipython()`` returns None.
+        AttributeError: If the shell object has no ``user_ns`` attribute.
+    """
+    from IPython import get_ipython
+
+    shell = get_ipython()
+    if shell is None:
+        raise NotInteractiveEnvironmentError
+
+    if not hasattr(shell, "user_ns"):
+        raise AttributeError("Cannot access user namespace")
+
+    # This is the list of input cells in the notebook
+    in_list = shell.user_ns["In"]
+
+    # Stitch them back into a single "file"
+    # (index 0 is skipped and empty entries are dropped)
+    full_source = "\n\n".join(cell for cell in in_list[1:] if cell)
+
+    return full_source
+
+
+def get_class_source(cls: type) -> str:
+    """Get the latest source definition of a class in the notebook."""
+    notebook_source = get_notebook_source()
+    tree = ast.parse(notebook_source)
+    class_name = cls.__name__
+
+    # We need to walk the entire tree and get the last one since that's the most version of the cls
+    segment = None
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ClassDef) and node.name == class_name:
+            segment = ast.get_source_segment(notebook_source, node)
+
+    if segment is not None:
+        return segment
+
+    raise ValueError(f"Class '{class_name}' not found in the notebook")
+
+def get_source_notebook_safe(logic: EvaluationLogic | AggregationLogic) -> str:
+    # In ipython, we can't use inspect.getsource on classes defined in the notebook
+    logic_class = type(logic)
+    try:
+        src = inspect.getsource(logic_class)
+    except OSError:
+        if is_running_interactively() and inspect.isclass(logic_class):
+            src = get_class_source(logic_class)
+    return textwrap.dedent(src)
diff --git a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py
@@ -1,4 +1,3 @@
-import inspect
 from collections.abc import Sequence
 from datetime import datetime
 from http import HTTPStatus
@@ -29,6 +28,7 @@
     Benchmark,
     BenchmarkRepository,
 )
+from intelligence_layer.evaluation.benchmark.get_code import get_source_notebook_safe
 from intelligence_layer.evaluation.dataset.domain import ExpectedOutput
 from intelligence_layer.evaluation.dataset.studio_dataset_repository import (
     StudioDatasetRepository,
@@ -269,7 +269,7 @@ def create_evaluation_logic_identifier(
         evaluation_logic=eval_logic,
     )
     return EvaluationLogicIdentifier(
-        logic=inspect.getsource(type(eval_logic)),
+        logic=get_source_notebook_safe(eval_logic),
         input_schema=type_to_schema(evaluator.input_type()),
         output_schema=type_to_schema(evaluator.output_type()),
         expected_output_schema=type_to_schema(evaluator.expected_output_type()),
@@ -287,7 +287,7 @@ def create_aggregation_logic_identifier(
         aggregation_logic=aggregation_logic,
     )
     return AggregationLogicIdentifier(
-        logic=inspect.getsource(type(aggregation_logic)),
+        logic=get_source_notebook_safe(aggregation_logic),
         evaluation_schema=type_to_schema(aggregator.evaluation_type()),
         aggregation_schema=type_to_schema(aggregator.aggregated_evaluation_type()),
     )
diff --git a/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py
@@ -10,9 +10,9 @@
     StudioExample,
 )
 from intelligence_layer.core import Input
-from intelligence_layer.evaluation import (
+from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
+from intelligence_layer.evaluation.dataset.domain import (
     Dataset,
-    DatasetRepository,
     Example,
     ExpectedOutput,
 )