From b898671b31913d7f4c5fc73e312a323f5d8b67f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 27 May 2024 17:53:35 +0200 Subject: [PATCH] feat: restructure tests according to src structure --- tests/conftest.py | 31 +--- tests/evaluation/aggregation/conftest.py | 27 ++++ .../{ => aggregation}/test_accumulator.py | 0 .../test_aggregation_repository.py | 0 tests/evaluation/aggregation/test_domain.py | 12 ++ .../{ => aggregation}/test_elo_calculator.py | 0 ...est_hugging_face_aggregation_repository.py | 0 tests/evaluation/conftest.py | 140 +++++------------- .../{ => dataset}/test_dataset_repository.py | 2 +- .../test_hugging_face_dataset_repository.py | 0 ...t_single_huggingface_dataset_repository.py | 0 .../evaluation/conftest.py} | 0 .../test_argilla_evaluator.py | 54 ++++++- .../test_async_evaluation_repository.py | 0 .../test_elo_evaluation_logic.py | 0 .../test_evaluation_repository.py | 0 .../{ => evaluation}/test_evaluator.py | 0 .../test_file_evaluation_repository.py | 0 .../{ => evaluation}/test_graders.py | 0 .../test_incremental_evaluator.py | 0 ...t_instruct_comparison_argilla_evaluator.py | 0 .../test_hugging_face_repository.py | 0 .../test_repository_navigator.py | 0 .../{ => run}/test_file_run_repository.py | 0 tests/evaluation/{ => run}/test_run.py | 12 +- .../{ => run}/test_run_repository.py | 2 +- tests/evaluation/{ => run}/test_runner.py | 0 .../{test_domain.py => run/test_trace.py} | 18 +-- .../classify/test_classify.py | 0 .../classify/test_embedding_based_classify.py | 0 .../classify/test_keyword_extract.py | 0 .../classify/test_prompt_based_classify.py | 0 ..._prompt_based_classify_with_definitions.py | 0 tests/{use_cases => examples}/qa/conftest.py | 0 .../qa/test_long_context_qa.py | 0 .../qa/test_multiple_chunk_qa.py | 0 .../qa/test_multiple_chunk_retriever_qa.py | 0 .../qa/test_retriever_based_qa.py | 0 .../qa/test_single_chunk_qa.py | 0 .../search/test_expand_chunk.py | 0 .../search/test_search.py | 0 tests/examples/summarize/__init__.py | 0 .../summarize/conftest.py | 0 .../summarize/test_recursive_summarize.py | 0 .../test_steerable_long_context_summarize.py | 0 .../summarize/test_summarize.py | 0 .../summarize/very_long_text.txt | 0 47 files changed, 134 insertions(+), 164 deletions(-) create mode 100644 tests/evaluation/aggregation/conftest.py rename tests/evaluation/{ => aggregation}/test_accumulator.py (100%) rename tests/evaluation/{ => aggregation}/test_aggregation_repository.py (100%) create mode 100644 tests/evaluation/aggregation/test_domain.py rename tests/evaluation/{ => aggregation}/test_elo_calculator.py (100%) rename tests/evaluation/{ => aggregation}/test_hugging_face_aggregation_repository.py (100%) rename tests/evaluation/{ => dataset}/test_dataset_repository.py (99%) rename tests/evaluation/{ => dataset}/test_hugging_face_dataset_repository.py (100%) rename tests/evaluation/{ => dataset}/test_single_huggingface_dataset_repository.py (100%) rename tests/{use_cases/summarize/__init__.py => evaluation/evaluation/conftest.py} (100%) rename tests/evaluation/{ => evaluation}/test_argilla_evaluator.py (88%) rename tests/evaluation/{ => evaluation}/test_async_evaluation_repository.py (100%) rename tests/evaluation/{ => evaluation}/test_elo_evaluation_logic.py (100%) rename tests/evaluation/{ => evaluation}/test_evaluation_repository.py (100%) rename tests/evaluation/{ => evaluation}/test_evaluator.py (100%) rename tests/evaluation/{ => evaluation}/test_file_evaluation_repository.py (100%) rename tests/evaluation/{ => evaluation}/test_graders.py (100%) rename tests/evaluation/{ => evaluation}/test_incremental_evaluator.py (100%) rename tests/evaluation/{ => evaluation}/test_instruct_comparison_argilla_evaluator.py (100%) rename tests/evaluation/{ => infrastructure}/test_hugging_face_repository.py (100%) rename tests/evaluation/{ => infrastructure}/test_repository_navigator.py (100%) rename tests/evaluation/{ => run}/test_file_run_repository.py (100%) rename tests/evaluation/{ => run}/test_run.py (89%) rename tests/evaluation/{ => run}/test_run_repository.py (99%) rename tests/evaluation/{ => run}/test_runner.py (100%) rename tests/evaluation/{test_domain.py => run/test_trace.py} (72%) rename tests/{use_cases => examples}/classify/test_classify.py (100%) rename tests/{use_cases => examples}/classify/test_embedding_based_classify.py (100%) rename tests/{use_cases => examples}/classify/test_keyword_extract.py (100%) rename tests/{use_cases => examples}/classify/test_prompt_based_classify.py (100%) rename tests/{use_cases => examples}/classify/test_prompt_based_classify_with_definitions.py (100%) rename tests/{use_cases => examples}/qa/conftest.py (100%) rename tests/{use_cases => examples}/qa/test_long_context_qa.py (100%) rename tests/{use_cases => examples}/qa/test_multiple_chunk_qa.py (100%) rename tests/{use_cases => examples}/qa/test_multiple_chunk_retriever_qa.py (100%) rename tests/{use_cases => examples}/qa/test_retriever_based_qa.py (100%) rename tests/{use_cases => examples}/qa/test_single_chunk_qa.py (100%) rename tests/{use_cases => examples}/search/test_expand_chunk.py (100%) rename tests/{use_cases => examples}/search/test_search.py (100%) create mode 100644 tests/examples/summarize/__init__.py rename tests/{use_cases => examples}/summarize/conftest.py (100%) rename tests/{use_cases => examples}/summarize/test_recursive_summarize.py (100%) rename tests/{use_cases => examples}/summarize/test_steerable_long_context_summarize.py (100%) rename tests/{use_cases => examples}/summarize/test_summarize.py (100%) rename tests/{use_cases => examples}/summarize/very_long_text.txt (100%) diff --git a/tests/conftest.py b/tests/conftest.py index 6b740a5c5..419c10f76 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,6 @@ from aleph_alpha_client import Client, Image from dotenv import load_dotenv -from pydantic import BaseModel from pytest import fixture from intelligence_layer.connectors import ( @@ -17,13 +16,7 @@ QdrantInMemoryRetriever, RetrieverType, ) -from intelligence_layer.core import ( - LuminousControlModel, - NoOpTracer, - Task, - TaskSpan, - utc_now, -) +from intelligence_layer.core import LuminousControlModel, NoOpTracer, utc_now from intelligence_layer.evaluation import ( AsyncInMemoryEvaluationRepository, EvaluationOverview, @@ -117,28 +110,6 @@ def to_document(document_chunk: DocumentChunk) -> Document: return Document(text=document_chunk.text, metadata=document_chunk.metadata) -class DummyStringInput(BaseModel): - input: str = "dummy-input" - - -class DummyStringOutput(BaseModel): - output: str = "dummy-output" - - -class DummyStringEvaluation(BaseModel): - evaluation: str = "dummy-evaluation" - - -class DummyStringTask(Task[DummyStringInput, DummyStringOutput]): - def do_run(self, input: DummyStringInput, task_span: TaskSpan) -> DummyStringOutput: - return DummyStringOutput() - - -@fixture -def dummy_string_task() -> DummyStringTask: - return DummyStringTask() - - @fixture def in_memory_dataset_repository() -> InMemoryDatasetRepository: return InMemoryDatasetRepository() diff --git a/tests/evaluation/aggregation/conftest.py b/tests/evaluation/aggregation/conftest.py new file mode 100644 index 000000000..aeb2683f3 --- /dev/null +++ b/tests/evaluation/aggregation/conftest.py @@ -0,0 +1,27 @@ +from pytest import fixture + +from intelligence_layer.core import utc_now +from intelligence_layer.evaluation import AggregationOverview, EvaluationOverview +from tests.evaluation.conftest import DummyAggregatedEvaluation + + +@fixture +def dummy_aggregated_evaluation() -> DummyAggregatedEvaluation: + return DummyAggregatedEvaluation(score=0.5) + + +@fixture +def aggregation_overview( + evaluation_overview: EvaluationOverview, + dummy_aggregated_evaluation: DummyAggregatedEvaluation, +) -> AggregationOverview[DummyAggregatedEvaluation]: + return AggregationOverview( + evaluation_overviews=frozenset([evaluation_overview]), + id="aggregation-id", + start=utc_now(), + end=utc_now(), + successful_evaluation_count=5, + crashed_during_evaluation_count=3, + description="dummy-evaluator", + statistics=dummy_aggregated_evaluation, + ) diff --git a/tests/evaluation/test_accumulator.py b/tests/evaluation/aggregation/test_accumulator.py similarity index 100% rename from tests/evaluation/test_accumulator.py rename to tests/evaluation/aggregation/test_accumulator.py diff --git a/tests/evaluation/test_aggregation_repository.py b/tests/evaluation/aggregation/test_aggregation_repository.py similarity index 100% rename from tests/evaluation/test_aggregation_repository.py rename to tests/evaluation/aggregation/test_aggregation_repository.py diff --git a/tests/evaluation/aggregation/test_domain.py b/tests/evaluation/aggregation/test_domain.py new file mode 100644 index 000000000..bdde1d35f --- /dev/null +++ b/tests/evaluation/aggregation/test_domain.py @@ -0,0 +1,12 @@ +import pytest + +from intelligence_layer.evaluation.aggregation.domain import AggregationOverview +from intelligence_layer.evaluation.evaluation.domain import EvaluationFailed +from tests.evaluation.conftest import DummyAggregatedEvaluation + + +def test_raise_on_exception_for_evaluation_run_overview( + aggregation_overview: AggregationOverview[DummyAggregatedEvaluation], +) -> None: + with pytest.raises(EvaluationFailed): + aggregation_overview.raise_on_evaluation_failure() diff --git a/tests/evaluation/test_elo_calculator.py b/tests/evaluation/aggregation/test_elo_calculator.py similarity index 100% rename from tests/evaluation/test_elo_calculator.py rename to tests/evaluation/aggregation/test_elo_calculator.py diff --git a/tests/evaluation/test_hugging_face_aggregation_repository.py b/tests/evaluation/aggregation/test_hugging_face_aggregation_repository.py similarity index 100% rename from tests/evaluation/test_hugging_face_aggregation_repository.py rename to tests/evaluation/aggregation/test_hugging_face_aggregation_repository.py diff --git a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py index 833d5df79..ad8e7a632 100644 --- a/tests/evaluation/conftest.py +++ b/tests/evaluation/conftest.py @@ -8,18 +8,9 @@ from pydantic import BaseModel from pytest import fixture -from intelligence_layer.connectors import ( - ArgillaClient, - ArgillaEvaluation, - Field, - Question, - RecordData, -) -from intelligence_layer.core import Task, Tracer, utc_now +from intelligence_layer.core import Task, TaskSpan, Tracer from intelligence_layer.evaluation import ( - AggregationOverview, DatasetRepository, - EvaluationOverview, Example, ExampleEvaluation, FileAggregationRepository, @@ -29,12 +20,43 @@ InMemoryRunRepository, Runner, ) -from tests.conftest import DummyStringInput, DummyStringOutput FAIL_IN_EVAL_INPUT = "fail in eval" FAIL_IN_TASK_INPUT = "fail in task" +class DummyStringInput(BaseModel): + input: str = "dummy-input" + + +class DummyStringOutput(BaseModel): + output: str = "dummy-output" + + +class DummyStringEvaluation(BaseModel): + evaluation: str = "dummy-evaluation" + + +class DummyStringTask(Task[DummyStringInput, DummyStringOutput]): + def do_run(self, input: DummyStringInput, task_span: TaskSpan) -> DummyStringOutput: + return DummyStringOutput() + + +@fixture +def dummy_string_task() -> DummyStringTask: + return DummyStringTask() + + +@fixture +def string_dataset_id( + dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]], + in_memory_dataset_repository: DatasetRepository, +) -> str: + return in_memory_dataset_repository.create_dataset( + examples=dummy_string_examples, dataset_name="test-dataset" + ).id + + class DummyTask(Task[str, str]): def do_run(self, input: str, tracer: Tracer) -> str: if input == FAIL_IN_TASK_INPUT: @@ -42,10 +64,6 @@ def do_run(self, input: str, tracer: Tracer) -> str: return input -class DummyStringEvaluation(BaseModel): - same: bool - - class DummyEvaluation(BaseModel): result: str @@ -93,38 +111,6 @@ def file_run_repository(tmp_path: Path) -> FileRunRepository: return FileRunRepository(tmp_path) -@fixture -def string_dataset_id( - dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]], - in_memory_dataset_repository: DatasetRepository, -) -> str: - return in_memory_dataset_repository.create_dataset( - examples=dummy_string_examples, dataset_name="test-dataset" - ).id - - -@fixture -def dummy_aggregated_evaluation() -> DummyAggregatedEvaluation: - return DummyAggregatedEvaluation(score=0.5) - - -@fixture -def aggregation_overview( - evaluation_overview: EvaluationOverview, - dummy_aggregated_evaluation: DummyAggregatedEvaluation, -) -> AggregationOverview[DummyAggregatedEvaluation]: - return AggregationOverview( - evaluation_overviews=frozenset([evaluation_overview]), - id="aggregation-id", - start=utc_now(), - end=utc_now(), - successful_evaluation_count=5, - crashed_during_evaluation_count=3, - description="dummy-evaluator", - statistics=dummy_aggregated_evaluation, - ) - - @fixture def dummy_string_example() -> Example[DummyStringInput, DummyStringOutput]: return Example(input=DummyStringInput(), expected_output=DummyStringOutput()) @@ -150,66 +136,6 @@ def dummy_runner( ) -class StubArgillaClient(ArgillaClient): - _expected_workspace_id: str - _expected_fields: Sequence[Field] - _expected_questions: Sequence[Question] - _datasets: dict[str, list[RecordData]] = {} - _score = 3.0 - - def create_dataset( - self, - workspace_id: str, - dataset_name: str, - fields: Sequence[Field], - questions: Sequence[Question], - ) -> str: - return self.ensure_dataset_exists(workspace_id, dataset_name, fields, questions) - - def ensure_dataset_exists( - self, - workspace_id: str, - dataset_name: str, - fields: Sequence[Field], - questions: Sequence[Question], - ) -> str: - if workspace_id != self._expected_workspace_id: - raise Exception("Incorrect workspace id") - elif fields != self._expected_fields: - raise Exception("Incorrect fields") - elif questions != self._expected_questions: - raise Exception("Incorrect questions") - dataset_id = str(uuid4()) - self._datasets[dataset_id] = [] - return dataset_id - - def add_record(self, dataset_id: str, record: RecordData) -> None: - if dataset_id not in self._datasets: - raise Exception("Add record: dataset not found") - self._datasets[dataset_id].append(record) - - def evaluations(self, dataset_id: str) -> Iterable[ArgillaEvaluation]: - dataset = self._datasets.get(dataset_id) - assert dataset - return [ - ArgillaEvaluation( - example_id=record.example_id, - record_id="ignored", - responses={"human-score": self._score}, - metadata=dict(), - ) - for record in dataset - ] - - def split_dataset(self, dataset_id: str, n_splits: int) -> None: - raise NotImplementedError - - -@fixture -def stub_argilla_client() -> StubArgillaClient: - return StubArgillaClient() - - @fixture() def temp_file_system() -> Iterable[MemoryFileSystem]: mfs = MemoryFileSystem() diff --git a/tests/evaluation/test_dataset_repository.py b/tests/evaluation/dataset/test_dataset_repository.py similarity index 99% rename from tests/evaluation/test_dataset_repository.py rename to tests/evaluation/dataset/test_dataset_repository.py index 6f9ca38f3..00afbd1aa 100644 --- a/tests/evaluation/test_dataset_repository.py +++ b/tests/evaluation/dataset/test_dataset_repository.py @@ -14,7 +14,7 @@ from intelligence_layer.evaluation.dataset.hugging_face_dataset_repository import ( HuggingFaceDatasetRepository, ) -from tests.conftest import DummyStringInput, DummyStringOutput +from tests.evaluation.conftest import DummyStringInput, DummyStringOutput @fixture diff --git a/tests/evaluation/test_hugging_face_dataset_repository.py b/tests/evaluation/dataset/test_hugging_face_dataset_repository.py similarity index 100% rename from tests/evaluation/test_hugging_face_dataset_repository.py rename to tests/evaluation/dataset/test_hugging_face_dataset_repository.py diff --git a/tests/evaluation/test_single_huggingface_dataset_repository.py b/tests/evaluation/dataset/test_single_huggingface_dataset_repository.py similarity index 100% rename from tests/evaluation/test_single_huggingface_dataset_repository.py rename to tests/evaluation/dataset/test_single_huggingface_dataset_repository.py diff --git a/tests/use_cases/summarize/__init__.py b/tests/evaluation/evaluation/conftest.py similarity index 100% rename from tests/use_cases/summarize/__init__.py rename to tests/evaluation/evaluation/conftest.py diff --git a/tests/evaluation/test_argilla_evaluator.py b/tests/evaluation/evaluation/test_argilla_evaluator.py similarity index 88% rename from tests/evaluation/test_argilla_evaluator.py rename to tests/evaluation/evaluation/test_argilla_evaluator.py index 34326c294..78ecec58f 100644 --- a/tests/evaluation/test_argilla_evaluator.py +++ b/tests/evaluation/evaluation/test_argilla_evaluator.py @@ -27,13 +27,63 @@ Runner, SuccessfulExampleOutput, ) -from tests.conftest import ( +from tests.evaluation.conftest import ( DummyStringEvaluation, DummyStringInput, DummyStringOutput, DummyStringTask, ) -from tests.evaluation.conftest import StubArgillaClient + + +class StubArgillaClient(ArgillaClient): + _expected_workspace_id: str + _expected_fields: Sequence[Field] + _expected_questions: Sequence[Question] + _datasets: dict[str, list[RecordData]] = {} + _score = 3.0 + + def ensure_dataset_exists( + self, + workspace_id: str, + dataset_name: str, + fields: Sequence[Field], + questions: Sequence[Question], + ) -> str: + if workspace_id != self._expected_workspace_id: + raise Exception("Incorrect workspace id") + elif fields != self._expected_fields: + raise Exception("Incorrect fields") + elif questions != self._expected_questions: + raise Exception("Incorrect questions") + dataset_id = str(uuid4()) + self._datasets[dataset_id] = [] + return dataset_id + + def add_record(self, dataset_id: str, record: RecordData) -> None: + if dataset_id not in self._datasets: + raise Exception("Add record: dataset not found") + self._datasets[dataset_id].append(record) + + def evaluations(self, dataset_id: str) -> Iterable[ArgillaEvaluation]: + dataset = self._datasets.get(dataset_id) + assert dataset + return [ + ArgillaEvaluation( + example_id=record.example_id, + record_id="ignored", + responses={"human-score": self._score}, + metadata=dict(), + ) + for record in dataset + ] + + def split_dataset(self, dataset_id: str, n_splits: int) -> None: + raise NotImplementedError + + +@fixture +def stub_argilla_client() -> StubArgillaClient: + return StubArgillaClient() class DummyStringTaskArgillaEvaluationLogic( diff --git a/tests/evaluation/test_async_evaluation_repository.py b/tests/evaluation/evaluation/test_async_evaluation_repository.py similarity index 100% rename from tests/evaluation/test_async_evaluation_repository.py rename to tests/evaluation/evaluation/test_async_evaluation_repository.py diff --git a/tests/evaluation/test_elo_evaluation_logic.py b/tests/evaluation/evaluation/test_elo_evaluation_logic.py similarity index 100% rename from tests/evaluation/test_elo_evaluation_logic.py rename to tests/evaluation/evaluation/test_elo_evaluation_logic.py diff --git a/tests/evaluation/test_evaluation_repository.py b/tests/evaluation/evaluation/test_evaluation_repository.py similarity index 100% rename from tests/evaluation/test_evaluation_repository.py rename to tests/evaluation/evaluation/test_evaluation_repository.py diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/evaluation/test_evaluator.py similarity index 100% rename from tests/evaluation/test_evaluator.py rename to tests/evaluation/evaluation/test_evaluator.py diff --git a/tests/evaluation/test_file_evaluation_repository.py b/tests/evaluation/evaluation/test_file_evaluation_repository.py similarity index 100% rename from tests/evaluation/test_file_evaluation_repository.py rename to tests/evaluation/evaluation/test_file_evaluation_repository.py diff --git a/tests/evaluation/test_graders.py b/tests/evaluation/evaluation/test_graders.py similarity index 100% rename from tests/evaluation/test_graders.py rename to tests/evaluation/evaluation/test_graders.py diff --git a/tests/evaluation/test_incremental_evaluator.py b/tests/evaluation/evaluation/test_incremental_evaluator.py similarity index 100% rename from tests/evaluation/test_incremental_evaluator.py rename to tests/evaluation/evaluation/test_incremental_evaluator.py diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py similarity index 100% rename from tests/evaluation/test_instruct_comparison_argilla_evaluator.py rename to tests/evaluation/evaluation/test_instruct_comparison_argilla_evaluator.py diff --git a/tests/evaluation/test_hugging_face_repository.py b/tests/evaluation/infrastructure/test_hugging_face_repository.py similarity index 100% rename from tests/evaluation/test_hugging_face_repository.py rename to tests/evaluation/infrastructure/test_hugging_face_repository.py diff --git a/tests/evaluation/test_repository_navigator.py b/tests/evaluation/infrastructure/test_repository_navigator.py similarity index 100% rename from tests/evaluation/test_repository_navigator.py rename to tests/evaluation/infrastructure/test_repository_navigator.py diff --git a/tests/evaluation/test_file_run_repository.py b/tests/evaluation/run/test_file_run_repository.py similarity index 100% rename from tests/evaluation/test_file_run_repository.py rename to tests/evaluation/run/test_file_run_repository.py diff --git a/tests/evaluation/test_run.py b/tests/evaluation/run/test_run.py similarity index 89% rename from tests/evaluation/test_run.py rename to tests/evaluation/run/test_run.py index 15df23124..c0c4456bd 100644 --- a/tests/evaluation/test_run.py +++ b/tests/evaluation/run/test_run.py @@ -75,11 +75,11 @@ def test_run_evaluation( [ "", "--eval-logic", - "tests.evaluation.test_run.DummyEvaluationLogic", + "tests.evaluation.run.test_run.DummyEvaluationLogic", "--aggregation-logic", - "tests.evaluation.test_run.DummyAggregationLogic", + "tests.evaluation.run.test_run.DummyAggregationLogic", "--task", - "tests.evaluation.test_run.DummyTask", + "tests.evaluation.run.test_run.DummyTask", "--dataset-repository-path", str(dataset_path), "--dataset-id", @@ -112,11 +112,11 @@ def test_run_evaluation_with_task_with_client( [ "", "--eval-logic", - "tests.evaluation.test_run.DummyEvaluationLogic", + "tests.evaluation.run.test_run.DummyEvaluationLogic", "--aggregation-logic", - "tests.evaluation.test_run.DummyAggregationLogic", + "tests.evaluation.run.test_run.DummyAggregationLogic", "--task", - "tests.evaluation.test_run.DummyTaskWithClient", + "tests.evaluation.run.test_run.DummyTaskWithClient", "--dataset-repository-path", str(dataset_path), "--dataset-id", diff --git a/tests/evaluation/test_run_repository.py b/tests/evaluation/run/test_run_repository.py similarity index 99% rename from tests/evaluation/test_run_repository.py rename to tests/evaluation/run/test_run_repository.py index d931f1db8..68c2cb593 100644 --- a/tests/evaluation/test_run_repository.py +++ b/tests/evaluation/run/test_run_repository.py @@ -20,7 +20,7 @@ TaskSpanTrace, ) from intelligence_layer.evaluation.run.domain import FailedExampleRun -from tests.conftest import DummyStringInput +from tests.evaluation.conftest import DummyStringInput test_repository_fixtures = [ "file_run_repository", diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/run/test_runner.py similarity index 100% rename from tests/evaluation/test_runner.py rename to tests/evaluation/run/test_runner.py diff --git a/tests/evaluation/test_domain.py b/tests/evaluation/run/test_trace.py similarity index 72% rename from tests/evaluation/test_domain.py rename to tests/evaluation/run/test_trace.py index df2ef0edc..8047ef644 100644 --- a/tests/evaluation/test_domain.py +++ b/tests/evaluation/run/test_trace.py @@ -1,16 +1,7 @@ -from pytest import raises - from intelligence_layer.core import utc_now from intelligence_layer.core.tracer.in_memory_tracer import InMemoryTracer -from intelligence_layer.evaluation import ( - AggregationOverview, - EvaluationFailed, - LogTrace, - SpanTrace, - TaskSpanTrace, -) +from intelligence_layer.evaluation import LogTrace, SpanTrace, TaskSpanTrace from intelligence_layer.evaluation.run.trace import _to_trace_entry -from tests.evaluation.conftest import DummyAggregatedEvaluation def test_to_trace_entry() -> None: @@ -49,10 +40,3 @@ def test_deserialize_task_trace() -> None: output=["c"], ) assert trace.model_validate_json(trace.model_dump_json()) == trace - - -def test_raise_on_exception_for_evaluation_run_overview( - aggregation_overview: AggregationOverview[DummyAggregatedEvaluation], -) -> None: - with raises(EvaluationFailed): - aggregation_overview.raise_on_evaluation_failure() diff --git a/tests/use_cases/classify/test_classify.py b/tests/examples/classify/test_classify.py similarity index 100% rename from tests/use_cases/classify/test_classify.py rename to tests/examples/classify/test_classify.py diff --git a/tests/use_cases/classify/test_embedding_based_classify.py b/tests/examples/classify/test_embedding_based_classify.py similarity index 100% rename from tests/use_cases/classify/test_embedding_based_classify.py rename to tests/examples/classify/test_embedding_based_classify.py diff --git a/tests/use_cases/classify/test_keyword_extract.py b/tests/examples/classify/test_keyword_extract.py similarity index 100% rename from tests/use_cases/classify/test_keyword_extract.py rename to tests/examples/classify/test_keyword_extract.py diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/examples/classify/test_prompt_based_classify.py similarity index 100% rename from tests/use_cases/classify/test_prompt_based_classify.py rename to tests/examples/classify/test_prompt_based_classify.py diff --git a/tests/use_cases/classify/test_prompt_based_classify_with_definitions.py b/tests/examples/classify/test_prompt_based_classify_with_definitions.py similarity index 100% rename from tests/use_cases/classify/test_prompt_based_classify_with_definitions.py rename to tests/examples/classify/test_prompt_based_classify_with_definitions.py diff --git a/tests/use_cases/qa/conftest.py b/tests/examples/qa/conftest.py similarity index 100% rename from tests/use_cases/qa/conftest.py rename to tests/examples/qa/conftest.py diff --git a/tests/use_cases/qa/test_long_context_qa.py b/tests/examples/qa/test_long_context_qa.py similarity index 100% rename from tests/use_cases/qa/test_long_context_qa.py rename to tests/examples/qa/test_long_context_qa.py diff --git a/tests/use_cases/qa/test_multiple_chunk_qa.py b/tests/examples/qa/test_multiple_chunk_qa.py similarity index 100% rename from tests/use_cases/qa/test_multiple_chunk_qa.py rename to tests/examples/qa/test_multiple_chunk_qa.py diff --git a/tests/use_cases/qa/test_multiple_chunk_retriever_qa.py b/tests/examples/qa/test_multiple_chunk_retriever_qa.py similarity index 100% rename from tests/use_cases/qa/test_multiple_chunk_retriever_qa.py rename to tests/examples/qa/test_multiple_chunk_retriever_qa.py diff --git a/tests/use_cases/qa/test_retriever_based_qa.py b/tests/examples/qa/test_retriever_based_qa.py similarity index 100% rename from tests/use_cases/qa/test_retriever_based_qa.py rename to tests/examples/qa/test_retriever_based_qa.py diff --git a/tests/use_cases/qa/test_single_chunk_qa.py b/tests/examples/qa/test_single_chunk_qa.py similarity index 100% rename from tests/use_cases/qa/test_single_chunk_qa.py rename to tests/examples/qa/test_single_chunk_qa.py diff --git a/tests/use_cases/search/test_expand_chunk.py b/tests/examples/search/test_expand_chunk.py similarity index 100% rename from tests/use_cases/search/test_expand_chunk.py rename to tests/examples/search/test_expand_chunk.py diff --git a/tests/use_cases/search/test_search.py b/tests/examples/search/test_search.py similarity index 100% rename from tests/use_cases/search/test_search.py rename to tests/examples/search/test_search.py diff --git a/tests/examples/summarize/__init__.py b/tests/examples/summarize/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/use_cases/summarize/conftest.py b/tests/examples/summarize/conftest.py similarity index 100% rename from tests/use_cases/summarize/conftest.py rename to tests/examples/summarize/conftest.py diff --git a/tests/use_cases/summarize/test_recursive_summarize.py b/tests/examples/summarize/test_recursive_summarize.py similarity index 100% rename from tests/use_cases/summarize/test_recursive_summarize.py rename to tests/examples/summarize/test_recursive_summarize.py diff --git a/tests/use_cases/summarize/test_steerable_long_context_summarize.py b/tests/examples/summarize/test_steerable_long_context_summarize.py similarity index 100% rename from tests/use_cases/summarize/test_steerable_long_context_summarize.py rename to tests/examples/summarize/test_steerable_long_context_summarize.py diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/examples/summarize/test_summarize.py similarity index 100% rename from tests/use_cases/summarize/test_summarize.py rename to tests/examples/summarize/test_summarize.py diff --git a/tests/use_cases/summarize/very_long_text.txt b/tests/examples/summarize/very_long_text.txt similarity index 100% rename from tests/use_cases/summarize/very_long_text.txt rename to tests/examples/summarize/very_long_text.txt