feat: Add SearchEvaluationLogic and SearchAggregationLogic to evaluate `Search`-use-cases [F13-82] (#711)

* WIP: search eval

* `SearchEvaluationLogic`

* assert 1-indexed ranks

* use 1-indexed rank (instead of 0-based index)

* add search_eval_logic fixture

* fix types

* add searchaggregationlogic

* add search aggregation logic

* Adjust CHANGELOG.md

---------

Co-authored-by: Ivo Schaper <[email protected]>, Niklas Finken <[email protected]>
NickyHavoc authored Apr 8, 2024
1 parent 3d77f4f commit b05d1fe
Showing 6 changed files with 292 additions and 11 deletions.
11 changes: 7 additions & 4 deletions CHANGELOG.md
@@ -1,21 +1,24 @@
# Changelog

## 0.8.1

### Fixes
- fix: Linting for release version

## 0.8.0
### New Features
- feature: Add `SearchEvaluationLogic` and `SearchAggregationLogic` to evaluate `Search`-use-cases

## 0.8.0

### New Features
- feature: Expose start and end index in DocumentChunk
- feature: Add sorted_scores property to `SingleLabelClassifyOutput`.
- feature: Error information is printed to the console on failed runs and evaluations.
- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
- feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
- feature: Added `Runner.failed_runs(..)` and `Evaluator.failed_evaluations(..)` to retrieve all failed run / evaluation lineages
- feature: Added `.successful_example_outputs(..)` and `.failed_example_outputs(..)` to `RunRepository` to match the evaluation repository
- feature: Added optional argument to set an id when creating a `Dataset` via `DatasetRepository.create_dataset(..)`
- feature: Add `Runner.failed_runs(..)` and `Evaluator.failed_evaluations(..)` to retrieve all failed run / evaluation lineages
- feature: Add `.successful_example_outputs(..)` and `.failed_example_outputs(..)` to `RunRepository` to match the evaluation repository
- feature: Add optional argument to set an id when creating a `Dataset` via `DatasetRepository.create_dataset(..)`
- feature: Traces now log exceptions using the `ErrorValue` type.


4 changes: 2 additions & 2 deletions src/intelligence_layer/connectors/__init__.py
@@ -27,10 +27,10 @@
from .limited_concurrency_client import (
LimitedConcurrencyClient as LimitedConcurrencyClient,
)
from .retrievers.base_retriever import BaseRetriever # noqa :F401
from .retrievers.base_retriever import SearchResult # noqa :F401
from .retrievers.base_retriever import BaseRetriever as BaseRetriever
from .retrievers.base_retriever import Document as Document
from .retrievers.base_retriever import DocumentChunk as DocumentChunk
from .retrievers.base_retriever import SearchResult as SearchResult
from .retrievers.document_index_retriever import (
DocumentIndexRetriever as DocumentIndexRetriever,
)
src/intelligence_layer/connectors/retrievers/base_retriever.py
@@ -27,9 +27,9 @@ class DocumentChunk(BaseModel):
"""

text: str
metadata: Any = None
start: int
end: int
metadata: Any = None


ID = TypeVar("ID")
7 changes: 7 additions & 0 deletions src/intelligence_layer/use_cases/__init__.py
@@ -56,8 +56,15 @@
from .qa.single_chunk_qa import SingleChunkQa as SingleChunkQa
from .qa.single_chunk_qa import SingleChunkQaInput as SingleChunkQaInput
from .qa.single_chunk_qa import SingleChunkQaOutput as SingleChunkQaOutput
from .search.search import AggregatedSearchEvaluation as AggregatedSearchEvaluation
from .search.search import ChunkFound as ChunkFound
from .search.search import ExpectedSearchOutput as ExpectedSearchOutput
from .search.search import Search as Search
from .search.search import SearchAggregationLogic as SearchAggregationLogic
from .search.search import SearchEvaluation as SearchEvaluation
from .search.search import SearchEvaluationLogic as SearchEvaluationLogic
from .search.search import SearchInput as SearchInput
from .search.search import SearchOutput as SearchOutput
from .summarize.recursive_summarize import RecursiveSummarize as RecursiveSummarize
from .summarize.recursive_summarize import (
RecursiveSummarizeInput as RecursiveSummarizeInput,
115 changes: 114 additions & 1 deletion src/intelligence_layer/use_cases/search/search.py
@@ -1,4 +1,4 @@
from typing import Generic, Sequence
from typing import Generic, Iterable, Mapping, Optional, Sequence

from pydantic import BaseModel

@@ -8,6 +8,12 @@
SearchResult,
)
from intelligence_layer.core import Task, TaskSpan
from intelligence_layer.evaluation import (
AggregationLogic,
Example,
MeanAccumulator,
SingleOutputEvaluationLogic,
)


class SearchInput(BaseModel):
@@ -67,3 +73,110 @@ def __init__(self, retriever: BaseRetriever[ID]):
def do_run(self, input: SearchInput, task_span: TaskSpan) -> SearchOutput[ID]:
results = self._retriever.get_relevant_documents_with_scores(input.query)
return SearchOutput(results=results)


class ExpectedSearchOutput(BaseModel):
document_id: str
start_idx: int
end_idx: int
origin_chunk: str
answer: str
task_label: str


class SearchEvaluation(BaseModel):
rank: Optional[int]
similarity_score: Optional[float]


class SearchEvaluationLogic(
Generic[ID],
SingleOutputEvaluationLogic[
SearchInput, SearchOutput[ID], ExpectedSearchOutput, SearchEvaluation
],
):
def do_evaluate_single_output(
self,
example: Example[SearchInput, ExpectedSearchOutput],
output: SearchOutput[ID],
) -> SearchEvaluation:
results = output.results

def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
a_start, a_end = a
b_start, b_end = b
return a_start < b_end and b_start < a_end

rank, score = next(
(
(index + 1, result.score)
for index, result in enumerate(results)
if overlaps(
(result.document_chunk.start, result.document_chunk.end),
(
example.expected_output.start_idx,
example.expected_output.end_idx,
),
)
),
(None, None),
)

return SearchEvaluation(rank=rank, similarity_score=score)


class ChunkFound(BaseModel):
found_count: int # found => chunk was within top-k results of retriever
expected_count: int
percentage: float


class AggregatedSearchEvaluation(BaseModel):
mean_score: float
mean_reciprocal_rank: float
mean_top_ks: Mapping[int, float]
chunk_found: ChunkFound


class SearchAggregationLogic(
AggregationLogic[SearchEvaluation, AggregatedSearchEvaluation]
):
def __init__(self, top_ks_to_evaluate: Sequence[int]) -> None:
assert all(top_k > 0 for top_k in top_ks_to_evaluate)
self.top_ks_to_evaluate = top_ks_to_evaluate

def aggregate(
self, evaluations: Iterable[SearchEvaluation]
) -> AggregatedSearchEvaluation:
score_accumulator = MeanAccumulator()
reciprocal_rank_accumulator = MeanAccumulator()
chunk_found_accumulator = MeanAccumulator()
top_k_accumulator = {
top_k: MeanAccumulator() for top_k in self.top_ks_to_evaluate
}

for evaluation in evaluations:
chunk_found = True if evaluation.rank else False
chunk_found_accumulator.add(chunk_found)
if chunk_found:
assert evaluation.similarity_score and evaluation.rank

score_accumulator.add(evaluation.similarity_score)
reciprocal_rank_accumulator.add(1 / evaluation.rank)
for top_k in self.top_ks_to_evaluate:
top_k_accumulator[top_k].add(
1.0 if evaluation.rank <= top_k else 0.0
)

return AggregatedSearchEvaluation(
mean_score=score_accumulator.extract(),
mean_reciprocal_rank=reciprocal_rank_accumulator.extract(),
mean_top_ks={
top_k: acc.extract() for top_k, acc in top_k_accumulator.items()
},
chunk_found=ChunkFound(
found_count=int(chunk_found_accumulator._acc),
expected_count=chunk_found_accumulator._n,
percentage=chunk_found_accumulator.extract(),
),
)
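
For orientation, here is a minimal standalone sketch of how the two new classes might be used together, calling `do_evaluate_single_output` and `aggregate` directly as the tests below do. This is an assumed usage pattern, not part of the commit: the query text and score are made up for illustration, and any wiring through the `Evaluator`/`Aggregator` machinery in `intelligence_layer.evaluation` is deliberately omitted.

# Hypothetical usage sketch: evaluate one retriever output against an expected
# chunk, then aggregate the resulting evaluations. All values are illustrative.
from intelligence_layer.connectors import DocumentChunk, SearchResult
from intelligence_layer.evaluation import Example
from intelligence_layer.use_cases import (
    ExpectedSearchOutput,
    SearchAggregationLogic,
    SearchEvaluationLogic,
    SearchInput,
    SearchOutput,
)

example = Example(
    input=SearchInput(query="example query"),  # illustrative query
    expected_output=ExpectedSearchOutput(
        document_id="1",
        start_idx=0,
        end_idx=5,
        origin_chunk="hallo",
        answer="",
        task_label="",
    ),
)
output = SearchOutput(
    results=[
        SearchResult(
            id="1",
            score=0.5,
            document_chunk=DocumentChunk(text="hallo", start=0, end=5),
        )
    ]
)

evaluation = SearchEvaluationLogic[str]().do_evaluate_single_output(example, output)
# rank is 1-indexed; rank and similarity_score are None if no result overlaps
# the expected span.
print(evaluation.rank, evaluation.similarity_score)

aggregated = SearchAggregationLogic(top_ks_to_evaluate=[1, 3]).aggregate([evaluation])
print(aggregated.mean_reciprocal_rank, aggregated.chunk_found.percentage)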
164 changes: 161 additions & 3 deletions tests/use_cases/search/test_search.py
@@ -1,13 +1,25 @@
from statistics import mean
from typing import Sequence

from pytest import fixture

from intelligence_layer.connectors.retrievers.base_retriever import Document
from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import (
from intelligence_layer.connectors import (
Document,
DocumentChunk,
QdrantInMemoryRetriever,
SearchResult,
)
from intelligence_layer.core import NoOpTracer
from intelligence_layer.use_cases.search.search import Search, SearchInput
from intelligence_layer.evaluation import Example
from intelligence_layer.use_cases import (
ExpectedSearchOutput,
Search,
SearchAggregationLogic,
SearchEvaluation,
SearchEvaluationLogic,
SearchInput,
SearchOutput,
)
from tests.conftest import to_document


@@ -25,6 +37,46 @@ def search(asymmetric_in_memory_retriever: QdrantInMemoryRetriever) -> Search[in
return Search(asymmetric_in_memory_retriever)


@fixture
def expected_output() -> ExpectedSearchOutput:
return ExpectedSearchOutput(
document_id="1",
start_idx=0,
end_idx=5,
origin_chunk="hallo",
answer="",
task_label="",
)


@fixture
def example(
expected_output: ExpectedSearchOutput,
) -> Example[SearchInput, ExpectedSearchOutput]:
return Example(input=SearchInput(query=""), expected_output=expected_output)


@fixture
def search_eval_logic() -> SearchEvaluationLogic[str]:
return SearchEvaluationLogic[str]()


@fixture
def search_evaluations() -> Sequence[SearchEvaluation]:
return [
SearchEvaluation(rank=1, similarity_score=0.7),
SearchEvaluation(rank=3, similarity_score=0.6),
SearchEvaluation(rank=10, similarity_score=0.5),
SearchEvaluation(rank=None, similarity_score=None),
SearchEvaluation(rank=None, similarity_score=None),
]


@fixture
def search_aggregation_logic() -> SearchAggregationLogic:
return SearchAggregationLogic(top_ks_to_evaluate=[1, 3])


def test_search(
search: Search[int],
no_op_tracer: NoOpTracer,
@@ -40,3 +92,109 @@ def test_search(
result.results[0].document_chunk.end
== len(in_memory_retriever_documents[2].text) - 1
)


def test_search_evaluation_logic_works_for_overlapping_output(
example: Example[SearchInput, ExpectedSearchOutput],
search_eval_logic: SearchEvaluationLogic[str],
) -> None:
output = SearchOutput(
results=[
SearchResult(
id="1",
score=0.5,
document_chunk=DocumentChunk(text="llo", start=2, end=5),
)
]
)
eval = search_eval_logic.do_evaluate_single_output(example, output)

assert eval.rank == 1
assert eval.similarity_score == output.results[0].score


def test_search_evaluation_logic_works_for_wholly_included_output(
example: Example[SearchInput, ExpectedSearchOutput],
search_eval_logic: SearchEvaluationLogic[str],
) -> None:
output = SearchOutput(
results=[
SearchResult(
id="1",
score=0.5,
document_chunk=DocumentChunk(text="l", start=2, end=3),
)
]
)
eval = search_eval_logic.do_evaluate_single_output(example, output)

assert eval.rank == 1
assert eval.similarity_score == output.results[0].score


def test_search_evaluation_logic_works_for_identical_ranges(
example: Example[SearchInput, ExpectedSearchOutput],
search_eval_logic: SearchEvaluationLogic[str],
) -> None:
output = SearchOutput(
results=[
SearchResult(
id="1",
score=0.5,
document_chunk=DocumentChunk(text="hallo", start=0, end=5),
)
]
)
eval = search_eval_logic.do_evaluate_single_output(example, output)

assert eval.rank == 1
assert eval.similarity_score == output.results[0].score


def test_search_evaluation_logic_works_for_non_overlapping_output(
example: Example[SearchInput, ExpectedSearchOutput],
search_eval_logic: SearchEvaluationLogic[str],
) -> None:
output = SearchOutput(
results=[
SearchResult(
id="1",
score=0.5,
document_chunk=DocumentChunk(text=" test.", start=5, end=10),
)
]
)
eval = search_eval_logic.do_evaluate_single_output(example, output)

assert not eval.rank
assert not eval.similarity_score


def test_search_aggregation_logic_works(
search_evaluations: Sequence[SearchEvaluation],
search_aggregation_logic: SearchAggregationLogic,
) -> None:
aggregations = search_aggregation_logic.aggregate(search_evaluations)

assert (
aggregations.mean_score
== mean(
[
eval.similarity_score
for eval in search_evaluations
if eval.similarity_score
]
)
== 0.6
)
assert (
round(aggregations.mean_reciprocal_rank, 5)
== round(mean([1 / eval.rank for eval in search_evaluations if eval.rank]), 5)
== round((1 + (1 / 3) + (1 / 10)) / 3, 5)
)
assert aggregations.mean_top_ks
assert aggregations.chunk_found.found_count == 3
assert aggregations.chunk_found.expected_count == len(search_evaluations) == 5
assert aggregations.chunk_found.percentage == 3 / 5
assert aggregations.mean_top_ks[1] == 1 / 3
assert aggregations.mean_top_ks[3] == 2 / 3
