Skip to content

Commit

Permalink
fix: expand chunk index error for large texts (#901)
Browse files Browse the repository at this point in the history
* fix: expand chunk index error for large texts
* docs: add changelog entry
Task: PHS-556
  • Loading branch information
NiklasKoehneckeAA authored Jun 11, 2024
1 parent 12c0c96 commit 19f5ad0
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 12 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
- `run_dataset` now has a flag `trace_examples_individually` to create `Tracer`s for each example. Defaults to True.

### Fixes
- ControlModels throw warning instead of error in case a not recommended model is selected.
- Cap `LimitedConcurrencyClient.max_concurrency` at 10 and set default to 10.

- ControlModels throw a warning instead of an error in case a not-recommended model is selected.
- The `LimitedConcurrencyClient.max_concurrency` is now capped at 10, which is its default, as the underlying `aleph_alpha_client` does not support more currently.
- ExpandChunk now works properly if the chunk of interest is not at the beginning of a very large document. As a consequence, `MultipleChunkRetrieverQa` now works better with larger documents and should return fewer `None` answers.

### Deprecations
...

Expand Down
14 changes: 10 additions & 4 deletions src/intelligence_layer/examples/search/expand_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class ExpandChunks(Generic[ID], Task[ExpandChunksInput[ID], ExpandChunksOutput])
Args:
retriever: Used to access and return a set of texts.
model: The model's tokenizer is relevant to calculate the correct size of the returned chunks.
max_chunk_size: The maximum chunk size of each returned chunk.
max_chunk_size: The maximum chunk size of each returned chunk in #tokens.
"""

def __init__(
Expand All @@ -51,10 +51,12 @@ def do_run(
) -> ExpandChunksOutput:
text = self._retrieve_text(input.document_id)
large_chunks = self._expand_chunks(
text, input.chunks_found, self._large_chunker
text, 0, input.chunks_found, self._large_chunker
)
nested_expanded_chunks = [
self._expand_chunks(chunk.chunk, input.chunks_found, self._target_chunker)
self._expand_chunks(
chunk.chunk, chunk.start_index, input.chunks_found, self._target_chunker
)
for chunk in large_chunks
]
return ExpandChunksOutput(
Expand All @@ -77,13 +79,17 @@ def _retrieve_text(self, document_id: ID) -> str:
def _expand_chunks(
self,
text: str,
text_start: int,
chunks_found: Sequence[DocumentChunk],
chunker: ChunkWithIndices,
) -> Sequence[ChunkWithStartEndIndices]:
chunked_text = self._chunk_text(text, chunker)

overlapping_chunk_indices = self._overlapping_chunk_indices(
[(c.start_index, c.end_index) for c in chunked_text],
[
(c.start_index + text_start, c.end_index + text_start)
for c in chunked_text
],
[(chunk.start, chunk.end) for chunk in chunks_found],
)

Expand Down
51 changes: 46 additions & 5 deletions tests/examples/search/test_expand_chunk.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from datetime import datetime, timedelta
from typing import Sequence
from typing import Optional, Sequence

from pytest import fixture

from intelligence_layer.connectors import (
BaseRetriever,
Document,
DocumentChunk,
QdrantInMemoryRetriever,
)
from intelligence_layer.connectors.document_index.document_index import DocumentPath
from intelligence_layer.connectors.retrievers.document_index_retriever import (
DocumentIndexRetriever,
DocumentPath,
QdrantInMemoryRetriever,
SearchResult,
)
from intelligence_layer.core import LuminousControlModel, NoOpTracer
from intelligence_layer.examples import ExpandChunks, ExpandChunksInput
Expand Down Expand Up @@ -203,3 +203,44 @@ def test_expand_chunk_is_fast_with_large_document(

assert len(output.chunks) == 1
assert elapsed < timedelta(seconds=10)


class FakeRetriever(BaseRetriever[str]):
    """Minimal retriever test double backed by a single fixed document.

    Search always reports no hits; `get_full_document` always hands back the
    text configured at construction time, whatever id is asked for.
    """

    def __init__(self, result: str) -> None:
        super().__init__()
        # The full document text served by `get_full_document`.
        self.result = result

    def get_relevant_documents_with_scores(
        self, query: str
    ) -> Sequence[SearchResult[str]]:
        # Relevance search is irrelevant for these tests — report no matches.
        return []

    def get_full_document(self, id: str) -> Optional[Document]:
        # The fake only ever holds one document, so the id is ignored.
        full_text = self.result
        return Document(text=full_text)


def test_expand_chunks_works_if_chunk_of_interest_is_outside_first_large_chunk(
    luminous_control_model: LuminousControlModel,
    no_op_tracer: NoOpTracer,
) -> None:
    """Expansion must succeed when the found chunk lies past the first large chunk."""
    # A chunk starting at 1500 sits beyond the boundary of the first large
    # chunk (which ends around offset ~1200), the case that previously broke
    # index translation for large documents.
    chunk_past_first_boundary = DocumentChunk(
        text="",
        start=1500,
        end=1505,
    )
    task_input = ExpandChunksInput(
        document_id="id",
        chunks_found=[chunk_past_first_boundary],
    )

    document_text = " ".join(str(number) for number in range(1000))
    chunk_size_limit = 10
    task = ExpandChunks(
        FakeRetriever(result=document_text),
        luminous_control_model,
        max_chunk_size=chunk_size_limit,
    )

    output = task.run(task_input, no_op_tracer)

    # At least one expanded chunk comes back, and it is capped at the
    # configured token limit (tokens here are the space-separated numbers).
    assert len(output.chunks) > 0
    assert len(output.chunks[0].chunk.strip().split(" ")) == chunk_size_limit

0 comments on commit 19f5ad0

Please sign in to comment.