fix: expand chunk index error for large texts
NiklasKoehneckeAA committed Jun 10, 2024
1 parent fcd9908 commit a08d7aa
Showing 2 changed files with 56 additions and 5 deletions.
14 changes: 10 additions & 4 deletions src/intelligence_layer/examples/search/expand_chunks.py
@@ -31,7 +31,7 @@ class ExpandChunks(Generic[ID], Task[ExpandChunksInput[ID], ExpandChunksOutput])
     Args:
         retriever: Used to access and return a set of texts.
         model: The model's tokenizer is relevant to calculate the correct size of the returned chunks.
-        max_chunk_size: The maximum chunk size of each returned chunk.
+        max_chunk_size: The maximum chunk size of each returned chunk in #tokens.
     """

     def __init__(
@@ -51,10 +51,12 @@ def do_run(
     ) -> ExpandChunksOutput:
         text = self._retrieve_text(input.document_id)
         large_chunks = self._expand_chunks(
-            text, input.chunks_found, self._large_chunker
+            text, 0, input.chunks_found, self._large_chunker
         )
         nested_expanded_chunks = [
-            self._expand_chunks(chunk.chunk, input.chunks_found, self._target_chunker)
+            self._expand_chunks(
+                chunk.chunk, chunk.start_index, input.chunks_found, self._target_chunker
+            )
             for chunk in large_chunks
         ]
         return ExpandChunksOutput(
@@ -77,13 +79,17 @@ def _retrieve_text(self, document_id: ID) -> str:
     def _expand_chunks(
         self,
         text: str,
+        text_start: int,
         chunks_found: Sequence[DocumentChunk],
         chunker: ChunkWithIndices,
     ) -> Sequence[ChunkWithStartEndIndices]:
         chunked_text = self._chunk_text(text, chunker)

         overlapping_chunk_indices = self._overlapping_chunk_indices(
-            [(c.start_index, c.end_index) for c in chunked_text],
+            [
+                (c.start_index + text_start, c.end_index + text_start)
+                for c in chunked_text
+            ],
             [(chunk.start, chunk.end) for chunk in chunks_found],
         )

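The core of the fix: `_expand_chunks` previously compared sub-chunk indices, which are relative to whatever text was passed in, against the `chunks_found` boundaries, which are absolute document offsets. For the first large chunk the two coordinate systems coincide (offset 0), so the bug only surfaced once a chunk of interest fell beyond the first large chunk. Below is a minimal sketch of the coordinate shift, using hypothetical helper names rather than the library's actual internals:

    # Minimal sketch of the offset fix; overlaps() and expanded_indices() are
    # illustrative stand-ins, not the intelligence_layer API.

    def overlaps(a_start: int, a_end: int, b_start: int, b_end: int) -> bool:
        # Half-open interval intersection test.
        return a_start < b_end and b_start < a_end

    def expanded_indices(
        sub_chunks: list[tuple[int, int]],    # (start, end) local to the large chunk
        text_start: int,                      # document offset of the large chunk
        chunks_found: list[tuple[int, int]],  # (start, end) in document coordinates
    ) -> list[int]:
        hits = []
        for i, (start, end) in enumerate(sub_chunks):
            # Shift local indices into document coordinates before the overlap test.
            doc_start, doc_end = start + text_start, end + text_start
            if any(overlaps(doc_start, doc_end, s, e) for s, e in chunks_found):
                hits.append(i)
        return hits

    # With text_start = 0 (the pre-fix behavior for nested chunks), a chunk of
    # interest at [1500, 1505) is never matched; with the correct offset it is.
    assert expanded_indices([(0, 100), (100, 200)], 0, [(1500, 1505)]) == []
    assert expanded_indices([(0, 100), (100, 200)], 1400, [(1500, 1505)]) == [1]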
47 changes: 46 additions & 1 deletion tests/examples/search/test_expand_chunk.py
@@ -1,5 +1,5 @@
 from datetime import datetime, timedelta
-from typing import Sequence
+from typing import Optional, Sequence
 
 from pytest import fixture
 
@@ -9,6 +9,10 @@
     QdrantInMemoryRetriever,
 )
 from intelligence_layer.connectors.document_index.document_index import DocumentPath
+from intelligence_layer.connectors.retrievers.base_retriever import (
+    BaseRetriever,
+    SearchResult,
+)
 from intelligence_layer.connectors.retrievers.document_index_retriever import (
     DocumentIndexRetriever,
 )
@@ -203,3 +207,44 @@ def test_expand_chunk_is_fast_with_large_document(
 
     assert len(output.chunks) == 1
     assert elapsed < timedelta(seconds=10)
+
+
+class FakeRetriever(BaseRetriever[str]):
+    def __init__(self, result: str) -> None:
+        super().__init__()
+        self.result = result
+
+    def get_relevant_documents_with_scores(
+        self, query: str
+    ) -> Sequence[SearchResult[str]]:
+        return []
+
+    def get_full_document(self, id: str) -> Optional[Document]:
+        return Document(text=self.result)
+
+
+def test_expand_chunks_works_if_chunk_of_interest_is_outside_first_large_chunk(
+    luminous_control_model: LuminousControlModel,
+    no_op_tracer: NoOpTracer,
+) -> None:
+    # given
+    task_input = ExpandChunksInput(
+        document_id="id",
+        chunks_found=[
+            DocumentChunk(
+                text="",
+                start=1500,  # outside of first large chunk boundary, which is ~1200
+                end=1505,
+            )
+        ],
+    )
+    full_text = " ".join(str(i) for i in range(1000))
+    max_chunk_size = 10
+    expand_chunk_task = ExpandChunks(
+        FakeRetriever(result=full_text),
+        luminous_control_model,
+        max_chunk_size=max_chunk_size,
+    )
+    res = expand_chunk_task.run(task_input, no_op_tracer)
+    assert len(res.chunks) > 0
+    assert len(res.chunks[0].chunk.strip().split(" ")) == max_chunk_size
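The regression test pins exactly this scenario: the chunk of interest starts at document offset 1500, beyond the first large chunk (roughly 1200 tokens, per the inline comment). A short before/after illustration of the comparison the fix changes; the boundary values here are assumed for illustration, not read from the library:

    # Illustration only; the [1200, 2400) boundary of the second large chunk
    # is an assumption based on the test's inline comment.
    def overlaps(a_start: int, a_end: int, b_start: int, b_end: int) -> bool:
        return a_start < b_end and b_start < a_end

    chunk_of_interest = (1500, 1505)  # document coordinates, as in the test
    sub_chunk = (300, 305)            # local coordinates inside the second large chunk

    # Pre-fix: local indices compared against document offsets never intersect,
    # so the expansion silently returned nothing.
    assert not overlaps(*sub_chunk, *chunk_of_interest)

    # Post-fix: shifting by the large chunk's start (text_start=1200) restores the match.
    text_start = 1200
    assert overlaps(sub_chunk[0] + text_start, sub_chunk[1] + text_start,
                    *chunk_of_interest)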
