fix: expand chunk index error for large texts
NiklasKoehneckeAA committed Jun 10, 2024
1 parent fcd9908 commit a08d7aa
Showing 2 changed files with 56 additions and 5 deletions.
14 changes: 10 additions & 4 deletions src/intelligence_layer/examples/search/expand_chunks.py
@@ -31,7 +31,7 @@ class ExpandChunks(Generic[ID], Task[ExpandChunksInput[ID], ExpandChunksOutput])
     Args:
         retriever: Used to access and return a set of texts.
         model: The model's tokenizer is relevant to calculate the correct size of the returned chunks.
-        max_chunk_size: The maximum chunk size of each returned chunk.
+        max_chunk_size: The maximum chunk size of each returned chunk in #tokens.
     """

     def __init__(
@@ -51,10 +51,12 @@ def do_run(
     ) -> ExpandChunksOutput:
         text = self._retrieve_text(input.document_id)
         large_chunks = self._expand_chunks(
-            text, input.chunks_found, self._large_chunker
+            text, 0, input.chunks_found, self._large_chunker
         )
         nested_expanded_chunks = [
-            self._expand_chunks(chunk.chunk, input.chunks_found, self._target_chunker)
+            self._expand_chunks(
+                chunk.chunk, chunk.start_index, input.chunks_found, self._target_chunker
+            )
             for chunk in large_chunks
         ]
         return ExpandChunksOutput(
@@ -77,13 +79,17 @@ def _retrieve_text(self, document_id: ID) -> str:
     def _expand_chunks(
         self,
         text: str,
+        text_start: int,
         chunks_found: Sequence[DocumentChunk],
         chunker: ChunkWithIndices,
     ) -> Sequence[ChunkWithStartEndIndices]:
         chunked_text = self._chunk_text(text, chunker)

         overlapping_chunk_indices = self._overlapping_chunk_indices(
-            [(c.start_index, c.end_index) for c in chunked_text],
+            [
+                (c.start_index + text_start, c.end_index + text_start)
+                for c in chunked_text
+            ],
             [(chunk.start, chunk.end) for chunk in chunks_found],
         )

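The core of the fix: `_expand_chunks` previously compared sub-chunk indices, which are relative to whatever text was passed in, against the `chunks_found` boundaries, which are absolute document offsets. For the first large chunk the two coordinate systems coincide (offset 0), so the bug only surfaced once a chunk of interest fell beyond the first large chunk. Below is a minimal sketch of the coordinate shift, using hypothetical helper names rather than the library's actual internals:

    # Minimal sketch of the offset fix; overlaps() and expanded_indices() are
    # illustrative stand-ins, not the intelligence_layer API.

    def overlaps(a_start: int, a_end: int, b_start: int, b_end: int) -> bool:
        # Half-open interval intersection test.
        return a_start < b_end and b_start < a_end

    def expanded_indices(
        sub_chunks: list[tuple[int, int]],    # (start, end) local to the large chunk
        text_start: int,                      # document offset of the large chunk
        chunks_found: list[tuple[int, int]],  # (start, end) in document coordinates
    ) -> list[int]:
        hits = []
        for i, (start, end) in enumerate(sub_chunks):
            # Shift local indices into document coordinates before the overlap test.
            doc_start, doc_end = start + text_start, end + text_start
            if any(overlaps(doc_start, doc_end, s, e) for s, e in chunks_found):
                hits.append(i)
        return hits

    # With text_start = 0 (the pre-fix behavior for nested chunks), a chunk of
    # interest at [1500, 1505) is never matched; with the correct offset it is.
    assert expanded_indices([(0, 100), (100, 200)], 0, [(1500, 1505)]) == []
    assert expanded_indices([(0, 100), (100, 200)], 1400, [(1500, 1505)]) == [1]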
47 changes: 46 additions & 1 deletion tests/examples/search/test_expand_chunk.py
@@ -1,5 +1,5 @@
 from datetime import datetime, timedelta
-from typing import Sequence
+from typing import Optional, Sequence
 
 from pytest import fixture
 
@@ -9,6 +9,10 @@
     QdrantInMemoryRetriever,
 )
 from intelligence_layer.connectors.document_index.document_index import DocumentPath
+from intelligence_layer.connectors.retrievers.base_retriever import (
+    BaseRetriever,
+    SearchResult,
+)
 from intelligence_layer.connectors.retrievers.document_index_retriever import (
     DocumentIndexRetriever,
 )
@@ -203,3 +207,44 @@ def test_expand_chunk_is_fast_with_large_document(
 
     assert len(output.chunks) == 1
     assert elapsed < timedelta(seconds=10)
+
+
+class FakeRetriever(BaseRetriever[str]):
+    def __init__(self, result: str) -> None:
+        super().__init__()
+        self.result = result
+
+    def get_relevant_documents_with_scores(
+        self, query: str
+    ) -> Sequence[SearchResult[str]]:
+        return []
+
+    def get_full_document(self, id: str) -> Optional[Document]:
+        return Document(text=self.result)
+
+
+def test_expand_chunks_works_if_chunk_of_interest_is_outside_first_large_chunk(
+    luminous_control_model: LuminousControlModel,
+    no_op_tracer: NoOpTracer,
+) -> None:
+    # given
+    task_input = ExpandChunksInput(
+        document_id="id",
+        chunks_found=[
+            DocumentChunk(
+                text="",
+                start=1500,  # outside of first large chunk boundary, which is ~1200
+                end=1505,
+            )
+        ],
+    )
+    full_text = " ".join(str(i) for i in range(1000))
+    max_chunk_size = 10
+    expand_chunk_task = ExpandChunks(
+        FakeRetriever(result=full_text),
+        luminous_control_model,
+        max_chunk_size=max_chunk_size,
+    )
+    res = expand_chunk_task.run(task_input, no_op_tracer)
+    assert len(res.chunks) > 0
+    assert len(res.chunks[0].chunk.strip().split(" ")) == max_chunk_size
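The regression test pins exactly this scenario: the chunk of interest starts at document offset 1500, beyond the first large chunk (roughly 1200 tokens, per the inline comment). A short before/after illustration of the comparison the fix changes; the boundary values here are assumed for illustration, not read from the library:

    # Illustration only; the [1200, 2400) boundary of the second large chunk
    # is an assumption based on the test's inline comment.
    def overlaps(a_start: int, a_end: int, b_start: int, b_end: int) -> bool:
        return a_start < b_end and b_start < a_end

    chunk_of_interest = (1500, 1505)  # document coordinates, as in the test
    sub_chunk = (300, 305)            # local coordinates inside the second large chunk

    # Pre-fix: local indices compared against document offsets never intersect,
    # so the expansion silently returned nothing.
    assert not overlaps(*sub_chunk, *chunk_of_interest)

    # Post-fix: shifting by the large chunk's start (text_start=1200) restores the match.
    text_start = 1200
    assert overlaps(sub_chunk[0] + text_start, sub_chunk[1] + text_start,
                    *chunk_of_interest)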
