Skip to content

Commit

Permalink
fix: expand chunk index error for large texts (#901)
Browse files Browse the repository at this point in the history
* fix: expand chunk index error for large texts
* docs: add changelog entry
Task: PHS-556
  • Loading branch information
NiklasKoehneckeAA authored Jun 11, 2024
1 parent 12c0c96 commit 19f5ad0
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 12 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
- `run_dataset` now has a flag `trace_examples_individually` to create `Tracer`s for each example. Defaults to True.

### Fixes
- ControlModels throw warning instead of error in case a not recommended model is selected.
- Cap `LimitedConcurrencyClient.max_concurrency` at 10 and set default to 10.

- ControlModels throw a warning instead of an error in case a not-recommended model is selected.
- The `LimitedConcurrencyClient.max_concurrency` is now capped at 10, which is its default, as the underlying `aleph_alpha_client` does not support more currently.
- ExpandChunk now works properly if the chunk of interest is not at the beginning of a very large document. As a consequence, `MultipleChunkRetrieverQa` now works better with larger documents and should return fewer `None` answers.

### Deprecations
...

Expand Down
14 changes: 10 additions & 4 deletions src/intelligence_layer/examples/search/expand_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class ExpandChunks(Generic[ID], Task[ExpandChunksInput[ID], ExpandChunksOutput])
Args:
retriever: Used to access and return a set of texts.
model: The model's tokenizer is relevant to calculate the correct size of the returned chunks.
max_chunk_size: The maximum chunk size of each returned chunk.
max_chunk_size: The maximum chunk size of each returned chunk in #tokens.
"""

def __init__(
Expand All @@ -51,10 +51,12 @@ def do_run(
) -> ExpandChunksOutput:
text = self._retrieve_text(input.document_id)
large_chunks = self._expand_chunks(
text, input.chunks_found, self._large_chunker
text, 0, input.chunks_found, self._large_chunker
)
nested_expanded_chunks = [
self._expand_chunks(chunk.chunk, input.chunks_found, self._target_chunker)
self._expand_chunks(
chunk.chunk, chunk.start_index, input.chunks_found, self._target_chunker
)
for chunk in large_chunks
]
return ExpandChunksOutput(
Expand All @@ -77,13 +79,17 @@ def _retrieve_text(self, document_id: ID) -> str:
def _expand_chunks(
self,
text: str,
text_start: int,
chunks_found: Sequence[DocumentChunk],
chunker: ChunkWithIndices,
) -> Sequence[ChunkWithStartEndIndices]:
chunked_text = self._chunk_text(text, chunker)

overlapping_chunk_indices = self._overlapping_chunk_indices(
[(c.start_index, c.end_index) for c in chunked_text],
[
(c.start_index + text_start, c.end_index + text_start)
for c in chunked_text
],
[(chunk.start, chunk.end) for chunk in chunks_found],
)

Expand Down
51 changes: 46 additions & 5 deletions tests/examples/search/test_expand_chunk.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from datetime import datetime, timedelta
from typing import Sequence
from typing import Optional, Sequence

from pytest import fixture

from intelligence_layer.connectors import (
BaseRetriever,
Document,
DocumentChunk,
QdrantInMemoryRetriever,
)
from intelligence_layer.connectors.document_index.document_index import DocumentPath
from intelligence_layer.connectors.retrievers.document_index_retriever import (
DocumentIndexRetriever,
DocumentPath,
QdrantInMemoryRetriever,
SearchResult,
)
from intelligence_layer.core import LuminousControlModel, NoOpTracer
from intelligence_layer.examples import ExpandChunks, ExpandChunksInput
Expand Down Expand Up @@ -203,3 +203,44 @@ def test_expand_chunk_is_fast_with_large_document(

assert len(output.chunks) == 1
assert elapsed < timedelta(seconds=10)


class FakeRetriever(BaseRetriever[str]):
    """Minimal retriever test double backed by a single fixed document.

    Search always reports no hits; `get_full_document` always hands back the
    text configured at construction time, whatever id is asked for.
    """

    def __init__(self, result: str) -> None:
        super().__init__()
        # The full document text served by `get_full_document`.
        self.result = result

    def get_relevant_documents_with_scores(
        self, query: str
    ) -> Sequence[SearchResult[str]]:
        # Relevance search is irrelevant for these tests — report no matches.
        return []

    def get_full_document(self, id: str) -> Optional[Document]:
        # The fake only ever holds one document, so the id is ignored.
        full_text = self.result
        return Document(text=full_text)


def test_expand_chunks_works_if_chunk_of_interest_is_outside_first_large_chunk(
    luminous_control_model: LuminousControlModel,
    no_op_tracer: NoOpTracer,
) -> None:
    """Expansion must succeed when the found chunk lies past the first large chunk."""
    # A chunk starting at 1500 sits beyond the boundary of the first large
    # chunk (which ends around offset ~1200), the case that previously broke
    # index translation for large documents.
    chunk_past_first_boundary = DocumentChunk(
        text="",
        start=1500,
        end=1505,
    )
    task_input = ExpandChunksInput(
        document_id="id",
        chunks_found=[chunk_past_first_boundary],
    )

    document_text = " ".join(str(number) for number in range(1000))
    chunk_size_limit = 10
    task = ExpandChunks(
        FakeRetriever(result=document_text),
        luminous_control_model,
        max_chunk_size=chunk_size_limit,
    )

    output = task.run(task_input, no_op_tracer)

    # At least one expanded chunk comes back, and it is capped at the
    # configured token limit (tokens here are the space-separated numbers).
    assert len(output.chunks) > 0
    assert len(output.chunks[0].chunk.strip().split(" ")) == chunk_size_limit

0 comments on commit 19f5ad0

Please sign in to comment.