Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: expand chunk index error for large texts #901

Merged
merged 3 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
- `run_dataset` now has a flag `trace_examples_individually` to create `Tracer`s for each example. Defaults to True.

### Fixes
- ControlModels throw warning instead of error in case a not recommended model is selected.
- Cap `LimitedConcurrencyClient.max_concurrency` at 10 and set default to 10.

- ControlModels throw a warning instead of an error in case a not-recommended model is selected.
- The `LimitedConcurrencyClient.max_concurrency` is now capped at 10, which is its default, as the underlying `aleph_alpha_client` does not support more currently.
- ExpandChunk now works properly if the chunk of interest is not at the beginning of a very large document. As a consequence, `MultipleChunkRetrieverQa` now works better with larger documents and should return fewer `None` answers.
### Deprecations
...

Expand Down
14 changes: 10 additions & 4 deletions src/intelligence_layer/examples/search/expand_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class ExpandChunks(Generic[ID], Task[ExpandChunksInput[ID], ExpandChunksOutput])
Args:
retriever: Used to access and return a set of texts.
model: The model's tokenizer is relevant to calculate the correct size of the returned chunks.
max_chunk_size: The maximum chunk size of each returned chunk.
max_chunk_size: The maximum chunk size of each returned chunk in #tokens.
"""

def __init__(
Expand All @@ -51,10 +51,12 @@ def do_run(
) -> ExpandChunksOutput:
text = self._retrieve_text(input.document_id)
large_chunks = self._expand_chunks(
text, input.chunks_found, self._large_chunker
text, 0, input.chunks_found, self._large_chunker
)
nested_expanded_chunks = [
self._expand_chunks(chunk.chunk, input.chunks_found, self._target_chunker)
self._expand_chunks(
chunk.chunk, chunk.start_index, input.chunks_found, self._target_chunker
)
for chunk in large_chunks
]
return ExpandChunksOutput(
Expand All @@ -77,13 +79,17 @@ def _retrieve_text(self, document_id: ID) -> str:
def _expand_chunks(
self,
text: str,
text_start: int,
chunks_found: Sequence[DocumentChunk],
chunker: ChunkWithIndices,
) -> Sequence[ChunkWithStartEndIndices]:
chunked_text = self._chunk_text(text, chunker)

overlapping_chunk_indices = self._overlapping_chunk_indices(
[(c.start_index, c.end_index) for c in chunked_text],
[
(c.start_index + text_start, c.end_index + text_start)
for c in chunked_text
],
[(chunk.start, chunk.end) for chunk in chunks_found],
)

Expand Down
51 changes: 46 additions & 5 deletions tests/examples/search/test_expand_chunk.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from datetime import datetime, timedelta
from typing import Sequence
from typing import Optional, Sequence

from pytest import fixture

from intelligence_layer.connectors import (
BaseRetriever,
Document,
DocumentChunk,
QdrantInMemoryRetriever,
)
from intelligence_layer.connectors.document_index.document_index import DocumentPath
from intelligence_layer.connectors.retrievers.document_index_retriever import (
DocumentIndexRetriever,
DocumentPath,
QdrantInMemoryRetriever,
SearchResult,
)
from intelligence_layer.core import LuminousControlModel, NoOpTracer
from intelligence_layer.examples import ExpandChunks, ExpandChunksInput
Expand Down Expand Up @@ -203,3 +203,44 @@ def test_expand_chunk_is_fast_with_large_document(

assert len(output.chunks) == 1
assert elapsed < timedelta(seconds=10)


class FakeRetriever(BaseRetriever[str]):
    """Minimal retriever stub that serves one fixed document text.

    Search is a no-op; the tests only rely on ``get_full_document`` to
    hand back the configured text regardless of the requested id.
    """

    def __init__(self, result: str) -> None:
        super().__init__()
        self.result = result

    def get_relevant_documents_with_scores(
        self, query: str
    ) -> Sequence[SearchResult[str]]:
        # Search is never exercised by the expand-chunks tests.
        no_hits: Sequence[SearchResult[str]] = []
        return no_hits

    def get_full_document(self, id: str) -> Optional[Document]:
        # The id is ignored: every document resolves to the configured text.
        return Document(text=self.result)


def test_expand_chunks_works_if_chunk_of_interest_is_outside_first_large_chunk(
    luminous_control_model: LuminousControlModel,
    no_op_tracer: NoOpTracer,
) -> None:
    """Expansion succeeds when the found chunk starts after the first large chunk.

    Regression test: previously, chunks located beyond the first large-chunk
    boundary were indexed incorrectly and expansion failed.
    """
    # given: a long document and a chunk-of-interest whose character range
    # begins after the first large chunk boundary
    document_text = " ".join(str(number) for number in range(1000))
    chunk_size_limit = 10
    task_input = ExpandChunksInput(
        document_id="id",
        chunks_found=[
            DocumentChunk(
                text="",
                start=1500,  # outside of first large chunk boundary, which is ~1200
                end=1505,
            )
        ],
    )
    task = ExpandChunks(
        FakeRetriever(result=document_text),
        luminous_control_model,
        max_chunk_size=chunk_size_limit,
    )

    # when
    output = task.run(task_input, no_op_tracer)

    # then: a chunk is returned and is sized to the configured token limit
    assert output.chunks
    first_chunk_words = output.chunks[0].chunk.strip().split(" ")
    assert len(first_chunk_words) == chunk_size_limit