Skip to content

Commit

Permalink
feat(document-index): retrieve chunks of an indexed document
Browse files Browse the repository at this point in the history
  • Loading branch information
TilTheunissenAA authored and NiklasKoehneckeAA committed Dec 16, 2024
1 parent 047c4d5 commit 3d4c75d
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
- Add progress bar to the `Runner` to be able to track the `Run`
- Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document

### Fixes
...
Expand Down
19 changes: 19 additions & 0 deletions src/documentation/document_index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,25 @@
"document_index.documents(collection_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once a document is indexed, we can also have a look at its chunks:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_index.chunks(\n",
" DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
" index_name=INDEX,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
57 changes: 57 additions & 0 deletions src/intelligence_layer/connectors/document_index/document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,38 @@ def _from_search_response(
)


class DocumentChunk(BaseModel):
    """A chunk of a document.

    Note:
        Currently only supports text-only documents.

    Args:
        document_path: Path to the document that the chunk originates from.
        section: Content of the chunk.
        position: Position of the chunk within the document.
    """

    document_path: DocumentPath
    section: str
    position: DocumentTextPosition

    @classmethod
    def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk":
        """Build a chunk from one entry of a chunks response.

        Args:
            chunk_response: Mapping describing a single chunk. The keys
                `document_path`, `section`, `start` and `end` are read from it.

        Returns:
            The chunk described by `chunk_response`.

        Raises:
            ValueError: If the chunk starts and ends in different items, has an
                empty `section`, or its first section is not of modality "text".
        """
        start = chunk_response["start"]
        end = chunk_response["end"]
        sections = chunk_response["section"]

        # Validate with explicit raises rather than `assert`: assertions are
        # stripped when Python runs with `-O`, which would let malformed chunks
        # through silently.
        if start["item"] != end["item"]:
            raise ValueError(
                "Chunk spans multiple items: "
                f"start item {start['item']!r} != end item {end['item']!r}."
            )
        if not sections:
            raise ValueError("Chunk has no section.")
        first_section = sections[0]
        if first_section["modality"] != "text":
            raise ValueError(
                f"Unsupported chunk modality {first_section['modality']!r}; "
                "only 'text' is supported."
            )

        return cls(
            document_path=DocumentPath.from_json(chunk_response["document_path"]),
            section=first_section["text"],
            position=DocumentTextPosition(
                item=start["item"],
                start_position=start["position"],
                end_position=end["position"],
            ),
        )


class DocumentIndexError(RuntimeError):
"""Raised in case of any `DocumentIndexClient`-related errors.
Expand Down Expand Up @@ -880,6 +912,31 @@ def search(
self._raise_for_status(response)
return [DocumentSearchResult._from_search_response(r) for r in response.json()]

def chunks(
    self, document_path: DocumentPath, index_name: str
) -> Sequence[DocumentChunk]:
    """Retrieve all chunks of an indexed document.

    If the document is still indexing, a ResourceNotFound error is raised.

    Only chunks whose first section has the modality "text" are returned;
    chunks with an empty section list or another modality are skipped.

    Args:
        document_path: Path to the document.
        index_name: Name of the index to retrieve chunks from.

    Returns:
        List of all text chunks of the indexed document.
    """
    url_suffix = f"collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}/indexes/{index_name}/chunks"
    response = requests.get(
        urljoin(self._base_document_index_url, url_suffix), headers=self.headers
    )
    self._raise_for_status(response)

    text_chunks = []
    for raw_chunk in response.json():
        sections = raw_chunk["section"]
        # Skip anything that is not a text chunk before attempting to parse it.
        if len(sections) == 0 or sections[0]["modality"] != "text":
            continue
        text_chunks.append(DocumentChunk._from_chunk_response(raw_chunk))
    return text_chunks

def _raise_for_status(self, response: requests.Response) -> None:
try:
response.raise_for_status()
Expand Down
46 changes: 45 additions & 1 deletion tests/connectors/document_index/test_document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@
InvalidInput,
ResourceNotFound,
SearchQuery,
SemanticEmbed,
)
from tests.conftest_document_index import (
random_embedding_config,
random_identifier,
retry,
)
from tests.conftest_document_index import random_embedding_config, retry


@pytest.mark.internal
Expand Down Expand Up @@ -752,3 +757,42 @@ def test_document_indexes_works(
document_index: DocumentIndexClient, random_collection: CollectionPath
) -> None:
document_index.progress(random_collection)


def test_retrieve_chunks(
    document_index: DocumentIndexClient,
    random_collection: CollectionPath,
    document_index_namespace: str,
) -> None:
    """Check that a two-item document is returned as three chunks.

    A fresh index with chunk size 512 is created and assigned to the
    collection, a document is added, and the number of retrieved chunks is
    asserted.
    """
    index_name = random_identifier()
    document_index.create_index(
        IndexPath(namespace=document_index_namespace, index=index_name),
        IndexConfiguration(
            chunk_size=512,
            chunk_overlap=0,
            embedding=SemanticEmbed(
                representation="asymmetric",
                model_name="luminous-base",
            ),
        ),
    )
    document_index.assign_index_to_collection(random_collection, index_name)

    document_path = DocumentPath(
        collection_path=random_collection,
        document_name="document-with-chunks",
    )
    document_index.add_document(
        document_path,
        DocumentContents(
            contents=[
                # because chunk size is 512, this item will be split into 2 chunks
                " token" * 750,
                "final chunk",
            ],
        ),
    )

    # NOTE(review): presumably `retry` re-runs the check until it passes, to
    # cover the window in which the document is still indexing; confirm
    # against the helper's definition.
    @retry
    def assert_three_chunks() -> None:
        retrieved = document_index.chunks(document_path, index_name)
        assert len(retrieved) == 3

    assert_three_chunks()

0 comments on commit 3d4c75d

Please sign in to comment.