feat(document-index): retrieve chunks of an indexed document

Aleph-Alpha · Dec 16, 2024 · 3d4c75d · 3d4c75d
1 parent 047c4d5
commit 3d4c75d
Show file tree

Hide file tree

Showing 4 changed files with 122 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
 - Add progressbar to the `Runner` to be able to track the `Run`
 - Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
+- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document.
 
 ### Fixes
 ...

diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb
@@ -262,6 +262,25 @@
     "document_index.documents(collection_path)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once a document is indexed, we can also have a look at its chunks:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document_index.chunks(\n",
+    "    DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
+    "    index_name=INDEX,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py
@@ -367,6 +367,38 @@ def _from_search_response(
         )
 
 
+class DocumentChunk(BaseModel):
+    """A chunk of an indexed document.
+
+    Note:
+        Currently only supports text-only documents.
+
+    Args:
+        document_path: Path to the document that the chunk originates from.
+        section: Content of the chunk.
+        position: Position of the chunk within the document.
+    """
+
+    document_path: DocumentPath
+    section: str
+    position: DocumentTextPosition
+
+    @classmethod
+    def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk":
+        """Build a chunk from one entry of a chunks response.
+
+        Args:
+            chunk_response: Decoded JSON object describing a single chunk. The keys
+                `document_path`, `section`, `start` and `end` are read from it.
+
+        Returns:
+            The chunk with its text and its position inside the document.
+
+        Raises:
+            ValueError: If the chunk starts and ends in different items, has no
+                section, or its first section is not text.
+        """
+        # Explicit checks instead of `assert`: assertions are removed when Python
+        # runs with `-O`, which would silently let malformed chunks through.
+        start = chunk_response["start"]
+        end = chunk_response["end"]
+        if start["item"] != end["item"]:
+            raise ValueError(
+                f"Chunk spans multiple items ({start['item']} to {end['item']}) "
+                "and cannot be represented as a single text position."
+            )
+
+        sections = chunk_response["section"]
+        if not sections or sections[0]["modality"] != "text":
+            raise ValueError("Only chunks whose first section is text are supported.")
+
+        return cls(
+            document_path=DocumentPath.from_json(chunk_response["document_path"]),
+            section=sections[0]["text"],
+            position=DocumentTextPosition(
+                item=start["item"],
+                start_position=start["position"],
+                end_position=end["position"],
+            ),
+        )
+
+
 class DocumentIndexError(RuntimeError):
     """Raised in case of any `DocumentIndexClient`-related errors.
 
@@ -880,6 +912,31 @@ def search(
         self._raise_for_status(response)
         return [DocumentSearchResult._from_search_response(r) for r in response.json()]
 
+    def chunks(
+        self, document_path: DocumentPath, index_name: str
+    ) -> Sequence[DocumentChunk]:
+        """Fetch the text chunks that an index holds for a document.
+
+        Chunks with an empty section, or whose first section is not of modality
+        "text", are left out of the result.
+
+        If the document is still indexing, a ResourceNotFound error is raised.
+
+        Args:
+            document_path: Path to the document.
+            index_name: Name of the index to retrieve chunks from.
+
+        Returns:
+            List of the text chunks of the indexed document.
+        """
+        collection_path = document_path.collection_path
+        url_suffix = (
+            f"collections/{collection_path.namespace}/{collection_path.collection}"
+            f"/docs/{document_path.encoded_document_name()}"
+            f"/indexes/{index_name}/chunks"
+        )
+
+        response = requests.get(
+            urljoin(self._base_document_index_url, url_suffix), headers=self.headers
+        )
+        self._raise_for_status(response)
+
+        text_chunks = []
+        for raw_chunk in response.json():
+            section = raw_chunk["section"]
+            # Only text chunks can be converted into a `DocumentChunk`.
+            if len(section) > 0 and section[0]["modality"] == "text":
+                text_chunks.append(DocumentChunk._from_chunk_response(raw_chunk))
+        return text_chunks
+
     def _raise_for_status(self, response: requests.Response) -> None:
         try:
             response.raise_for_status()

diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py
@@ -19,8 +19,13 @@
     InvalidInput,
     ResourceNotFound,
     SearchQuery,
+    SemanticEmbed,
+)
+from tests.conftest_document_index import (
+    random_embedding_config,
+    random_identifier,
+    retry,
 )
-from tests.conftest_document_index import random_embedding_config, retry
 
 
 @pytest.mark.internal
@@ -752,3 +757,42 @@ def test_document_indexes_works(
     document_index: DocumentIndexClient, random_collection: CollectionPath
 ) -> None:
     document_index.progress(random_collection)
+
+
+def test_retrieve_chunks(
+    document_index: DocumentIndexClient,
+    random_collection: CollectionPath,
+    document_index_namespace: str,
+) -> None:
+    """Index a two-item document and expect three chunks to be retrievable."""
+    index_name = random_identifier()
+    document_index.create_index(
+        IndexPath(namespace=document_index_namespace, index=index_name),
+        IndexConfiguration(
+            chunk_size=512,
+            chunk_overlap=0,
+            embedding=SemanticEmbed(
+                representation="asymmetric",
+                model_name="luminous-base",
+            ),
+        ),
+    )
+    document_index.assign_index_to_collection(random_collection, index_name)
+
+    document_path = DocumentPath(
+        collection_path=random_collection,
+        document_name="document-with-chunks",
+    )
+    # With a chunk size of 512 the long first item is split into 2 chunks;
+    # the short second item contributes the third one.
+    long_item = " token" * 750
+    document_index.add_document(
+        document_path,
+        DocumentContents(contents=[long_item, "final chunk"]),
+    )
+
+    # Wrapped in `retry`, so the check is attempted again until the chunks
+    # are available.
+    @retry
+    def assert_three_chunks() -> None:
+        retrieved = document_index.chunks(document_path, index_name)
+        assert len(retrieved) == 3
+
+    assert_three_chunks()