From 72570ed48474f5393bc684a89a4bce74566128ce Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 09:51:24 +0100 Subject: [PATCH 1/7] put tests together --- .../src/pinecone_haystack/document_store.py | 5 +++-- integrations/pinecone/tests/test_count.py | 7 ------- integrations/pinecone/tests/test_delete.py | 7 ------- .../pinecone/tests/test_document_store.py | 15 ++++++++++++- integrations/pinecone/tests/test_write.py | 21 ------------------- 5 files changed, 17 insertions(+), 38 deletions(-) delete mode 100644 integrations/pinecone/tests/test_count.py delete mode 100644 integrations/pinecone/tests/test_delete.py delete mode 100644 integrations/pinecone/tests/test_write.py diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 2c089f9b3..0c8d65e6c 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -164,7 +164,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: if not filters: # in this case, we try to return all documents but Pinecone has some limits - documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT) + documents = self._embedding_retrieval(query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT) for doc in documents: doc.score = None @@ -190,6 +190,7 @@ def _embedding_retrieval( self, query_embedding: List[float], *, + namespace: Optional[str]=None, filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented) top_k: int = 10, ) -> List[Document]: @@ -214,7 +215,7 @@ def _embedding_retrieval( result = self._index.query( vector=query_embedding, top_k=top_k, - namespace=self.namespace, + namespace=namespace, include_values=True, include_metadata=True, ) diff --git a/integrations/pinecone/tests/test_count.py b/integrations/pinecone/tests/test_count.py deleted file mode 100644 index 02462d422..000000000 --- a/integrations/pinecone/tests/test_count.py +++ /dev/null @@ -1,7 +0,0 @@ -from haystack.testing.document_store import ( - CountDocumentsTest, -) - - -class TestCountDocuments(CountDocumentsTest): - ... diff --git a/integrations/pinecone/tests/test_delete.py b/integrations/pinecone/tests/test_delete.py deleted file mode 100644 index 88b145704..000000000 --- a/integrations/pinecone/tests/test_delete.py +++ /dev/null @@ -1,7 +0,0 @@ -from haystack.testing.document_store import ( - DeleteDocumentsTest, -) - - -class TestDeleteDocuments(DeleteDocumentsTest): - ... diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index a0a523991..01017d67f 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -3,11 +3,24 @@ import numpy as np import pytest from haystack import Document +from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest from pinecone_haystack.document_store import PineconeDocumentStore -class TestDocumentStore: +class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest): + def test_write_documents(self, document_store: PineconeDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): + ... + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): + ... + @patch("pinecone_haystack.document_store.pinecone") def test_init(self, mock_pinecone): mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py deleted file mode 100644 index 7c04a93be..000000000 --- a/integrations/pinecone/tests/test_write.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest -from haystack import Document -from haystack.testing.document_store import ( - WriteDocumentsTest, -) - -from pinecone_haystack.document_store import PineconeDocumentStore - - -class TestWriteDocuments(WriteDocumentsTest): - def test_write_documents(self, document_store: PineconeDocumentStore): - docs = [Document(id="1")] - assert document_store.write_documents(docs) == 1 - - @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): - ... - - @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): - ... From 2e690e4a9d8d220f3705373987acc52902a50c42 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 09:51:41 +0100 Subject: [PATCH 2/7] format --- .../pinecone/src/pinecone_haystack/document_store.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 0c8d65e6c..b7133e757 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -164,7 +164,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: if not filters: # in this case, we try to return all documents but Pinecone has some limits - documents = self._embedding_retrieval(query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT) + documents = self._embedding_retrieval( + query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT + ) for doc in documents: doc.score = None @@ -190,7 +192,7 @@ def _embedding_retrieval( self, query_embedding: List[float], *, - namespace: Optional[str]=None, + namespace: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented) top_k: int = 10, ) -> List[Document]: From 9437c02e69004f9e6b78d670a83e7137feedee89 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 09:59:01 +0100 Subject: [PATCH 3/7] add fallback for namespace in _embedding_retrieval --- integrations/pinecone/src/pinecone_haystack/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index b7133e757..c81efd58c 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -217,7 +217,7 @@ def _embedding_retrieval( result = self._index.query( vector=query_embedding, top_k=top_k, - namespace=namespace, + namespace=namespace or self.namespace, include_values=True, include_metadata=True, ) From fdfd3e7641cdf386878815a8b9ee5037b86da0df Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 10:21:14 +0100 Subject: [PATCH 4/7] try to parallelize tests --- integrations/pinecone/pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index 506795e7f..9de2561fe 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -48,8 +48,9 @@ dependencies = [ "pytest-xdist", ] [tool.hatch.envs.default.scripts] -test = "pytest {args:tests}" -test-cov = "coverage run -m pytest {args:tests}" +# Pinecone tests are slow (require HTTP requests), so we run them in parallel +test = "pytest -n auto --dist worksteal {args:tests}" +test-cov = "coverage run -m pytest -n auto --dist worksteal {args:tests}" cov-report = [ "- coverage combine", "coverage report", From 8e6f0e63487d1b8e0ee9fd2037de82636fa9b6f9 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 10:31:28 +0100 Subject: [PATCH 5/7] better try --- integrations/pinecone/pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index 9de2561fe..069dba1be 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -49,8 +49,9 @@ dependencies = [ ] [tool.hatch.envs.default.scripts] # Pinecone tests are slow (require HTTP requests), so we run them in parallel -test = "pytest -n auto --dist worksteal {args:tests}" -test-cov = "coverage run -m pytest -n auto --dist worksteal {args:tests}" +# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html) +test = "pytest -n auto --maxprocesses=3 {args:tests}" +test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}" cov-report = [ "- coverage combine", "coverage report", From c759d103e656e3a1a815c32ab965e287a3128328 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 11:04:44 +0100 Subject: [PATCH 6/7] labeler --- .github/labeler.yml | 5 +++ .../src/pinecone_haystack/document_store.py | 39 ++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index f2dcedad2..1a41c2caf 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -44,6 +44,11 @@ integration:qdrant: - any-glob-to-any-file: "integrations/qdrant/**/*" - any-glob-to-any-file: ".github/workflows/qdrant.yml" +integration:pinecone: + - changed-files: + - any-glob-to-any-file: "integrations/pinecone/**/*" + - any-glob-to-any-file: ".github/workflows/pinecone.yml" + integration:unstructured-fileconverter: - changed-files: - any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*" diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index c81efd58c..dfa5bd564 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -162,23 +162,31 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D return written_docs def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: - if not filters: - # in this case, we try to return all documents but Pinecone has some limits - documents = self._embedding_retrieval( - query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT - ) - for doc in documents: - doc.score = None + """ + Returns the documents that match the filters provided. - total_docs_number = self.count_documents() - if total_docs_number > TOP_K_LIMIT: - logger.warning( - f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. " - f"However, there are {total_docs_number} documents in the namespace. " - ) - return documents + For a detailed specification of the filters, + refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) - return [] + :param filters: The filters to apply to the document list. + :return: A list of Documents that match the given filters. + """ + + # Pinecone only performs vector similarity search + # here we are querying with a dummy vector and the max compatible top_k + documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT) + + # when simply filtering, we don't want to return any scores + # furthermore, we are querying with a dummy vector, so the scores are meaningless + for doc in documents: + doc.score = None + + if len(documents) == TOP_K_LIMIT: + logger.warning( + f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. " + f"It is likely that there are more matching documents in the document store. " + ) + return documents def delete_documents(self, document_ids: List[str]) -> None: """ @@ -204,6 +212,7 @@ def _embedding_retrieval( `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it. :param query_embedding: Embedding of the query. + :param namespace: Pinecone namespace to query. Defaults the namespace of the document store. :param filters: Filters applied to the retrieved Documents. Defaults to None. :param top_k: Maximum number of Documents to return, defaults to 10 From 017cd750b81d3600cba6884986f68867f4099ff3 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 11:13:20 +0100 Subject: [PATCH 7/7] format fix --- .../pinecone/src/pinecone_haystack/document_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index dfa5bd564..2e4fe5f14 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -165,17 +165,17 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc """ Returns the documents that match the filters provided. - For a detailed specification of the filters, + For a detailed specification of the filters, refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) :param filters: The filters to apply to the document list. :return: A list of Documents that match the given filters. """ - + # Pinecone only performs vector similarity search # here we are querying with a dummy vector and the max compatible top_k documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT) - + # when simply filtering, we don't want to return any scores # furthermore, we are querying with a dummy vector, so the scores are meaningless for doc in documents: