From 72570ed48474f5393bc684a89a4bce74566128ce Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 22 Dec 2023 09:51:24 +0100
Subject: [PATCH 1/7] put tests together

---
 .../src/pinecone_haystack/document_store.py   |  5 +++--
 integrations/pinecone/tests/test_count.py     |  7 -------
 integrations/pinecone/tests/test_delete.py    |  7 -------
 .../pinecone/tests/test_document_store.py     | 15 ++++++++++++-
 integrations/pinecone/tests/test_write.py     | 21 -------------------
 5 files changed, 17 insertions(+), 38 deletions(-)
 delete mode 100644 integrations/pinecone/tests/test_count.py
 delete mode 100644 integrations/pinecone/tests/test_delete.py
 delete mode 100644 integrations/pinecone/tests/test_write.py

diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
index 2c089f9b3..0c8d65e6c 100644
--- a/integrations/pinecone/src/pinecone_haystack/document_store.py
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -164,7 +164,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
         if not filters:
             # in this case, we try to return all documents but Pinecone has some limits
-            documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT)
+            documents = self._embedding_retrieval(query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT)
             for doc in documents:
                 doc.score = None
 
@@ -190,6 +190,7 @@ def _embedding_retrieval(
         self,
         query_embedding: List[float],
         *,
+        namespace: Optional[str]=None,
         filters: Optional[Dict[str, Any]] = None,  # noqa: ARG002 (filters to be implemented)
         top_k: int = 10,
     ) -> List[Document]:
@@ -214,7 +215,7 @@ def _embedding_retrieval(
         result = self._index.query(
             vector=query_embedding,
             top_k=top_k,
-            namespace=self.namespace,
+            namespace=namespace,
             include_values=True,
             include_metadata=True,
         )
diff --git a/integrations/pinecone/tests/test_count.py b/integrations/pinecone/tests/test_count.py
deleted file mode 100644
index 02462d422..000000000
--- a/integrations/pinecone/tests/test_count.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from haystack.testing.document_store import (
-    CountDocumentsTest,
-)
-
-
-class TestCountDocuments(CountDocumentsTest):
-    ...
diff --git a/integrations/pinecone/tests/test_delete.py b/integrations/pinecone/tests/test_delete.py
deleted file mode 100644
index 88b145704..000000000
--- a/integrations/pinecone/tests/test_delete.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from haystack.testing.document_store import (
-    DeleteDocumentsTest,
-)
-
-
-class TestDeleteDocuments(DeleteDocumentsTest):
-    ...
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
index a0a523991..01017d67f 100644
--- a/integrations/pinecone/tests/test_document_store.py
+++ b/integrations/pinecone/tests/test_document_store.py
@@ -3,11 +3,24 @@
 import numpy as np
 import pytest
 from haystack import Document
+from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest
 
 from pinecone_haystack.document_store import PineconeDocumentStore
 
 
-class TestDocumentStore:
+class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest):
+    def test_write_documents(self, document_store: PineconeDocumentStore):
+        docs = [Document(id="1")]
+        assert document_store.write_documents(docs) == 1
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
+        ...
+
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
+    def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
+        ...
+
     @patch("pinecone_haystack.document_store.pinecone")
     def test_init(self, mock_pinecone):
         mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py
deleted file mode 100644
index 7c04a93be..000000000
--- a/integrations/pinecone/tests/test_write.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pytest
-from haystack import Document
-from haystack.testing.document_store import (
-    WriteDocumentsTest,
-)
-
-from pinecone_haystack.document_store import PineconeDocumentStore
-
-
-class TestWriteDocuments(WriteDocumentsTest):
-    def test_write_documents(self, document_store: PineconeDocumentStore):
-        docs = [Document(id="1")]
-        assert document_store.write_documents(docs) == 1
-
-    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
-    def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
-        ...
-
-    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
-    def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
-        ...

From 2e690e4a9d8d220f3705373987acc52902a50c42 Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 22 Dec 2023 09:51:41 +0100
Subject: [PATCH 2/7] format

---
 .../pinecone/src/pinecone_haystack/document_store.py        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
index 0c8d65e6c..b7133e757 100644
--- a/integrations/pinecone/src/pinecone_haystack/document_store.py
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -164,7 +164,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
         if not filters:
             # in this case, we try to return all documents but Pinecone has some limits
-            documents = self._embedding_retrieval(query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT)
+            documents = self._embedding_retrieval(
+                query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT
+            )
             for doc in documents:
                 doc.score = None
 
@@ -190,7 +192,7 @@ def _embedding_retrieval(
         self,
         query_embedding: List[float],
         *,
-        namespace: Optional[str]=None,
+        namespace: Optional[str] = None,
         filters: Optional[Dict[str, Any]] = None,  # noqa: ARG002 (filters to be implemented)
         top_k: int = 10,
     ) -> List[Document]:

From 9437c02e69004f9e6b78d670a83e7137feedee89 Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 22 Dec 2023 09:59:01 +0100
Subject: [PATCH 3/7] add fallback for namespace in _embedding_retrieval

---
 integrations/pinecone/src/pinecone_haystack/document_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
index b7133e757..c81efd58c 100644
--- a/integrations/pinecone/src/pinecone_haystack/document_store.py
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -217,7 +217,7 @@ def _embedding_retrieval(
         result = self._index.query(
             vector=query_embedding,
             top_k=top_k,
-            namespace=namespace,
+            namespace=namespace or self.namespace,
             include_values=True,
             include_metadata=True,
         )

From fdfd3e7641cdf386878815a8b9ee5037b86da0df Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 22 Dec 2023 10:21:14 +0100
Subject: [PATCH 4/7] try to parallelize tests

---
 integrations/pinecone/pyproject.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml
index 506795e7f..9de2561fe 100644
--- a/integrations/pinecone/pyproject.toml
+++ b/integrations/pinecone/pyproject.toml
@@ -48,8 +48,9 @@ dependencies = [
   "pytest-xdist",
 ]
 [tool.hatch.envs.default.scripts]
-test = "pytest {args:tests}"
-test-cov = "coverage run -m pytest {args:tests}"
+# Pinecone tests are slow (require HTTP requests), so we run them in parallel
+test = "pytest -n auto --dist worksteal {args:tests}"
+test-cov = "coverage run -m pytest -n auto --dist worksteal {args:tests}"
 cov-report = [
   "- coverage combine",
   "coverage report",

From 8e6f0e63487d1b8e0ee9fd2037de82636fa9b6f9 Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 22 Dec 2023 10:31:28 +0100
Subject: [PATCH 5/7] better try

---
 integrations/pinecone/pyproject.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml
index 9de2561fe..069dba1be 100644
--- a/integrations/pinecone/pyproject.toml
+++ b/integrations/pinecone/pyproject.toml
@@ -49,8 +49,9 @@ dependencies = [
 ]
 [tool.hatch.envs.default.scripts]
 # Pinecone tests are slow (require HTTP requests), so we run them in parallel
-test = "pytest -n auto --dist worksteal {args:tests}"
-test-cov = "coverage run -m pytest -n auto --dist worksteal {args:tests}"
+# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html)
+test = "pytest -n auto --maxprocesses=3 {args:tests}"
+test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}"
 cov-report = [
   "- coverage combine",
   "coverage report",

From c759d103e656e3a1a815c32ab965e287a3128328 Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 22 Dec 2023 11:04:44 +0100
Subject: [PATCH 6/7] labeler

---
 .github/labeler.yml                           |  5 +++
 .../src/pinecone_haystack/document_store.py   | 39 ++++++++++++-------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index f2dcedad2..1a41c2caf 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -44,6 +44,11 @@ integration:qdrant:
       - any-glob-to-any-file: "integrations/qdrant/**/*"
       - any-glob-to-any-file: ".github/workflows/qdrant.yml"
 
+integration:pinecone:
+  - changed-files:
+      - any-glob-to-any-file: "integrations/pinecone/**/*"
+      - any-glob-to-any-file: ".github/workflows/pinecone.yml"      
+
 integration:unstructured-fileconverter:
   - changed-files:
       - any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*"
diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
index c81efd58c..dfa5bd564 100644
--- a/integrations/pinecone/src/pinecone_haystack/document_store.py
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -162,23 +162,31 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
         return written_docs
 
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
-        if not filters:
-            # in this case, we try to return all documents but Pinecone has some limits
-            documents = self._embedding_retrieval(
-                query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT
-            )
-            for doc in documents:
-                doc.score = None
+        """
+        Returns the documents that match the filters provided.
 
-            total_docs_number = self.count_documents()
-            if total_docs_number > TOP_K_LIMIT:
-                logger.warning(
-                    f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. "
-                    f"However, there are {total_docs_number} documents in the namespace. "
-                )
-            return documents
+        For a detailed specification of the filters, 
+        refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
 
-        return []
+        :param filters: The filters to apply to the document list.
+        :return: A list of Documents that match the given filters.
+        """
+        
+        # Pinecone only performs vector similarity search
+        # here we are querying with a dummy vector and the max compatible top_k
+        documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT)
+        
+        # when simply filtering, we don't want to return any scores
+        # furthermore, we are querying with a dummy vector, so the scores are meaningless
+        for doc in documents:
+            doc.score = None
+
+        if len(documents) == TOP_K_LIMIT:
+            logger.warning(
+                f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. "
+                f"It is likely that there are more matching documents in the document store. "
+            )
+        return documents
 
     def delete_documents(self, document_ids: List[str]) -> None:
         """
@@ -204,6 +212,7 @@ def _embedding_retrieval(
         `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it.
 
         :param query_embedding: Embedding of the query.
+        :param namespace: Pinecone namespace to query. Defaults the namespace of the document store.
         :param filters: Filters applied to the retrieved Documents. Defaults to None.
         :param top_k: Maximum number of Documents to return, defaults to 10
 

From 017cd750b81d3600cba6884986f68867f4099ff3 Mon Sep 17 00:00:00 2001
From: anakin87 <stefanofiorucci@gmail.com>
Date: Fri, 22 Dec 2023 11:13:20 +0100
Subject: [PATCH 7/7] format fix

---
 .../pinecone/src/pinecone_haystack/document_store.py        | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
index dfa5bd564..2e4fe5f14 100644
--- a/integrations/pinecone/src/pinecone_haystack/document_store.py
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -165,17 +165,17 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
         """
         Returns the documents that match the filters provided.
 
-        For a detailed specification of the filters, 
+        For a detailed specification of the filters,
         refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
 
         :param filters: The filters to apply to the document list.
         :return: A list of Documents that match the given filters.
         """
-        
+
         # Pinecone only performs vector similarity search
         # here we are querying with a dummy vector and the max compatible top_k
         documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT)
-        
+
         # when simply filtering, we don't want to return any scores
         # furthermore, we are querying with a dummy vector, so the scores are meaningless
         for doc in documents: