Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pinecone - review docstrings and API reference #503

Merged
merged 5 commits into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions integrations/pinecone/pydoc/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ loaders:
modules:
[
"haystack_integrations.components.retrievers.pinecone.embedding_retriever",
"haystack_integrations.document_stores.pinecone.document_store",
"haystack_integrations.document_stores.pinecone.errors",
"haystack_integrations.document_stores.pinecone.filters",
shadeMe marked this conversation as resolved.
Show resolved Hide resolved
"haystack_integrations.document_stores.pinecone.document_store"
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,41 @@
@component
class PineconeEmbeddingRetriever:
"""
Retrieves documents from the PineconeDocumentStore, based on their dense embeddings.
Retrieves documents from the `PineconeDocumentStore`, based on their dense embeddings.

Needs to be connected to the PineconeDocumentStore.
Example usage:
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
```python
import os
from haystack.document_stores.types import DuplicatePolicy
from haystack import Document
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore

os.environ["PINECONE_API_KEY"] = "YOUR_PINECONE_API_KEY"
document_store = PineconeDocumentStore(index="my_index", namespace="my_namespace", dimension=768)

documents = [Document(content="There are over 7,000 languages spoken around the world today."),
Document(content="Elephants have been observed to behave in a way that indicates..."),
Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")]

document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)

document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE)

query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", PineconeEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

query = "How many languages are there?"

res = query_pipeline.run({"text_embedder": {"text": query}})
assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today."
```
"""

def __init__(
Expand All @@ -25,13 +57,11 @@ def __init__(
top_k: int = 10,
):
"""
Create the PineconeEmbeddingRetriever component.

:param document_store: An instance of PineconeDocumentStore.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10.
:param document_store: An instance of `PineconeDocumentStore`.
anakin87 marked this conversation as resolved.
Show resolved Hide resolved
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.

:raises ValueError: If `document_store` is not an instance of PineconeDocumentStore.
:raises ValueError: If `document_store` is not an instance of `PineconeDocumentStore`.
"""
if not isinstance(document_store, PineconeDocumentStore):
msg = "document_store must be an instance of PineconeDocumentStore"
Expand All @@ -42,6 +72,11 @@ def __init__(
self.top_k = top_k

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self.filters,
Expand All @@ -51,6 +86,13 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
data["init_parameters"]["document_store"] = default_from_dict(
PineconeDocumentStore, data["init_parameters"]["document_store"]
)
Expand All @@ -59,10 +101,10 @@ def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever":
@component.output_types(documents=List[Document])
def run(self, query_embedding: List[float]):
"""
Retrieve documents from the PineconeDocumentStore, based on their dense embeddings.
Retrieve documents from the `PineconeDocumentStore`, based on their dense embeddings.

:param query_embedding: Embedding of the query.
:return: List of Document similar to `query_embedding`.
:returns: List of Documents similar to `query_embedding`.
"""
docs = self.document_store._embedding_retrieval(
query_embedding=query_embedding,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@


class PineconeDocumentStore:
"""
A Document Store using [Pinecone vector database](https://www.pinecone.io/).
"""

def __init__(
self,
*,
Expand All @@ -42,20 +46,17 @@ def __init__(
It is meant to be connected to a Pinecone index and namespace.

:param api_key: The Pinecone API key. It can be explicitly provided or automatically read from the
environment variable PINECONE_API_KEY (recommended).
:param environment: The Pinecone environment to connect to. Defaults to "us-west1-gcp".
environment variable `PINECONE_API_KEY` (recommended).
:param environment: The Pinecone environment to connect to.
:param index: The Pinecone index to connect to. If the index does not exist, it will be created.
Defaults to "default".
:param namespace: The Pinecone namespace to connect to. If the namespace does not exist, it will be created
at the first write. Defaults to "default".
:param batch_size: The number of documents to write in a single batch. Defaults to 100, as recommended by
Pinecone.
at the first write.
:param batch_size: The number of documents to write in a single batch. When setting this parameter,
consider [documented Pinecone limits](https://docs.pinecone.io/docs/limits).
:param dimension: The dimension of the embeddings. This parameter is only used when creating a new index.
Defaults to 768.
:param index_creation_kwargs: Additional keyword arguments to pass to the index creation method.
For example, you can specify `metric`, `pods`, `replicas`...
shadeMe marked this conversation as resolved.
Show resolved Hide resolved
You can find the full list of supported arguments in the
[API reference](https://docs.pinecone.io/reference/create_index-1).
[API reference](https://docs.pinecone.io/reference/create_index).

"""
resolved_api_key = api_key.resolve_value()
Expand Down Expand Up @@ -95,10 +96,22 @@ def __init__(

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PineconeDocumentStore":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
api_key=self.api_key.to_dict(),
Expand Down Expand Up @@ -128,7 +141,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
:param policy: The duplicate policy to use when writing documents.
PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`.

:return: The number of documents written to the document store.
:returns: The number of documents written to the document store.
"""
if len(documents) > 0 and not isinstance(documents[0], Document):
msg = "param 'documents' must contain a list of objects of type Document"
Expand Down Expand Up @@ -157,7 +170,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)

:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
:returns: A list of Documents that match the given filters.
"""

# Pinecone only performs vector similarity search
Expand All @@ -178,7 +191,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc

def delete_documents(self, document_ids: List[str]) -> None:
"""
Deletes all documents with a matching document_ids from the document store.
Deletes documents that match the provided `document_ids` from the document store.

:param document_ids: the document ids to delete
"""
Expand All @@ -197,14 +210,14 @@ def _embedding_retrieval(

This method is not meant to be part of the public interface of
`PineconeDocumentStore` nor called directly.
`PineconeDenseRetriever` uses this method directly and is the public interface for it.
`PineconeEmbeddingRetriever` uses this method directly and is the public interface for it.

:param query_embedding: Embedding of the query.
:param namespace: Pinecone namespace to query. Defaults to the namespace of the document store.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.

:return: List of Document that are most similar to `query_embedding`
:returns: List of Documents that are most similar to `query_embedding`
"""

if not query_embedding:
Expand Down

This file was deleted.