Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pgvector - embedding retrieval #298

Merged
merged 6 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@
meta = EXCLUDED.meta
"""

# Names accepted for the `vector_function` argument; anything else is rejected with a ValueError.
VALID_VECTOR_FUNCTIONS = ["cosine_similarity", "inner_product", "l2_distance"]

# Operator class associated with each vector function. The keys must stay in sync with
# VALID_VECTOR_FUNCTIONS: the former "cosine_distance" key was renamed to "cosine_similarity"
# and is no longer accepted, so it is not listed here.
VECTOR_FUNCTION_TO_POSTGRESQL_OPS = {
    "cosine_similarity": "vector_cosine_ops",
    "inner_product": "vector_ip_ops",
    "l2_distance": "vector_l2_ops",
}
Expand All @@ -70,7 +72,7 @@ def __init__(
connection_string: str,
table_name: str = "haystack_documents",
embedding_dimension: int = 768,
vector_function: Literal["cosine_distance", "inner_product", "l2_distance"] = "cosine_distance",
vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity",
recreate_table: bool = False,
search_strategy: Literal["exact_nearest_neighbor", "hnsw"] = "exact_nearest_neighbor",
hnsw_recreate_index_if_exists: bool = False,
Expand All @@ -87,12 +89,23 @@ def __init__(
:param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents".
:param embedding_dimension: The dimension of the embedding. Defaults to 768.
:param vector_function: The similarity function to use when searching for similar embeddings.
Defaults to "cosine_distance". Set it to one of the following values:
:type vector_function: Literal["cosine_distance", "inner_product", "l2_distance"]
Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and
higher scores indicate greater similarity between the documents.
"l2_distance" returns the straight-line distance between vectors,
and the most similar documents are the ones with the smallest score.

Important: when using the "hnsw" search strategy, an index will be created that depends on the
`vector_function` passed here. Make sure subsequent queries will keep using the same
vector similarity function in order to take advantage of the index.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:param recreate_table: Whether to recreate the table if it already exists. Defaults to False.
:param search_strategy: The search strategy to use when searching for similar embeddings.
Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy,
which trades off some accuracy for speed; it is recommended for large numbers of documents.

Important: when using the "hnsw" search strategy, an index will be created that depends on the
`vector_function` passed here. Make sure subsequent queries will keep using the same
vector similarity function in order to take advantage of the index.
:type search_strategy: Literal["exact_nearest_neighbor", "hnsw"]
:param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists.
Defaults to False. Only used if search_strategy is set to "hnsw".
Expand All @@ -107,6 +120,9 @@ def __init__(
self.connection_string = connection_string
self.table_name = table_name
self.embedding_dimension = embedding_dimension
if vector_function not in VALID_VECTOR_FUNCTIONS:
msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}"
raise ValueError(msg)
self.vector_function = vector_function
self.recreate_table = recreate_table
self.search_strategy = search_strategy
Expand Down Expand Up @@ -423,3 +439,81 @@ def delete_documents(self, document_ids: List[str]) -> None:
)

self._execute_sql(delete_sql, error_msg="Could not delete documents from PgvectorDocumentStore")

def _embedding_retrieval(
    self,
    query_embedding: List[float],
    *,
    filters: Optional[Dict[str, Any]] = None,
    top_k: int = 10,
    vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
) -> List[Document]:
    """
    Retrieves documents that are most similar to the query embedding using a vector similarity metric.

    This method is not meant to be part of the public interface of
    `PgvectorDocumentStore` and it should not be called directly.
    `PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it.

    :param query_embedding: The embedding to compare the stored documents against. Its length must be
        equal to the `embedding_dimension` of the Document Store.
    :param filters: Optional filters converted to a SQL WHERE clause before ranking.
    :param top_k: Maximum number of documents to return. Defaults to 10.
    :param vector_function: The function used to score the documents. When omitted, the
        `vector_function` configured on the Document Store is used.
    :raises ValueError: If `query_embedding` is empty, has the wrong dimension or contains values that
        cannot be converted to float, or if `vector_function` is not a supported value.
    :return: List of Documents that are most similar to `query_embedding`, most similar first.
    """

    if not query_embedding:
        msg = "query_embedding must be a non-empty list of floats"
        raise ValueError(msg)
    if len(query_embedding) != self.embedding_dimension:
        msg = (
            f"query_embedding dimension ({len(query_embedding)}) does not match PgvectorDocumentStore "
            f"embedding dimension ({self.embedding_dimension})."
        )
        raise ValueError(msg)

    vector_function = vector_function or self.vector_function
    if vector_function not in VALID_VECTOR_FUNCTIONS:
        msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}"
        raise ValueError(msg)

    # The vector is interpolated into the statement as raw SQL, so every element is coerced to
    # float first: only numeric tokens can end up in the query, never arbitrary text.
    try:
        vector_components = ",".join(str(float(el)) for el in query_embedding)
    except (TypeError, ValueError) as exc:
        msg = "query_embedding must be a non-empty list of floats"
        raise ValueError(msg) from exc

    # the vector must be a string with this format: "'[3,1,2]'"
    query_embedding_for_postgres = f"'[{vector_components}]'"

    # to compute the scores, we use the approach described in pgvector README:
    # https://github.com/pgvector/pgvector?tab=readme-ov-file#distances
    # cosine_similarity and inner_product are modified from the result of the operator
    score_definitions = {
        "cosine_similarity": f"1 - (embedding <=> {query_embedding_for_postgres}) AS score",
        "inner_product": f"(embedding <#> {query_embedding_for_postgres}) * -1 AS score",
        "l2_distance": f"embedding <-> {query_embedding_for_postgres} AS score",
    }
    score_definition = score_definitions[vector_function]

    sql_select = SQL("SELECT *, {score} FROM {table_name}").format(
        table_name=Identifier(self.table_name),
        score=SQL(score_definition),
    )

    sql_where_clause = SQL("")
    params = ()
    if filters:
        sql_where_clause, params = _convert_filters_to_where_clause_and_params(filters)

    # we always want to return the most similar documents first
    # so when using l2_distance, the sort order must be ASC
    sort_order = "ASC" if vector_function == "l2_distance" else "DESC"

    # PostgreSQL sorts NULLs first for DESC; NULLS LAST keeps rows whose score is NULL
    # (e.g. rows without an embedding) from outranking real matches.
    # NOTE(review): pgvector documents index usage for `ORDER BY embedding <op> vector`;
    # ordering by the computed `score` alias may not use the HNSW index - confirm with EXPLAIN.
    sql_sort = SQL(" ORDER BY score {sort_order} NULLS LAST LIMIT {top_k}").format(
        top_k=SQLLiteral(top_k),
        sort_order=SQL(sort_order),
    )

    sql_query = sql_select + sql_where_clause + sql_sort

    result = self._execute_sql(
        sql_query,
        params,
        error_msg="Could not retrieve documents from PgvectorDocumentStore.",
        cursor=self._dict_cursor,
    )

    records = result.fetchall()
    docs = self._from_pg_to_haystack_documents(records)
    return docs
2 changes: 1 addition & 1 deletion integrations/pgvector/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def document_store(request):
connection_string = "postgresql://postgres:postgres@localhost:5432/postgres"
table_name = f"haystack_{request.node.name}"
embedding_dimension = 768
vector_function = "cosine_distance"
vector_function = "cosine_similarity"
recreate_table = True
search_strategy = "exact_nearest_neighbor"

Expand Down
130 changes: 130 additions & 0 deletions integrations/pgvector/tests/test_embedding_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from typing import List

import pytest
from haystack.dataclasses.document import Document
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from numpy.random import rand


class TestEmbeddingRetrieval:
    """
    Tests for `PgvectorDocumentStore._embedding_retrieval`.

    The fixtures connect to the PostgreSQL server referenced by their connection string,
    so these tests need that server to be reachable.
    """

    @pytest.fixture
    def document_store_w_hnsw_index(self, request):
        """Yield a Document Store using the "hnsw" search strategy; its table is deleted afterwards."""
        connection_string = "postgresql://postgres:postgres@localhost:5432/postgres"
        table_name = f"haystack_hnsw_{request.node.name}"
        embedding_dimension = 768
        vector_function = "cosine_similarity"
        recreate_table = True
        search_strategy = "hnsw"

        store = PgvectorDocumentStore(
            connection_string=connection_string,
            table_name=table_name,
            embedding_dimension=embedding_dimension,
            vector_function=vector_function,
            recreate_table=recreate_table,
            search_strategy=search_strategy,
        )
        yield store

        store.delete_table()

    @pytest.fixture
    def retrieval_store(self, request):
        """
        Resolve the Document Store fixture whose name is supplied through indirect parametrization.

        Parametrizing `document_store` itself with `indirect=True` only hands the strings to that
        fixture as `request.param`; it never instantiates the fixture named by the string, so the
        HNSW store would not be exercised. `getfixturevalue` looks the fixture up by name instead.
        """
        return request.getfixturevalue(request.param)

    @pytest.mark.parametrize("retrieval_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_cosine_similarity(self, retrieval_store: PgvectorDocumentStore):
        query_embedding = [0.1] * 768
        most_similar_embedding = [0.8] * 768
        second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65
        another_embedding = rand(768).tolist()

        docs = [
            Document(content="Most similar document (cosine sim)", embedding=most_similar_embedding),
            Document(content="2nd best document (cosine sim)", embedding=second_best_embedding),
            Document(content="Not very similar document (cosine sim)", embedding=another_embedding),
        ]

        retrieval_store.write_documents(docs)

        results = retrieval_store._embedding_retrieval(
            query_embedding=query_embedding, top_k=2, filters={}, vector_function="cosine_similarity"
        )
        assert len(results) == 2
        assert results[0].content == "Most similar document (cosine sim)"
        assert results[1].content == "2nd best document (cosine sim)"
        # similarity: higher score means more similar
        assert results[0].score > results[1].score

    @pytest.mark.parametrize("retrieval_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_inner_product(self, retrieval_store: PgvectorDocumentStore):
        query_embedding = [0.1] * 768
        most_similar_embedding = [0.8] * 768
        second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65
        another_embedding = rand(768).tolist()

        docs = [
            Document(content="Most similar document (inner product)", embedding=most_similar_embedding),
            Document(content="2nd best document (inner product)", embedding=second_best_embedding),
            Document(content="Not very similar document (inner product)", embedding=another_embedding),
        ]

        retrieval_store.write_documents(docs)

        results = retrieval_store._embedding_retrieval(
            query_embedding=query_embedding, top_k=2, filters={}, vector_function="inner_product"
        )
        assert len(results) == 2
        assert results[0].content == "Most similar document (inner product)"
        assert results[1].content == "2nd best document (inner product)"
        # similarity: higher score means more similar
        assert results[0].score > results[1].score

    @pytest.mark.parametrize("retrieval_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_l2_distance(self, retrieval_store: PgvectorDocumentStore):
        query_embedding = [0.1] * 768
        most_similar_embedding = [0.1] * 765 + [0.15] * 3
        second_best_embedding = [0.1] * 700 + [0.1] * 3 + [0.2] * 65
        another_embedding = rand(768).tolist()

        docs = [
            Document(content="Most similar document (l2 dist)", embedding=most_similar_embedding),
            Document(content="2nd best document (l2 dist)", embedding=second_best_embedding),
            Document(content="Not very similar document (l2 dist)", embedding=another_embedding),
        ]

        retrieval_store.write_documents(docs)

        results = retrieval_store._embedding_retrieval(
            query_embedding=query_embedding, top_k=2, filters={}, vector_function="l2_distance"
        )
        assert len(results) == 2
        assert results[0].content == "Most similar document (l2 dist)"
        assert results[1].content == "2nd best document (l2 dist)"
        # distance: lower score means more similar
        assert results[0].score < results[1].score

    @pytest.mark.parametrize("retrieval_store", ["document_store", "document_store_w_hnsw_index"], indirect=True)
    def test_embedding_retrieval_with_filters(self, retrieval_store: PgvectorDocumentStore):
        docs = [Document(content=f"Document {i}", embedding=rand(768).tolist()) for i in range(10)]

        for i, doc in enumerate(docs):
            doc.meta["meta_field"] = "custom_value" if i % 2 == 0 else "other_value"

        retrieval_store.write_documents(docs)

        query_embedding = [0.1] * 768
        filters = {"field": "meta.meta_field", "operator": "==", "value": "custom_value"}

        results = retrieval_store._embedding_retrieval(query_embedding=query_embedding, top_k=3, filters=filters)
        assert len(results) == 3
        for result in results:
            assert result.meta["meta_field"] == "custom_value"
        assert results[0].score > results[1].score > results[2].score

    def test_empty_query_embedding(self, document_store: PgvectorDocumentStore):
        query_embedding: List[float] = []
        with pytest.raises(ValueError, match="non-empty"):
            document_store._embedding_retrieval(query_embedding=query_embedding)

    def test_query_embedding_wrong_dimension(self, document_store: PgvectorDocumentStore):
        query_embedding = [0.1] * 4
        with pytest.raises(ValueError, match="does not match"):
            document_store._embedding_retrieval(query_embedding=query_embedding)