Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preparing sqlite-vec as a vector store partner #5

Merged
merged 2 commits into from
Aug 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ requests-toolbelt>=1.0.0,<2
rspace_client>=2.5.0,<3
scikit-learn>=1.2.2,<2
simsimd>=4.3.1,<5
sqlite-vec==0.0.1a19
sqlite-vss>=0.1.2,<0.2
streamlit>=1.18.0,<2
sympy>=1.12,<2
Expand Down
5 changes: 5 additions & 0 deletions libs/community/langchain_community/vectorstores/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@
from langchain_community.vectorstores.sklearn import (
SKLearnVectorStore,
)
from langchain_community.vectorstores.sqlitevec import (
SQLiteVec,
)
from langchain_community.vectorstores.sqlitevss import (
SQLiteVSS,
)
Expand Down Expand Up @@ -376,6 +379,7 @@
"Relyt",
"Rockset",
"SKLearnVectorStore",
"SQLiteVec",
"SQLiteVSS",
"ScaNN",
"SemaDB",
Expand Down Expand Up @@ -478,6 +482,7 @@
"Relyt": "langchain_community.vectorstores.relyt",
"Rockset": "langchain_community.vectorstores.rocksetdb",
"SKLearnVectorStore": "langchain_community.vectorstores.sklearn",
"SQLiteVec": "langchain_community.vectorstores.sqlitevec",
"SQLiteVSS": "langchain_community.vectorstores.sqlitevss",
"ScaNN": "langchain_community.vectorstores.scann",
"SemaDB": "langchain_community.vectorstores.semadb",
Expand Down
227 changes: 227 additions & 0 deletions libs/community/langchain_community/vectorstores/sqlitevec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
from __future__ import annotations

import json
import logging
import warnings
from typing import (
TYPE_CHECKING,
Any,
Iterable,
List,
Optional,
Tuple,
Type,
)

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

if TYPE_CHECKING:
import sqlite3

logger = logging.getLogger(__name__)


class SQLiteVec(VectorStore):
    """SQLite with the ``sqlite-vec`` extension as a vector database.

    Texts, JSON-serialized metadata and embeddings live in a regular table;
    a companion ``vec0`` virtual table (kept in sync by a trigger) provides
    the KNN search.

    To use, you should have the ``sqlite-vec`` python package installed.
    Example:
        .. code-block:: python
            from langchain_community.vectorstores import SQLiteVec
            from langchain_community.embeddings.openai import OpenAIEmbeddings
            ...
    """

    def __init__(
        self,
        table: str,
        connection: Optional[sqlite3.Connection],
        embedding: Embeddings,
        db_file: str = "vec.db",
    ):
        """Initialize with a sqlite client that has sqlite-vec loaded.

        Args:
            table: Name of the table storing texts, metadata and embeddings.
            connection: Existing sqlite3 connection to reuse. When falsy, a
                new connection to ``db_file`` is created with the sqlite-vec
                extension loaded and ``sqlite3.Row`` as row factory.
            embedding: Embeddings implementation used for texts and queries.
            db_file: Database file path, used only when ``connection`` is falsy.

        Raises:
            ImportError: If the ``sqlite-vec`` package is not installed.
        """
        try:
            import sqlite_vec  # noqa  # pylint: disable=unused-import
        except ImportError:
            raise ImportError(
                "Could not import sqlite-vec python package. "
                "Please install it with `pip install sqlite-vec`."
            )

        if not connection:
            connection = self.create_connection(db_file)

        if not isinstance(embedding, Embeddings):
            warnings.warn("embeddings input must be Embeddings object.")

        self._connection = connection
        self._table = table
        self._embedding = embedding

        self.create_table_if_not_exists()

    def create_table_if_not_exists(self) -> None:
        """Create the storage table, the vec0 virtual table and the sync trigger.

        All three statements are idempotent (``IF NOT EXISTS``).

        NOTE(review): the trigger name ``embed_text`` does not include the
        table name, so a second SQLiteVec table in the same database file
        would silently fail to get its own trigger — confirm single-table
        usage or suffix the trigger name per table.
        """
        # Table names cannot be bound as parameters; ``self._table`` is
        # interpolated directly, so it must come from trusted code.
        self._connection.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self._table}
            (
              rowid INTEGER PRIMARY KEY AUTOINCREMENT,
              text TEXT,
              metadata BLOB,
              text_embedding BLOB
            )
            ;
            """
        )
        # Dimensionality is fixed at creation time from a probe embedding.
        self._connection.execute(
            f"""
                CREATE VIRTUAL TABLE IF NOT EXISTS vec_{self._table} USING vec0(
                    text_embedding FLOAT[{self.get_dimensionality()}]
                )
                ;
                """
        )
        # Mirror every insert on the base table into the virtual table so the
        # two stay row-aligned via rowid.
        self._connection.execute(
            f"""
                CREATE TRIGGER IF NOT EXISTS embed_text
                AFTER INSERT ON {self._table}
                BEGIN
                    INSERT INTO vec_{self._table}(rowid, text_embedding)
                    VALUES (new.rowid, new.text_embedding)
                    ;
                END;
                """
        )
        self._connection.commit()

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add more texts to the vectorstore index.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            kwargs: vectorstore specific parameters

        Returns:
            The rowids of the inserted rows.
        """
        # Remember the highest pre-existing rowid so the newly inserted ids
        # can be collected afterwards.
        max_id = self._connection.execute(
            f"SELECT max(rowid) as rowid FROM {self._table}"
        ).fetchone()["rowid"]
        if max_id is None:  # no text added yet
            max_id = 0

        texts = list(texts)
        embeds = self._embedding.embed_documents(texts)
        if not metadatas:
            metadatas = [{} for _ in texts]
        data_input = [
            (text, json.dumps(metadata), json.dumps(embed))
            for text, metadata, embed in zip(texts, metadatas, embeds)
        ]
        self._connection.executemany(
            f"INSERT INTO {self._table}(text, metadata, text_embedding) "
            f"VALUES (?,?,?)",
            data_input,
        )
        self._connection.commit()
        # pulling every id we just inserted (bound parameter, not f-string)
        results = self._connection.execute(
            f"SELECT rowid FROM {self._table} WHERE rowid > ?", (max_id,)
        )
        return [row["rowid"] for row in results]

    def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` nearest (Document, distance) pairs for ``embedding``."""
        # ``MATCH`` and ``k`` are bound as parameters; only the trusted table
        # name is interpolated.
        sql_query = f"""
            SELECT
                text,
                metadata,
                distance
            FROM {self._table} e
            INNER JOIN vec_{self._table} v on v.rowid = e.rowid
            WHERE
                v.text_embedding MATCH ?
                AND k = ?
            ORDER BY distance
        """
        cursor = self._connection.cursor()
        cursor.execute(sql_query, (json.dumps(embedding), k))
        results = cursor.fetchall()

        documents = []
        for row in results:
            metadata = json.loads(row["metadata"]) or {}
            doc = Document(page_content=row["text"], metadata=metadata)
            documents.append((doc, row["distance"]))

        return documents

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query."""
        embedding = self._embedding.embed_query(query)
        documents = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k
        )
        return [doc for doc, _ in documents]

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query, with distances."""
        embedding = self._embedding.embed_query(query)
        documents = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k
        )
        return documents

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to the given embedding vector."""
        documents = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k
        )
        return [doc for doc, _ in documents]

    @classmethod
    def from_texts(
        cls: Type[SQLiteVec],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        table: str = "langchain",
        db_file: str = "vec.db",
        **kwargs: Any,
    ) -> SQLiteVec:
        """Return VectorStore initialized from texts and embeddings."""
        connection = cls.create_connection(db_file)
        vec = cls(
            table=table, connection=connection, db_file=db_file, embedding=embedding
        )
        vec.add_texts(texts=texts, metadatas=metadatas)
        return vec

    @staticmethod
    def create_connection(db_file: str) -> sqlite3.Connection:
        """Open ``db_file`` with sqlite-vec loaded and Row as row factory."""
        import sqlite3

        import sqlite_vec

        connection = sqlite3.connect(db_file)
        # Row factory is required: queries elsewhere access columns by name.
        connection.row_factory = sqlite3.Row
        connection.enable_load_extension(True)
        sqlite_vec.load(connection)
        connection.enable_load_extension(False)
        return connection

    def get_dimensionality(self) -> int:
        """
        Function that does a dummy embedding to figure out how many dimensions
        this embedding function returns. Needed for the virtual table DDL.
        """
        dummy_text = "This is a dummy text"
        dummy_embedding = self._embedding.embed_query(dummy_text)
        return len(dummy_embedding)
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from typing import List, Optional

import pytest
from langchain_core.documents import Document

from langchain_community.vectorstores import SQLiteVec
from tests.integration_tests.vectorstores.fake_embeddings import (
FakeEmbeddings,
fake_texts,
)


def _sqlite_vec_from_texts(
    metadatas: Optional[List[dict]] = None, drop: bool = True
) -> SQLiteVec:
    """Build a SQLiteVec store over ``fake_texts`` in an in-memory database.

    Args:
        metadatas: Optional per-text metadata dicts.
        drop: Currently unused; kept for call-site compatibility.
            TODO: remove or honor it (e.g. drop the table before creating).
    """
    return SQLiteVec.from_texts(
        fake_texts,
        FakeEmbeddings(),
        metadatas=metadatas,
        table="test",
        db_file=":memory:",
    )


@pytest.mark.requires("sqlite_vec")  # importable module name, not PyPI dist name
def test_sqlitevec() -> None:
    """Test end to end construction and search."""
    docsearch = _sqlite_vec_from_texts()
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo", metadata={})]


@pytest.mark.requires("sqlite_vec")  # importable module name, not PyPI dist name
def test_sqlitevec_with_score() -> None:
    """Test end to end construction and search with scores and IDs."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": i} for i in range(len(texts))]
    docsearch = _sqlite_vec_from_texts(metadatas=metadatas)
    output = docsearch.similarity_search_with_score("foo", k=3)
    docs = [o[0] for o in output]
    distances = [o[1] for o in output]
    assert docs == [
        Document(page_content="foo", metadata={"page": 0}),
        Document(page_content="bar", metadata={"page": 1}),
        Document(page_content="baz", metadata={"page": 2}),
    ]
    # FakeEmbeddings makes "foo" closest, then "bar", then "baz".
    assert distances[0] < distances[1] < distances[2]


@pytest.mark.requires("sqlite_vec")  # importable module name, not PyPI dist name
def test_sqlitevec_add_extra() -> None:
    """Test adding texts twice and searching over the combined index."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": i} for i in range(len(texts))]
    docsearch = _sqlite_vec_from_texts(metadatas=metadatas)
    docsearch.add_texts(texts, metadatas)
    output = docsearch.similarity_search("foo", k=10)
    # 3 texts from construction + 3 from the explicit add_texts call.
    assert len(output) == 6
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
"Relyt",
"Rockset",
"SKLearnVectorStore",
"SQLiteVec",
"SQLiteVSS",
"ScaNN",
"SemaDB",
Expand Down
Loading