From 36cf534ac6f83fcd14e3d1bbc536707561509381 Mon Sep 17 00:00:00 2001 From: Trayan Azarov Date: Sun, 21 Jan 2024 14:18:16 +0200 Subject: [PATCH] feat: Added FTS Rebuild command Refs: #4 --- README.md | 17 ++++++++++++ chroma_ops/main.py | 4 +++ chroma_ops/rebuild_fts.py | 49 +++++++++++++++++++++++++++++++++ chroma_ops/scripts/drop_fts.sql | 10 +++++++ chroma_ops/utils.py | 6 ++++ chroma_ops/wal_clean.py | 9 +++--- tests/test_rebuild_fts.py | 39 ++++++++++++++++++++++++++ 7 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 chroma_ops/rebuild_fts.py create mode 100644 chroma_ops/scripts/drop_fts.sql create mode 100644 tests/test_rebuild_fts.py diff --git a/README.md b/README.md index f7571ad..6f3b4a1 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,17 @@ chops export-wal /path/to/persist_dir --out /path/to/export.jsonl > Note: If --out or -o is not specified the command will print the output to stdout. +### Full-Text Search Index Rebuild + +This command rebuilds the full-text search index. + +> Note: **_Why is this needed_**? Users have reported broken FTS indices that result in a error of this +> kind: `no such table: embedding_fulltext_search` + +```bash +chops rebuild-fts /path/to/persist_dir +``` + ### Using Docker > Note: You have to mount your persist directory into the container for the commands to work. @@ -72,3 +83,9 @@ docker run -it --rm -v ./persist_dir:/chroma-data ghcr.io/amikos-tech/chromadb-o ```bash docker run -it --rm -v ./persist_dir:/chroma-data -v ./backup:/backup ghcr.io/amikos-tech/chromadb-ops/chops:latest export-wal /chroma-data --out /backup/export.jsonl ``` + +#### Full-Text Search Index Rebuild + +```bash +docker run -it --rm -v ./persist_dir:/chroma-data ghcr.io/amikos-tech/chromadb-ops/chops:latest rebuild-fts /chroma-data +``` diff --git a/chroma_ops/main.py b/chroma_ops/main.py index b09cb1b..5cb1fa6 100644 --- a/chroma_ops/main.py +++ b/chroma_ops/main.py @@ -1,5 +1,6 @@ import typer +from chroma_ops.rebuild_fts import rebuild_fts from chroma_ops.wal_commit import command as commit_wal_command from chroma_ops.wal_clean import command as clean_wal_command from chroma_ops.wal_export import command as export_wal_command @@ -18,5 +19,8 @@ app.command( name="export-wal", help="Exports the WAL to a jsonl file.", no_args_is_help=True )(export_wal_command) +app.command( + name="rebuild-fts", help="Rebuilds Full Text Search index.", no_args_is_help=True +)(rebuild_fts) if __name__ == "__main__": app() diff --git a/chroma_ops/rebuild_fts.py b/chroma_ops/rebuild_fts.py new file mode 100644 index 0000000..7a32c9b --- /dev/null +++ b/chroma_ops/rebuild_fts.py @@ -0,0 +1,49 @@ +import argparse +import os +import sqlite3 +import sys + +import chromadb +import typer + +from chroma_ops.utils import validate_chroma_persist_dir, read_script + + +def rebuild_fts(persist_dir: str) -> None: + validate_chroma_persist_dir(persist_dir) + sql_file = os.path.join(persist_dir, "chroma.sqlite3") + conn = sqlite3.connect(sql_file) + cursor = conn.cursor() + script = read_script("scripts/drop_fts.sql") + cursor.executescript(script) + cursor.close() + conn.close() + typer.echo("Dropped FTS. Will try to start your Chroma now.", file=sys.stderr) + typer.echo( + "NOTE: Depending on the size of your documents in Chroma it may take a while for Chroma to start up again.", + file=sys.stderr, + color=typer.colors.YELLOW, + ) + try: + chromadb.PersistentClient(path=persist_dir) + typer.echo("Chroma started successfully.", file=sys.stderr) + except Exception as e: + typer.echo( + f"Chroma failed to start. Error: {repr(e)}", + file=sys.stderr, + color=typer.colors.RED, + err=True, + ) + + +def command( + persist_dir: str = typer.Argument(..., help="The persist directory"), +) -> None: + rebuild_fts(persist_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("persist_dir", type=str) + arg = parser.parse_args() + rebuild_fts(arg.persist_dir) diff --git a/chroma_ops/scripts/drop_fts.sql b/chroma_ops/scripts/drop_fts.sql new file mode 100644 index 0000000..d74afa7 --- /dev/null +++ b/chroma_ops/scripts/drop_fts.sql @@ -0,0 +1,10 @@ +BEGIN TRANSACTION; +DROP TABLE IF EXISTS embedding_fulltext_search; +DROP TABLE IF EXISTS embedding_fulltext_search_config; +DROP TABLE IF EXISTS embedding_fulltext_search_content; +DROP TABLE IF EXISTS embedding_fulltext_search_data; +DROP TABLE IF EXISTS embedding_fulltext_search_docsize; +DROP TABLE IF EXISTS embedding_fulltext_search_idx; +CREATE TABLE embedding_fulltext (id INTEGER PRIMARY KEY); +DELETE FROM migrations WHERE dir='metadb' AND version='3' AND filename='00003-full-text-tokenize.sqlite.sql'; +COMMIT TRANSACTION; diff --git a/chroma_ops/utils.py b/chroma_ops/utils.py index e436058..a21eac5 100644 --- a/chroma_ops/utils.py +++ b/chroma_ops/utils.py @@ -25,6 +25,12 @@ def get_hnsw_index_ids(filename: str, space: str = "l2", dim: int = 384) -> List return cast(List[int], ids) +def read_script(script: str) -> str: + return open( + os.path.join(os.path.dirname(os.path.realpath(__file__)), script), "r" + ).read() + + def get_dir_size(path: str) -> int: total_size = 0 for dirpath, dirnames, filenames in os.walk(path): diff --git a/chroma_ops/wal_clean.py b/chroma_ops/wal_clean.py index f33a693..6a8df3b 100755 --- a/chroma_ops/wal_clean.py +++ b/chroma_ops/wal_clean.py @@ -2,6 +2,8 @@ import argparse import os import sqlite3 +import sys + import typer from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData @@ -14,8 +16,7 @@ def clean_wal(persist_dir: str) -> None: validate_chroma_persist_dir(persist_dir) - print("Size before: ", get_dir_size(persist_dir)) - # TODO add path join here + typer.echo(f"Size before: {get_dir_size(persist_dir)}", file=sys.stderr) sql_file = os.path.join(persist_dir, "chroma.sqlite3") conn = sqlite3.connect(sql_file) # conn = sqlite3.connect(f"file:{sql_file}?mode=ro", uri=True) @@ -50,13 +51,13 @@ def clean_wal(persist_dir: str) -> None: f"DELETE FROM embeddings_queue WHERE seq_id IN ({','.join([str(i) for i in list_of_ids[batch:batch + batch_size]])});" ) if len(wal_cleanup_queries) > 0: - print("Cleaning up WAL") + typer.echo("Cleaning up WAL", file=sys.stderr) wal_cleanup_queries.append("VACUUM;") cursor.executescript("\n".join(wal_cleanup_queries)) # Close the cursor and connection cursor.close() conn.close() - print("Size after: ", get_dir_size(persist_dir)) + typer.echo(f"Size after: {get_dir_size(persist_dir)}", file=sys.stderr) def command( diff --git a/tests/test_rebuild_fts.py b/tests/test_rebuild_fts.py new file mode 100644 index 0000000..3ae1904 --- /dev/null +++ b/tests/test_rebuild_fts.py @@ -0,0 +1,39 @@ +import os.path +import shutil +import sqlite3 +import tempfile +import uuid + +import chromadb +import pytest + +from chroma_ops.rebuild_fts import rebuild_fts + + +def test_basic_clean() -> None: + records_to_add = 1 + with tempfile.TemporaryDirectory() as temp_dir: + client = chromadb.PersistentClient(path=temp_dir) + col = client.create_collection("test") + ids_documents = [ + (f"{uuid.uuid4()}", f"document {i}", [0.1] * 1536) + for i in range(records_to_add) + ] + ids, documents, embeddings = zip(*ids_documents) + col.add(ids=list(ids), documents=list(documents), embeddings=list(embeddings)) + sql_file = os.path.join(temp_dir, "chroma.sqlite3") + conn = sqlite3.connect(sql_file) + cursor = conn.cursor() + cursor.execute("DROP TABLE IF EXISTS embedding_fulltext_search;") + conn.commit() + cursor.close() + with pytest.raises(Exception) as e: + col.get(where_document={"$contains": "document 0"}) + + assert "no such table: embedding_fulltext_search" in str(e) + rebuild_fts(temp_dir) + fixed_temp_dir = os.path.join(temp_dir, "fixed") + shutil.copytree(temp_dir, fixed_temp_dir) + client = chromadb.PersistentClient(path=fixed_temp_dir) + col = client.get_collection("test") + col.get(where_document={"$contains": "document 0"})