Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added FTS Rebuild command #5

Merged
merged 1 commit into from
Jan 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ chops export-wal /path/to/persist_dir --out /path/to/export.jsonl

> Note: If --out or -o is not specified the command will print the output to stdout.

### Full-Text Search Index Rebuild

This command rebuilds the full-text search index.

> Note: **_Why is this needed_**? Users have reported broken FTS indices that result in an error of this
> kind: `no such table: embedding_fulltext_search`

```bash
chops rebuild-fts /path/to/persist_dir
```

### Using Docker

> Note: You have to mount your persist directory into the container for the commands to work.
Expand Down Expand Up @@ -72,3 +83,9 @@ docker run -it --rm -v ./persist_dir:/chroma-data ghcr.io/amikos-tech/chromadb-o
```bash
docker run -it --rm -v ./persist_dir:/chroma-data -v ./backup:/backup ghcr.io/amikos-tech/chromadb-ops/chops:latest export-wal /chroma-data --out /backup/export.jsonl
```

#### Full-Text Search Index Rebuild

```bash
docker run -it --rm -v ./persist_dir:/chroma-data ghcr.io/amikos-tech/chromadb-ops/chops:latest rebuild-fts /chroma-data
```
4 changes: 4 additions & 0 deletions chroma_ops/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import typer

from chroma_ops.rebuild_fts import rebuild_fts
from chroma_ops.wal_commit import command as commit_wal_command
from chroma_ops.wal_clean import command as clean_wal_command
from chroma_ops.wal_export import command as export_wal_command
Expand All @@ -18,5 +19,8 @@
app.command(
name="export-wal", help="Exports the WAL to a jsonl file.", no_args_is_help=True
)(export_wal_command)
# Register the `rebuild-fts` sub-command on the Typer app.
# NOTE(review): the WAL commands are registered through each module's
# `command` wrapper, whereas this registers `rebuild_fts` directly, so the
# `persist_dir` argument gets no help text from the `typer.Argument` declared
# on `chroma_ops.rebuild_fts.command` — confirm whether that is intended.
app.command(
name="rebuild-fts", help="Rebuilds Full Text Search index.", no_args_is_help=True
)(rebuild_fts)
if __name__ == "__main__":
app()
49 changes: 49 additions & 0 deletions chroma_ops/rebuild_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import argparse
import os
import sqlite3
import sys

import chromadb
import typer

from chroma_ops.utils import validate_chroma_persist_dir, read_script


def rebuild_fts(persist_dir: str) -> None:
    """Drop the full-text search tables and start Chroma on the directory.

    Runs ``scripts/drop_fts.sql`` against ``<persist_dir>/chroma.sqlite3``.
    That script drops the ``embedding_fulltext_search*`` tables and deletes
    the ``00003-full-text-tokenize`` row from ``migrations`` (presumably so
    that Chroma re-applies that migration on start-up — confirm against the
    Chroma version in use). A ``chromadb.PersistentClient`` is then opened on
    the directory to trigger the rebuild.

    Progress and errors are written to stderr.

    Args:
        persist_dir: Path to the Chroma persist directory.

    Raises:
        sqlite3.Error: If the drop script cannot be executed.
        Whatever ``validate_chroma_persist_dir`` raises for an invalid
        directory.
    """
    validate_chroma_persist_dir(persist_dir)
    sql_file = os.path.join(persist_dir, "chroma.sqlite3")
    # Read the script before opening the database so that a missing script
    # never leaves a dangling connection behind.
    script = read_script("scripts/drop_fts.sql")
    conn = sqlite3.connect(sql_file)
    try:
        # If the script fails part-way, closing the connection discards the
        # transaction the script itself opened.
        conn.executescript(script)
    finally:
        conn.close()
    typer.echo("Dropped FTS. Will try to start your Chroma now.", file=sys.stderr)
    # `typer.echo(color=...)` only forces colour handling on or off; a
    # foreground colour has to be passed to `typer.secho` as `fg`.
    typer.secho(
        "NOTE: Depending on the size of your documents in Chroma it may take a while for Chroma to start up again.",
        file=sys.stderr,
        fg=typer.colors.YELLOW,
    )
    try:
        chromadb.PersistentClient(path=persist_dir)
    except Exception as e:
        # Deliberately best-effort: the failure is reported, not re-raised.
        # NOTE(review): the process therefore still exits with status 0 —
        # consider `raise typer.Exit(code=1)` if callers rely on exit codes.
        typer.secho(
            f"Chroma failed to start. Error: {repr(e)}",
            file=sys.stderr,
            fg=typer.colors.RED,
            err=True,
        )
    else:
        typer.echo("Chroma started successfully.", file=sys.stderr)


def command(persist_dir: str = typer.Argument(..., help="The persist directory")) -> None:
    """Typer entry point: rebuild the FTS index for ``persist_dir``."""
    rebuild_fts(persist_dir=persist_dir)


if __name__ == "__main__":
    # Stand-alone entry point: takes the persist directory as the only
    # positional argument and runs the rebuild without going through Typer.
    cli = argparse.ArgumentParser()
    cli.add_argument("persist_dir", type=str)
    parsed = cli.parse_args()
    rebuild_fts(parsed.persist_dir)
10 changes: 10 additions & 0 deletions chroma_ops/scripts/drop_fts.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Removes the full-text search tables and the record of the migration that
-- created them. Everything runs in one transaction so a failure leaves the
-- database untouched.
BEGIN TRANSACTION;
DROP TABLE IF EXISTS embedding_fulltext_search;
-- Also drop the tables that share the embedding_fulltext_search_ prefix, in
-- case the main table is already gone and they were left behind.
DROP TABLE IF EXISTS embedding_fulltext_search_config;
DROP TABLE IF EXISTS embedding_fulltext_search_content;
DROP TABLE IF EXISTS embedding_fulltext_search_data;
DROP TABLE IF EXISTS embedding_fulltext_search_docsize;
DROP TABLE IF EXISTS embedding_fulltext_search_idx;
-- IF NOT EXISTS keeps the script re-runnable when a previous rebuild was
-- interrupted after this table had already been created.
-- NOTE(review): presumably a placeholder the re-applied migration expects to
-- find — confirm against the Chroma migration.
CREATE TABLE IF NOT EXISTS embedding_fulltext (id INTEGER PRIMARY KEY);
DELETE FROM migrations WHERE dir='metadb' AND version='3' AND filename='00003-full-text-tokenize.sqlite.sql';
COMMIT TRANSACTION;
6 changes: 6 additions & 0 deletions chroma_ops/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ def get_hnsw_index_ids(filename: str, space: str = "l2", dim: int = 384) -> List
return cast(List[int], ids)


def read_script(script: str) -> str:
    """Return the text of a script file shipped alongside this module.

    Args:
        script: Path of the script relative to the directory containing this
            module (for example ``"scripts/drop_fts.sql"``). An absolute path
            is used as-is, because ``os.path.join`` discards the base
            directory in that case.

    Returns:
        The full contents of the file, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(os.path.join(base_dir, script), "r", encoding="utf-8") as handle:
        return handle.read()


def get_dir_size(path: str) -> int:
total_size = 0
for dirpath, dirnames, filenames in os.walk(path):
Expand Down
9 changes: 5 additions & 4 deletions chroma_ops/wal_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import argparse
import os
import sqlite3
import sys

import typer
from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData

Expand All @@ -14,8 +16,7 @@

def clean_wal(persist_dir: str) -> None:
validate_chroma_persist_dir(persist_dir)
print("Size before: ", get_dir_size(persist_dir))
# TODO add path join here
typer.echo(f"Size before: {get_dir_size(persist_dir)}", file=sys.stderr)
sql_file = os.path.join(persist_dir, "chroma.sqlite3")
conn = sqlite3.connect(sql_file)
# conn = sqlite3.connect(f"file:{sql_file}?mode=ro", uri=True)
Expand Down Expand Up @@ -50,13 +51,13 @@ def clean_wal(persist_dir: str) -> None:
f"DELETE FROM embeddings_queue WHERE seq_id IN ({','.join([str(i) for i in list_of_ids[batch:batch + batch_size]])});"
)
if len(wal_cleanup_queries) > 0:
print("Cleaning up WAL")
typer.echo("Cleaning up WAL", file=sys.stderr)
wal_cleanup_queries.append("VACUUM;")
cursor.executescript("\n".join(wal_cleanup_queries))
# Close the cursor and connection
cursor.close()
conn.close()
print("Size after: ", get_dir_size(persist_dir))
typer.echo(f"Size after: {get_dir_size(persist_dir)}", file=sys.stderr)


def command(
Expand Down
39 changes: 39 additions & 0 deletions tests/test_rebuild_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os.path
import shutil
import sqlite3
import tempfile
import uuid

import chromadb
import pytest

from chroma_ops.rebuild_fts import rebuild_fts


def test_basic_clean() -> None:
    """Break the FTS index by hand, rebuild it, and query it again.

    Steps:
      1. Create a collection with one document.
      2. Drop ``embedding_fulltext_search`` directly in SQLite and check that
         a ``where_document`` query now fails with "no such table".
      3. Run ``rebuild_fts`` on the persist directory.
      4. Copy the directory to a fresh location, open it with a new client,
         and check that the document is found again.
    """
    records_to_add = 1
    # The repaired copy lives in its own temporary directory rather than in a
    # sub-directory of the tree being copied.
    with tempfile.TemporaryDirectory() as temp_dir, tempfile.TemporaryDirectory() as copy_root:
        client = chromadb.PersistentClient(path=temp_dir)
        col = client.create_collection("test")
        ids_documents = [
            (f"{uuid.uuid4()}", f"document {i}", [0.1] * 1536)
            for i in range(records_to_add)
        ]
        ids, documents, embeddings = zip(*ids_documents)
        col.add(ids=list(ids), documents=list(documents), embeddings=list(embeddings))

        # Simulate the reported corruption by removing the FTS table.
        sql_file = os.path.join(temp_dir, "chroma.sqlite3")
        conn = sqlite3.connect(sql_file)
        try:
            conn.execute("DROP TABLE IF EXISTS embedding_fulltext_search;")
            conn.commit()
        finally:
            conn.close()

        with pytest.raises(Exception) as e:
            col.get(where_document={"$contains": "document 0"})
        # Inspect the raised exception itself, not the ExceptionInfo wrapper,
        # whose string form varies between pytest versions.
        assert "no such table: embedding_fulltext_search" in str(e.value)

        rebuild_fts(temp_dir)

        # NOTE(review): the copy presumably exists so that the new client is
        # opened on a path Chroma has not seen in this process — confirm.
        fixed_temp_dir = os.path.join(copy_root, "fixed")
        shutil.copytree(temp_dir, fixed_temp_dir)
        client = chromadb.PersistentClient(path=fixed_temp_dir)
        col = client.get_collection("test")
        result = col.get(where_document={"$contains": "document 0"})
        # The rebuilt index must actually return the stored document.
        assert result["ids"] == list(ids)
Loading