Skip to content

Commit

Permalink
feat: Added FTS Rebuild command
Browse files Browse the repository at this point in the history
Refs: #4
  • Loading branch information
tazarov committed Jan 21, 2024
1 parent dac710f commit 36cf534
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 4 deletions.
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ chops export-wal /path/to/persist_dir --out /path/to/export.jsonl

> Note: If --out or -o is not specified the command will print the output to stdout.
### Full-Text Search Index Rebuild

This command rebuilds the full-text search index.

> Note: **_Why is this needed?_** Users have reported broken FTS indices that result in an error of this
> kind: `no such table: embedding_fulltext_search`
```bash
chops rebuild-fts /path/to/persist_dir
```

### Using Docker

> Note: You have to mount your persist directory into the container for the commands to work.
Expand Down Expand Up @@ -72,3 +83,9 @@ docker run -it --rm -v ./persist_dir:/chroma-data ghcr.io/amikos-tech/chromadb-o
```bash
docker run -it --rm -v ./persist_dir:/chroma-data -v ./backup:/backup ghcr.io/amikos-tech/chromadb-ops/chops:latest export-wal /chroma-data --out /backup/export.jsonl
```

#### Full-Text Search Index Rebuild

```bash
docker run -it --rm -v ./persist_dir:/chroma-data ghcr.io/amikos-tech/chromadb-ops/chops:latest rebuild-fts /chroma-data
```
4 changes: 4 additions & 0 deletions chroma_ops/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import typer

from chroma_ops.rebuild_fts import rebuild_fts
from chroma_ops.wal_commit import command as commit_wal_command
from chroma_ops.wal_clean import command as clean_wal_command
from chroma_ops.wal_export import command as export_wal_command
Expand All @@ -18,5 +19,8 @@
# Register the WAL export command under the CLI name "export-wal".
app.command(
    name="export-wal", help="Exports the WAL to a jsonl file.", no_args_is_help=True
)(export_wal_command)
# Register the full-text search rebuild command under the CLI name "rebuild-fts".
# NOTE(review): the other commands register the `command` wrapper imported from
# their module, while this one registers `rebuild_fts` directly — confirm that
# is intended.
app.command(
    name="rebuild-fts", help="Rebuilds Full Text Search index.", no_args_is_help=True
)(rebuild_fts)
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    app()
49 changes: 49 additions & 0 deletions chroma_ops/rebuild_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import argparse
import os
import sqlite3
import sys

import chromadb
import typer

from chroma_ops.utils import validate_chroma_persist_dir, read_script


def rebuild_fts(persist_dir: str) -> None:
    """Drop the full-text search (FTS) tables and let Chroma rebuild them.

    Runs ``scripts/drop_fts.sql`` against ``chroma.sqlite3`` inside
    ``persist_dir`` and then opens a ``chromadb.PersistentClient`` on the same
    directory. Progress and the outcome are reported on stderr.

    Args:
        persist_dir: Path to the Chroma persist directory. It is checked with
            ``validate_chroma_persist_dir`` before anything is modified.

    Raises:
        sqlite3.Error: If the drop script cannot be executed. A failure of
            Chroma to start afterwards is reported on stderr, not raised.
    """
    validate_chroma_persist_dir(persist_dir)
    sql_file = os.path.join(persist_dir, "chroma.sqlite3")
    conn = sqlite3.connect(sql_file)
    try:
        # The script carries its own BEGIN/COMMIT, so no explicit commit here.
        conn.executescript(read_script("scripts/drop_fts.sql"))
    finally:
        # Always release the database handle, even if the script fails, so
        # Chroma (or a retry) is not blocked by a lingering connection.
        conn.close()
    typer.echo("Dropped FTS. Will try to start your Chroma now.", file=sys.stderr)
    # `echo(color=...)` is only an on/off switch for ANSI handling; the actual
    # colour has to be requested through `secho(fg=...)`.
    typer.secho(
        "NOTE: Depending on the size of your documents in Chroma it may take a while for Chroma to start up again.",
        file=sys.stderr,
        fg=typer.colors.YELLOW,
    )
    try:
        chromadb.PersistentClient(path=persist_dir)
        typer.echo("Chroma started successfully.", file=sys.stderr)
    except Exception as e:
        # Best-effort start: report the failure instead of propagating it.
        typer.secho(
            f"Chroma failed to start. Error: {e!r}",
            file=sys.stderr,
            fg=typer.colors.RED,
        )


def command(
    persist_dir: str = typer.Argument(..., help="The persist directory"),
) -> None:
    """Typer-facing wrapper that rebuilds the FTS index for ``persist_dir``."""
    return rebuild_fts(persist_dir)


if __name__ == "__main__":
    # Direct script execution: take the persist directory as the single
    # positional argument and run the rebuild.
    cli = argparse.ArgumentParser()
    cli.add_argument("persist_dir", type=str)
    rebuild_fts(cli.parse_args().persist_dir)
10 changes: 10 additions & 0 deletions chroma_ops/scripts/drop_fts.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Removes the full-text search (FTS) tables and the record of the FTS
-- migration. Everything runs in one transaction so a failure leaves the
-- database untouched.
BEGIN TRANSACTION;
-- Drop the FTS table itself, then any of its companion tables that may have
-- been left behind (IF EXISTS makes each statement a no-op when absent).
DROP TABLE IF EXISTS embedding_fulltext_search;
DROP TABLE IF EXISTS embedding_fulltext_search_config;
DROP TABLE IF EXISTS embedding_fulltext_search_content;
DROP TABLE IF EXISTS embedding_fulltext_search_data;
DROP TABLE IF EXISTS embedding_fulltext_search_docsize;
DROP TABLE IF EXISTS embedding_fulltext_search_idx;
-- NOTE(review): presumably a placeholder that the re-applied migration expects
-- to find — confirm against the migration file. IF NOT EXISTS keeps the script
-- from aborting when the table is already present.
CREATE TABLE IF NOT EXISTS embedding_fulltext (id INTEGER PRIMARY KEY);
-- Remove the bookkeeping row for the FTS migration.
DELETE FROM migrations WHERE dir='metadb' AND version='3' AND filename='00003-full-text-tokenize.sqlite.sql';
COMMIT TRANSACTION;
6 changes: 6 additions & 0 deletions chroma_ops/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ def get_hnsw_index_ids(filename: str, space: str = "l2", dim: int = 384) -> List
return cast(List[int], ids)


def read_script(script: str) -> str:
    """Return the text of a script file shipped alongside this module.

    Args:
        script: Path of the script, resolved relative to the directory that
            contains this module (e.g. ``"scripts/drop_fts.sql"``).

    Returns:
        The complete contents of the file as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), script)
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(script_path, "r") as script_file:
        return script_file.read()


def get_dir_size(path: str) -> int:
total_size = 0
for dirpath, dirnames, filenames in os.walk(path):
Expand Down
9 changes: 5 additions & 4 deletions chroma_ops/wal_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import argparse
import os
import sqlite3
import sys

import typer
from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData

Expand All @@ -14,8 +16,7 @@

def clean_wal(persist_dir: str) -> None:
validate_chroma_persist_dir(persist_dir)
print("Size before: ", get_dir_size(persist_dir))
# TODO add path join here
typer.echo(f"Size before: {get_dir_size(persist_dir)}", file=sys.stderr)
sql_file = os.path.join(persist_dir, "chroma.sqlite3")
conn = sqlite3.connect(sql_file)
# conn = sqlite3.connect(f"file:{sql_file}?mode=ro", uri=True)
Expand Down Expand Up @@ -50,13 +51,13 @@ def clean_wal(persist_dir: str) -> None:
f"DELETE FROM embeddings_queue WHERE seq_id IN ({','.join([str(i) for i in list_of_ids[batch:batch + batch_size]])});"
)
if len(wal_cleanup_queries) > 0:
print("Cleaning up WAL")
typer.echo("Cleaning up WAL", file=sys.stderr)
wal_cleanup_queries.append("VACUUM;")
cursor.executescript("\n".join(wal_cleanup_queries))
# Close the cursor and connection
cursor.close()
conn.close()
print("Size after: ", get_dir_size(persist_dir))
typer.echo(f"Size after: {get_dir_size(persist_dir)}", file=sys.stderr)


def command(
Expand Down
39 changes: 39 additions & 0 deletions tests/test_rebuild_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os.path
import shutil
import sqlite3
import tempfile
import uuid

import chromadb
import pytest

from chroma_ops.rebuild_fts import rebuild_fts


def test_basic_clean() -> None:
    """Break the FTS index on purpose and check that ``rebuild_fts`` repairs it.

    Steps: create a collection with one document, drop the
    ``embedding_fulltext_search`` table directly in SQLite, verify that a
    document query now fails with the "no such table" error, run
    ``rebuild_fts`` and verify the same query succeeds again.
    """
    records_to_add = 1
    with tempfile.TemporaryDirectory() as temp_dir:
        client = chromadb.PersistentClient(path=temp_dir)
        col = client.create_collection("test")
        ids_documents = [
            (f"{uuid.uuid4()}", f"document {i}", [0.1] * 1536)
            for i in range(records_to_add)
        ]
        ids, documents, embeddings = zip(*ids_documents)
        col.add(ids=list(ids), documents=list(documents), embeddings=list(embeddings))
        sql_file = os.path.join(temp_dir, "chroma.sqlite3")
        # Simulate the broken state; always close the handle so the open
        # connection cannot interfere with the rebuild below.
        conn = sqlite3.connect(sql_file)
        try:
            conn.execute("DROP TABLE IF EXISTS embedding_fulltext_search;")
            conn.commit()
        finally:
            conn.close()
        with pytest.raises(Exception) as e:
            col.get(where_document={"$contains": "document 0"})

        # Inspect the raised exception itself, not the ExceptionInfo wrapper.
        assert "no such table: embedding_fulltext_search" in str(e.value)
        rebuild_fts(temp_dir)
        # NOTE(review): presumably the data is copied to a new path so that a
        # fresh client reads the rebuilt database — confirm.
        fixed_temp_dir = os.path.join(temp_dir, "fixed")
        shutil.copytree(temp_dir, fixed_temp_dir)
        client = chromadb.PersistentClient(path=fixed_temp_dir)
        col = client.get_collection("test")
        col.get(where_document={"$contains": "document 0"})

0 comments on commit 36cf534

Please sign in to comment.