Skip to content

Commit

Permalink
feat: Clean orphanated vector segment dirs (#36)
Browse files Browse the repository at this point in the history
Closes #35
  • Loading branch information
tazarov authored Nov 29, 2024
1 parent f39d734 commit 5a60961
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 4 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ jobs:
steps:
- uses: actions/checkout@v2

- name: Set up Python 3.9
- name: Set up Python 3.10
uses: actions/setup-python@v2
with:
python-version: '3.9'
python-version: '3.10'

- name: Install dependencies
run: |
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: Test

on:
push:
branches:
Expand All @@ -15,10 +17,10 @@ jobs:
runs-on: [ "ubuntu-latest" ]
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
- name: Set up Python 3.10
uses: actions/setup-python@v2
with:
python-version: "3.9"
python-version: "3.10"
- name: Install dependencies
run: |
set -e
Expand Down
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,17 @@ This command rebuilds the full-text search index.
chops rebuild-fts /path/to/persist_dir
```

### Clean

This command cleans up orphaned vector segment directories.

```bash
chops clean /path/to/persist_dir
```

> Note: This command is particularly useful for Windows users, where deleting collections may leave behind orphaned
> vector segment directories due to Windows file locking.
### Using Docker

> Note: You have to mount your persist directory into the container for the commands to work.
Expand Down
45 changes: 45 additions & 0 deletions chroma_ops/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import argparse
import os
import shutil
import sqlite3
import sys
import uuid

import typer
from chroma_ops.utils import validate_chroma_persist_dir


def clean(persist_dir: str) -> None:
    """Delete vector segment directories that have no row in ``segments``.

    The ids of all segments with ``scope = 'VECTOR'`` are read from
    ``chroma.sqlite3`` inside ``persist_dir``. Every sub-directory of
    ``persist_dir`` that contains a ``header.bin`` file and whose name is not
    one of those ids is removed recursively. Progress messages are written to
    standard error.

    Args:
        persist_dir: Path of the Chroma persist directory to clean.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
        OSError: If an orphaned directory cannot be removed.
    """
    # Imported locally so this function does not depend on a change to the
    # module-level import block.
    from urllib.parse import quote

    # NOTE(review): presumably raises when persist_dir is not a valid Chroma
    # directory -- confirm against chroma_ops.utils.
    validate_chroma_persist_dir(persist_dir)
    sql_file = os.path.join(persist_dir, "chroma.sqlite3")
    # Percent-encode the path so that characters such as '?', '#' or '%' in
    # the directory name are not interpreted as URI syntax by SQLite.
    conn = sqlite3.connect(f"file:{quote(sql_file)}?mode=ro", uri=True)
    try:
        print("Cleaning up orphanated segment dirs...", file=sys.stderr)
        rows = conn.execute(
            "SELECT id FROM segments WHERE scope = 'VECTOR';"
        ).fetchall()
    finally:
        # The connection is read-only, so there is nothing to commit; just
        # make sure it is released even when the query fails.
        conn.close()

    # A set gives O(1) membership checks while scanning the directory.
    active_segments = {row[0] for row in rows}

    for entry in os.listdir(persist_dir):
        entry_path = os.path.join(persist_dir, entry)
        if not os.path.isdir(entry_path) or entry in active_segments:
            continue
        # Only directories holding a header.bin are treated as segment
        # directories; anything else under persist_dir is left untouched.
        if not os.path.exists(os.path.join(entry_path, "header.bin")):
            continue
        print(f"Deleting orphanated segment dir: {entry}", file=sys.stderr)
        shutil.rmtree(entry_path)


def command(
    persist_dir: str = typer.Argument(..., help="The persist directory"),
) -> None:
    """Typer command callback: run :func:`clean` on ``persist_dir``."""
    clean(persist_dir=persist_dir)


if __name__ == "__main__":
    # Stand-alone entry point: take the persist directory as the single
    # positional argument and clean it.
    cli = argparse.ArgumentParser()
    cli.add_argument("persist_dir", type=str, help="The persist directory")
    clean(cli.parse_args().persist_dir)
7 changes: 7 additions & 0 deletions chroma_ops/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from chroma_ops.wal_clean import command as clean_wal_command
from chroma_ops.wal_export import command as export_wal_command
from chroma_ops.info import command as info_command
from chroma_ops.clean import command as clean_command

app = typer.Typer(no_args_is_help=True, help="ChromaDB Ops Commands.")

Expand All @@ -31,5 +32,11 @@
no_args_is_help=True,
)(info_command)

app.command(
name="clean",
help="Clean up orphaned vector segment directories.",
no_args_is_help=True,
)(clean_command)

if __name__ == "__main__":
app()
32 changes: 32 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@


import os
import shutil
import tempfile
import uuid

import chromadb
import numpy as np

from chroma_ops.clean import clean
from hypothesis import given, settings
import hypothesis.strategies as st

@given(
    records_to_add=st.sampled_from([100, 1000]),
    number_of_collections=st.integers(min_value=1, max_value=10),
)
@settings(deadline=None)
def test_clean(records_to_add: int, number_of_collections: int) -> None:
    """Check that ``clean`` removes exactly the orphaned segment directories.

    Collections are created and populated, every directory containing a
    ``header.bin`` is copied under a fresh UUID name to act as an orphan, and
    ``clean`` is run. The surviving segment directories must be the original
    ones; comparing names (not just counts) catches a ``clean`` that deletes
    live segments and keeps the orphans.
    """

    def list_segment_dirs(root: str) -> set:
        # A segment directory is any sub-directory holding a header.bin file.
        return {
            name
            for name in os.listdir(root)
            if os.path.isdir(os.path.join(root, name))
            and os.path.exists(os.path.join(root, name, "header.bin"))
        }

    with tempfile.TemporaryDirectory() as temp_dir:
        client = chromadb.PersistentClient(path=temp_dir)
        for i in range(number_of_collections):
            col = client.get_or_create_collection(f"test_{i}")
            data = np.random.uniform(-1, 1, (records_to_add, 5))
            col.add(
                ids=[str(record) for record in range(records_to_add)],
                embeddings=data,
            )

        original_dirs = list_segment_dirs(temp_dir)
        assert len(original_dirs) == number_of_collections

        # Duplicate every live segment directory under a random name so that
        # it has no matching row in the database.
        for name in original_dirs:
            shutil.copytree(
                os.path.join(temp_dir, name),
                os.path.join(temp_dir, f"{uuid.uuid4()}"),
            )
        # Guard against a vacuous pass: the orphans must exist before cleaning.
        assert len(list_segment_dirs(temp_dir)) == 2 * number_of_collections

        clean(temp_dir)

        assert list_segment_dirs(temp_dir) == original_dirs

0 comments on commit 5a60961

Please sign in to comment.