Skip to content

Commit

Permalink
fix: document deletion and large embeddings (vana-com#27)
Browse files Browse the repository at this point in the history
  • Loading branch information
tnunamak authored Mar 11, 2024
1 parent e979c71 commit 04b5263
Show file tree
Hide file tree
Showing 12 changed files with 253 additions and 200 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,8 @@ Selfie is a work in progress. Here are some features that are not yet fully supp

To build an executable for your platform:

1. Run `pip install pyinstaller`. *(pyinstaller is not compatible with Python >3.12 so it is not included by default)*
2. Run `pyinstaller selfie.spec --noconfirm`.
1. Run `poetry run pip install pyinstaller`. *(pyinstaller is not compatible with Python >3.12 so it is not included by default)*
2. Run `poetry run pyinstaller selfie.spec --noconfirm`.
3. Start the built service with `./dist/selfie/selfie`.

## Contributing
Expand Down
316 changes: 158 additions & 158 deletions poetry.lock

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
[tool.poetry]
name = "selfie"
version = "0.1.0"
description = "Data awareness mixin for LLMs"
description = "Data mixin for LLMs"
authors = ["Vana <[email protected]>"]
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.9,<4.0" # >=3.9 for numpy, <4 for llama-index
beautifulsoup4 = "^4.12.3"
colorlog = "^6.8.2"
fastapi = "^0.109.0"
uvicorn = "^0.27.0"
humanize = "^4.9.0"
llama-cpp-python = "^0.2.42"
llama-cpp-python = "^0.2.26"
litellm = "^1.23.12"
txtai = {version = "^6.3.0", extras = ["pipeline-llm"]}
txtai = {version = "^7.0.0", extras = ["pipeline-llm"]}
sse-starlette = "^2.0.0"
llama-index = "^0.10.4"
numpy = "^1.26.4"
Expand All @@ -34,9 +35,6 @@ autoawq = { version = "^0.1.8", optional = true }
[tool.poetry.extras]
gpu = ["auto-gptq", "optimum", "autoawq"]

[tool.poetry.group.dev.dependencies]
colorlog = "^6.8.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
6 changes: 3 additions & 3 deletions selfie/api/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@


class UnindexDocumentsRequest(BaseModel):
document_ids: List[str] = []
document_ids: List[int] = []


class IndexDocumentsRequest(BaseModel):
is_chat: bool = False
document_ids: List[str] = []
document_ids: List[int] = []


class DeleteDocumentsRequest(BaseModel):
document_ids: List[str] = []
document_ids: List[int] = []


@router.get("/documents")
Expand Down
6 changes: 5 additions & 1 deletion selfie/api/index_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

router = APIRouter()

from selfie.config import get_app_config

config = get_app_config()

@router.get("/index_documents")
async def get_documents(offset: int = 0, limit: int = 10):
Expand Down Expand Up @@ -78,7 +81,8 @@ async def load_data(request: DataLoaderRequest):
print(documents)

text_parser = SentenceSplitter(
chunk_size=1024,
chunk_size=config.embedding_chunk_size,
chunk_overlap=config.embedding_chunk_overlap,
# separator=" ",
)

Expand Down
2 changes: 2 additions & 0 deletions selfie/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class AppConfig(BaseModel):
local_gpu_model: str = Field(default='TheBloke/Mistral-7B-OpenOrca-GPTQ', description="Local GPU model")
local_functionary_model: str = Field(default="meetkai/functionary-7b-v2-GGUF/functionary-7b-v2.q4_0.gguf", description="Local functionary model")
hosted_model: str = Field(default="openai/gpt-3.5-turbo", description="Hosted model")
embedding_chunk_size: int = Field(default=512, description="Embedding chunk size")
embedding_chunk_overlap: int = Field(default=50, description="Embedding chunk overlap")

@property
def base_url(self):
Expand Down
10 changes: 9 additions & 1 deletion selfie/connectors/text_files/connector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from abc import ABC
from typing import Any, List

from selfie.config import get_app_config


from llama_index.core.node_parser import SentenceSplitter

from selfie.connectors.base_connector import BaseConnector
Expand All @@ -9,6 +12,8 @@
from selfie.types.documents import DocumentDTO
from selfie.utils import data_uri_to_dict

config = get_app_config()


class TextFilesConfiguration(BaseModel):
files: List[str]
Expand Down Expand Up @@ -46,5 +51,8 @@ def transform_for_embedding(self, configuration: dict[str, Any], documents: List
source_document_id=document.id,
)
for document in documents
for text_chunk in SentenceSplitter(chunk_size=1024).split_text(document.content)
for text_chunk in SentenceSplitter(
chunk_size=config.embedding_chunk_size,
chunk_overlap=config.embedding_chunk_overlap,
).split_text(document.content)
]
25 changes: 13 additions & 12 deletions selfie/data_generators/chat_training_data.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
#!/usr/bin/env python3

from typing import List, Dict
from typing import List, Dict, Callable
from enum import Enum
import os
import time
import json
import random
import argparse
import logging
from itertools import groupby

from enum import Enum

from selfie.parsers.chat import ChatFileParser, Parser
from selfie.types.share_gpt import ShareGPTMessage

logger = logging.getLogger(__name__)


class Strategy(Enum):
BUNDLE = 'bundle'
Expand Down Expand Up @@ -78,26 +79,26 @@ def extract_message_bundles(conversations: List[ShareGPTMessage]):
return message_bundles

@staticmethod
def group_messages_into_chunks(conversations: List[ShareGPTMessage], overlap: int = 0, max_messages: int = 3, max_characters: int = 0) -> List[List[ShareGPTMessage]]:
def group_messages_into_chunks(conversations: List[ShareGPTMessage], tokenizer: Callable, overlap: int = 0, max_messages: int = 3, max_tokens: int = 0) -> List[List[ShareGPTMessage]]:
chunks = []
index = 0
while index < len(conversations):
end_index = index + max_messages
chunk = conversations[index:end_index]

# If there's a max characters limit, adjust the chunk to not exceed it
if max_characters > 0:
characters_count = sum(len(msg.value.split()) for msg in chunk)
while characters_count > max_characters and len(chunk) > 0:
chunk.pop() # Remove the last message
characters_count = sum(len(msg.value.split()) for msg in chunk)
if max_tokens > 0:
tokens_count = sum(len(tokenizer(msg.value)) for msg in chunk)
while tokens_count > max_tokens and len(chunk) > 0:
if len(chunk) == 1:
logger.warning(f"Warning: A single message exceeds the max tokens limit ({max_tokens}).")
chunk.pop()
tokens_count = sum(len(tokenizer(msg.value)) for msg in chunk)

chunks.append(chunk)
index += max_messages - overlap

return chunks


@staticmethod
def generate_sharegpt_jsonl_line(messages: List[ShareGPTMessage]) -> str:
"""
Expand Down
5 changes: 4 additions & 1 deletion selfie/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,10 @@ def _map_selfie_documents_to_index_documents(selfie_document: DocumentDTO):
timestamp=DataManager._extract_timestamp(selfie_document),
source_document_id=selfie_document.id,
)
for text_chunk in SentenceSplitter(chunk_size=1024).split_text(selfie_document.content)
for text_chunk in SentenceSplitter(
chunk_size=config.embedding_chunk_size,
chunk_overlap=config.embedding_chunk_overlap
).split_text(selfie_document.content)
]

@staticmethod
Expand Down
65 changes: 50 additions & 15 deletions selfie/embeddings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from typing import Optional, List, Dict, Any, Coroutine, Callable

import humanize
import logging
import tiktoken
from llama_index.core.node_parser import SentenceSplitter

from selfie.config import get_app_config
from selfie.data_generators.chat_training_data import (
Expand All @@ -18,7 +21,6 @@
from selfie.embeddings.recency_scorer import RecencyScorer
from selfie.embeddings.relevance_scorer import RelevanceScorer
from txtai.embeddings import Embeddings
import logging

from txtai.pipeline import LLM

Expand All @@ -28,12 +30,30 @@

config = get_app_config()

llm = LLM(
verbose=config.verbose,
path=config.local_model,
method="llama.cpp",
n_ctx=8192,
n_gpu_layers=-1 if config.gpu else 0,

def get_default_completion():
llm = LLM(
verbose=config.verbose,
path=config.local_model,
method="llama.cpp",
n_ctx=8192,
n_gpu_layers=-1 if config.gpu else 0,
)

async def completion(prompt):
return llm(prompt)

return completion


# TODO: The tokenizer is hard-coded for now. This is probably a minor issue, but note:
# 1. The default tokenizer should probably be based on the user's default/configured model.
# 2. If the user changes their default model, already-indexed documents could exceed the configured
#    chunk size (config.embedding_chunk_size) when measured with the new model's tokenizer.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo").encode
splitter = SentenceSplitter(
tokenizer=tokenizer,
chunk_size=config.embedding_chunk_size,
chunk_overlap=config.embedding_chunk_overlap
)


Expand All @@ -51,11 +71,7 @@ def __init__(self, character_name, storage_path: str = config.embeddings_storage
self.storage_path = os.path.join(storage_path, "index")
os.makedirs(storage_path, exist_ok=True)

async def completion_async(prompt):
return llm(prompt)

self.completion = completion or completion_async

self.completion = completion or get_default_completion()
self.character_name = character_name
self.embeddings = Embeddings(
sqlite={"wal": True},
Expand Down Expand Up @@ -149,11 +165,30 @@ async def enqueue_delete(self, ids: List[int]):
def map_share_gpt_data(
conversation: List[ShareGPTMessage], source: str = "Unknown", source_document_id: int = None
) -> List[EmbeddingDocumentModel]:
chunks = ChatTrainingDataGenerator.group_messages_into_chunks(
conversation, overlap=1, max_messages=8, max_characters=0
conversation_with_chunked_messages = [
ShareGPTMessage(**{
"from": msg.from_user,
"value": chunk,
"timestamp": msg.timestamp,
})
for msg in conversation
for chunk in (
splitter.split_text(msg.value)
if len(tokenizer(msg.value)) > config.embedding_chunk_size
else [msg.value]
)
]

message_chunks = ChatTrainingDataGenerator.group_messages_into_chunks(
conversation_with_chunked_messages,
overlap=2,
max_messages=32,
max_tokens=config.embedding_chunk_size,
tokenizer=tokenizer
)

documents = []
for i, conv in enumerate(chunks):
for i, conv in enumerate(message_chunks):
if any("REDACTED" in msg.value for msg in conv):
continue
last_user = ""
Expand Down
3 changes: 2 additions & 1 deletion selfie/parsers/chat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ def __init__(self, blacklist_patterns=None, rewrite_placeholder: str = "REDACTED
self.parser_cache = {}
self.blacklist_patterns = [
re.compile(pattern, re.IGNORECASE)
for pattern in default_blacklist_patterns + (blacklist_patterns or [])
# TODO: Disabling blacklisting until it is more configurable
for pattern in [] #default_blacklist_patterns + (blacklist_patterns or [])
]
self.rewrite_placeholder = rewrite_placeholder

Expand Down
1 change: 1 addition & 0 deletions selfie/text_generation/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

config = get_app_config()


async def completion(request: CompletionRequest | ChatCompletionRequest) -> SelfieCompletionResponse:
logger.debug(f"Handling a completion request: {request}")

Expand Down

0 comments on commit 04b5263

Please sign in to comment.