Skip to content

Commit

Permalink
Updates to search algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
smathot committed Mar 28, 2024
1 parent 88ead5c commit 8995c74
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 6 deletions.
8 changes: 7 additions & 1 deletion heymans/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,13 @@
# The number of documents that are added to the documentation for each search
# query
search_docs_per_query = 2

# The distance metric used for search. The cosine metric is useful because it
# is somewhat invariant to changes in document length
search_metric = 'cosine'
# The cache folder for the library that is used for chat
db_cache = '.db.cache'
# The cache folder for the library that is used for public search
public_search_cache = '.ps.cache'

def process_ai_message(msg):
# This pattern looks for a colon possibly followed by any number of
Expand Down
3 changes: 2 additions & 1 deletion heymans/documentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def __init__(self, heymans):
logger.info('reading FAISS documentation cache')
self._db = FAISS.load_local(Path('.db.cache'), self._embeddings_model)
self._retriever = self._db.as_retriever(
search_kwargs={'k': config.search_docs_per_query})
search_kwargs={'k': config.search_docs_per_query,
'metric': config.search_metric})

def search(self, queries):
if config.openai_api_key is None:
Expand Down
11 changes: 9 additions & 2 deletions heymans/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
logger = logging.getLogger('heymans')


def load_library(force_reindex=False):
db_cache = Path('.db.cache')
def load_library(force_reindex=False, cache_folder=config.db_cache,
exclude_filter=None):
db_cache = Path(cache_folder)
src_path = Path('sources')
embeddings_model = OpenAIEmbeddings(openai_api_key=config.openai_api_key)
if not force_reindex and db_cache.exists():
Expand All @@ -22,11 +23,17 @@ def load_library(force_reindex=False):
data = []
# PDF files are unstructured. They can be named through config.sources
for src in src_path.glob('pdf/**/*.pdf'):
if exclude_filter and exclude_filter in str(src):
logger.info(f'skipping pdf: {src}')
continue
logger.info(f'indexing pdf: {src}')
data += PyPDFLoader(str(src)).load_and_split()
# jsonl is mainly for documentation
for src in src_path.glob('jsonl/*.jsonl'):
logger.info(f'indexing json: {src}')
if exclude_filter and exclude_filter in str(src):
logger.info(f'skipping json: {src}')
continue
loader = JSONLoader(src, jq_schema='', content_key='content',
json_lines=True,
metadata_func=_extract_metadata)
Expand Down
7 changes: 5 additions & 2 deletions index_library.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from heymans import library
from heymans import library, config
import logging; logging.basicConfig(level=logging.INFO, force=True)

if __name__ == '__main__':
library.load_library(force_reindex=True)
library.load_library(force_reindex=True, exclude_filter='forum')
# library.load_library(force_reindex=True,
# cache_folder=config.public_search_cache,
# exclude_filter='howtos')

0 comments on commit 8995c74

Please sign in to comment.