Feat/neural search #655

Merged · 3 commits · Feb 24, 2024
31 changes: 31 additions & 0 deletions llm-server/routes/search/search_controller.py
@@ -3,6 +3,8 @@
from utils.get_logger import CustomLogger
from utils.llm_consts import VectorCollections, initialize_qdrant_client
from qdrant_client import models # Add this line
from routes.search.search_service import weighted_search
from pydantic import BaseModel

search_workflow = Blueprint("search", __name__)

@@ -43,3 +45,32 @@
results = get_all_results(chatbot_id, keyword)

return jsonify(results), 201


class WeightedSearchRequest(BaseModel):
query: str
title_weight: float = 0.7
description_weight: float = 0.3


@search_workflow.route("/cmd_bar/<chatbot_id>", methods=["POST"])
def get_cmdbar_data(chatbot_id: str):
try:
request_data = WeightedSearchRequest(
**request.get_json()
) # Assuming you have a class to parse data
scored_points = weighted_search(
chatbot_id,
request_data.query,
request_data.title_weight,
request_data.description_weight,
)
return (
jsonify([sp.model_dump() for sp in scored_points]),
200,
)

except ValueError as e: # Example of handling a potential error
return jsonify({"error": str(e)}), 400 # Bad request

Check warning: Code scanning / CodeQL
Information exposure through an exception (Medium): stack trace information flows to this location and may be exposed to an external user.
except Exception as e:
return jsonify({"error": "Internal server error"}), 500
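
For context, a minimal sketch of how a client might exercise the new weighted-search endpoint. Only the route segment `cmd_bar/<chatbot_id>` and the `WeightedSearchRequest` fields come from this PR; the base URL, the blueprint's URL prefix, and the chatbot id are placeholders.

```python
# Hypothetical client call against the new endpoint (base URL and prefix assumed).
import requests

BASE_URL = "http://localhost:8002"      # assumed host/port
CHATBOT_ID = "example-bot-id"           # placeholder

resp = requests.post(
    f"{BASE_URL}/search/cmd_bar/{CHATBOT_ID}",   # the "search" prefix is assumed
    json={
        "query": "reset password",
        "title_weight": 0.7,            # optional; these match the model defaults
        "description_weight": 0.3,
    },
)
resp.raise_for_status()
for point in resp.json():               # each item is a ScoredPoint.model_dump()
    print(point.get("score"), point.get("payload"))
```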
28 changes: 25 additions & 3 deletions llm-server/routes/search/search_service.py
@@ -5,6 +5,7 @@
from typing import Dict, List, Optional
import operator
from copy import deepcopy
from utils.llm_consts import ENABLE_NEURAL_SEARCH

client = initialize_qdrant_client()
embedding = get_embeddings()
@@ -64,23 +65,44 @@ def add_cmdbar_data(items: List[Item], metadata: Dict[str, str]) -> None:

# Function to search with weights
def weighted_search(
query: str, title_weight: float = 0.7, description_weight: float = 0.3
chatbot_id: str,
query: str,
title_weight: float = 0.7,
description_weight: float = 0.3,
) -> List[models.ScoredPoint]:
query_embedding = embedding.embed_query(query)

# Search title and descriptions
title_results = client.search(
collection_name=VectorCollections.neural_search,
query_vector=models.NamedVector(name="title", vector=query_embedding),
query_filter=models.Filter(
must=[
models.FieldCondition(
key="metadata.bot_id",
match=models.MatchValue(value=str(chatbot_id)),
)
]
),
limit=20,
with_payload=True,
with_vector=False,
with_vectors=False,
)

description_results = client.search(
collection_name=VectorCollections.neural_search,
query_vector=models.NamedVector(name="description", vector=query_embedding),
query_filter=models.Filter(
must=[
models.FieldCondition(
key="metadata.bot_id",
match=models.MatchValue(value=chatbot_id),
)
]
),
limit=20,
with_payload=True,
with_vector=False,
with_vectors=False,
)

# Build a lookup for description results
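
The hunk is truncated before the merge step, so the combination logic is not visible here. As an illustration only, not the code merged in this PR, one way to weight and merge the two result lists could look like the following, reusing the already-imported `deepcopy`:

```python
# Illustrative merge of title/description hits by weighted score (hypothetical helper).
def _combine_weighted(title_results, description_results,
                      title_weight=0.7, description_weight=0.3):
    # Index description scores by point id so the two lists can be joined.
    description_scores = {p.id: p.score for p in description_results}

    combined = []
    for point in title_results:
        merged = deepcopy(point)
        merged.score = (
            title_weight * point.score
            + description_weight * description_scores.get(point.id, 0.0)
        )
        combined.append(merged)

    # Highest combined score first.
    combined.sort(key=lambda p: p.score, reverse=True)
    return combined
```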
2 changes: 2 additions & 0 deletions llm-server/utils/llm_consts.py
@@ -121,3 +121,5 @@ def get_mysql_uri():
)

JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "YOURSUPERSECRETKEY")

ENABLE_NEURAL_SEARCH = os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES"
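
`ENABLE_NEURAL_SEARCH` is imported by `search_service.py` above, but this diff does not show where it is checked. A hedged sketch of how such a flag is typically consumed; the guard location and fallback behaviour are assumptions, not part of this PR:

```python
# Hypothetical feature-flag guard; where the real check lives is not shown in this diff.
from utils.llm_consts import ENABLE_NEURAL_SEARCH
from routes.search.search_service import weighted_search

def maybe_weighted_search(chatbot_id: str, query: str):
    if not ENABLE_NEURAL_SEARCH:
        # Disabled unless the environment sets ENABLE_NEURAL_SEARCH=YES.
        return []
    return weighted_search(chatbot_id, query)
```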
32 changes: 15 additions & 17 deletions llm-server/workers/tasks/url_parsers.py
@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
def find_all_headings_and_highlights(
self, content: str
) -> Tuple[str, List[Tuple[str, str]]]:
soup = BeautifulSoup(content, "lxml")
title_tag = soup.title
title = ""
if title_tag is not None:
title = title_tag.get_text(strip=True)

headings: List[Tuple[str, str]] = []

for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
heading_text = heading.get_text(strip=True)

# Check if the heading or one of its children has an 'id' attribute
id_tag = heading.find(attrs={"id": True})
if id_tag:
heading_id = id_tag["id"]
headings.append((heading_text, heading_id))

return title, headings
soup = BeautifulSoup(content, "lxml")
title = soup.title.text if soup.title else ""
elements_with_id = soup.find_all(id=True)
links = soup.find_all("a")
pairs = []
for element in elements_with_id:
id_ = element.get("id")
if id_: # A simple check if the id exists
corresponding_links = [
link for link in links if link.get("href") == "#" + id_
] # Removed "./#" prefix
if corresponding_links:
for link in corresponding_links:
pairs.append((element.get_text(strip=True), id_))
return title, pairs

def parse_text_content(self, content) -> str:
text = BeautifulSoup(content, "lxml").get_text()
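
To illustrate the rewritten `find_all_headings_and_highlights`, here is a standalone sketch of the same pairing logic on a made-up HTML snippet: elements carrying an `id` are kept only when some `<a>` on the page links to `#<id>`.

```python
from bs4 import BeautifulSoup

html = """
<html>
  <head><title>Docs</title></head>
  <body>
    <h2 id="install">Installation</h2>
    <a href="#install">Jump to installation</a>
    <h2 id="usage">Usage</h2>
  </body>
</html>
"""

soup = BeautifulSoup(html, "lxml")
title = soup.title.text if soup.title else ""
links = soup.find_all("a")

pairs = []
for element in soup.find_all(id=True):
    id_ = element.get("id")
    # Keep only ids that at least one anchor on the page points at.
    if id_ and any(link.get("href") == "#" + id_ for link in links):
        pairs.append((element.get_text(strip=True), id_))

print(title)   # "Docs"
print(pairs)   # [('Installation', 'install')]; 'usage' has no matching link
```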
10 changes: 4 additions & 6 deletions llm-server/workers/tasks/web_crawl.py
@@ -73,7 +73,8 @@ def scrape_url(url: str, bot_id: str):
for heading_text, heading_id in headings
]

add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
if len(items) > 0:
add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
return parser.parse_text_content(content)
except ValueError as e:
# Log an error message if no parser is available for the content type
@@ -140,11 +141,8 @@ def scrape_website(url: str, bot_id: str, max_pages: int) -> int:
chatbot_id=bot_id, url=current_url, status="SUCCESS"
)

# Get links on the current page
links = get_links(current_url)

# Add new links to the queue
queue.extend(links)
links = get_links(current_url)
queue.extend(links)

except Exception as e:
logger.error("WEB_SCRAPE_ERROR", error=e)