Skip to content

Commit

Permalink
Remove opensearch code (#220)
Browse files Browse the repository at this point in the history
* Remove opensearch code paths

* Remove opensearch dev and test setup

* Remove Opensearch documentation references
  • Loading branch information
olaughter authored Feb 7, 2024
1 parent c8886f5 commit 4dfe4b8
Show file tree
Hide file tree
Showing 17 changed files with 182 additions and 5,695 deletions.
28 changes: 0 additions & 28 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -19,34 +19,6 @@ VESPA_SEARCH_LIMIT=150
# Shared search config
INDEX_ENCODER_CACHE_FOLDER=/models

# Opensearch connection settings
OPENSEARCH_USER=admin
OPENSEARCH_PASSWORD=admin
OPENSEARCH_URL=http://opensearch-node1:9200
OPENSEARCH_INDEX_PREFIX=navigator
OPENSEARCH_REQUEST_TIMEOUT=30
OPENSEARCH_USE_SSL=False
OPENSEARCH_VERIFY_CERTS=False
OPENSEARCH_SSL_WARNINGS=False
OPENSEARCH_INDEX_EMBEDDING_DIM=768

# Opensearch query/index settings - optional
# Disabled as not used in deployment. Defaults are set within the application.
# OPENSEARCH_INDEX_INNER_PRODUCT_THRESHOLD=70.0
# OPENSEARCH_INDEX_MAX_DOC_COUNT=100
# OPENSEARCH_INDEX_MAX_PASSAGES_PER_DOC=10
# OPENSEARCH_INDEX_KNN_K_VALUE=10000
# OPENSEARCH_INDEX_N_PASSAGES_TO_SAMPLE_PER_SHARD=5000
# OPENSEARCH_INDEX_NAME_BOOST=100
# OPENSEARCH_INDEX_DESCRIPTION_BOOST=40
# OPENSEARCH_INDEX_EMBEDDED_TEXT_BOOST=50
# OPENSEARCH_JIT_MAX_DOC_COUNT=20
# OPENSEARCH_INDEX_NAME_KEY=for_search_action_name
# OPENSEARCH_INDEX_DESCRIPTION_KEY=for_search_action_description
# OPENSEARCH_INDEX_DESCRIPTION_EMBEDDING_KEY=action_description_embedding
# OPENSEARCH_INDEX_INDEX_KEY=action_name_and_id
# OPENSEARCH_INDEX_TEXT_BLOCK_KEY=text_block_id

# Backend Superuser account information for admin
SUPERUSER_EMAIL=superuser@example.com
SUPERUSER_PASSWORD=password
Expand Down
1 change: 0 additions & 1 deletion .env.local
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# api host
API_HOST=http://localhost:8888
DATABASE_URL=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@localhost:5432/${POSTGRES_USER}
OPENSEARCH_URL=http://localhost:9200

# Frontend
NEXT_PUBLIC_API_URL=http://localhost:8000/api/v1
Expand Down
9 changes: 0 additions & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,6 @@ jobs:
- name: Run backend search tests for vespa
run: make test_search

- name: Run backend search tests for opensearch
run: make test_opensearch

- name: Browse Benchmark opensearch - response times in ms
run: cat benchmark_browse.txt

- name: Search Benchmark opensearch - response times in ms
run: cat benchmark_search.txt

- name: Log Dump
if: always()
run: docker-compose logs
Expand Down
52 changes: 16 additions & 36 deletions app/api/api_v1/routers/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,14 @@
from app.core.search import (
ENCODER,
FilterField,
OpenSearchConfig,
OpenSearchConnection,
OpenSearchQueryConfig,
create_vespa_search_params,
process_result_into_csv,
process_vespa_search_response,
)
from app.db.crud.document import DocumentExtraCache
from app.db.session import get_db

_LOGGER = logging.getLogger(__name__)

# Use configured environment for router
_OPENSEARCH_CONFIG = OpenSearchConfig()
_OPENSEARCH_CONNECTION = OpenSearchConnection(opensearch_config=_OPENSEARCH_CONFIG)
_OPENSEARCH_INDEX_CONFIG = OpenSearchQueryConfig()
_DOCUMENT_EXTRA_INFO_CACHE = DocumentExtraCache()

_VESPA_CONNECTION = VespaSearchAdapter(
instance_url=VESPA_URL,
cert_directory=VESPA_SECRETS_LOCATION,
Expand All @@ -78,32 +68,22 @@ def _search_request(
req=_get_browse_args_from_search_request_body(search_body),
)
else:
if use_vespa:
data_access_search_params = create_vespa_search_params(db, search_body)
# TODO: we may wish to cache responses to improve pagination performance
try:
data_access_search_response = _VESPA_CONNECTION.search(
parameters=data_access_search_params
)
except QueryError:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid Query"
)
return process_vespa_search_response(
db,
data_access_search_response,
limit=search_body.limit,
offset=search_body.offset,
).increment_pages()
else:
return _OPENSEARCH_CONNECTION.query_families(
search_request_body=search_body,
opensearch_internal_config=_OPENSEARCH_INDEX_CONFIG,
document_extra_info=_DOCUMENT_EXTRA_INFO_CACHE.get_document_extra_info(
db
),
preference="default_search_preference",
).increment_pages()
data_access_search_params = create_vespa_search_params(db, search_body)
# TODO: we may wish to cache responses to improve pagination performance
try:
data_access_search_response = _VESPA_CONNECTION.search(
parameters=data_access_search_params
)
except QueryError:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid Query"
)
return process_vespa_search_response(
db,
data_access_search_response,
limit=search_body.limit,
offset=search_body.offset,
).increment_pages()


@search_router.post("/searches")
Expand Down
52 changes: 3 additions & 49 deletions app/api/api_v1/schemas/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,21 @@


class SortOrder(str, Enum):
"""Sort ordering for use building OpenSearch query body."""
"""Sort ordering for use building query body."""

ASCENDING = "asc"
DESCENDING = "desc"


class SortField(str, Enum):
"""Sort field for use building OpenSearch query body."""
"""Sort field for use building query body."""

DATE = "date"
TITLE = "title"


class JitQuery(str, Enum):
"""Flag used for determining if a jit query is to be used."""

ENABLED = "enabled"
DISABLED = "disabled"


class FilterField(str, Enum):
"""Filter field for use building OpenSearch query body."""
"""Filter field for use building query body."""

SOURCE = "sources"
COUNTRY = "countries"
Expand Down Expand Up @@ -84,45 +77,6 @@ class SearchResponseDocumentPassage(BaseModel):
text_block_coords: Optional[Sequence[Coord]] = None


class OpenSearchResponseMatchBase(BaseModel):
"""Describes matches returned by an OpenSearch query"""

document_name: str
document_geography: str
document_description: str
document_sectors: Sequence[str]
document_source: str
document_id: str # Changed semantics to be import_id, not database id
document_date: str
document_type: str
document_source_url: Optional[str] = None
document_cdn_object: Optional[str] = None
document_category: str
document_content_type: Optional[str] = None
document_slug: str


class OpenSearchResponseNameMatch(OpenSearchResponseMatchBase):
"""Describes matches returned by OpenSearch on Document name."""

for_search_document_name: str


class OpenSearchResponseDescriptionMatch(OpenSearchResponseMatchBase):
"""Describes matches returned by OpenSearch on Document description."""

for_search_document_description: str


class OpenSearchResponsePassageMatch(OpenSearchResponseMatchBase):
"""Describes matches returned by OpenSearch on Document passage."""

text: str
text_block_id: str
text_block_page: Optional[int] = None
text_block_coords: Optional[Sequence[Coord]] = None


class SearchResponseFamilyDocument(BaseModel):
"""A single document in a search response."""

Expand Down
53 changes: 0 additions & 53 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,59 +9,6 @@
PUBLIC_APP_URL = os.environ["PUBLIC_APP_URL"].rstrip("/")
API_V1_STR = "/api/v1"

# OpenSearch Config
OPENSEARCH_URL = os.environ["OPENSEARCH_URL"]
OPENSEARCH_USERNAME = os.environ["OPENSEARCH_USER"]
OPENSEARCH_PASSWORD = os.environ["OPENSEARCH_PASSWORD"]
OPENSEARCH_INDEX_PREFIX = os.environ["OPENSEARCH_INDEX_PREFIX"]
OPENSEARCH_REQUEST_TIMEOUT: int = int(os.getenv("OPENSEARCH_REQUEST_TIMEOUT", "30"))
OPENSEARCH_USE_SSL: bool = os.getenv("OPENSEARCH_USE_SSL", "False").lower() == "true"
OPENSEARCH_VERIFY_CERTS: bool = (
os.getenv("OPENSEARCH_VERIFY_CERTS", "False").lower() == "true"
)
OPENSEARCH_SSL_WARNINGS: bool = (
os.getenv("OPENSEARCH_SSL_WARNINGS", "False").lower() == "true"
)
# OpenSearch Index Config
OPENSEARCH_INDEX_INNER_PRODUCT_THRESHOLD: float = float(
os.getenv("OPENSEARCH_INDEX_INNER_PRODUCT_THRESHOLD", "70.0")
)
OPENSEARCH_INDEX_MAX_DOC_COUNT: int = int(
os.getenv("OPENSEARCH_INDEX_MAX_DOC_COUNT", "140")
)
OPENSEARCH_INDEX_MAX_PASSAGES_PER_DOC: int = int(
os.getenv("OPENSEARCH_INDEX_MAX_PASSAGES_PER_DOC", "10")
)
OPENSEARCH_INDEX_KNN_K_VALUE = int(os.getenv("OPENSEARCH_INDEX_KNN_K_VALUE", "10000"))
OPENSEARCH_INDEX_N_PASSAGES_TO_SAMPLE_PER_SHARD: int = int(
os.getenv("OPENSEARCH_INDEX_N_PASSAGES_TO_SAMPLE_PER_SHARD", "5000")
)
OPENSEARCH_INDEX_NAME_BOOST: int = int(os.getenv("OPENSEARCH_INDEX_NAME_BOOST", "100"))
OPENSEARCH_INDEX_DESCRIPTION_BOOST: int = int(
os.getenv("OPENSEARCH_INDEX_DESCRIPTION_BOOST", "40")
)

OPENSEARCH_INDEX_EMBEDDED_TEXT_BOOST: int = int(
os.getenv("OPENSEARCH_INDEX_EMBEDDED_TEXT_BOOOST", "50")
)

OPENSEARCH_INDEX_NAME_KEY: str = os.getenv(
"OPENSEARCH_INDEX_NAME_KEY", "for_search_document_name"
)
OPENSEARCH_INDEX_DESCRIPTION_KEY: str = os.getenv(
"OPENSEARCH_INDEX_DESCRIPTION_KEY", "for_search_document_description"
)
OPENSEARCH_INDEX_DESCRIPTION_EMBEDDING_KEY: str = os.getenv(
"OPENSEARCH_INDEX_DESCRIPTION_EMBEDDING_KEY", "document_description_embedding"
)
OPENSEARCH_INDEX_INDEX_KEY: str = os.getenv(
"OPENSEARCH_INDEX_INDEX_KEY", "document_name_and_slug"
)
OPENSEARCH_INDEX_TEXT_BLOCK_KEY: str = os.getenv(
"OPENSEARCH_INDEX_TEXT_BLOCK_KEY", "text_block_id"
)
OPENSEARCH_JIT_MAX_DOC_COUNT: int = int(os.getenv("OPENSEARCH_JIT_MAX_DOC_COUNT", "20"))

# Vespa Config
VESPA_SEARCH_LIMIT: int = int(os.getenv("VESPA_SEARCH_LIMIT", "100"))
VESPA_SEARCH_MATCHES_PER_DOC: int = int(
Expand Down
Loading

0 comments on commit 4dfe4b8

Please sign in to comment.