Skip to content

Commit

Permalink
feat: use enhanced guess_product_type for free-text-search
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrunato committed Mar 8, 2024
1 parent 88e2119 commit 241127f
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 131 deletions.
121 changes: 45 additions & 76 deletions eodag/api/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
Set,
Tuple,
Union,
cast,
)

import geojson
Expand All @@ -46,7 +45,6 @@
from whoosh.fields import Schema
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import QueryParser
from whoosh.searching import Results

from eodag.api.product.metadata_mapping import (
NOT_MAPPED,
Expand Down Expand Up @@ -517,10 +515,7 @@ def set_locations_conf(self, locations_conf_path: str) -> None:
self.locations_config = []

def list_product_types(
self,
provider: Optional[str] = None,
fetch_providers: bool = True,
filter: Optional[str] = None,
self, provider: Optional[str] = None, fetch_providers: bool = True
) -> List[Dict[str, Any]]:
"""Lists supported product types.
Expand All @@ -530,8 +525,6 @@ def list_product_types(
:param fetch_providers: (optional) Whether to fetch providers for new product
types or not
:type fetch_providers: bool
:param filter: (optional) Comma separated list of free text search terms.
:type filter: str
:returns: The list of the product types that can be accessed using eodag.
:rtype: list(dict)
:raises: :class:`~eodag.utils.exceptions.UnsupportedProvider`
Expand All @@ -554,83 +547,25 @@ def list_product_types(
product_type = dict(ID=product_type_id, **config)
if product_type_id not in product_types:
product_types.append(product_type)
else:
raise UnsupportedProvider(
f"invalid requested provider: {provider} is not (yet) supported"
)

if filter:
product_types = self.__apply_product_type_free_text_search_filter(
product_types, filter=filter
)
return sorted(product_types, key=itemgetter("ID"))

return sorted(product_types, key=itemgetter("ID"))
raise UnsupportedProvider(
f"invalid requested provider: {provider} is not (yet) supported"
)
# Only get the product types supported by the available providers
for provider in self.available_providers():
current_product_type_ids = [pt["ID"] for pt in product_types]
product_types.extend(
[
pt
for pt in self.list_product_types(
provider=provider, fetch_providers=False, filter=filter
provider=provider, fetch_providers=False
)
if pt["ID"] not in current_product_type_ids
]
)

# Return the product_types sorted in lexicographic order of their ID
return sorted(product_types, key=itemgetter("ID"))

def __apply_product_type_free_text_search_filter(
    self, product_types: List[Dict[str, Any]], filter: str
) -> List[Dict[str, Any]]:
    """Apply the free text search filter to the given list of product types

    Each comma separated term is searched in the ``title``, ``abstract`` and
    ``keywords`` fields of the product types index, and the hits of all these
    searches are merged together. Only the product types whose ``ID`` is among
    the hits are kept, in the order they have in ``product_types``.

    :param product_types: The list of product types
    :type product_types: list
    :param filter: Comma separated list of search terms (ex. "EO,Earth Observation").
                   Blank terms are ignored; a filter without any usable term
                   leaves the list untouched.
    :type filter: str
    :returns: The new list of product types
    :rtype: list
    """
    # Blank terms (coming from "EO,,Earth", a trailing comma or a whitespace-only
    # filter) carry no search criterion, so they are dropped before searching
    fts_terms = [term.strip() for term in (filter or "").split(",") if term.strip()]
    if not fts_terms:
        # no usable filter was given -> return the original list
        return product_types

    # A tuple (rather than a set) keeps the order of the searches deterministic
    fts_supported_params = ("title", "abstract", "keywords")
    schema = self._product_types_index.schema
    with self._product_types_index.searcher() as searcher:
        results: Optional[Results] = None
        # For each term and each search key, do a guess and then upgrade the
        # result (i.e. when merging results, if a hit appears in both results,
        # its position is raised to the top). Only the set of matching IDs is
        # used below, so this ranking does not affect the returned list.
        for term in fts_terms:
            for search_key in fts_supported_params:
                query = QueryParser(search_key, schema).parse(term)
                result = searcher.search(query, limit=None)
                # Compare against None instead of relying on truthiness: an
                # empty result set is falsy but is still a valid accumulator
                if results is None:
                    results = result
                else:
                    results.upgrade_and_extend(result)

        # Collect the IDs while the searcher is still open (stored fields are
        # presumably loaded lazily from it -- the original code also read the
        # hits inside the ``with`` block)
        result_ids = {r["ID"] for r in results or []}

    # no result found -> intersection is an empty list
    return [p for p in product_types if p["ID"] in result_ids]

def fetch_product_types_list(self, provider: Optional[str] = None) -> None:
"""Fetch product types list and update if needed
Expand Down Expand Up @@ -979,16 +914,32 @@ def get_alias_from_product_type(self, product_type: str) -> str:

return self.product_types_config[product_type].get("alias", product_type)

def guess_product_type(self, **kwargs: Any) -> List[str]:
"""Find eodag product types codes that best match a set of search params
def guess_product_type(
self,
free_text_filter: Optional[str] = None,
intersect: bool = False,
**kwargs: Any,
) -> List[str]:
"""Find eodag product types ids that best match a set of search params
See https://whoosh.readthedocs.io/en/latest/querylang.html#the-default-query-language
for syntax.
:param free_text_filter: whoosh compatible free text search filter used to search
`title`, `abstract` and `keywords`
:type free_text_filter: Optional[str]
:param intersect: join results for each parameter using INTERSECT instead of UNION
:type intersect: bool
:param kwargs: A set of search parameters as keywords arguments
:returns: The best match for the given parameters
:rtype: list[str]
:raises: :class:`~eodag.utils.exceptions.NoMatchingProductType`
"""
if kwargs.get("productType", None):
return [kwargs["productType"]]
free_text_search_params = (
["title", "abstract", "keywords"] if free_text_filter else []
)
supported_params = {
param
for param in (
Expand All @@ -999,26 +950,44 @@ def guess_product_type(self, **kwargs: Any) -> List[str]:
"sensorType",
"keywords",
"md5",
"abstract",
"title",
)
if kwargs.get(param, None) is not None
}
if not self._product_types_index:
raise EodagError("Missing product types index")
with self._product_types_index.searcher() as searcher:
results = None
# For each search key, do a guess and then upgrade the result (i.e. when
# merging results, if a hit appears in both results, its position is raised
# to the top. This way, the top most result will be the hit that best
# Using `upgrade_and_extend`, for each search key, do a guess and
# then upgrade the result (i.e. when merging results,
# if a hit appears in both results, its position is raised
# to the top). This way, the top most result will be the hit that best
# matches the given queries. Put another way, this best guess is the one
# that crosses the highest number of search params from the given queries

# Always use UNION to join free_text_search results
for search_key in free_text_search_params:
query = QueryParser(search_key, self._product_types_index.schema).parse(
free_text_filter
)
if results is None:
results = searcher.search(query, limit=None)
else:
results.upgrade_and_extend(searcher.search(query, limit=None))

# join results from kwargs using UNION or INTERSECT
for search_key in supported_params:
query = QueryParser(search_key, self._product_types_index.schema).parse(
kwargs[search_key]
)
if results is None:
results = searcher.search(query, limit=None)
elif intersect:
results.filter(searcher.search(query, limit=None))
else:
results.upgrade_and_extend(searcher.search(query, limit=None))

guesses: List[str] = [r["ID"] for r in results or []]
if guesses:
return guesses
Expand Down
29 changes: 22 additions & 7 deletions eodag/rest/stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,22 +637,37 @@ def __get_product_types(
"""
if filters is None:
filters = {}
free_text_filter = filters.pop("q", None)

# product types matching filters
try:
guessed_product_types = self.eodag_api.guess_product_type(**filters)
guessed_product_types = (
self.eodag_api.guess_product_type(**filters) if filters else []
)
except NoMatchingProductType:
guessed_product_types = []

# product types matching free text filter
if free_text_filter and not guessed_product_types:
whooshable_filter = " OR ".join(
[f"({x})" for x in free_text_filter.split(",")]
)
try:
guessed_product_types = self.eodag_api.guess_product_type(
whooshable_filter
)
except NoMatchingProductType:
guessed_product_types = []

# list product types with all metadata using guessed ids
if guessed_product_types:
product_types = [
pt
for pt in self.eodag_api.list_product_types(
provider=self.provider, filter=filters.get("q", None)
)
for pt in self.eodag_api.list_product_types(provider=self.provider)
if pt["ID"] in guessed_product_types
]
else:
product_types = self.eodag_api.list_product_types(
provider=self.provider, filter=filters.get("q", None)
)
product_types = self.eodag_api.list_product_types(provider=self.provider)
return product_types

def __get_collection_list(
Expand Down
74 changes: 29 additions & 45 deletions tests/units/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,19 +517,7 @@ def test_list_product_types_fetch_providers(self, mock_fetch_product_types_list)
self.dag.list_product_types(provider="peps", fetch_providers=True)
mock_fetch_product_types_list.assert_called_once_with(self.dag, provider="peps")

def test_list_product_types_with_free_text_filter_ok(self):
    """Listing product types with a free text filter must return a well-formed list"""
    listing = self.dag.list_product_types(
        fetch_providers=False, filter="ABSTRACTFOO"
    )
    self.assertIsInstance(listing, list)

    # Every returned entry must have the expected product type structure
    for entry in listing:
        self.assertListProductTypesRightStructure(entry)

    # Each product type ID must appear only once in the output
    unique_ids = {entry["ID"] for entry in listing}
    self.assertEqual(len(listing), len(unique_ids))

def test_list_product_types_with_free_text_filter(self):
def test_guess_product_type_with_filter(self):
"""Testing the search terms"""

with open(
Expand All @@ -538,47 +526,43 @@ def test_list_product_types_with_free_text_filter(self):
ext_product_types_conf = json.load(f)
self.dag.update_product_types_list(ext_product_types_conf)

# match in the abstract
product_types = self.dag.list_product_types(
fetch_providers=False, filter="ABSTRACTFOO"
)
product_types_ids = [r["ID"] for r in product_types or []]
# Free text search: match in the abstract
filter = "ABSTRACTFOO"
product_types_ids = self.dag.guess_product_type(filter)
self.assertListEqual(product_types_ids, ["foo"])

# passing the provider
product_types = self.dag.list_product_types(
provider="astraea_eod", fetch_providers=False, filter="ABSTRACTFOO"
)
product_types_ids = [r["ID"] for r in product_types or []]
filter = "(ABSTRACTFOO)"
product_types_ids = self.dag.guess_product_type(filter)
self.assertListEqual(product_types_ids, ["foo"])

# match in the abstract
product_types = self.dag.list_product_types(
fetch_providers=False, filter=" FOO THIS IS "
)
product_types_ids = [r["ID"] for r in product_types or []]
filter = " FOO THIS IS "
product_types_ids = self.dag.guess_product_type(filter)
self.assertListEqual(product_types_ids, ["foo"])

# match in the keywords
product_types = self.dag.list_product_types(
fetch_providers=False, filter="LECTUS_BAR_KEY"
)
product_types_ids = [r["ID"] for r in product_types or []]
# Free text search: match in the keywords
filter = "LECTUS_BAR_KEY"
product_types_ids = self.dag.guess_product_type(filter)
self.assertListEqual(product_types_ids, ["bar"])

# match in the title
product_types = self.dag.list_product_types(
fetch_providers=False, filter="COLLECTION FOOBAR"
)
product_types_ids = [r["ID"] for r in product_types or []]
# Free text search: match in the title
filter = "COLLECTION FOOBAR"
product_types_ids = self.dag.guess_product_type(filter)
self.assertListEqual(product_types_ids, ["foobar"])

# multiple terms
product_types = self.dag.list_product_types(
fetch_providers=False, filter="FOOANDBAR,FOOBAR"
# Free text search: multiple terms
filter = "(This is FOOBAR) OR (This is BAR)"
product_types_ids = self.dag.guess_product_type(filter)
self.assertListEqual(sorted(product_types_ids), ["bar", "foobar"])

# Free text search: multiple terms joined with param search (UNION)
filter = "(This is FOOBAR) OR (This is BAR)"
product_types_ids = self.dag.guess_product_type(filter, title="FOO*")
self.assertListEqual(sorted(product_types_ids), ["bar", "foo", "foobar"])

# Free text search: multiple terms joined with param search (INTERSECT)
filter = "(This is FOOBAR) OR (This is BAR)"
product_types_ids = self.dag.guess_product_type(
filter, intersect=True, title="titleFOO*"
)
product_types_ids = [r["ID"] for r in product_types or []]
self.assertListEqual(product_types_ids, ["bar", "foo", "foobar"])
self.assertListEqual(sorted(product_types_ids), ["foobar"])

def test_update_product_types_list(self):
"""Core api.update_product_types_list must update eodag product types list"""
Expand Down
7 changes: 4 additions & 3 deletions tests/units/test_http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,7 +964,6 @@ def test_list_product_types_ok(self, list_pt: Mock, guess_pt: Mock):
"""A simple request for product types with(out) a provider must succeed"""
for url in ("/collections",):
r = self.app.get(url)
self.assertTrue(guess_pt.called)
self.assertTrue(list_pt.called)
self.assertEqual(200, r.status_code)
self.assertListEqual(
Expand Down Expand Up @@ -1379,10 +1378,12 @@ def test_cql_post_search(self):
)

@mock.patch("eodag.rest.core.eodag_api.list_product_types", autospec=True)
def test_collection_free_text_search(self, list_pt: Mock):
@mock.patch("eodag.rest.core.eodag_api.guess_product_type", autospec=True)
def test_collection_free_text_search(self, guess_pt: Mock, list_pt: Mock):
"""Test STAC Collection free-text search"""

url = "/collections?q=TERM1,TERM2"
r = self.app.get(url)
list_pt.assert_called_once_with(provider=None, filter="TERM1,TERM2")
list_pt.assert_called_once_with(provider=None)
guess_pt.assert_called_once_with("(TERM1) OR (TERM2)")
self.assertEqual(200, r.status_code)

0 comments on commit 241127f

Please sign in to comment.