From c27b9fc9a48e939e34ea410d0274a9b6bc147898 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Mon, 6 Jun 2022 15:16:57 +0200 Subject: [PATCH 01/32] Add search_rank() function and use it in FTS queries --- .../components/metadata_store/db/store.py | 28 +++-- src/tribler/core/tests/test_search_utils.py | 76 ++++++++++++- src/tribler/core/utilities/search_utils.py | 103 +++++++++++++++++- 3 files changed, 197 insertions(+), 10 deletions(-) diff --git a/src/tribler/core/components/metadata_store/db/store.py b/src/tribler/core/components/metadata_store/db/store.py index 513ef6075ca..c273f518526 100644 --- a/src/tribler/core/components/metadata_store/db/store.py +++ b/src/tribler/core/components/metadata_store/db/store.py @@ -10,6 +10,7 @@ from pony import orm from pony.orm import db_session, desc, left_join, raw_sql, select +from pony.orm.dbproviders.sqlite import keep_exception from tribler.core import notifications from tribler.core.components.metadata_store.db.orm_bindings import ( @@ -50,9 +51,11 @@ from tribler.core.utilities.notifier import Notifier from tribler.core.utilities.path_util import Path from tribler.core.utilities.pony_utils import get_max, get_or_create +from tribler.core.utilities.search_utils import torrent_rank from tribler.core.utilities.unicode import hexlify from tribler.core.utilities.utilities import MEMORY_DB + BETA_DB_VERSIONS = [0, 1, 2, 3, 4, 5] CURRENT_DB_VERSION = 14 @@ -167,7 +170,7 @@ def __init__( # with the static analysis. # pylint: disable=unused-variable @self._db.on_connect(provider='sqlite') - def sqlite_disable_sync(_, connection): + def on_connect(_, connection): cursor = connection.cursor() cursor.execute("PRAGMA journal_mode = WAL") cursor.execute("PRAGMA synchronous = NORMAL") @@ -180,6 +183,10 @@ def sqlite_disable_sync(_, connection): # losing power during a write will corrupt the database. 
cursor.execute("PRAGMA journal_mode = 0") cursor.execute("PRAGMA synchronous = 0") + + sqlite_rank = keep_exception(torrent_rank) + connection.create_function('search_rank', 4, sqlite_rank) + # pylint: enable=unused-variable self.MiscData = misc.define_binding(self._db) @@ -591,7 +598,7 @@ def torrent_exists_in_personal_channel(self, infohash): ) # pylint: disable=unused-argument - def search_keyword(self, query, lim=100): + def search_keyword(self, query): # Requires FTS5 table "FtsIndex" to be generated and populated. # FTS table is maintained automatically by SQL triggers. # BM25 ranking is embedded in FTS5. @@ -600,10 +607,11 @@ def search_keyword(self, query, lim=100): if not query or query == "*": return [] - fts_ids = raw_sql( - """SELECT rowid FROM ChannelNode WHERE rowid IN (SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query - ORDER BY bm25(FtsIndex) LIMIT $lim) GROUP BY coalesce(infohash, rowid)""" - ) + fts_ids = raw_sql(""" + SELECT rowid FROM ChannelNode + WHERE rowid IN (SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query) + GROUP BY coalesce(infohash, rowid) + """) return left_join(g for g in self.MetadataNode if g.rowid in fts_ids) # pylint: disable=E1135 @db_session @@ -639,7 +647,7 @@ def get_entries_query( if cls is None: cls = self.ChannelNode - pony_query = self.search_keyword(txt_filter, lim=1000) if txt_filter else left_join(g for g in cls) + pony_query = self.search_keyword(txt_filter) if txt_filter else left_join(g for g in cls) infohash_set = infohash_set or ({infohash} if infohash else None) if popular: if metadata_type != REGULAR_TORRENT: @@ -731,7 +739,11 @@ def get_entries_query( pony_query = pony_query.sort_by( f""" (1 if g.metadata_type == {CHANNEL_TORRENT} else 2 if g.metadata_type == {COLLECTION_NODE} else 3), - desc(g.health.seeders), desc(g.health.leechers) + raw_sql('''search_rank( + $txt_filter, g.title, torrentstate.seeders + 0.1 * torrentstate.leechers, + $int(time()) - strftime('%s', g.torrent_date) + ) DESC'''), + 
desc(g.health.last_check) # just to trigger the TorrentState table inclusion into the left join """ ) elif popular: diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index 0506d136f57..703d8f060aa 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -1,4 +1,9 @@ -from tribler.core.utilities.search_utils import filter_keywords, split_into_keywords +import pytest + +from tribler.core.utilities.search_utils import filter_keywords, split_into_keywords, torrent_rank + + +DAY = 60 * 60 * 24 def test_split_into_keywords(): @@ -15,3 +20,72 @@ def test_filter_keywords(): result = filter_keywords(["to", "be", "or", "not", "to", "be"]) assert isinstance(result, list) assert len(result) == 4 + + +def test_torrent_rank(): + query = 'Big Buck Bunny' + + # The exact match ranked as pretty high + assert torrent_rank(query, 'Big Buck Bunny') == pytest.approx(0.81) + + # Seeders are good for the rank + assert torrent_rank(query, 'Big Buck Bunny', seeders=100, freshness=100 * DAY) == pytest.approx(0.876923) + + # The more seeders the better + assert torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=100 * DAY) == pytest.approx(0.9146853) + + # The fewer days have passed since the creation of the torrent, the higher its rank + assert torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=1 * DAY) == pytest.approx(0.9877126) + + # If a title contains non-matching words missed in the query string it is not as good as the exact match + assert torrent_rank(query, 'Big Buck Bunny II') == pytest.approx(0.80381679) + + # The closer to the start of the string non-matching words are placed in the title, the worse is rank + assert torrent_rank(query, 'Big Buck Brown Bunny') == pytest.approx(0.75061099) + assert torrent_rank(query, 'Big Bad Buck Bunny') == pytest.approx(0.74242068) + assert torrent_rank(query, 'Boring Big Buck Bunny') == pytest.approx(0.73125) + + # The 
more non-matching words are in the title, the worse is rank + assert torrent_rank(query, 'Big Buck A Bunny') == pytest.approx(0.75061099) + assert torrent_rank(query, 'Big Buck A B Bunny') == pytest.approx(0.699335863) + assert torrent_rank(query, 'Big Buck A B C Bunny') == pytest.approx(0.6546181) + + # Non-matching words close to the beginning of the title give a bigger penalty + assert torrent_rank(query, 'Big A Buck Bunny') == pytest.approx(0.742420681) + assert torrent_rank(query, 'Big A B Buck Bunny') == pytest.approx(0.6852494577) + assert torrent_rank(query, 'Big A B C Buck Bunny') == pytest.approx(0.636253776) + + assert torrent_rank(query, 'A Big Buck Bunny') == pytest.approx(0.73125) + assert torrent_rank(query, 'A B Big Buck Bunny') == pytest.approx(0.66645569) + assert torrent_rank(query, 'A B C Big Buck Bunny') == pytest.approx(0.6122093) + + # Wrong order of words in the title imposes a penalty to the rank + assert torrent_rank(query, 'Big Bunny Buck') == pytest.approx(0.7476923) + + # Missed query words imposes a really big penalty + assert torrent_rank(query, 'Big Buck') == pytest.approx(0.4725) + + # The close the missed words to the beginning of the query, the worse + assert torrent_rank(query, 'Big Bunny') == pytest.approx(0.441818181) + assert torrent_rank(query, 'Buck Bunny') == pytest.approx(0.405) + + # The more seeders is still better, the more days from the check the less relevant the number of seeders is + assert torrent_rank(query, 'Buck Bunny', seeders=10, freshness=5 * DAY) == pytest.approx(0.44805194) + assert torrent_rank(query, 'Buck Bunny', seeders=100, freshness=5 * DAY) == pytest.approx(0.46821428) + assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=5 * DAY) == pytest.approx(0.4883766) + assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=10 * DAY) == pytest.approx(0.48306818) + assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=20 * DAY) == pytest.approx(0.47563636) + + # The exact match 
+ assert torrent_rank('Sintel', 'Sintel') == pytest.approx(0.81) + # Non-matching words at the end of the title give slightly worse results + assert torrent_rank('Sintel', 'Sintel Part II') == pytest.approx(0.79553571) + # Non-matching words at the beginning of the title are much worse + assert torrent_rank('Sintel', 'Part of Sintel') == pytest.approx(0.664925373) + # Too many non-matching words give a bigger penalty + assert torrent_rank('Sintel', 'the.script.from.the.movie.Sintel.pdf') == pytest.approx(0.52105263) + + # Some more examples + assert torrent_rank("Internet's Own Boy", "Internet's Own Boy") == pytest.approx(0.81) + assert torrent_rank("Internet's Own Boy", "Internet's very Own Boy") == pytest.approx(0.75099337) + assert torrent_rank("Internet's Own Boy", "Internet's very special Boy person") == pytest.approx(0.4353166986) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 6c4180a41b9..6cde9913e77 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -1,13 +1,17 @@ """ Search utilities. -Author(s): Jelle Roozenburg, Arno Bakker +Author(s): Jelle Roozenburg, Arno Bakker, Alexander Kozlovsky """ import re +from collections import deque +from typing import Deque, List, Optional, Tuple RE_KEYWORD_SPLIT = re.compile(r"[\W_]", re.UNICODE) DIALOG_STOPWORDS = {'an', 'and', 'by', 'for', 'from', 'of', 'the', 'to', 'with'} +SECONDS_IN_DAY = 60 * 60 * 24 + def split_into_keywords(string, to_filter_stopwords=False): """ @@ -27,3 +31,100 @@ def split_into_keywords(string, to_filter_stopwords=False): def filter_keywords(keywords): return [kw for kw in keywords if len(kw) > 0 and kw not in DIALOG_STOPWORDS] + + +def torrent_rank(query: str, title: str, seeders: int = 0, freshness: Optional[float] = 0) -> float: + """ + Calculates search rank for a torrent. 
+ Takes into account: + - similarity of the title to the query string; + - the reported number of seeders; + - how long ago the torrent file was created. + """ + freshness = max(0, freshness or 0) + tr = title_rank(query or '', title or '') + sr = (seeders_rank(seeders or 0) + 9) / 10 # range [0.9, 1] + fr = (freshness_rank(freshness) + 9) / 10 # range [0.9, 1] + result = tr * sr * fr + # uncomment the next line to debug the function inside an SQL query: + # print(f'*** {result} : {seeders}/{freshness} ({freshness / SECONDS_IN_DAY} days)/{title} | {query}') + return result + + +def seeders_rank(seeders: float) -> float: + """ + Calculates rank based on the number of seeders. The result is normalized to the range [0, 1] + """ + return seeders / (100 + seeders) # inf seeders -> 1; 100 seeders -> 0.5; 10 seeders -> approx 0.1 + + +def freshness_rank(freshness: Optional[float] = 0): + """ + Calculates rank based on the torrent freshness. The result is normalized to the range [0, 1] + """ + if not freshness: + return 0 + + days = (freshness or 0) / SECONDS_IN_DAY + + return 1 / (1 + days / 30) # 2x drop per 30 days + + +word_re = re.compile(r'\w+', re.UNICODE) + + +def title_rank(query: str, title: str) -> float: + """ + Calculate the similarity of the title string to a query string, with or without stemming. + """ + query = word_re.findall(query.lower()) + title = word_re.findall(title.lower()) + return calculate_rank(query, title) + + +def calculate_rank(query: List[str], title: List[str]) -> float: + """ + Calculate the similarity of the title to the query as a float value in range [0, 1]. 
+ """ + if not query: + return 1.0 + + if not title: + return 0.0 + + title = deque(title) + total_error = 0 + for i, term in enumerate(query): + # The first word is more important than the second word, and so on + term_weight = 5 / (5 + i) + + found, skipped = find_term(term, title) + if found: + # if the query word is found in the title, add penalty for skipped words in title before it + total_error += skipped * term_weight + else: + # if the query word is not found in the title, add a big penalty for it + total_error += 10 * term_weight + + # a small penalty for excess words in the title that was not mentioned in the search phrase + remainder_weight = 1 / (10 + len(query)) + remained_words_error = len(title) * remainder_weight + total_error += remained_words_error + + # a search rank should be between 1 and 0 + return 10 / (10 + total_error) + + +def find_term(term: str, title: Deque[str]) -> Tuple[bool, int]: + """ + Finds the query word in the title. + Returns whether it was found or not and the number of skipped words in the title. 
+ """ + try: + skipped = title.index(term) + except ValueError: + return False, 0 + + title.rotate(-skipped) # rotate skipped words to the end + title.popleft() # remove found word + return True, skipped From 52f18afa3ad664e8c5e8b1ef50fca844b42aa76f Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Mon, 13 Jun 2022 16:29:26 +0200 Subject: [PATCH 02/32] Show local search results immediately --- .../gui/widgets/searchresultswidget.py | 56 ++++++------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/src/tribler/gui/widgets/searchresultswidget.py b/src/tribler/gui/widgets/searchresultswidget.py index 9ce466c53f4..2d8a44c0653 100644 --- a/src/tribler/gui/widgets/searchresultswidget.py +++ b/src/tribler/gui/widgets/searchresultswidget.py @@ -64,40 +64,11 @@ def initialize(self, hide_xxx=False): self.hide_xxx = hide_xxx self.results_page.initialize_content_page(hide_xxx=hide_xxx) self.results_page.channel_torrents_filter_input.setHidden(True) - connect(self.timeout_progress_bar.timeout, self.show_results) - connect(self.show_results_button.clicked, self.show_results) @property def has_results(self): return self.last_search_query is not None - def show_results(self, *_): - if self.search_request is None: - # Fixes a race condition where the user clicks the show_results button before the search request - # has been registered by the Core - return - self.timeout_progress_bar.stop() - query = self.search_request.query - self.results_page.initialize_root_model( - SearchResultsModel( - channel_info={ - "name": (tr("Search results for %s") % query.original_query) - if len(query.original_query) < 50 - else f"{query.original_query[:50]}..." 
- }, - endpoint_url="search", - hide_xxx=self.results_page.hide_xxx, - text_filter=to_fts_query(query.fts_text), - tags=list(query.tags), - type_filter=[REGULAR_TORRENT], - ) - ) - self.setCurrentWidget(self.results_page) - - # After transitioning to the page with search results, we refresh the viewport since some rows might have been - # rendered already with an incorrect row height. - self.results_page.run_brain_dead_refresh() - def check_can_show(self, query): if ( self.last_search_query == query @@ -119,16 +90,25 @@ def search(self, query: Query) -> bool: self.last_search_query = query.original_query self.last_search_time = time.time() - # Trigger remote search - def register_request(response): - self._logger.info(f'Request registered: {response}') - self.search_request = SearchRequest(response["request_uuid"], query, set(response["peers"])) - self.state_label.setText(format_search_loading_label(self.search_request)) - self.timeout_progress_bar.start() - self.setCurrentWidget(self.loading_page) + self.results_page.initialize_root_model( + SearchResultsModel( + channel_info={ + "name": (tr("Search results for %s") % query.original_query) + if len(query.original_query) < 50 + else f"{query.original_query[:50]}..." + }, + endpoint_url="search", + hide_xxx=self.results_page.hide_xxx, + text_filter=to_fts_query(query.fts_text), + tags=list(query.tags), + type_filter=[REGULAR_TORRENT, CHANNEL_TORRENT, COLLECTION_NODE], + ) + ) + self.setCurrentWidget(self.results_page) - params = {'txt_filter': fts_query, 'hide_xxx': self.hide_xxx, 'tags': list(query.tags)} - TriblerNetworkRequest('remote_query', register_request, method="PUT", url_params=params) + # After transitioning to the page with search results, we refresh the viewport since some rows might have been + # rendered already with an incorrect row height. 
+ self.results_page.run_brain_dead_refresh() return True def reset(self): From 49289bc9434fd795fb3b16924c454ec20d863772 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 15 Jun 2022 13:04:47 +0200 Subject: [PATCH 03/32] Remove unused parameter --- src/tribler/gui/widgets/channelsmenulistwidget.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tribler/gui/widgets/channelsmenulistwidget.py b/src/tribler/gui/widgets/channelsmenulistwidget.py index 910fee64050..5c62b6818ce 100644 --- a/src/tribler/gui/widgets/channelsmenulistwidget.py +++ b/src/tribler/gui/widgets/channelsmenulistwidget.py @@ -133,7 +133,7 @@ def on_query_results(self, response): self.items_set = frozenset(entry_to_tuple(channel_info) for channel_info in channels) - def load_channels(self, request=None): + def load_channels(self): TriblerNetworkRequest(self.base_url, self.on_query_results, url_params={"subscribed": True, "last": 1000}) def reload_if_necessary(self, changed_entries): From a2a81e05aafa60b3aa17831ee605ff8ad30e064b Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 15 Jun 2022 13:29:08 +0200 Subject: [PATCH 04/32] Load local search results in a single shot without multi-page loading --- src/tribler/gui/widgets/tablecontentmodel.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index 00df68041c4..0eaf91a6291 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -258,6 +258,9 @@ def remove_items(self, items): self.info_changed.emit(items) + def perform_initial_query(self): + self.perform_query() + def perform_query(self, **kwargs): """ Fetch results for a given query. 
@@ -316,8 +319,8 @@ def on_query_results(self, response, remote=False, on_top=False): if update_labels: self.info_changed.emit(response['results']) - self.query_complete.emit() self.loaded = True + self.query_complete.emit() return True @@ -360,7 +363,7 @@ def __init__( self.endpoint_url_override = endpoint_url # Load the initial batch of entries - self.perform_query() + self.perform_initial_query() @property def edit_enabled(self): @@ -548,8 +551,12 @@ def perform_query(self, **kwargs): class SearchResultsModel(ChannelContentModel): - pass + def perform_initial_query(self): + return self.perform_query(first=1, last=200) + @property + def all_local_entries_loaded(self): + return self.loaded class PopularTorrentsModel(ChannelContentModel): columns_shown = (Column.CATEGORY, Column.NAME, Column.SIZE, Column.UPDATED) From e811887c77a3bba64f39f54fdfefa7ce0084849a Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 15 Jun 2022 16:48:50 +0200 Subject: [PATCH 05/32] Grouping of local similar items --- .../gui/widgets/tablecontentdelegate.py | 12 +++++--- src/tribler/gui/widgets/tablecontentmodel.py | 28 +++++++++++++++---- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/tribler/gui/widgets/tablecontentdelegate.py b/src/tribler/gui/widgets/tablecontentdelegate.py index c9fd42ce1e7..b88e302d6d0 100644 --- a/src/tribler/gui/widgets/tablecontentdelegate.py +++ b/src/tribler/gui/widgets/tablecontentdelegate.py @@ -372,9 +372,13 @@ class TagsMixin: edit_tags_icon = QIcon(get_image_path("edit_white.png")) edit_tags_icon_hover = QIcon(get_image_path("edit_orange.png")) - def draw_title_and_tags( - self, painter: QPainter, option: QStyleOptionViewItem, index: QModelIndex, data_item: Dict - ) -> None: + def draw_title_and_tags(self, painter: QPainter, option: QStyleOptionViewItem, index: QModelIndex, + data_item: Dict) -> None: + item_name = data_item["name"] + group = data_item.get("group") + if group: + plural = len(group) > 1 + item_name += f" (and 
{len(group)} similar item{'s' if plural else ''})" painter.setRenderHint(QPainter.Antialiasing, True) title_text_pos = option.rect.topLeft() title_text_height = 60 if data_item["type"] == SNIPPET else 28 @@ -391,7 +395,7 @@ def draw_title_and_tags( painter.drawText( QRectF(title_text_x, title_text_y, option.rect.width() - 6, title_text_height), Qt.AlignVCenter, - data_item["name"], + item_name, ) if data_item["type"] == SNIPPET: diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index 0eaf91a6291..ef8b4f7ea14 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -108,6 +108,8 @@ def __init__(self, parent=None): self.saved_scroll_state = None self.qt_object_destroyed = False + self.group_by_name = False + connect(self.destroyed, self.on_destroy) # Every remote query must be attributed to its specific model to avoid updating wrong models # on receiving a result. We achieve this by maintaining a set of in-flight remote queries. @@ -177,14 +179,26 @@ def add_items(self, new_items, on_top=False, remote=False): # Only add unique items to the table model and reverse mapping from unique ids to rows is built. 
insert_index = 0 if on_top else len(self.data_items) unique_new_items = [] + name_mapping = {item['name']: item for item in self.data_items} if self.group_by_name else {} for item in new_items: item_uid = get_item_uid(item) if item_uid not in self.item_uid_map: - self.item_uid_map[item_uid] = insert_index - if 'infohash' in item: - self.item_uid_map[item['infohash']] = insert_index - unique_new_items.append(item) - insert_index += 1 + + prev_item = name_mapping.get(item['name']) + if self.group_by_name and prev_item is not None and not on_top: + group = prev_item.setdefault('group', {}) + if item_uid not in group: + group[item_uid] = item + else: + self.item_uid_map[item_uid] = insert_index + if 'infohash' in item: + self.item_uid_map[item['infohash']] = insert_index + unique_new_items.append(item) + + if self.group_by_name and item['type'] == REGULAR_TORRENT and prev_item is None: + name_mapping[item['name']] = item + + insert_index += 1 # If no new items are found, skip if not unique_new_items: @@ -551,6 +565,10 @@ def perform_query(self, **kwargs): class SearchResultsModel(ChannelContentModel): + def __init__(self,**kwargs): + super().__init__(**kwargs) + self.group_by_name = True + def perform_initial_query(self): return self.perform_query(first=1, last=200) From f9028dcbf5bd7d83528cb6e84e271fb4eb09423a Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 15 Jun 2022 14:32:08 +0200 Subject: [PATCH 06/32] Increase max_query_peers to 20 --- .../metadata_store/remote_query_community/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tribler/core/components/metadata_store/remote_query_community/settings.py b/src/tribler/core/components/metadata_store/remote_query_community/settings.py index ea380da426e..66ee1737047 100644 --- a/src/tribler/core/components/metadata_store/remote_query_community/settings.py +++ b/src/tribler/core/components/metadata_store/remote_query_community/settings.py @@ -5,7 +5,7 @@ class 
RemoteQueryCommunitySettings(TriblerConfigSection): minimal_blob_size: int = 200 maximum_payload_size: int = 1300 max_entries: int = maximum_payload_size // minimal_blob_size - max_query_peers: int = 5 + max_query_peers: int = 20 max_response_size: int = 100 # Max number of entries returned by SQL query max_channel_query_back: int = 4 # Max number of entries to query back on receiving an unknown channel push_updates_back_enabled = True From 4ae1d4806b2651a72bbe18486bf8699947f4e988 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 15 Jun 2022 15:01:59 +0200 Subject: [PATCH 07/32] Receive & display remote search results --- src/tribler/core/utilities/search_utils.py | 9 +++ src/tribler/gui/tests/test_gui.py | 3 +- .../gui/widgets/channelcontentswidget.py | 10 +++ .../gui/widgets/searchresultswidget.py | 30 +++---- .../gui/widgets/tablecontentdelegate.py | 13 +++- src/tribler/gui/widgets/tablecontentmodel.py | 78 +++++++++++++++---- 6 files changed, 111 insertions(+), 32 deletions(-) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 6cde9913e77..d7b4ba59f15 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -4,6 +4,7 @@ Author(s): Jelle Roozenburg, Arno Bakker, Alexander Kozlovsky """ import re +import time from collections import deque from typing import Deque, List, Optional, Tuple @@ -33,6 +34,14 @@ def filter_keywords(keywords): return [kw for kw in keywords if len(kw) > 0 and kw not in DIALOG_STOPWORDS] +def item_rank(query: str, item: dict): + title = item['name'] + seeders = item.get('num_seeders', 0) + leechers = item.get('num_leechers', 0) + freshness = time.time() - item.get('updated', 0) + return torrent_rank(query, title, seeders + leechers * 0.1, freshness) + + def torrent_rank(query: str, title: str, seeders: int = 0, freshness: Optional[float] = 0) -> float: """ Calculates search rank for a torrent. 
diff --git a/src/tribler/gui/tests/test_gui.py b/src/tribler/gui/tests/test_gui.py index ad057b8b787..cb6450e50a0 100644 --- a/src/tribler/gui/tests/test_gui.py +++ b/src/tribler/gui/tests/test_gui.py @@ -384,9 +384,8 @@ def test_search_suggestions(window): def test_search(window): window.top_search_bar.setText("a") # This is likely to trigger some search results QTest.keyClick(window.top_search_bar, Qt.Key_Enter) - wait_for_variable(window, "search_results_page.search_request") + QTest.qWait(100) screenshot(window, name="search_loading_page") - QTest.mouseClick(window.search_results_page.show_results_button, Qt.LeftButton) tst_channels_widget( window, window.search_results_page.results_page, diff --git a/src/tribler/gui/widgets/channelcontentswidget.py b/src/tribler/gui/widgets/channelcontentswidget.py index 34bebc5824a..48caf90df29 100644 --- a/src/tribler/gui/widgets/channelcontentswidget.py +++ b/src/tribler/gui/widgets/channelcontentswidget.py @@ -318,6 +318,16 @@ def on_breadcrumb_clicked(self, tgt_level): # Reset the view if the user clicks on the last part of the breadcrumb self.reset_view() + def format_search_title(self): + self.channel_name_label.setTextFormat(Qt.RichText) + text = self.format_link(self.model.format_title()) + self.channel_name_label.setText(text) + self.channel_name_label.setTextInteractionFlags(Qt.TextBrowserInteraction) + self.channel_name_label.setFocusPolicy(Qt.NoFocus) + + def format_link(self, text): + return f'{text}' + def _set_filter_controls_from_model(self): # This should typically be called under freeze_controls context manager content_category = ContentCategories.get(self.model.category_filter) diff --git a/src/tribler/gui/widgets/searchresultswidget.py b/src/tribler/gui/widgets/searchresultswidget.py index 2d8a44c0653..8df0c463c20 100644 --- a/src/tribler/gui/widgets/searchresultswidget.py +++ b/src/tribler/gui/widgets/searchresultswidget.py @@ -92,23 +92,27 @@ def search(self, query: Query) -> bool: 
self.results_page.initialize_root_model( SearchResultsModel( - channel_info={ - "name": (tr("Search results for %s") % query.original_query) - if len(query.original_query) < 50 - else f"{query.original_query[:50]}..." - }, endpoint_url="search", hide_xxx=self.results_page.hide_xxx, + original_query=query.original_query, text_filter=to_fts_query(query.fts_text), tags=list(query.tags), type_filter=[REGULAR_TORRENT, CHANNEL_TORRENT, COLLECTION_NODE], ) ) self.setCurrentWidget(self.results_page) + self.results_page.format_search_title() # After transitioning to the page with search results, we refresh the viewport since some rows might have been # rendered already with an incorrect row height. self.results_page.run_brain_dead_refresh() + + def register_request(response): + self.search_request = SearchRequest(response["request_uuid"], query, set(response["peers"])) + + params = {'txt_filter': fts_query, 'hide_xxx': self.hide_xxx, 'tags': list(query.tags)} + TriblerNetworkRequest('remote_query', register_request, method="PUT", url_params=params) + return True def reset(self): @@ -116,15 +120,13 @@ def reset(self): self.results_page.go_back_to_level(0) def update_loading_page(self, remote_results): - if ( - not self.search_request - or remote_results.get("uuid") != self.search_request.uuid - or self.currentWidget() == self.results_page - ): + if not self.search_request or self.search_request.uuid != remote_results.get("uuid"): return + peer = remote_results["peer"] + results = remote_results.get("results", []) self.search_request.peers_complete.add(peer) - self.search_request.remote_results.append(remote_results.get("results", [])) - self.state_label.setText(format_search_loading_label(self.search_request)) - if self.search_request.complete: - self.show_results() + self.search_request.remote_results.append(results) + + self.results_page.model.on_remote_results(results) + self.results_page.format_search_title() diff --git a/src/tribler/gui/widgets/tablecontentdelegate.py 
b/src/tribler/gui/widgets/tablecontentdelegate.py index b88e302d6d0..2979bd6a5d2 100644 --- a/src/tribler/gui/widgets/tablecontentdelegate.py +++ b/src/tribler/gui/widgets/tablecontentdelegate.py @@ -374,11 +374,20 @@ class TagsMixin: def draw_title_and_tags(self, painter: QPainter, option: QStyleOptionViewItem, index: QModelIndex, data_item: Dict) -> None: + debug = False # change to True to see the search rank of items and to highlight remote items item_name = data_item["name"] + group = data_item.get("group") if group: - plural = len(group) > 1 - item_name += f" (and {len(group)} similar item{'s' if plural else ''})" + has_remote_items = any(group_item.get('remote') for group_item in group.values()) + item_name += f" (+ {len(group)} similar{' *' if debug and has_remote_items else ''})" + + if debug: + rank = data_item.get("rank") + if rank is not None: + item_name += f' rank: {rank:.6}' + if data_item.get('remote'): + item_name = '* ' + item_name painter.setRenderHint(QPainter.Antialiasing, True) title_text_pos = option.rect.topLeft() title_text_height = 60 if data_item["type"] == SNIPPET else 28 diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index ef8b4f7ea14..bbc284d2bf4 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -10,6 +10,7 @@ from tribler.core.components.metadata_store.db.orm_bindings.channel_node import NEW from tribler.core.components.metadata_store.db.serialization import CHANNEL_TORRENT, COLLECTION_NODE, REGULAR_TORRENT, \ SNIPPET +from tribler.core.utilities.search_utils import item_rank from tribler.core.utilities.simpledefs import CHANNELS_VIEW_UUID, CHANNEL_STATE from tribler.core.utilities.utilities import to_fts_query @@ -98,7 +99,6 @@ def __init__(self, parent=None): self.columns_dict = define_columns() self.data_items = [] - self.remote_items = [] self.max_rowid = None self.local_total = None self.item_load_batch = 50 @@ 
-109,6 +109,8 @@ def __init__(self, parent=None): self.qt_object_destroyed = False self.group_by_name = False + self.sort_by_rank = False + self.text_filter = '' connect(self.destroyed, self.on_destroy) # Every remote query must be attributed to its specific model to avoid updating wrong models @@ -140,7 +142,6 @@ def reset(self): self.beginResetModel() self.loaded = False self.data_items = [] - self.remote_items = [] self.max_rowid = None self.local_total = None self.item_uid_map = {} @@ -170,10 +171,6 @@ def add_items(self, new_items, on_top=False, remote=False): if not new_items: return - if remote and not self.all_local_entries_loaded: - self.remote_items.extend(new_items) - return - # Note: If we want to block the signal like itemChanged, we must use QSignalBlocker object or blockSignals # Only add unique items to the table model and reverse mapping from unique ids to rows is built. @@ -181,6 +178,12 @@ def add_items(self, new_items, on_top=False, remote=False): unique_new_items = [] name_mapping = {item['name']: item for item in self.data_items} if self.group_by_name else {} for item in new_items: + if remote: + item['remote'] = True + if self.sort_by_rank: + if 'rank' not in item: + item['rank'] = item_rank(self.text_filter, item) + item_uid = get_item_uid(item) if item_uid not in self.item_uid_map: @@ -204,6 +207,21 @@ def add_items(self, new_items, on_top=False, remote=False): if not unique_new_items: return + if remote and self.sort_by_rank: + new_data_items = self.data_items + unique_new_items + new_data_items.sort(key = lambda item: item['rank'], reverse=True) + new_item_uid_map = {} + for item in new_data_items: + item_uid = get_item_uid(item) + new_item_uid_map[item_uid] = insert_index + if 'infohash' in item: + new_item_uid_map[item['infohash']] = insert_index + self.beginResetModel() + self.data_items = new_data_items + self.item_uid_map = new_item_uid_map + self.endResetModel() + return + # Else if remote items, to make space for new unique items 
shift the existing items if on_top and insert_index > 0: new_items_map = {} @@ -225,11 +243,6 @@ def add_items(self, new_items, on_top=False, remote=False): self.data_items.extend(unique_new_items) self.endInsertRows() - if self.all_local_entries_loaded: - remote_items = self.remote_items - self.remote_items = [] - self.add_items(remote_items, remote=True) # to filter non-unique entries - def remove_items(self, items): uids_to_remove = [] rows_to_remove = [] @@ -321,7 +334,7 @@ def on_query_results(self, response, remote=False, on_top=False): if not remote: if "total" in response: self.local_total = response["total"] - self.channel_info["total"] = self.local_total + len(self.remote_items) + self.channel_info["total"] = self.local_total elif self.channel_info.get("total"): self.channel_info["total"] += len(response["results"]) @@ -565,9 +578,23 @@ def perform_query(self, **kwargs): class SearchResultsModel(ChannelContentModel): - def __init__(self,**kwargs): - super().__init__(**kwargs) + def __init__(self, original_query, **kwargs): + self.original_query = original_query + self.remote_results = {} + title = self.format_title() + super().__init__(channel_info={"name": title}, **kwargs) + self.remote_results_received = False + self.postponed_remote_results = [] self.group_by_name = True + self.sort_by_rank = True + + def format_title(self): + q = self.original_query + q = q if len(q) < 50 else q[:50] + '...' 
+ title = f'Search results for {q}' + if self.remote_results: + title += f' (click to add {len(self.remote_results)} remote results)' + return title def perform_initial_query(self): return self.perform_query(first=1, last=200) @@ -576,6 +603,29 @@ def perform_initial_query(self): def all_local_entries_loaded(self): return self.loaded + def on_remote_results(self, results): + self.remote_results_received = True + if not self.all_local_entries_loaded: + self.postponed_remote_results.extend(results) + return + + results = self.postponed_remote_results + results + self.postponed_remote_results = [] + for item in results: + uid = get_item_uid(item) + if uid not in self.item_uid_map and uid not in self.remote_results: + self.remote_results[uid] = item + + def show_remote_results(self): + if not self.all_local_entries_loaded: + return + + remote_items = list(self.remote_results.values()) + self.remote_results.clear() + self.remote_results_received = False + self.add_items(remote_items, remote=True) + + class PopularTorrentsModel(ChannelContentModel): columns_shown = (Column.CATEGORY, Column.NAME, Column.SIZE, Column.UPDATED) From 8e38f0d616b0a792517850e8e5b275dce6835767 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Thu, 20 Oct 2022 05:50:02 +0200 Subject: [PATCH 08/32] Add compatibility with snippets --- src/tribler/gui/widgets/tablecontentmodel.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index bbc284d2bf4..565f1592944 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -188,7 +188,7 @@ def add_items(self, new_items, on_top=False, remote=False): if item_uid not in self.item_uid_map: prev_item = name_mapping.get(item['name']) - if self.group_by_name and prev_item is not None and not on_top: + if self.group_by_name and prev_item is not None and not on_top and 
prev_item['type'] == REGULAR_TORRENT: group = prev_item.setdefault('group', {}) if item_uid not in group: group[item_uid] = item @@ -208,8 +208,18 @@ def add_items(self, new_items, on_top=False, remote=False): return if remote and self.sort_by_rank: - new_data_items = self.data_items + unique_new_items - new_data_items.sort(key = lambda item: item['rank'], reverse=True) + torrents = [item for item in self.data_items if item['type'] == REGULAR_TORRENT] + non_torrents = [item for item in self.data_items if item['type'] != REGULAR_TORRENT] + + new_torrents = [item for item in unique_new_items if item['type'] == REGULAR_TORRENT] + new_non_torrents = [item for item in unique_new_items if item['type'] != REGULAR_TORRENT] + + torrents += new_torrents + non_torrents += new_non_torrents + + torrents.sort(key = lambda item: item['rank'], reverse=True) + new_data_items = non_torrents + torrents + new_item_uid_map = {} for item in new_data_items: item_uid = get_item_uid(item) From 58c889607a6bec37b22c478b90383903cc827667 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Mon, 20 Jun 2022 14:06:11 +0200 Subject: [PATCH 09/32] Log search query execution time --- .../metadata_store/restapi/search_endpoint.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/tribler/core/components/metadata_store/restapi/search_endpoint.py b/src/tribler/core/components/metadata_store/restapi/search_endpoint.py index e0710d53db0..a7a1963871f 100644 --- a/src/tribler/core/components/metadata_store/restapi/search_endpoint.py +++ b/src/tribler/core/components/metadata_store/restapi/search_endpoint.py @@ -1,3 +1,4 @@ +import time from collections import defaultdict from typing import Dict, List @@ -119,13 +120,27 @@ async def search(self, request): def search_db(): with db_session: + t1 = time.time() pony_query = mds.get_entries(**sanitized) + t2 = time.time() search_results = [r.to_simple_dict() for r in pony_query] + t3 = time.time() if include_total: total = 
mds.get_total_count(**sanitized) + t4 = time.time() max_rowid = mds.get_max_rowid() + t5 = time.time() + self._logger.info(f'Search performance for {sanitized}:\n' + f'Main query executed in {t2 - t1:.6} seconds;\n' + f'Result constructed in {t3 - t2:.6} seconds;\n' + f'Total rows count calculated in {t4 - t3:.6} seconds;\n' + f'Max rowid determined in {t5 - t4:.6} seconds.') else: total = max_rowid = None + self._logger.info(f'Search performance for {sanitized}:\n' + f'Main query executed in {t2 - t1:.6} seconds;\n' + f'Result constructed in {t3 - t2:.6} seconds.') + return search_results, total, max_rowid try: From 143219051b9f121a9ae2de47de2fe1df9b76b250 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 28 Sep 2022 13:39:37 +0200 Subject: [PATCH 10/32] Widget renaming `results_page` -> `results_page_content` as a preparation to wrap widget to another widget named results_page --- .../gui/qt_resources/search_results.ui | 2 +- src/tribler/gui/tests/test_gui.py | 3 +-- .../gui/widgets/searchresultswidget.py | 22 +++++++++---------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/tribler/gui/qt_resources/search_results.ui b/src/tribler/gui/qt_resources/search_results.ui index 7ef8639c575..f66c456f1f8 100644 --- a/src/tribler/gui/qt_resources/search_results.ui +++ b/src/tribler/gui/qt_resources/search_results.ui @@ -110,7 +110,7 @@ - + diff --git a/src/tribler/gui/tests/test_gui.py b/src/tribler/gui/tests/test_gui.py index cb6450e50a0..622c622a64d 100644 --- a/src/tribler/gui/tests/test_gui.py +++ b/src/tribler/gui/tests/test_gui.py @@ -388,14 +388,13 @@ def test_search(window): screenshot(window, name="search_loading_page") tst_channels_widget( window, - window.search_results_page.results_page, + window.search_results_page.results_page_content, "search_results", sort_column=2, test_filter=False, test_subscribe=False, ) - @pytest.mark.guitest def test_add_download_url(window): go_to_and_wait_for_downloads(window) diff --git 
a/src/tribler/gui/widgets/searchresultswidget.py b/src/tribler/gui/widgets/searchresultswidget.py index 8df0c463c20..bef190a8453 100644 --- a/src/tribler/gui/widgets/searchresultswidget.py +++ b/src/tribler/gui/widgets/searchresultswidget.py @@ -62,8 +62,8 @@ def __init__(self, parent=None): def initialize(self, hide_xxx=False): self.hide_xxx = hide_xxx - self.results_page.initialize_content_page(hide_xxx=hide_xxx) - self.results_page.channel_torrents_filter_input.setHidden(True) + self.results_page_content.initialize_content_page(hide_xxx=hide_xxx) + self.results_page_content.channel_torrents_filter_input.setHidden(True) @property def has_results(self): @@ -90,22 +90,22 @@ def search(self, query: Query) -> bool: self.last_search_query = query.original_query self.last_search_time = time.time() - self.results_page.initialize_root_model( + self.results_page_content.initialize_root_model( SearchResultsModel( endpoint_url="search", - hide_xxx=self.results_page.hide_xxx, + hide_xxx=self.results_page_content.hide_xxx, original_query=query.original_query, text_filter=to_fts_query(query.fts_text), tags=list(query.tags), type_filter=[REGULAR_TORRENT, CHANNEL_TORRENT, COLLECTION_NODE], ) ) - self.setCurrentWidget(self.results_page) - self.results_page.format_search_title() + self.setCurrentWidget(self.results_page_content) + self.results_page_content.format_search_title() # After transitioning to the page with search results, we refresh the viewport since some rows might have been # rendered already with an incorrect row height. 
- self.results_page.run_brain_dead_refresh() + self.results_page_content.run_brain_dead_refresh() def register_request(response): self.search_request = SearchRequest(response["request_uuid"], query, set(response["peers"])) @@ -116,8 +116,8 @@ def register_request(response): return True def reset(self): - if self.currentWidget() == self.results_page: - self.results_page.go_back_to_level(0) + if self.currentWidget() == self.results_page_content: + self.results_page_content.go_back_to_level(0) def update_loading_page(self, remote_results): if not self.search_request or self.search_request.uuid != remote_results.get("uuid"): @@ -128,5 +128,5 @@ def update_loading_page(self, remote_results): self.search_request.peers_complete.add(peer) self.search_request.remote_results.append(results) - self.results_page.model.on_remote_results(results) - self.results_page.format_search_title() + self.results_page_content.model.on_remote_results(results) + self.results_page_content.format_search_title() From 63967c778984966832d7d54803eb07e72f321071 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Thu, 22 Sep 2022 08:35:04 +0200 Subject: [PATCH 11/32] Add a progress bar to the search page --- .../gui/qt_resources/search_results.ui | 39 ++++++- .../gui/widgets/channelcontentswidget.py | 23 ++-- .../gui/widgets/search_progress_bar.py | 104 ++++++++++++++++++ .../gui/widgets/searchresultswidget.py | 40 ++++--- src/tribler/gui/widgets/tablecontentmodel.py | 19 ++-- 5 files changed, 195 insertions(+), 30 deletions(-) create mode 100644 src/tribler/gui/widgets/search_progress_bar.py diff --git a/src/tribler/gui/qt_resources/search_results.ui b/src/tribler/gui/qt_resources/search_results.ui index f66c456f1f8..c610b6c2063 100644 --- a/src/tribler/gui/qt_resources/search_results.ui +++ b/src/tribler/gui/qt_resources/search_results.ui @@ -110,7 +110,38 @@ - + + + + + + + 0 + 0 + + + + + 16777215 + 10 + + + + 0 + + + Qt::AlignLeading|Qt::AlignLeft|Qt::AlignVCenter + + + + + + + + + + + @@ 
-129,6 +160,12 @@
tribler.gui.widgets.timeoutprogressbar.h
1
+ + SearchProgressBar + QProgressBar +
tribler.gui.widgets.search_progress_bar.h
+ 1 +
diff --git a/src/tribler/gui/widgets/channelcontentswidget.py b/src/tribler/gui/widgets/channelcontentswidget.py index 48caf90df29..2d150fc7413 100644 --- a/src/tribler/gui/widgets/channelcontentswidget.py +++ b/src/tribler/gui/widgets/channelcontentswidget.py @@ -1,7 +1,7 @@ from base64 import b64encode from PyQt5 import uic -from PyQt5.QtCore import QDir, QTimer, Qt +from PyQt5.QtCore import QDir, QTimer, Qt, pyqtSignal from PyQt5.QtGui import QIcon from PyQt5.QtWidgets import QAction, QFileDialog @@ -39,6 +39,9 @@ # pylint: disable=too-many-instance-attributes, too-many-public-methods class ChannelContentsWidget(AddBreadcrumbOnShowMixin, widget_form, widget_class): + + model_query_completed = pyqtSignal() + def __init__(self, parent=None): widget_class.__init__(self, parent=parent) @@ -110,6 +113,10 @@ def personal_channel_model(self): def model(self): return self.channels_stack[-1] if self.channels_stack else None + @property + def root_model(self): + return self.channels_stack[0] if self.channels_stack else None + def on_channel_committed(self, response): if not response or not response.get("success", False): return @@ -260,6 +267,9 @@ def on_model_info_changed(self, changed_entries): self.model.channel_info["dirty"] = dirty self.update_labels() + def on_model_query_completed(self): + self.model_query_completed.emit() + def initialize_root_model_from_channel_info(self, channel_info): if channel_info.get("state") == CHANNEL_STATE.PERSONAL.value: self.default_channel_model = self.personal_channel_model @@ -293,10 +303,13 @@ def reset_view(self, text_filter=None, category_filter=None): def disconnect_current_model(self): disconnect(self.window().core_manager.events_manager.node_info_updated, self.model.update_node_info) disconnect(self.model.info_changed, self.on_model_info_changed) + disconnect(self.model.query_complete, self.on_model_query_completed) + self.controller.unset_model() # Disconnect the selectionChanged signal def connect_current_model(self): 
connect(self.model.info_changed, self.on_model_info_changed) + connect(self.model.query_complete, self.on_model_query_completed) connect(self.window().core_manager.events_manager.node_info_updated, self.model.update_node_info) @property @@ -319,14 +332,8 @@ def on_breadcrumb_clicked(self, tgt_level): self.reset_view() def format_search_title(self): - self.channel_name_label.setTextFormat(Qt.RichText) - text = self.format_link(self.model.format_title()) + text = self.model.format_title() self.channel_name_label.setText(text) - self.channel_name_label.setTextInteractionFlags(Qt.TextBrowserInteraction) - self.channel_name_label.setFocusPolicy(Qt.NoFocus) - - def format_link(self, text): - return f'{text}' def _set_filter_controls_from_model(self): # This should typically be called under freeze_controls context manager diff --git a/src/tribler/gui/widgets/search_progress_bar.py b/src/tribler/gui/widgets/search_progress_bar.py new file mode 100644 index 00000000000..b6747cae675 --- /dev/null +++ b/src/tribler/gui/widgets/search_progress_bar.py @@ -0,0 +1,104 @@ +import time + +from PyQt5.QtCore import QTimer, pyqtSignal +from PyQt5.QtWidgets import QProgressBar + +from tribler.gui.utilities import connect + +MAX_VALUE = 10000 +UPDATE_DELAY = 0.5 +REMOTE_DELAY = 0.25 + + +class SearchProgressBar(QProgressBar): + ready_to_update_results = pyqtSignal() + + def __init__(self, parent=None, timeout=20): + super().__init__(parent) + self.timeout_interval = timeout + self.timer = QTimer() + self.timer.setSingleShot(False) + self.timer.setInterval(100) # update the progress bar tick + + self.start_time = None + self.last_update_time = None + self.last_remote_result_time = None + self.has_new_remote_results = False + self.peers_total = 0 + self.peers_responded = 0 + self.new_remote_items_count = 0 + self.total_remote_items_count = 0 + + self._value = 0 + self.setValue(0) + self.setMaximum(MAX_VALUE) + + connect(self.timer.timeout, self._update) + + def start(self): + t = 
time.time() + self.start_time = t + self.peers_total = 0 + self.peers_responded = 0 + self.setToolTip('') + self.setValue(0) + self.timer.start() + self.show() + + def _update(self): + if self.start_time is None: + return + + t = time.time() + + time_progress = (t - self.start_time) / self.timeout_interval + response_progress = (self.peers_responded / self.peers_total) if self.peers_total else 0 + scale = 1 - ((1 - time_progress) * (1 - response_progress)) ** 2 + value = int(scale * MAX_VALUE) + self.setValue(value) + + timeout = time_progress >= 1 + most_peers_responded = self.peers_total > 0 and self.peers_responded / self.peers_total >= 0.8 + active_transfers_finished = self.last_remote_result_time and t - self.last_remote_result_time > REMOTE_DELAY + + should_stop = timeout or (most_peers_responded and active_transfers_finished) + + if self.last_update_time is not None and self.has_new_remote_results \ + and (t - self.last_update_time > UPDATE_DELAY and active_transfers_finished or should_stop): + self.last_update_time = t + self.has_new_remote_results = False + self.new_remote_items_count = 0 + self.ready_to_update_results.emit() + + if should_stop: + self.stop() + + def stop(self): + self.start_time = None + self.timer.stop() + self.hide() + + def mousePressEvent(self, _): + self.stop() + + def on_local_results(self): + self.last_update_time = time.time() + self.has_new_remote_results = False + self._update() + + def set_remote_total(self, total: int): + self.peers_total = total + self.setToolTip(f'0/{total} remote responded') + self._update() + + def on_remote_results(self, new_items_count, peers_responded): + self.last_remote_result_time = time.time() + tool_tip = f'{peers_responded}/{self.peers_total} peers responded' + if self.total_remote_items_count: + tool_tip += f', {self.total_remote_items_count} new results' + self.setToolTip(tool_tip) + self.has_new_remote_results = True + self.new_remote_items_count += new_items_count + 
self.total_remote_items_count += new_items_count + self.peers_responded = peers_responded + self._update() diff --git a/src/tribler/gui/widgets/searchresultswidget.py b/src/tribler/gui/widgets/searchresultswidget.py index bef190a8453..b6ddb990ea3 100644 --- a/src/tribler/gui/widgets/searchresultswidget.py +++ b/src/tribler/gui/widgets/searchresultswidget.py @@ -60,6 +60,9 @@ def __init__(self, parent=None): self.hide_xxx = None self.search_request = None + connect(self.results_page_content.model_query_completed, self.on_local_query_completed) + connect(self.search_progress_bar.ready_to_update_results, self.on_ready_to_update_results) + def initialize(self, hide_xxx=False): self.hide_xxx = hide_xxx self.results_page_content.initialize_content_page(hide_xxx=hide_xxx) @@ -90,33 +93,38 @@ def search(self, query: Query) -> bool: self.last_search_query = query.original_query self.last_search_time = time.time() - self.results_page_content.initialize_root_model( - SearchResultsModel( - endpoint_url="search", - hide_xxx=self.results_page_content.hide_xxx, - original_query=query.original_query, - text_filter=to_fts_query(query.fts_text), - tags=list(query.tags), - type_filter=[REGULAR_TORRENT, CHANNEL_TORRENT, COLLECTION_NODE], - ) + model = SearchResultsModel( + endpoint_url="search", + hide_xxx=self.results_page_content.hide_xxx, + original_query=query.original_query, + text_filter=to_fts_query(query.fts_text), + tags=list(query.tags), + type_filter=[REGULAR_TORRENT, CHANNEL_TORRENT, COLLECTION_NODE], ) - self.setCurrentWidget(self.results_page_content) + self.results_page_content.initialize_root_model(model) + self.setCurrentWidget(self.results_page) self.results_page_content.format_search_title() + self.search_progress_bar.start() # After transitioning to the page with search results, we refresh the viewport since some rows might have been # rendered already with an incorrect row height. 
self.results_page_content.run_brain_dead_refresh() def register_request(response): - self.search_request = SearchRequest(response["request_uuid"], query, set(response["peers"])) + peers = set(response["peers"]) + self.search_request = SearchRequest(response["request_uuid"], query, peers) + self.search_progress_bar.set_remote_total(len(peers)) params = {'txt_filter': fts_query, 'hide_xxx': self.hide_xxx, 'tags': list(query.tags)} TriblerNetworkRequest('remote_query', register_request, method="PUT", url_params=params) return True + def on_local_query_completed(self): + self.search_progress_bar.on_local_results() + def reset(self): - if self.currentWidget() == self.results_page_content: + if self.currentWidget() == self.results_page: self.results_page_content.go_back_to_level(0) def update_loading_page(self, remote_results): @@ -125,8 +133,12 @@ def update_loading_page(self, remote_results): peer = remote_results["peer"] results = remote_results.get("results", []) + self.search_request.peers_complete.add(peer) self.search_request.remote_results.append(results) - self.results_page_content.model.on_remote_results(results) - self.results_page_content.format_search_title() + new_items = self.results_page_content.model.add_remote_results(results) + self.search_progress_bar.on_remote_results(len(new_items), len(self.search_request.peers_complete)) + + def on_ready_to_update_results(self): + self.results_page_content.root_model.show_remote_results() diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index 565f1592944..43160440791 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -601,30 +601,35 @@ def __init__(self, original_query, **kwargs): def format_title(self): q = self.original_query q = q if len(q) < 50 else q[:50] + '...' 
- title = f'Search results for {q}' - if self.remote_results: - title += f' (click to add {len(self.remote_results)} remote results)' - return title + return f'Search results for {q}' def perform_initial_query(self): return self.perform_query(first=1, last=200) + def on_query_results(self, response, remote=False, on_top=False): + super().on_query_results(response, remote=remote, on_top=on_top) + self.add_remote_results([]) # to trigger adding postponed results + self.show_remote_results() + @property def all_local_entries_loaded(self): return self.loaded - def on_remote_results(self, results): - self.remote_results_received = True + def add_remote_results(self, results): if not self.all_local_entries_loaded: self.postponed_remote_results.extend(results) - return + return [] results = self.postponed_remote_results + results self.postponed_remote_results = [] + new_items = [] for item in results: uid = get_item_uid(item) if uid not in self.item_uid_map and uid not in self.remote_results: + self.remote_results_received = True + new_items.append(item) self.remote_results[uid] = item + return new_items def show_remote_results(self): if not self.all_local_entries_loaded: From 5530a491942ea64fc26cdce3d7f89204853b1f78 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Thu, 29 Sep 2022 23:23:58 +0200 Subject: [PATCH 12/32] Show only non-deleted regular torrents in search results list --- src/tribler/gui/widgets/searchresultswidget.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tribler/gui/widgets/searchresultswidget.py b/src/tribler/gui/widgets/searchresultswidget.py index b6ddb990ea3..5eb972d0696 100644 --- a/src/tribler/gui/widgets/searchresultswidget.py +++ b/src/tribler/gui/widgets/searchresultswidget.py @@ -99,7 +99,8 @@ def search(self, query: Query) -> bool: original_query=query.original_query, text_filter=to_fts_query(query.fts_text), tags=list(query.tags), - type_filter=[REGULAR_TORRENT, CHANNEL_TORRENT, COLLECTION_NODE], + 
type_filter=[REGULAR_TORRENT], + exclude_deleted=True, ) self.results_page_content.initialize_root_model(model) self.setCurrentWidget(self.results_page) @@ -115,7 +116,8 @@ def register_request(response): self.search_request = SearchRequest(response["request_uuid"], query, peers) self.search_progress_bar.set_remote_total(len(peers)) - params = {'txt_filter': fts_query, 'hide_xxx': self.hide_xxx, 'tags': list(query.tags)} + params = {'txt_filter': fts_query, 'hide_xxx': self.hide_xxx, 'tags': list(query.tags), + 'metadata_type': REGULAR_TORRENT, 'exclude_deleted': True} TriblerNetworkRequest('remote_query', register_request, method="PUT", url_params=params) return True From bb5ddc6765ff98e2e07ba54635387c6c50a45923 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Fri, 30 Sep 2022 07:33:38 +0200 Subject: [PATCH 13/32] Add tests for better coverage --- src/tribler/core/tests/test_search_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index 703d8f060aa..5b85b561fe6 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -1,6 +1,7 @@ import pytest -from tribler.core.utilities.search_utils import filter_keywords, split_into_keywords, torrent_rank +from tribler.core.utilities.search_utils import filter_keywords, item_rank, split_into_keywords, torrent_rank, \ + title_rank DAY = 60 * 60 * 24 @@ -89,3 +90,14 @@ def test_torrent_rank(): assert torrent_rank("Internet's Own Boy", "Internet's Own Boy") == pytest.approx(0.81) assert torrent_rank("Internet's Own Boy", "Internet's very Own Boy") == pytest.approx(0.75099337) assert torrent_rank("Internet's Own Boy", "Internet's very special Boy person") == pytest.approx(0.4353166986) + + +def test_title_rank(): + # tests for better coverage of corner cases + assert title_rank("", "title") == pytest.approx(1.0) + assert title_rank("query", "")
== pytest.approx(0.0) + + +def test_item_rank(): + item = dict(name="abc", num_seeders=10, num_leechers=20) + assert item_rank("abc", item) == pytest.approx(0.81978445) From de8a7492d919a7d72a270cb6b6a6623b0204d8bd Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 11 Oct 2022 07:00:33 +0200 Subject: [PATCH 14/32] Use relative comparison of ranking results in test_torrent_rank instead of hardcoded values --- src/tribler/core/tests/test_search_utils.py | 120 ++++++++++++++------ 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index 5b85b561fe6..c0e8055a60e 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -27,69 +27,115 @@ def test_torrent_rank(): query = 'Big Buck Bunny' # The exact match ranked as pretty high - assert torrent_rank(query, 'Big Buck Bunny') == pytest.approx(0.81) + + r1 = torrent_rank(query, 'Big Buck Bunny') # 0.81 + assert r1 > 0.8 # Seeders are good for the rank - assert torrent_rank(query, 'Big Buck Bunny', seeders=100, freshness=100 * DAY) == pytest.approx(0.876923) + + r2 = torrent_rank(query, 'Big Buck Bunny', seeders=100, freshness=100 * DAY) # 0.876923 # The more seeders the better - assert torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=100 * DAY) == pytest.approx(0.9146853) + + r3 = torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=100 * DAY) # 0.9146853 # The fewer days have passed since the creation of the torrent, the higher its rank - assert torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=1 * DAY) == pytest.approx(0.9877126) + + r4 = torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=1 * DAY) # 0.9877126 + + assert r1 < r2 < r3 < r4 # If a title contains non-matching words missed in the query string it is not as good as the exact match - assert torrent_rank(query, 'Big Buck Bunny II') == 
pytest.approx(0.80381679) + + r5 = torrent_rank(query, 'Big Buck Bunny II') # 0.80381679 # The closer to the start of the string non-matching words are placed in the title, the worse is rank - assert torrent_rank(query, 'Big Buck Brown Bunny') == pytest.approx(0.75061099) - assert torrent_rank(query, 'Big Bad Buck Bunny') == pytest.approx(0.74242068) - assert torrent_rank(query, 'Boring Big Buck Bunny') == pytest.approx(0.73125) + + r6 = torrent_rank(query, 'Big Buck Brown Bunny') # 0.75061099 + r7 = torrent_rank(query, 'Big Bad Buck Bunny') # 0.74242068 + r8 = torrent_rank(query, 'Boring Big Buck Bunny') # 0.73125 + + assert r8 < r7 < r6 < r5 < r1 # The more non-matching words are in the title, the worse is rank - assert torrent_rank(query, 'Big Buck A Bunny') == pytest.approx(0.75061099) - assert torrent_rank(query, 'Big Buck A B Bunny') == pytest.approx(0.699335863) - assert torrent_rank(query, 'Big Buck A B C Bunny') == pytest.approx(0.6546181) + + r9 = torrent_rank(query, 'Big Buck A Bunny') # 0.75061099 + r10 = torrent_rank(query, 'Big Buck A B Bunny') # 0.699335863 + r11 = torrent_rank(query, 'Big Buck A B C Bunny') # 0.6546181 + + assert r11 < r10 < r9 < r1 # Non-matching words close to the beginning of the title give a bigger penalty - assert torrent_rank(query, 'Big A Buck Bunny') == pytest.approx(0.742420681) - assert torrent_rank(query, 'Big A B Buck Bunny') == pytest.approx(0.6852494577) - assert torrent_rank(query, 'Big A B C Buck Bunny') == pytest.approx(0.636253776) - assert torrent_rank(query, 'A Big Buck Bunny') == pytest.approx(0.73125) - assert torrent_rank(query, 'A B Big Buck Bunny') == pytest.approx(0.66645569) - assert torrent_rank(query, 'A B C Big Buck Bunny') == pytest.approx(0.6122093) + r12 = torrent_rank(query, 'Big A Buck Bunny') # 0.742420681 + r13 = torrent_rank(query, 'Big A B Buck Bunny') # 0.6852494577 + r14 = torrent_rank(query, 'Big A B C Buck Bunny') # 0.636253776 + + assert r14 < r13 < r12 < r1 + + r15 = torrent_rank(query, 
'A Big Buck Bunny') # 0.73125 + r16 = torrent_rank(query, 'A B Big Buck Bunny') # 0.66645569 + r17 = torrent_rank(query, 'A B C Big Buck Bunny') # 0.6122093 + + assert r17 < r16 < r15 < r1 + assert r15 < r12 and r16 < r13 and r17 < r14 # Wrong order of words in the title imposes a penalty to the rank - assert torrent_rank(query, 'Big Bunny Buck') == pytest.approx(0.7476923) + + r18 = torrent_rank(query, 'Big Bunny Buck') # 0.7476923 + + assert r18 < r1 # Missed query words imposes a really big penalty - assert torrent_rank(query, 'Big Buck') == pytest.approx(0.4725) + + r19 = torrent_rank(query, 'Big Buck') # 0.4725 + + assert r19 < 0.5 # The close the missed words to the beginning of the query, the worse - assert torrent_rank(query, 'Big Bunny') == pytest.approx(0.441818181) - assert torrent_rank(query, 'Buck Bunny') == pytest.approx(0.405) - - # The more seeders is still better, the more days from the check the less relevant the number of seeders is - assert torrent_rank(query, 'Buck Bunny', seeders=10, freshness=5 * DAY) == pytest.approx(0.44805194) - assert torrent_rank(query, 'Buck Bunny', seeders=100, freshness=5 * DAY) == pytest.approx(0.46821428) - assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=5 * DAY) == pytest.approx(0.4883766) - assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=10 * DAY) == pytest.approx(0.48306818) - assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=20 * DAY) == pytest.approx(0.47563636) - - # The exact match - assert torrent_rank('Sintel', 'Sintel') == pytest.approx(0.81) + + r20 = torrent_rank(query, 'Big Bunny') # 0.441818181 + r21 = torrent_rank(query, 'Buck Bunny') # 0.405 + + assert r21 < r20 < r19 + + # The more seeders is still better + + r22 = torrent_rank(query, 'Buck Bunny', seeders=10, freshness=5 * DAY) # 0.44805194 + r23 = torrent_rank(query, 'Buck Bunny', seeders=100, freshness=5 * DAY) # 0.46821428 + r24 = torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=5 * DAY) 
# 0.4883766 + + assert r21 < r22 < r23 < r24 + + # The more days from the check the less relevant the number of seeders is + + r25 = torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=10 * DAY) # 0.48306818 + r26 = torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=20 * DAY) # 0.47563636 + + assert r26 < r25 < r24 + + # The exact match has a good rank + r27 = torrent_rank('Sintel', 'Sintel') # 0.81 + assert r27 > 0.8 + # Non-matching words at the end of the title give slightly worse results - assert torrent_rank('Sintel', 'Sintel Part II') == pytest.approx(0.79553571) + r28 = torrent_rank('Sintel', 'Sintel Part II') # 0.79553571 + # Non-matching words at the beginning of the title are much worse - assert torrent_rank('Sintel', 'Part of Sintel') == pytest.approx(0.664925373) + r29 = torrent_rank('Sintel', 'Part of Sintel') # 0.664925373 + # Too many non-matching words give a bigger penalty - assert torrent_rank('Sintel', 'the.script.from.the.movie.Sintel.pdf') == pytest.approx(0.52105263) + r30 = torrent_rank('Sintel', 'the.script.from.the.movie.Sintel.pdf') # 0.52105263 + + assert r30 < r29 < r28 < r27 # Some more examples - assert torrent_rank("Internet's Own Boy", "Internet's Own Boy") == pytest.approx(0.81) - assert torrent_rank("Internet's Own Boy", "Internet's very Own Boy") == pytest.approx(0.75099337) - assert torrent_rank("Internet's Own Boy", "Internet's very special Boy person") == pytest.approx(0.4353166986) + + r31 = torrent_rank("Internet's Own Boy", "Internet's Own Boy") # 0.81 + r32 = torrent_rank("Internet's Own Boy", "Internet's very Own Boy") # 0.75099337 + r33 = torrent_rank("Internet's Own Boy", "Internet's very special Boy person") # 0.4353166986 + + assert r33 < r32 < r31 def test_title_rank(): From 2f73fe349730254d478d38da9084bc222b88404c Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 11 Oct 2022 07:55:12 +0200 Subject: [PATCH 15/32] Add a description for a complex sort_by expression --- 
.../components/metadata_store/db/store.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/tribler/core/components/metadata_store/db/store.py b/src/tribler/core/components/metadata_store/db/store.py index c273f518526..428ce604a44 100644 --- a/src/tribler/core/components/metadata_store/db/store.py +++ b/src/tribler/core/components/metadata_store/db/store.py @@ -736,6 +736,42 @@ def get_entries_query( if sort_by is None: if txt_filter: + # pylint: disable=W0105 + """ + The following call of `sort_by` produces an ORDER BY expression that looks like this: + + ORDER BY + case when "g"."metadata_type" = $CHANNEL_TORRENT then 1 + when "g"."metadata_type" = $COLLECTION_NODE then 2 + else 3 end, + + search_rank( + $QUERY_STRING, + g.title, + torrentstate.seeders + 0.1 * torrentstate.leechers, + $CURRENT_TIME - strftime('%s', g.torrent_date) + ) DESC, + + "torrentstate"."last_check" DESC, + + So, the channel torrents and channel folders are always on top if they are not filtered out. + Then regular torrents are selected in order of their relevance according to a search_rank() result. + If two torrents have the same search rank, they are ordered by the last time they were checked. + + The search_rank() function is called directly from the SQLite query, but is implemented in Python, + it is actually the torrent_rank() function from core/utilities/search_utils.py, wrapped with + keep_exception() to return possible exception from SQLite to Python. + + The search_rank() function receives the following arguments: + - the current query string (like "Big Buck Bunny"); + - the title of the current torrent; + - the number of seeders; + - the number of seconds since the torrent's creation time. + + There is no separate argument for the number of leechers, so it is just added to the number of seeders, + leechers are considered ten times less important than seeders. 
+ """ + pony_query = pony_query.sort_by( f""" (1 if g.metadata_type == {CHANNEL_TORRENT} else 2 if g.metadata_type == {COLLECTION_NODE} else 3), From 6b41095000b8da75f0a7a53eddcd0d7af59af7af Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 11 Oct 2022 08:43:33 +0200 Subject: [PATCH 16/32] Use named coefficients in calculate_rank function instead of a hardcoded values --- src/tribler/core/utilities/search_utils.py | 26 ++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index d7b4ba59f15..fbb8554b74e 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -91,6 +91,24 @@ def title_rank(query: str, title: str) -> float: return calculate_rank(query, title) +# These coefficients are found empirically. Their exact values are not very important for a relative ranking of results + +# The first word in a query is considered as a more important than the next one and so on, +# 5 means the 5th word in a query is twice as less important as the first one +POSITION_COEFF = 5 + +# Some big value for a penalty if a query word is totally missed from a torrent title +MISSED_WORD_PENALTY = 10 + +# If a torrent title contains some words at the very end that are not mentioned in a query, we add a very slight +# penalty for them. The *bigger* the REMAINDER_COEFF is, the *smaller* penalty we add for this excess words +REMAINDER_COEFF = 10 + +# The exact value of this coefficient is not important. It is used to convert total_error value to a rank value. +# The total_error value is some positive number. We want to have the resulted rank in range [0, 1]. +RANK_NORMALIZATION_COEFF = 10 + + def calculate_rank(query: List[str], title: List[str]) -> float: """ Calculate the similarity of the title to the query as a float value in range [0, 1]. 
@@ -105,7 +123,7 @@ def calculate_rank(query: List[str], title: List[str]) -> float: total_error = 0 for i, term in enumerate(query): # The first word is more important than the second word, and so on - term_weight = 5 / (5 + i) + term_weight = POSITION_COEFF / (POSITION_COEFF + i) found, skipped = find_term(term, title) if found: @@ -113,15 +131,15 @@ def calculate_rank(query: List[str], title: List[str]) -> float: total_error += skipped * term_weight else: # if the query word is not found in the title, add a big penalty for it - total_error += 10 * term_weight + total_error += MISSED_WORD_PENALTY * term_weight # a small penalty for excess words in the title that was not mentioned in the search phrase - remainder_weight = 1 / (10 + len(query)) + remainder_weight = 1 / (REMAINDER_COEFF + len(query)) remained_words_error = len(title) * remainder_weight total_error += remained_words_error # a search rank should be between 1 and 0 - return 10 / (10 + total_error) + return RANK_NORMALIZATION_COEFF / (RANK_NORMALIZATION_COEFF + total_error) def find_term(term: str, title: Deque[str]) -> Tuple[bool, int]: From fbcce1130a7d960664a6b0d2f11210b8e7972814 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 12 Oct 2022 16:39:09 +0200 Subject: [PATCH 17/32] Add a comment to the `calculate_rank` function and a description to the `find_term` function --- src/tribler/core/utilities/search_utils.py | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index fbb8554b74e..a593c6379ad 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -125,6 +125,10 @@ def calculate_rank(query: List[str], title: List[str]) -> float: # The first word is more important than the second word, and so on term_weight = POSITION_COEFF / (POSITION_COEFF + i) + # Read the description of the `find_term` function to understand what is going 
on. Basically, we are trying + # to find each query word in the title words, calculate the penalty if the query word is not found or if there + # are some title words before it, and then rotate the skipped title words to the end of the title. This way, + # the least penalty got a title that has query words in the proper order at the beginning of the title. found, skipped = find_term(term, title) if found: # if the query word is found in the title, add penalty for skipped words in title before it @@ -146,6 +150,70 @@ def find_term(term: str, title: Deque[str]) -> Tuple[bool, int]: """ Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title. + + This is a helper function to efficiently answer a question of how close a query string and a title string are, + taking into account the ordering of words in both strings. + + The `term` parameter is a word from a search string. It is called `term` and not `word` because it can also be + a stemmed version of the word if the comparison algorithm implemented in the top-level `torrent_rank` function + works with stemmed words. The ability to work with stemmed words was added to `torrent_rank` and then removed, + as it currently does not give significant benefits, but it can be added again in the future. + + The `title` parameter is a deque of words from the torrent title. It also can be a deque of stemmed words + if the `torrent_rank` function supports stemming. + + The `find_term` function returns the boolean value of whether the term was found in the title deque or not and + the number of the skipped leading terms in the `title` deque. Also, it modifies the `title` deque in place by + removing the first entrance of the found term and rotating all leading non-matching terms to the end of the deque. 
+ + An example: find_term('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that + the term 'A' was found in the `title` deque, and 2 is the number of skipped terms ('X', 'Y'). Also, it modifies + the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found term 'A' was removed, and + the leading non-matching terms ('X', 'Y') was moved to the end of the deque. + + Now some examples of how the function can be used. To use the function, you can call it one time for each word + from the query and see: + - how many query words are missed in the title; + - how many excess or out-of-place title words are found before each query word; + - and how many title words are not mentioned in the query. + + Example 1, query "A B C", title "A B C": + find_term("A", deque(["A", "B", "C"])) -> (found=True, skipped=0, rest=deque(["B", "C"])) + find_term("B", deque(["B", "C"])) -> (found=True, skipped=0, rest=deque(["C"])) + find_term("C", deque(["C"])) -> (found=True, skipped=0, rest=deque([])) + Conclusion: exact match. + + Example 2, query "A B C", title "A B C D": + find_term("A", deque(["A", "B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["B", "C", "D"])) + find_term("B", deque(["B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["C", "D"])) + find_term("C", deque(["C", "D"])) -> (found=True, skipped=0, rest=deque(["D"])) + Conclusion: minor penalty for one excess word in the title that is not in the query. + + Example 3, query "A B C", title "X Y A B C": + find_term("A", deque(["X", "Y", "A", "B", "C"])) -> (found=True, skipped=2, rest=deque(["B", "C", "X", "Y"])) + find_term("B", deque(["B", "C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["C", "X", "Y"])) + find_term("C", deque(["C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["X", "Y"])) + Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two + excess words in the title that are not in the query. 
+ + Example 4, query "A B C", title "A B X Y C": + find_term("A", deque(["A", "B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["B", "X", "Y", "C"])) + find_term("B", deque(["B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["X", "Y", "C"])) + find_term("C", deque(["X", "Y", "C"])) -> (found=True, skipped=2, rest=deque(["X", "Y"])) + Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two + excess words in the title that are not in the query. + + Example 5, query "A B C", title "A C B": + find_term("A", deque(["A", "C", "B"])) -> (found=True, skipped=0, rest=deque(["C", "B"])) + find_term("B", deque(["C", "B"])) -> (found=True, skipped=1, rest=deque(["C"])) + find_term("C", deque(["C"])) -> (found=True, skipped=0, rest=deque(["C"])) + Conclusion: average penalty for skipping one word in the middle of the title. + + Example 6, query "A B C", title "A C X": + find_term("A", deque(["A", "C", "X"])) -> (found=True, skipped=0, rest=deque(["C", "X"])) + find_term("B", deque(["C", "X"])) -> (found=False, skipped=0, rest=deque(["C", "X"])) + find_term("C", deque(["C", "X"])) -> (found=True, skipped=0, rest=deque(["X"])) + Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word. 
""" try: skipped = title.index(term) From cd9cd541932bd9e28333e7d9a58dc9deaaf7b754 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Fri, 21 Oct 2022 07:36:12 +0200 Subject: [PATCH 18/32] Highlight remote results --- src/tribler/gui/widgets/tablecontentdelegate.py | 14 ++++++++------ src/tribler/gui/widgets/tablecontentmodel.py | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/tribler/gui/widgets/tablecontentdelegate.py b/src/tribler/gui/widgets/tablecontentdelegate.py index 2979bd6a5d2..47553016cd5 100644 --- a/src/tribler/gui/widgets/tablecontentdelegate.py +++ b/src/tribler/gui/widgets/tablecontentdelegate.py @@ -35,7 +35,7 @@ ) from tribler.gui.utilities import format_votes, get_color, get_gui_setting, get_health, get_image_path, tr, \ get_objects_with_predicate -from tribler.gui.widgets.tablecontentmodel import Column +from tribler.gui.widgets.tablecontentmodel import Column, RemoteTableModel from tribler.gui.widgets.tableiconbuttons import DownloadIconButton PROGRESS_BAR_BACKGROUND = QColor("#444444") @@ -268,18 +268,20 @@ def split_rect_into_squares(r, buttons): yield QRect(x, y, w, h), button def paint(self, painter, option, index): - # Draw 'hover' state highlight for every cell of a row - if index.row() == self.hover_index.row(): + model: RemoteTableModel = index.model() + data_item = model.data_items[index.row()] + if index.row() == self.hover_index.row() or model.should_highlight_item(data_item): + # Draw 'hover' state highlight for every cell of a row option.state |= QStyle.State_MouseOver - if not self.paint_exact(painter, option, index): + if not self.paint_exact(painter, option, index, data_item): # Draw the rest of the columns super().paint(painter, option, index) - def paint_exact(self, painter, option, index): - data_item = index.model().data_items[index.row()] + def paint_exact(self, painter, option, index, data_item): for column, drawing_action in self.column_drawing_actions: if column in 
index.model().column_position and index.column() == index.model().column_position[column]: return drawing_action(painter, option, index, data_item) + return False def editorEvent(self, event, model, option, index): for control in self.controls: diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index 43160440791..29beec67be1 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -106,12 +106,15 @@ def __init__(self, parent=None): self.sort_desc = True self.saved_header_state = None self.saved_scroll_state = None + self.highlight_remote_results = True self.qt_object_destroyed = False self.group_by_name = False self.sort_by_rank = False self.text_filter = '' + self.highlight_remote_results = True + connect(self.destroyed, self.on_destroy) # Every remote query must be attributed to its specific model to avoid updating wrong models # on receiving a result. We achieve this by maintaining a set of in-flight remote queries. 
@@ -148,6 +151,9 @@ def reset(self): self.endResetModel() self.perform_query() + def should_highlight_item(self, data_item): + return self.highlight_remote_results and data_item.get('remote') + def sort(self, column_index, order): if not self.columns[column_index].sortable: return From 3565af4f34f72619160bf310a48ca8e00f59e030 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Fri, 21 Oct 2022 10:14:58 +0200 Subject: [PATCH 19/32] Stop highlighting remote results after a small period --- src/tribler/gui/widgets/tablecontentmodel.py | 37 +++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index 29beec67be1..e498e910725 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -1,11 +1,13 @@ import json import logging +import time import uuid +from collections import deque from dataclasses import dataclass, field from enum import Enum, auto from typing import Callable, Dict, List -from PyQt5.QtCore import QAbstractTableModel, QModelIndex, QRectF, QSize, Qt, pyqtSignal +from PyQt5.QtCore import QAbstractTableModel, QModelIndex, QRectF, QSize, QTimerEvent, Qt, pyqtSignal from tribler.core.components.metadata_store.db.orm_bindings.channel_node import NEW from tribler.core.components.metadata_store.db.serialization import CHANNEL_TORRENT, COLLECTION_NODE, REGULAR_TORRENT, \ @@ -19,6 +21,8 @@ from tribler.gui.utilities import connect, format_size, format_votes, get_votes_rating_description, pretty_date, tr EXPANDING = 0 +HIGHLIGHTING_PERIOD_SECONDS = 1.0 +HIGHLIGHTING_TIMER_INTERVAL_MILLISECONDS = 100 class Column(Enum): @@ -106,14 +110,15 @@ def __init__(self, parent=None): self.sort_desc = True self.saved_header_state = None self.saved_scroll_state = None - self.highlight_remote_results = True self.qt_object_destroyed = False self.group_by_name = False self.sort_by_rank = False 
self.text_filter = '' - self.highlight_remote_results = True + self.highlight_remote_results = False + self.highlighted_items = deque() + self.highlight_timer = self.startTimer(HIGHLIGHTING_TIMER_INTERVAL_MILLISECONDS) connect(self.destroyed, self.on_destroy) # Every remote query must be attributed to its specific model to avoid updating wrong models @@ -152,7 +157,23 @@ def reset(self): self.perform_query() def should_highlight_item(self, data_item): - return self.highlight_remote_results and data_item.get('remote') + return (self.highlight_remote_results and data_item.get('remote') + and data_item['item_added_at'] > time.time() - HIGHLIGHTING_PERIOD_SECONDS) + + def timerEvent(self, event: QTimerEvent) -> None: + if self.highlight_remote_results and event.timerId() == self.highlight_timer: + self.stop_highlighting_old_items() + + def stop_highlighting_old_items(self): + now = time.time() + then = now - HIGHLIGHTING_PERIOD_SECONDS + last_column_offset = len(self.columns_dict) - 1 + while self.highlighted_items and self.highlighted_items[0]['item_added_at'] < then: + item = self.highlighted_items.popleft() + uid = get_item_uid(item) + row = self.item_uid_map.get(uid) + if row is not None: + self.dataChanged.emit(self.index(row, 0), self.index(row, last_column_offset)) def sort(self, column_index, order): if not self.columns[column_index].sortable: @@ -183,9 +204,13 @@ def add_items(self, new_items, on_top=False, remote=False): insert_index = 0 if on_top else len(self.data_items) unique_new_items = [] name_mapping = {item['name']: item for item in self.data_items} if self.group_by_name else {} + now = time.time() for item in new_items: if remote: item['remote'] = True + item['item_added_at'] = now + if self.highlight_remote_results: + self.highlighted_items.append(item) if self.sort_by_rank: if 'rank' not in item: item['rank'] = item_rank(self.text_filter, item) @@ -601,6 +626,7 @@ def __init__(self, original_query, **kwargs): super().__init__(channel_info={"name": 
title}, **kwargs) self.remote_results_received = False self.postponed_remote_results = [] + self.highlight_remote_results = True self.group_by_name = True self.sort_by_rank = True @@ -644,7 +670,8 @@ def show_remote_results(self): remote_items = list(self.remote_results.values()) self.remote_results.clear() self.remote_results_received = False - self.add_items(remote_items, remote=True) + if remote_items: + self.add_items(remote_items, remote=True) class PopularTorrentsModel(ChannelContentModel): From bc4bc218ff258d4fffe040e21db637e8cebb0a75 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 19 Jul 2022 16:17:40 +0200 Subject: [PATCH 20/32] Add TARGET_PEERS_NUMBER constant that is used as a default value in RemovePeers strategy --- .../components/gigachannel/community/sync_strategy.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/tribler/core/components/gigachannel/community/sync_strategy.py b/src/tribler/core/components/gigachannel/community/sync_strategy.py index 01caab05ec1..c6fc1b3a300 100644 --- a/src/tribler/core/components/gigachannel/community/sync_strategy.py +++ b/src/tribler/core/components/gigachannel/community/sync_strategy.py @@ -3,6 +3,9 @@ from ipv8.peerdiscovery.discovery import DiscoveryStrategy +TARGET_PEERS_NUMBER = 20 + + class RemovePeers(DiscoveryStrategy): """ Synchronization strategy for remote query community. @@ -10,8 +13,12 @@ class RemovePeers(DiscoveryStrategy): Remove a random peer, if we have enough peers to walk to. 
""" + def __init__(self, overlay, target_peers_number=TARGET_PEERS_NUMBER): + super().__init__(overlay) + self.target_peers_number = target_peers_number + def take_step(self): with self.walk_lock: peers = self.overlay.get_peers() - if peers and len(peers) > 20: + if peers and len(peers) > self.target_peers_number: self.overlay.network.remove_peer(choice(peers)) From f8914f83442c3b95ea8f4d63b2badde0bbb5675b Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Mon, 24 Oct 2022 04:37:17 +0200 Subject: [PATCH 21/32] Add a comment to the `max_query_peers` option --- .../metadata_store/remote_query_community/settings.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/tribler/core/components/metadata_store/remote_query_community/settings.py b/src/tribler/core/components/metadata_store/remote_query_community/settings.py index 66ee1737047..9fcd21bdbf5 100644 --- a/src/tribler/core/components/metadata_store/remote_query_community/settings.py +++ b/src/tribler/core/components/metadata_store/remote_query_community/settings.py @@ -5,7 +5,16 @@ class RemoteQueryCommunitySettings(TriblerConfigSection): minimal_blob_size: int = 200 maximum_payload_size: int = 1300 max_entries: int = maximum_payload_size // minimal_blob_size + + # The next option is currently used by GigaChannelCommunity only. We probably should move it to the + # GigaChannelCommunity settings or to a dedicated search-related section. The value of the option is corresponding + # with the TARGET_PEERS_NUMBER of src/tribler/core/components/gigachannel/community/sync_strategy.py, that is, to + # the number of peers that GigaChannelCommunity will have after a long run (initially, the number of peers in + # GigaChannelCommunity can rise up to several hundred due to DiscoveryBooster). The number of parallel remote + # requests should be not too small (to have various results from remote peers) and not too big (to avoid flooding + # the network with exceedingly high number of queries). 
TARGET_PEERS_NUMBER looks like a good middle ground here. max_query_peers: int = 20 + max_response_size: int = 100 # Max number of entries returned by SQL query max_channel_query_back: int = 4 # Max number of entries to query back on receiving an unknown channel push_updates_back_enabled = True From cd8cb1c9f4076ac519e818d7bfc205cb2ceb0391 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Mon, 31 Oct 2022 23:51:18 +0100 Subject: [PATCH 22/32] Rename: term -> word --- src/tribler/core/utilities/search_utils.py | 71 +++++++++++----------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index a593c6379ad..416ef8b25d7 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -121,21 +121,21 @@ def calculate_rank(query: List[str], title: List[str]) -> float: title = deque(title) total_error = 0 - for i, term in enumerate(query): + for i, word in enumerate(query): # The first word is more important than the second word, and so on - term_weight = POSITION_COEFF / (POSITION_COEFF + i) + word_weight = POSITION_COEFF / (POSITION_COEFF + i) - # Read the description of the `find_term` function to understand what is going on. Basically, we are trying + # Read the description of the `find_word` function to understand what is going on. Basically, we are trying # to find each query word in the title words, calculate the penalty if the query word is not found or if there # are some title words before it, and then rotate the skipped title words to the end of the title. This way, # the least penalty got a title that has query words in the proper order at the beginning of the title. 
- found, skipped = find_term(term, title) + found, skipped = find_word(word, title) if found: # if the query word is found in the title, add penalty for skipped words in title before it - total_error += skipped * term_weight + total_error += skipped * word_weight else: # if the query word is not found in the title, add a big penalty for it - total_error += MISSED_WORD_PENALTY * term_weight + total_error += MISSED_WORD_PENALTY * word_weight # a small penalty for excess words in the title that was not mentioned in the search phrase remainder_weight = 1 / (REMAINDER_COEFF + len(query)) @@ -146,7 +146,7 @@ def calculate_rank(query: List[str], title: List[str]) -> float: return RANK_NORMALIZATION_COEFF / (RANK_NORMALIZATION_COEFF + total_error) -def find_term(term: str, title: Deque[str]) -> Tuple[bool, int]: +def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]: """ Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title. @@ -154,22 +154,19 @@ def find_term(term: str, title: Deque[str]) -> Tuple[bool, int]: This is a helper function to efficiently answer a question of how close a query string and a title string are, taking into account the ordering of words in both strings. - The `term` parameter is a word from a search string. It is called `term` and not `word` because it can also be - a stemmed version of the word if the comparison algorithm implemented in the top-level `torrent_rank` function - works with stemmed words. The ability to work with stemmed words was added to `torrent_rank` and then removed, - as it currently does not give significant benefits, but it can be added again in the future. + The `word` parameter is a word from a search string. The `title` parameter is a deque of words from the torrent title. It also can be a deque of stemmed words if the `torrent_rank` function supports stemming. 
- The `find_term` function returns the boolean value of whether the term was found in the title deque or not and - the number of the skipped leading terms in the `title` deque. Also, it modifies the `title` deque in place by - removing the first entrance of the found term and rotating all leading non-matching terms to the end of the deque. + The `find_word` function returns the boolean value of whether the word was found in the title deque or not and + the number of the skipped leading words in the `title` deque. Also, it modifies the `title` deque in place by + removing the first entrance of the found word and rotating all leading non-matching words to the end of the deque. - An example: find_term('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that - the term 'A' was found in the `title` deque, and 2 is the number of skipped terms ('X', 'Y'). Also, it modifies - the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found term 'A' was removed, and - the leading non-matching terms ('X', 'Y') was moved to the end of the deque. + An example: find_word('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that + the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies + the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and + the leading non-matching words ('X', 'Y') was moved to the end of the deque. Now some examples of how the function can be used. To use the function, you can call it one time for each word from the query and see: @@ -178,45 +175,45 @@ def find_term(term: str, title: Deque[str]) -> Tuple[bool, int]: - and how many title words are not mentioned in the query. 
Example 1, query "A B C", title "A B C": - find_term("A", deque(["A", "B", "C"])) -> (found=True, skipped=0, rest=deque(["B", "C"])) - find_term("B", deque(["B", "C"])) -> (found=True, skipped=0, rest=deque(["C"])) - find_term("C", deque(["C"])) -> (found=True, skipped=0, rest=deque([])) + find_word("A", deque(["A", "B", "C"])) -> (found=True, skipped=0, rest=deque(["B", "C"])) + find_word("B", deque(["B", "C"])) -> (found=True, skipped=0, rest=deque(["C"])) + find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque([])) Conclusion: exact match. Example 2, query "A B C", title "A B C D": - find_term("A", deque(["A", "B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["B", "C", "D"])) - find_term("B", deque(["B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["C", "D"])) - find_term("C", deque(["C", "D"])) -> (found=True, skipped=0, rest=deque(["D"])) + find_word("A", deque(["A", "B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["B", "C", "D"])) + find_word("B", deque(["B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["C", "D"])) + find_word("C", deque(["C", "D"])) -> (found=True, skipped=0, rest=deque(["D"])) Conclusion: minor penalty for one excess word in the title that is not in the query. 
Example 3, query "A B C", title "X Y A B C": - find_term("A", deque(["X", "Y", "A", "B", "C"])) -> (found=True, skipped=2, rest=deque(["B", "C", "X", "Y"])) - find_term("B", deque(["B", "C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["C", "X", "Y"])) - find_term("C", deque(["C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["X", "Y"])) + find_word("A", deque(["X", "Y", "A", "B", "C"])) -> (found=True, skipped=2, rest=deque(["B", "C", "X", "Y"])) + find_word("B", deque(["B", "C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["C", "X", "Y"])) + find_word("C", deque(["C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["X", "Y"])) Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two excess words in the title that are not in the query. Example 4, query "A B C", title "A B X Y C": - find_term("A", deque(["A", "B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["B", "X", "Y", "C"])) - find_term("B", deque(["B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["X", "Y", "C"])) - find_term("C", deque(["X", "Y", "C"])) -> (found=True, skipped=2, rest=deque(["X", "Y"])) + find_word("A", deque(["A", "B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["B", "X", "Y", "C"])) + find_word("B", deque(["B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["X", "Y", "C"])) + find_word("C", deque(["X", "Y", "C"])) -> (found=True, skipped=2, rest=deque(["X", "Y"])) Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two excess words in the title that are not in the query. 
Example 5, query "A B C", title "A C B": - find_term("A", deque(["A", "C", "B"])) -> (found=True, skipped=0, rest=deque(["C", "B"])) - find_term("B", deque(["C", "B"])) -> (found=True, skipped=1, rest=deque(["C"])) - find_term("C", deque(["C"])) -> (found=True, skipped=0, rest=deque(["C"])) + find_word("A", deque(["A", "C", "B"])) -> (found=True, skipped=0, rest=deque(["C", "B"])) + find_word("B", deque(["C", "B"])) -> (found=True, skipped=1, rest=deque(["C"])) + find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque(["C"])) Conclusion: average penalty for skipping one word in the middle of the title. Example 6, query "A B C", title "A C X": - find_term("A", deque(["A", "C", "X"])) -> (found=True, skipped=0, rest=deque(["C", "X"])) - find_term("B", deque(["C", "X"])) -> (found=False, skipped=0, rest=deque(["C", "X"])) - find_term("C", deque(["C", "X"])) -> (found=True, skipped=0, rest=deque(["X"])) + find_word("A", deque(["A", "C", "X"])) -> (found=True, skipped=0, rest=deque(["C", "X"])) + find_word("B", deque(["C", "X"])) -> (found=False, skipped=0, rest=deque(["C", "X"])) + find_word("C", deque(["C", "X"])) -> (found=True, skipped=0, rest=deque(["X"])) Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word. 
""" try: - skipped = title.index(term) + skipped = title.index(word) except ValueError: return False, 0 From 49e5f64f2e6b42b399c57205d03f0853ffdb4228 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 00:52:03 +0100 Subject: [PATCH 23/32] Docstrings added; leechers are passed as a separate parameter --- .../components/metadata_store/db/store.py | 11 ++- src/tribler/core/utilities/search_utils.py | 81 +++++++++++++++---- 2 files changed, 70 insertions(+), 22 deletions(-) diff --git a/src/tribler/core/components/metadata_store/db/store.py b/src/tribler/core/components/metadata_store/db/store.py index 428ce604a44..78089c42a55 100644 --- a/src/tribler/core/components/metadata_store/db/store.py +++ b/src/tribler/core/components/metadata_store/db/store.py @@ -185,7 +185,7 @@ def on_connect(_, connection): cursor.execute("PRAGMA synchronous = 0") sqlite_rank = keep_exception(torrent_rank) - connection.create_function('search_rank', 4, sqlite_rank) + connection.create_function('search_rank', 5, sqlite_rank) # pylint: enable=unused-variable @@ -748,7 +748,8 @@ def get_entries_query( search_rank( $QUERY_STRING, g.title, - torrentstate.seeders + 0.1 * torrentstate.leechers, + torrentstate.seeders, + torrentstate.leechers, $CURRENT_TIME - strftime('%s', g.torrent_date) ) DESC, @@ -766,17 +767,15 @@ def get_entries_query( - the current query string (like "Big Buck Bunny"); - the title of the current torrent; - the number of seeders; + - the number of leechers; - the number of seconds since the torrent's creation time. - - There is no separate argument for the number of leechers, so it is just added to the number of seeders, - leechers are considered ten times less important than seeders. 
""" pony_query = pony_query.sort_by( f""" (1 if g.metadata_type == {CHANNEL_TORRENT} else 2 if g.metadata_type == {COLLECTION_NODE} else 3), raw_sql('''search_rank( - $txt_filter, g.title, torrentstate.seeders + 0.1 * torrentstate.leechers, + $txt_filter, g.title, torrentstate.seeders, torrentstate.leechers, $int(time()) - strftime('%s', g.torrent_date) ) DESC'''), desc(g.health.last_check) # just to trigger the TorrentState table inclusion into the left join diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 416ef8b25d7..96b9a3eba9d 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -34,49 +34,87 @@ def filter_keywords(keywords): return [kw for kw in keywords if len(kw) > 0 and kw not in DIALOG_STOPWORDS] -def item_rank(query: str, item: dict): +def item_rank(query: str, item: dict) -> float: + """ + Calculates the torrent rank for item received from remote query. Returns the torrent rank value in range [0, 1]. + + :param query: a user-defined query string + :param item: a dict with torrent info. + Should include key `name`, can include `num_seeders`, `num_leechers`, `updated` + :return: the torrent rank value in range [0, 1] + """ + title = item['name'] seeders = item.get('num_seeders', 0) leechers = item.get('num_leechers', 0) freshness = time.time() - item.get('updated', 0) - return torrent_rank(query, title, seeders + leechers * 0.1, freshness) + return torrent_rank(query, title, seeders, leechers, freshness) -def torrent_rank(query: str, title: str, seeders: int = 0, freshness: Optional[float] = 0) -> float: +def torrent_rank(query: str, title: str, seeders: int = 0, leechers: int = 0, freshness: Optional[float] = 0) -> float: """ Calculates search rank for a torrent. 
+ + :param query: a user-defined query string + :param title: a torrent name + :param seeders: the number of seeders + :param leechers: the number of leechers + :param freshness: the number of seconds since the torrent creation. + Zero or negative value means the torrent creation date is unknown. + :return: the torrent rank value in range [0, 1] + Takes into account: - similarity of the title to the query string; - the reported number of seeders; - how long ago the torrent file was created. """ - freshness = max(0, freshness or 0) tr = title_rank(query or '', title or '') - sr = (seeders_rank(seeders or 0) + 9) / 10 # range [0.9, 1] + sr = (seeders_rank(seeders or 0, leechers or 0) + 9) / 10 # range [0.9, 1] fr = (freshness_rank(freshness) + 9) / 10 # range [0.9, 1] result = tr * sr * fr + # uncomment the next line to debug the function inside an SQL query: # print(f'*** {result} : {seeders}/{freshness} ({freshness / SECONDS_IN_DAY} days)/{title} | {query}') + return result -def seeders_rank(seeders: float) -> float: +LEECHERS_COEFF = 0.1 # leechers are considered ten times less important than seeders. + + +def seeders_rank(seeders: int, leechers: int = 0) -> float: """ - Calculates rank based on the number of seeders. The result is normalized to the range [0, 1] + Calculates rank based on the number of torrent's seeders and leechers. 
The result is normalized to the range [0, 1] + + :param seeders: the number of seeders for the torrent + :param leechers: the number of leechers for the torrent + :return: the torrent rank based on seeders and leechers, normalized to the range [0, 1] """ - return seeders / (100 + seeders) # inf seeders -> 1; 100 seeders -> 0.5; 10 seeders -> approx 0.1 + sl = seeders + leechers * LEECHERS_COEFF + return sl / (100 + sl) # inf seeders -> 1; 100 seeders -> 0.5; 10 seeders -> approx 0.1 -def freshness_rank(freshness: Optional[float] = 0): +def freshness_rank(freshness: Optional[float] = 0) -> float: """ - Calculates rank based on the torrent freshness. The result is normalized to the range [0, 1] + Calculates a rank value based on the torrent freshness. The result is normalized to the range [0, 1] + + :param freshness: number of seconds since the torrent creation. + Zero or negative values means the actual torrent creation date is unknown. + :return: the torrent rank based on freshness. The result is normalized to the range [0, 1] + + Example results: + 0 seconds since torrent creation -> the actual torrent creation date is unknown, freshness rank 0 + 1 second since torrent creation -> freshness rank 0.999 + 1 day since torrent creation -> freshness rank 0.967 + 30 days since torrent creation -> freshness rank 0.5 + 1 year since torrent creation -> freshness rank 0.0759 """ + freshness = max(0, freshness or 0) if not freshness: return 0 days = (freshness or 0) / SECONDS_IN_DAY - - return 1 / (1 + days / 30) # 2x drop per 30 days + return 1 / (1 + days / 30) word_re = re.compile(r'\w+', re.UNICODE) @@ -84,7 +122,11 @@ def freshness_rank(freshness: Optional[float] = 0): def title_rank(query: str, title: str) -> float: """ - Calculate the similarity of the title string to a query string, with or without stemming. 
+ Calculate the similarity of the title string to a query string as a float value in range [0, 1] + + :param query: a user-defined query string + :param title: a torrent name + :return: the similarity of the title string to a query string as a float value in range [0, 1] """ query = word_re.findall(query.lower()) title = word_re.findall(title.lower()) @@ -111,7 +153,11 @@ def title_rank(query: str, title: str) -> float: def calculate_rank(query: List[str], title: List[str]) -> float: """ - Calculate the similarity of the title to the query as a float value in range [0, 1]. + Calculates the similarity of the title to the query as a float value in range [0, 1]. + + :param query: list of query words + :param title: list of title words + :return: the similarity of the title to the query as a float value in range [0, 1] """ if not query: return 1.0 @@ -148,8 +194,11 @@ def calculate_rank(query: List[str], title: List[str]) -> float: def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]: """ - Finds the query word in the title. - Returns whether it was found or not and the number of skipped words in the title. + Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title. + + :param word: a word from the query + :param title: a list of words in the title + :return: a two-elements tuple, whether the word was found in the title and the number of skipped words This is a helper function to efficiently answer a question of how close a query string and a title string are, taking into account the ordering of words in both strings. 
From 65297ba253427d0743a911679e22784348c43643 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 08:37:24 +0100 Subject: [PATCH 24/32] Make test errors easier to understand --- src/tribler/core/tests/test_search_utils.py | 127 ++++++++------------ 1 file changed, 50 insertions(+), 77 deletions(-) diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index c0e8055a60e..bb7e9da0b85 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -25,117 +25,90 @@ def test_filter_keywords(): def test_torrent_rank(): query = 'Big Buck Bunny' - # The exact match ranked as pretty high - - r1 = torrent_rank(query, 'Big Buck Bunny') # 0.81 - assert r1 > 0.8 + title_match = torrent_rank(query, 'Big Buck Bunny') # 0.81 + assert title_match > 0.8 # Seeders are good for the rank - - r2 = torrent_rank(query, 'Big Buck Bunny', seeders=100, freshness=100 * DAY) # 0.876923 - # The more seeders the better - - r3 = torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=100 * DAY) # 0.9146853 - # The fewer days have passed since the creation of the torrent, the higher its rank - - r4 = torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=1 * DAY) # 0.9877126 - - assert r1 < r2 < r3 < r4 + assert torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=1 * DAY) > \ + torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=100 * DAY) > \ + torrent_rank(query, 'Big Buck Bunny', seeders=100, freshness=100 * DAY) > \ + title_match # If a title contains non-matching words missed in the query string it is not as good as the exact match - - r5 = torrent_rank(query, 'Big Buck Bunny II') # 0.80381679 - # The closer to the start of the string non-matching words are placed in the title, the worse is rank - - r6 = torrent_rank(query, 'Big Buck Brown Bunny') # 0.75061099 - r7 = torrent_rank(query, 'Big Bad Buck Bunny') # 0.74242068 - r8 = 
torrent_rank(query, 'Boring Big Buck Bunny') # 0.73125 - - assert r8 < r7 < r6 < r5 < r1 + assert title_match > \ + torrent_rank(query, 'Big Buck Bunny II') > \ + torrent_rank(query, 'Big Buck Brown Bunny') > \ + torrent_rank(query, 'Big Bad Buck Bunny') > \ + torrent_rank(query, 'Boring Big Buck Bunny') # The more non-matching words are in the title, the worse is rank - - r9 = torrent_rank(query, 'Big Buck A Bunny') # 0.75061099 - r10 = torrent_rank(query, 'Big Buck A B Bunny') # 0.699335863 - r11 = torrent_rank(query, 'Big Buck A B C Bunny') # 0.6546181 - - assert r11 < r10 < r9 < r1 + assert title_match > \ + torrent_rank(query, 'Big Buck A Bunny') > \ + torrent_rank(query, 'Big Buck A B Bunny') > \ + torrent_rank(query, 'Big Buck A B C Bunny') # Non-matching words close to the beginning of the title give a bigger penalty + assert title_match > \ + torrent_rank(query, 'Big A Buck Bunny') > \ + torrent_rank(query, 'Big A B Buck Bunny') > \ + torrent_rank(query, 'Big A B C Buck Bunny') - r12 = torrent_rank(query, 'Big A Buck Bunny') # 0.742420681 - r13 = torrent_rank(query, 'Big A B Buck Bunny') # 0.6852494577 - r14 = torrent_rank(query, 'Big A B C Buck Bunny') # 0.636253776 + assert title_match > \ + torrent_rank(query, 'A Big Buck Bunny') > \ + torrent_rank(query, 'A B Big Buck Bunny') > \ + torrent_rank(query, 'A B C Big Buck Bunny') - assert r14 < r13 < r12 < r1 + assert torrent_rank(query, 'Big A Buck Bunny') > \ + torrent_rank(query, 'A Big Buck Bunny') - r15 = torrent_rank(query, 'A Big Buck Bunny') # 0.73125 - r16 = torrent_rank(query, 'A B Big Buck Bunny') # 0.66645569 - r17 = torrent_rank(query, 'A B C Big Buck Bunny') # 0.6122093 + assert torrent_rank(query, 'Big A B Buck Bunny') > \ + torrent_rank(query, 'A B Big Buck Bunny') - assert r17 < r16 < r15 < r1 - assert r15 < r12 and r16 < r13 and r17 < r14 + assert torrent_rank(query, 'Big A B C Buck Bunny') > \ + torrent_rank(query, 'A B C Big Buck Bunny') # Wrong order of words in the title imposes a 
penalty to the rank - - r18 = torrent_rank(query, 'Big Bunny Buck') # 0.7476923 - - assert r18 < r1 + assert title_match > \ + torrent_rank(query, 'Big Bunny Buck') # Missed query words imposes a really big penalty - - r19 = torrent_rank(query, 'Big Buck') # 0.4725 - - assert r19 < 0.5 + assert torrent_rank(query, 'Big Buck') < 0.5 # The close the missed words to the beginning of the query, the worse - - r20 = torrent_rank(query, 'Big Bunny') # 0.441818181 - r21 = torrent_rank(query, 'Buck Bunny') # 0.405 - - assert r21 < r20 < r19 + assert torrent_rank(query, 'Big Buck') > \ + torrent_rank(query, 'Big Bunny') > \ + torrent_rank(query, 'Buck Bunny') # The more seeders is still better - - r22 = torrent_rank(query, 'Buck Bunny', seeders=10, freshness=5 * DAY) # 0.44805194 - r23 = torrent_rank(query, 'Buck Bunny', seeders=100, freshness=5 * DAY) # 0.46821428 - r24 = torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=5 * DAY) # 0.4883766 - - assert r21 < r22 < r23 < r24 + assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=5 * DAY) > \ + torrent_rank(query, 'Buck Bunny', seeders=100, freshness=5 * DAY) > \ + torrent_rank(query, 'Buck Bunny', seeders=10, freshness=5 * DAY) > \ + torrent_rank(query, 'Buck Bunny') # The more days from the check the less relevant the number of seeders is - - r25 = torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=10 * DAY) # 0.48306818 - r26 = torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=20 * DAY) # 0.47563636 - - assert r26 < r25 < r24 + assert torrent_rank(query, 'Buck Bunny', freshness=5 * DAY) > \ + torrent_rank(query, 'Buck Bunny', freshness=10 * DAY) > \ + torrent_rank(query, 'Buck Bunny', freshness=20 * DAY) # The exact match has a good rank - r27 = torrent_rank('Sintel', 'Sintel') # 0.81 - assert r27 > 0.8 + assert torrent_rank('Sintel', 'Sintel') > 0.8 # Non-matching words at the end of the title give slightly worse results - r28 = torrent_rank('Sintel', 'Sintel Part II') # 0.79553571 - # 
Non-matching words at the beginning of the title are much worse - r29 = torrent_rank('Sintel', 'Part of Sintel') # 0.664925373 - # Too many non-matching words give a bigger penalty - r30 = torrent_rank('Sintel', 'the.script.from.the.movie.Sintel.pdf') # 0.52105263 - - assert r30 < r29 < r28 < r27 + assert torrent_rank('Sintel', 'Sintel') > \ + torrent_rank('Sintel', 'Sintel Part II') > \ + torrent_rank('Sintel', 'Part of Sintel') > \ + torrent_rank('Sintel', 'the.script.from.the.movie.Sintel.pdf') # Some more examples - - r31 = torrent_rank("Internet's Own Boy", "Internet's Own Boy") # 0.81 - r32 = torrent_rank("Internet's Own Boy", "Internet's very Own Boy") # 0.75099337 - r33 = torrent_rank("Internet's Own Boy", "Internet's very special Boy person") # 0.4353166986 - - assert r33 < r32 < r31 + assert torrent_rank("Internet's Own Boy", "Internet's Own Boy") > \ + torrent_rank("Internet's Own Boy", "Internet's very Own Boy") > \ + torrent_rank("Internet's Own Boy", "Internet's very special Boy person") def test_title_rank(): From 70e071dc58a332dbc802b8822a0b232e85954ac4 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 09:42:02 +0100 Subject: [PATCH 25/32] Move examples for `find_word` function from the docstring to a unit test --- src/tribler/core/tests/test_search_utils.py | 57 ++++++++++++++++++- src/tribler/core/utilities/search_utils.py | 63 +++------------------ 2 files changed, 62 insertions(+), 58 deletions(-) diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index bb7e9da0b85..e58f6649cb8 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -1,7 +1,9 @@ +from collections import deque + import pytest -from tribler.core.utilities.search_utils import filter_keywords, item_rank, split_into_keywords, torrent_rank, \ - title_rank +from tribler.core.utilities.search_utils import filter_keywords, find_word, item_rank, 
split_into_keywords,\ + torrent_rank, title_rank DAY = 60 * 60 * 24 @@ -120,3 +122,54 @@ def test_title_rank(): def test_item_rank(): item = dict(name="abc", num_seeders=10, num_leechers=20) assert item_rank("abc", item) == pytest.approx(0.81978445) + + +def test_find_word(): + # To use the find_word function, you can call it one time for each word from the query and see: + # - how many query words are missed in the title; + # - how many excess or out-of-place title words are found before each query word; + # - and how many title words are not mentioned in the query. + + # Example 1, query "A B C", title "A B C" + title = deque(["A", "B", "C"]) + assert find_word("A", title) == (True, 0) and title == deque(["B", "C"]) + assert find_word("B", title) == (True, 0) and title == deque(["C"]) + assert find_word("C", title) == (True, 0) and title == deque([]) + # Conclusion: exact match. + + # Example 2, query "A B C", title "A B C D" + title = deque(["A", "B", "C", "D"]) + assert find_word("A", title) == (True, 0) and title == deque(["B", "C", "D"]) + assert find_word("B", title) == (True, 0) and title == deque(["C", "D"]) + assert find_word("C", title) == (True, 0) and title == deque(["D"]) + # Conclusion: minor penalty for one excess word in the title that is not in the query. + + # Example 3, query "A B C", title "X Y A B C" + title = deque(["X", "Y", "A", "B", "C"]) + assert find_word("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"]) + assert find_word("B", title) == (True, 0) and title == deque(["C", "X", "Y"]) + assert find_word("C", title) == (True, 0) and title == deque(["X", "Y"]) + # Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two + # excess words in the title that are not in the query. 
+ + # Example 4, query "A B C", title "A B X Y C" + title = deque(["A", "B", "X", "Y", "C"]) + assert find_word("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"]) + assert find_word("B", title) == (True, 0) and title == deque(["X", "Y", "C"]) + assert find_word("C", title) == (True, 2) and title == deque(["X", "Y"]) + # Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two + # excess words in the title that are not in the query. + + # Example 5, query "A B C", title "A C B" + title = deque(["A", "C", "B"]) + assert find_word("A", title) == (True, 0) and title == deque(["C", "B"]) + assert find_word("B", title) == (True, 1) and title == deque(["C"]) + assert find_word("C", title) == (True, 0) and title == deque([]) + # Conclusion: average penalty for skipping one word in the middle of the title. + + # Example 6, query "A B C", title "A C X" + title = deque(["A", "C", "X"]) + assert find_word("A", title) == (True, 0) and title == deque(["C", "X"]) + assert find_word("B", title) == (False, 0) and title == deque(["C", "X"]) + assert find_word("C", title) == (True, 0) and title == deque(["X"]) + # Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word. diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 96b9a3eba9d..2a91cdd9f43 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -196,73 +196,24 @@ def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]: """ Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title. 
- :param word: a word from the query - :param title: a list of words in the title + :param word: a word from the user-defined query string + :param title: a deque of words in the title :return: a two-elements tuple, whether the word was found in the title and the number of skipped words This is a helper function to efficiently answer a question of how close a query string and a title string are, taking into account the ordering of words in both strings. - The `word` parameter is a word from a search string. - - The `title` parameter is a deque of words from the torrent title. It also can be a deque of stemmed words - if the `torrent_rank` function supports stemming. - - The `find_word` function returns the boolean value of whether the word was found in the title deque or not and - the number of the skipped leading words in the `title` deque. Also, it modifies the `title` deque in place by - removing the first entrance of the found word and rotating all leading non-matching words to the end of the deque. + For efficiency reasons, the function modifies the `title` deque in place by removing the first entrance + of the found word and rotating all leading non-matching words to the end of the deque. It allows to efficiently + perform multiple calls of the `find_word` function for subsequent words from the same query string. An example: find_word('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and - the leading non-matching words ('X', 'Y') was moved to the end of the deque. - - Now some examples of how the function can be used. 
To use the function, you can call it one time for each word - from the query and see: - - how many query words are missed in the title; - - how many excess or out-of-place title words are found before each query word; - - and how many title words are not mentioned in the query. - - Example 1, query "A B C", title "A B C": - find_word("A", deque(["A", "B", "C"])) -> (found=True, skipped=0, rest=deque(["B", "C"])) - find_word("B", deque(["B", "C"])) -> (found=True, skipped=0, rest=deque(["C"])) - find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque([])) - Conclusion: exact match. - - Example 2, query "A B C", title "A B C D": - find_word("A", deque(["A", "B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["B", "C", "D"])) - find_word("B", deque(["B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["C", "D"])) - find_word("C", deque(["C", "D"])) -> (found=True, skipped=0, rest=deque(["D"])) - Conclusion: minor penalty for one excess word in the title that is not in the query. - - Example 3, query "A B C", title "X Y A B C": - find_word("A", deque(["X", "Y", "A", "B", "C"])) -> (found=True, skipped=2, rest=deque(["B", "C", "X", "Y"])) - find_word("B", deque(["B", "C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["C", "X", "Y"])) - find_word("C", deque(["C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["X", "Y"])) - Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two - excess words in the title that are not in the query. 
- - Example 4, query "A B C", title "A B X Y C": - find_word("A", deque(["A", "B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["B", "X", "Y", "C"])) - find_word("B", deque(["B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["X", "Y", "C"])) - find_word("C", deque(["X", "Y", "C"])) -> (found=True, skipped=2, rest=deque(["X", "Y"])) - Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two - excess words in the title that are not in the query. - - Example 5, query "A B C", title "A C B": - find_word("A", deque(["A", "C", "B"])) -> (found=True, skipped=0, rest=deque(["C", "B"])) - find_word("B", deque(["C", "B"])) -> (found=True, skipped=1, rest=deque(["C"])) - find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque(["C"])) - Conclusion: average penalty for skipping one word in the middle of the title. - - Example 6, query "A B C", title "A C X": - find_word("A", deque(["A", "C", "X"])) -> (found=True, skipped=0, rest=deque(["C", "X"])) - find_word("B", deque(["C", "X"])) -> (found=False, skipped=0, rest=deque(["C", "X"])) - find_word("C", deque(["C", "X"])) -> (found=True, skipped=0, rest=deque(["X"])) - Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word. + the leading non-matching words ('X', 'Y') were moved to the end of the deque. 
""" try: - skipped = title.index(word) + skipped = title.index(word) # find the query word placement in the title and the number of preceding words except ValueError: return False, 0 From 2f078d59c399f3a77d2cdff686f7c6e55e38733c Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 09:45:58 +0100 Subject: [PATCH 26/32] Update a comment --- src/tribler/core/utilities/search_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 2a91cdd9f43..e51d6262e10 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -91,7 +91,7 @@ def seeders_rank(seeders: int, leechers: int = 0) -> float: :return: the torrent rank based on seeders and leechers, normalized to the range [0, 1] """ sl = seeders + leechers * LEECHERS_COEFF - return sl / (100 + sl) # inf seeders -> 1; 100 seeders -> 0.5; 10 seeders -> approx 0.1 + return sl / (100 + sl) # infinity seeders -> rank 1.0; 100 seeders -> rank 0.5; 10 seeders -> approximately 0.1 def freshness_rank(freshness: Optional[float] = 0) -> float: From a670ffa75f5026beb7164627f01f3b93a0bec797 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 12:02:57 +0100 Subject: [PATCH 27/32] Add a SEEDERS_HALF_RANK named constant and a description for the `seeders_rank()` formula --- src/tribler/core/utilities/search_utils.py | 25 ++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index e51d6262e10..1a495ab8529 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -79,19 +79,32 @@ def torrent_rank(query: str, title: str, seeders: int = 0, leechers: int = 0, fr return result -LEECHERS_COEFF = 0.1 # leechers are considered ten times less important than seeders. 
+LEECHERS_COEFF = 0.1 # How much leechers are less important compared to seeders (ten times less important) +SEEDERS_HALF_RANK = 100 # The number of seeders at which the seeders rank is 0.5 def seeders_rank(seeders: int, leechers: int = 0) -> float: """ - Calculates rank based on the number of torrent's seeders and leechers. The result is normalized to the range [0, 1] + Calculates rank based on the number of torrent's seeders and leechers - :param seeders: the number of seeders for the torrent - :param leechers: the number of leechers for the torrent + :param seeders: the number of seeders for the torrent. It is a positive value, usually in the range [0, 1000] + :param leechers: the number of leechers for the torrent. It is a positive value, usually in the range [0, 1000] :return: the torrent rank based on seeders and leechers, normalized to the range [0, 1] """ - sl = seeders + leechers * LEECHERS_COEFF - return sl / (100 + sl) # infinity seeders -> rank 1.0; 100 seeders -> rank 0.5; 10 seeders -> approximately 0.1 + + # The leechers are treated as less capable seeders + sl = seeders + leechers * LEECHERS_COEFF # Seeders and leechers combined + + # The function result has desired properties: + # * zero rank for zero seeders; + # * 0.5 rating for SEEDERS_HALF_RANK seeders; + # * 1.0 rating for an infinite number of seeders; + # * soft curve. 
+ # It is possible to use different curves with the similar shape, for example: + # * 2 * arctan(x / SEEDERS_HALF_RANK) / PI, + # * 1 - exp(x * ln(0.5) / SEEDERS_HALF_RANK) + # but it does not actually matter in practice + return sl / (100 + sl) def freshness_rank(freshness: Optional[float] = 0) -> float: From 158b1790a4f39736f14b7c47fda8ffe7731cebc7 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 12:10:28 +0100 Subject: [PATCH 28/32] Add a description of the `freshness` parameter --- src/tribler/core/utilities/search_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 1a495ab8529..dd2311cf7bc 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -59,8 +59,9 @@ def torrent_rank(query: str, title: str, seeders: int = 0, leechers: int = 0, fr :param title: a torrent name :param seeders: the number of seeders :param leechers: the number of leechers - :param freshness: the number of seconds since the torrent creation. - Zero or negative value means the torrent creation date is unknown. + :param freshness: the number of seconds since the torrent creation. Zero or negative value means the torrent + creation date is unknown. It is more convenient to use comparing to a timestamp, as it avoids + using the `time()` function call and simplifies testing. 
:return: the torrent rank value in range [0, 1] Takes into account: From 808bd27fb73ab2a23d829b8d2bc640e453cf6264 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 12:23:10 +0100 Subject: [PATCH 29/32] Add a description to a `freshness_rank` function's formula --- src/tribler/core/utilities/search_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index dd2311cf7bc..8573e49e169 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -125,8 +125,14 @@ def freshness_rank(freshness: Optional[float] = 0) -> float: """ freshness = max(0, freshness or 0) if not freshness: - return 0 - + return 0 # for freshness <= 0 the rank value is 0 because of an incorrect freshness value + + # The function declines from 1.0 to 0.0 on range (0..Infinity], with the following properties: + # * for just created torrents the rank value is close to 1.0 + # * for 30-days old torrents the rank value is 0.5 + # * for very old torrents the rank value is going to zero + # It was possible to use other formulas with the same properties (for example, exponent-based), + # the exact curve shape is not really important. 
days = (freshness or 0) / SECONDS_IN_DAY return 1 / (1 + days / 30) From 946e573b82f578f01dcdbea6f8b87b832c4df195 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Tue, 1 Nov 2022 12:50:45 +0100 Subject: [PATCH 30/32] Add test_title_rank_range, test_freshness_rank_range, test_seeders_rank_range, test_torrent_rank_range --- src/tribler/core/tests/test_search_utils.py | 32 +++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index e58f6649cb8..ba9418fbb9d 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -2,8 +2,8 @@ import pytest -from tribler.core.utilities.search_utils import filter_keywords, find_word, item_rank, split_into_keywords,\ - torrent_rank, title_rank +from tribler.core.utilities.search_utils import filter_keywords, find_word, freshness_rank, item_rank, seeders_rank, \ + split_into_keywords, torrent_rank, title_rank DAY = 60 * 60 * 24 @@ -25,6 +25,34 @@ def test_filter_keywords(): assert len(result) == 4 +def test_title_rank_range(): + assert title_rank('Big Buck Bunny', 'Big Buck Bunny') == 1 + + long_query = ' '.join(['foo'] * 1000) + long_title = ' '.join(['bar'] * 1000) + assert title_rank(long_query, long_title) == pytest.approx(0.03554968) + + +def test_freshness_rank_range(): + assert freshness_rank(-1) == 0 + assert freshness_rank(0) == 0 + assert freshness_rank(0.001) == pytest.approx(1.0) + assert freshness_rank(1000000000) == pytest.approx(0.0025852989) + + +def test_seeders_rank_range(): + assert seeders_rank(0) == 0 + assert seeders_rank(1000000) == pytest.approx(0.9999) + + +def test_torrent_rank_range(): + assert torrent_rank('Big Buck Bunny', 'Big Buck Bunny', seeders=1000000, freshness=0.01) == pytest.approx(0.99999) + + long_query = ' '.join(['foo'] * 1000) + long_title = ' '.join(['bar'] * 1000) + assert torrent_rank(long_query, long_title, 
freshness=1000000 * 365 * DAY) == pytest.approx(+0.02879524) + + def test_torrent_rank(): query = 'Big Buck Bunny' # The exact match ranked as pretty high From dabb40d3bd5cd729f3b56d1b1d05512ea4af7994 Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Wed, 2 Nov 2022 11:23:59 +0100 Subject: [PATCH 31/32] Rename the function: find_word -> find_word_and_rotate_title --- src/tribler/core/tests/test_search_utils.py | 42 ++++++++++----------- src/tribler/core/utilities/search_utils.py | 19 +++++----- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index ba9418fbb9d..b1a1fe94a18 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -2,8 +2,8 @@ import pytest -from tribler.core.utilities.search_utils import filter_keywords, find_word, freshness_rank, item_rank, seeders_rank, \ - split_into_keywords, torrent_rank, title_rank +from tribler.core.utilities.search_utils import filter_keywords, find_word_and_rotate_title, freshness_rank, item_rank,\ + seeders_rank, split_into_keywords, torrent_rank, title_rank DAY = 60 * 60 * 24 @@ -153,51 +153,51 @@ def test_item_rank(): def test_find_word(): - # To use the find_word function, you can call it one time for each word from the query and see: + # To use the find_word_and_rotate_title function, you can call it one time for each word from the query and see: # - how many query words are missed in the title; # - how many excess or out-of-place title words are found before each query word; # - and how many title words are not mentioned in the query. 
# Example 1, query "A B C", title "A B C" title = deque(["A", "B", "C"]) - assert find_word("A", title) == (True, 0) and title == deque(["B", "C"]) - assert find_word("B", title) == (True, 0) and title == deque(["C"]) - assert find_word("C", title) == (True, 0) and title == deque([]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([]) # Conclusion: exact match. # Example 2, query "A B C", title "A B C D" title = deque(["A", "B", "C", "D"]) - assert find_word("A", title) == (True, 0) and title == deque(["B", "C", "D"]) - assert find_word("B", title) == (True, 0) and title == deque(["C", "D"]) - assert find_word("C", title) == (True, 0) and title == deque(["D"]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C", "D"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "D"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["D"]) # Conclusion: minor penalty for one excess word in the title that is not in the query. 
# Example 3, query "A B C", title "X Y A B C" title = deque(["X", "Y", "A", "B", "C"]) - assert find_word("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"]) - assert find_word("B", title) == (True, 0) and title == deque(["C", "X", "Y"]) - assert find_word("C", title) == (True, 0) and title == deque(["X", "Y"]) + assert find_word_and_rotate_title("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "X", "Y"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X", "Y"]) # Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two # excess words in the title that are not in the query. # Example 4, query "A B C", title "A B X Y C" title = deque(["A", "B", "X", "Y", "C"]) - assert find_word("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"]) - assert find_word("B", title) == (True, 0) and title == deque(["X", "Y", "C"]) - assert find_word("C", title) == (True, 2) and title == deque(["X", "Y"]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["X", "Y", "C"]) + assert find_word_and_rotate_title("C", title) == (True, 2) and title == deque(["X", "Y"]) # Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two # excess words in the title that are not in the query. 
# Example 5, query "A B C", title "A C B" title = deque(["A", "C", "B"]) - assert find_word("A", title) == (True, 0) and title == deque(["C", "B"]) - assert find_word("B", title) == (True, 1) and title == deque(["C"]) - assert find_word("C", title) == (True, 0) and title == deque([]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "B"]) + assert find_word_and_rotate_title("B", title) == (True, 1) and title == deque(["C"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([]) # Conclusion: average penalty for skipping one word in the middle of the title. # Example 6, query "A B C", title "A C X" title = deque(["A", "C", "X"]) - assert find_word("A", title) == (True, 0) and title == deque(["C", "X"]) - assert find_word("B", title) == (False, 0) and title == deque(["C", "X"]) - assert find_word("C", title) == (True, 0) and title == deque(["X"]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "X"]) + assert find_word_and_rotate_title("B", title) == (False, 0) and title == deque(["C", "X"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X"]) # Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word. diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 8573e49e169..ca09b0398c3 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -191,11 +191,12 @@ def calculate_rank(query: List[str], title: List[str]) -> float: # The first word is more important than the second word, and so on word_weight = POSITION_COEFF / (POSITION_COEFF + i) - # Read the description of the `find_word` function to understand what is going on. 
Basically, we are trying - # to find each query word in the title words, calculate the penalty if the query word is not found or if there - # are some title words before it, and then rotate the skipped title words to the end of the title. This way, - # the least penalty got a title that has query words in the proper order at the beginning of the title. - found, skipped = find_word(word, title) + # Read the description of the `find_word_and_rotate_title` function to understand what is going on. + # Basically, we are trying to find each query word in the title words, calculate the penalty if the query word + # is not found or if there are some title words before it, and then rotate the skipped title words to the end + # of the title. This way, the least penalty got a title that has query words in the proper order at the + # beginning of the title. + found, skipped = find_word_and_rotate_title(word, title) if found: # if the query word is found in the title, add penalty for skipped words in title before it total_error += skipped * word_weight @@ -212,7 +213,7 @@ def calculate_rank(query: List[str], title: List[str]) -> float: return RANK_NORMALIZATION_COEFF / (RANK_NORMALIZATION_COEFF + total_error) -def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]: +def find_word_and_rotate_title(word: str, title: Deque[str]) -> Tuple[bool, int]: """ Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title. @@ -225,10 +226,10 @@ def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]: For efficiency reasons, the function modifies the `title` deque in place by removing the first entrance of the found word and rotating all leading non-matching words to the end of the deque. It allows to efficiently - perform multiple calls of the `find_word` function for subsequent words from the same query string. 
+ perform multiple calls of the `find_word_and_rotate_title` function for subsequent words from the same query string. - An example: find_word('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that - the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies + An example: find_word_and_rotate_title('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means + that the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and the leading non-matching words ('X', 'Y') were moved to the end of the deque. """ From c3df3a5340e938248ba826b0650f1f039450b17c Mon Sep 17 00:00:00 2001 From: Alexander Kozlovsky Date: Thu, 3 Nov 2022 16:49:11 +0100 Subject: [PATCH 32/32] Do not perform more than one full-text remote search in parallel --- .../remote_query_community.py | 39 +++++++++-- .../tests/test_remote_query_community.py | 70 +++++++++++++++---- .../tests/test_remote_search_by_tags.py | 8 +-- 3 files changed, 91 insertions(+), 26 deletions(-) diff --git a/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py b/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py index cf443e8d47a..aa6fd817fef 100644 --- a/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py +++ b/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py @@ -1,8 +1,10 @@ import json import struct +import time from asyncio import Future from binascii import unhexlify -from typing import List, Optional, Set +from itertools import count +from typing import Any, Dict, List, Optional, Set from ipv8.lazy_community import lazy_wrapper from ipv8.messaging.lazy_payload import VariablePayload, vp_compile @@ -26,7 
+28,7 @@ BINARY_FIELDS = ("infohash", "channel_pk") -def sanitize_query(query_dict, cap=100): +def sanitize_query(query_dict: Dict[str, Any], cap=100) -> Dict[str, Any]: sanitized_dict = dict(query_dict) # We impose a cap on max numbers of returned entries to prevent DDOS-like attacks @@ -151,6 +153,8 @@ def __init__(self, my_peer, endpoint, network, self.add_message_handler(SelectResponsePayload, self.on_remote_select_response) self.eva = EVAProtocol(self, self.on_receive, self.on_send_complete, self.on_error) + self.remote_queries_in_progress = 0 + self.next_remote_query_num = count().__next__ # generator of sequential numbers, for logging & debug purposes async def on_receive(self, result: TransferResult): self.logger.debug(f"EVA data received: peer {hexlify(result.peer.mid)}, info {result.info}") @@ -183,16 +187,32 @@ def send_remote_select(self, peer, processing_callback=None, force_eva_response= self.ez_send(peer, RemoteSelectPayload(*args)) return request - async def process_rpc_query(self, json_bytes: bytes): + def should_limit_rate_for_query(self, sanitized_parameters: Dict[str, Any]) -> bool: + return 'txt_filter' in sanitized_parameters + + async def process_rpc_query_rate_limited(self, sanitized_parameters: Dict[str, Any]) -> List: + query_num = self.next_remote_query_num() + if self.remote_queries_in_progress and self.should_limit_rate_for_query(sanitized_parameters): + self.logger.warning(f'Ignore remote query {query_num} as another one is already processing. 
' + f'The ignored query: {sanitized_parameters}') + return [] + + self.logger.info(f'Process remote query {query_num}: {sanitized_parameters}') + self.remote_queries_in_progress += 1 + t = time.time() + try: + return await self.process_rpc_query(sanitized_parameters) + finally: + self.remote_queries_in_progress -= 1 + self.logger.info(f'Remote query {query_num} processed in {time.time()-t} seconds: {sanitized_parameters}') + + async def process_rpc_query(self, sanitized_parameters: Dict[str, Any]) -> List: """ Retrieve the result of a database query from a third party, encoded as raw JSON bytes (through `dumps`). :raises TypeError: if the JSON contains invalid keys. :raises ValueError: if no JSON could be decoded. :raises pony.orm.dbapiprovider.OperationalError: if an illegal query was performed. """ - parameters = json.loads(json_bytes) - sanitized_parameters = sanitize_query(parameters, self.rqc_settings.max_response_size) - # tags should be extracted because `get_entries_threaded` doesn't expect them as a parameter tags = sanitized_parameters.pop('tags', None) @@ -237,9 +257,14 @@ async def on_remote_select_eva(self, peer, request_payload): async def on_remote_select(self, peer, request_payload): await self._on_remote_select_basic(peer, request_payload) + def parse_parameters(self, json_bytes: bytes) -> Dict[str, Any]: + parameters = json.loads(json_bytes) + return sanitize_query(parameters, self.rqc_settings.max_response_size) + async def _on_remote_select_basic(self, peer, request_payload, force_eva_response=False): try: - db_results = await self.process_rpc_query(request_payload.json) + sanitized_parameters = self.parse_parameters(request_payload.json) + db_results = await self.process_rpc_query_rate_limited(sanitized_parameters) # When we send our response to a host, we open a window of opportunity # for it to push back updates diff --git a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py 
b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py index 2a82b8c666d..fbd6d25d5bb 100644 --- a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py +++ b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py @@ -1,11 +1,10 @@ import random import string +import time from asyncio import sleep from binascii import unhexlify -from json import dumps from operator import attrgetter from os import urandom -from time import time from unittest.mock import Mock, patch from ipv8.keyvault.crypto import default_eccrypto @@ -112,7 +111,7 @@ async def test_remote_select(self): channel=channel, seeders=2 * i, leechers=i, - last_check=int(time()) + i, + last_check=int(time.time()) + i, ) kwargs_dict = {"txt_filter": "ubuntu*", "metadata_type": [REGULAR_TORRENT]} @@ -345,7 +344,7 @@ async def test_process_rpc_query_match_many(self): channel = self.channel_metadata(0).create_channel("a channel", "") add_random_torrent(self.torrent_metadata(0), name="a torrent", channel=channel) - results = await self.overlay(0).process_rpc_query(dumps({})) + results = await self.overlay(0).process_rpc_query({}) self.assertEqual(2, len(results)) channel_md, torrent_md = results if isinstance(results[0], self.channel_metadata(0)) else results[::-1] @@ -359,7 +358,7 @@ async def test_process_rpc_query_match_one(self): with db_session: self.channel_metadata(0).create_channel("a channel", "") - results = await self.overlay(0).process_rpc_query(dumps({})) + results = await self.overlay(0).process_rpc_query({}) self.assertEqual(1, len(results)) (channel_md,) = results @@ -369,22 +368,22 @@ async def test_process_rpc_query_match_none(self): """ Check if a correct query with no match in our database returns no result. 
""" - results = await self.overlay(0).process_rpc_query(dumps({})) + results = await self.overlay(0).process_rpc_query({}) self.assertEqual(0, len(results)) - async def test_process_rpc_query_match_empty_json(self): + def test_parse_parameters_match_empty_json(self): """ Check if processing an empty request causes a ValueError (JSONDecodeError) to be raised. """ with self.assertRaises(ValueError): - await self.overlay(0).process_rpc_query(b'') + self.overlay(0).parse_parameters(b'') - async def test_process_rpc_query_match_illegal_json(self): + def test_parse_parameters_match_illegal_json(self): """ Check if processing a request with illegal JSON causes a UnicodeDecodeError to be raised. """ with self.assertRaises(UnicodeDecodeError): - await self.overlay(0).process_rpc_query(b'{"akey":\x80}') + self.overlay(0).parse_parameters(b'{"akey":\x80}') async def test_process_rpc_query_match_invalid_json(self): """ @@ -394,21 +393,24 @@ async def test_process_rpc_query_match_invalid_json(self): self.channel_metadata(0).create_channel("a channel", "") query = b'{"id_":' + b'\x31' * 200 + b'}' with self.assertRaises(ValueError): - await self.overlay(0).process_rpc_query(query) + parameters = self.overlay(0).parse_parameters(query) + await self.overlay(0).process_rpc_query(parameters) async def test_process_rpc_query_match_invalid_key(self): """ Check if processing a request with invalid flags causes a UnicodeDecodeError to be raised. """ with self.assertRaises(TypeError): - await self.overlay(0).process_rpc_query(b'{"bla":":("}') + parameters = self.overlay(0).parse_parameters(b'{"bla":":("}') + await self.overlay(0).process_rpc_query(parameters) async def test_process_rpc_query_no_column(self): """ Check if processing a request with no database columns causes an OperationalError. 
""" with self.assertRaises(OperationalError): - await self.overlay(0).process_rpc_query(b'{"txt_filter":{"key":"bla"}}') + parameters = self.overlay(0).parse_parameters(b'{"txt_filter":{"key":"bla"}}') + await self.overlay(0).process_rpc_query(parameters) async def test_remote_query_big_response(self): @@ -574,3 +576,45 @@ async def test_remote_select_force_eva(self): await self.deliver_messages(timeout=0.5) self.nodes[1].overlay.eva.send_binary.assert_called_once() + + async def test_multiple_parallel_request(self): + peer_a = self.nodes[0].my_peer + a = self.nodes[0].overlay + b = self.nodes[1].overlay + + # Peer A has two torrents "foo" and "bar" + with db_session: + add_random_torrent(a.mds.TorrentMetadata, name="foo") + add_random_torrent(a.mds.TorrentMetadata, name="bar") + + # Peer B sends two parallel full-text search queries, only one of them should be processed + callback1 = Mock() + kwargs1 = {"txt_filter": "foo", "metadata_type": [REGULAR_TORRENT]} + b.send_remote_select(peer_a, **kwargs1, processing_callback=callback1) + + callback2 = Mock() + kwargs2 = {"txt_filter": "bar", "metadata_type": [REGULAR_TORRENT]} + b.send_remote_select(peer_a, **kwargs2, processing_callback=callback2) + + original_get_entries = MetadataStore.get_entries + # Add a delay to ensure that the first query is still being processed when the second one arrives + # (the mds.get_entries() method is a synchronous one and is called from a worker thread) + + def slow_get_entries(self, *args, **kwargs): + time.sleep(0.1) + return original_get_entries(self, *args, **kwargs) + + with patch.object(a, 'logger') as logger, patch.object(MetadataStore, 'get_entries', slow_get_entries): + await self.deliver_messages(timeout=0.5) + + torrents1 = list(b.mds.get_entries(**kwargs1)) + torrents2 = list(b.mds.get_entries(**kwargs2)) + + # Both remote queries should return results to the peer B... 
+ assert callback1.called and callback2.called + # ...but one of them should return an empty list, as the database query was not actually executed + assert bool(torrents1) != bool(torrents2) + + # Check that on peer A there is exactly one warning about an ignored remote query + warnings = [call.args[0] for call in logger.warning.call_args_list] + assert len([msg for msg in warnings if msg.startswith('Ignore remote query')]) == 1 diff --git a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py index 95321f121cc..0ab2fdb645b 100644 --- a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py +++ b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py @@ -72,9 +72,7 @@ def test_search_for_tags_only_valid_tags(self, mocked_get_subjects_intersection: async def test_process_rpc_query_no_tags(self, mocked_get_entries_threaded: AsyncMock): # test that in case of missed tags, the remote search works like normal remote search parameters = {'first': 0, 'infohash_set': None, 'last': 100} - json = dumps(parameters).encode('utf-8') - - await self.rqc.process_rpc_query(json) + await self.rqc.process_rpc_query(parameters) expected_parameters = {'infohash_set': None} expected_parameters.update(parameters) @@ -117,10 +115,8 @@ def _add(infohash): # Then we try to query search for three tags: 'tag1', 'tag2', 'tag3' parameters = {'first': 0, 'infohash_set': None, 'last': 100, 'tags': ['tag1']} - json = dumps(parameters).encode('utf-8') - with db_session: - query_results = [r.to_dict() for r in await self.rqc.process_rpc_query(json)] + query_results = [r.to_dict() for r in await self.rqc.process_rpc_query(parameters)] # Expected results: only one infohash (b'infohash1') should be returned. 
result_infohash_list = [r['infohash'] for r in query_results]