diff --git a/src/tribler/core/components/gigachannel/community/sync_strategy.py b/src/tribler/core/components/gigachannel/community/sync_strategy.py index 01caab05ec1..c6fc1b3a300 100644 --- a/src/tribler/core/components/gigachannel/community/sync_strategy.py +++ b/src/tribler/core/components/gigachannel/community/sync_strategy.py @@ -3,6 +3,9 @@ from ipv8.peerdiscovery.discovery import DiscoveryStrategy +TARGET_PEERS_NUMBER = 20 + + class RemovePeers(DiscoveryStrategy): """ Synchronization strategy for remote query community. @@ -10,8 +13,12 @@ class RemovePeers(DiscoveryStrategy): Remove a random peer, if we have enough peers to walk to. """ + def __init__(self, overlay, target_peers_number=TARGET_PEERS_NUMBER): + super().__init__(overlay) + self.target_peers_number = target_peers_number + def take_step(self): with self.walk_lock: peers = self.overlay.get_peers() - if peers and len(peers) > 20: + if peers and len(peers) > self.target_peers_number: self.overlay.network.remove_peer(choice(peers)) diff --git a/src/tribler/core/components/metadata_store/db/store.py b/src/tribler/core/components/metadata_store/db/store.py index 513ef6075ca..78089c42a55 100644 --- a/src/tribler/core/components/metadata_store/db/store.py +++ b/src/tribler/core/components/metadata_store/db/store.py @@ -10,6 +10,7 @@ from pony import orm from pony.orm import db_session, desc, left_join, raw_sql, select +from pony.orm.dbproviders.sqlite import keep_exception from tribler.core import notifications from tribler.core.components.metadata_store.db.orm_bindings import ( @@ -50,9 +51,11 @@ from tribler.core.utilities.notifier import Notifier from tribler.core.utilities.path_util import Path from tribler.core.utilities.pony_utils import get_max, get_or_create +from tribler.core.utilities.search_utils import torrent_rank from tribler.core.utilities.unicode import hexlify from tribler.core.utilities.utilities import MEMORY_DB + BETA_DB_VERSIONS = [0, 1, 2, 3, 4, 5] CURRENT_DB_VERSION = 14 @@ -167,7 +170,7 @@ def __init__( # with the static analysis. # pylint: disable=unused-variable @self._db.on_connect(provider='sqlite') - def sqlite_disable_sync(_, connection): + def on_connect(_, connection): cursor = connection.cursor() cursor.execute("PRAGMA journal_mode = WAL") cursor.execute("PRAGMA synchronous = NORMAL") @@ -180,6 +183,10 @@ def sqlite_disable_sync(_, connection): # losing power during a write will corrupt the database. cursor.execute("PRAGMA journal_mode = 0") cursor.execute("PRAGMA synchronous = 0") + + sqlite_rank = keep_exception(torrent_rank) + connection.create_function('search_rank', 5, sqlite_rank) + # pylint: enable=unused-variable self.MiscData = misc.define_binding(self._db) @@ -591,7 +598,7 @@ def torrent_exists_in_personal_channel(self, infohash): ) # pylint: disable=unused-argument - def search_keyword(self, query, lim=100): + def search_keyword(self, query): # Requires FTS5 table "FtsIndex" to be generated and populated. # FTS table is maintained automatically by SQL triggers. # BM25 ranking is embedded in FTS5. 
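Aside: the `on_connect` handler above wires the Python-side ranking into SQLite with `connection.create_function('search_rank', 5, sqlite_rank)`. For readers unfamiliar with that mechanism, here is a minimal, self-contained sketch using the standard `sqlite3` module; `toy_rank` is a hypothetical stand-in for `torrent_rank()`:

```python
import sqlite3


def toy_rank(query, title, seeders, leechers, age_seconds):
    # Hypothetical stand-in for torrent_rank(); returns a float in [0, 1].
    return 1.0 if query.lower() in title.lower() else 0.5


conn = sqlite3.connect(':memory:')
# Same registration pattern as in on_connect() above: function name, arity, callable.
conn.create_function('search_rank', 5, toy_rank)
conn.execute('CREATE TABLE t (title TEXT, seeders INT, leechers INT, age INT)')
conn.execute("INSERT INTO t VALUES ('Big Buck Bunny', 10, 2, 86400)")
row = conn.execute(
    "SELECT title, search_rank('bunny', title, seeders, leechers, age) FROM t"
).fetchone()
print(row)  # ('Big Buck Bunny', 1.0)
```

The `keep_exception` wrapper matters here because SQLite normally swallows an exception raised inside a user-defined function and reports only a generic `OperationalError`; the wrapper preserves the original exception on the Python side.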
@@ -600,10 +607,11 @@ def search_keyword(self, query, lim=100):
         if not query or query == "*":
             return []

-        fts_ids = raw_sql(
-            """SELECT rowid FROM ChannelNode WHERE rowid IN (SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query
-            ORDER BY bm25(FtsIndex) LIMIT $lim) GROUP BY coalesce(infohash, rowid)"""
-        )
+        fts_ids = raw_sql("""
+            SELECT rowid FROM ChannelNode
+            WHERE rowid IN (SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query)
+            GROUP BY coalesce(infohash, rowid)
+        """)
         return left_join(g for g in self.MetadataNode if g.rowid in fts_ids)  # pylint: disable=E1135

     @db_session
@@ -639,7 +647,7 @@ def get_entries_query(
         if cls is None:
             cls = self.ChannelNode

-        pony_query = self.search_keyword(txt_filter, lim=1000) if txt_filter else left_join(g for g in cls)
+        pony_query = self.search_keyword(txt_filter) if txt_filter else left_join(g for g in cls)
         infohash_set = infohash_set or ({infohash} if infohash else None)
         if popular:
             if metadata_type != REGULAR_TORRENT:
@@ -728,10 +736,49 @@ def get_entries_query(

         if sort_by is None:
             if txt_filter:
+                # pylint: disable=W0105
+                """
+                The following call to `sort_by` produces an ORDER BY expression that looks like this:
+
+                ORDER BY
+                    case when "g"."metadata_type" = $CHANNEL_TORRENT then 1
+                         when "g"."metadata_type" = $COLLECTION_NODE then 2
+                         else 3 end,
+
+                    search_rank(
+                        $QUERY_STRING,
+                        g.title,
+                        torrentstate.seeders,
+                        torrentstate.leechers,
+                        $CURRENT_TIME - strftime('%s', g.torrent_date)
+                    ) DESC,
+
+                    "torrentstate"."last_check" DESC
+
+                So the channel torrents and channel folders are always on top if they are not filtered out.
+                Then regular torrents follow, ordered by their relevance according to the search_rank() result.
+                If two torrents have the same search rank, they are ordered by the last time they were checked.
+
+                The search_rank() function is called directly from the SQLite query but is implemented in Python:
+                it is actually the torrent_rank() function from core/utilities/search_utils.py, wrapped with
+                keep_exception() so that a possible exception is propagated from SQLite back to Python.
+
+                The search_rank() function receives the following arguments:
+                  - the current query string (like "Big Buck Bunny");
+                  - the title of the current torrent;
+                  - the number of seeders;
+                  - the number of leechers;
+                  - the number of seconds since the torrent's creation time.
+                """
+
             pony_query = pony_query.sort_by(
                 f"""
                 (1 if g.metadata_type == {CHANNEL_TORRENT} else 2 if g.metadata_type == {COLLECTION_NODE} else 3),
-                desc(g.health.seeders), desc(g.health.leechers)
+                raw_sql('''search_rank(
+                    $txt_filter, g.title, torrentstate.seeders, torrentstate.leechers,
+                    $int(time()) - strftime('%s', g.torrent_date)
+                ) DESC'''),
+                desc(g.health.last_check)  # just to trigger the TorrentState table inclusion into the left join
                 """
             )
         elif popular:
diff --git a/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py b/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py
index cf443e8d47a..aa6fd817fef 100644
--- a/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py
+++ b/src/tribler/core/components/metadata_store/remote_query_community/remote_query_community.py
@@ -1,8 +1,10 @@
 import json
 import struct
+import time
 from asyncio import Future
 from binascii import unhexlify
-from typing import List, Optional, Set
+from itertools import count
+from typing import Any, Dict, List, Optional, Set

 from ipv8.lazy_community import lazy_wrapper
 from ipv8.messaging.lazy_payload import VariablePayload, vp_compile
@@ -26,7 +28,7 @@
 BINARY_FIELDS = ("infohash", "channel_pk")


-def sanitize_query(query_dict, cap=100):
+def sanitize_query(query_dict: Dict[str, Any], cap=100) -> Dict[str, Any]:
     sanitized_dict = dict(query_dict)

     # We impose a cap on max numbers of returned entries to prevent DDOS-like attacks
@@ -151,6 +153,8 @@ def __init__(self, my_peer, endpoint, network,
         self.add_message_handler(SelectResponsePayload, self.on_remote_select_response)

         self.eva = EVAProtocol(self, self.on_receive, self.on_send_complete, self.on_error)
+        self.remote_queries_in_progress = 0
+        self.next_remote_query_num = count().__next__  # generator of sequential numbers, for logging & debug purposes

     async def on_receive(self, result: TransferResult):
         self.logger.debug(f"EVA data received: peer {hexlify(result.peer.mid)}, info {result.info}")
@@ -183,16 +187,32 @@ def send_remote_select(self, peer, processing_callback=None, force_eva_response=
             self.ez_send(peer, RemoteSelectPayload(*args))
         return request

-    async def process_rpc_query(self, json_bytes: bytes):
+    def should_limit_rate_for_query(self, sanitized_parameters: Dict[str, Any]) -> bool:
+        return 'txt_filter' in sanitized_parameters
+
+    async def process_rpc_query_rate_limited(self, sanitized_parameters: Dict[str, Any]) -> List:
+        query_num = self.next_remote_query_num()
+        if self.remote_queries_in_progress and self.should_limit_rate_for_query(sanitized_parameters):
+            self.logger.warning(f'Ignore remote query {query_num} as another one is already being processed. '
+                                f'The ignored query: {sanitized_parameters}')
+            return []
+
+        self.logger.info(f'Process remote query {query_num}: {sanitized_parameters}')
+        self.remote_queries_in_progress += 1
+        t = time.time()
+        try:
+            return await self.process_rpc_query(sanitized_parameters)
+        finally:
+            self.remote_queries_in_progress -= 1
+            self.logger.info(f'Remote query {query_num} processed in {time.time() - t} seconds: {sanitized_parameters}')
+
+    async def process_rpc_query(self, sanitized_parameters: Dict[str, Any]) -> List:
         """
-        Retrieve the result of a database query from a third party, encoded as raw JSON bytes (through `dumps`).
-        :raises TypeError: if the JSON contains invalid keys.
-        :raises ValueError: if no JSON could be decoded.
+        Retrieve the results of a database query from a third party, described by a dict of sanitized
+        query parameters (see `parse_parameters`).
+        :raises TypeError: if the parameters contain invalid keys.
+        :raises ValueError: if the parameters contain invalid values.
         :raises pony.orm.dbapiprovider.OperationalError: if an illegal query was performed.
         """
-        parameters = json.loads(json_bytes)
-        sanitized_parameters = sanitize_query(parameters, self.rqc_settings.max_response_size)
-
         # tags should be extracted because `get_entries_threaded` doesn't expect them as a parameter
         tags = sanitized_parameters.pop('tags', None)

@@ -237,9 +257,14 @@ async def on_remote_select_eva(self, peer, request_payload):
     async def on_remote_select(self, peer, request_payload):
         await self._on_remote_select_basic(peer, request_payload)

+    def parse_parameters(self, json_bytes: bytes) -> Dict[str, Any]:
+        parameters = json.loads(json_bytes)
+        return sanitize_query(parameters, self.rqc_settings.max_response_size)
+
     async def _on_remote_select_basic(self, peer, request_payload, force_eva_response=False):
         try:
-            db_results = await self.process_rpc_query(request_payload.json)
+            sanitized_parameters = self.parse_parameters(request_payload.json)
+            db_results = await self.process_rpc_query_rate_limited(sanitized_parameters)

             # When we send our response to a host, we open a window of opportunity
             # for it to push back updates
diff --git a/src/tribler/core/components/metadata_store/remote_query_community/settings.py b/src/tribler/core/components/metadata_store/remote_query_community/settings.py
index ea380da426e..9fcd21bdbf5 100644
--- a/src/tribler/core/components/metadata_store/remote_query_community/settings.py
+++ b/src/tribler/core/components/metadata_store/remote_query_community/settings.py
@@ -5,7 +5,16 @@ class RemoteQueryCommunitySettings(TriblerConfigSection):
     minimal_blob_size: int = 200
     maximum_payload_size: int = 1300
     max_entries: int = maximum_payload_size // minimal_blob_size
-    max_query_peers: int = 5
+
+    # The next option is currently used by GigaChannelCommunity only. We should probably move it to the
+    # GigaChannelCommunity settings or to a dedicated search-related section. The value of this option corresponds
+    # to TARGET_PEERS_NUMBER in src/tribler/core/components/gigachannel/community/sync_strategy.py, that is, to
+    # the number of peers that GigaChannelCommunity will have after a long run (initially, the number of peers in
+    # GigaChannelCommunity can rise to several hundred due to DiscoveryBooster). The number of parallel remote
+    # requests should be neither too small (so we get varied results from remote peers) nor too big (to avoid
+    # flooding the network with an excessively high number of queries). TARGET_PEERS_NUMBER looks like a good
+    # middle ground here.
+ max_query_peers: int = 20 + max_response_size: int = 100 # Max number of entries returned by SQL query max_channel_query_back: int = 4 # Max number of entries to query back on receiving an unknown channel push_updates_back_enabled = True diff --git a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py index 2a82b8c666d..fbd6d25d5bb 100644 --- a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py +++ b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_query_community.py @@ -1,11 +1,10 @@ import random import string +import time from asyncio import sleep from binascii import unhexlify -from json import dumps from operator import attrgetter from os import urandom -from time import time from unittest.mock import Mock, patch from ipv8.keyvault.crypto import default_eccrypto @@ -112,7 +111,7 @@ async def test_remote_select(self): channel=channel, seeders=2 * i, leechers=i, - last_check=int(time()) + i, + last_check=int(time.time()) + i, ) kwargs_dict = {"txt_filter": "ubuntu*", "metadata_type": [REGULAR_TORRENT]} @@ -345,7 +344,7 @@ async def test_process_rpc_query_match_many(self): channel = self.channel_metadata(0).create_channel("a channel", "") add_random_torrent(self.torrent_metadata(0), name="a torrent", channel=channel) - results = await self.overlay(0).process_rpc_query(dumps({})) + results = await self.overlay(0).process_rpc_query({}) self.assertEqual(2, len(results)) channel_md, torrent_md = results if isinstance(results[0], self.channel_metadata(0)) else results[::-1] @@ -359,7 +358,7 @@ async def test_process_rpc_query_match_one(self): with db_session: self.channel_metadata(0).create_channel("a channel", "") - results = await self.overlay(0).process_rpc_query(dumps({})) + results = await self.overlay(0).process_rpc_query({}) self.assertEqual(1, len(results)) (channel_md,) = results @@ -369,22 +368,22 @@ async def test_process_rpc_query_match_none(self): """ Check if a correct query with no match in our database returns no result. """ - results = await self.overlay(0).process_rpc_query(dumps({})) + results = await self.overlay(0).process_rpc_query({}) self.assertEqual(0, len(results)) - async def test_process_rpc_query_match_empty_json(self): + def test_parse_parameters_match_empty_json(self): """ Check if processing an empty request causes a ValueError (JSONDecodeError) to be raised. """ with self.assertRaises(ValueError): - await self.overlay(0).process_rpc_query(b'') + self.overlay(0).parse_parameters(b'') - async def test_process_rpc_query_match_illegal_json(self): + def test_parse_parameters_match_illegal_json(self): """ Check if processing a request with illegal JSON causes a UnicodeDecodeError to be raised. 
""" with self.assertRaises(UnicodeDecodeError): - await self.overlay(0).process_rpc_query(b'{"akey":\x80}') + self.overlay(0).parse_parameters(b'{"akey":\x80}') async def test_process_rpc_query_match_invalid_json(self): """ @@ -394,21 +393,24 @@ async def test_process_rpc_query_match_invalid_json(self): self.channel_metadata(0).create_channel("a channel", "") query = b'{"id_":' + b'\x31' * 200 + b'}' with self.assertRaises(ValueError): - await self.overlay(0).process_rpc_query(query) + parameters = self.overlay(0).parse_parameters(query) + await self.overlay(0).process_rpc_query(parameters) async def test_process_rpc_query_match_invalid_key(self): """ Check if processing a request with invalid flags causes a UnicodeDecodeError to be raised. """ with self.assertRaises(TypeError): - await self.overlay(0).process_rpc_query(b'{"bla":":("}') + parameters = self.overlay(0).parse_parameters(b'{"bla":":("}') + await self.overlay(0).process_rpc_query(parameters) async def test_process_rpc_query_no_column(self): """ Check if processing a request with no database columns causes an OperationalError. """ with self.assertRaises(OperationalError): - await self.overlay(0).process_rpc_query(b'{"txt_filter":{"key":"bla"}}') + parameters = self.overlay(0).parse_parameters(b'{"txt_filter":{"key":"bla"}}') + await self.overlay(0).process_rpc_query(parameters) async def test_remote_query_big_response(self): @@ -574,3 +576,45 @@ async def test_remote_select_force_eva(self): await self.deliver_messages(timeout=0.5) self.nodes[1].overlay.eva.send_binary.assert_called_once() + + async def test_multiple_parallel_request(self): + peer_a = self.nodes[0].my_peer + a = self.nodes[0].overlay + b = self.nodes[1].overlay + + # Peer A has two torrents "foo" and "bar" + with db_session: + add_random_torrent(a.mds.TorrentMetadata, name="foo") + add_random_torrent(a.mds.TorrentMetadata, name="bar") + + # Peer B sends two parallel full-text search queries, only one of them should be processed + callback1 = Mock() + kwargs1 = {"txt_filter": "foo", "metadata_type": [REGULAR_TORRENT]} + b.send_remote_select(peer_a, **kwargs1, processing_callback=callback1) + + callback2 = Mock() + kwargs2 = {"txt_filter": "bar", "metadata_type": [REGULAR_TORRENT]} + b.send_remote_select(peer_a, **kwargs2, processing_callback=callback2) + + original_get_entries = MetadataStore.get_entries + # Add a delay to ensure that the first query is still being processed when the second one arrives + # (the mds.get_entries() method is a synchronous one and is called from a worker thread) + + def slow_get_entries(self, *args, **kwargs): + time.sleep(0.1) + return original_get_entries(self, *args, **kwargs) + + with patch.object(a, 'logger') as logger, patch.object(MetadataStore, 'get_entries', slow_get_entries): + await self.deliver_messages(timeout=0.5) + + torrents1 = list(b.mds.get_entries(**kwargs1)) + torrents2 = list(b.mds.get_entries(**kwargs2)) + + # Both remote queries should return results to the peer B... 
+ assert callback1.called and callback2.called + # ...but one of them should return an empty list, as the database query was not actually executed + assert bool(torrents1) != bool(torrents2) + + # Check that on peer A there is exactly one warning about an ignored remote query + warnings = [call.args[0] for call in logger.warning.call_args_list] + assert len([msg for msg in warnings if msg.startswith('Ignore remote query')]) == 1 diff --git a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py index 95321f121cc..0ab2fdb645b 100644 --- a/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py +++ b/src/tribler/core/components/metadata_store/remote_query_community/tests/test_remote_search_by_tags.py @@ -72,9 +72,7 @@ def test_search_for_tags_only_valid_tags(self, mocked_get_subjects_intersection: async def test_process_rpc_query_no_tags(self, mocked_get_entries_threaded: AsyncMock): # test that in case of missed tags, the remote search works like normal remote search parameters = {'first': 0, 'infohash_set': None, 'last': 100} - json = dumps(parameters).encode('utf-8') - - await self.rqc.process_rpc_query(json) + await self.rqc.process_rpc_query(parameters) expected_parameters = {'infohash_set': None} expected_parameters.update(parameters) @@ -117,10 +115,8 @@ def _add(infohash): # Then we try to query search for three tags: 'tag1', 'tag2', 'tag3' parameters = {'first': 0, 'infohash_set': None, 'last': 100, 'tags': ['tag1']} - json = dumps(parameters).encode('utf-8') - with db_session: - query_results = [r.to_dict() for r in await self.rqc.process_rpc_query(json)] + query_results = [r.to_dict() for r in await self.rqc.process_rpc_query(parameters)] # Expected results: only one infohash (b'infohash1') should be returned. 
        result_infohash_list = [r['infohash'] for r in query_results]
diff --git a/src/tribler/core/components/metadata_store/restapi/search_endpoint.py b/src/tribler/core/components/metadata_store/restapi/search_endpoint.py
index e0710d53db0..a7a1963871f 100644
--- a/src/tribler/core/components/metadata_store/restapi/search_endpoint.py
+++ b/src/tribler/core/components/metadata_store/restapi/search_endpoint.py
@@ -1,3 +1,4 @@
+import time
 from collections import defaultdict
 from typing import Dict, List

@@ -119,13 +120,27 @@ async def search(self, request):

         def search_db():
             with db_session:
+                t1 = time.time()
                 pony_query = mds.get_entries(**sanitized)
+                t2 = time.time()
                 search_results = [r.to_simple_dict() for r in pony_query]
+                t3 = time.time()
                 if include_total:
                     total = mds.get_total_count(**sanitized)
+                    t4 = time.time()
                     max_rowid = mds.get_max_rowid()
+                    t5 = time.time()
+                    self._logger.info(f'Search performance for {sanitized}:\n'
+                                      f'Main query executed in {t2 - t1:.6} seconds;\n'
+                                      f'Result constructed in {t3 - t2:.6} seconds;\n'
+                                      f'Total rows count calculated in {t4 - t3:.6} seconds;\n'
+                                      f'Max rowid determined in {t5 - t4:.6} seconds.')
                 else:
                     total = max_rowid = None
+                    self._logger.info(f'Search performance for {sanitized}:\n'
+                                      f'Main query executed in {t2 - t1:.6} seconds;\n'
+                                      f'Result constructed in {t3 - t2:.6} seconds.')
+
             return search_results, total, max_rowid

         try:
diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py
index 0506d136f57..b1a1fe94a18 100644
--- a/src/tribler/core/tests/test_search_utils.py
+++ b/src/tribler/core/tests/test_search_utils.py
@@ -1,4 +1,12 @@
-from tribler.core.utilities.search_utils import filter_keywords, split_into_keywords
+from collections import deque
+
+import pytest
+
+from tribler.core.utilities.search_utils import filter_keywords, find_word_and_rotate_title, freshness_rank, \
+    item_rank, seeders_rank, split_into_keywords, title_rank, torrent_rank
+
+
+DAY = 60 * 60 * 24


 def test_split_into_keywords():
@@ -15,3 +23,181 @@ def test_filter_keywords():
     result = filter_keywords(["to", "be", "or", "not", "to", "be"])
     assert isinstance(result, list)
     assert len(result) == 4
+
+
+def test_title_rank_range():
+    assert title_rank('Big Buck Bunny', 'Big Buck Bunny') == 1
+
+    long_query = ' '.join(['foo'] * 1000)
+    long_title = ' '.join(['bar'] * 1000)
+    assert title_rank(long_query, long_title) == pytest.approx(0.03554968)
+
+
+def test_freshness_rank_range():
+    assert freshness_rank(-1) == 0
+    assert freshness_rank(0) == 0
+    assert freshness_rank(0.001) == pytest.approx(1.0)
+    assert freshness_rank(1000000000) == pytest.approx(0.0025852989)
+
+
+def test_seeders_rank_range():
+    assert seeders_rank(0) == 0
+    assert seeders_rank(1000000) == pytest.approx(0.9999)
+
+
+def test_torrent_rank_range():
+    assert torrent_rank('Big Buck Bunny', 'Big Buck Bunny', seeders=1000000, freshness=0.01) == pytest.approx(0.99999)
+
+    long_query = ' '.join(['foo'] * 1000)
+    long_title = ' '.join(['bar'] * 1000)
+    assert torrent_rank(long_query, long_title, freshness=1000000 * 365 * DAY) == pytest.approx(0.02879524)
+
+
+def test_torrent_rank():
+    query = 'Big Buck Bunny'
+    # The exact match is ranked pretty high
+    title_match = torrent_rank(query, 'Big Buck Bunny')  # 0.81
+    assert title_match > 0.8
+
+    # Seeders are good for the rank
+    # The more seeders, the better
+    # The fewer days have passed since the creation of the torrent, the higher its rank
+    assert torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=1 * DAY) > \
+           torrent_rank(query, 'Big Buck Bunny', seeders=1000, freshness=100 * DAY) > \
+           torrent_rank(query, 'Big Buck Bunny', seeders=100, freshness=100 * DAY) > \
+           title_match
+
+    # If a title contains non-matching words that are missing from the query string, it is not as good as the exact match
+    # The closer to the start of the title the non-matching words are placed, the worse the rank
+    assert title_match > \
+           torrent_rank(query, 'Big Buck Bunny II') > \
+           torrent_rank(query, 'Big Buck Brown Bunny') > \
+           torrent_rank(query, 'Big Bad Buck Bunny') > \
+           torrent_rank(query, 'Boring Big Buck Bunny')
+
+    # The more non-matching words are in the title, the worse the rank
+    assert title_match > \
+           torrent_rank(query, 'Big Buck A Bunny') > \
+           torrent_rank(query, 'Big Buck A B Bunny') > \
+           torrent_rank(query, 'Big Buck A B C Bunny')
+
+    # Non-matching words close to the beginning of the title give a bigger penalty
+    assert title_match > \
+           torrent_rank(query, 'Big A Buck Bunny') > \
+           torrent_rank(query, 'Big A B Buck Bunny') > \
+           torrent_rank(query, 'Big A B C Buck Bunny')
+
+    assert title_match > \
+           torrent_rank(query, 'A Big Buck Bunny') > \
+           torrent_rank(query, 'A B Big Buck Bunny') > \
+           torrent_rank(query, 'A B C Big Buck Bunny')
+
+    assert torrent_rank(query, 'Big A Buck Bunny') > \
+           torrent_rank(query, 'A Big Buck Bunny')
+
+    assert torrent_rank(query, 'Big A B Buck Bunny') > \
+           torrent_rank(query, 'A B Big Buck Bunny')
+
+    assert torrent_rank(query, 'Big A B C Buck Bunny') > \
+           torrent_rank(query, 'A B C Big Buck Bunny')
+
+    # A wrong order of words in the title imposes a penalty on the rank
+    assert title_match > \
+           torrent_rank(query, 'Big Bunny Buck')
+
+    # Missed query words impose a really big penalty
+    assert torrent_rank(query, 'Big Buck') < 0.5
+
+    # The closer the missed words are to the beginning of the query, the worse
+    assert torrent_rank(query, 'Big Buck') > \
+           torrent_rank(query, 'Big Bunny') > \
+           torrent_rank(query, 'Buck Bunny')
+
+    # More seeders are still better
+    assert torrent_rank(query, 'Buck Bunny', seeders=1000, freshness=5 * DAY) > \
+           torrent_rank(query, 'Buck Bunny', seeders=100, freshness=5 * DAY) > \
+           torrent_rank(query, 'Buck Bunny', seeders=10, freshness=5 * DAY) > \
+           torrent_rank(query, 'Buck Bunny')
+
+    # The more days have passed since the check, the less relevant the number of seeders is
+    assert torrent_rank(query, 'Buck Bunny', freshness=5 * DAY) > \
+           torrent_rank(query, 'Buck Bunny', freshness=10 * DAY) > \
+           torrent_rank(query, 'Buck Bunny', freshness=20 * DAY)
+
+    # The exact match has a good rank
+    assert torrent_rank('Sintel', 'Sintel') > 0.8
+
+    # Non-matching words at the end of the title give slightly worse results
+    # Non-matching words at the beginning of the title are much worse
+    # Too many non-matching words give a bigger penalty
+    assert torrent_rank('Sintel', 'Sintel') > \
+           torrent_rank('Sintel', 'Sintel Part II') > \
+           torrent_rank('Sintel', 'Part of Sintel') > \
+           torrent_rank('Sintel', 'the.script.from.the.movie.Sintel.pdf')
+
+    # Some more examples
+    assert torrent_rank("Internet's Own Boy", "Internet's Own Boy") > \
+           torrent_rank("Internet's Own Boy", "Internet's very Own Boy") > \
+           torrent_rank("Internet's Own Boy", "Internet's very special Boy person")
+
+
+def test_title_rank():
+    # tests for better coverage of corner cases
+    assert title_rank("", "title") == pytest.approx(1.0)
+    assert title_rank("query", "") == pytest.approx(0.0)
+
+
+def test_item_rank():
+    item = dict(name="abc", num_seeders=10, num_leechers=20)
+    assert item_rank("abc", item) == pytest.approx(0.81978445)
+
+
+def test_find_word():
+    # To use the find_word_and_rotate_title function, you can call it once for each word from the query and see:
+    # - how many query words are missed in the title;
+    # - how many excess or out-of-place title words are found before each query word;
+    # - and how many title words are not mentioned in the query.
+
+    # Example 1, query "A B C", title "A B C"
+    title = deque(["A", "B", "C"])
+    assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C"])
+    assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C"])
+    assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([])
+    # Conclusion: exact match.
+
+    # Example 2, query "A B C", title "A B C D"
+    title = deque(["A", "B", "C", "D"])
+    assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C", "D"])
+    assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "D"])
+    assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["D"])
+    # Conclusion: minor penalty for one excess word in the title that is not in the query.
+
+    # Example 3, query "A B C", title "X Y A B C"
+    title = deque(["X", "Y", "A", "B", "C"])
+    assert find_word_and_rotate_title("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"])
+    assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "X", "Y"])
+    assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X", "Y"])
+    # Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two
+    # excess words in the title that are not in the query.
+
+    # Example 4, query "A B C", title "A B X Y C"
+    title = deque(["A", "B", "X", "Y", "C"])
+    assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"])
+    assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["X", "Y", "C"])
+    assert find_word_and_rotate_title("C", title) == (True, 2) and title == deque(["X", "Y"])
+    # Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two
+    # excess words in the title that are not in the query.
+
+    # Example 5, query "A B C", title "A C B"
+    title = deque(["A", "C", "B"])
+    assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "B"])
+    assert find_word_and_rotate_title("B", title) == (True, 1) and title == deque(["C"])
+    assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([])
+    # Conclusion: average penalty for skipping one word in the middle of the title.
+
+    # Example 6, query "A B C", title "A C X"
+    title = deque(["A", "C", "X"])
+    assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "X"])
+    assert find_word_and_rotate_title("B", title) == (False, 0) and title == deque(["C", "X"])
+    assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X"])
+    # Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word.
diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py
index 6c4180a41b9..ca09b0398c3 100644
--- a/src/tribler/core/utilities/search_utils.py
+++ b/src/tribler/core/utilities/search_utils.py
@@ -1,13 +1,18 @@
 """
 Search utilities.
-Author(s): Jelle Roozenburg, Arno Bakker
+Author(s): Jelle Roozenburg, Arno Bakker, Alexander Kozlovsky
 """
 import re
+import time
+from collections import deque
+from typing import Deque, List, Optional, Tuple

 RE_KEYWORD_SPLIT = re.compile(r"[\W_]", re.UNICODE)
 DIALOG_STOPWORDS = {'an', 'and', 'by', 'for', 'from', 'of', 'the', 'to', 'with'}

+SECONDS_IN_DAY = 60 * 60 * 24
+

 def split_into_keywords(string, to_filter_stopwords=False):
     """
@@ -27,3 +32,212 @@ def filter_keywords(keywords):

 def filter_keywords(keywords):
     return [kw for kw in keywords if len(kw) > 0 and kw not in DIALOG_STOPWORDS]
+
+
+def item_rank(query: str, item: dict) -> float:
+    """
+    Calculates the torrent rank for an item received from a remote query. Returns the torrent rank value
+    in the range [0, 1].
+
+    :param query: a user-defined query string
+    :param item: a dict with torrent info.
+                 Should include the key `name`; can include `num_seeders`, `num_leechers`, `updated`
+    :return: the torrent rank value in the range [0, 1]
+    """
+
+    title = item['name']
+    seeders = item.get('num_seeders', 0)
+    leechers = item.get('num_leechers', 0)
+    freshness = time.time() - item.get('updated', 0)
+    return torrent_rank(query, title, seeders, leechers, freshness)
+
+
+def torrent_rank(query: str, title: str, seeders: int = 0, leechers: int = 0, freshness: Optional[float] = 0) -> float:
+    """
+    Calculates the search rank for a torrent.
+
+    :param query: a user-defined query string
+    :param title: a torrent name
+    :param seeders: the number of seeders
+    :param leechers: the number of leechers
+    :param freshness: the number of seconds since the torrent creation. A zero or negative value means the torrent
+                      creation date is unknown. It is more convenient to use than a timestamp, as it avoids
+                      a `time()` function call and simplifies testing.
+    :return: the torrent rank value in the range [0, 1]
+
+    Takes into account:
+      - the similarity of the title to the query string;
+      - the reported number of seeders;
+      - how long ago the torrent file was created.
+    """
+    tr = title_rank(query or '', title or '')
+    sr = (seeders_rank(seeders or 0, leechers or 0) + 9) / 10  # range [0.9, 1]
+    fr = (freshness_rank(freshness) + 9) / 10  # range [0.9, 1]
+    result = tr * sr * fr
+
+    # uncomment the next line to debug the function inside an SQL query:
+    # print(f'*** {result} : {seeders}/{freshness} ({freshness / SECONDS_IN_DAY} days)/{title} | {query}')
+
+    return result
+
+
+LEECHERS_COEFF = 0.1  # How much less important leechers are compared to seeders (ten times less important)
+SEEDERS_HALF_RANK = 100  # The number of seeders at which the seeders rank is 0.5
+
+
+def seeders_rank(seeders: int, leechers: int = 0) -> float:
+    """
+    Calculates a rank based on the torrent's number of seeders and leechers
+
+    :param seeders: the number of seeders for the torrent. It is a non-negative value, usually in the range [0, 1000]
+    :param leechers: the number of leechers for the torrent. It is a non-negative value, usually in the range [0, 1000]
+    :return: the torrent rank based on seeders and leechers, normalized to the range [0, 1]
+    """

+    # The leechers are treated as less capable seeders
+    sl = seeders + leechers * LEECHERS_COEFF  # Seeders and leechers combined
+
+    # The function result has the desired properties:
+    # * zero rank for zero seeders;
+    # * 0.5 rating for SEEDERS_HALF_RANK seeders;
+    # * 1.0 rating for an infinite number of seeders;
+    # * soft curve.
+    # It is possible to use different curves with a similar shape, for example:
+    # * 2 * arctan(x / SEEDERS_HALF_RANK) / PI,
+    # * 1 - exp(x * ln(0.5) / SEEDERS_HALF_RANK)
+    # but it does not actually matter in practice
+    return sl / (SEEDERS_HALF_RANK + sl)
+
+
+def freshness_rank(freshness: Optional[float] = 0) -> float:
+    """
+    Calculates a rank value based on the torrent freshness. The result is normalized to the range [0, 1]
+
+    :param freshness: the number of seconds since the torrent creation.
+                      Zero or negative values mean the actual torrent creation date is unknown.
+    :return: the torrent rank based on freshness, normalized to the range [0, 1]
+
+    Example results:
+      0 seconds since torrent creation -> the actual torrent creation date is unknown, freshness rank 0
+      1 second since torrent creation -> freshness rank 0.999
+      1 day since torrent creation -> freshness rank 0.967
+      30 days since torrent creation -> freshness rank 0.5
+      1 year since torrent creation -> freshness rank 0.0759
+    """
+    freshness = max(0, freshness or 0)
+    if not freshness:
+        return 0  # for freshness <= 0 the rank value is 0 because of an incorrect freshness value
+
+    # The function declines from 1.0 to 0.0 on the range (0, Infinity), with the following properties:
+    # * for just-created torrents the rank value is close to 1.0
+    # * for 30-day-old torrents the rank value is 0.5
+    # * for very old torrents the rank value goes to zero
+    # Other formulas with the same properties could be used (for example, exponent-based);
+    # the exact curve shape is not really important.
+    days = freshness / SECONDS_IN_DAY
+    return 1 / (1 + days / 30)
+
+
+word_re = re.compile(r'\w+', re.UNICODE)
+
+
+def title_rank(query: str, title: str) -> float:
+    """
+    Calculates the similarity of the title string to a query string as a float value in the range [0, 1]
+
+    :param query: a user-defined query string
+    :param title: a torrent name
+    :return: the similarity of the title string to the query string as a float value in the range [0, 1]
+    """
+    query = word_re.findall(query.lower())
+    title = word_re.findall(title.lower())
+    return calculate_rank(query, title)
+
+
+# These coefficients were found empirically. Their exact values are not very important for a relative ranking of results

+# The first word in a query is considered more important than the next one, and so on;
+# a POSITION_COEFF of 5 means the word five positions later is half as important as the first one
+POSITION_COEFF = 5
+
+# Some big value for a penalty if a query word is totally missed from a torrent title
+MISSED_WORD_PENALTY = 10
+
+# If a torrent title contains some words at the very end that are not mentioned in a query, we add a very slight
+# penalty for them. The *bigger* the REMAINDER_COEFF is, the *smaller* the penalty we add for these excess words
+REMAINDER_COEFF = 10
+
+# The exact value of this coefficient is not important. It is used to convert the total_error value to a rank value.
+# The total_error value is some positive number. We want the resulting rank to be in the range [0, 1].
+RANK_NORMALIZATION_COEFF = 10
+
+
+def calculate_rank(query: List[str], title: List[str]) -> float:
+    """
+    Calculates the similarity of the title to the query as a float value in the range [0, 1].
+
+    :param query: a list of query words
+    :param title: a list of title words
+    :return: the similarity of the title to the query as a float value in the range [0, 1]
+    """
+    if not query:
+        return 1.0
+
+    if not title:
+        return 0.0
+
+    title = deque(title)
+    total_error = 0
+    for i, word in enumerate(query):
+        # The first word is more important than the second word, and so on
+        word_weight = POSITION_COEFF / (POSITION_COEFF + i)
+
+        # Read the description of the `find_word_and_rotate_title` function to understand what is going on.
+        # Basically, we are trying to find each query word among the title words, calculate the penalty if the query
+        # word is not found or if there are some title words before it, and then rotate the skipped title words to
+        # the end of the title. This way, a title that has the query words in the proper order at its beginning gets
+        # the least penalty.
+        found, skipped = find_word_and_rotate_title(word, title)
+        if found:
+            # if the query word is found in the title, add a penalty for the skipped title words before it
+            total_error += skipped * word_weight
+        else:
+            # if the query word is not found in the title, add a big penalty for it
+            total_error += MISSED_WORD_PENALTY * word_weight
+
+    # a small penalty for excess words in the title that were not mentioned in the search phrase
+    remainder_weight = 1 / (REMAINDER_COEFF + len(query))
+    remained_words_error = len(title) * remainder_weight
+    total_error += remained_words_error
+
+    # the search rank should be between 0 and 1
+    return RANK_NORMALIZATION_COEFF / (RANK_NORMALIZATION_COEFF + total_error)
+
+
+def find_word_and_rotate_title(word: str, title: Deque[str]) -> Tuple[bool, int]:
+    """
+    Finds the query word in the title. Returns whether it was found and the number of skipped words in the title.
+
+    :param word: a word from the user-defined query string
+    :param title: a deque of words in the title
+    :return: a two-element tuple: whether the word was found in the title, and the number of skipped words
+
+    This is a helper function to efficiently answer the question of how close a query string and a title string are,
+    taking into account the ordering of words in both strings.
+
+    For efficiency reasons, the function modifies the `title` deque in place by removing the first occurrence
+    of the found word and rotating all leading non-matching words to the end of the deque. This allows multiple calls
+    of `find_word_and_rotate_title` for subsequent words from the same query string to be performed efficiently.
+
+    An example: find_word_and_rotate_title('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means
+    that the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies
+    the `title` deque, so that it becomes deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and
+    the leading non-matching words ('X', 'Y') were moved to the end of the deque.
+    """
+    try:
+        skipped = title.index(word)  # find the query word placement in the title and the number of preceding words
+    except ValueError:
+        return False, 0
+
+    title.rotate(-skipped)  # rotate skipped words to the end
+    title.popleft()  # remove the found word
+    return True, skipped
diff --git a/src/tribler/gui/qt_resources/search_results.ui b/src/tribler/gui/qt_resources/search_results.ui
index 7ef8639c575..c610b6c2063 100644
--- a/src/tribler/gui/qt_resources/search_results.ui
+++ b/src/tribler/gui/qt_resources/search_results.ui
@@ -110,7 +110,38 @@
-     <item/>
+     <item>
+      <widget class="SearchProgressBar" name="search_progress_bar">
+       <property name="sizePolicy">
+        <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
+         <horstretch>0</horstretch>
+         <verstretch>0</verstretch>
+        </sizepolicy>
+       </property>
+       <property name="maximumSize">
+        <size>
+         <width>16777215</width>
+         <height>10</height>
+        </size>
+       </property>
+       <property name="value">
+        <number>0</number>
+       </property>
+       <property name="alignment">
+        <set>Qt::AlignLeading|Qt::AlignLeft|Qt::AlignVCenter</set>
+       </property>
+      </widget>
+     </item>
@@ -129,6 +160,12 @@
    <header>tribler.gui.widgets.timeoutprogressbar.h</header>
    <container>1</container>
   </customwidget>
+  <customwidget>
+   <class>SearchProgressBar</class>
+   <extends>QProgressBar</extends>
+   <header>tribler.gui.widgets.search_progress_bar.h</header>
+   <container>1</container>
+  </customwidget>
 </customwidgets>
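Note on the widget registered above: `SearchProgressBar` (implemented below in `src/tribler/gui/widgets/search_progress_bar.py`) advances on two axes at once, elapsed time and the fraction of peers that responded. A small illustrative sketch of the combination used by its `_update()` method; the input numbers here are made up:

```python
MAX_VALUE = 10000  # same scale as in search_progress_bar.py


def progress_value(elapsed: float, timeout: float, peers_responded: int, peers_total: int) -> int:
    time_progress = elapsed / timeout
    response_progress = (peers_responded / peers_total) if peers_total else 0
    # The bar stays low only while BOTH factors are low; squaring softens the start of the curve.
    scale = 1 - ((1 - time_progress) * (1 - response_progress)) ** 2
    return int(scale * MAX_VALUE)


print(progress_value(5, 20, 4, 20))   # 6400 -- partial time plus partial responses
print(progress_value(20, 20, 0, 20))  # 10000 -- timeout alone completes the bar
```

Either a timeout or a sufficient share of responded peers (80% in the implementation, once active transfers settle) lets the bar finish, so a few silent peers cannot stall the search view.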
diff --git a/src/tribler/gui/tests/test_gui.py b/src/tribler/gui/tests/test_gui.py index ad057b8b787..622c622a64d 100644 --- a/src/tribler/gui/tests/test_gui.py +++ b/src/tribler/gui/tests/test_gui.py @@ -384,19 +384,17 @@ def test_search_suggestions(window): def test_search(window): window.top_search_bar.setText("a") # This is likely to trigger some search results QTest.keyClick(window.top_search_bar, Qt.Key_Enter) - wait_for_variable(window, "search_results_page.search_request") + QTest.qWait(100) screenshot(window, name="search_loading_page") - QTest.mouseClick(window.search_results_page.show_results_button, Qt.LeftButton) tst_channels_widget( window, - window.search_results_page.results_page, + window.search_results_page.results_page_content, "search_results", sort_column=2, test_filter=False, test_subscribe=False, ) - @pytest.mark.guitest def test_add_download_url(window): go_to_and_wait_for_downloads(window) diff --git a/src/tribler/gui/widgets/channelcontentswidget.py b/src/tribler/gui/widgets/channelcontentswidget.py index 34bebc5824a..2d150fc7413 100644 --- a/src/tribler/gui/widgets/channelcontentswidget.py +++ b/src/tribler/gui/widgets/channelcontentswidget.py @@ -1,7 +1,7 @@ from base64 import b64encode from PyQt5 import uic -from PyQt5.QtCore import QDir, QTimer, Qt +from PyQt5.QtCore import QDir, QTimer, Qt, pyqtSignal from PyQt5.QtGui import QIcon from PyQt5.QtWidgets import QAction, QFileDialog @@ -39,6 +39,9 @@ # pylint: disable=too-many-instance-attributes, too-many-public-methods class ChannelContentsWidget(AddBreadcrumbOnShowMixin, widget_form, widget_class): + + model_query_completed = pyqtSignal() + def __init__(self, parent=None): widget_class.__init__(self, parent=parent) @@ -110,6 +113,10 @@ def personal_channel_model(self): def model(self): return self.channels_stack[-1] if self.channels_stack else None + @property + def root_model(self): + return self.channels_stack[0] if self.channels_stack else None + def on_channel_committed(self, response): if not response or not response.get("success", False): return @@ -260,6 +267,9 @@ def on_model_info_changed(self, changed_entries): self.model.channel_info["dirty"] = dirty self.update_labels() + def on_model_query_completed(self): + self.model_query_completed.emit() + def initialize_root_model_from_channel_info(self, channel_info): if channel_info.get("state") == CHANNEL_STATE.PERSONAL.value: self.default_channel_model = self.personal_channel_model @@ -293,10 +303,13 @@ def reset_view(self, text_filter=None, category_filter=None): def disconnect_current_model(self): disconnect(self.window().core_manager.events_manager.node_info_updated, self.model.update_node_info) disconnect(self.model.info_changed, self.on_model_info_changed) + disconnect(self.model.query_complete, self.on_model_query_completed) + self.controller.unset_model() # Disconnect the selectionChanged signal def connect_current_model(self): connect(self.model.info_changed, self.on_model_info_changed) + connect(self.model.query_complete, self.on_model_query_completed) connect(self.window().core_manager.events_manager.node_info_updated, self.model.update_node_info) @property @@ -318,6 +331,10 @@ def on_breadcrumb_clicked(self, tgt_level): # Reset the view if the user clicks on the last part of the breadcrumb self.reset_view() + def format_search_title(self): + text = self.model.format_title() + self.channel_name_label.setText(text) + def _set_filter_controls_from_model(self): # This should typically be called under freeze_controls context manager content_category 
= ContentCategories.get(self.model.category_filter) diff --git a/src/tribler/gui/widgets/channelsmenulistwidget.py b/src/tribler/gui/widgets/channelsmenulistwidget.py index 910fee64050..5c62b6818ce 100644 --- a/src/tribler/gui/widgets/channelsmenulistwidget.py +++ b/src/tribler/gui/widgets/channelsmenulistwidget.py @@ -133,7 +133,7 @@ def on_query_results(self, response): self.items_set = frozenset(entry_to_tuple(channel_info) for channel_info in channels) - def load_channels(self, request=None): + def load_channels(self): TriblerNetworkRequest(self.base_url, self.on_query_results, url_params={"subscribed": True, "last": 1000}) def reload_if_necessary(self, changed_entries): diff --git a/src/tribler/gui/widgets/search_progress_bar.py b/src/tribler/gui/widgets/search_progress_bar.py new file mode 100644 index 00000000000..b6747cae675 --- /dev/null +++ b/src/tribler/gui/widgets/search_progress_bar.py @@ -0,0 +1,104 @@ +import time + +from PyQt5.QtCore import QTimer, pyqtSignal +from PyQt5.QtWidgets import QProgressBar + +from tribler.gui.utilities import connect + +MAX_VALUE = 10000 +UPDATE_DELAY = 0.5 +REMOTE_DELAY = 0.25 + + +class SearchProgressBar(QProgressBar): + ready_to_update_results = pyqtSignal() + + def __init__(self, parent=None, timeout=20): + super().__init__(parent) + self.timeout_interval = timeout + self.timer = QTimer() + self.timer.setSingleShot(False) + self.timer.setInterval(100) # update the progress bar tick + + self.start_time = None + self.last_update_time = None + self.last_remote_result_time = None + self.has_new_remote_results = False + self.peers_total = 0 + self.peers_responded = 0 + self.new_remote_items_count = 0 + self.total_remote_items_count = 0 + + self._value = 0 + self.setValue(0) + self.setMaximum(MAX_VALUE) + + connect(self.timer.timeout, self._update) + + def start(self): + t = time.time() + self.start_time = t + self.peers_total = 0 + self.peers_responded = 0 + self.setToolTip('') + self.setValue(0) + self.timer.start() + self.show() + + def _update(self): + if self.start_time is None: + return + + t = time.time() + + time_progress = (t - self.start_time) / self.timeout_interval + response_progress = (self.peers_responded / self.peers_total) if self.peers_total else 0 + scale = 1 - ((1 - time_progress) * (1 - response_progress)) ** 2 + value = int(scale * MAX_VALUE) + self.setValue(value) + + timeout = time_progress >= 1 + most_peers_responded = self.peers_total > 0 and self.peers_responded / self.peers_total >= 0.8 + active_transfers_finished = self.last_remote_result_time and t - self.last_remote_result_time > REMOTE_DELAY + + should_stop = timeout or (most_peers_responded and active_transfers_finished) + + if self.last_update_time is not None and self.has_new_remote_results \ + and (t - self.last_update_time > UPDATE_DELAY and active_transfers_finished or should_stop): + self.last_update_time = t + self.has_new_remote_results = False + self.new_remote_items_count = 0 + self.ready_to_update_results.emit() + + if should_stop: + self.stop() + + def stop(self): + self.start_time = None + self.timer.stop() + self.hide() + + def mousePressEvent(self, _): + self.stop() + + def on_local_results(self): + self.last_update_time = time.time() + self.has_new_remote_results = False + self._update() + + def set_remote_total(self, total: int): + self.peers_total = total + self.setToolTip(f'0/{total} remote responded') + self._update() + + def on_remote_results(self, new_items_count, peers_responded): + self.last_remote_result_time = time.time() + tool_tip = 
f'{peers_responded}/{self.peers_total} peers responded' + if self.total_remote_items_count: + tool_tip += f', {self.total_remote_items_count} new results' + self.setToolTip(tool_tip) + self.has_new_remote_results = True + self.new_remote_items_count += new_items_count + self.total_remote_items_count += new_items_count + self.peers_responded = peers_responded + self._update() diff --git a/src/tribler/gui/widgets/searchresultswidget.py b/src/tribler/gui/widgets/searchresultswidget.py index 9ce466c53f4..5eb972d0696 100644 --- a/src/tribler/gui/widgets/searchresultswidget.py +++ b/src/tribler/gui/widgets/searchresultswidget.py @@ -60,44 +60,18 @@ def __init__(self, parent=None): self.hide_xxx = None self.search_request = None + connect(self.results_page_content.model_query_completed, self.on_local_query_completed) + connect(self.search_progress_bar.ready_to_update_results, self.on_ready_to_update_results) + def initialize(self, hide_xxx=False): self.hide_xxx = hide_xxx - self.results_page.initialize_content_page(hide_xxx=hide_xxx) - self.results_page.channel_torrents_filter_input.setHidden(True) - connect(self.timeout_progress_bar.timeout, self.show_results) - connect(self.show_results_button.clicked, self.show_results) + self.results_page_content.initialize_content_page(hide_xxx=hide_xxx) + self.results_page_content.channel_torrents_filter_input.setHidden(True) @property def has_results(self): return self.last_search_query is not None - def show_results(self, *_): - if self.search_request is None: - # Fixes a race condition where the user clicks the show_results button before the search request - # has been registered by the Core - return - self.timeout_progress_bar.stop() - query = self.search_request.query - self.results_page.initialize_root_model( - SearchResultsModel( - channel_info={ - "name": (tr("Search results for %s") % query.original_query) - if len(query.original_query) < 50 - else f"{query.original_query[:50]}..." - }, - endpoint_url="search", - hide_xxx=self.results_page.hide_xxx, - text_filter=to_fts_query(query.fts_text), - tags=list(query.tags), - type_filter=[REGULAR_TORRENT], - ) - ) - self.setCurrentWidget(self.results_page) - - # After transitioning to the page with search results, we refresh the viewport since some rows might have been - # rendered already with an incorrect row height. - self.results_page.run_brain_dead_refresh() - def check_can_show(self, query): if ( self.last_search_query == query @@ -119,32 +93,54 @@ def search(self, query: Query) -> bool: self.last_search_query = query.original_query self.last_search_time = time.time() - # Trigger remote search + model = SearchResultsModel( + endpoint_url="search", + hide_xxx=self.results_page_content.hide_xxx, + original_query=query.original_query, + text_filter=to_fts_query(query.fts_text), + tags=list(query.tags), + type_filter=[REGULAR_TORRENT], + exclude_deleted=True, + ) + self.results_page_content.initialize_root_model(model) + self.setCurrentWidget(self.results_page) + self.results_page_content.format_search_title() + self.search_progress_bar.start() + + # After transitioning to the page with search results, we refresh the viewport since some rows might have been + # rendered already with an incorrect row height. 
+ self.results_page_content.run_brain_dead_refresh() + def register_request(response): - self._logger.info(f'Request registered: {response}') - self.search_request = SearchRequest(response["request_uuid"], query, set(response["peers"])) - self.state_label.setText(format_search_loading_label(self.search_request)) - self.timeout_progress_bar.start() - self.setCurrentWidget(self.loading_page) + peers = set(response["peers"]) + self.search_request = SearchRequest(response["request_uuid"], query, peers) + self.search_progress_bar.set_remote_total(len(peers)) - params = {'txt_filter': fts_query, 'hide_xxx': self.hide_xxx, 'tags': list(query.tags)} + params = {'txt_filter': fts_query, 'hide_xxx': self.hide_xxx, 'tags': list(query.tags), + 'metadata_type': REGULAR_TORRENT, 'exclude_deleted': True} TriblerNetworkRequest('remote_query', register_request, method="PUT", url_params=params) + return True + def on_local_query_completed(self): + self.search_progress_bar.on_local_results() + def reset(self): if self.currentWidget() == self.results_page: - self.results_page.go_back_to_level(0) + self.results_page_content.go_back_to_level(0) def update_loading_page(self, remote_results): - if ( - not self.search_request - or remote_results.get("uuid") != self.search_request.uuid - or self.currentWidget() == self.results_page - ): + if not self.search_request or self.search_request.uuid != remote_results.get("uuid"): return + peer = remote_results["peer"] + results = remote_results.get("results", []) + self.search_request.peers_complete.add(peer) - self.search_request.remote_results.append(remote_results.get("results", [])) - self.state_label.setText(format_search_loading_label(self.search_request)) - if self.search_request.complete: - self.show_results() + self.search_request.remote_results.append(results) + + new_items = self.results_page_content.model.add_remote_results(results) + self.search_progress_bar.on_remote_results(len(new_items), len(self.search_request.peers_complete)) + + def on_ready_to_update_results(self): + self.results_page_content.root_model.show_remote_results() diff --git a/src/tribler/gui/widgets/tablecontentdelegate.py b/src/tribler/gui/widgets/tablecontentdelegate.py index c9fd42ce1e7..47553016cd5 100644 --- a/src/tribler/gui/widgets/tablecontentdelegate.py +++ b/src/tribler/gui/widgets/tablecontentdelegate.py @@ -35,7 +35,7 @@ ) from tribler.gui.utilities import format_votes, get_color, get_gui_setting, get_health, get_image_path, tr, \ get_objects_with_predicate -from tribler.gui.widgets.tablecontentmodel import Column +from tribler.gui.widgets.tablecontentmodel import Column, RemoteTableModel from tribler.gui.widgets.tableiconbuttons import DownloadIconButton PROGRESS_BAR_BACKGROUND = QColor("#444444") @@ -268,18 +268,20 @@ def split_rect_into_squares(r, buttons): yield QRect(x, y, w, h), button def paint(self, painter, option, index): - # Draw 'hover' state highlight for every cell of a row - if index.row() == self.hover_index.row(): + model: RemoteTableModel = index.model() + data_item = model.data_items[index.row()] + if index.row() == self.hover_index.row() or model.should_highlight_item(data_item): + # Draw 'hover' state highlight for every cell of a row option.state |= QStyle.State_MouseOver - if not self.paint_exact(painter, option, index): + if not self.paint_exact(painter, option, index, data_item): # Draw the rest of the columns super().paint(painter, option, index) - def paint_exact(self, painter, option, index): - data_item = index.model().data_items[index.row()] + def 
paint_exact(self, painter, option, index, data_item): for column, drawing_action in self.column_drawing_actions: if column in index.model().column_position and index.column() == index.model().column_position[column]: return drawing_action(painter, option, index, data_item) + return False def editorEvent(self, event, model, option, index): for control in self.controls: @@ -372,9 +374,22 @@ class TagsMixin: edit_tags_icon = QIcon(get_image_path("edit_white.png")) edit_tags_icon_hover = QIcon(get_image_path("edit_orange.png")) - def draw_title_and_tags( - self, painter: QPainter, option: QStyleOptionViewItem, index: QModelIndex, data_item: Dict - ) -> None: + def draw_title_and_tags(self, painter: QPainter, option: QStyleOptionViewItem, index: QModelIndex, + data_item: Dict) -> None: + debug = False # change to True to see the search rank of items and to highlight remote items + item_name = data_item["name"] + + group = data_item.get("group") + if group: + has_remote_items = any(group_item.get('remote') for group_item in group.values()) + item_name += f" (+ {len(group)} similar{' *' if debug and has_remote_items else ''})" + + if debug: + rank = data_item.get("rank") + if rank is not None: + item_name += f' rank: {rank:.6}' + if data_item.get('remote'): + item_name = '* ' + item_name painter.setRenderHint(QPainter.Antialiasing, True) title_text_pos = option.rect.topLeft() title_text_height = 60 if data_item["type"] == SNIPPET else 28 @@ -391,7 +406,7 @@ def draw_title_and_tags( painter.drawText( QRectF(title_text_x, title_text_y, option.rect.width() - 6, title_text_height), Qt.AlignVCenter, - data_item["name"], + item_name, ) if data_item["type"] == SNIPPET: diff --git a/src/tribler/gui/widgets/tablecontentmodel.py b/src/tribler/gui/widgets/tablecontentmodel.py index 00df68041c4..e498e910725 100644 --- a/src/tribler/gui/widgets/tablecontentmodel.py +++ b/src/tribler/gui/widgets/tablecontentmodel.py @@ -1,15 +1,18 @@ import json import logging +import time import uuid +from collections import deque from dataclasses import dataclass, field from enum import Enum, auto from typing import Callable, Dict, List -from PyQt5.QtCore import QAbstractTableModel, QModelIndex, QRectF, QSize, Qt, pyqtSignal +from PyQt5.QtCore import QAbstractTableModel, QModelIndex, QRectF, QSize, QTimerEvent, Qt, pyqtSignal from tribler.core.components.metadata_store.db.orm_bindings.channel_node import NEW from tribler.core.components.metadata_store.db.serialization import CHANNEL_TORRENT, COLLECTION_NODE, REGULAR_TORRENT, \ SNIPPET +from tribler.core.utilities.search_utils import item_rank from tribler.core.utilities.simpledefs import CHANNELS_VIEW_UUID, CHANNEL_STATE from tribler.core.utilities.utilities import to_fts_query @@ -18,6 +21,8 @@ from tribler.gui.utilities import connect, format_size, format_votes, get_votes_rating_description, pretty_date, tr EXPANDING = 0 +HIGHLIGHTING_PERIOD_SECONDS = 1.0 +HIGHLIGHTING_TIMER_INTERVAL_MILLISECONDS = 100 class Column(Enum): @@ -98,7 +103,6 @@ def __init__(self, parent=None): self.columns_dict = define_columns() self.data_items = [] - self.remote_items = [] self.max_rowid = None self.local_total = None self.item_load_batch = 50 @@ -108,6 +112,14 @@ def __init__(self, parent=None): self.saved_scroll_state = None self.qt_object_destroyed = False + self.group_by_name = False + self.sort_by_rank = False + self.text_filter = '' + + self.highlight_remote_results = False + self.highlighted_items = deque() + self.highlight_timer = 
self.startTimer(HIGHLIGHTING_TIMER_INTERVAL_MILLISECONDS) + connect(self.destroyed, self.on_destroy) # Every remote query must be attributed to its specific model to avoid updating wrong models # on receiving a result. We achieve this by maintaining a set of in-flight remote queries. @@ -138,13 +150,31 @@ def reset(self): self.beginResetModel() self.loaded = False self.data_items = [] - self.remote_items = [] self.max_rowid = None self.local_total = None self.item_uid_map = {} self.endResetModel() self.perform_query() + def should_highlight_item(self, data_item): + return (self.highlight_remote_results and data_item.get('remote') + and data_item['item_added_at'] > time.time() - HIGHLIGHTING_PERIOD_SECONDS) + + def timerEvent(self, event: QTimerEvent) -> None: + if self.highlight_remote_results and event.timerId() == self.highlight_timer: + self.stop_highlighting_old_items() + + def stop_highlighting_old_items(self): + now = time.time() + then = now - HIGHLIGHTING_PERIOD_SECONDS + last_column_offset = len(self.columns_dict) - 1 + while self.highlighted_items and self.highlighted_items[0]['item_added_at'] < then: + item = self.highlighted_items.popleft() + uid = get_item_uid(item) + row = self.item_uid_map.get(uid) + if row is not None: + self.dataChanged.emit(self.index(row, 0), self.index(row, last_column_offset)) + def sort(self, column_index, order): if not self.columns[column_index].sortable: return @@ -168,28 +198,71 @@ def add_items(self, new_items, on_top=False, remote=False): if not new_items: return - if remote and not self.all_local_entries_loaded: - self.remote_items.extend(new_items) - return - # Note: If we want to block the signal like itemChanged, we must use QSignalBlocker object or blockSignals # Only add unique items to the table model and reverse mapping from unique ids to rows is built. 
         insert_index = 0 if on_top else len(self.data_items)
         unique_new_items = []
+        name_mapping = {item['name']: item for item in self.data_items} if self.group_by_name else {}
+        now = time.time()
         for item in new_items:
+            if remote:
+                item['remote'] = True
+                item['item_added_at'] = now
+                if self.highlight_remote_results:
+                    self.highlighted_items.append(item)
+            if self.sort_by_rank:
+                if 'rank' not in item:
+                    item['rank'] = item_rank(self.text_filter, item)
+
             item_uid = get_item_uid(item)
             if item_uid not in self.item_uid_map:
-                self.item_uid_map[item_uid] = insert_index
-                if 'infohash' in item:
-                    self.item_uid_map[item['infohash']] = insert_index
-                unique_new_items.append(item)
-                insert_index += 1
+
+                prev_item = name_mapping.get(item['name'])
+                if self.group_by_name and prev_item is not None and not on_top and prev_item['type'] == REGULAR_TORRENT:
+                    group = prev_item.setdefault('group', {})
+                    if item_uid not in group:
+                        group[item_uid] = item
+                else:
+                    self.item_uid_map[item_uid] = insert_index
+                    if 'infohash' in item:
+                        self.item_uid_map[item['infohash']] = insert_index
+                    unique_new_items.append(item)
+
+                    if self.group_by_name and item['type'] == REGULAR_TORRENT and prev_item is None:
+                        name_mapping[item['name']] = item
+
+                    insert_index += 1
 
         # If no new items are found, skip
         if not unique_new_items:
             return
 
+        if remote and self.sort_by_rank:
+            torrents = [item for item in self.data_items if item['type'] == REGULAR_TORRENT]
+            non_torrents = [item for item in self.data_items if item['type'] != REGULAR_TORRENT]
+
+            new_torrents = [item for item in unique_new_items if item['type'] == REGULAR_TORRENT]
+            new_non_torrents = [item for item in unique_new_items if item['type'] != REGULAR_TORRENT]
+
+            torrents += new_torrents
+            non_torrents += new_non_torrents
+
+            torrents.sort(key=lambda item: item['rank'], reverse=True)
+            new_data_items = non_torrents + torrents
+
+            new_item_uid_map = {}
+            for row, item in enumerate(new_data_items):
+                item_uid = get_item_uid(item)
+                new_item_uid_map[item_uid] = row
+                if 'infohash' in item:
+                    new_item_uid_map[item['infohash']] = row
+            self.beginResetModel()
+            self.data_items = new_data_items
+            self.item_uid_map = new_item_uid_map
+            self.endResetModel()
+            return
+
         # Else if remote items, to make space for new unique items shift the existing items
         if on_top and insert_index > 0:
             new_items_map = {}
@@ -211,11 +284,6 @@ def add_items(self, new_items, on_top=False, remote=False):
             self.data_items.extend(unique_new_items)
             self.endInsertRows()
 
-        if self.all_local_entries_loaded:
-            remote_items = self.remote_items
-            self.remote_items = []
-            self.add_items(remote_items, remote=True)  # to filter non-unique entries
-
     def remove_items(self, items):
         uids_to_remove = []
         rows_to_remove = []
@@ -258,6 +326,9 @@ def remove_items(self, items):
 
         self.info_changed.emit(items)
 
+    def perform_initial_query(self):
+        self.perform_query()
+
     def perform_query(self, **kwargs):
         """
         Fetch results for a given query.
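
The `remote and self.sort_by_rank` branch above replaces an incremental insert with a full model reset: freshly received remote torrents are merged into the already displayed ones, only the torrent rows are reordered by descending `rank` (channels and folders keep their positions at the top), and the uid-to-row map is rebuilt so that later row lookups, such as the highlight expiry in `stop_highlighting_old_items`, stay correct. A minimal standalone sketch of that merge, assuming plain dict items with a hypothetical 'uid' key and a stand-in value for REGULAR_TORRENT:

    # Sketch only: the item dicts, the 'uid' key and the REGULAR_TORRENT value
    # are illustrative stand-ins, not the real Tribler data structures.
    REGULAR_TORRENT = 300

    def merge_by_rank(displayed, incoming):
        combined = displayed + incoming
        # Non-torrent rows (channels, folders) keep their original order on top.
        non_torrents = [item for item in combined if item['type'] != REGULAR_TORRENT]
        # Torrent rows are reordered by descending search rank.
        torrents = sorted(
            (item for item in combined if item['type'] == REGULAR_TORRENT),
            key=lambda item: item['rank'],
            reverse=True,
        )
        rows = non_torrents + torrents
        # Rebuild the uid -> row index map in one pass over the merged list.
        uid_map = {item['uid']: row for row, item in enumerate(rows)}
        return rows, uid_map

    rows, uid_map = merge_by_rank(
        [{'uid': 'a', 'type': REGULAR_TORRENT, 'rank': 0.4}],
        [{'uid': 'b', 'type': REGULAR_TORRENT, 'rank': 0.9}],
    )
    assert [item['uid'] for item in rows] == ['b', 'a']
    assert uid_map == {'b': 0, 'a': 1}

Resetting the whole model is simpler than computing per-row moves for a QAbstractTableModel, at the cost of dropping the current selection; that trade-off is what the beginResetModel()/endResetModel() pair in the hunk above implies.
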
@@ -304,7 +375,7 @@ def on_query_results(self, response, remote=False, on_top=False): if not remote: if "total" in response: self.local_total = response["total"] - self.channel_info["total"] = self.local_total + len(self.remote_items) + self.channel_info["total"] = self.local_total elif self.channel_info.get("total"): self.channel_info["total"] += len(response["results"]) @@ -316,8 +387,8 @@ def on_query_results(self, response, remote=False, on_top=False): if update_labels: self.info_changed.emit(response['results']) - self.query_complete.emit() self.loaded = True + self.query_complete.emit() return True @@ -360,7 +431,7 @@ def __init__( self.endpoint_url_override = endpoint_url # Load the initial batch of entries - self.perform_query() + self.perform_initial_query() @property def edit_enabled(self): @@ -548,7 +619,59 @@ def perform_query(self, **kwargs): class SearchResultsModel(ChannelContentModel): - pass + def __init__(self, original_query, **kwargs): + self.original_query = original_query + self.remote_results = {} + title = self.format_title() + super().__init__(channel_info={"name": title}, **kwargs) + self.remote_results_received = False + self.postponed_remote_results = [] + self.highlight_remote_results = True + self.group_by_name = True + self.sort_by_rank = True + + def format_title(self): + q = self.original_query + q = q if len(q) < 50 else q[:50] + '...' + return f'Search results for {q}' + + def perform_initial_query(self): + return self.perform_query(first=1, last=200) + + def on_query_results(self, response, remote=False, on_top=False): + super().on_query_results(response, remote=remote, on_top=on_top) + self.add_remote_results([]) # to trigger adding postponed results + self.show_remote_results() + + @property + def all_local_entries_loaded(self): + return self.loaded + + def add_remote_results(self, results): + if not self.all_local_entries_loaded: + self.postponed_remote_results.extend(results) + return [] + + results = self.postponed_remote_results + results + self.postponed_remote_results = [] + new_items = [] + for item in results: + uid = get_item_uid(item) + if uid not in self.item_uid_map and uid not in self.remote_results: + self.remote_results_received = True + new_items.append(item) + self.remote_results[uid] = item + return new_items + + def show_remote_results(self): + if not self.all_local_entries_loaded: + return + + remote_items = list(self.remote_results.values()) + self.remote_results.clear() + self.remote_results_received = False + if remote_items: + self.add_items(remote_items, remote=True) class PopularTorrentsModel(ChannelContentModel):
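
The new `SearchResultsModel` implements a postpone-and-flush protocol for remote results: anything that arrives before the local query has completed is buffered in `postponed_remote_results`, every incoming item is deduplicated against both the visible table (`item_uid_map`) and the pending buffer (`remote_results`), and the buffered items only enter the view when `show_remote_results()` runs after the local entries are loaded. A condensed sketch of that protocol, assuming a hypothetical `RemoteResultsBuffer` class and a 'uid' key (the real code derives uids with `get_item_uid` and inserts rows via `add_items`):

    # Condensed sketch: the class name and the 'uid' key are illustrative.
    class RemoteResultsBuffer:
        def __init__(self):
            self.local_loaded = False  # set once the local query has returned
            self.displayed = {}        # uid -> item already shown in the table
            self.pending = {}          # uid -> item accepted, awaiting the flush
            self.postponed = []        # items that arrived before local results

        def add_remote_results(self, results):
            if not self.local_loaded:
                # Too early: remember the items, report nothing new yet.
                self.postponed.extend(results)
                return []
            results = self.postponed + results
            self.postponed = []
            new_items = []  # drives the "new results" progress-bar update
            for item in results:
                uid = item['uid']
                if uid not in self.displayed and uid not in self.pending:
                    self.pending[uid] = item
                    new_items.append(item)
            return new_items

        def show_remote_results(self):
            # Flush only after local results landed, so remote rows never
            # preempt the initial local listing.
            if not self.local_loaded:
                return
            self.displayed.update(self.pending)
            self.pending.clear()

    buffer = RemoteResultsBuffer()
    assert buffer.add_remote_results([{'uid': 'x'}]) == []  # postponed
    buffer.local_loaded = True
    assert buffer.add_remote_results([]) == [{'uid': 'x'}]  # postponed item surfaces
    buffer.show_remote_results()
    assert 'x' in buffer.displayed

This is also why `on_query_results()` above calls `add_remote_results([])` before `show_remote_results()`: the empty call is just a trigger that moves the postponed items into the pending set once the local batch is in.
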