From a88db81e8a80ac11c3fb9e404316de181225e50b Mon Sep 17 00:00:00 2001 From: drew2a Date: Tue, 25 Jan 2022 12:35:55 +0100 Subject: [PATCH] Add Tag Rules Processor --- .../db/orm_bindings/torrent_metadata.py | 14 ++- .../components/metadata_store/db/store.py | 93 ++++++++------- .../metadata_store_component.py | 9 +- .../components/tag/rules/tag_rules.py | 63 ++++++++++ .../tag/rules/tag_rules_processor.py | 110 ++++++++++++++++++ .../tag/rules/tests/test_general_rules.py | 91 +++++++++++++++ .../rules/tests/test_tag_rules_processor.py | 110 ++++++++++++++++++ .../components/tag/tag_component.py | 15 ++- .../tag/tests/test_tag_component.py | 3 +- 9 files changed, 457 insertions(+), 51 deletions(-) create mode 100644 src/tribler-core/tribler_core/components/tag/rules/tag_rules.py create mode 100644 src/tribler-core/tribler_core/components/tag/rules/tag_rules_processor.py create mode 100644 src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py create mode 100644 src/tribler-core/tribler_core/components/tag/rules/tests/test_tag_rules_processor.py diff --git a/src/tribler-core/tribler_core/components/metadata_store/db/orm_bindings/torrent_metadata.py b/src/tribler-core/tribler_core/components/metadata_store/db/orm_bindings/torrent_metadata.py index 725ed5b4943..2ac9dec0be3 100644 --- a/src/tribler-core/tribler_core/components/metadata_store/db/orm_bindings/torrent_metadata.py +++ b/src/tribler-core/tribler_core/components/metadata_store/db/orm_bindings/torrent_metadata.py @@ -8,10 +8,12 @@ from tribler_core.components.metadata_store.category_filter.family_filter import default_xxx_filter from tribler_core.components.metadata_store.db.orm_bindings.channel_node import COMMITTED from tribler_core.components.metadata_store.db.serialization import EPOCH, REGULAR_TORRENT, TorrentMetadataPayload +from tribler_core.notifier import Notifier from tribler_core.utilities.tracker_utils import get_uniformed_tracker_url from tribler_core.utilities.unicode 
import ensure_unicode, hexlify NULL_KEY_SUBST = b"\00" +NEW_TORRENT_METADATA_CREATED: str = 'TorrentMetadata:new_torrent_metadata_created' # This function is used to devise id_ from infohash in deterministic way. Used in FFA channels. @@ -44,7 +46,7 @@ def tdef_to_metadata_dict(tdef): } -def define_binding(db): +def define_binding(db, notifier: Notifier, tag_version: int): class TorrentMetadata(db.MetadataNode): """ This ORM binding class is intended to store Torrent objects, i.e. infohashes along with some related metadata. @@ -61,12 +63,13 @@ class TorrentMetadata(db.MetadataNode): # Local xxx = orm.Optional(float, default=0) health = orm.Optional('TorrentState', reverse='metadata') + tag_version = orm.Required(int, default=0) # Special class-level properties _payload_class = TorrentMetadataPayload payload_arguments = _payload_class.__init__.__code__.co_varnames[ - : _payload_class.__init__.__code__.co_argcount - ][1:] + : _payload_class.__init__.__code__.co_argcount + ][1:] nonpersonal_attributes = db.MetadataNode.nonpersonal_attributes + ( 'infohash', 'size', @@ -86,6 +89,11 @@ def __init__(self, *args, **kwargs): if 'tracker_info' in kwargs: self.add_tracker(kwargs["tracker_info"]) + if notifier: + notifier.notify(NEW_TORRENT_METADATA_CREATED, + infohash=kwargs.get("infohash"), + title=self.title) + self.tag_version = tag_version def add_tracker(self, tracker_url): sanitized_url = get_uniformed_tracker_url(tracker_url) diff --git a/src/tribler-core/tribler_core/components/metadata_store/db/store.py b/src/tribler-core/tribler_core/components/metadata_store/db/store.py index e646133f75f..36cb2c6d8be 100644 --- a/src/tribler-core/tribler_core/components/metadata_store/db/store.py +++ b/src/tribler-core/tribler_core/components/metadata_store/db/store.py @@ -4,7 +4,7 @@ from asyncio import get_event_loop from datetime import datetime, timedelta from time import sleep, time -from typing import Union +from typing import Optional, Union from lz4.frame import 
LZ4FrameDecompressor @@ -49,11 +49,12 @@ from tribler_core.components.metadata_store.remote_query_community.payload_checker import process_payload from tribler_core.exceptions import InvalidSignatureException from tribler_core.utilities.path_util import Path +from tribler_core.utilities.pony_utils import get_or_create from tribler_core.utilities.unicode import hexlify from tribler_core.utilities.utilities import MEMORY_DB BETA_DB_VERSIONS = [0, 1, 2, 3, 4, 5] -CURRENT_DB_VERSION = 13 +CURRENT_DB_VERSION = 14 MIN_BATCH_SIZE = 10 MAX_BATCH_SIZE = 1000 @@ -61,7 +62,6 @@ POPULAR_TORRENTS_FRESHNESS_PERIOD = 60 * 60 * 24 # Last day POPULAR_TORRENTS_COUNT = 100 - # This table should never be used from ORM directly. # It is created as a VIRTUAL table by raw SQL and # maintained by SQL triggers. @@ -136,14 +136,15 @@ class MetadataStore: def __init__( - self, - db_filename: Union[Path, type(MEMORY_DB)], - channels_dir, - my_key, - disable_sync=False, - notifier=None, - check_tables=True, - db_version: int = CURRENT_DB_VERSION, + self, + db_filename: Union[Path, type(MEMORY_DB)], + channels_dir, + my_key, + disable_sync=False, + notifier=None, + check_tables=True, + db_version: int = CURRENT_DB_VERSION, + tag_version: int = 0 ): self.notifier = notifier # Reference to app-level notification service self.db_path = db_filename @@ -190,7 +191,11 @@ def sqlite_disable_sync(_, connection): self.MetadataNode = metadata_node.define_binding(self._db) self.CollectionNode = collection_node.define_binding(self._db) - self.TorrentMetadata = torrent_metadata.define_binding(self._db) + self.TorrentMetadata = torrent_metadata.define_binding( + self._db, + notifier=notifier, + tag_version=tag_version + ) self.ChannelMetadata = channel_metadata.define_binding(self._db) self.JsonNode = json_node.define_binding(self._db, db_version) @@ -242,6 +247,14 @@ def wrapper(): return await get_event_loop().run_in_executor(None, wrapper) + def set_value(self, key: str, value: str): + key_value = 
get_or_create(self.MiscData, name=key) + key_value.value = value + + def get_value(self, key: str, default: Optional[str] = None) -> Optional[str]: + data = self.MiscData.get(name=key) + return data.value if data else default + def drop_indexes(self): cursor = self._db.get_connection().cursor() cursor.execute("select name from sqlite_master where type='index' and name like 'idx_%'") @@ -391,9 +404,9 @@ def process_channel_dir(self, dirname, public_key, id_, **kwargs): if not channel: return if ( - blob_sequence_number <= channel.start_timestamp - or blob_sequence_number <= channel.local_version - or blob_sequence_number > channel.timestamp + blob_sequence_number <= channel.start_timestamp + or blob_sequence_number <= channel.local_version + or blob_sequence_number > channel.timestamp ): continue try: @@ -595,28 +608,28 @@ def search_keyword(self, query, lim=100): @db_session def get_entries_query( - self, - metadata_type=None, - channel_pk=None, - exclude_deleted=False, - hide_xxx=False, - exclude_legacy=False, - origin_id=None, - sort_by=None, - sort_desc=True, - max_rowid=None, - txt_filter=None, - subscribed=None, - category=None, - attribute_ranges=None, - infohash=None, - infohash_set=None, - id_=None, - complete_channel=None, - self_checked_torrent=None, - cls=None, - health_checked_after=None, - popular=None, + self, + metadata_type=None, + channel_pk=None, + exclude_deleted=False, + hide_xxx=False, + exclude_legacy=False, + origin_id=None, + sort_by=None, + sort_desc=True, + max_rowid=None, + txt_filter=None, + subscribed=None, + category=None, + attribute_ranges=None, + infohash=None, + infohash_set=None, + id_=None, + complete_channel=None, + self_checked_torrent=None, + cls=None, + health_checked_after=None, + popular=None, ): """ This method implements REST-friendly way to get entries from the database. 
@@ -662,8 +675,8 @@ def get_entries_query( if attribute_ranges is not None: for attr, left, right in attribute_ranges: if ( - self.ChannelNode._adict_.get(attr) # pylint: disable=W0212 - or self.ChannelNode._subclass_adict_.get(attr) # pylint: disable=W0212 + self.ChannelNode._adict_.get(attr) # pylint: disable=W0212 + or self.ChannelNode._subclass_adict_.get(attr) # pylint: disable=W0212 ) is None: # Check against code injection raise AttributeError("Tried to query for non-existent attribute") if left is not None: @@ -737,7 +750,7 @@ def get_entries(self, first=1, last=None, **kwargs): :return: A list of class members """ pony_query = self.get_entries_query(**kwargs) - result = pony_query[(first or 1) - 1 : last] + result = pony_query[(first or 1) - 1: last] for entry in result: # ACHTUNG! This is necessary in order to load entry.health inside db_session, # to be able to perform successfully `entry.to_simple_dict()` later diff --git a/src/tribler-core/tribler_core/components/metadata_store/metadata_store_component.py b/src/tribler-core/tribler_core/components/metadata_store/metadata_store_component.py index 493cba017df..965d7afc663 100644 --- a/src/tribler-core/tribler_core/components/metadata_store/metadata_store_component.py +++ b/src/tribler-core/tribler_core/components/metadata_store/metadata_store_component.py @@ -3,8 +3,7 @@ from tribler_core.components.base import Component from tribler_core.components.key.key_component import KeyComponent from tribler_core.components.metadata_store.db.store import MetadataStore -from tribler_core.components.metadata_store.utils import generate_test_channels -from tribler_core.components.tag.tag_component import TagComponent +from tribler_core.components.tag.rules.tag_rules_processor import TagRulesProcessor class MetadataStoreComponent(Component): @@ -42,13 +41,11 @@ async def run(self): key_component.primary_key, notifier=self.session.notifier, disable_sync=config.gui_test_mode, + tag_version=TagRulesProcessor.version ) 
self.mds = metadata_store - self.session.notifier.add_observer(NTFY.TORRENT_METADATA_ADDED, + self.session.notifier.add_observer(NTFY.TORRENT_METADATA_ADDED.value, metadata_store.TorrentMetadata.add_ffa_from_dict) - if config.gui_test_mode: - tag_component = await self.require_component(TagComponent) - generate_test_channels(metadata_store, tag_component.tags_db) async def shutdown(self): await super().shutdown() diff --git a/src/tribler-core/tribler_core/components/tag/rules/tag_rules.py b/src/tribler-core/tribler_core/components/tag/rules/tag_rules.py new file mode 100644 index 00000000000..1fe007cd463 --- /dev/null +++ b/src/tribler-core/tribler_core/components/tag/rules/tag_rules.py @@ -0,0 +1,63 @@ +import re +from typing import AnyStr, Iterable, Optional, Pattern, Sequence + +from tribler_core.components.tag.community.tag_validator import is_valid_tag + +# Each regex expression should contain just a single capturing group: +square_brackets_re = re.compile(r'\[([^\[\]]+)]') +parentheses_re = re.compile(r'\(([^()]+)\)') +extension_re = re.compile(r'\.(\w{3,4})$') +delimiter_re = re.compile(r'([^\s.,/|]+)') + +tags_in_square_brackets = [ + square_brackets_re, # extract content from square brackets + delimiter_re # divide content by "," or "." or " " or "/" or "|" +] + +tags_in_parentheses = [ + parentheses_re, # extract content from parentheses + delimiter_re # divide content by "," or "." or " " or "/" or "|" +] + +tags_in_extension = [ + extension_re # extract an extension +] + +RulesList = Sequence[Sequence[Pattern[AnyStr]]] +default_rules: RulesList = [ + tags_in_square_brackets, + tags_in_parentheses, + tags_in_extension +] + + +def extract_tags(text: str, rules: Optional[RulesList] = None) -> Iterable[str]: + """ Extract tags by using the given rules. + + Rules are represented by an array of an array of regexes. + Each rule contains one or more regex expressions. + + During the `text` processing, each rule will be applied to the `text` value.
+ All extracted tags will be returned. + + During application of a particular rule, `text` will be split into + tokens by application of the first regex expression. Then, the second regex + expression will be applied to each token that was extracted in the + previous step. + This process will be repeated until the rule's regex expressions are exhausted. + """ + rules = rules or default_rules + for rule in rules: + text_set = {text} + for regex in rule: + next_text_set = set() + for token in text_set: + for match in regex.finditer(token): + next_text_set |= set(match.groups()) + text_set = next_text_set + yield from text_set + + +def extract_only_valid_tags(text: str, rules: Optional[RulesList] = None) -> Iterable[str]: + extracted_tags_gen = (t.lower() for t in extract_tags(text, rules)) + yield from (t for t in extracted_tags_gen if is_valid_tag(t)) diff --git a/src/tribler-core/tribler_core/components/tag/rules/tag_rules_processor.py b/src/tribler-core/tribler_core/components/tag/rules/tag_rules_processor.py new file mode 100644 index 00000000000..13085e67877 --- /dev/null +++ b/src/tribler-core/tribler_core/components/tag/rules/tag_rules_processor.py @@ -0,0 +1,110 @@ +import logging +from typing import Optional, Set + +from ipv8.taskmanager import TaskManager + +from pony.orm import db_session + +import tribler_core.components.metadata_store.db.orm_bindings.torrent_metadata as torrent_metadata +import tribler_core.components.metadata_store.db.store as MDS +from tribler_core.components.metadata_store.db.serialization import REGULAR_TORRENT +from tribler_core.components.tag.community.tag_payload import TagOperation, TagOperationEnum +from tribler_core.components.tag.db.tag_db import ( + CLOCK_FOR_AUTOGENERATED_TAGS, + PUBLIC_KEY_FOR_AUTO_GENERATED_TAGS, + TagDatabase, +) +from tribler_core.components.tag.rules.tag_rules import extract_only_valid_tags +from tribler_core.notifier import Notifier + +LAST_PROCESSED_TORRENT_ID = 'last_processed_torrent_id' + + +class
TagRulesProcessor(TaskManager): + # this value must be incremented whenever a new set of rules is applied + version: int = 1 + + def __init__(self, notifier: Notifier, db: TagDatabase, mds: MDS.MetadataStore, + batch_size: int = 1000, interval: float = 10): + """ + Default values for batch_size and interval are chosen so that tag processing is not too heavy + for the CPU; with these values, 360k items will be processed within an hour. + """ + super().__init__() + self.logger = logging.getLogger(self.__class__.__name__) + + self.notifier = notifier + self.db = db + self.mds = mds + self.batch_size = batch_size + self.interval = interval + self.notifier.add_observer(torrent_metadata.NEW_TORRENT_METADATA_CREATED, + callback=self.process_torrent_title) + self.register_task(name=self.process_batch.__name__, + interval=interval, + task=self.process_batch) + + @db_session + def process_batch(self) -> int: + def query(_start, _end): + return lambda t: _start < t.rowid and t.rowid <= _end and \ + t.metadata_type == REGULAR_TORRENT and \ + t.tag_version < self.version + + start = int(self.mds.get_value(LAST_PROCESSED_TORRENT_ID, default='0')) + end = start + self.batch_size + self.logger.info(f'Processing batch [{start}...{end}]') + + batch = self.mds.TorrentMetadata.select(query(start, end)) + processed = 0 + added = 0 + for torrent in batch: + added += self.process_torrent_title(torrent.infohash, torrent.title) + torrent.tag_version = self.version + processed += 1 + + self.logger.info(f'Processed: {processed} titles.
Added {added} tags.') + max_row_id = self.mds.get_max_rowid() + + is_beyond_the_boundary = end > max_row_id + if is_beyond_the_boundary: + self._schedule_new_process_batch_round() + else: + self.mds.set_value(LAST_PROCESSED_TORRENT_ID, str(end)) + return processed + + def process_torrent_title(self, infohash: Optional[bytes] = None, title: Optional[str] = None) -> int: + if not infohash or not title: + return 0 + tags = set(extract_only_valid_tags(title)) + if tags: + self.save_tags(infohash, tags) + return len(tags) + + @db_session + def save_tags(self, infohash: bytes, tags: Set[str]): + self.logger.debug(f'Save: {len(tags)} tags') + for tag in tags: + operation = TagOperation( + infohash=infohash, + operation=TagOperationEnum.ADD, + clock=CLOCK_FOR_AUTOGENERATED_TAGS, + creator_public_key=PUBLIC_KEY_FOR_AUTO_GENERATED_TAGS, + tag=tag + ) + # we want auto generated operation to act like a normal operation + # therefore we use 2 as a `counter_increment` to immediately pass + # SHOW_THRESHOLD + self.db.add_auto_generated_tag_operation(operation=operation) + + def _schedule_new_process_batch_round(self): + self.logger.info('All items in TorrentMetadata have been processed.') + self.mds.set_value(LAST_PROCESSED_TORRENT_ID, '0') + self.logger.info('Set last_processed_torrent_id to 0') + self.interval *= 2 + self.logger.info(f'Double the interval. New interval: {self.interval}') + self.batch_size *= 2 + self.logger.info(f'Double the batch size. 
New batch size: {self.batch_size}') + self.replace_task(self.process_batch.__name__, + interval=self.interval, + task=self.process_batch) diff --git a/src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py b/src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py new file mode 100644 index 00000000000..fa6b93e149c --- /dev/null +++ b/src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py @@ -0,0 +1,91 @@ +import pytest + +from tribler_core.components.tag.rules.tag_rules import ( + delimiter_re, + extension_re, + extract_only_valid_tags, + extract_tags, + parentheses_re, + square_brackets_re, + tags_in_parentheses, + tags_in_square_brackets, +) + +DELIMITERS = [ + ('word1 word2 word3', ['word1', 'word2', 'word3']), + ('word1,word2,word3', ['word1', 'word2', 'word3']), + ('word1/word2/word3', ['word1', 'word2', 'word3']), + ('word1|word2|word3', ['word1', 'word2', 'word3']), + ('word1 /.,word2', ['word1', 'word2']), +] + +SQUARE_BRACKETS = [ + ('[word1] [word2 word3]', ['word1', 'word2 word3']), + ('[word1 [word2] word3]', ['word2']), +] + +PARENTHESES = [ + ('(word1) (word2 word3)', ['word1', 'word2 word3']), + ('(word1 (word2) word3)', ['word2']), +] + +EXTENSIONS = [ + ('some.ext', ['ext']), + ('some.ext4', ['ext4']), + ('some', []), + ('some. 
ext', []), + ('some.ext ', []), +] + + +@pytest.mark.parametrize('text, words', DELIMITERS) +def test_delimiter(text, words): + assert delimiter_re.findall(text) == words + + +@pytest.mark.parametrize('text, words', SQUARE_BRACKETS) +def test_square_brackets(text, words): + assert square_brackets_re.findall(text) == words + + +@pytest.mark.parametrize('text, words', PARENTHESES) +def test_parentheses(text, words): + assert parentheses_re.findall(text) == words + + +@pytest.mark.parametrize('text, words', EXTENSIONS) +def test_extension(text, words): + # test regex + assert extension_re.findall(text) == words + + +def test_tags_in_square_brackets(): + # test that tags_in_square_brackets rule works correctly with extract_tags function + text = 'text [tag1, tag2] text1 [tag3|tag4] text2, (tag5, tag6)' + expected_tags = {'tag1', 'tag2', 'tag3', 'tag4'} + + actual_tags = set(extract_tags(text, rules=[tags_in_square_brackets])) + assert actual_tags == expected_tags + + +def test_tags_in_parentheses(): + # test that tags_in_parentheses rule works correctly with extract_tags function + text = 'text (tag1, tag2) text1 (tag3|tag4) text2, [tag5, tag6]' + expected_tags = {'tag1', 'tag2', 'tag3', 'tag4'} + + actual_tags = set(extract_tags(text, rules=[tags_in_parentheses])) + assert actual_tags == expected_tags + + +def test_default_rules(): + # test that default_rules works correctly with extract_tags function + text = 'text (tag1, tag2) text1 (tag3|tag4) text2, [tag5, tag6].ext' + expected_tags = {'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'tag6', 'ext'} + + actual_tags = set(extract_tags(text)) + assert actual_tags == expected_tags + + +def test_extract_only_valid_tags(): + # test that extract_only_valid_tags extracts only valid tags + assert set(extract_only_valid_tags('[valid-tag, in va li d]')) == {'valid-tag'} diff --git a/src/tribler-core/tribler_core/components/tag/rules/tests/test_tag_rules_processor.py 
b/src/tribler-core/tribler_core/components/tag/rules/tests/test_tag_rules_processor.py new file mode 100644 index 00000000000..d1580381a43 --- /dev/null +++ b/src/tribler-core/tribler_core/components/tag/rules/tests/test_tag_rules_processor.py @@ -0,0 +1,110 @@ +from types import SimpleNamespace +from unittest.mock import Mock, patch + +import pytest + +from tribler_core.components.metadata_store.db.orm_bindings.torrent_metadata import NEW_TORRENT_METADATA_CREATED +from tribler_core.components.tag.community.tag_payload import TagOperation, TagOperationEnum +from tribler_core.components.tag.db.tag_db import CLOCK_FOR_AUTOGENERATED_TAGS, PUBLIC_KEY_FOR_AUTO_GENERATED_TAGS +from tribler_core.components.tag.rules.tag_rules_processor import LAST_PROCESSED_TORRENT_ID, TagRulesProcessor + +TEST_BATCH_SIZE = 100 +TEST_INTERVAL = 0.1 + +# pylint: disable=redefined-outer-name, protected-access +@pytest.fixture +def tag_rules_processor(): + return TagRulesProcessor(notifier=Mock(), db=Mock(), mds=Mock(), batch_size=TEST_BATCH_SIZE, interval=TEST_INTERVAL) + + +def test_constructor(tag_rules_processor: TagRulesProcessor): + # test that constructor of TagRulesProcessor works as expected + assert tag_rules_processor.batch_size == TEST_BATCH_SIZE + assert tag_rules_processor.interval == TEST_INTERVAL + + m: Mock = tag_rules_processor.notifier.add_observer + m.assert_called_with(NEW_TORRENT_METADATA_CREATED, callback=tag_rules_processor.process_torrent_title) + + +@patch.object(TagRulesProcessor, 'save_tags') +def test_process_torrent_file(mocked_save_tags: Mock, tag_rules_processor: TagRulesProcessor): + # test on None + assert not tag_rules_processor.process_torrent_title(infohash=None, title='title') + assert not tag_rules_processor.process_torrent_title(infohash=b'infohash', title=None) + + # test that process_torrent_title doesn't find any tags in the title + assert not tag_rules_processor.process_torrent_title(infohash=b'infohash', title='title') + 
mocked_save_tags.assert_not_called() + + # test that process_torrent_title does find tags in the title + assert tag_rules_processor.process_torrent_title(infohash=b'infohash', title='title [tag]') == 1 + mocked_save_tags.assert_called_with(b'infohash', {'tag'}) + + +def test_save_tags(tag_rules_processor: TagRulesProcessor): + # test that tag_rules_processor calls TagDatabase with correct args + expected_calls = [{'operation': TagOperation(infohash=b'infohash', operation=TagOperationEnum.ADD, + clock=CLOCK_FOR_AUTOGENERATED_TAGS, + creator_public_key=PUBLIC_KEY_FOR_AUTO_GENERATED_TAGS, + tag='tag1')}, + {'operation': TagOperation(infohash=b'infohash', operation=TagOperationEnum.ADD, + clock=CLOCK_FOR_AUTOGENERATED_TAGS, + creator_public_key=PUBLIC_KEY_FOR_AUTO_GENERATED_TAGS, + tag='tag2')}] + + tag_rules_processor.save_tags(infohash=b'infohash', tags={'tag1', 'tag2'}) + actual_calls = [c.kwargs for c in tag_rules_processor.db.add_auto_generated_tag_operation.mock_calls] + + # compare two lists of dict + assert [c for c in actual_calls if c not in expected_calls] == [] + + +@patch.object(TagRulesProcessor, 'replace_task') +def test_schedule_new_process_batch_round(mocked_replace_task: Mock, tag_rules_processor: TagRulesProcessor): + tag_rules_processor._schedule_new_process_batch_round() + assert tag_rules_processor.interval == TEST_INTERVAL * 2 + assert tag_rules_processor.batch_size == TEST_BATCH_SIZE * 2 + tag_rules_processor.mds.set_value.assert_called_with(LAST_PROCESSED_TORRENT_ID, '0') + mocked_replace_task.assert_called_once() + + +@patch.object(TagRulesProcessor, 'process_torrent_title', new=Mock(return_value=1)) +def test_process_batch_within_the_boundary(tag_rules_processor: TagRulesProcessor): + # test inner logic of `process_batch` in case this batch located within the boundary + returned_batch_size = TEST_BATCH_SIZE // 2 # let's return a half of requested items + + def select(_): + return [SimpleNamespace(infohash=i, title=i) for i in 
range(returned_batch_size)] + + tag_rules_processor.mds.TorrentMetadata.select = select + tag_rules_processor.mds.get_value = lambda *_, **__: 0 # let's start from 0 for LAST_PROCESSED_TORRENT_ID + + # let's specify `max_rowid` in such a way that it is far more than end of the current batch + tag_rules_processor.mds.get_max_rowid = lambda: TEST_BATCH_SIZE * 10 + + # assert that actually returned count of processed items is equal to `returned_batch_size` + assert tag_rules_processor.process_batch() == returned_batch_size + + # assert that actually stored last_processed_torrent_id is equal to `TEST_BATCH_SIZE` + tag_rules_processor.mds.set_value.assert_called_with(LAST_PROCESSED_TORRENT_ID, str(TEST_BATCH_SIZE)) + + +@patch.object(TagRulesProcessor, '_schedule_new_process_batch_round') +@patch.object(TagRulesProcessor, 'process_torrent_title', new=Mock(return_value=1)) +def test_process_batch_beyond_the_boundary(mocked_schedule_new_process_batch_round: Mock, + tag_rules_processor: TagRulesProcessor): + # test inner logic of `process_batch` in case this batch is located beyond the boundary + returned_batch_size = TEST_BATCH_SIZE // 2 # let's return a half of requested items + + def select(_): + return [SimpleNamespace(infohash=i, title=i) for i in range(returned_batch_size)] + + tag_rules_processor.mds.get_value = lambda *_, **__: 0 # let's start from 0 for LAST_PROCESSED_TORRENT_ID + tag_rules_processor.mds.TorrentMetadata.select = select + + # let's specify `max_rowid` in such a way that it is less than end of the current batch + tag_rules_processor.mds.get_max_rowid = lambda: returned_batch_size // 2 + + # assert that actually returned count of processed items is equal to `returned_batch_size` + assert tag_rules_processor.process_batch() == returned_batch_size + mocked_schedule_new_process_batch_round.assert_called_once() diff --git a/src/tribler-core/tribler_core/components/tag/tag_component.py b/src/tribler-core/tribler_core/components/tag/tag_component.py index
44eadb1d949..cdd1a2940f8 100644 --- a/src/tribler-core/tribler_core/components/tag/tag_component.py +++ b/src/tribler-core/tribler_core/components/tag/tag_component.py @@ -1,10 +1,13 @@ from tribler_common.simpledefs import STATEDIR_DB_DIR +import tribler_core.components.metadata_store.metadata_store_component as metadata_store_component from tribler_core.components.base import Component from tribler_core.components.ipv8.ipv8_component import Ipv8Component from tribler_core.components.key.key_component import KeyComponent +from tribler_core.components.metadata_store.utils import generate_test_channels from tribler_core.components.tag.community.tag_community import TagCommunity from tribler_core.components.tag.db.tag_db import TagDatabase +from tribler_core.components.tag.rules.tag_rules_processor import TagRulesProcessor class TagComponent(Component): @@ -12,6 +15,7 @@ class TagComponent(Component): community: TagCommunity = None tags_db: TagDatabase = None + rules_processor: TagRulesProcessor = None _ipv8_component: Ipv8Component = None async def run(self): @@ -19,12 +23,13 @@ async def run(self): self._ipv8_component = await self.require_component(Ipv8Component) key_component = await self.require_component(KeyComponent) + mds_component = await self.require_component(metadata_store_component.MetadataStoreComponent) db_path = self.session.config.state_dir / STATEDIR_DB_DIR / "tags.db" if self.session.config.gui_test_mode: db_path = ":memory:" - self.tags_db = TagDatabase(str(db_path)) + self.tags_db = TagDatabase(str(db_path), create_tables=True) self.community = TagCommunity( self._ipv8_component.peer, self._ipv8_component.ipv8.endpoint, @@ -32,9 +37,17 @@ async def run(self): db=self.tags_db, tags_key=key_component.secondary_key ) + self.rules_processor = TagRulesProcessor( + notifier=self.session.notifier, + db=self.tags_db, + mds=mds_component.mds, + ) self._ipv8_component.initialise_community_by_default(self.community) + if self.session.config.gui_test_mode: + 
generate_test_channels(mds_component.mds, self.tags_db) + async def shutdown(self): await super().shutdown() if self._ipv8_component and self.community: diff --git a/src/tribler-core/tribler_core/components/tag/tests/test_tag_component.py b/src/tribler-core/tribler_core/components/tag/tests/test_tag_component.py index 16cb000d1aa..7e341714809 100644 --- a/src/tribler-core/tribler_core/components/tag/tests/test_tag_component.py +++ b/src/tribler-core/tribler_core/components/tag/tests/test_tag_component.py @@ -3,6 +3,7 @@ from tribler_core.components.base import Session from tribler_core.components.ipv8.ipv8_component import Ipv8Component from tribler_core.components.key.key_component import KeyComponent +from tribler_core.components.metadata_store.metadata_store_component import MetadataStoreComponent from tribler_core.components.tag.tag_component import TagComponent # pylint: disable=protected-access @@ -10,7 +11,7 @@ @pytest.mark.asyncio async def test_tag_component(tribler_config): - components = [KeyComponent(), Ipv8Component(), TagComponent()] + components = [MetadataStoreComponent(), KeyComponent(), Ipv8Component(), TagComponent()] async with Session(tribler_config, components).start(): comp = TagComponent.instance() assert comp.started_event.is_set() and not comp.failed