From a9b415b1021b97fe213758111518e939c671cd84 Mon Sep 17 00:00:00 2001 From: drew2a Date: Wed, 12 Jan 2022 12:31:07 +0100 Subject: [PATCH] Refactor tag_rules --- .../components/tag/rules/tag_rules.py | 23 +++---- .../tag/rules/tests/test_general_rules.py | 66 ++++++++++++++----- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/src/tribler-core/tribler_core/components/tag/rules/tag_rules.py b/src/tribler-core/tribler_core/components/tag/rules/tag_rules.py index a83059a2d19..5bbc980fd85 100644 --- a/src/tribler-core/tribler_core/components/tag/rules/tag_rules.py +++ b/src/tribler-core/tribler_core/components/tag/rules/tag_rules.py @@ -1,29 +1,31 @@ import re -from typing import Iterable, Set +from typing import AnyStr, Iterable, Optional, Pattern, Sequence -from tribler_core.components.tag.community.tag_payload import TagOperation, TagOperationEnum from tribler_core.components.tag.community.tag_validator import is_valid_tag -from tribler_core.components.tag.db.tag_db import TagDatabase -delimiter = re.compile(r'([^\s.,/]+)') +square_brackets = re.compile(r'\[([^\[\]]+)]') +brackets = re.compile(r'\(([^()]+)\)') +delimiter = re.compile(r'([^\s.,/|]+)') tags_in_square_brackets = [ - re.compile(r'\[([^\[\]]+)]'), # extract content from square brackets + square_brackets, # extract content from square brackets delimiter # divide content by "," or "." or " " or "/" ] tags_in_brackets = [ - re.compile(r'\(([^()]+)\)'), # extract content from brackets + brackets, # extract content from brackets delimiter # divide content by "," or "." or " " or "/" ] -rules = [ +RulesList = Sequence[Sequence[Pattern[AnyStr]]] +default_rules: RulesList = [ tags_in_square_brackets, tags_in_brackets ] -def extract_tags(text: str) -> Iterable[str]: +def extract_tags(text: str, rules: Optional[RulesList] = None) -> Iterable[str]: + rules = rules or default_rules for rule in rules: text_set = {text} for regex in rule: @@ -35,7 +37,6 @@ def extract_tags(text: str) -> Iterable[str]: yield from text_set -def extract_only_valid_tags(text: str) -> Iterable[str]: - extracted_tags_gen = (t.lower() for t in extract_tags(text)) +def extract_only_valid_tags(text: str, rules: Optional[RulesList] = None) -> Iterable[str]: + extracted_tags_gen = (t.lower() for t in extract_tags(text, rules)) yield from (t for t in extracted_tags_gen if is_valid_tag(t)) - diff --git a/src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py b/src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py index c21aa493abe..bbc88e9f658 100644 --- a/src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py +++ b/src/tribler-core/tribler_core/components/tag/rules/tests/test_general_rules.py @@ -1,27 +1,57 @@ -from tribler_core.components.tag.rules.tag_rules import extract_only_valid_tags, extract_tags +from tribler_core.components.tag.rules.tag_rules import brackets, delimiter, extract_only_valid_tags, extract_tags, \ + square_brackets, tags_in_brackets, tags_in_square_brackets -def test_extract_tags_from_brackets(): - actual = set(extract_tags('[tag]')) - expected = {'tag'} - assert actual == expected +def test_delimiter(): + # assert that delimiter splits words correctly + assert delimiter.findall('word1 word2 word3') == ['word1', 'word2', 'word3'] + assert delimiter.findall('word1.word2.word3') == ['word1', 'word2', 'word3'] + assert delimiter.findall('word1,word2,word3') == ['word1', 'word2', 'word3'] + assert delimiter.findall('word1/word2/word3') == ['word1', 'word2', 'word3'] + assert delimiter.findall('word1|word2|word3') == ['word1', 'word2', 'word3'] - actual = set(extract_tags('[tag1, tag2]')) - expected = {'tag1', 'tag2'} - assert actual == expected + assert delimiter.findall('word1 /.,word2') == ['word1', 'word2'] - actual = set(extract_tags('[tag1, tag2][tag3, tag4]')) - expected = {'tag1', 'tag2', 'tag3', 'tag4'} - assert actual == expected - actual = set(extract_tags('text [tag1] text [tag2] text')) - expected = {'tag1', 'tag2'} - assert actual == expected +def test_square_brackets(): + # test that square_brackets regex correctly defines content in square brackets + assert square_brackets.findall('[word1] [word2 word3]') == ['word1', 'word2 word3'] + assert square_brackets.findall('[word1 [word2] word3]') == ['word2'] - actual = set(extract_tags('[[tag1]] [tag2 [tag3] tag4] [tag5')) - expected = {'tag1', 'tag3'} - assert actual == expected + +def test_brackets(): + # test that brackets regex correctly defines content in brackets + assert brackets.findall('(word1) (word2 word3)') == ['word1', 'word2 word3'] + assert brackets.findall('(word1 (word2) word3)') == ['word2'] + + +def test_tags_in_square_brackets(): + # test that tags_in_square_brackets rule works correctly with extract_tags function + text = 'text [tag1, tag2] text1 [tag3|tag4] text2, (tag5, tag6)' + expected_tags = {'tag1', 'tag2', 'tag3', 'tag4'} + + actual_tags = set(extract_tags(text, rules=[tags_in_square_brackets])) + assert actual_tags == expected_tags + + +def test_tags_in_brackets(): + # test that tags_in_brackets rule works correctly with extract_tags function + text = 'text (tag1, tag2) text1 (tag3|tag4) text2, [tag5, tag6]' + expected_tags = {'tag1', 'tag2', 'tag3', 'tag4'} + + actual_tags = set(extract_tags(text, rules=[tags_in_brackets])) + assert actual_tags == expected_tags + + +def test_default_rules(): + # test that default_rules works correctly with extract_tags function + text = 'text (tag1, tag2) text1 (tag3|tag4) text2, [tag5, tag6]' + expected_tags = {'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'tag6'} + + actual_tags = set(extract_tags(text)) + assert actual_tags == expected_tags def test_extract_only_valid_tags(): - assert set(extract_only_valid_tags('[valid-tag in va li d]')) == {'valid-tag'} + # test that extract_only_valid_tags extracts only valid tags + assert set(extract_only_valid_tags('[valid-tag, in va li d]')) == {'valid-tag'}