Skip to content

Commit

Permalink
Refactor tag_rules
Browse files Browse the repository at this point in the history
  • Loading branch information
drew2a committed Jan 12, 2022
1 parent ca6b8e0 commit a9b415b
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 29 deletions.
23 changes: 12 additions & 11 deletions src/tribler-core/tribler_core/components/tag/rules/tag_rules.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
import re
from typing import Iterable, Set
from typing import AnyStr, Iterable, Optional, Pattern, Sequence

from tribler_core.components.tag.community.tag_payload import TagOperation, TagOperationEnum
from tribler_core.components.tag.community.tag_validator import is_valid_tag
from tribler_core.components.tag.db.tag_db import TagDatabase

delimiter = re.compile(r'([^\s.,/]+)')
square_brackets = re.compile(r'\[([^\[\]]+)]')
brackets = re.compile(r'\(([^()]+)\)')
delimiter = re.compile(r'([^\s.,/|]+)')

tags_in_square_brackets = [
re.compile(r'\[([^\[\]]+)]'), # extract content from square brackets
square_brackets, # extract content from square brackets
delimiter  # divide content on whitespace, ",", ".", "/" or "|"
]

tags_in_brackets = [
re.compile(r'\(([^()]+)\)'), # extract content from brackets
brackets, # extract content from brackets
delimiter  # divide content on whitespace, ",", ".", "/" or "|"
]

rules = [
RulesList = Sequence[Sequence[Pattern[AnyStr]]]
default_rules: RulesList = [
tags_in_square_brackets,
tags_in_brackets
]


def extract_tags(text: str) -> Iterable[str]:
def extract_tags(text: str, rules: Optional[RulesList] = None) -> Iterable[str]:
rules = rules or default_rules
for rule in rules:
text_set = {text}
for regex in rule:
Expand All @@ -35,7 +37,6 @@ def extract_tags(text: str) -> Iterable[str]:
yield from text_set


def extract_only_valid_tags(text: str) -> Iterable[str]:
extracted_tags_gen = (t.lower() for t in extract_tags(text))
def extract_only_valid_tags(text: str, rules: Optional[RulesList] = None) -> Iterable[str]:
extracted_tags_gen = (t.lower() for t in extract_tags(text, rules))
yield from (t for t in extracted_tags_gen if is_valid_tag(t))

Original file line number Diff line number Diff line change
@@ -1,27 +1,57 @@
from tribler_core.components.tag.rules.tag_rules import extract_only_valid_tags, extract_tags
from tribler_core.components.tag.rules.tag_rules import brackets, delimiter, extract_only_valid_tags, extract_tags, \
square_brackets, tags_in_brackets, tags_in_square_brackets


def test_extract_tags_from_brackets():
actual = set(extract_tags('[tag]'))
expected = {'tag'}
assert actual == expected
def test_delimiter():
# assert that delimiter splits words correctly
assert delimiter.findall('word1 word2 word3') == ['word1', 'word2', 'word3']
assert delimiter.findall('word1.word2.word3') == ['word1', 'word2', 'word3']
assert delimiter.findall('word1,word2,word3') == ['word1', 'word2', 'word3']
assert delimiter.findall('word1/word2/word3') == ['word1', 'word2', 'word3']
assert delimiter.findall('word1|word2|word3') == ['word1', 'word2', 'word3']

actual = set(extract_tags('[tag1, tag2]'))
expected = {'tag1', 'tag2'}
assert actual == expected
assert delimiter.findall('word1 /.,word2') == ['word1', 'word2']

actual = set(extract_tags('[tag1, tag2][tag3, tag4]'))
expected = {'tag1', 'tag2', 'tag3', 'tag4'}
assert actual == expected

actual = set(extract_tags('text [tag1] text [tag2] text'))
expected = {'tag1', 'tag2'}
assert actual == expected
def test_square_brackets():
# test that the square_brackets regex correctly extracts the content of square brackets
assert square_brackets.findall('[word1] [word2 word3]') == ['word1', 'word2 word3']
assert square_brackets.findall('[word1 [word2] word3]') == ['word2']

actual = set(extract_tags('[[tag1]] [tag2 [tag3] tag4] [tag5'))
expected = {'tag1', 'tag3'}
assert actual == expected

def test_brackets():
    """Check which round-bracket contents the ``brackets`` regex captures."""
    # sibling groups: each bracket pair yields its own content, spaces kept
    side_by_side = brackets.findall('(word1) (word2 word3)')
    assert side_by_side == ['word1', 'word2 word3']

    # nested groups: only the bracket pair without inner brackets is captured
    nested = brackets.findall('(word1 (word2) word3)')
    assert nested == ['word2']


def test_tags_in_square_brackets():
    """Run ``extract_tags`` with only the ``tags_in_square_brackets`` rule."""
    title = 'text [tag1, tag2] text1 [tag3|tag4] text2, (tag5, tag6)'

    found = set(extract_tags(title, rules=[tags_in_square_brackets]))

    # tag5 and tag6 sit in round brackets and are not expected from this rule
    assert found == {'tag1', 'tag2', 'tag3', 'tag4'}


def test_tags_in_brackets():
    """Run ``extract_tags`` with only the ``tags_in_brackets`` rule."""
    title = 'text (tag1, tag2) text1 (tag3|tag4) text2, [tag5, tag6]'

    found = set(extract_tags(title, rules=[tags_in_brackets]))

    # tag5 and tag6 sit in square brackets and are not expected from this rule
    assert found == {'tag1', 'tag2', 'tag3', 'tag4'}


def test_default_rules():
    """Run ``extract_tags`` without explicit rules, i.e. with the default ones."""
    title = 'text (tag1, tag2) text1 (tag3|tag4) text2, [tag5, tag6]'

    # no ``rules`` argument: tags from both bracket kinds are expected
    found = set(extract_tags(title))

    assert found == {'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'tag6'}


def test_extract_only_valid_tags():
assert set(extract_only_valid_tags('[valid-tag in va li d]')) == {'valid-tag'}
# test that extract_only_valid_tags extracts only valid tags
assert set(extract_only_valid_tags('[valid-tag, in va li d]')) == {'valid-tag'}

0 comments on commit a9b415b

Please sign in to comment.