diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py index ba9418fbb9d..4ec6d16859c 100644 --- a/src/tribler/core/tests/test_search_utils.py +++ b/src/tribler/core/tests/test_search_utils.py @@ -2,7 +2,7 @@ import pytest -from tribler.core.utilities.search_utils import filter_keywords, find_word, freshness_rank, item_rank, seeders_rank, \ +from tribler.core.utilities.search_utils import filter_keywords, find_word_and_rotate_title, freshness_rank, item_rank, seeders_rank, \ split_into_keywords, torrent_rank, title_rank @@ -153,51 +153,51 @@ def test_item_rank(): def test_find_word(): - # To use the find_word function, you can call it one time for each word from the query and see: + # To use the find_word_and_rotate_title function, you can call it one time for each word from the query and see: # - how many query words are missed in the title; # - how many excess or out-of-place title words are found before each query word; # - and how many title words are not mentioned in the query. # Example 1, query "A B C", title "A B C" title = deque(["A", "B", "C"]) - assert find_word("A", title) == (True, 0) and title == deque(["B", "C"]) - assert find_word("B", title) == (True, 0) and title == deque(["C"]) - assert find_word("C", title) == (True, 0) and title == deque([]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([]) # Conclusion: exact match. 
# Example 2, query "A B C", title "A B C D" title = deque(["A", "B", "C", "D"]) - assert find_word("A", title) == (True, 0) and title == deque(["B", "C", "D"]) - assert find_word("B", title) == (True, 0) and title == deque(["C", "D"]) - assert find_word("C", title) == (True, 0) and title == deque(["D"]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C", "D"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "D"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["D"]) # Conclusion: minor penalty for one excess word in the title that is not in the query. # Example 3, query "A B C", title "X Y A B C" title = deque(["X", "Y", "A", "B", "C"]) - assert find_word("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"]) - assert find_word("B", title) == (True, 0) and title == deque(["C", "X", "Y"]) - assert find_word("C", title) == (True, 0) and title == deque(["X", "Y"]) + assert find_word_and_rotate_title("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "X", "Y"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X", "Y"]) # Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two # excess words in the title that are not in the query. 
# Example 4, query "A B C", title "A B X Y C" title = deque(["A", "B", "X", "Y", "C"]) - assert find_word("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"]) - assert find_word("B", title) == (True, 0) and title == deque(["X", "Y", "C"]) - assert find_word("C", title) == (True, 2) and title == deque(["X", "Y"]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"]) + assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["X", "Y", "C"]) + assert find_word_and_rotate_title("C", title) == (True, 2) and title == deque(["X", "Y"]) # Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two # excess words in the title that are not in the query. # Example 5, query "A B C", title "A C B" title = deque(["A", "C", "B"]) - assert find_word("A", title) == (True, 0) and title == deque(["C", "B"]) - assert find_word("B", title) == (True, 1) and title == deque(["C"]) - assert find_word("C", title) == (True, 0) and title == deque([]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "B"]) + assert find_word_and_rotate_title("B", title) == (True, 1) and title == deque(["C"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([]) # Conclusion: average penalty for skipping one word in the middle of the title. 
# Example 6, query "A B C", title "A C X" title = deque(["A", "C", "X"]) - assert find_word("A", title) == (True, 0) and title == deque(["C", "X"]) - assert find_word("B", title) == (False, 0) and title == deque(["C", "X"]) - assert find_word("C", title) == (True, 0) and title == deque(["X"]) + assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "X"]) + assert find_word_and_rotate_title("B", title) == (False, 0) and title == deque(["C", "X"]) + assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X"]) # Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word. diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py index 8573e49e169..ca09b0398c3 100644 --- a/src/tribler/core/utilities/search_utils.py +++ b/src/tribler/core/utilities/search_utils.py @@ -191,11 +191,12 @@ def calculate_rank(query: List[str], title: List[str]) -> float: # The first word is more important than the second word, and so on word_weight = POSITION_COEFF / (POSITION_COEFF + i) - # Read the description of the `find_word` function to understand what is going on. Basically, we are trying - # to find each query word in the title words, calculate the penalty if the query word is not found or if there - # are some title words before it, and then rotate the skipped title words to the end of the title. This way, - # the least penalty got a title that has query words in the proper order at the beginning of the title. - found, skipped = find_word(word, title) + # Read the description of the `find_word_and_rotate_title` function to understand what is going on. + # Basically, we are trying to find each query word in the title words, calculate the penalty if the query word + # is not found or if there are some title words before it, and then rotate the skipped title words to the end + # of the title. 
This way, the smallest penalty goes to a title that has query words in the proper order at the + # beginning of the title. + found, skipped = find_word_and_rotate_title(word, title) if found: # if the query word is found in the title, add penalty for skipped words in title before it total_error += skipped * word_weight @@ -212,7 +213,7 @@ def calculate_rank(query: List[str], title: List[str]) -> float: return RANK_NORMALIZATION_COEFF / (RANK_NORMALIZATION_COEFF + total_error) -def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]: +def find_word_and_rotate_title(word: str, title: Deque[str]) -> Tuple[bool, int]: """ Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title. @@ -225,10 +226,10 @@ def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]: For efficiency reasons, the function modifies the `title` deque in place by removing the first entrance of the found word and rotating all leading non-matching words to the end of the deque. It allows to efficiently - perform multiple calls of the `find_word` function for subsequent words from the same query string. + perform multiple calls of the `find_word_and_rotate_title` function for subsequent words from the same query string. - An example: find_word('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that - the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies + An example: find_word_and_rotate_title('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means + that the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and the leading non-matching words ('X', 'Y') were moved to the end of the deque. """