Skip to content

Commit

Permalink
Rename the function: find_word -> find_word_and_rotate_title
Browse files Browse the repository at this point in the history
  • Loading branch information
kozlovsky committed Nov 4, 2022
1 parent 946e573 commit 6df2eb0
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 29 deletions.
40 changes: 20 additions & 20 deletions src/tribler/core/tests/test_search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from tribler.core.utilities.search_utils import filter_keywords, find_word, freshness_rank, item_rank, seeders_rank, \
from tribler.core.utilities.search_utils import filter_keywords, find_word_and_rotate_title, freshness_rank, item_rank, seeders_rank, \
split_into_keywords, torrent_rank, title_rank


Expand Down Expand Up @@ -153,51 +153,51 @@ def test_item_rank():


def test_find_word():
# To use the find_word function, you can call it one time for each word from the query and see:
# To use the find_word_and_rotate_title function, you can call it one time for each word from the query and see:
# - how many query words are missed in the title;
# - how many excess or out-of-place title words are found before each query word;
# - and how many title words are not mentioned in the query.

# Example 1, query "A B C", title "A B C"
title = deque(["A", "B", "C"])
assert find_word("A", title) == (True, 0) and title == deque(["B", "C"])
assert find_word("B", title) == (True, 0) and title == deque(["C"])
assert find_word("C", title) == (True, 0) and title == deque([])
assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C"])
assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C"])
assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([])
# Conclusion: exact match.

# Example 2, query "A B C", title "A B C D"
title = deque(["A", "B", "C", "D"])
assert find_word("A", title) == (True, 0) and title == deque(["B", "C", "D"])
assert find_word("B", title) == (True, 0) and title == deque(["C", "D"])
assert find_word("C", title) == (True, 0) and title == deque(["D"])
assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "C", "D"])
assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "D"])
assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["D"])
# Conclusion: minor penalty for one excess word in the title that is not in the query.

# Example 3, query "A B C", title "X Y A B C"
title = deque(["X", "Y", "A", "B", "C"])
assert find_word("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"])
assert find_word("B", title) == (True, 0) and title == deque(["C", "X", "Y"])
assert find_word("C", title) == (True, 0) and title == deque(["X", "Y"])
assert find_word_and_rotate_title("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"])
assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["C", "X", "Y"])
assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X", "Y"])
# Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two
# excess words in the title that are not in the query.

# Example 4, query "A B C", title "A B X Y C"
title = deque(["A", "B", "X", "Y", "C"])
assert find_word("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"])
assert find_word("B", title) == (True, 0) and title == deque(["X", "Y", "C"])
assert find_word("C", title) == (True, 2) and title == deque(["X", "Y"])
assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"])
assert find_word_and_rotate_title("B", title) == (True, 0) and title == deque(["X", "Y", "C"])
assert find_word_and_rotate_title("C", title) == (True, 2) and title == deque(["X", "Y"])
# Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two
# excess words in the title that are not in the query.

# Example 5, query "A B C", title "A C B"
title = deque(["A", "C", "B"])
assert find_word("A", title) == (True, 0) and title == deque(["C", "B"])
assert find_word("B", title) == (True, 1) and title == deque(["C"])
assert find_word("C", title) == (True, 0) and title == deque([])
assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "B"])
assert find_word_and_rotate_title("B", title) == (True, 1) and title == deque(["C"])
assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque([])
# Conclusion: average penalty for skipping one word in the middle of the title.

# Example 6, query "A B C", title "A C X"
title = deque(["A", "C", "X"])
assert find_word("A", title) == (True, 0) and title == deque(["C", "X"])
assert find_word("B", title) == (False, 0) and title == deque(["C", "X"])
assert find_word("C", title) == (True, 0) and title == deque(["X"])
assert find_word_and_rotate_title("A", title) == (True, 0) and title == deque(["C", "X"])
assert find_word_and_rotate_title("B", title) == (False, 0) and title == deque(["C", "X"])
assert find_word_and_rotate_title("C", title) == (True, 0) and title == deque(["X"])
# Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word.
19 changes: 10 additions & 9 deletions src/tribler/core/utilities/search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,11 +191,12 @@ def calculate_rank(query: List[str], title: List[str]) -> float:
# The first word is more important than the second word, and so on
word_weight = POSITION_COEFF / (POSITION_COEFF + i)

# Read the description of the `find_word` function to understand what is going on. Basically, we are trying
# to find each query word in the title words, calculate the penalty if the query word is not found or if there
# are some title words before it, and then rotate the skipped title words to the end of the title. This way,
# the least penalty got a title that has query words in the proper order at the beginning of the title.
found, skipped = find_word(word, title)
# Read the description of the `find_word_and_rotate_title` function to understand what is going on.
# Basically, we are trying to find each query word in the title words, calculate the penalty if the query word
# is not found or if there are some title words before it, and then rotate the skipped title words to the end
# of the title. This way, the smallest penalty goes to a title that has the query words in the proper order at
# the beginning of the title.
found, skipped = find_word_and_rotate_title(word, title)
if found:
# if the query word is found in the title, add penalty for skipped words in title before it
total_error += skipped * word_weight
Expand All @@ -212,7 +213,7 @@ def calculate_rank(query: List[str], title: List[str]) -> float:
return RANK_NORMALIZATION_COEFF / (RANK_NORMALIZATION_COEFF + total_error)


def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]:
def find_word_and_rotate_title(word: str, title: Deque[str]) -> Tuple[bool, int]:
"""
Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title.
Expand All @@ -225,10 +226,10 @@ def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]:
For efficiency reasons, the function modifies the `title` deque in place by removing the first occurrence
of the found word and rotating all leading non-matching words to the end of the deque. This makes it possible to efficiently
perform multiple calls of the `find_word` function for subsequent words from the same query string.
perform multiple calls of the `find_word_and_rotate_title` function for subsequent words from the same query string.
An example: find_word('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that
the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies
An example: find_word_and_rotate_title('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means
that the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies
the `title` deque, so that it becomes deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and
the leading non-matching words ('X', 'Y') were moved to the end of the deque.
"""
Expand Down

0 comments on commit 6df2eb0

Please sign in to comment.