Skip to content

Commit

Permalink
Move examples for find_word function from the docstring to a unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
kozlovsky committed Nov 1, 2022
1 parent efe27da commit 52d0e5d
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 58 deletions.
57 changes: 55 additions & 2 deletions src/tribler/core/tests/test_search_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from collections import deque

import pytest

from tribler.core.utilities.search_utils import filter_keywords, item_rank, split_into_keywords, torrent_rank, \
title_rank
from tribler.core.utilities.search_utils import filter_keywords, find_word, item_rank, split_into_keywords,\
torrent_rank, title_rank


# One day expressed as 60 * 60 * 24 = 86400 units -- presumably seconds; the usage is outside this view, confirm.
DAY = 60 * 60 * 24
Expand Down Expand Up @@ -120,3 +122,54 @@ def test_title_rank():
def test_item_rank():
    """Check the rank `item_rank` computes for the query "abc" and an item named "abc"."""
    entry = {"name": "abc", "num_seeders": 10, "num_leechers": 20}
    rank = item_rank("abc", entry)
    assert rank == pytest.approx(0.81978445)


def test_find_word():
    """Run `find_word` over six query/title scenarios, checking both its result and the in-place title mutation."""
    # To use the find_word function, you can call it one time for each word from the query and see:
    # - how many query words are missed in the title;
    # - how many excess or out-of-place title words are found before each query word;
    # - and how many title words are not mentioned in the query.

    # Every scenario uses the same query, "A B C".
    query = ("A", "B", "C")

    # Each scenario is (title words, [(expected result, expected title remainder) for each query word]).
    scenarios = [
        # Example 1, title "A B C".
        # Conclusion: exact match.
        (["A", "B", "C"],
         [((True, 0), ["B", "C"]),
          ((True, 0), ["C"]),
          ((True, 0), [])]),

        # Example 2, title "A B C D".
        # Conclusion: minor penalty for one excess word in the title that is not in the query.
        (["A", "B", "C", "D"],
         [((True, 0), ["B", "C", "D"]),
          ((True, 0), ["C", "D"]),
          ((True, 0), ["D"])]),

        # Example 3, title "X Y A B C".
        # Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty
        # for two excess words in the title that are not in the query.
        (["X", "Y", "A", "B", "C"],
         [((True, 2), ["B", "C", "X", "Y"]),
          ((True, 0), ["C", "X", "Y"]),
          ((True, 0), ["X", "Y"])]),

        # Example 4, title "A B X Y C".
        # Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty
        # for two excess words in the title that are not in the query.
        (["A", "B", "X", "Y", "C"],
         [((True, 0), ["B", "X", "Y", "C"]),
          ((True, 0), ["X", "Y", "C"]),
          ((True, 2), ["X", "Y"])]),

        # Example 5, title "A C B".
        # Conclusion: average penalty for skipping one word in the middle of the title.
        (["A", "C", "B"],
         [((True, 0), ["C", "B"]),
          ((True, 1), ["C"]),
          ((True, 0), [])]),

        # Example 6, title "A C X".
        # Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word.
        (["A", "C", "X"],
         [((True, 0), ["C", "X"]),
          ((False, 0), ["C", "X"]),
          ((True, 0), ["X"])]),
    ]

    for title_words, expectations in scenarios:
        # A fresh deque per scenario: find_word mutates it, and each step depends on the previous one.
        title = deque(title_words)
        for word, (expected_result, expected_rest) in zip(query, expectations):
            assert find_word(word, title) == expected_result
            assert title == deque(expected_rest)
63 changes: 7 additions & 56 deletions src/tribler/core/utilities/search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,73 +196,24 @@ def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]:
"""
Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title.
:param word: a word from the query
:param title: a list of words in the title
:param word: a word from the user-defined query string
:param title: a deque of words in the title
:return: a two-elements tuple, whether the word was found in the title and the number of skipped words
This is a helper function to efficiently answer a question of how close a query string and a title string are,
taking into account the ordering of words in both strings.
The `word` parameter is a word from a search string.
The `title` parameter is a deque of words from the torrent title. It also can be a deque of stemmed words
if the `torrent_rank` function supports stemming.
The `find_word` function returns the boolean value of whether the word was found in the title deque or not and
the number of the skipped leading words in the `title` deque. Also, it modifies the `title` deque in place by
removing the first entrance of the found word and rotating all leading non-matching words to the end of the deque.
For efficiency reasons, the function modifies the `title` deque in place by removing the first entrance
of the found word and rotating all leading non-matching words to the end of the deque. It allows to efficiently
perform multiple calls of the `find_word` function for subsequent words from the same query string.
An example: find_word('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that
the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies
the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and
the leading non-matching words ('X', 'Y') was moved to the end of the deque.
Now some examples of how the function can be used. To use the function, you can call it one time for each word
from the query and see:
- how many query words are missed in the title;
- how many excess or out-of-place title words are found before each query word;
- and how many title words are not mentioned in the query.
Example 1, query "A B C", title "A B C":
find_word("A", deque(["A", "B", "C"])) -> (found=True, skipped=0, rest=deque(["B", "C"]))
find_word("B", deque(["B", "C"])) -> (found=True, skipped=0, rest=deque(["C"]))
find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque([]))
Conclusion: exact match.
Example 2, query "A B C", title "A B C D":
find_word("A", deque(["A", "B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["B", "C", "D"]))
find_word("B", deque(["B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["C", "D"]))
find_word("C", deque(["C", "D"])) -> (found=True, skipped=0, rest=deque(["D"]))
Conclusion: minor penalty for one excess word in the title that is not in the query.
Example 3, query "A B C", title "X Y A B C":
find_word("A", deque(["X", "Y", "A", "B", "C"])) -> (found=True, skipped=2, rest=deque(["B", "C", "X", "Y"]))
find_word("B", deque(["B", "C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["C", "X", "Y"]))
find_word("C", deque(["C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["X", "Y"]))
Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two
excess words in the title that are not in the query.
Example 4, query "A B C", title "A B X Y C":
find_word("A", deque(["A", "B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["B", "X", "Y", "C"]))
find_word("B", deque(["B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["X", "Y", "C"]))
find_word("C", deque(["X", "Y", "C"])) -> (found=True, skipped=2, rest=deque(["X", "Y"]))
Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two
excess words in the title that are not in the query.
Example 5, query "A B C", title "A C B":
find_word("A", deque(["A", "C", "B"])) -> (found=True, skipped=0, rest=deque(["C", "B"]))
find_word("B", deque(["C", "B"])) -> (found=True, skipped=1, rest=deque(["C"]))
find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque(["C"]))
Conclusion: average penalty for skipping one word in the middle of the title.
Example 6, query "A B C", title "A C X":
find_word("A", deque(["A", "C", "X"])) -> (found=True, skipped=0, rest=deque(["C", "X"]))
find_word("B", deque(["C", "X"])) -> (found=False, skipped=0, rest=deque(["C", "X"]))
find_word("C", deque(["C", "X"])) -> (found=True, skipped=0, rest=deque(["X"]))
Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word.
the leading non-matching words ('X', 'Y') were moved to the end of the deque.
"""
try:
skipped = title.index(word)
skipped = title.index(word) # find the query word placement in the title and the number of preceding words
except ValueError:
return False, 0

Expand Down

0 comments on commit 52d0e5d

Please sign in to comment.