Move examples for find_word function from the docstring to a unit test

Tribler · Nov 1, 2022 · 52d0e5d · 52d0e5d
1 parent efe27da
commit 52d0e5d
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 58 deletions.
diff --git a/src/tribler/core/tests/test_search_utils.py b/src/tribler/core/tests/test_search_utils.py
@@ -1,7 +1,9 @@
+from collections import deque
+
 import pytest
 
-from tribler.core.utilities.search_utils import filter_keywords, item_rank, split_into_keywords, torrent_rank, \
-    title_rank
+from tribler.core.utilities.search_utils import filter_keywords, find_word, item_rank, split_into_keywords,\
+    torrent_rank, title_rank
 
 
 DAY = 60 * 60 * 24
@@ -120,3 +122,54 @@ def test_title_rank():
 def test_item_rank():
     item = dict(name="abc", num_seeders=10, num_leechers=20)
     assert item_rank("abc", item) == pytest.approx(0.81978445)
+
+
+def test_find_word():
+    # To use the find_word function, you can call it one time for each word from the query and see:
+    # - how many query words are missing from the title;
+    # - how many excess or out-of-place title words are found before each query word;
+    # - and how many title words are not mentioned in the query.
+
+    # Example 1, query "A B C", title "A B C"
+    title = deque(["A", "B", "C"])
+    assert find_word("A", title) == (True, 0) and title == deque(["B", "C"])
+    assert find_word("B", title) == (True, 0) and title == deque(["C"])
+    assert find_word("C", title) == (True, 0) and title == deque([])
+    # Conclusion: exact match.
+
+    # Example 2, query "A B C", title "A B C D"
+    title = deque(["A", "B", "C", "D"])
+    assert find_word("A", title) == (True, 0) and title == deque(["B", "C", "D"])
+    assert find_word("B", title) == (True, 0) and title == deque(["C", "D"])
+    assert find_word("C", title) == (True, 0) and title == deque(["D"])
+    # Conclusion: minor penalty for one excess word in the title that is not in the query.
+
+    # Example 3, query "A B C", title "X Y A B C"
+    title = deque(["X", "Y", "A", "B", "C"])
+    assert find_word("A", title) == (True, 2) and title == deque(["B", "C", "X", "Y"])
+    assert find_word("B", title) == (True, 0) and title == deque(["C", "X", "Y"])
+    assert find_word("C", title) == (True, 0) and title == deque(["X", "Y"])
+    # Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two
+    # excess words in the title that are not in the query.
+
+    # Example 4, query "A B C", title "A B X Y C"
+    title = deque(["A", "B", "X", "Y", "C"])
+    assert find_word("A", title) == (True, 0) and title == deque(["B", "X", "Y", "C"])
+    assert find_word("B", title) == (True, 0) and title == deque(["X", "Y", "C"])
+    assert find_word("C", title) == (True, 2) and title == deque(["X", "Y"])
+    # Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two
+    # excess words in the title that are not in the query.
+
+    # Example 5, query "A B C", title "A C B"
+    title = deque(["A", "C", "B"])
+    assert find_word("A", title) == (True, 0) and title == deque(["C", "B"])
+    assert find_word("B", title) == (True, 1) and title == deque(["C"])
+    assert find_word("C", title) == (True, 0) and title == deque([])
+    # Conclusion: average penalty for skipping one word in the middle of the title.
+
+    # Example 6, query "A B C", title "A C X"
+    title = deque(["A", "C", "X"])
+    assert find_word("A", title) == (True, 0) and title == deque(["C", "X"])
+    assert find_word("B", title) == (False, 0) and title == deque(["C", "X"])
+    assert find_word("C", title) == (True, 0) and title == deque(["X"])
+    # Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word.
diff --git a/src/tribler/core/utilities/search_utils.py b/src/tribler/core/utilities/search_utils.py
@@ -196,73 +196,24 @@ def find_word(word: str, title: Deque[str]) -> Tuple[bool, int]:
     """
     Finds the query word in the title. Returns whether it was found or not and the number of skipped words in the title.
 
-    :param word: a word from the query
-    :param title: a list of words in the title
+    :param word: a word from the user-defined query string
+    :param title: a deque of words in the title
     :return: a two-elements tuple, whether the word was found in the title and the number of skipped words
 
     This is a helper function to efficiently answer a question of how close a query string and a title string are,
     taking into account the ordering of words in both strings.
 
-    The `word` parameter is a word from a search string.
-
-    The `title` parameter is a deque of words from the torrent title. It also can be a deque of stemmed words
-    if the `torrent_rank` function supports stemming.
-
-    The `find_word` function returns the boolean value of whether the word was found in the title deque or not and
-    the number of the skipped leading words in the `title` deque. Also, it modifies the `title` deque in place by
-    removing the first entrance of the found word and rotating all leading non-matching words to the end of the deque.
+    For efficiency reasons, the function modifies the `title` deque in place by removing the first occurrence
+    of the found word and rotating all leading non-matching words to the end of the deque. This lets callers efficiently
+    perform multiple calls of the `find_word` function for subsequent words from the same query string.
 
     An example: find_word('A', deque(['X', 'Y', 'A', 'B', 'C'])) returns `(True, 2)`, where True means that
     the word 'A' was found in the `title` deque, and 2 is the number of skipped words ('X', 'Y'). Also, it modifies
     the `title` deque, so it starts looking like deque(['B', 'C', 'X', 'Y']). The found word 'A' was removed, and
-    the leading non-matching words ('X', 'Y') was moved to the end of the deque.
-
-    Now some examples of how the function can be used. To use the function, you can call it one time for each word
-    from the query and see:
-    - how many query words are missed in the title;
-    - how many excess or out-of-place title words are found before each query word;
-    - and how many title words are not mentioned in the query.
-
-    Example 1, query "A B C", title "A B C":
-    find_word("A", deque(["A", "B", "C"])) -> (found=True, skipped=0, rest=deque(["B", "C"]))
-    find_word("B", deque(["B", "C"])) -> (found=True, skipped=0, rest=deque(["C"]))
-    find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque([]))
-    Conclusion: exact match.
-
-    Example 2, query "A B C", title "A B C D":
-    find_word("A", deque(["A", "B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["B", "C", "D"]))
-    find_word("B", deque(["B", "C", "D"])) -> (found=True, skipped=0, rest=deque(["C", "D"]))
-    find_word("C", deque(["C", "D"])) -> (found=True, skipped=0, rest=deque(["D"]))
-    Conclusion: minor penalty for one excess word in the title that is not in the query.
-
-    Example 3, query "A B C", title "X Y A B C":
-    find_word("A", deque(["X", "Y", "A", "B", "C"])) -> (found=True, skipped=2, rest=deque(["B", "C", "X", "Y"]))
-    find_word("B", deque(["B", "C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["C", "X", "Y"]))
-    find_word("C", deque(["C", "X", "Y"])) -> (found=True, skipped=0, rest=deque(["X", "Y"]))
-    Conclusion: major penalty for skipping two words at the beginning of the title plus a minor penalty for two
-    excess words in the title that are not in the query.
-
-    Example 4, query "A B C", title "A B X Y C":
-    find_word("A", deque(["A", "B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["B", "X", "Y", "C"]))
-    find_word("B", deque(["B", "X", "Y", "C"])) -> (found=True, skipped=0, rest=deque(["X", "Y", "C"]))
-    find_word("C", deque(["X", "Y", "C"])) -> (found=True, skipped=2, rest=deque(["X", "Y"]))
-    Conclusion: average penalty for skipping two words in the middle of the title plus a minor penalty for two
-    excess words in the title that are not in the query.
-
-    Example 5, query "A B C", title "A C B":
-    find_word("A", deque(["A", "C", "B"])) -> (found=True, skipped=0, rest=deque(["C", "B"]))
-    find_word("B", deque(["C", "B"])) -> (found=True, skipped=1, rest=deque(["C"]))
-    find_word("C", deque(["C"])) -> (found=True, skipped=0, rest=deque(["C"]))
-    Conclusion: average penalty for skipping one word in the middle of the title.
-
-    Example 6, query "A B C", title "A C X":
-    find_word("A", deque(["A", "C", "X"])) -> (found=True, skipped=0, rest=deque(["C", "X"]))
-    find_word("B", deque(["C", "X"])) -> (found=False, skipped=0, rest=deque(["C", "X"]))
-    find_word("C", deque(["C", "X"])) -> (found=True, skipped=0, rest=deque(["X"]))
-    Conclusion: huge penalty for missing one query word plus a minor penalty for one excess title word.
+    the leading non-matching words ('X', 'Y') were moved to the end of the deque.
     """
     try:
-        skipped = title.index(word)
+        skipped = title.index(word)  # find the query word placement in the title and the number of preceding words
     except ValueError:
         return False, 0