adbar · adbar · May 3, 2024 · May 2, 2024 · May 2, 2024 · May 2, 2024
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
@@ -17,8 +17,8 @@
 
 from trafilatura import cli, cli_utils, spider  # settings
 from trafilatura.downloads import add_to_compressed_dict, fetch_url
-from trafilatura.filters import LANGID_FLAG
 from trafilatura.settings import args_to_extractor
+from trafilatura.utils import LANGID_FLAG
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')

diff --git a/tests/deduplication_tests.py b/tests/deduplication_tests.py
@@ -0,0 +1,129 @@
+# pylint:disable-msg=I1101
+"""
+Unit tests for trafilatura's text hashing and cache.
+"""
+
+from lxml import etree, html
+
+import trafilatura.deduplication
+
+from trafilatura import extract
+from trafilatura.cli_utils import generate_hash_filename
+from trafilatura.core import Extractor
+from trafilatura.deduplication import (LRUCache, Simhash, content_fingerprint,
+                                       duplicate_test)
+
+
+DEFAULT_OPTIONS = Extractor()
+
+
+def test_hashes():
+    "Check the content fingerprint and the hashed filename of a fixed string."
+    sample = "abcde ijk l, " * 10
+    assert content_fingerprint(sample) == "528497a1d07b66d6"
+    assert generate_hash_filename(sample) == "42LNugG3Sc95646i"
+
+
+
+def test_simhash():
+    "Test similarity calculation and hash validation of the Simhash class."
+    # https://en.wiktionary.org/wiki/put_lipstick_on_a_pig
+    factor = 1
+    hashes = [Simhash(text*factor) for text in (
+        "This is like putting lipstick on a pig.",
+        "This is just like putting lipstick on a pig.",
+        "Putting lipstick on a pig is what this is about.",
+        "The words are completely different but let's see.",
+    )]
+
+    sims = [hashes[0].similarity(h) for h in hashes]
+    assert sims[0] == 1.0 and min(sims) == sims[-1]
+
+    # sanity checks: the hex form round-trips through existing_hash
+    assert Simhash(existing_hash=hashes[0].to_hex()).hash == hashes[0].hash
+    assert int(hex(hashes[0].hash)[2:], 16) == hashes[0].hash
+    assert Simhash(existing_hash=hashes[0].to_hex()).to_hex() == hashes[0].to_hex()
+
+    # existing_hash is either kept or discarded and re-hashed
+    assert Simhash(existing_hash="aghj").hash == 18446744073709551615
+    assert Simhash(existing_hash="18446744073709551615").hash == 18446744073709551615
+    assert Simhash(existing_hash=123).hash != 123
+    assert Simhash(existing_hash=18446744073709551615).hash == 18446744073709551615
+    assert Simhash(existing_hash=None).hash == Simhash().hash
+
+    # similarity
+    assert Simhash("abcde").similarity(Simhash("abcde")) == 1.0
+    assert Simhash("abcde").similarity(Simhash("abcde", length=2)) != 1.0
+    assert Simhash("abcde").similarity(Simhash("fghij")) < 0.6
+    assert Simhash("abcde "*100).similarity(Simhash("abcde")) == 1.0
+
+
+def test_lrucache():
+    '''Test basic duplicate detection with an LRUCache of maxsize 2.'''
+    lru_test = LRUCache(maxsize=2)
+    trafilatura.deduplication.LRU_TEST = lru_test  # presumably the cache duplicate_test consults -- confirm
+    my_body = etree.Element('body')
+
+    ### element too short (disabled: put_in_cache and DEFAULT_CONFIG are not imported in this module)
+    #my_element = html.fromstring('<p>AAAA BBBB</p>')
+    #my_body.append(my_element)
+    #put_in_cache(my_body)
+    #assert duplicate_test(my_element, DEFAULT_CONFIG) is False
+    ### cached element: only the fourth call in a row is expected to report a duplicate
+    my_element = html.fromstring('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
+    my_body.append(my_element)
+    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
+    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
+    assert duplicate_test(my_body, DEFAULT_OPTIONS) is False
+    assert duplicate_test(my_element, DEFAULT_OPTIONS) is True
+    other_body = etree.Element('body')
+    other_element = html.fromstring('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
+    other_body.append(other_element)
+    assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
+    assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
+    assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
+    assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
+    yet_another_body = etree.Element('body')
+    yet_another_element = html.fromstring('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
+    yet_another_body.append(yet_another_element)
+    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
+    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
+    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
+    # the cache was created with maxsize=2: the two latest texts are expected
+    # to still be flagged, while the first one should no longer be known
+    assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
+    assert duplicate_test(yet_another_element, DEFAULT_OPTIONS) is True
+    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
+    # clear the cache: a previously flagged element is new again
+    lru_test.clear()
+    assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
+    # looking up a missing key yields -1
+    assert lru_test.get('tralala') == -1
+
+
+def test_dedup():
+    "Test deduplication through extract() and, per paragraph, process_node()."
+    my_p = '<p>abc</p>'
+    doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
+    trafilatura.deduplication.LRU_TEST = LRUCache(maxsize=2)
+    assert extract(doc, deduplicate=True) is not None
+    assert extract(doc, deduplicate=True) is not None
+    assert extract(doc, deduplicate=True) is not None
+    assert extract(doc, deduplicate=True) is None
+
+    # paragraph level, starting again from an empty cache
+    trafilatura.deduplication.LRU_TEST = LRUCache(maxsize=2)
+    my_p = etree.fromstring('<p>' + 'abc'*50 + '</p>')
+    options = Extractor()  # private copy: do not mutate the shared DEFAULT_OPTIONS
+    options.dedup = True
+    assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
+    assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
+    assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
+    assert trafilatura.htmlprocessing.process_node(my_p, options) is None
+
+
+if __name__ == "__main__":
+    # run every test in definition order when executed as a script
+    for run_test in (test_hashes, test_simhash,
+                     test_lrucache, test_dedup):
+        run_test()
diff --git a/tests/filters_tests.py b/tests/filters_tests.py
@@ -1,33 +1,20 @@
 # pylint:disable-msg=I1101,W1401
 """
-Unit tests for the trafilatura's text filters and cache.
+Unit tests for trafilatura's text filters.
 """
 
-# language detection
-try:
-    import py3langid
-    LANGID_FLAG = True
-except ImportError:
-    LANGID_FLAG = False
+from lxml import html
 
-
-from lxml import etree, html
-
-import trafilatura.filters
-from trafilatura import extract, bare_extraction
-from trafilatura.core import Extractor
-from trafilatura.filters import (check_html_lang, duplicate_test,
-                                 language_filter)
-from trafilatura.lru import LRUCache
+from trafilatura import extract
 from trafilatura.metadata import Document
 from trafilatura.settings import DEFAULT_CONFIG
+from trafilatura.utils import LANGID_FLAG, check_html_lang, language_filter
+
 
 ZERO_CONFIG = DEFAULT_CONFIG
 ZERO_CONFIG['DEFAULT']['MIN_OUTPUT_SIZE'] = '0'
 ZERO_CONFIG['DEFAULT']['MIN_EXTRACTED_SIZE'] = '0'
 
-DEFAULT_OPTIONS = Extractor()
-
 SAMPLE_META = Document()
 
 
@@ -59,22 +46,7 @@ def test_filters():
     assert extract(doc, include_formatting=True, max_tree_size=500) is None
     doc = html.fromstring('<html><body>' + my_p*499 + '</body></html>')
     assert extract(doc, include_formatting=True, max_tree_size=500) is not None
-    ## deduplication
-    doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
-    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
-    assert extract(doc, deduplicate=True) is not None
-    assert extract(doc, deduplicate=True) is not None
-    assert extract(doc, deduplicate=True) is not None
-    assert extract(doc, deduplicate=True) is None
-    # paragraph level
-    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
-    my_p = etree.fromstring('<p>' + 'abc'*50 + '</p>')
-    options = DEFAULT_OPTIONS
-    options.dedup = True
-    assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
-    assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
-    assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
-    assert trafilatura.htmlprocessing.process_node(my_p, options) is None
+
     # HTML lang filter
     # no lang
     assert check_html_lang(html.fromstring('<html><body></body></html>'), target_language='en') is True
@@ -104,61 +76,19 @@ def test_filters():
     assert check_html_lang(html.fromstring('<html lang="en-US"><head><meta property="og:locale" content="de_DE" /></head><body></body></html>'), target_language='de', strict=True) is True
 
 
-def test_lrucache():
-    '''test basic duplicate detection'''
-    lru_test = LRUCache(maxsize=2)
-    trafilatura.filters.LRU_TEST = lru_test
-    my_body = etree.Element('body')
-
-    ### element too short
-    #my_element = html.fromstring('<p>AAAA BBBB</p>')
-    #my_body.append(my_element)
-    #put_in_cache(my_body)
-    #assert duplicate_test(my_element, DEFAULT_CONFIG) is False
-    ### cached element
-    my_element = html.fromstring('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
-    my_body.append(my_element)
-    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
-    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
-    assert duplicate_test(my_body, DEFAULT_OPTIONS) is False
-    assert duplicate_test(my_element, DEFAULT_OPTIONS) is True
-    other_body = etree.Element('body')
-    other_element = html.fromstring('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
-    other_body.append(other_element)
-    assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
-    assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
-    assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
-    assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
-    yet_another_body = etree.Element('body')
-    yet_another_element = html.fromstring('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
-    yet_another_body.append(yet_another_element)
-    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
-    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
-    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
-    # 2 elements in cache, original element has been cleared?
-    # print(LRU_TEST.maxsize, LRU_TEST.full)
-    assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
-    assert duplicate_test(yet_another_element, DEFAULT_OPTIONS) is True
-    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
-    # clear the cache
-    lru_test.clear()
-    assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
-    # get wrong key
-    assert lru_test.get('tralala') == -1
-
 
 def test_prune_xpath():
     '''test xpath pruning (parameter in extract and bare_extraction)'''
     #create example html
     def doc():
         my_p = '<p>abc</p>'
         return html.fromstring('<html><body>' + my_p*50 + '</body></html>')
-    
+
     def doc2():
         my_p = '<p>abc</p>'
         my_h1 = '<h1>ABC</h1>'
         return html.fromstring('<html><body>' + my_h1 + my_p*50 + '</body></html>')
-    
+
     def doc3():
         my_p = '<p>abc</p>'
         my_h1 = '<h1>ABC</h1>'
@@ -178,5 +108,4 @@ def doc3():
 
 if __name__ == '__main__':
     test_filters()
-    test_lrucache()
     test_prune_xpath()
diff --git a/tests/hashing_tests.py b/tests/hashing_tests.py
diff --git a/tests/sitemaps_tests.py b/tests/sitemaps_tests.py
@@ -10,7 +10,8 @@
 
 import trafilatura
 from trafilatura import sitemaps
-from trafilatura.utils import decode_file, is_similar_domain
+from trafilatura.deduplication import is_similar_domain
+from trafilatura.utils import decode_file
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 

diff --git a/tests/spider_tests.py b/tests/spider_tests.py
@@ -5,20 +5,17 @@
 
 import logging
 import sys
+
 from collections import deque
 
 import pytest
+
 from courlan import UrlStore
 
 from trafilatura import spider
 from trafilatura.settings import DEFAULT_CONFIG
+from trafilatura.utils import LANGID_FLAG
 
-# language detection
-try:
-    import py3langid
-    LANGID_FLAG = True
-except ImportError:
-    LANGID_FLAG = False
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)