Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

re-group classes and functions linked to deduplication #582

Merged
merged 7 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

from trafilatura import cli, cli_utils, spider # settings
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.filters import LANGID_FLAG
from trafilatura.settings import args_to_extractor
from trafilatura.utils import LANGID_FLAG

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
Expand Down
129 changes: 129 additions & 0 deletions tests/deduplication_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# pylint:disable-msg=I1101
"""
Unit tests for trafilatura's text hashing and cache.
"""

from lxml import etree, html

import trafilatura.deduplication

from trafilatura import extract
from trafilatura.cli_utils import generate_hash_filename
from trafilatura.core import Extractor
from trafilatura.deduplication import (LRUCache, Simhash, content_fingerprint,
duplicate_test)


# Extractor built with default arguments, shared by the tests in this module.
DEFAULT_OPTIONS = Extractor()


def test_hashes():
    """Check both hashing helpers against known digests of a fixed sample."""
    sample = 10 * "abcde ijk l, "
    expected_fingerprint = "528497a1d07b66d6"
    expected_filename = "42LNugG3Sc95646i"
    assert content_fingerprint(sample) == expected_fingerprint
    assert generate_hash_filename(sample) == expected_filename



def test_simhash():
    """Test similarity calculation based on the Simhash class.

    Covers the relative similarity of a few sentences, the round trip of a
    hash through its hexadecimal form, construction from ``existing_hash``
    values and a handful of absolute similarity scores.
    """
    # https://en.wiktionary.org/wiki/put_lipstick_on_a_pig
    factor = 1  # repetition count applied to every sample sentence
    sentences = [
        "This is like putting lipstick on a pig.",
        "This is just like putting lipstick on a pig.",
        "Putting lipstick on a pig is what this is about.",
        "The words are completely different but let's see.",
    ]
    hashes = [Simhash(sentence * factor) for sentence in sentences]

    # the first sentence is identical to itself and no sentence scores
    # lower than the last one
    sims = [hashes[0].similarity(h) for h in hashes]
    assert sims[0] == 1.0 and min(sims) == sims[-1]

    # sanity checks: the hash survives a round trip through its hex form
    assert Simhash(existing_hash=hashes[0].to_hex()).hash == hashes[0].hash
    assert int(hex(hashes[0].hash)[2:], 16) == hashes[0].hash

    # re-hashed
    # NOTE(review): presumably inputs that cannot be reused as an existing
    # hash are hashed again -- confirm against the Simhash implementation
    max_hash = 18446744073709551615  # 2**64 - 1
    assert Simhash(existing_hash="aghj").hash == max_hash
    assert Simhash(existing_hash="18446744073709551615").hash == max_hash
    assert Simhash(existing_hash=123).hash != 123
    assert Simhash(existing_hash=18446744073709551615).hash == max_hash
    assert Simhash(existing_hash=None).hash == Simhash().hash

    # similarity
    assert Simhash("abcde").similarity(Simhash("abcde")) == 1.0
    assert Simhash("abcde").similarity(Simhash("abcde", length=2)) != 1.0
    assert Simhash("abcde").similarity(Simhash("fghij")) < 0.6
    assert Simhash("abcde "*100).similarity(Simhash("abcde")) == 1.0


def test_lrucache():
    '''Test basic duplicate detection with a two-slot LRU cache.

    A fresh LRUCache(maxsize=2) is assigned to
    trafilatura.deduplication.LRU_TEST before duplicate_test() is called.
    The assertions are order-dependent: each call is expected to change the
    cache state seen by the following ones, so statements must not be
    reordered.
    '''
    lru_test = LRUCache(maxsize=2)
    # replace the module-level cache so the test starts from a known state
    # (presumably the cache duplicate_test() consults -- the asserts below
    # rely on it)
    trafilatura.deduplication.LRU_TEST = lru_test
    my_body = etree.Element('body')

    ### element too short
    #my_element = html.fromstring('<p>AAAA BBBB</p>')
    #my_body.append(my_element)
    #put_in_cache(my_body)
    #assert duplicate_test(my_element, DEFAULT_CONFIG) is False
    ### cached element
    my_element = html.fromstring('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
    my_body.append(my_element)
    # three checks return False, the fourth one is reported as a duplicate
    # (the body is checked once in between; its only child is my_element)
    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
    assert duplicate_test(my_body, DEFAULT_OPTIONS) is False
    assert duplicate_test(my_element, DEFAULT_OPTIONS) is True
    # second text: same pattern, alternating between body and element
    other_body = etree.Element('body')
    other_element = html.fromstring('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
    other_body.append(other_element)
    assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
    assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
    assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
    assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
    # third text: one more entry than the cache (maxsize=2) can hold
    yet_another_body = etree.Element('body')
    yet_another_element = html.fromstring('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
    yet_another_body.append(yet_another_element)
    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
    assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
    # 2 elements in cache, original element has been cleared?
    # print(LRU_TEST.maxsize, LRU_TEST.full)
    # the two most recent texts are still flagged, the first one is not
    assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
    assert duplicate_test(yet_another_element, DEFAULT_OPTIONS) is True
    assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
    # clear the cache
    lru_test.clear()
    # after clearing, a previously flagged text is no longer a duplicate
    assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
    # get wrong key
    # a lookup with an unknown key is expected to return -1
    assert lru_test.get('tralala') == -1


def test_dedup():
    """Test deduplication at document level and at paragraph level.

    Both parts start from a fresh LRUCache(maxsize=2) assigned to
    trafilatura.deduplication.LRU_TEST. In each case the same input is
    expected to be accepted three times and rejected (None) the fourth time.
    """
    my_p = '<p>abc</p>'
    doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
    trafilatura.deduplication.LRU_TEST = LRUCache(maxsize=2)
    # the first three extractions yield output, the fourth yields nothing
    for _ in range(3):
        assert extract(doc, deduplicate=True) is not None
    assert extract(doc, deduplicate=True) is None

    # paragraph level
    trafilatura.deduplication.LRU_TEST = LRUCache(maxsize=2)
    my_p = etree.fromstring('<p>' + 'abc'*50 + '</p>')
    # use a fresh options object: switching on dedup directly on the shared
    # module-level DEFAULT_OPTIONS would leak into every other test using it
    options = Extractor()
    options.dedup = True
    for _ in range(3):
        assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
    assert trafilatura.htmlprocessing.process_node(my_p, options) is None


if __name__ == "__main__":
    # run every test of this module in definition order when executed directly
    for run_test in (test_hashes, test_simhash, test_lrucache, test_dedup):
        run_test()
87 changes: 8 additions & 79 deletions tests/filters_tests.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,20 @@
# pylint:disable-msg=I1101,W1401
"""
Unit tests for the trafilatura's text filters and cache.
Unit tests for trafilatura's text filters.
"""

# language detection
try:
import py3langid
LANGID_FLAG = True
except ImportError:
LANGID_FLAG = False
from lxml import html


from lxml import etree, html

import trafilatura.filters
from trafilatura import extract, bare_extraction
from trafilatura.core import Extractor
from trafilatura.filters import (check_html_lang, duplicate_test,
language_filter)
from trafilatura.lru import LRUCache
from trafilatura import extract
from trafilatura.metadata import Document
from trafilatura.settings import DEFAULT_CONFIG
from trafilatura.utils import LANGID_FLAG, check_html_lang, language_filter


ZERO_CONFIG = DEFAULT_CONFIG
ZERO_CONFIG['DEFAULT']['MIN_OUTPUT_SIZE'] = '0'
ZERO_CONFIG['DEFAULT']['MIN_EXTRACTED_SIZE'] = '0'

DEFAULT_OPTIONS = Extractor()

SAMPLE_META = Document()


Expand Down Expand Up @@ -59,22 +46,7 @@ def test_filters():
assert extract(doc, include_formatting=True, max_tree_size=500) is None
doc = html.fromstring('<html><body>' + my_p*499 + '</body></html>')
assert extract(doc, include_formatting=True, max_tree_size=500) is not None
## deduplication
doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
assert extract(doc, deduplicate=True) is not None
assert extract(doc, deduplicate=True) is not None
assert extract(doc, deduplicate=True) is not None
assert extract(doc, deduplicate=True) is None
# paragraph level
trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
my_p = etree.fromstring('<p>' + 'abc'*50 + '</p>')
options = DEFAULT_OPTIONS
options.dedup = True
assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
assert trafilatura.htmlprocessing.process_node(my_p, options) is not None
assert trafilatura.htmlprocessing.process_node(my_p, options) is None

# HTML lang filter
# no lang
assert check_html_lang(html.fromstring('<html><body></body></html>'), target_language='en') is True
Expand Down Expand Up @@ -104,61 +76,19 @@ def test_filters():
assert check_html_lang(html.fromstring('<html lang="en-US"><head><meta property="og:locale" content="de_DE" /></head><body></body></html>'), target_language='de', strict=True) is True


def test_lrucache():
'''test basic duplicate detection'''
lru_test = LRUCache(maxsize=2)
trafilatura.filters.LRU_TEST = lru_test
my_body = etree.Element('body')

### element too short
#my_element = html.fromstring('<p>AAAA BBBB</p>')
#my_body.append(my_element)
#put_in_cache(my_body)
#assert duplicate_test(my_element, DEFAULT_CONFIG) is False
### cached element
my_element = html.fromstring('<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>')
my_body.append(my_element)
assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
assert duplicate_test(my_body, DEFAULT_OPTIONS) is False
assert duplicate_test(my_element, DEFAULT_OPTIONS) is True
other_body = etree.Element('body')
other_element = html.fromstring('<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>')
other_body.append(other_element)
assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
assert duplicate_test(other_body, DEFAULT_OPTIONS) is False
assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
yet_another_body = etree.Element('body')
yet_another_element = html.fromstring('<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>')
yet_another_body.append(yet_another_element)
assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
assert duplicate_test(yet_another_body, DEFAULT_OPTIONS) is False
# 2 elements in cache, original element has been cleared?
# print(LRU_TEST.maxsize, LRU_TEST.full)
assert duplicate_test(other_element, DEFAULT_OPTIONS) is True
assert duplicate_test(yet_another_element, DEFAULT_OPTIONS) is True
assert duplicate_test(my_element, DEFAULT_OPTIONS) is False
# clear the cache
lru_test.clear()
assert duplicate_test(other_element, DEFAULT_OPTIONS) is False
# get wrong key
assert lru_test.get('tralala') == -1


def test_prune_xpath():
'''test xpath pruning (parameter in extract and bare_extraction)'''
#create example html
def doc():
my_p = '<p>abc</p>'
return html.fromstring('<html><body>' + my_p*50 + '</body></html>')

def doc2():
my_p = '<p>abc</p>'
my_h1 = '<h1>ABC</h1>'
return html.fromstring('<html><body>' + my_h1 + my_p*50 + '</body></html>')

def doc3():
my_p = '<p>abc</p>'
my_h1 = '<h1>ABC</h1>'
Expand All @@ -178,5 +108,4 @@ def doc3():

if __name__ == '__main__':
test_filters()
test_lrucache()
test_prune_xpath()
50 changes: 0 additions & 50 deletions tests/hashing_tests.py

This file was deleted.

3 changes: 2 additions & 1 deletion tests/sitemaps_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@

import trafilatura
from trafilatura import sitemaps
from trafilatura.utils import decode_file, is_similar_domain
from trafilatura.deduplication import is_similar_domain
from trafilatura.utils import decode_file

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

Expand Down
9 changes: 3 additions & 6 deletions tests/spider_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,17 @@

import logging
import sys

from collections import deque

import pytest

from courlan import UrlStore

from trafilatura import spider
from trafilatura.settings import DEFAULT_CONFIG
from trafilatura.utils import LANGID_FLAG

# language detection
try:
import py3langid
LANGID_FLAG = True
except ImportError:
LANGID_FLAG = False

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

Expand Down
Loading
Loading