diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index 5cf0c854..3ffd477d 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -21,224 +21,244 @@
from trafilatura.utils import LANGID_FLAG
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
+RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "resources")
def test_parser():
- '''test argument parsing for the command-line interface'''
- testargs = ['', '-fvv', '--xmltei', '--no-tables', '-u', 'https://www.example.org']
- with patch.object(sys, 'argv', testargs):
+ """test argument parsing for the command-line interface"""
+ testargs = ["", "-fvv", "--xmltei", "--no-tables", "-u", "https://www.example.org"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert args.fast is True
assert args.verbose == 2
assert args.no_tables is False
assert args.xmltei is True
- assert args.URL == 'https://www.example.org'
+ assert args.URL == "https://www.example.org"
args = cli.map_args(args)
- assert args.output_format == 'xmltei'
- testargs = ['', '-out', 'csv', '--no-tables', '-u', 'https://www.example.org']
- with patch.object(sys, 'argv', testargs):
+ assert args.output_format == "xmltei"
+ testargs = ["", "-out", "csv", "--no-tables", "-u", "https://www.example.org"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert args.fast is False
assert args.verbose == 0
- assert args.output_format == 'csv'
+ assert args.output_format == "csv"
assert args.no_tables is False
# test args mapping
- testargs = ['', '--markdown']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--markdown"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
args = cli.map_args(args)
- assert args.output_format == 'markdown'
- testargs = ['', '--xml', '--no-comments', '--precision', '--recall']
- with patch.object(sys, 'argv', testargs):
+ assert args.output_format == "markdown"
+ testargs = ["", "--xml", "--no-comments", "--precision", "--recall"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
args = cli.map_args(args)
- assert args.output_format == 'xml' and args.no_comments is False
+ assert args.output_format == "xml" and args.no_comments is False
# combination possible (?)
assert args.precision is True and args.recall is True
args.xml, args.csv = False, True
args = cli.map_args(args)
- assert args.output_format == 'csv'
+ assert args.output_format == "csv"
args.csv, args.json = False, True
args = cli.map_args(args)
- assert args.output_format == 'json'
- testargs = ['', '--only-with-metadata']
- with patch.object(sys, 'argv', testargs):
+ assert args.output_format == "json"
+ testargs = ["", "--only-with-metadata"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
args = cli.map_args(args)
assert args.only_with_metadata is True
# process_args
- args.input_dir = '/dev/null'
+ args.input_dir = "/dev/null"
args.verbose = 1
- args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
+ args.blacklist = os.path.join(RESOURCES_DIR, "list-discard.txt")
cli.process_args(args)
assert len(args.blacklist) == 3
# filter
- testargs = ['', '-i', 'resources/list-discard.txt', '--url-filter', 'test1', 'test2']
- with patch.object(sys, 'argv', testargs):
- args = cli.parse_args(testargs)
- assert args.input_file == 'resources/list-discard.txt'
- assert args.url_filter == ['test1', 'test2']
- args.input_file = os.path.join(RESOURCES_DIR, 'list-discard.txt')
- args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
+ testargs = [
+ "",
+ "-i",
+ "resources/list-discard.txt",
+ "--url-filter",
+ "test1",
+ "test2",
+ ]
+ with patch.object(sys, "argv", testargs):
+ args = cli.parse_args(testargs)
+ assert args.input_file == "resources/list-discard.txt"
+ assert args.url_filter == ["test1", "test2"]
+ args.input_file = os.path.join(RESOURCES_DIR, "list-discard.txt")
+ args.blacklist = os.path.join(RESOURCES_DIR, "list-discard.txt")
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
# input directory
- testargs = ['', '--input-dir', 'resources/test/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--input-dir", "resources/test/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
# version
- testargs = ['', '--version']
+ testargs = ["", "--version"]
with pytest.raises(SystemExit) as e, redirect_stdout(f):
- with patch.object(sys, 'argv', testargs):
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert e.type == SystemExit
assert e.value.code == 0
- assert re.match(r'Trafilatura [0-9]\.[0-9]\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]', f.getvalue())
+ assert re.match(
+ r"Trafilatura [0-9]\.[0-9]\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue()
+ )
# test deprecations
- with patch.object(sys, 'argv', ['', '--inputfile', 'test.txt']), pytest.raises(ValueError):
+ with patch.object(sys, "argv", ["", "--inputfile", "test.txt"]), pytest.raises(
+ ValueError
+ ):
cli.map_args(cli.parse_args(testargs))
- for arg in ('--with-metadata', '--nocomments', '--notables', '--hash-as-name'):
- testargs = ['', arg]
- with patch.object(sys, 'argv', testargs), pytest.raises(ValueError):
+ for arg in ("--with-metadata", "--nocomments", "--notables", "--hash-as-name"):
+ testargs = ["", arg]
+ with patch.object(sys, "argv", testargs), pytest.raises(ValueError):
cli.map_args(cli.parse_args(testargs))
- testargs = ['', '--inputdir', 'test1']
- with patch.object(sys, 'argv', testargs), pytest.raises(ValueError):
+ testargs = ["", "--inputdir", "test1"]
+ with patch.object(sys, "argv", testargs), pytest.raises(ValueError):
cli.map_args(cli.parse_args(testargs))
- testargs = ['', '--outputdir', 'test2']
- with patch.object(sys, 'argv', testargs), pytest.raises(ValueError):
+ testargs = ["", "--outputdir", "test2"]
+ with patch.object(sys, "argv", testargs), pytest.raises(ValueError):
cli.map_args(cli.parse_args(testargs))
def test_climain():
- '''test arguments and main CLI entrypoint'''
+ """test arguments and main CLI entrypoint"""
# exit status required: 0
# Windows platforms
- if os.name == 'nt':
+ if os.name == "nt":
trafilatura_bin = os.path.join(sys.prefix, "Scripts", "trafilatura")
# other platforms
else:
- trafilatura_bin = 'trafilatura'
+ trafilatura_bin = "trafilatura"
# help display
- assert subprocess.run([trafilatura_bin, '--help']).returncode == 0
+ assert subprocess.run([trafilatura_bin, "--help"]).returncode == 0
# piped input
-    empty_input = b'\n'
+    empty_input = b"\n"
assert subprocess.run([trafilatura_bin], input=empty_input).returncode == 0
# input directory walking and processing
env = os.environ.copy()
- if os.name == 'nt':
+ if os.name == "nt":
        # Force encoding to utf-8 for Windows (seems to be a problem only in GitHub Actions)
- env['PYTHONIOENCODING'] = 'utf-8'
- assert subprocess.run([trafilatura_bin, '--input-dir', RESOURCES_DIR], env=env).returncode == 0
+ env["PYTHONIOENCODING"] = "utf-8"
+ assert (
+ subprocess.run(
+ [trafilatura_bin, "--input-dir", RESOURCES_DIR], env=env
+ ).returncode
+ == 0
+ )
def test_input_type():
- '''test input type errors'''
- testfile = 'docs/trafilatura-demo.gif'
- testargs = ['', '-u', 'http']
- with patch.object(sys, 'argv', testargs):
+ """test input type errors"""
+ testfile = "docs/trafilatura-demo.gif"
+ testargs = ["", "-u", "http"]
+ with patch.object(sys, "argv", testargs):
assert cli.main() is None
- testargs = ['', '-v']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-v"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(testfile, 'rb') as f:
+ with open(testfile, "rb") as f:
teststring = f.read(1024)
assert cli.examine(teststring, args) is None
- testfile = 'docs/usage.rst'
- with open(testfile, 'r', encoding="utf-8") as f:
+ testfile = "docs/usage.rst"
+ with open(testfile, "r", encoding="utf-8") as f:
teststring = f.read()
assert cli.examine(teststring, args) is None
# test file list
- assert 10 <= len(list(cli_utils.generate_filelist(RESOURCES_DIR))) <= 20
+ assert 10 <= len(list(cli_utils.generate_filelist(RESOURCES_DIR))) <= 21
def test_sysoutput():
- '''test command-line output with respect to CLI arguments'''
- testargs = ['', '--csv', '-o', '/root/forbidden/']
- with patch.object(sys, 'argv', testargs):
+ """test command-line output with respect to CLI arguments"""
+ testargs = ["", "--csv", "-o", "/root/forbidden/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- filepath, destdir = cli_utils.determine_output_path(args, args.output_dir, '')
- assert len(filepath) >= 10 and filepath.endswith('.csv')
- assert destdir == '/root/forbidden/'
+ filepath, destdir = cli_utils.determine_output_path(args, args.output_dir, "")
+ assert len(filepath) >= 10 and filepath.endswith(".csv")
+ assert destdir == "/root/forbidden/"
# doesn't work the same on Windows
- if os.name != 'nt':
+ if os.name != "nt":
assert cli_utils.check_outputdir_status(args.output_dir) is False
else:
assert cli_utils.check_outputdir_status(args.output_dir) is True
- testargs = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--xml", "-o", "/tmp/you-touch-my-tralala"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert cli_utils.check_outputdir_status(args.output_dir) is True
# test fileslug for name
- filepath, destdir = cli_utils.determine_output_path(args, args.output_dir, '', new_filename='AAZZ')
- assert filepath.endswith('AAZZ.xml')
+ filepath, destdir = cli_utils.determine_output_path(
+ args, args.output_dir, "", new_filename="AAZZ"
+ )
+ assert filepath.endswith("AAZZ.xml")
# test json output
args2 = args
args2.xml, args2.json = False, True
args2 = cli.map_args(args2)
- filepath2, destdir2 = cli_utils.determine_output_path(args, args.output_dir, '', new_filename='AAZZ')
- assert filepath2.endswith('AAZZ.json')
+ filepath2, destdir2 = cli_utils.determine_output_path(
+ args, args.output_dir, "", new_filename="AAZZ"
+ )
+ assert filepath2.endswith("AAZZ.json")
assert "you-touch-my-tralala" in destdir2
# test directory counter
# doesn't work the same on Windows
- if os.name != 'nt':
- assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir/1'
+ if os.name != "nt":
+ assert cli_utils.determine_counter_dir("testdir", 0) == "testdir/1"
else:
- assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir\\1'
+ assert cli_utils.determine_counter_dir("testdir", 0) == "testdir\\1"
# test file writing
- testargs = ['', '--csv', '-o', '/dev/null/', '-b', '/dev/null/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--csv", "-o", "/dev/null/", "-b", "/dev/null/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- result = 'DADIDA'
+ result = "DADIDA"
cli_utils.write_result(result, args)
# process with backup directory and no counter
options = args_to_extractor(args)
- assert cli_utils.process_result('DADIDA', args, None, options) is None
+ assert cli_utils.process_result("DADIDA", args, None, options) is None
# test keeping dir structure
- testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt', '')
- assert filepath == 'test/testfile.txt'
+ filepath, destdir = cli_utils.determine_output_path(args, "testfile.txt", "")
+ assert filepath == "test/testfile.txt"
# test hash as output file name
assert args.keep_dirs is True
args.keep_dirs = False
- filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt', '')
- assert filepath == 'test/uOHdo6wKo4IK0pkL.txt'
+ filepath, destdir = cli_utils.determine_output_path(args, "testfile.txt", "")
+ assert filepath == "test/uOHdo6wKo4IK0pkL.txt"
def test_download():
- '''test page download and command-line interface'''
- testargs = ['', '-v']
- with patch.object(sys, 'argv', testargs):
+ """test page download and command-line interface"""
+ testargs = ["", "-v"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert cli.examine(None, args) is None
- assert cli.examine(' ', args) is None
- assert cli.examine('0'*int(10e7), args) is None
- #url = 'https://httpbun.org/status/200'
- #teststring = fetch_url(url)
- #assert teststring is None # too small
- #assert cli.examine(teststring, args, url) is None
- #url = 'https://httpbun.org/links/2/2'
- #teststring = fetch_url(url)
- #assert teststring is not None
- #assert cli.examine(teststring, args, url) is None
- url = 'https://httpbun.com/html'
+ assert cli.examine(" ", args) is None
+ assert cli.examine("0" * int(10e7), args) is None
+ # url = 'https://httpbun.org/status/200'
+ # teststring = fetch_url(url)
+ # assert teststring is None # too small
+ # assert cli.examine(teststring, args, url) is None
+ # url = 'https://httpbun.org/links/2/2'
+ # teststring = fetch_url(url)
+ # assert teststring is not None
+ # assert cli.examine(teststring, args, url) is None
+ url = "https://httpbun.com/html"
teststring = fetch_url(url)
assert teststring is not None
assert cli.examine(teststring, args, url) is not None
# test exit code for faulty URLs
- testargs = ['', '-u', 'https://1234.yz/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-u", "https://1234.yz/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
with pytest.raises(SystemExit) as e:
cli.process_args(args)
@@ -247,33 +267,40 @@ def test_download():
# @patch('trafilatura.settings.MAX_FILES_PER_DIRECTORY', 1)
def test_cli_pipeline():
- '''test command-line processing pipeline'''
+ """test command-line processing pipeline"""
# straight command-line input
- #testargs = ['', 'Text']
- #with patch.object(sys, 'argv', testargs):
+ # testargs = ['', 'Text']
+ # with patch.object(sys, 'argv', testargs):
# args = cli.parse_args(testargs)
- #f = io.StringIO()
- #with redirect_stdout(f):
+ # f = io.StringIO()
+ # with redirect_stdout(f):
# cli.process_args(args)
- #assert len(f.getvalue()) == 0
+ # assert len(f.getvalue()) == 0
# Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
- os.environ['PYTHONIOENCODING'] = "utf-8"
+ os.environ["PYTHONIOENCODING"] = "utf-8"
# test URL listing
- testargs = ['', '--list']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--list"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert cli_utils.url_processing_pipeline(args, UrlStore()) is False
# test inputlist + blacklist
- testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-i", os.path.join(RESOURCES_DIR, "list-process.txt")]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
my_urls = cli_utils.load_input_urls(args)
assert my_urls is not None and len(my_urls) == 3
- testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist', os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived']
- with patch.object(sys, 'argv', testargs):
+ testargs = [
+ "",
+ "-i",
+ os.path.join(RESOURCES_DIR, "list-process.txt"),
+ "--blacklist",
+ os.path.join(RESOURCES_DIR, "list-discard.txt"),
+ "--archived",
+ ]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert args.blacklist is not None
# test backoff between domain requests
@@ -288,53 +315,68 @@ def test_cli_pipeline():
url_store = add_to_compressed_dict(my_urls, args.blacklist, None, None)
cli_utils.url_processing_pipeline(args, url_store)
# test backup
- testargs = ['', '--backup-dir', '/tmp/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--backup-dir", "/tmp/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- cli_utils.archive_html('00Test', args)
+ cli_utils.archive_html("00Test", args)
# test date-based exclusion
- testargs = ['', '-out', 'xml', '--only-with-metadata']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-out", "xml", "--only-with-metadata"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
assert cli.examine(teststring, args) is None
- testargs = ['', '-out', 'xml', '--only-with-metadata', '--precision']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-out", "xml", "--only-with-metadata", "--precision"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
assert cli.examine(teststring, args) is None
# test JSON output
- testargs = ['', '-out', 'json', '--recall']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-out", "json", "--recall"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
assert cli.examine(teststring, args) is not None
# sitemaps: tested in --explore
- testargs = ['', '--sitemap', 'https://sitemaps.org/sitemap.xml', '--list', '--parallel', '1']
- with patch.object(sys, 'argv', testargs):
+ testargs = [
+ "",
+ "--sitemap",
+ "https://sitemaps.org/sitemap.xml",
+ "--list",
+ "--parallel",
+ "1",
+ ]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip().endswith("https://www.sitemaps.org/zh_TW/terms.html")
# CLI options
- testargs = ['', '--links', '--images']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--links", "--images"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "http_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
result = cli.examine(teststring, args)
- assert '[link](testlink.html)' in result and 'test.jpg' in result
+ assert "[link](testlink.html)" in result and "test.jpg" in result
def test_file_processing():
"Test file processing pipeline on actual directories."
# dry-run file processing pipeline
- testargs = ['', '--parallel', '1', '--input-dir', '/dev/null']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
cli_utils.file_processing_pipeline(args)
# file processing pipeline on resources/
@@ -348,10 +390,12 @@ def test_file_processing():
def test_cli_config_file():
"Test if the configuration file is loaded correctly from the CLI."
- testargs = ['', '--input-dir', '/dev/null', '--config-file', 'newsettings.cfg']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--input-dir", "/dev/null", "--config-file", "newsettings.cfg"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
options = args_to_extractor(args)
@@ -359,61 +403,83 @@ def test_cli_config_file():
def test_input_filtering():
- '''test internal functions to filter urls'''
- testargs = ['']
- with patch.object(sys, 'argv', testargs):
+ """test internal functions to filter urls"""
+ testargs = [""]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
# load dictionary
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
url_store = cli.load_input_dict(args)
- assert len(url_store.find_known_urls('https://httpbin.org')) == 3
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
- args.blacklist = {'httpbin.org/status/404'}
+ assert len(url_store.find_known_urls("https://httpbin.org")) == 3
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
+ args.blacklist = {"httpbin.org/status/404"}
url_store = cli.load_input_dict(args)
- assert len(url_store.find_known_urls('https://httpbin.org')) == 2
+ assert len(url_store.find_known_urls("https://httpbin.org")) == 2
# deduplication and filtering
- inputlist = ['https://example.org/1', 'https://example.org/2', 'https://example.org/2', 'https://example.org/3', 'https://example.org/4', 'https://example.org/5', 'https://example.org/6']
- args.blacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
+ inputlist = [
+ "https://example.org/1",
+ "https://example.org/2",
+ "https://example.org/2",
+ "https://example.org/3",
+ "https://example.org/4",
+ "https://example.org/5",
+ "https://example.org/6",
+ ]
+ args.blacklist = {"example.org/1", "example.org/3", "example.org/5"}
url_store = add_to_compressed_dict(inputlist, blacklist=args.blacklist)
- assert url_store.find_known_urls('https://example.org') == ['https://example.org/2', 'https://example.org/4', 'https://example.org/6']
+ assert url_store.find_known_urls("https://example.org") == [
+ "https://example.org/2",
+ "https://example.org/4",
+ "https://example.org/6",
+ ]
# URL in blacklist
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
my_urls = cli_utils.load_input_urls(args)
- my_blacklist = cli_utils.load_blacklist(os.path.join(RESOURCES_DIR, 'list-discard.txt'))
+ my_blacklist = cli_utils.load_blacklist(
+ os.path.join(RESOURCES_DIR, "list-discard.txt")
+ )
url_store = add_to_compressed_dict(my_urls, blacklist=my_blacklist)
assert len(url_store.dump_urls()) == 0
# other method
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
- args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
+ args.blacklist = os.path.join(RESOURCES_DIR, "list-discard.txt")
args.blacklist = cli_utils.load_blacklist(args.blacklist)
url_store = cli_utils.load_input_dict(args)
assert len(url_store.dump_urls()) == 0
# URL filter
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
my_urls = cli_utils.load_input_urls(args)
- url_store = add_to_compressed_dict(my_urls, blacklist=None, url_filter=['status'], url_store=None)
+ url_store = add_to_compressed_dict(
+ my_urls, blacklist=None, url_filter=["status"], url_store=None
+ )
assert len(url_store.urldict) == 1
- url_store = add_to_compressed_dict(my_urls, blacklist=None, url_filter=['teststring'], url_store=None)
+ url_store = add_to_compressed_dict(
+ my_urls, blacklist=None, url_filter=["teststring"], url_store=None
+ )
assert len(url_store.urldict) == 0
- url_store = add_to_compressed_dict(my_urls, blacklist=None, url_filter=['status', 'teststring'], url_store=None)
+ url_store = add_to_compressed_dict(
+ my_urls, blacklist=None, url_filter=["status", "teststring"], url_store=None
+ )
assert len(url_store.urldict) == 1
# malformed URLs
- url_store = add_to_compressed_dict(['123345', 'https://www.example.org/1'])
+ url_store = add_to_compressed_dict(["123345", "https://www.example.org/1"])
assert len(url_store.urldict) == 1
# double URLs
- args.input_file = os.path.join(RESOURCES_DIR, 'redundant-urls.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "redundant-urls.txt")
my_urls = cli_utils.load_input_urls(args)
url_store = add_to_compressed_dict(my_urls)
- assert len(url_store.find_known_urls('https://example.org')) == 1
+ assert len(url_store.find_known_urls("https://example.org")) == 1
# filter before exploration
- input_store = add_to_compressed_dict(["https://example.org/1", "https://sitemaps.org/test"])
+ input_store = add_to_compressed_dict(
+ ["https://example.org/1", "https://sitemaps.org/test"]
+ )
input_urls = ["https://example.org", "http://sitemaps.org/", "https://test.info/"]
url_store = cli_utils.build_exploration_dict(input_store, input_urls, args)
assert url_store.get_known_domains() == ["https://test.info"]
@@ -422,68 +488,78 @@ def test_input_filtering():
def test_crawling():
"Test crawling and exploration functions."
- testargs = ['', '--crawl', '']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--crawl", ""]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)
- testargs = ['', '--crawl', ' ']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--crawl", " "]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)
- testargs = ['', '--crawl', 'https://httpbun.com/html']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--crawl", "https://httpbun.com/html"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
- assert f.getvalue() == 'https://httpbun.com/html\n'
+ assert f.getvalue() == "https://httpbun.com/html\n"
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
- testargs = ['', '--crawl', 'https://httpbun.com/links/1/1', '--list', '--parallel', '1']
- with patch.object(sys, 'argv', testargs):
+ testargs = [
+ "",
+ "--crawl",
+ "https://httpbun.com/links/1/1",
+ "--list",
+ "--parallel",
+ "1",
+ ]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
- assert f.getvalue() in ('https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n', 'https://httpbun.com/links/1/1\n')
+ assert f.getvalue() in (
+ "https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n",
+ "https://httpbun.com/links/1/1\n",
+ )
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
- args.crawl = 'https://httpbun.com/links/4/4'
+ args.crawl = "https://httpbun.com/links/4/4"
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
- assert len(f.getvalue().split('\n')) in (2, 6)
+ assert len(f.getvalue().split("\n")) in (2, 6)
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# Exploration (Sitemap + Crawl)
- testargs = ['', '--explore', 'https://httpbun.com/html', '--list']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--explore", "https://httpbun.com/html", "--list"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
- assert f.getvalue().strip() == 'https://httpbun.com/html'
+ assert f.getvalue().strip() == "https://httpbun.com/html"
def test_probing():
"Test webpage probing functions."
- url = 'https://example.org/'
- conf = os.path.join(RESOURCES_DIR, 'zerolength.cfg')
- testargs = ['', '--probe', url, '--target-language', 'de', '--config-file', conf]
- with patch.object(sys, 'argv', testargs):
+ url = "https://example.org/"
+ conf = os.path.join(RESOURCES_DIR, "zerolength.cfg")
+ testargs = ["", "--probe", url, "--target-language", "de", "--config-file", conf]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
if LANGID_FLAG:
- assert f.getvalue().strip() == ''
- args.target_language = 'en'
+ assert f.getvalue().strip() == ""
+ args.target_language = "en"
f2 = io.StringIO()
with redirect_stdout(f2):
cli.process_args(args)
@@ -492,7 +568,7 @@ def test_probing():
assert f.getvalue().strip() == url
-if __name__ == '__main__':
+if __name__ == "__main__":
test_parser()
test_climain()
test_input_type()
diff --git a/tests/resources/mozilla.org.firefox.developer.html b/tests/resources/mozilla.org.firefox.developer.html
new file mode 100644
index 00000000..a9d746d9
--- /dev/null
+++ b/tests/resources/mozilla.org.firefox.developer.html
@@ -0,0 +1,461 @@
+<!-- 461-line HTML fixture from mozilla.org ("Welcome to Firefox Developer
+     Edition"); the markup was lost in extraction and only the visible text
+     below could be recovered. -->
+Welcome to Firefox Developer Edition
+
+WebIDE
+Develop, deploy and debug Firefox OS apps directly in your browser, or on a
+Firefox OS device, with this tool that replaces App Manager.
+Learn more about WebIDE
+
+Valence
+Develop and debug your apps across multiple browsers and devices with this
+powerful extension that comes pre-installed with Firefox Developer Edition.
+Learn more about Valence
+
+Important: Sync your new profile
+Developer Edition comes with a new profile so you can run it alongside other
+versions of Firefox. To access your bookmarks, browsing history and more, you
+need to sync the profile with your existing Firefox Account, or create a new one.
+Learn more
+
+Portions of this content are ©1998–2015 by individual mozilla.org contributors.
+Content available under a Creative Commons license.
+
+Mozilla:
+Firefox:
\ No newline at end of file
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index dd9a37ef..70b45f94 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -14,6 +14,7 @@
from lxml import etree, html
+
try:
from cchardet import detect
except ImportError:
@@ -29,6 +30,7 @@
handle_table, handle_textelem)
from trafilatura.meta import reset_caches
from trafilatura.metadata import Document
+from trafilatura.readability_lxml import is_probably_readerable
from trafilatura.settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
from trafilatura.utils import (LANGID_FLAG, detect_encoding, is_dubious_html, is_image_file,
language_classifier, load_html, normalize_unicode,
@@ -1292,6 +1294,120 @@ def test_config_loading():
assert config is not None
+def test_is_probably_readerable():
+ """
+ Test is_probably_readerable function.
+ """
+ very_small_str = "hello there"
+ small_str = "hello there " * 11
+ large_str = "hello there " * 12
+ very_large_str = "hello there " * 50
+    linebreaks_str = f"{large_str}<br>" * 10
+
+    very_small_doc = load_html(f"<html><p>{very_small_str}</p></html>")
+    small_doc = load_html(f"<html><p>{small_str}</p></html>")
+    large_doc = load_html(f"<html><p>{large_str}</p></html>")
+    very_large_doc = load_html(f"<html><p>{very_large_str}</p></html>")
+    # class names below are drawn from the unlikelyCandidates and
+    # okMaybeItsACandidate regexes in readability_lxml
+    likely_doc = load_html(
+        f"<html><p class='content'>{very_large_str}</p><p class='main'>{very_large_str}</p><p class='article'>{very_large_str}</p></html>"
+    )
+    unlikely_doc = load_html(
+        f"<html><p class='comment'>{very_large_str}</p><p class='sidebar'>{very_large_str}</p><p class='footer'>{very_large_str}</p></html>"
+    )
+    visible_doc = load_html(
+        f"<html><p>{very_large_str}</p><p>{very_large_str}</p><p>{very_large_str}</p></html>"
+    )
+    invisible_doc = load_html(
+        f"<html><p style='display: none'>{very_large_str}</p><p style='display: none'>{very_large_str}</p><p style='display: none'>{very_large_str}</p></html>"
+    )
+    linebreaks_doc = load_html(f"<html><div>{linebreaks_str * 10}</div></html>")
+    no_linebreaks_doc = load_html(f"<html><div>{large_str * 10}</div></html>")
+
+ # should only declare large documents as readerable when default options
+ assert not is_probably_readerable(very_small_doc)
+ assert not is_probably_readerable(small_doc)
+ assert not is_probably_readerable(large_doc)
+ assert is_probably_readerable(very_large_doc)
+
+ # should declare small and large documents as readerable when lower min_content_length
+ options = {"min_content_length": 120, "min_score": 0}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert is_probably_readerable(small_doc, options)
+ assert is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should only declare largest document as readerable when higher min_content_length
+ options = {"min_content_length": 200, "min_score": 0}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert not is_probably_readerable(small_doc, options)
+ assert not is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should declare large documents as readerable when lower min_score
+ options = {"min_content_length": 0, "min_score": 4}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert is_probably_readerable(small_doc, options)
+ assert is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should declare large documents as readerable when higher min_score
+ options = {"min_content_length": 0, "min_score": 11.5}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert not is_probably_readerable(small_doc, options)
+ assert is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should check id and class attributes
+ assert is_probably_readerable(likely_doc)
+ assert not is_probably_readerable(unlikely_doc)
+
+ # should check linebreaks in div elements
+ assert is_probably_readerable(linebreaks_doc)
+ assert not is_probably_readerable(no_linebreaks_doc)
+
+ called = False
+
+ def visibility_checker_invisible(node):
+ nonlocal called
+ called = True
+ return False
+
+ # should use node visibility checker provided as option - not visible
+ options = {"visibility_checker": visibility_checker_invisible}
+ assert not is_probably_readerable(very_large_doc, options)
+ assert called
+
+ called = False
+
+ def visibility_checker_visible(node):
+ nonlocal called
+ called = True
+ return True
+
+ # should use node visibility checker provided as option - visible
+ options = {"visibility_checker": visibility_checker_visible}
+ assert is_probably_readerable(very_large_doc, options)
+ assert called
+
+ # should use default node visibility checker
+ assert is_probably_readerable(visible_doc)
+ assert not is_probably_readerable(invisible_doc)
+
+ # https://github.com/mozilla/readability/blob/main/test/test-pages/mozilla-2/source.html#L22
+ with open(
+ path.join(RESOURCES_DIR, "mozilla.org.firefox.developer.html"),
+ "r",
+ encoding="utf-8",
+ ) as f:
+ teststring = f.read()
+
+ doc = load_html(teststring)
+ assert not is_probably_readerable(doc)
+
+
if __name__ == '__main__':
test_config_loading()
test_trim()
@@ -1316,3 +1432,4 @@ def test_config_loading():
test_nonstd_html_entities()
test_large_doc_performance()
test_lang_detection()
+ test_is_probably_readerable()
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index e962ef70..5f0e7e3d 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -17,16 +17,16 @@
License of forked code: Apache-2.0.
"""
-
import logging
import re
+from math import sqrt
from operator import attrgetter
from lxml.etree import tostring
from lxml.html import fragment_fromstring
-from .utils import trim
+from .utils import load_html, trim
LOGGER = logging.getLogger(__name__)
@@ -90,6 +90,7 @@ def text_length(elem):
class Candidate:
"Defines a class to score candidate elements."
+
__slots__ = ["score", "elem"]
def __init__(self, score, elem):
@@ -337,7 +338,9 @@ def sanitize(self, node, candidates):
allowed = set()
    # Conditionally clean <table>s, <ul>s, and <div>s
- for elem in reversed(node.xpath("//table|//ul|//div|//aside|//header|//footer|//section")):
+ for elem in reversed(
+ node.xpath("//table|//ul|//div|//aside|//header|//footer|//section")
+ ):
if elem in allowed:
continue
weight = self.class_weight(elem)
@@ -440,3 +443,78 @@ def sanitize(self, node, candidates):
self.doc = node
return _tostring(self.doc)
+
+
+"""
+Port of isProbablyReaderable from mozilla/readability.js to Python.
+
+https://github.com/mozilla/readability
+
+License of forked code: Apache-2.0.
+"""
+
+REGEXPS = {
+ "unlikelyCandidates": re.compile(
+ r"-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote",
+ re.I,
+ ),
+ "okMaybeItsACandidate": re.compile(
+ r"and|article|body|column|content|main|shadow", re.I
+ ),
+}
+
+DISPLAY_NONE = re.compile(r"display:\s*none", re.I)
+
+
+def is_node_visible(node):
+ """
+ Checks if the node is visible by considering style, attributes, and class.
+ """
+
+ if "style" in node.attrib and DISPLAY_NONE.search(node.get("style")):
+ return False
+ if "hidden" in node.attrib:
+ return False
+ if node.get("aria-hidden") == "true" and "fallback-image" not in node.get(
+ "class", ""
+ ):
+ return False
+ return True
+
+
+def is_probably_readerable(html, options=None):
+    """
+    Decide whether the document is readerable without parsing it in full.
+    """
+    doc = load_html(html)
+
+    # an optional mapping avoids the mutable-default-argument pitfall
+    options = options or {}
+    min_content_length = options.get("min_content_length", 140)
+    min_score = options.get("min_score", 20)
+    visibility_checker = options.get("visibility_checker", is_node_visible)
+
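+    # candidate nodes: <p>, <pre> and <article> elements, plus any <div> that
+    # directly contains a <br>, mirroring readability.js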
+ nodes = set(doc.xpath(".//p | .//pre | .//article"))
+ nodes.update(node.getparent() for node in doc.xpath(".//div/br"))
+
+ score = 0
+ for node in nodes:
+ if not visibility_checker(node):
+ continue
+
+ class_and_id = f"{node.get('class', '')} {node.get('id', '')}"
+ if REGEXPS["unlikelyCandidates"].search(class_and_id) and not REGEXPS[
+ "okMaybeItsACandidate"
+ ].search(class_and_id):
+ continue
+
+ if node.xpath("./parent::li/p"):
+ continue
+
+ text_content_length = len(node.text_content().strip())
+ if text_content_length < min_content_length:
+ continue
+
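+        # each qualifying node adds the square root of its excess length, so
+        # longer blocks weigh more but with diminishing returns; return as
+        # soon as the cumulative score clears the threshold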
+ score += sqrt(text_content_length - min_content_length)
+ if score > min_score:
+ return True
+
+ return False
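
A minimal usage sketch of the ported check (illustration only, not part of the
patch): it assumes the patch above is applied and uses trafilatura's public
fetch_url helper; the URL is an arbitrary example.

    from trafilatura import fetch_url
    from trafilatura.readability_lxml import is_probably_readerable

    # probe a page before committing to a full extraction run
    html = fetch_url("https://www.example.org/")
    if html is not None and is_probably_readerable(html):
        print("page looks readerable, worth extracting")

    # thresholds can be loosened via the options mapping, as in the unit tests
    if html is not None:
        relaxed = {"min_content_length": 120, "min_score": 4}
        print(is_probably_readerable(html, relaxed))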