diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index 5cf0c854..3ffd477d 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -21,224 +21,244 @@
from trafilatura.utils import LANGID_FLAG
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
+RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "resources")
def test_parser():
- '''test argument parsing for the command-line interface'''
- testargs = ['', '-fvv', '--xmltei', '--no-tables', '-u', 'https://www.example.org']
- with patch.object(sys, 'argv', testargs):
+ """test argument parsing for the command-line interface"""
+ testargs = ["", "-fvv", "--xmltei", "--no-tables", "-u", "https://www.example.org"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert args.fast is True
assert args.verbose == 2
assert args.no_tables is False
assert args.xmltei is True
- assert args.URL == 'https://www.example.org'
+ assert args.URL == "https://www.example.org"
args = cli.map_args(args)
- assert args.output_format == 'xmltei'
- testargs = ['', '-out', 'csv', '--no-tables', '-u', 'https://www.example.org']
- with patch.object(sys, 'argv', testargs):
+ assert args.output_format == "xmltei"
+ testargs = ["", "-out", "csv", "--no-tables", "-u", "https://www.example.org"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert args.fast is False
assert args.verbose == 0
- assert args.output_format == 'csv'
+ assert args.output_format == "csv"
assert args.no_tables is False
# test args mapping
- testargs = ['', '--markdown']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--markdown"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
args = cli.map_args(args)
- assert args.output_format == 'markdown'
- testargs = ['', '--xml', '--no-comments', '--precision', '--recall']
- with patch.object(sys, 'argv', testargs):
+ assert args.output_format == "markdown"
+ testargs = ["", "--xml", "--no-comments", "--precision", "--recall"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
args = cli.map_args(args)
- assert args.output_format == 'xml' and args.no_comments is False
+ assert args.output_format == "xml" and args.no_comments is False
# combination possible (?)
assert args.precision is True and args.recall is True
args.xml, args.csv = False, True
args = cli.map_args(args)
- assert args.output_format == 'csv'
+ assert args.output_format == "csv"
args.csv, args.json = False, True
args = cli.map_args(args)
- assert args.output_format == 'json'
- testargs = ['', '--only-with-metadata']
- with patch.object(sys, 'argv', testargs):
+ assert args.output_format == "json"
+ testargs = ["", "--only-with-metadata"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
args = cli.map_args(args)
assert args.only_with_metadata is True
# process_args
- args.input_dir = '/dev/null'
+ args.input_dir = "/dev/null"
args.verbose = 1
- args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
+ args.blacklist = os.path.join(RESOURCES_DIR, "list-discard.txt")
cli.process_args(args)
assert len(args.blacklist) == 3
# filter
- testargs = ['', '-i', 'resources/list-discard.txt', '--url-filter', 'test1', 'test2']
- with patch.object(sys, 'argv', testargs):
- args = cli.parse_args(testargs)
- assert args.input_file == 'resources/list-discard.txt'
- assert args.url_filter == ['test1', 'test2']
- args.input_file = os.path.join(RESOURCES_DIR, 'list-discard.txt')
- args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
+ testargs = [
+ "",
+ "-i",
+ "resources/list-discard.txt",
+ "--url-filter",
+ "test1",
+ "test2",
+ ]
+ with patch.object(sys, "argv", testargs):
+ args = cli.parse_args(testargs)
+ assert args.input_file == "resources/list-discard.txt"
+ assert args.url_filter == ["test1", "test2"]
+ args.input_file = os.path.join(RESOURCES_DIR, "list-discard.txt")
+ args.blacklist = os.path.join(RESOURCES_DIR, "list-discard.txt")
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
# input directory
- testargs = ['', '--input-dir', 'resources/test/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--input-dir", "resources/test/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
# version
- testargs = ['', '--version']
+ testargs = ["", "--version"]
with pytest.raises(SystemExit) as e, redirect_stdout(f):
- with patch.object(sys, 'argv', testargs):
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert e.type == SystemExit
assert e.value.code == 0
- assert re.match(r'Trafilatura [0-9]\.[0-9]\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]', f.getvalue())
+ assert re.match(
+ r"Trafilatura [0-9]\.[0-9]\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue()
+ )
# test deprecations
- with patch.object(sys, 'argv', ['', '--inputfile', 'test.txt']), pytest.raises(ValueError):
+ with patch.object(sys, "argv", ["", "--inputfile", "test.txt"]), pytest.raises(
+ ValueError
+ ):
cli.map_args(cli.parse_args(testargs))
- for arg in ('--with-metadata', '--nocomments', '--notables', '--hash-as-name'):
- testargs = ['', arg]
- with patch.object(sys, 'argv', testargs), pytest.raises(ValueError):
+ for arg in ("--with-metadata", "--nocomments", "--notables", "--hash-as-name"):
+ testargs = ["", arg]
+ with patch.object(sys, "argv", testargs), pytest.raises(ValueError):
cli.map_args(cli.parse_args(testargs))
- testargs = ['', '--inputdir', 'test1']
- with patch.object(sys, 'argv', testargs), pytest.raises(ValueError):
+ testargs = ["", "--inputdir", "test1"]
+ with patch.object(sys, "argv", testargs), pytest.raises(ValueError):
cli.map_args(cli.parse_args(testargs))
- testargs = ['', '--outputdir', 'test2']
- with patch.object(sys, 'argv', testargs), pytest.raises(ValueError):
+ testargs = ["", "--outputdir", "test2"]
+ with patch.object(sys, "argv", testargs), pytest.raises(ValueError):
cli.map_args(cli.parse_args(testargs))
def test_climain():
- '''test arguments and main CLI entrypoint'''
+ """test arguments and main CLI entrypoint"""
# exit status required: 0
# Windows platforms
- if os.name == 'nt':
+ if os.name == "nt":
trafilatura_bin = os.path.join(sys.prefix, "Scripts", "trafilatura")
# other platforms
else:
- trafilatura_bin = 'trafilatura'
+ trafilatura_bin = "trafilatura"
# help display
- assert subprocess.run([trafilatura_bin, '--help']).returncode == 0
+ assert subprocess.run([trafilatura_bin, "--help"]).returncode == 0
# piped input
-    empty_input = b'\n'
+    empty_input = b"\n"
assert subprocess.run([trafilatura_bin], input=empty_input).returncode == 0
# input directory walking and processing
env = os.environ.copy()
- if os.name == 'nt':
+ if os.name == "nt":
        # Force encoding to utf-8 for Windows (seems to be a problem only in GitHub Actions)
- env['PYTHONIOENCODING'] = 'utf-8'
- assert subprocess.run([trafilatura_bin, '--input-dir', RESOURCES_DIR], env=env).returncode == 0
+ env["PYTHONIOENCODING"] = "utf-8"
+ assert (
+ subprocess.run(
+ [trafilatura_bin, "--input-dir", RESOURCES_DIR], env=env
+ ).returncode
+ == 0
+ )
def test_input_type():
- '''test input type errors'''
- testfile = 'docs/trafilatura-demo.gif'
- testargs = ['', '-u', 'http']
- with patch.object(sys, 'argv', testargs):
+ """test input type errors"""
+ testfile = "docs/trafilatura-demo.gif"
+ testargs = ["", "-u", "http"]
+ with patch.object(sys, "argv", testargs):
assert cli.main() is None
- testargs = ['', '-v']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-v"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(testfile, 'rb') as f:
+ with open(testfile, "rb") as f:
teststring = f.read(1024)
assert cli.examine(teststring, args) is None
- testfile = 'docs/usage.rst'
- with open(testfile, 'r', encoding="utf-8") as f:
+ testfile = "docs/usage.rst"
+ with open(testfile, "r", encoding="utf-8") as f:
teststring = f.read()
assert cli.examine(teststring, args) is None
# test file list
- assert 10 <= len(list(cli_utils.generate_filelist(RESOURCES_DIR))) <= 20
+ assert 10 <= len(list(cli_utils.generate_filelist(RESOURCES_DIR))) <= 21
def test_sysoutput():
- '''test command-line output with respect to CLI arguments'''
- testargs = ['', '--csv', '-o', '/root/forbidden/']
- with patch.object(sys, 'argv', testargs):
+ """test command-line output with respect to CLI arguments"""
+ testargs = ["", "--csv", "-o", "/root/forbidden/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- filepath, destdir = cli_utils.determine_output_path(args, args.output_dir, '')
- assert len(filepath) >= 10 and filepath.endswith('.csv')
- assert destdir == '/root/forbidden/'
+ filepath, destdir = cli_utils.determine_output_path(args, args.output_dir, "")
+ assert len(filepath) >= 10 and filepath.endswith(".csv")
+ assert destdir == "/root/forbidden/"
# doesn't work the same on Windows
- if os.name != 'nt':
+ if os.name != "nt":
assert cli_utils.check_outputdir_status(args.output_dir) is False
else:
assert cli_utils.check_outputdir_status(args.output_dir) is True
- testargs = ['', '--xml', '-o', '/tmp/you-touch-my-tralala']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--xml", "-o", "/tmp/you-touch-my-tralala"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert cli_utils.check_outputdir_status(args.output_dir) is True
# test fileslug for name
- filepath, destdir = cli_utils.determine_output_path(args, args.output_dir, '', new_filename='AAZZ')
- assert filepath.endswith('AAZZ.xml')
+ filepath, destdir = cli_utils.determine_output_path(
+ args, args.output_dir, "", new_filename="AAZZ"
+ )
+ assert filepath.endswith("AAZZ.xml")
# test json output
args2 = args
args2.xml, args2.json = False, True
args2 = cli.map_args(args2)
- filepath2, destdir2 = cli_utils.determine_output_path(args, args.output_dir, '', new_filename='AAZZ')
- assert filepath2.endswith('AAZZ.json')
+ filepath2, destdir2 = cli_utils.determine_output_path(
+ args, args.output_dir, "", new_filename="AAZZ"
+ )
+ assert filepath2.endswith("AAZZ.json")
assert "you-touch-my-tralala" in destdir2
# test directory counter
# doesn't work the same on Windows
- if os.name != 'nt':
- assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir/1'
+ if os.name != "nt":
+ assert cli_utils.determine_counter_dir("testdir", 0) == "testdir/1"
else:
- assert cli_utils.determine_counter_dir('testdir', 0) == 'testdir\\1'
+ assert cli_utils.determine_counter_dir("testdir", 0) == "testdir\\1"
# test file writing
- testargs = ['', '--csv', '-o', '/dev/null/', '-b', '/dev/null/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--csv", "-o", "/dev/null/", "-b", "/dev/null/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- result = 'DADIDA'
+ result = "DADIDA"
cli_utils.write_result(result, args)
# process with backup directory and no counter
options = args_to_extractor(args)
- assert cli_utils.process_result('DADIDA', args, None, options) is None
+ assert cli_utils.process_result("DADIDA", args, None, options) is None
# test keeping dir structure
- testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt', '')
- assert filepath == 'test/testfile.txt'
+ filepath, destdir = cli_utils.determine_output_path(args, "testfile.txt", "")
+ assert filepath == "test/testfile.txt"
# test hash as output file name
assert args.keep_dirs is True
args.keep_dirs = False
- filepath, destdir = cli_utils.determine_output_path(args, 'testfile.txt', '')
- assert filepath == 'test/uOHdo6wKo4IK0pkL.txt'
+ filepath, destdir = cli_utils.determine_output_path(args, "testfile.txt", "")
+ assert filepath == "test/uOHdo6wKo4IK0pkL.txt"
def test_download():
- '''test page download and command-line interface'''
- testargs = ['', '-v']
- with patch.object(sys, 'argv', testargs):
+ """test page download and command-line interface"""
+ testargs = ["", "-v"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert cli.examine(None, args) is None
- assert cli.examine(' ', args) is None
- assert cli.examine('0'*int(10e7), args) is None
- #url = 'https://httpbun.org/status/200'
- #teststring = fetch_url(url)
- #assert teststring is None # too small
- #assert cli.examine(teststring, args, url) is None
- #url = 'https://httpbun.org/links/2/2'
- #teststring = fetch_url(url)
- #assert teststring is not None
- #assert cli.examine(teststring, args, url) is None
- url = 'https://httpbun.com/html'
+ assert cli.examine(" ", args) is None
+ assert cli.examine("0" * int(10e7), args) is None
+ # url = 'https://httpbun.org/status/200'
+ # teststring = fetch_url(url)
+ # assert teststring is None # too small
+ # assert cli.examine(teststring, args, url) is None
+ # url = 'https://httpbun.org/links/2/2'
+ # teststring = fetch_url(url)
+ # assert teststring is not None
+ # assert cli.examine(teststring, args, url) is None
+ url = "https://httpbun.com/html"
teststring = fetch_url(url)
assert teststring is not None
assert cli.examine(teststring, args, url) is not None
# test exit code for faulty URLs
- testargs = ['', '-u', 'https://1234.yz/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-u", "https://1234.yz/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
with pytest.raises(SystemExit) as e:
cli.process_args(args)
@@ -247,33 +267,40 @@ def test_download():
# @patch('trafilatura.settings.MAX_FILES_PER_DIRECTORY', 1)
def test_cli_pipeline():
- '''test command-line processing pipeline'''
+ """test command-line processing pipeline"""
# straight command-line input
- #testargs = ['', 'Text']
- #with patch.object(sys, 'argv', testargs):
+ # testargs = ['', 'Text']
+ # with patch.object(sys, 'argv', testargs):
# args = cli.parse_args(testargs)
- #f = io.StringIO()
- #with redirect_stdout(f):
+ # f = io.StringIO()
+ # with redirect_stdout(f):
# cli.process_args(args)
- #assert len(f.getvalue()) == 0
+ # assert len(f.getvalue()) == 0
# Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
- os.environ['PYTHONIOENCODING'] = "utf-8"
+ os.environ["PYTHONIOENCODING"] = "utf-8"
# test URL listing
- testargs = ['', '--list']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--list"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert cli_utils.url_processing_pipeline(args, UrlStore()) is False
# test inputlist + blacklist
- testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-i", os.path.join(RESOURCES_DIR, "list-process.txt")]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
my_urls = cli_utils.load_input_urls(args)
assert my_urls is not None and len(my_urls) == 3
- testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt'), '--blacklist', os.path.join(RESOURCES_DIR, 'list-discard.txt'), '--archived']
- with patch.object(sys, 'argv', testargs):
+ testargs = [
+ "",
+ "-i",
+ os.path.join(RESOURCES_DIR, "list-process.txt"),
+ "--blacklist",
+ os.path.join(RESOURCES_DIR, "list-discard.txt"),
+ "--archived",
+ ]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
assert args.blacklist is not None
# test backoff between domain requests
@@ -288,53 +315,68 @@ def test_cli_pipeline():
url_store = add_to_compressed_dict(my_urls, args.blacklist, None, None)
cli_utils.url_processing_pipeline(args, url_store)
# test backup
- testargs = ['', '--backup-dir', '/tmp/']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--backup-dir", "/tmp/"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- cli_utils.archive_html('00Test', args)
+ cli_utils.archive_html("00Test", args)
# test date-based exclusion
- testargs = ['', '-out', 'xml', '--only-with-metadata']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-out", "xml", "--only-with-metadata"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
assert cli.examine(teststring, args) is None
- testargs = ['', '-out', 'xml', '--only-with-metadata', '--precision']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-out", "xml", "--only-with-metadata", "--precision"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
assert cli.examine(teststring, args) is None
# test JSON output
- testargs = ['', '-out', 'json', '--recall']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "-out", "json", "--recall"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
assert cli.examine(teststring, args) is not None
# sitemaps: tested in --explore
- testargs = ['', '--sitemap', 'https://sitemaps.org/sitemap.xml', '--list', '--parallel', '1']
- with patch.object(sys, 'argv', testargs):
+ testargs = [
+ "",
+ "--sitemap",
+ "https://sitemaps.org/sitemap.xml",
+ "--list",
+ "--parallel",
+ "1",
+ ]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip().endswith("https://www.sitemaps.org/zh_TW/terms.html")
# CLI options
- testargs = ['', '--links', '--images']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--links", "--images"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "http_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
result = cli.examine(teststring, args)
- assert '[link](testlink.html)' in result and 'test.jpg' in result
+ assert "[link](testlink.html)" in result and "test.jpg" in result
def test_file_processing():
"Test file processing pipeline on actual directories."
# dry-run file processing pipeline
- testargs = ['', '--parallel', '1', '--input-dir', '/dev/null']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
cli_utils.file_processing_pipeline(args)
# file processing pipeline on resources/
@@ -348,10 +390,12 @@ def test_file_processing():
def test_cli_config_file():
"Test if the configuration file is loaded correctly from the CLI."
- testargs = ['', '--input-dir', '/dev/null', '--config-file', 'newsettings.cfg']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--input-dir", "/dev/null", "--config-file", "newsettings.cfg"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
+ with open(
+ os.path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
teststring = f.read()
args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
options = args_to_extractor(args)
@@ -359,61 +403,83 @@ def test_cli_config_file():
def test_input_filtering():
- '''test internal functions to filter urls'''
- testargs = ['']
- with patch.object(sys, 'argv', testargs):
+ """test internal functions to filter urls"""
+ testargs = [""]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
# load dictionary
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
url_store = cli.load_input_dict(args)
- assert len(url_store.find_known_urls('https://httpbin.org')) == 3
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
- args.blacklist = {'httpbin.org/status/404'}
+ assert len(url_store.find_known_urls("https://httpbin.org")) == 3
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
+ args.blacklist = {"httpbin.org/status/404"}
url_store = cli.load_input_dict(args)
- assert len(url_store.find_known_urls('https://httpbin.org')) == 2
+ assert len(url_store.find_known_urls("https://httpbin.org")) == 2
# deduplication and filtering
- inputlist = ['https://example.org/1', 'https://example.org/2', 'https://example.org/2', 'https://example.org/3', 'https://example.org/4', 'https://example.org/5', 'https://example.org/6']
- args.blacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
+ inputlist = [
+ "https://example.org/1",
+ "https://example.org/2",
+ "https://example.org/2",
+ "https://example.org/3",
+ "https://example.org/4",
+ "https://example.org/5",
+ "https://example.org/6",
+ ]
+ args.blacklist = {"example.org/1", "example.org/3", "example.org/5"}
url_store = add_to_compressed_dict(inputlist, blacklist=args.blacklist)
- assert url_store.find_known_urls('https://example.org') == ['https://example.org/2', 'https://example.org/4', 'https://example.org/6']
+ assert url_store.find_known_urls("https://example.org") == [
+ "https://example.org/2",
+ "https://example.org/4",
+ "https://example.org/6",
+ ]
# URL in blacklist
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
my_urls = cli_utils.load_input_urls(args)
- my_blacklist = cli_utils.load_blacklist(os.path.join(RESOURCES_DIR, 'list-discard.txt'))
+ my_blacklist = cli_utils.load_blacklist(
+ os.path.join(RESOURCES_DIR, "list-discard.txt")
+ )
url_store = add_to_compressed_dict(my_urls, blacklist=my_blacklist)
assert len(url_store.dump_urls()) == 0
# other method
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
- args.blacklist = os.path.join(RESOURCES_DIR, 'list-discard.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
+ args.blacklist = os.path.join(RESOURCES_DIR, "list-discard.txt")
args.blacklist = cli_utils.load_blacklist(args.blacklist)
url_store = cli_utils.load_input_dict(args)
assert len(url_store.dump_urls()) == 0
# URL filter
- args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "list-process.txt")
my_urls = cli_utils.load_input_urls(args)
- url_store = add_to_compressed_dict(my_urls, blacklist=None, url_filter=['status'], url_store=None)
+ url_store = add_to_compressed_dict(
+ my_urls, blacklist=None, url_filter=["status"], url_store=None
+ )
assert len(url_store.urldict) == 1
- url_store = add_to_compressed_dict(my_urls, blacklist=None, url_filter=['teststring'], url_store=None)
+ url_store = add_to_compressed_dict(
+ my_urls, blacklist=None, url_filter=["teststring"], url_store=None
+ )
assert len(url_store.urldict) == 0
- url_store = add_to_compressed_dict(my_urls, blacklist=None, url_filter=['status', 'teststring'], url_store=None)
+ url_store = add_to_compressed_dict(
+ my_urls, blacklist=None, url_filter=["status", "teststring"], url_store=None
+ )
assert len(url_store.urldict) == 1
# malformed URLs
- url_store = add_to_compressed_dict(['123345', 'https://www.example.org/1'])
+ url_store = add_to_compressed_dict(["123345", "https://www.example.org/1"])
assert len(url_store.urldict) == 1
# double URLs
- args.input_file = os.path.join(RESOURCES_DIR, 'redundant-urls.txt')
+ args.input_file = os.path.join(RESOURCES_DIR, "redundant-urls.txt")
my_urls = cli_utils.load_input_urls(args)
url_store = add_to_compressed_dict(my_urls)
- assert len(url_store.find_known_urls('https://example.org')) == 1
+ assert len(url_store.find_known_urls("https://example.org")) == 1
# filter before exploration
- input_store = add_to_compressed_dict(["https://example.org/1", "https://sitemaps.org/test"])
+ input_store = add_to_compressed_dict(
+ ["https://example.org/1", "https://sitemaps.org/test"]
+ )
input_urls = ["https://example.org", "http://sitemaps.org/", "https://test.info/"]
url_store = cli_utils.build_exploration_dict(input_store, input_urls, args)
assert url_store.get_known_domains() == ["https://test.info"]
@@ -422,68 +488,78 @@ def test_input_filtering():
def test_crawling():
"Test crawling and exploration functions."
- testargs = ['', '--crawl', '']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--crawl", ""]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)
- testargs = ['', '--crawl', ' ']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--crawl", " "]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)
- testargs = ['', '--crawl', 'https://httpbun.com/html']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--crawl", "https://httpbun.com/html"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
- assert f.getvalue() == 'https://httpbun.com/html\n'
+ assert f.getvalue() == "https://httpbun.com/html\n"
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
- testargs = ['', '--crawl', 'https://httpbun.com/links/1/1', '--list', '--parallel', '1']
- with patch.object(sys, 'argv', testargs):
+ testargs = [
+ "",
+ "--crawl",
+ "https://httpbun.com/links/1/1",
+ "--list",
+ "--parallel",
+ "1",
+ ]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
- assert f.getvalue() in ('https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n', 'https://httpbun.com/links/1/1\n')
+ assert f.getvalue() in (
+ "https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n",
+ "https://httpbun.com/links/1/1\n",
+ )
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
- args.crawl = 'https://httpbun.com/links/4/4'
+ args.crawl = "https://httpbun.com/links/4/4"
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
- assert len(f.getvalue().split('\n')) in (2, 6)
+ assert len(f.getvalue().split("\n")) in (2, 6)
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# Exploration (Sitemap + Crawl)
- testargs = ['', '--explore', 'https://httpbun.com/html', '--list']
- with patch.object(sys, 'argv', testargs):
+ testargs = ["", "--explore", "https://httpbun.com/html", "--list"]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
- assert f.getvalue().strip() == 'https://httpbun.com/html'
+ assert f.getvalue().strip() == "https://httpbun.com/html"
def test_probing():
"Test webpage probing functions."
- url = 'https://example.org/'
- conf = os.path.join(RESOURCES_DIR, 'zerolength.cfg')
- testargs = ['', '--probe', url, '--target-language', 'de', '--config-file', conf]
- with patch.object(sys, 'argv', testargs):
+ url = "https://example.org/"
+ conf = os.path.join(RESOURCES_DIR, "zerolength.cfg")
+ testargs = ["", "--probe", url, "--target-language", "de", "--config-file", conf]
+ with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
if LANGID_FLAG:
- assert f.getvalue().strip() == ''
- args.target_language = 'en'
+ assert f.getvalue().strip() == ""
+ args.target_language = "en"
f2 = io.StringIO()
with redirect_stdout(f2):
cli.process_args(args)
@@ -492,7 +568,7 @@ def test_probing():
assert f.getvalue().strip() == url
-if __name__ == '__main__':
+if __name__ == "__main__":
test_parser()
test_climain()
test_input_type()
diff --git a/tests/resources/mozilla.org.firefox.developer.html b/tests/resources/mozilla.org.firefox.developer.html
new file mode 100644
index 00000000..a9d746d9
--- /dev/null
+++ b/tests/resources/mozilla.org.firefox.developer.html
@@ -0,0 +1,461 @@
+<!-- 461-line HTML fixture from mozilla.org ("Welcome to Firefox Developer
+     Edition"); the markup was lost in extraction and only the visible text
+     below could be recovered. -->
+Welcome to Firefox Developer Edition
+
+WebIDE
+Develop, deploy and debug Firefox OS apps directly in your browser, or on a
+Firefox OS device, with this tool that replaces App Manager.
+Learn more about WebIDE
+
+Valence
+Develop and debug your apps across multiple browsers and devices with this
+powerful extension that comes pre-installed with Firefox Developer Edition.
+Learn more about Valence
+
+Important: Sync your new profile
+Developer Edition comes with a new profile so you can run it alongside other
+versions of Firefox. To access your bookmarks, browsing history and more, you
+need to sync the profile with your existing Firefox Account, or create a new one.
+Learn more
+
+Portions of this content are ©1998–2015 by individual mozilla.org contributors.
+Content available under a Creative Commons license.
+
+Mozilla:
+Firefox:
\ No newline at end of file
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index dd9a37ef..70b45f94 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -14,6 +14,7 @@
from lxml import etree, html
+
try:
from cchardet import detect
except ImportError:
@@ -29,6 +30,7 @@
handle_table, handle_textelem)
from trafilatura.meta import reset_caches
from trafilatura.metadata import Document
+from trafilatura.readability_lxml import is_probably_readerable
from trafilatura.settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
from trafilatura.utils import (LANGID_FLAG, detect_encoding, is_dubious_html, is_image_file,
language_classifier, load_html, normalize_unicode,
@@ -1292,6 +1294,120 @@ def test_config_loading():
assert config is not None
+def test_is_probably_readerable():
+ """
+ Test is_probably_readerable function.
+ """
+ very_small_str = "hello there"
+ small_str = "hello there " * 11
+ large_str = "hello there " * 12
+ very_large_str = "hello there " * 50
+    linebreaks_str = f"{large_str}<br>" * 10
+
+    very_small_doc = load_html(f"<html><p>{very_small_str}</p></html>")
+    small_doc = load_html(f"<html><p>{small_str}</p></html>")
+    large_doc = load_html(f"<html><p>{large_str}</p></html>")
+    very_large_doc = load_html(f"<html><p>{very_large_str}</p></html>")
+    # class names below are drawn from the unlikelyCandidates and
+    # okMaybeItsACandidate regexes in readability_lxml
+    likely_doc = load_html(
+        f"<html><p class='content'>{very_large_str}</p><p class='main'>{very_large_str}</p><p class='article'>{very_large_str}</p></html>"
+    )
+    unlikely_doc = load_html(
+        f"<html><p class='comment'>{very_large_str}</p><p class='sidebar'>{very_large_str}</p><p class='footer'>{very_large_str}</p></html>"
+    )
+    visible_doc = load_html(
+        f"<html><p>{very_large_str}</p><p>{very_large_str}</p><p>{very_large_str}</p></html>"
+    )
+    invisible_doc = load_html(
+        f"<html><p style='display: none'>{very_large_str}</p><p style='display: none'>{very_large_str}</p><p style='display: none'>{very_large_str}</p></html>"
+    )
+    linebreaks_doc = load_html(f"<html><div>{linebreaks_str * 10}</div></html>")
+    no_linebreaks_doc = load_html(f"<html><div>{large_str * 10}</div></html>")
+
+ # should only declare large documents as readerable when default options
+ assert not is_probably_readerable(very_small_doc)
+ assert not is_probably_readerable(small_doc)
+ assert not is_probably_readerable(large_doc)
+ assert is_probably_readerable(very_large_doc)
+
+ # should declare small and large documents as readerable when lower min_content_length
+ options = {"min_content_length": 120, "min_score": 0}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert is_probably_readerable(small_doc, options)
+ assert is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should only declare largest document as readerable when higher min_content_length
+ options = {"min_content_length": 200, "min_score": 0}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert not is_probably_readerable(small_doc, options)
+ assert not is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should declare large documents as readerable when lower min_score
+ options = {"min_content_length": 0, "min_score": 4}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert is_probably_readerable(small_doc, options)
+ assert is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should declare large documents as readerable when higher min_score
+ options = {"min_content_length": 0, "min_score": 11.5}
+ assert not is_probably_readerable(very_small_doc, options)
+ assert not is_probably_readerable(small_doc, options)
+ assert is_probably_readerable(large_doc, options)
+ assert is_probably_readerable(very_large_doc, options)
+
+ # should check id and class attributes
+ assert is_probably_readerable(likely_doc)
+ assert not is_probably_readerable(unlikely_doc)
+
+ # should check linebreaks in div elements
+ assert is_probably_readerable(linebreaks_doc)
+ assert not is_probably_readerable(no_linebreaks_doc)
+
+ called = False
+
+ def visibility_checker_invisible(node):
+ nonlocal called
+ called = True
+ return False
+
+ # should use node visibility checker provided as option - not visible
+ options = {"visibility_checker": visibility_checker_invisible}
+ assert not is_probably_readerable(very_large_doc, options)
+ assert called
+
+ called = False
+
+ def visibility_checker_visible(node):
+ nonlocal called
+ called = True
+ return True
+
+ # should use node visibility checker provided as option - visible
+ options = {"visibility_checker": visibility_checker_visible}
+ assert is_probably_readerable(very_large_doc, options)
+ assert called
+
+ # should use default node visibility checker
+ assert is_probably_readerable(visible_doc)
+ assert not is_probably_readerable(invisible_doc)
+
+ # https://github.com/mozilla/readability/blob/main/test/test-pages/mozilla-2/source.html#L22
+ with open(
+ path.join(RESOURCES_DIR, "mozilla.org.firefox.developer.html"),
+ "r",
+ encoding="utf-8",
+ ) as f:
+ teststring = f.read()
+
+ doc = load_html(teststring)
+ assert not is_probably_readerable(doc)
+
+
if __name__ == '__main__':
test_config_loading()
test_trim()
@@ -1316,3 +1432,4 @@ def test_config_loading():
test_nonstd_html_entities()
test_large_doc_performance()
test_lang_detection()
+ test_is_probably_readerable()
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index e962ef70..5f0e7e3d 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -17,16 +17,16 @@
License of forked code: Apache-2.0.
"""
-
import logging
import re
+from math import sqrt
from operator import attrgetter
from lxml.etree import tostring
from lxml.html import fragment_fromstring
-from .utils import trim
+from .utils import load_html, trim
LOGGER = logging.getLogger(__name__)
@@ -90,6 +90,7 @@ def text_length(elem):
class Candidate:
"Defines a class to score candidate elements."
+
__slots__ = ["score", "elem"]
def __init__(self, score, elem):
@@ -337,7 +338,9 @@ def sanitize(self, node, candidates):
allowed = set()
    # Conditionally clean <table>s, <ul>s, and <div>s
- for elem in reversed(node.xpath("//table|//ul|//div|//aside|//header|//footer|//section")):
+ for elem in reversed(
+ node.xpath("//table|//ul|//div|//aside|//header|//footer|//section")
+ ):
if elem in allowed:
continue
weight = self.class_weight(elem)
@@ -440,3 +443,78 @@ def sanitize(self, node, candidates):
self.doc = node
return _tostring(self.doc)
+
+
+"""
+Port of isProbablyReaderable from mozilla/readability.js to Python.
+
+https://github.com/mozilla/readability
+
+License of forked code: Apache-2.0.
+"""
+
+REGEXPS = {
+ "unlikelyCandidates": re.compile(
+ r"-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote",
+ re.I,
+ ),
+ "okMaybeItsACandidate": re.compile(
+ r"and|article|body|column|content|main|shadow", re.I
+ ),
+}
+
+DISPLAY_NONE = re.compile(r"display:\s*none", re.I)
+
+
+def is_node_visible(node):
+ """
+ Checks if the node is visible by considering style, attributes, and class.
+ """
+
+ if "style" in node.attrib and DISPLAY_NONE.search(node.get("style")):
+ return False
+ if "hidden" in node.attrib:
+ return False
+ if node.get("aria-hidden") == "true" and "fallback-image" not in node.get(
+ "class", ""
+ ):
+ return False
+ return True
+
+
+def is_probably_readerable(html, options=None):
+    """
+    Decide whether the document is readerable without parsing it in full.
+    """
+    doc = load_html(html)
+
+    # an optional mapping avoids the mutable-default-argument pitfall
+    options = options or {}
+    min_content_length = options.get("min_content_length", 140)
+    min_score = options.get("min_score", 20)
+    visibility_checker = options.get("visibility_checker", is_node_visible)
+
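+    # candidate nodes: <p>, <pre> and <article> elements, plus any <div> that
+    # directly contains a <br>, mirroring readability.js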
+ nodes = set(doc.xpath(".//p | .//pre | .//article"))
+ nodes.update(node.getparent() for node in doc.xpath(".//div/br"))
+
+ score = 0
+ for node in nodes:
+ if not visibility_checker(node):
+ continue
+
+ class_and_id = f"{node.get('class', '')} {node.get('id', '')}"
+ if REGEXPS["unlikelyCandidates"].search(class_and_id) and not REGEXPS[
+ "okMaybeItsACandidate"
+ ].search(class_and_id):
+ continue
+
+ if node.xpath("./parent::li/p"):
+ continue
+
+ text_content_length = len(node.text_content().strip())
+ if text_content_length < min_content_length:
+ continue
+
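+        # each qualifying node adds the square root of its excess length, so
+        # longer blocks weigh more but with diminishing returns; return as
+        # soon as the cumulative score clears the threshold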
+ score += sqrt(text_content_length - min_content_length)
+ if score > min_score:
+ return True
+
+ return False
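
A minimal usage sketch of the ported check (illustration only, not part of the
patch): it assumes the patch above is applied and uses trafilatura's public
fetch_url helper; the URL is an arbitrary example.

    from trafilatura import fetch_url
    from trafilatura.readability_lxml import is_probably_readerable

    # probe a page before committing to a full extraction run
    html = fetch_url("https://www.example.org/")
    if html is not None and is_probably_readerable(html):
        print("page looks readerable, worth extracting")

    # thresholds can be loosened via the options mapping, as in the unit tests
    if html is not None:
        relaxed = {"min_content_length": 120, "min_score": 4}
        print(is_probably_readerable(html, relaxed))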