Skip to content

Commit

Permalink
feat(tests): convert unittest to pytest
Browse files Browse the repository at this point in the history
• Loading branch information
AndyTheFactory committed Oct 31, 2023
1 parent 8855f00 commit 45c4e8d
Show file tree
Hide file tree
Showing 16 changed files with 463 additions and 884 deletions.
5 changes: 0 additions & 5 deletions newspaper/mthreading.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,3 @@ def set(self, news_list, threads_per_source=1, override_threads=None):
self.pool.add_task(news_object.download_articles)
else:
self.pool.add_task(news_object.download)
for news_object in news_list:
if isinstance(news_object, Source):
self.pool.add_task(news_object.download_articles)
else:
self.pool.add_task(news_object.download)
6 changes: 4 additions & 2 deletions newspaper/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
MAX_FILE_MEMO = 20000

_STRICT_DATE_REGEX_PREFIX = r"(?<=\W)"
DATE_REGEX = r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}\
(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?"
# Matches a date embedded in a URL path: optional separator, a 4-digit year
# (19xx/20xx), then a month (numeric or 3-5 letter name) and an optional day,
# each delimited by ./-_ or whitespace.
# Both fragments are raw strings: in a plain literal, "\." "\w" "\s" are
# invalid escape sequences (SyntaxWarning on Python 3.12+).
DATE_REGEX = (
    r"([\./\-_\s]?(19|20)\d{2})[\./\-_\s]?"
    r"(([0-3]?[0-9][\./\-_\s])|(\w{3,5}[\./\-_\s]))([0-3]?[0-9][\./\-]?)?"
)
STRICT_DATE_REGEX = _STRICT_DATE_REGEX_PREFIX + DATE_REGEX

ALLOWED_TYPES = [
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
22 changes: 22 additions & 0 deletions tests/test_article.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# pytest file for testing the article class
from pathlib import Path
import pytest
from dateutil.parser import parse as date_parser
import newspaper
Expand Down Expand Up @@ -128,3 +129,24 @@ def test_article_nlp(self, cnn_article):
]
)
assert article.summary.strip() == summary

def test_download_inexisting_file(self):
    """A file:// URL pointing at a missing file must fail cleanly, not raise."""
    missing = Path(__file__).resolve().parent / "data/html/does_not_exist.html"
    article = Article(url=f"file://{missing}")
    article.download()

    assert article.download_state == ArticleDownloadState.FAILED_RESPONSE
    assert article.download_exception_msg == "No such file or directory"
    assert article.html == ""

def test_download_file_schema(self):
    """An existing local fixture downloads successfully via the file:// schema."""
    local_file = Path(__file__).resolve().parent / "data/html/cnn_article.html"
    article = Article(url="file://" + str(local_file))
    article.download()

    assert article.download_state == ArticleDownloadState.SUCCESS
    assert article.download_exception_msg is None
    # Exact byte length of the checked-in cnn_article.html fixture.
    assert len(article.html) == 75404
44 changes: 44 additions & 0 deletions tests/test_configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from newspaper.article import Article
from newspaper.source import Source


class TestConfiguration:
    """Verify that constructor keyword arguments reach the config objects.

    NOTE(review): these are shallow checks of attribute plumbing only.
    """

    def test_article_default_params(self):
        article = Article(
            url="http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html"
        )
        assert article.config.language == "en"
        assert article.config.memoize_articles
        assert article.config.use_meta_language

    def test_article_custom_params(self):
        article = Article(
            url="http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html",
            language="zh",
            memoize_articles=False,
        )
        assert article.config.language == "zh"
        assert not article.config.memoize_articles
        # An explicit language disables meta-language detection.
        assert not article.config.use_meta_language

    def test_source_default_params(self):
        source = Source(url="http://cnn.com")
        assert source.config.language == "en"
        assert source.config.MAX_FILE_MEMO == 20000
        assert source.config.memoize_articles
        assert source.config.use_meta_language

    def test_source_custom_params(self):
        source = Source(
            url="http://cnn.com",
            memoize_articles=False,
            MAX_FILE_MEMO=10000,
            language="en",
        )
        assert not source.config.memoize_articles
        assert source.config.MAX_FILE_MEMO == 10000
        assert source.config.language == "en"
        assert not source.config.use_meta_language
54 changes: 54 additions & 0 deletions tests/test_languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pytest

import newspaper
from tests import conftest


@pytest.fixture
def language_article_fixture():
    """Return (data-file basename, source URL, language code) triples.

    Each basename names a pair of files under tests/data that are loaded via
    conftest.get_data(name, "html") as input and get_data(name, "txt") as the
    expected extraction result (see TestLanguages.test_full_extract).
    """
    return [
        (
            "chinese_article",
            "http://news.sohu.com/20050601/n225789219.shtml",
            "zh",
        ),
        (
            "arabic_article",
            "http://arabic.cnn.com/2013/middle_east/8/2/syria.clashes/index.html",
            "ar",
        ),
        (
            "spanish_article",
            "http://ultimahora.es/mallorca/noticia/noticias/local/fiscal"
            "ia-anticorrupcion-estudia-recurre-imputacion-infanta.html",
            "es",
        ),
        (
            "japanese_article",
            "https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001",
            "ja",
        ),
        (
            "japanese_article2",
            "http://www.afpbb.com/articles/-/3178894",
            "ja",
        ),
        (
            "thai_article",
            "https://prachatai.com/journal/2019/01/80642",
            "th",
        ),
    ]


class TestLanguages:
    """Full download/parse round-trips for non-English fixture articles."""

    def test_full_extract(self, language_article_fixture):
        for name, url, lang in language_article_fixture:
            markup = conftest.get_data(name, "html")
            expected_text = conftest.get_data(name, "txt")

            article = newspaper.Article(url, language=lang)
            article.download(markup)
            article.parse()

            assert article.text.strip() == expected_text.strip()
40 changes: 40 additions & 0 deletions tests/test_misc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
import pytest
import newspaper
from newspaper.configuration import Configuration


def test_hot_trending():
    """newspaper.hot() should return a non-empty collection (live network call)."""
    trending = newspaper.hot()
    assert len(trending) > 0


def test_popular_urls():
    """newspaper.popular_urls() should return a non-empty collection."""
    urls = newspaper.popular_urls()
    assert len(urls) > 0


def test_languages():
    """The library should advertise more than ten supported languages."""
    supported = newspaper.languages()
    assert len(supported) > 10


# Skip on GitHub Actions: this builds three live sites and downloads articles.
# BUGFIX: skipif must get a bool. os.getenv() returns a str (or None), and
# pytest evaluates *string* conditions as Python expressions, so the value
# "true" would raise NameError at collection time instead of skipping.
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS"
)
def test_multithread_download():
    """Smoke-test news_pool: every queued article ends up with downloaded HTML."""
    config = Configuration()
    config.memoize_articles = False

    papers = [
        newspaper.build(url, config=config)
        for url in ("http://slate.com", "http://techcrunch.com", "http://espn.com")
    ]
    # Cap the workload so the test stays reasonably fast.
    for paper in papers:
        paper.articles = paper.articles[:20]

    newspaper.news_pool.set(papers, threads_per_source=2)
    newspaper.news_pool.join()

    for paper in papers:
        assert len(paper.articles[-1].html) > 0
133 changes: 133 additions & 0 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import re
import pytest
from pathlib import Path
from newspaper.extractors import ContentExtractor
from newspaper.parsers import Parser
from newspaper.configuration import Configuration
from newspaper.urls import STRICT_DATE_REGEX, prepare_url, valid_url


@pytest.fixture
def title_fixture():
    """Return (html, expected_title) pairs for ContentExtractor.get_title.

    The cases exercise stripping of a site-name prefix separated by "»"
    (both literal and as the &raquo; entity) while preserving quote
    characters that are part of the title itself.
    """
    return [
        ("<title>Test title</title>", "Test title"),
        ("<title>Test page » Test title</title>", "Test title"),
        ("<title>Test page &raquo; Test title</title>", "Test title"),
        (
            "<title>Test page and «something in quotes»</title>",
            "Test page and «something in quotes»",
        ),
    ]


@pytest.fixture
def canonical_url_fixture():
    """Return (article_url, html) pairs for get_canonical_link.

    Every case is expected to resolve to
    "http://www.example.com/article.html" (see test_canonical_url_extraction):
    absolute <link rel="canonical">, relative canonical links, and og:url
    meta tags with relative or schemeless values.
    """
    return [
        ("", '<link rel="canonical" href="http://www.example.com/article.html">'),
        (
            "http://www.example.com/article?foo=bar",
            '<link rel="canonical" href="article.html">',
        ),
        (
            "http://www.example.com/article?foo=bar",
            '<meta property="og:url" content="article.html">',
        ),
        (
            "http://www.example.com/article?foo=bar",
            '<meta property="og:url" content="www.example.com/article.html">',
        ),
    ]


def get_url_filecontent(filename):
    """Read a space-separated test-data file from tests/data.

    Each non-empty line is split on single spaces and returned as a tuple,
    e.g. "1 http://example.com" -> ("1", "http://example.com").

    Args:
        filename: basename of a file inside the tests/data directory.

    Returns:
        list[tuple[str, ...]] -- one tuple per non-blank line.
    """
    data_file = Path(__file__).parent / "data" / filename
    # Explicit encoding: the default is locale-dependent and the data files
    # contain URLs that must be read as UTF-8 everywhere.
    with open(data_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # Skip blank lines (e.g. a trailing newline) so callers that unpack
    # two- or three-element tuples never receive ('',).
    return [tuple(line.strip().split(" ")) for line in lines if line.strip()]


@pytest.fixture
def meta_image_fixture():
    """Return (html, expected_image_url) pairs for get_meta_img_url.

    The cases encode the apparent precedence order: og:image property first,
    then og:image name, then <link rel="img_src">, <link rel="image_src">,
    and finally <link rel="icon">; an empty string is expected when nothing
    usable is present.
    """
    return [
        (
            '<meta property="og:image" '
            'content="https://example.com/meta_img_filename.jpg" />'
            '<meta name="og:image" '
            'content="https://example.com/meta_another_img_filename.jpg"/>',
            "https://example.com/meta_img_filename.jpg",
        ),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" '
            'content="https://example.com/meta_another_img_filename.jpg"/>',
            "https://example.com/meta_another_img_filename.jpg",
        ),
        ('<meta property="og:image" content="" />' '<meta name="og:image" />', ""),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" />'
            '<link rel="img_src" href="https://example.com/meta_link_image.jpg" />',
            "https://example.com/meta_link_image.jpg",
        ),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" />'
            '<link rel="image_src" href="https://example.com/meta_link_image2.jpg" />',
            "https://example.com/meta_link_image2.jpg",
        ),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" />'
            '<link rel="icon" href="https://example.com/meta_link_rel_icon.ico" />',
            "https://example.com/meta_link_rel_icon.ico",
        ),
    ]


class TestExtractor:
    """Unit tests for ContentExtractor helpers and the URL utilities."""

    def test_title_extraction(self, title_fixture):
        content_extractor = ContentExtractor(Configuration())
        html_parser = Parser()

        for markup, expected_title in title_fixture:
            dom = html_parser.fromstring(markup)
            assert content_extractor.get_title(dom) == expected_title

    def test_canonical_url_extraction(self, canonical_url_fixture):
        content_extractor = ContentExtractor(Configuration())
        html_parser = Parser()
        expected = "http://www.example.com/article.html"

        for article_url, markup in canonical_url_fixture:
            dom = html_parser.fromstring(markup)
            assert content_extractor.get_canonical_link(article_url, dom) == expected

    def test_meta_image_extraction(self, meta_image_fixture):
        content_extractor = ContentExtractor(Configuration())
        html_parser = Parser()
        page_url = "http://www.example.com/article?foo=bar"

        for markup, expected_image in meta_image_fixture:
            dom = html_parser.fromstring(markup)
            assert content_extractor.get_meta_img_url(page_url, dom) == expected_image

    @pytest.mark.skip(reason="Does not pass, not sure what it tests")
    def test_valid_url(self):
        for is_valid, url in get_url_filecontent("test_urls.txt"):
            assert valid_url(url, test=True) == bool(int(is_valid))

    def test_pubdate(self):
        # Regression check of STRICT_DATE_REGEX against a labelled URL list;
        # each line carries a 0/1 flag for whether a pubdate should match.
        for is_pubdate, url in get_url_filecontent("test_urls_pubdate.txt"):
            found = re.search(STRICT_DATE_REGEX, url)
            assert bool(found) == bool(int(is_pubdate)), f"Failed on {url}"

    def test_prepare_url(self):
        for expected, url, source_url in get_url_filecontent("test_prepare_urls.txt"):
            assert expected == prepare_url(url, source_url)
37 changes: 37 additions & 0 deletions tests/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
import pytest
from newspaper import Article


@pytest.fixture
def pdf_article_fixture():
    """URL of a real, publicly hosted PDF used by the download tests."""
    return "https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf"


# Live network download — keep out of CI.
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Do not run in GitHub Actions"
)
def test_pdf_ignore(pdf_article_fixture):
    """Content types listed in ignored_content_types_defaults replace the body."""
    placeholder = "%PDF-"  # empty pdf file
    pdf_mime_types = (
        "application/pdf",
        "application/x-pdf",
        "application/x-bzpdf",
        "application/x-gzpdf",
    )
    article = Article(
        url=pdf_article_fixture,
        ignored_content_types_defaults={mime: placeholder for mime in pdf_mime_types},
    )
    article.download()
    assert article.html == placeholder


# Live network download — keep out of CI.
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Do not run in GitHub Actions"
)
def test_pdf_download(pdf_article_fixture):
    """Without ignore rules the raw PDF body is downloaded verbatim."""
    article = Article(url=pdf_article_fixture)
    article.download()

    body = article.html
    assert body.startswith("%PDF-")
    assert body.strip().endswith("%%EOF")
Loading

0 comments on commit 45c4e8d

Please sign in to comment.