Skip to content

Commit

Permalink
feat(tests): convert unittest to pytest
Browse files Browse the repository at this point in the history
• Loading branch information
AndyTheFactory committed Oct 31, 2023
1 parent 8855f00 commit 45c4e8d
Show file tree
Hide file tree
Showing 16 changed files with 463 additions and 884 deletions.
5 changes: 0 additions & 5 deletions newspaper/mthreading.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,3 @@ def set(self, news_list, threads_per_source=1, override_threads=None):
self.pool.add_task(news_object.download_articles)
else:
self.pool.add_task(news_object.download)
for news_object in news_list:
if isinstance(news_object, Source):
self.pool.add_task(news_object.download_articles)
else:
self.pool.add_task(news_object.download)
6 changes: 4 additions & 2 deletions newspaper/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
MAX_FILE_MEMO = 20000

_STRICT_DATE_REGEX_PREFIX = r"(?<=\W)"
DATE_REGEX = r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}\
(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?"
# Matches a date embedded in a URL path: optional separator, a 4-digit year
# (19xx/20xx), then a month (numeric or 3-5 letter name) and an optional day,
# each delimited by ./-_ or whitespace.
# Both fragments are raw strings: in a plain literal, "\." "\w" "\s" are
# invalid escape sequences (SyntaxWarning on Python 3.12+).
DATE_REGEX = (
    r"([\./\-_\s]?(19|20)\d{2})[\./\-_\s]?"
    r"(([0-3]?[0-9][\./\-_\s])|(\w{3,5}[\./\-_\s]))([0-3]?[0-9][\./\-]?)?"
)
STRICT_DATE_REGEX = _STRICT_DATE_REGEX_PREFIX + DATE_REGEX

ALLOWED_TYPES = [
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
22 changes: 22 additions & 0 deletions tests/test_article.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# pytest file for testing the article class
from pathlib import Path
import pytest
from dateutil.parser import parse as date_parser
import newspaper
Expand Down Expand Up @@ -128,3 +129,24 @@ def test_article_nlp(self, cnn_article):
]
)
assert article.summary.strip() == summary

def test_download_inexisting_file(self):
    """A file:// URL pointing at a missing file must fail cleanly, not raise."""
    missing = Path(__file__).resolve().parent / "data/html/does_not_exist.html"
    article = Article(url=f"file://{missing}")
    article.download()

    assert article.download_state == ArticleDownloadState.FAILED_RESPONSE
    assert article.download_exception_msg == "No such file or directory"
    assert article.html == ""

def test_download_file_schema(self):
    """An existing local fixture downloads successfully via the file:// schema."""
    local_file = Path(__file__).resolve().parent / "data/html/cnn_article.html"
    article = Article(url="file://" + str(local_file))
    article.download()

    assert article.download_state == ArticleDownloadState.SUCCESS
    assert article.download_exception_msg is None
    # Exact byte length of the checked-in cnn_article.html fixture.
    assert len(article.html) == 75404
44 changes: 44 additions & 0 deletions tests/test_configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from newspaper.article import Article
from newspaper.source import Source


class TestConfiguration:
    """Verify that constructor keyword arguments reach the config objects.

    NOTE(review): these are shallow checks of attribute plumbing only.
    """

    def test_article_default_params(self):
        article = Article(
            url="http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html"
        )
        assert article.config.language == "en"
        assert article.config.memoize_articles
        assert article.config.use_meta_language

    def test_article_custom_params(self):
        article = Article(
            url="http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html",
            language="zh",
            memoize_articles=False,
        )
        assert article.config.language == "zh"
        assert not article.config.memoize_articles
        # An explicit language disables meta-language detection.
        assert not article.config.use_meta_language

    def test_source_default_params(self):
        source = Source(url="http://cnn.com")
        assert source.config.language == "en"
        assert source.config.MAX_FILE_MEMO == 20000
        assert source.config.memoize_articles
        assert source.config.use_meta_language

    def test_source_custom_params(self):
        source = Source(
            url="http://cnn.com",
            memoize_articles=False,
            MAX_FILE_MEMO=10000,
            language="en",
        )
        assert not source.config.memoize_articles
        assert source.config.MAX_FILE_MEMO == 10000
        assert source.config.language == "en"
        assert not source.config.use_meta_language
54 changes: 54 additions & 0 deletions tests/test_languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pytest

import newspaper
from tests import conftest


@pytest.fixture
def language_article_fixture():
    """Return (data-file basename, source URL, language code) triples.

    Each basename names a pair of files under tests/data that are loaded via
    conftest.get_data(name, "html") as input and get_data(name, "txt") as the
    expected extraction result (see TestLanguages.test_full_extract).
    """
    return [
        (
            "chinese_article",
            "http://news.sohu.com/20050601/n225789219.shtml",
            "zh",
        ),
        (
            "arabic_article",
            "http://arabic.cnn.com/2013/middle_east/8/2/syria.clashes/index.html",
            "ar",
        ),
        (
            "spanish_article",
            "http://ultimahora.es/mallorca/noticia/noticias/local/fiscal"
            "ia-anticorrupcion-estudia-recurre-imputacion-infanta.html",
            "es",
        ),
        (
            "japanese_article",
            "https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001",
            "ja",
        ),
        (
            "japanese_article2",
            "http://www.afpbb.com/articles/-/3178894",
            "ja",
        ),
        (
            "thai_article",
            "https://prachatai.com/journal/2019/01/80642",
            "th",
        ),
    ]


class TestLanguages:
    """Full download/parse round-trips for non-English fixture articles."""

    def test_full_extract(self, language_article_fixture):
        for name, url, lang in language_article_fixture:
            markup = conftest.get_data(name, "html")
            expected_text = conftest.get_data(name, "txt")

            article = newspaper.Article(url, language=lang)
            article.download(markup)
            article.parse()

            assert article.text.strip() == expected_text.strip()
40 changes: 40 additions & 0 deletions tests/test_misc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
import pytest
import newspaper
from newspaper.configuration import Configuration


def test_hot_trending():
    """newspaper.hot() should return a non-empty collection (live network call)."""
    trending = newspaper.hot()
    assert len(trending) > 0


def test_popular_urls():
    """newspaper.popular_urls() should return a non-empty collection."""
    urls = newspaper.popular_urls()
    assert len(urls) > 0


def test_languages():
    """The library should advertise more than ten supported languages."""
    supported = newspaper.languages()
    assert len(supported) > 10


# Skip on GitHub Actions: this builds three live sites and downloads articles.
# BUGFIX: skipif must get a bool. os.getenv() returns a str (or None), and
# pytest evaluates *string* conditions as Python expressions, so the value
# "true" would raise NameError at collection time instead of skipping.
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS"
)
def test_multithread_download():
    """Smoke-test news_pool: every queued article ends up with downloaded HTML."""
    config = Configuration()
    config.memoize_articles = False

    papers = [
        newspaper.build(url, config=config)
        for url in ("http://slate.com", "http://techcrunch.com", "http://espn.com")
    ]
    # Cap the workload so the test stays reasonably fast.
    for paper in papers:
        paper.articles = paper.articles[:20]

    newspaper.news_pool.set(papers, threads_per_source=2)
    newspaper.news_pool.join()

    for paper in papers:
        assert len(paper.articles[-1].html) > 0
133 changes: 133 additions & 0 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import re
import pytest
from pathlib import Path
from newspaper.extractors import ContentExtractor
from newspaper.parsers import Parser
from newspaper.configuration import Configuration
from newspaper.urls import STRICT_DATE_REGEX, prepare_url, valid_url


@pytest.fixture
def title_fixture():
    """Return (html, expected_title) pairs for ContentExtractor.get_title.

    The cases exercise stripping of a site-name prefix separated by "»"
    (both literal and as the &raquo; entity) while preserving quote
    characters that are part of the title itself.
    """
    return [
        ("<title>Test title</title>", "Test title"),
        ("<title>Test page » Test title</title>", "Test title"),
        ("<title>Test page &raquo; Test title</title>", "Test title"),
        (
            "<title>Test page and «something in quotes»</title>",
            "Test page and «something in quotes»",
        ),
    ]


@pytest.fixture
def canonical_url_fixture():
    """Return (article_url, html) pairs for get_canonical_link.

    Every case is expected to resolve to
    "http://www.example.com/article.html" (see test_canonical_url_extraction):
    absolute <link rel="canonical">, relative canonical links, and og:url
    meta tags with relative or schemeless values.
    """
    return [
        ("", '<link rel="canonical" href="http://www.example.com/article.html">'),
        (
            "http://www.example.com/article?foo=bar",
            '<link rel="canonical" href="article.html">',
        ),
        (
            "http://www.example.com/article?foo=bar",
            '<meta property="og:url" content="article.html">',
        ),
        (
            "http://www.example.com/article?foo=bar",
            '<meta property="og:url" content="www.example.com/article.html">',
        ),
    ]


def get_url_filecontent(filename):
    """Read a space-separated test-data file from tests/data.

    Each non-empty line is split on single spaces and returned as a tuple,
    e.g. "1 http://example.com" -> ("1", "http://example.com").

    Args:
        filename: basename of a file inside the tests/data directory.

    Returns:
        list[tuple[str, ...]] -- one tuple per non-blank line.
    """
    data_file = Path(__file__).parent / "data" / filename
    # Explicit encoding: the default is locale-dependent and the data files
    # contain URLs that must be read as UTF-8 everywhere.
    with open(data_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # Skip blank lines (e.g. a trailing newline) so callers that unpack
    # two- or three-element tuples never receive ('',).
    return [tuple(line.strip().split(" ")) for line in lines if line.strip()]


@pytest.fixture
def meta_image_fixture():
    """Return (html, expected_image_url) pairs for get_meta_img_url.

    The cases encode the apparent precedence order: og:image property first,
    then og:image name, then <link rel="img_src">, <link rel="image_src">,
    and finally <link rel="icon">; an empty string is expected when nothing
    usable is present.
    """
    return [
        (
            '<meta property="og:image" '
            'content="https://example.com/meta_img_filename.jpg" />'
            '<meta name="og:image" '
            'content="https://example.com/meta_another_img_filename.jpg"/>',
            "https://example.com/meta_img_filename.jpg",
        ),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" '
            'content="https://example.com/meta_another_img_filename.jpg"/>',
            "https://example.com/meta_another_img_filename.jpg",
        ),
        ('<meta property="og:image" content="" />' '<meta name="og:image" />', ""),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" />'
            '<link rel="img_src" href="https://example.com/meta_link_image.jpg" />',
            "https://example.com/meta_link_image.jpg",
        ),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" />'
            '<link rel="image_src" href="https://example.com/meta_link_image2.jpg" />',
            "https://example.com/meta_link_image2.jpg",
        ),
        (
            '<meta property="og:image" content="" />'
            '<meta name="og:image" />'
            '<link rel="icon" href="https://example.com/meta_link_rel_icon.ico" />',
            "https://example.com/meta_link_rel_icon.ico",
        ),
    ]


class TestExtractor:
    """Unit tests for ContentExtractor helpers and the URL utilities."""

    def test_title_extraction(self, title_fixture):
        content_extractor = ContentExtractor(Configuration())
        html_parser = Parser()

        for markup, expected_title in title_fixture:
            dom = html_parser.fromstring(markup)
            assert content_extractor.get_title(dom) == expected_title

    def test_canonical_url_extraction(self, canonical_url_fixture):
        content_extractor = ContentExtractor(Configuration())
        html_parser = Parser()
        expected = "http://www.example.com/article.html"

        for article_url, markup in canonical_url_fixture:
            dom = html_parser.fromstring(markup)
            assert content_extractor.get_canonical_link(article_url, dom) == expected

    def test_meta_image_extraction(self, meta_image_fixture):
        content_extractor = ContentExtractor(Configuration())
        html_parser = Parser()
        page_url = "http://www.example.com/article?foo=bar"

        for markup, expected_image in meta_image_fixture:
            dom = html_parser.fromstring(markup)
            assert content_extractor.get_meta_img_url(page_url, dom) == expected_image

    @pytest.mark.skip(reason="Does not pass, not sure what it tests")
    def test_valid_url(self):
        for is_valid, url in get_url_filecontent("test_urls.txt"):
            assert valid_url(url, test=True) == bool(int(is_valid))

    def test_pubdate(self):
        # Regression check of STRICT_DATE_REGEX against a labelled URL list;
        # each line carries a 0/1 flag for whether a pubdate should match.
        for is_pubdate, url in get_url_filecontent("test_urls_pubdate.txt"):
            found = re.search(STRICT_DATE_REGEX, url)
            assert bool(found) == bool(int(is_pubdate)), f"Failed on {url}"

    def test_prepare_url(self):
        for expected, url, source_url in get_url_filecontent("test_prepare_urls.txt"):
            assert expected == prepare_url(url, source_url)
37 changes: 37 additions & 0 deletions tests/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os
import pytest
from newspaper import Article


@pytest.fixture
def pdf_article_fixture():
    """URL of a real, publicly hosted PDF used by the download tests."""
    return "https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf"


# Live network download — keep out of CI.
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Do not run in GitHub Actions"
)
def test_pdf_ignore(pdf_article_fixture):
    """Content types listed in ignored_content_types_defaults replace the body."""
    placeholder = "%PDF-"  # empty pdf file
    pdf_mime_types = (
        "application/pdf",
        "application/x-pdf",
        "application/x-bzpdf",
        "application/x-gzpdf",
    )
    article = Article(
        url=pdf_article_fixture,
        ignored_content_types_defaults={mime: placeholder for mime in pdf_mime_types},
    )
    article.download()
    assert article.html == placeholder


# Live network download — keep out of CI.
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Do not run in GitHub Actions"
)
def test_pdf_download(pdf_article_fixture):
    """Without ignore rules the raw PDF body is downloaded verbatim."""
    article = Article(url=pdf_article_fixture)
    article.download()

    body = article.html
    assert body.startswith("%PDF-")
    assert body.strip().endswith("%%EOF")
Loading

0 comments on commit 45c4e8d

Please sign in to comment.