-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(tests): convert unittest to pytest
x
- Loading branch information
1 parent
8855f00
commit 45c4e8d
Showing
16 changed files
with
463 additions
and
884 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from newspaper.article import Article | ||
from newspaper.source import Source | ||
|
||
|
||
class TestConfiguration:
    """Check the configuration values exposed by Article and Source objects.

    NOTE(review): the original author was unsure whether these tests verify
    anything useful; they only inspect ``config`` attributes after
    construction.
    """

    def test_article_default_params(self):
        """An Article built with only a URL carries the default config."""
        article = Article(
            url="http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html"
        )
        config = article.config
        assert config.language == "en"
        assert config.memoize_articles
        assert config.use_meta_language

    def test_article_custom_params(self):
        """Keyword arguments given to Article show up in its config."""
        article = Article(
            url="http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html",
            language="zh",
            memoize_articles=False,
        )
        config = article.config
        assert config.language == "zh"
        assert not config.memoize_articles
        assert not config.use_meta_language

    def test_source_default_params(self):
        """A Source built with only a URL carries the default config."""
        source = Source(url="http://cnn.com")
        config = source.config
        assert config.language == "en"
        assert config.MAX_FILE_MEMO == 20000
        assert config.memoize_articles
        assert config.use_meta_language

    def test_source_custom_params(self):
        """Keyword arguments given to Source show up in its config."""
        source = Source(
            url="http://cnn.com",
            memoize_articles=False,
            MAX_FILE_MEMO=10000,
            language="en",
        )
        config = source.config
        assert not config.memoize_articles
        assert config.MAX_FILE_MEMO == 10000
        assert config.language == "en"
        assert not config.use_meta_language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import pytest | ||
|
||
import newspaper | ||
from tests import conftest | ||
|
||
|
||
@pytest.fixture
def language_article_fixture():
    """Return ``(data file stem, article URL, language code)`` test cases."""
    cases = []
    cases.append(
        ("chinese_article", "http://news.sohu.com/20050601/n225789219.shtml", "zh")
    )
    cases.append(
        (
            "arabic_article",
            "http://arabic.cnn.com/2013/middle_east/8/2/syria.clashes/index.html",
            "ar",
        )
    )
    cases.append(
        (
            "spanish_article",
            "http://ultimahora.es/mallorca/noticia/noticias/local/"
            "fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html",
            "es",
        )
    )
    cases.append(
        (
            "japanese_article",
            "https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/"
            "?n_cid=DSTPCS001",
            "ja",
        )
    )
    cases.append(("japanese_article2", "http://www.afpbb.com/articles/-/3178894", "ja"))
    cases.append(("thai_article", "https://prachatai.com/journal/2019/01/80642", "th"))
    return cases
|
||
|
||
class TestLanguages:
    """Text-extraction checks for articles in several languages."""

    def test_full_extract(self, language_article_fixture):
        """Parsed article text matches the stored reference text.

        For every ``(filename, url, language)`` case the stored HTML is fed
        to ``Article.download`` and the parsed text is compared with the
        stored ``txt`` data, ignoring leading and trailing whitespace.
        """
        for filename, url, language in language_article_fixture:
            html_content = conftest.get_data(filename, "html")
            text_content = conftest.get_data(filename, "txt")
            article = newspaper.Article(url, language=language)
            article.download(html_content)
            article.parse()

            # Name the failing case: a bare assert inside a loop does not say
            # which article broke.
            assert (
                article.text.strip() == text_content.strip()
            ), f"Failed on {filename} ({language})"
            # TODO(review): a `fulltext(article.html)` comparison against the
            # same expected text was disabled here; re-enable it or drop it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import os | ||
import pytest | ||
import newspaper | ||
from newspaper.configuration import Configuration | ||
|
||
|
||
def test_hot_trending():
    """``newspaper.hot()`` returns a non-empty collection."""
    assert len(newspaper.hot()) > 0
|
||
|
||
def test_popular_urls():
    """``newspaper.popular_urls()`` returns a non-empty collection."""
    assert len(newspaper.popular_urls()) > 0
|
||
|
||
def test_languages():
    """``newspaper.languages()`` reports more than ten entries."""
    assert len(newspaper.languages()) > 10
|
||
|
||
# Skip if GITHUB_ACTIONS
# The condition must be a real bool: pytest treats a *string* condition as a
# Python expression to evaluate, so passing os.getenv("GITHUB_ACTIONS") (the
# string "true" on GitHub Actions) would fail to evaluate instead of skipping.
@pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS")
def test_multithread_download():
    """Download articles from three sources through ``newspaper.news_pool``.

    Builds the sources with article memoization disabled, keeps at most 20
    articles per source, runs the pool with two threads per source, and checks
    that the last article of each source ended up with non-empty ``html``.
    """
    config = Configuration()
    config.memoize_articles = False
    slate_paper = newspaper.build("http://slate.com", config=config)
    tc_paper = newspaper.build("http://techcrunch.com", config=config)
    espn_paper = newspaper.build("http://espn.com", config=config)

    papers = [slate_paper, tc_paper, espn_paper]
    for paper in papers:
        # Cap the workload so the test stays reasonably short.
        paper.articles = paper.articles[:20]
    newspaper.news_pool.set(papers, threads_per_source=2)

    newspaper.news_pool.join()

    assert len(slate_paper.articles[-1].html) > 0
    assert len(espn_paper.articles[-1].html) > 0
    assert len(tc_paper.articles[-1].html) > 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
import re | ||
import pytest | ||
from pathlib import Path | ||
from newspaper.extractors import ContentExtractor | ||
from newspaper.parsers import Parser | ||
from newspaper.configuration import Configuration | ||
from newspaper.urls import STRICT_DATE_REGEX, prepare_url, valid_url | ||
|
||
|
||
@pytest.fixture
def title_fixture():
    """Return ``(html snippet, expected title)`` pairs for title extraction."""
    # NOTE(review): the second and third cases are identical as written;
    # possibly one was meant to use an HTML entity for the separator. Confirm.
    plain = ("<title>Test title</title>", "Test title")
    with_separator = ("<title>Test page » Test title</title>", "Test title")
    with_separator_again = ("<title>Test page » Test title</title>", "Test title")
    with_quotes = (
        "<title>Test page and «something in quotes»</title>",
        "Test page and «something in quotes»",
    )
    return [plain, with_separator, with_separator_again, with_quotes]
|
||
|
||
@pytest.fixture
def canonical_url_fixture():
    """Return ``(article url, html snippet)`` pairs for canonical-link tests."""
    article_url = "http://www.example.com/article?foo=bar"
    absolute_link = (
        "",
        '<link rel="canonical" href="http://www.example.com/article.html">',
    )
    relative_link = (article_url, '<link rel="canonical" href="article.html">')
    relative_og_url = (article_url, '<meta property="og:url" content="article.html">')
    schemeless_og_url = (
        article_url,
        '<meta property="og:url" content="www.example.com/article.html">',
    )
    return [absolute_link, relative_link, relative_og_url, schemeless_og_url]
|
||
|
||
def get_url_filecontent(filename):
    """Load a space-separated test-case file from the ``data`` directory.

    The file is looked up in the ``data`` folder located next to this module.

    Args:
        filename: Name of the file inside the ``data`` folder.

    Returns:
        A list with one tuple per non-blank line, holding that line's fields
        split on single spaces.
    """
    data_file = Path(__file__).parent / "data" / filename
    # Explicit encoding: the default would depend on the platform locale.
    with open(data_file, "r", encoding="utf-8") as f:
        # Skip blank lines (for instance an empty line at the end of the
        # file): they would otherwise become ("",) tuples and break callers
        # that unpack a fixed number of fields.
        return [
            tuple(stripped.split(" "))
            for stripped in (line.strip() for line in f)
            if stripped
        ]
|
||
|
||
@pytest.fixture
def meta_image_fixture():
    """Return ``(html snippet, expected image url)`` pairs for meta images."""
    # Building blocks shared by several cases.
    og_property_empty = '<meta property="og:image" content="" />'
    og_name_without_content = '<meta name="og:image" />'
    og_name_with_image = (
        '<meta name="og:image" '
        'content="https://example.com/meta_another_img_filename.jpg"/>'
    )
    no_og_image = og_property_empty + og_name_without_content

    return [
        (
            '<meta property="og:image" '
            'content="https://example.com/meta_img_filename.jpg" />'
            + og_name_with_image,
            "https://example.com/meta_img_filename.jpg",
        ),
        (
            og_property_empty + og_name_with_image,
            "https://example.com/meta_another_img_filename.jpg",
        ),
        (no_og_image, ""),
        (
            no_og_image
            + '<link rel="img_src" href="https://example.com/meta_link_image.jpg" />',
            "https://example.com/meta_link_image.jpg",
        ),
        (
            no_og_image
            + '<link rel="image_src" href="https://example.com/meta_link_image2.jpg" />',
            "https://example.com/meta_link_image2.jpg",
        ),
        (
            no_og_image
            + '<link rel="icon" href="https://example.com/meta_link_rel_icon.ico" />',
            "https://example.com/meta_link_rel_icon.ico",
        ),
    ]
|
||
|
||
class TestExtractor:
    """Tests for ContentExtractor metadata extraction and the URL helpers.

    Every looped assertion names the failing case, as ``test_pubdate`` does,
    because a bare assert inside a loop does not say which input broke.
    """

    def test_title_extraction(self, title_fixture):
        """``get_title`` returns the expected title for each HTML snippet."""
        extractor = ContentExtractor(Configuration())
        parser = Parser()

        for html, title in title_fixture:
            doc = parser.fromstring(html)
            assert extractor.get_title(doc) == title, f"Failed on {html}"

    def test_canonical_url_extraction(self, canonical_url_fixture):
        """Every fixture case resolves to the same canonical article URL."""
        extractor = ContentExtractor(Configuration())
        parser = Parser()

        for article_url, html in canonical_url_fixture:
            doc = parser.fromstring(html)
            assert (
                extractor.get_canonical_link(article_url, doc)
                == "http://www.example.com/article.html"
            ), f"Failed on {html}"

    def test_meta_image_extraction(self, meta_image_fixture):
        """``get_meta_img_url`` returns the expected image URL per snippet."""
        extractor = ContentExtractor(Configuration())
        parser = Parser()

        for html, expected in meta_image_fixture:
            doc = parser.fromstring(html)
            assert (
                extractor.get_meta_img_url(
                    "http://www.example.com/article?foo=bar", doc
                )
                == expected
            ), f"Failed on {html}"

    @pytest.mark.skip(reason="Does not pass, not sure what it tests")
    def test_valid_url(self):
        """``valid_url`` agrees with the 0/1 flag stored next to each URL."""
        for is_valid, url in get_url_filecontent("test_urls.txt"):
            assert valid_url(url, test=True) == bool(int(is_valid)), f"Failed on {url}"

    def test_pubdate(self):
        """STRICT_DATE_REGEX matches exactly the URLs flagged as dated."""
        # TODO: add a real test; this only exercises the regex against URLs.
        for is_pubdate, url in get_url_filecontent("test_urls_pubdate.txt"):
            date_match = re.search(STRICT_DATE_REGEX, url)
            assert bool(date_match) == bool(int(is_pubdate)), f"Failed on {url}"

    def test_prepare_url(self):
        """``prepare_url(url, source)`` yields the expected URL per case."""
        for real, url, source in get_url_filecontent("test_prepare_urls.txt"):
            assert real == prepare_url(
                url, source
            ), f"Failed on {url} (source {source})"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import os | ||
import pytest | ||
from newspaper import Article | ||
|
||
|
||
@pytest.fixture
def pdf_article_fixture():
    """Return the URL of the PDF document used by the PDF tests."""
    pdf_url = "https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf"
    return pdf_url
|
||
|
||
# Do not run in GitHub Actions
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Do not run in GitHub Actions"
)
def test_pdf_ignore(pdf_article_fixture):
    """With PDF content types ignored, ``html`` holds the placeholder value."""
    placeholder = "%PDF-"  # empty pdf file
    pdf_content_types = (
        "application/pdf",
        "application/x-pdf",
        "application/x-bzpdf",
        "application/x-gzpdf",
    )
    # Map every PDF content type to the same placeholder content.
    ignored_defaults = {content_type: placeholder for content_type in pdf_content_types}

    article = Article(
        url=pdf_article_fixture,
        ignored_content_types_defaults=ignored_defaults,
    )
    article.download()
    assert article.html == placeholder
|
||
|
||
# Do not run in GitHub Actions
@pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Do not run in GitHub Actions"
)
def test_pdf_download(pdf_article_fixture):
    """The downloaded content starts with ``%PDF-`` and ends with ``%%EOF``."""
    pdf_article = Article(url=pdf_article_fixture)
    pdf_article.download()
    content = pdf_article.html
    assert content.startswith("%PDF-") and content.strip().endswith("%%EOF")
Oops, something went wrong.