Skip to content

Commit

Permalink
fix(parse): make the url date parsing stricter. Issue #514
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyTheFactory committed Nov 2, 2023
1 parent 9df8c16 commit 0cc1e83
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 3 deletions.
3 changes: 2 additions & 1 deletion newspaper/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
_STRICT_DATE_REGEX_PREFIX = r"(?<=\W)"
DATE_REGEX = (
r"([\./\-_\s]?(19|20)\d{2})[\./\-_\s]?"
- "(([0-3]?[0-9][\./\-_\s])|(\w{3,5}[\./\-_\s]))([0-3]?[0-9][\./\-]?)?"
+ r"(([0-3]?[0-9][\./\-_\s])|(\w{3,5}[\./\-_\s]))"
+ r"([0-3]?[0-9]([\./\-\+\?]|$))"
)
STRICT_DATE_REGEX = _STRICT_DATE_REGEX_PREFIX + DATE_REGEX

Expand Down
1 change: 1 addition & 0 deletions tests/data/test_urls_pubdate.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
1 http://clatl.com/cribnotes/archives/2013/10/11/rrest-plays-album-release-party-tonight-drops-goodnight-chanteal
1 http://www.miamiherald.com/2013/10/11/3683673/2-new-wind-farms-planned-in-oklahoma.html
0 http://eatocracy.cnn.com/category/make/recipes/
+ 0 https://prachatai.com/journal/2020/06/88083
4 changes: 2 additions & 2 deletions tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def canonical_url_fixture():
def get_url_filecontent(filename):
with open(Path(__file__).parent / "data" / filename, "r") as f:
lines = f.readlines()
- return [tuple(line.strip().split(" ")) for line in lines]
+ return [tuple(line.strip().split(" ")) for line in lines if " " in line]


@pytest.fixture
Expand Down Expand Up @@ -119,7 +119,7 @@ def test_meta_image_extraction(self, meta_image_fixture):
@pytest.mark.skip(reason="Does not pass, not sure what it tests")
def test_valid_url(self):
for is_valid, url in get_url_filecontent("test_urls.txt"):
- assert valid_url(url, test=True) == bool(int(is_valid))
+ assert valid_url(url, test=True) == bool(int(is_valid)), "Failed on " + url

def test_pubdate(self):
# not a real test... we test the regex??
Expand Down

0 comments on commit 0cc1e83

Please sign in to comment.