fix(parsing): improved publication date extraction
AndyTheFactory committed Oct 27, 2023
1 parent 79553f6 commit 4d137eb
Showing 5 changed files with 163 additions and 178 deletions.
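In the content_extractor.py diff below, get_publishing_date no longer returns the first parseable date it finds. Instead it collects every candidate (a date embedded in the article URL plus each known publish-date meta tag) together with a heuristic score and returns the highest-scoring one. A condensed, self-contained sketch of that scoring logic is shown here; the helper name pick_publish_date and its inputs are illustrative, not part of the commit:

from datetime import datetime

from dateutil.parser import parse as date_parser

def pick_publish_date(url_date_str, meta_candidates):
    """Return the best-scoring candidate date, or None.

    url_date_str    -- date fragment found in the article URL (or None)
    meta_candidates -- (matched_by_name, date_str) pairs collected from
                       known publish-date meta tags
    """
    date_matches = []  # (datetime, score) pairs

    # A date embedded in the URL path is the strongest signal.
    if url_date_str:
        try:
            date_matches.append((date_parser(url_date_str), 10))
        except (ValueError, OverflowError):
            pass

    for matched_by_name, date_str in meta_candidates:
        try:
            datetime_obj = date_parser(date_str)
        except (ValueError, OverflowError):
            continue
        score = 6
        if matched_by_name:  # tag matched on its "name" attribute
            score += 2
        days_diff = (datetime.now().date() - datetime_obj.date()).days
        if days_diff < 0:  # articles dated in the future
            score -= 2
        elif days_diff > 25 * 365:  # implausibly old articles
            score -= 1
        date_matches.append((datetime_obj, score))

    date_matches.sort(key=lambda x: x[1], reverse=True)
    return date_matches[0][0] if date_matches else None

With these weights a date taken from the URL (score 10) always outranks any single meta tag (at most 8), matching the ordering in the diff.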
2 changes: 2 additions & 0 deletions .gitignore
@@ -31,6 +31,7 @@ pip-log.txt
.coverage
.tox
nosetests.xml
.pytest_cache

# Translations
*.mo
@@ -40,6 +41,7 @@ nosetests.xml
.project
.pydevproject
venv
.vscode

# Ruff cache
.ruff_cache
3 changes: 3 additions & 0 deletions newspaper/extractors/__init__.py
@@ -0,0 +1,3 @@
from newspaper.extractors.content_extractor import ContentExtractor

__all__ = ["ContentExtractor"]
213 changes: 35 additions & 178 deletions newspaper/extractors.py → newspaper/extractors/content_extractor.py
@@ -1,82 +1,32 @@
# -*- coding: utf-8 -*-
# Much of the logging code here was forked from https://github.com/codelucas/newspaper
# Copyright (c) Lucas Ou-Yang (codelucas)

"""
Newspaper uses much of python-goose's extraction code. View their license:
https://github.com/codelucas/newspaper/blob/master/GOOSE-LICENSE.txt
Keep all html page extraction code within this file. Abstract any
lxml or soup parsing code in the parsers.py file!
"""
from newspaper import urls
from newspaper.extractors.defines import (
MOTLEY_REPLACEMENT,
TITLE_REPLACEMENTS,
PIPE_SPLITTER,
DASH_SPLITTER,
UNDERSCORE_SPLITTER,
SLASH_SPLITTER,
ARROWS_SPLITTER,
RE_LANG,
PUBLISH_DATE_TAGS,
NO_STRINGS,
A_REL_TAG_SELECTOR,
A_HREF_TAG_SELECTOR,
url_stopwords,
)

import copy
import logging
import re
from collections import defaultdict
from datetime import datetime

from dateutil.parser import parse as date_parser
from tldextract import tldextract
from urllib.parse import urljoin, urlparse, urlunparse

from . import urls
from .utils import StringReplacement, StringSplitter

log = logging.getLogger(__name__)

MOTLEY_REPLACEMENT = StringReplacement("�", "")
ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement("#!", "?_escaped_fragment_=")
TITLE_REPLACEMENTS = StringReplacement("»", "»")
PIPE_SPLITTER = StringSplitter("\\|")
DASH_SPLITTER = StringSplitter(" - ")
UNDERSCORE_SPLITTER = StringSplitter("_")
SLASH_SPLITTER = StringSplitter("/")
ARROWS_SPLITTER = StringSplitter(" » ")
COLON_SPLITTER = StringSplitter(":")
SPACE_SPLITTER = StringSplitter(" ")
NO_STRINGS = set()
A_REL_TAG_SELECTOR = "a[rel=tag]"
A_HREF_TAG_SELECTOR = (
"a[href*='/tag/'], a[href*='/tags/'], " "a[href*='/topic/'], a[href*='?keyword=']"
)
RE_LANG = r"^[A-Za-z]{2}$"

good_paths = [
"story",
"article",
"feature",
"featured",
"slides",
"slideshow",
"gallery",
"news",
"video",
"media",
"v",
"radio",
"press",
]
bad_chunks = [
"careers",
"contact",
"about",
"faq",
"terms",
"privacy",
"advert",
"preferences",
"feedback",
"info",
"browse",
"howto",
"account",
"subscribe",
"donate",
"shop",
"admin",
]
bad_domains = ["amazon", "doubleclick", "twitter"]


class ContentExtractor(object):
def __init__(self, config):
@@ -217,59 +167,34 @@ def parse_date_str(date_str):
# specifier, e.g. /2014/04/
return None

date_matches = []
date_match = re.search(urls.STRICT_DATE_REGEX, url)
if date_match:
date_str = date_match.group(0)
datetime_obj = parse_date_str(date_str)
if datetime_obj:
return datetime_obj

PUBLISH_DATE_TAGS = [
{
"attribute": "property",
"value": "rnews:datePublished",
"content": "content",
},
{
"attribute": "property",
"value": "article:published_time",
"content": "content",
},
{
"attribute": "name",
"value": "OriginalPublicationDate",
"content": "content",
},
{"attribute": "itemprop", "value": "datePublished", "content": "datetime"},
{
"attribute": "property",
"value": "og:published_time",
"content": "content",
},
{
"attribute": "name",
"value": "article_date_original",
"content": "content",
},
{"attribute": "name", "value": "publication_date", "content": "content"},
{"attribute": "name", "value": "sailthru.date", "content": "content"},
{"attribute": "name", "value": "PublishDate", "content": "content"},
{"attribute": "pubdate", "value": "pubdate", "content": "datetime"},
{"attribute": "name", "value": "publish_date", "content": "content"},
]
date_matches.append((datetime_obj, 10)) # date and matchscore

for known_meta_tag in PUBLISH_DATE_TAGS:
meta_tags = self.parser.getElementsByTag(
doc, attr=known_meta_tag["attribute"], value=known_meta_tag["value"]
)
if meta_tags:
date_str = self.parser.getAttribute(
meta_tags[0], known_meta_tag["content"]
)
for meta_tag in meta_tags:
date_str = self.parser.getAttribute(meta_tag, known_meta_tag["content"])
datetime_obj = parse_date_str(date_str)
if datetime_obj:
return datetime_obj

return None
score = 6
if meta_tag.attrib.get("name") == known_meta_tag["value"]:
score += 2
days_diff = (datetime.now().date() - datetime_obj.date()).days
if days_diff < 0: # articles from the future
score -= 2
elif days_diff > 25 * 365: # very old articles
score -= 1
date_matches.append((datetime_obj, score))

date_matches.sort(key=lambda x: x[1], reverse=True)
return date_matches[0][0] if date_matches else None

def get_title(self, doc):
"""Fetch the article title and analyze it
@@ -747,74 +672,6 @@ def get_category_urls(self, source_url, doc):
"or size path chunks" % p_url
)
)
stopwords = [
"about",
"help",
"privacy",
"legal",
"feedback",
"sitemap",
"profile",
"account",
"mobile",
"sitemap",
"facebook",
"myspace",
"twitter",
"linkedin",
"bebo",
"friendster",
"stumbleupon",
"youtube",
"vimeo",
"store",
"mail",
"preferences",
"maps",
"password",
"imgur",
"flickr",
"search",
"subscription",
"itunes",
"siteindex",
"events",
"stop",
"jobs",
"careers",
"newsletter",
"subscribe",
"academy",
"shopping",
"purchase",
"site-map",
"shop",
"donate",
"newsletter",
"product",
"advert",
"info",
"tickets",
"coupons",
"forum",
"board",
"archive",
"browse",
"howto",
"how to",
"faq",
"terms",
"charts",
"services",
"contact",
"plus",
"admin",
"login",
"signup",
"register",
"developer",
"proxy",
]

_valid_categories = []

@@ -825,7 +682,7 @@ def get_category_urls(self, source_url, doc):
subdomain = tldextract.extract(p_url).subdomain
conjunction = path + " " + subdomain
bad = False
for badword in stopwords:
for badword in url_stopwords:
if badword.lower() in conjunction.lower():
if self.config.verbose:
print(