Skip to content

Commit

Permalink
feat(parser): add yoast schema parse for date extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyTheFactory committed Oct 27, 2023
1 parent d7608da commit 39a5cff
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions newspaper/extractors/content_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import re
from collections import defaultdict
from datetime import datetime
import json

from dateutil.parser import parse as date_parser
from tldextract import tldextract
Expand Down Expand Up @@ -175,6 +176,25 @@ def parse_date_str(date_str):
if datetime_obj:
date_matches.append((datetime_obj, 10)) # date and matchscore

# Yoast SEO structured data: JSON-LD <script> tags with class "yoast-schema-graph"
yoast_script_tag = self.parser.getElementsByTag(
doc, tag="script", attr="type", value="application/ld+json"
)
if yoast_script_tag:
for script_tag in yoast_script_tag:
if "yoast-schema-graph" in script_tag.attrib.get("class"):
try:
schema_json = json.loads(script_tag.text)
except Exception:
continue

g = schema_json.get("@graph", [])
for item in g:
date_str = item.get("datePublished")
datetime_obj = parse_date_str(date_str)
if datetime_obj:
date_matches.append((datetime_obj, 10))

for known_meta_tag in PUBLISH_DATE_TAGS:
meta_tags = self.parser.getElementsByTag(
doc, attr=known_meta_tag["attribute"], value=known_meta_tag["value"]
Expand Down

0 comments on commit 39a5cff

Please sign in to comment.