extraction: better precision & shorter code (#105)
* change code to get better precision

* clean tree before the rest

* stricter trimming

* review regexes

* simplify expressions + more precision

* clean code
adbar authored Oct 19, 2023
1 parent 5ba8f70 commit c88924a
Showing 5 changed files with 67 additions and 128 deletions.
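The central change: the HTML tree is now cleaned and pruned once, up front, and every later extraction phase (element search, time elements, string-based timestamp search, image search, idiosyncratic patterns) reuses the same pruned tree, where previously cleaning happened late, just before the string search. A minimal sketch of the new entry step, grounded in the core.py diff below (the sample HTML is invented):

    from copy import deepcopy

    from htmldate.extractors import discard_unwanted
    from htmldate.settings import CLEANING_LIST
    from htmldate.utils import clean_html, load_html

    tree = load_html("<html><body><p>Published: 19 October 2023</p></body></html>")
    try:
        # prune once; all later phases reuse search_tree
        search_tree, discarded = discard_unwanted(
            clean_html(deepcopy(tree), CLEANING_LIST)
        )
    except ValueError:  # rare lxml cleaner error (NULL bytes, control characters)
        search_tree = tree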
106 changes: 48 additions & 58 deletions htmldate/core.py
@@ -22,7 +22,6 @@
from .extractors import (
discard_unwanted,
extract_url_date,
extract_partial_url_date,
idiosyncrasies_search,
img_search,
json_search,
@@ -33,7 +32,8 @@
FAST_PREPEND,
SLOW_PREPEND,
FREE_TEXT_EXPRESSIONS,
MAX_TEXT_SIZE,
MAX_SEGMENT_LEN,
MIN_SEGMENT_LEN,
YEAR_PATTERN,
YMD_PATTERN,
COPYRIGHT_PATTERN,
@@ -58,7 +58,7 @@
TWO_COMP_REGEX,
)
from .settings import CACHE_SIZE, CLEANING_LIST, MAX_POSSIBLE_CANDIDATES
from .utils import clean_html, load_html
from .utils import clean_html, load_html, trim_text
from .validators import (
check_extracted_reference,
compare_values,
@@ -208,12 +208,12 @@ def examine_date_elements(

for elem in elements:
# trim
text = " ".join(elem.text_content().split()).strip()
text = trim_text(elem.text_content())
# simple length heuristic
if len(text) > 6: # could be 8 or 9
if len(text) > MIN_SEGMENT_LEN:
# shorten and try the beginning of the string
# trim non-digits at the end of the string
text = NON_DIGITS_REGEX.sub("", text[:MAX_TEXT_SIZE])
text = NON_DIGITS_REGEX.sub("", text[:MAX_SEGMENT_LEN])
LOGGER.debug(
"analyzing (HTML): %s",
" ".join(logstring(elem).split())[:100],
@@ -224,9 +224,9 @@
if attempt:
return attempt
# try link title (Blogspot)
title_attr = elem.get("title", "").strip()
if len(title_attr) > 0:
title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_TEXT_SIZE])
title_attr = trim_text(elem.get("title", ""))
if len(title_attr) > MIN_SEGMENT_LEN:
title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_SEGMENT_LEN])
attempt = try_date_expr(
title_attr, outputformat, extensive_search, min_date, max_date
)
@@ -1037,27 +1037,40 @@ def find_date(
if abbr_result is not None:
return abbr_result

# expressions + text_content
# first, prune tree
try:
search_tree, discarded = discard_unwanted(
clean_html(deepcopy(tree), CLEANING_LIST)
)
# rare LXML error: no NULL bytes or control characters
except ValueError: # pragma: no cover
search_tree = tree
LOGGER.error("lxml cleaner error")

# define expressions + text_content
if extensive_search:
date_expr = SLOW_PREPEND + DATE_EXPRESSIONS
else:
date_expr = FAST_PREPEND + DATE_EXPRESSIONS

# first try in pruned tree
search_tree, discarded = discard_unwanted(deepcopy(tree))
# then look for expressions
dateresult = examine_date_elements(
search_tree, date_expr, outputformat, extensive_search, min_date, max_date
)
if dateresult is not None:
return dateresult

# TODO: decide on this
# search in discarded parts (e.g. archive.org-banner)
# for subtree in discarded:
# dateresult = examine_date_elements(subtree, DATE_EXPRESSIONS,
# outputformat, extensive_search, min_date, max_date)
# if dateresult is not None:
# return dateresult
# look for expressions
dateresult = examine_date_elements(
search_tree,
".//title|.//h1",
outputformat,
extensive_search,
min_date,
max_date,
)
if dateresult is not None:
return dateresult

# try time elements
time_result = examine_time_elements(
@@ -1066,66 +1079,43 @@
if time_result is not None:
return time_result

# clean before string search
try:
cleaned_html = clean_html(tree, CLEANING_LIST)
# rare LXML error: no NULL bytes or control characters
except ValueError: # pragma: no cover
cleaned_html = tree
LOGGER.error("lxml cleaner error")
# TODO: decide on this
# search in discarded parts (e.g. archive.org-banner)
# for subtree in discarded:
# dateresult = examine_date_elements(subtree, DATE_EXPRESSIONS,
# outputformat, extensive_search, min_date, max_date)
# if dateresult is not None:
# return dateresult

# robust conversion to string
try:
htmlstring = tostring(cleaned_html, pretty_print=False, encoding="unicode")
htmlstring = tostring(search_tree, pretty_print=False, encoding="unicode")
except UnicodeDecodeError:
htmlstring = tostring(cleaned_html, pretty_print=False).decode(
"utf-8", "ignore"
)
# remove comments by hand as faulty in lxml?
# htmlstring = re.sub(r'<!--.+?-->', '', htmlstring, flags=re.DOTALL)
htmlstring = tostring(search_tree, pretty_print=False).decode("utf-8", "ignore")

# date regex timestamp rescue
timestamp_result = timestamp_search(htmlstring, outputformat, min_date, max_date)
if timestamp_result is not None:
return timestamp_result

# try image elements
img_result = img_search(search_tree, outputformat, min_date, max_date)
if img_result is not None:
return img_result

# precise patterns and idiosyncrasies
text_result = idiosyncrasies_search(htmlstring, outputformat, min_date, max_date)
if text_result is not None:
return text_result

# title
for title_elem in tree.iter("title", "h1"):
attempt = try_date_expr(
title_elem.text_content(),
outputformat,
extensive_search,
min_date,
max_date,
)
if attempt is not None:
return attempt

# last try: URL 2
if url is not None:
dateresult = extract_partial_url_date(url, outputformat, min_date, max_date)
if dateresult is not None:
return dateresult

# try image elements
img_result = img_search(tree, outputformat, min_date, max_date)
if img_result is not None:
return img_result

# last resort
if extensive_search:
LOGGER.debug("extensive search started")
# TODO: further tests & decide according to original_date
reference = 0
for segment in cleaned_html.xpath(FREE_TEXT_EXPRESSIONS):
for segment in search_tree.xpath(FREE_TEXT_EXPRESSIONS):
segment = segment.strip()
# basic filter: minimum could be 8 or 9
if not 6 < len(segment) < MAX_TEXT_SIZE:
if not MIN_SEGMENT_LEN < len(segment) < MAX_SEGMENT_LEN:
continue
reference = compare_reference(
reference,
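Worth noting in the hunks above: the bare length checks (len(text) > 6, commented "could be 8 or 9") become the named bounds MIN_SEGMENT_LEN and MAX_SEGMENT_LEN. A small illustration of the resulting filter in the free-text loop, on invented segments:

    from htmldate.extractors import MAX_SEGMENT_LEN, MIN_SEGMENT_LEN

    segments = [
        "2023",                              # 4 chars: below MIN_SEGMENT_LEN, skipped
        "Published on 19 October 2023",      # 28 chars: within bounds, analyzed
        "Lorem ipsum dolor sit amet, " * 4,  # 112 chars: above MAX_SEGMENT_LEN, skipped
    ]
    for segment in segments:
        segment = segment.strip()
        if not MIN_SEGMENT_LEN < len(segment) < MAX_SEGMENT_LEN:
            continue
        print("would analyze:", segment)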
53 changes: 12 additions & 41 deletions htmldate/extractors.py
@@ -24,6 +24,7 @@

# own
from .settings import CACHE_SIZE
from .utils import trim_text
from .validators import convert_date, date_validator


@@ -48,21 +49,16 @@
)


FAST_PREPEND = ".//*[(self::div or self::li or self::p or self::span)]"
FAST_PREPEND = ".//*[(self::div or self::h2 or self::h3 or self::h4 or self::li or self::p or self::span or self::time or self::ul)]"
# self::b or self::em or self::font or self::i or self::strong
SLOW_PREPEND = ".//*"

DATE_EXPRESSIONS = """
[
contains(translate(@id|@class|@itemprop, "D", "d"), 'date') or
contains(translate(@id|@class|@itemprop, "D", "d"), 'datum') or
contains(translate(@id|@class, "M", "m"), 'meta') or
contains(@id|@class, 'time') or
@class='meta' or
contains(translate(@id|@class, "M", "m"), 'metadata') or
contains(translate(@id|@class, "M", "m"), 'meta-') or
contains(translate(@id|@class, "M", "m"), '-meta') or
contains(translate(@id|@class, "M", "m"), '_meta') or
contains(translate(@id|@class, "M", "m"), 'postmeta') or
contains(@id|@class, 'publish') or
contains(@id|@class, 'footer') or
contains(@class, 'info') or
@@ -83,13 +79,15 @@
contains(@class, 'parution')
] |
.//footer | .//small
"""
"""

# further tests needed:
# or contains(@class, 'article')
# or contains(@id, 'lastmod') or contains(@class, 'updated')

FREE_TEXT_EXPRESSIONS = FAST_PREPEND + "/text()"
MAX_TEXT_SIZE = 48
MIN_SEGMENT_LEN = 6
MAX_SEGMENT_LEN = 52

# discard parts of the webpage
# archive.org banner inserts
@@ -129,12 +127,11 @@
)

COMPLETE_URL = re.compile(r"\D([0-9]{4})[/_-]([0-9]{1,2})[/_-]([0-9]{1,2})(?:\D|$)")
PARTIAL_URL = re.compile(r"\D([0-9]{4})[/_-]([0-9]{2})(?:\D|$)")

JSON_MODIFIED = re.compile(r'"dateModified": ?"([0-9]{4}-[0-9]{2}-[0-9]{2})', re.I)
JSON_PUBLISHED = re.compile(r'"datePublished": ?"([0-9]{4}-[0-9]{2}-[0-9]{2})', re.I)
TIMESTAMP_PATTERN = re.compile(
r"([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}"
r"([0-9]{4}-[0-9]{2}-[0-9]{2}).[0-9]{2}:[0-9]{2}:[0-9]{2}"
)

# English, French, German, Indonesian and Turkish dates cache
@@ -159,26 +156,22 @@

TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")


DISCARD_PATTERNS = re.compile(
r"^\d{2}:\d{2}(?: |:|$)|"
r"^\D*\d{4}\D*$|"
r"[$€¥Ұ£¢₽₱฿#₹]|" # currency symbols and special characters
r"[A-Z]{3}[^A-Z]|" # currency codes
r"(?:^|\D)(?:\+\d{2}|\d{3}|\d{5})\D|" # tel./IPs/postal codes
r"ftps?|https?|sftp|" # protocols
r"\.(com|net|org|info|gov|edu|de|fr|io)\b|" # TLDs
r"\.(?:com|net|org|info|gov|edu|de|fr|io)\b|" # TLDs
r"IBAN|[A-Z]{2}[0-9]{2}|" # bank accounts
r"®" # ©
)
# further testing required:
# \d[,.]\d+ # currency amounts
# leads to errors: ^\D+\d{3,}\D+

# use of regex module for speed?
TEXT_PATTERNS = re.compile(
r'(?:date[^0-9"]{,20}|updated|published) *?(?:in)? *?:? *?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
r"(?:Datum|Stand|[Vv]eröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
r'(?:date[^0-9"]{,20}|updated|published|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)", # TR
re.I,
@@ -248,28 +241,6 @@ def extract_url_date(
return None


def extract_partial_url_date(
testurl: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
"""Extract an approximate date out of an URL string in Y-M format"""
match = PARTIAL_URL.search(testurl)
if match:
dateresult = match[0] + "/01"
LOGGER.debug("found partial date in URL: %s", dateresult)
try:
dateobject = datetime(int(match[1]), int(match[2]), 1)
if (
date_validator(
dateobject, outputformat, earliest=min_date, latest=max_date
)
is True
):
return dateobject.strftime(outputformat)
except ValueError as err:
LOGGER.debug("conversion error: %s %s", dateresult, err)
return None


def correct_year(year: int) -> int:
"""Adapt year from YY to YYYY format"""
if year < 100:
@@ -461,7 +432,7 @@ def try_date_expr(
return None

# trim
string = " ".join(string.strip()[:MAX_TEXT_SIZE].split())
string = trim_text(string)[:MAX_SEGMENT_LEN]

# formal constraint: 4 to 18 digits
if not string or not 4 <= sum(map(str.isdigit, string)) <= 18:
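In try_date_expr the inline trimming becomes trim_text plus a hard cut at MAX_SEGMENT_LEN, placed in front of the pre-existing 4-to-18-digit constraint. A sketch of that gate (simplified, invented inputs):

    from htmldate.extractors import MAX_SEGMENT_LEN
    from htmldate.utils import trim_text

    def passes_gate(string: str) -> bool:
        # mirrors the first checks in try_date_expr, simplified
        string = trim_text(string)[:MAX_SEGMENT_LEN]
        return bool(string) and 4 <= sum(map(str.isdigit, string)) <= 18

    print(passes_gate("  19   October\n2023 "))        # True: 6 digits
    print(passes_gate("no digits here"))               # False: no digits
    print(passes_gate("order 123456789012345678901"))  # False: 21 digits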
4 changes: 2 additions & 2 deletions htmldate/settings.py
@@ -30,7 +30,7 @@
"embed",
"frame",
"frameset",
"figure",
"iframe",
"label",
"map",
"math",
@@ -42,4 +42,4 @@
"track",
"video",
]
# "iframe", "input", "layer", "param", "source"
# "figure", "input", "layer", "param", "source"
5 changes: 5 additions & 0 deletions htmldate/utils.py
@@ -226,3 +226,8 @@ def clean_html(tree: HtmlElement, elemlist: List[str]) -> HtmlElement:
except AttributeError: # pragma: no cover
element.getparent().remove(element)
return tree


def trim_text(string: str) -> str:
"Remove superfluous space and normalize remaining space."
return " ".join(string.split()).strip()
27 changes: 0 additions & 27 deletions tests/unit_tests.py
@@ -48,7 +48,6 @@
custom_parse,
discard_unwanted,
external_date_parser,
extract_partial_url_date,
regex_parse,
try_date_expr,
)
@@ -1229,13 +1228,6 @@ def test_url():
)
== "2012-11-29"
)
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
url="http://www.kreditwesen.org/widerstand-berlin/2012-11/keine-kurzung-bei-der-jugend-klubs-konnen-vorerst-aufatmen-bvv-beschliest-haushaltsplan/",
)
== "2012-11-01"
)
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
@@ -1250,29 +1242,10 @@
)
== "2019-06-26"
)
assert (
extract_partial_url_date(
"https://testsite.org/2018/01/test", "%Y-%m-%d", MIN_DATE, LATEST_POSSIBLE
)
== "2018-01-01"
)
assert (
extract_partial_url_date(
"https://testsite.org/2018/33/test", "%Y-%m-%d", MIN_DATE, LATEST_POSSIBLE
)
is None
)


def test_approximate_url():
"""test url parameter"""
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
url="http://example.com/blog/2016/07/key-words",
)
== "2016-07-01"
)
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
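With extract_partial_url_date gone, URLs containing only a year and month no longer yield an approximate first-of-month date; only complete year-month-day URLs are still used, as the removed test assertions above show. A quick before/after check (invented URLs; the second call presumably returns None after this commit):

    from htmldate import find_date

    html = "<html><body><p>Aaa, bbb.</p></body></html>"

    # complete date in the URL: still extracted
    print(find_date(html, url="https://example.org/2019/06/26/post"))  # 2019-06-26

    # partial year/month URL: no longer approximated to "2016-07-01"
    print(find_date(html, url="https://example.org/2016/07/key-words"))  # None (expected)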
