extraction: better precision & shorter code (#105)
* change code to get better precision

* clean tree before the rest

* stricter trimming

* review regexes

* simplify expressions + more precision

* clean code
adbar authored Oct 19, 2023
1 parent 5ba8f70 commit c88924a
Showing 5 changed files with 67 additions and 128 deletions.
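The central change: the HTML tree is now cleaned and pruned once, up front, and every later extraction phase (element search, time elements, string-based timestamp search, image search, idiosyncratic patterns) reuses the same pruned tree, where previously cleaning happened late, just before the string search. A minimal sketch of the new entry step, grounded in the core.py diff below (the sample HTML is invented):

    from copy import deepcopy

    from htmldate.extractors import discard_unwanted
    from htmldate.settings import CLEANING_LIST
    from htmldate.utils import clean_html, load_html

    tree = load_html("<html><body><p>Published: 19 October 2023</p></body></html>")
    try:
        # prune once; all later phases reuse search_tree
        search_tree, discarded = discard_unwanted(
            clean_html(deepcopy(tree), CLEANING_LIST)
        )
    except ValueError:  # rare lxml cleaner error (NULL bytes, control characters)
        search_tree = tree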
106 changes: 48 additions & 58 deletions htmldate/core.py
@@ -22,7 +22,6 @@
from .extractors import (
discard_unwanted,
extract_url_date,
extract_partial_url_date,
idiosyncrasies_search,
img_search,
json_search,
@@ -33,7 +32,8 @@
FAST_PREPEND,
SLOW_PREPEND,
FREE_TEXT_EXPRESSIONS,
MAX_TEXT_SIZE,
MAX_SEGMENT_LEN,
MIN_SEGMENT_LEN,
YEAR_PATTERN,
YMD_PATTERN,
COPYRIGHT_PATTERN,
@@ -58,7 +58,7 @@
TWO_COMP_REGEX,
)
from .settings import CACHE_SIZE, CLEANING_LIST, MAX_POSSIBLE_CANDIDATES
from .utils import clean_html, load_html
from .utils import clean_html, load_html, trim_text
from .validators import (
check_extracted_reference,
compare_values,
@@ -208,12 +208,12 @@ def examine_date_elements(

for elem in elements:
# trim
text = " ".join(elem.text_content().split()).strip()
text = trim_text(elem.text_content())
# simple length heuristic
if len(text) > 6: # could be 8 or 9
if len(text) > MIN_SEGMENT_LEN:
# shorten and try the beginning of the string
# trim non-digits at the end of the string
text = NON_DIGITS_REGEX.sub("", text[:MAX_TEXT_SIZE])
text = NON_DIGITS_REGEX.sub("", text[:MAX_SEGMENT_LEN])
LOGGER.debug(
"analyzing (HTML): %s",
" ".join(logstring(elem).split())[:100],
@@ -224,9 +224,9 @@
if attempt:
return attempt
# try link title (Blogspot)
title_attr = elem.get("title", "").strip()
if len(title_attr) > 0:
title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_TEXT_SIZE])
title_attr = trim_text(elem.get("title", ""))
if len(title_attr) > MIN_SEGMENT_LEN:
title_attr = NON_DIGITS_REGEX.sub("", title_attr[:MAX_SEGMENT_LEN])
attempt = try_date_expr(
title_attr, outputformat, extensive_search, min_date, max_date
)
@@ -1037,27 +1037,40 @@ def find_date(
if abbr_result is not None:
return abbr_result

# expressions + text_content
# first, prune tree
try:
search_tree, discarded = discard_unwanted(
clean_html(deepcopy(tree), CLEANING_LIST)
)
# rare LXML error: no NULL bytes or control characters
except ValueError: # pragma: no cover
search_tree = tree
LOGGER.error("lxml cleaner error")

# define expressions + text_content
if extensive_search:
date_expr = SLOW_PREPEND + DATE_EXPRESSIONS
else:
date_expr = FAST_PREPEND + DATE_EXPRESSIONS

# first try in pruned tree
search_tree, discarded = discard_unwanted(deepcopy(tree))
# then look for expressions
dateresult = examine_date_elements(
search_tree, date_expr, outputformat, extensive_search, min_date, max_date
)
if dateresult is not None:
return dateresult

# TODO: decide on this
# search in discarded parts (e.g. archive.org-banner)
# for subtree in discarded:
# dateresult = examine_date_elements(subtree, DATE_EXPRESSIONS,
# outputformat, extensive_search, min_date, max_date)
# if dateresult is not None:
# return dateresult
# look for expressions
dateresult = examine_date_elements(
search_tree,
".//title|.//h1",
outputformat,
extensive_search,
min_date,
max_date,
)
if dateresult is not None:
return dateresult

# try time elements
time_result = examine_time_elements(
@@ -1066,66 +1079,43 @@
if time_result is not None:
return time_result

# clean before string search
try:
cleaned_html = clean_html(tree, CLEANING_LIST)
# rare LXML error: no NULL bytes or control characters
except ValueError: # pragma: no cover
cleaned_html = tree
LOGGER.error("lxml cleaner error")
# TODO: decide on this
# search in discarded parts (e.g. archive.org-banner)
# for subtree in discarded:
# dateresult = examine_date_elements(subtree, DATE_EXPRESSIONS,
# outputformat, extensive_search, min_date, max_date)
# if dateresult is not None:
# return dateresult

# robust conversion to string
try:
htmlstring = tostring(cleaned_html, pretty_print=False, encoding="unicode")
htmlstring = tostring(search_tree, pretty_print=False, encoding="unicode")
except UnicodeDecodeError:
htmlstring = tostring(cleaned_html, pretty_print=False).decode(
"utf-8", "ignore"
)
# remove comments by hand as faulty in lxml?
# htmlstring = re.sub(r'<!--.+?-->', '', htmlstring, flags=re.DOTALL)
htmlstring = tostring(search_tree, pretty_print=False).decode("utf-8", "ignore")

# date regex timestamp rescue
timestamp_result = timestamp_search(htmlstring, outputformat, min_date, max_date)
if timestamp_result is not None:
return timestamp_result

# try image elements
img_result = img_search(search_tree, outputformat, min_date, max_date)
if img_result is not None:
return img_result

# precise patterns and idiosyncrasies
text_result = idiosyncrasies_search(htmlstring, outputformat, min_date, max_date)
if text_result is not None:
return text_result

# title
for title_elem in tree.iter("title", "h1"):
attempt = try_date_expr(
title_elem.text_content(),
outputformat,
extensive_search,
min_date,
max_date,
)
if attempt is not None:
return attempt

# last try: URL 2
if url is not None:
dateresult = extract_partial_url_date(url, outputformat, min_date, max_date)
if dateresult is not None:
return dateresult

# try image elements
img_result = img_search(tree, outputformat, min_date, max_date)
if img_result is not None:
return img_result

# last resort
if extensive_search:
LOGGER.debug("extensive search started")
# TODO: further tests & decide according to original_date
reference = 0
for segment in cleaned_html.xpath(FREE_TEXT_EXPRESSIONS):
for segment in search_tree.xpath(FREE_TEXT_EXPRESSIONS):
segment = segment.strip()
# basic filter: minimum could be 8 or 9
if not 6 < len(segment) < MAX_TEXT_SIZE:
if not MIN_SEGMENT_LEN < len(segment) < MAX_SEGMENT_LEN:
continue
reference = compare_reference(
reference,
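Worth noting in the hunks above: the bare length checks (len(text) > 6, commented "could be 8 or 9") become the named bounds MIN_SEGMENT_LEN and MAX_SEGMENT_LEN. A small illustration of the resulting filter in the free-text loop, on invented segments:

    from htmldate.extractors import MAX_SEGMENT_LEN, MIN_SEGMENT_LEN

    segments = [
        "2023",                              # 4 chars: below MIN_SEGMENT_LEN, skipped
        "Published on 19 October 2023",      # 28 chars: within bounds, analyzed
        "Lorem ipsum dolor sit amet, " * 4,  # 112 chars: above MAX_SEGMENT_LEN, skipped
    ]
    for segment in segments:
        segment = segment.strip()
        if not MIN_SEGMENT_LEN < len(segment) < MAX_SEGMENT_LEN:
            continue
        print("would analyze:", segment)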
53 changes: 12 additions & 41 deletions htmldate/extractors.py
@@ -24,6 +24,7 @@

# own
from .settings import CACHE_SIZE
from .utils import trim_text
from .validators import convert_date, date_validator


@@ -48,21 +49,16 @@
)


FAST_PREPEND = ".//*[(self::div or self::li or self::p or self::span)]"
FAST_PREPEND = ".//*[(self::div or self::h2 or self::h3 or self::h4 or self::li or self::p or self::span or self::time or self::ul)]"
# self::b or self::em or self::font or self::i or self::strong
SLOW_PREPEND = ".//*"

DATE_EXPRESSIONS = """
[
contains(translate(@id|@class|@itemprop, "D", "d"), 'date') or
contains(translate(@id|@class|@itemprop, "D", "d"), 'datum') or
contains(translate(@id|@class, "M", "m"), 'meta') or
contains(@id|@class, 'time') or
@class='meta' or
contains(translate(@id|@class, "M", "m"), 'metadata') or
contains(translate(@id|@class, "M", "m"), 'meta-') or
contains(translate(@id|@class, "M", "m"), '-meta') or
contains(translate(@id|@class, "M", "m"), '_meta') or
contains(translate(@id|@class, "M", "m"), 'postmeta') or
contains(@id|@class, 'publish') or
contains(@id|@class, 'footer') or
contains(@class, 'info') or
@@ -83,13 +79,15 @@
contains(@class, 'parution')
] |
.//footer | .//small
"""
"""

# further tests needed:
# or contains(@class, 'article')
# or contains(@id, 'lastmod') or contains(@class, 'updated')

FREE_TEXT_EXPRESSIONS = FAST_PREPEND + "/text()"
MAX_TEXT_SIZE = 48
MIN_SEGMENT_LEN = 6
MAX_SEGMENT_LEN = 52

# discard parts of the webpage
# archive.org banner inserts
@@ -129,12 +127,11 @@
)

COMPLETE_URL = re.compile(r"\D([0-9]{4})[/_-]([0-9]{1,2})[/_-]([0-9]{1,2})(?:\D|$)")
PARTIAL_URL = re.compile(r"\D([0-9]{4})[/_-]([0-9]{2})(?:\D|$)")

JSON_MODIFIED = re.compile(r'"dateModified": ?"([0-9]{4}-[0-9]{2}-[0-9]{2})', re.I)
JSON_PUBLISHED = re.compile(r'"datePublished": ?"([0-9]{4}-[0-9]{2}-[0-9]{2})', re.I)
TIMESTAMP_PATTERN = re.compile(
r"([0-9]{4}-[0-9]{2}-[0-9]{2}|[0-9]{2}\.[0-9]{2}\.[0-9]{4}).[0-9]{2}:[0-9]{2}:[0-9]{2}"
r"([0-9]{4}-[0-9]{2}-[0-9]{2}).[0-9]{2}:[0-9]{2}:[0-9]{2}"
)

# English, French, German, Indonesian and Turkish dates cache
@@ -159,26 +156,22 @@

TEXT_DATE_PATTERN = re.compile(r"[.:,_/ -]|^\d+$")


DISCARD_PATTERNS = re.compile(
r"^\d{2}:\d{2}(?: |:|$)|"
r"^\D*\d{4}\D*$|"
r"[$€¥Ұ£¢₽₱฿#₹]|" # currency symbols and special characters
r"[A-Z]{3}[^A-Z]|" # currency codes
r"(?:^|\D)(?:\+\d{2}|\d{3}|\d{5})\D|" # tel./IPs/postal codes
r"ftps?|https?|sftp|" # protocols
r"\.(com|net|org|info|gov|edu|de|fr|io)\b|" # TLDs
r"\.(?:com|net|org|info|gov|edu|de|fr|io)\b|" # TLDs
r"IBAN|[A-Z]{2}[0-9]{2}|" # bank accounts
r"®" # ©
)
# further testing required:
# \d[,.]\d+ # currency amounts
# leads to errors: ^\D+\d{3,}\D+

# use of regex module for speed?
TEXT_PATTERNS = re.compile(
r'(?:date[^0-9"]{,20}|updated|published) *?(?:in)? *?:? *?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
r"(?:Datum|Stand|[Vv]eröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
r'(?:date[^0-9"]{,20}|updated|published|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)", # TR
re.I,
@@ -248,28 +241,6 @@ def extract_url_date(
return None


def extract_partial_url_date(
testurl: str, outputformat: str, min_date: datetime, max_date: datetime
) -> Optional[str]:
"""Extract an approximate date out of an URL string in Y-M format"""
match = PARTIAL_URL.search(testurl)
if match:
dateresult = match[0] + "/01"
LOGGER.debug("found partial date in URL: %s", dateresult)
try:
dateobject = datetime(int(match[1]), int(match[2]), 1)
if (
date_validator(
dateobject, outputformat, earliest=min_date, latest=max_date
)
is True
):
return dateobject.strftime(outputformat)
except ValueError as err:
LOGGER.debug("conversion error: %s %s", dateresult, err)
return None


def correct_year(year: int) -> int:
"""Adapt year from YY to YYYY format"""
if year < 100:
@@ -461,7 +432,7 @@ def try_date_expr(
return None

# trim
string = " ".join(string.strip()[:MAX_TEXT_SIZE].split())
string = trim_text(string)[:MAX_SEGMENT_LEN]

# formal constraint: 4 to 18 digits
if not string or not 4 <= sum(map(str.isdigit, string)) <= 18:
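In try_date_expr the inline trimming becomes trim_text plus a hard cut at MAX_SEGMENT_LEN, placed in front of the pre-existing 4-to-18-digit constraint. A sketch of that gate (simplified, invented inputs):

    from htmldate.extractors import MAX_SEGMENT_LEN
    from htmldate.utils import trim_text

    def passes_gate(string: str) -> bool:
        # mirrors the first checks in try_date_expr, simplified
        string = trim_text(string)[:MAX_SEGMENT_LEN]
        return bool(string) and 4 <= sum(map(str.isdigit, string)) <= 18

    print(passes_gate("  19   October\n2023 "))        # True: 6 digits
    print(passes_gate("no digits here"))               # False: no digits
    print(passes_gate("order 123456789012345678901"))  # False: 21 digits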
4 changes: 2 additions & 2 deletions htmldate/settings.py
@@ -30,7 +30,7 @@
"embed",
"frame",
"frameset",
"figure",
"iframe",
"label",
"map",
"math",
@@ -42,4 +42,4 @@
"track",
"video",
]
# "iframe", "input", "layer", "param", "source"
# "figure", "input", "layer", "param", "source"
5 changes: 5 additions & 0 deletions htmldate/utils.py
@@ -226,3 +226,8 @@ def clean_html(tree: HtmlElement, elemlist: List[str]) -> HtmlElement:
except AttributeError: # pragma: no cover
element.getparent().remove(element)
return tree


def trim_text(string: str) -> str:
"Remove superfluous space and normalize remaining space."
return " ".join(string.split()).strip()
27 changes: 0 additions & 27 deletions tests/unit_tests.py
@@ -48,7 +48,6 @@
custom_parse,
discard_unwanted,
external_date_parser,
extract_partial_url_date,
regex_parse,
try_date_expr,
)
@@ -1229,13 +1228,6 @@ def test_url():
)
== "2012-11-29"
)
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
url="http://www.kreditwesen.org/widerstand-berlin/2012-11/keine-kurzung-bei-der-jugend-klubs-konnen-vorerst-aufatmen-bvv-beschliest-haushaltsplan/",
)
== "2012-11-01"
)
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
@@ -1250,29 +1242,10 @@
)
== "2019-06-26"
)
assert (
extract_partial_url_date(
"https://testsite.org/2018/01/test", "%Y-%m-%d", MIN_DATE, LATEST_POSSIBLE
)
== "2018-01-01"
)
assert (
extract_partial_url_date(
"https://testsite.org/2018/33/test", "%Y-%m-%d", MIN_DATE, LATEST_POSSIBLE
)
is None
)


def test_approximate_url():
"""test url parameter"""
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
url="http://example.com/blog/2016/07/key-words",
)
== "2016-07-01"
)
assert (
find_date(
"<html><body><p>Aaa, bbb.</p></body></html>",
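With extract_partial_url_date gone, URLs containing only a year and month no longer yield an approximate first-of-month date; only complete year-month-day URLs are still used, as the removed test assertions above show. A quick before/after check (invented URLs; the second call presumably returns None after this commit):

    from htmldate import find_date

    html = "<html><body><p>Aaa, bbb.</p></body></html>"

    # complete date in the URL: still extracted
    print(find_date(html, url="https://example.org/2019/06/26/post"))  # 2019-06-26

    # partial year/month URL: no longer approximated to "2016-07-01"
    print(find_date(html, url="https://example.org/2016/07/key-words"))  # None (expected)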
