From f6afda7fa8f94317c56b21eec15027295652a834 Mon Sep 17 00:00:00 2001 From: Angelo Gladding Date: Thu, 30 Nov 2023 01:27:55 -0800 Subject: [PATCH] Add language support (#210) Add language details to embedded properties and root microformats using the following order of specificity: - embedded properties (class=e-* lang=..) - root microformats (class=h-* lang=..) - document root (<html lang=..>) --- CHANGELOG.md | 1 + mf2py/parse_property.py | 11 +++++++++-- mf2py/parser.py | 17 +++++++++++++---- test/examples/language.html | 15 +++++++++++++++ test/test_parser.py | 11 +++++++++++ 5 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 test/examples/language.html diff --git a/CHANGELOG.md b/CHANGELOG.md index b100367..996773a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. - make relative URLs in e-* properties absolute (#201) - fix whitespace in plaintext conversion (#207) - add srcset support (#209) +- add language support (#210) ## 1.1.3 - 2023-06-28 - reduce instances where photo is implied (#135) diff --git a/mf2py/parse_property.py b/mf2py/parse_property.py index 286e621..30c565a 100644 --- a/mf2py/parse_property.py +++ b/mf2py/parse_property.py @@ -94,13 +94,20 @@ def datetime(el, default_date=None): ) -def embedded(el, base_url=""): +def embedded(el, root_lang, document_lang, base_url=""): """Process e-* properties""" for tag in el.find_all(): for attr in ("href", "src", "cite", "data", "poster"): if attr in tag.attrs: tag.attrs[attr] = try_urljoin(base_url, tag.attrs[attr]) - return { + prop_value = { "html": el.decode_contents().strip(), # secret bs4 method to get innerHTML "value": get_textContent(el, replace_img=True, base_url=base_url), } + if lang := el.attrs.get("lang"): + prop_value["lang"] = lang + elif root_lang: + prop_value["lang"] = root_lang + elif document_lang: + prop_value["lang"] = document_lang + return prop_value diff --git a/mf2py/parser.py 
b/mf2py/parser.py index 414ede6..5cba036 100644 --- a/mf2py/parser.py +++ b/mf2py/parser.py @@ -68,6 +68,7 @@ def __init__(self, doc=None, url=None, html_parser=None): "version": __version__, }, } + self.lang = None # use default parser if none specified self.__html_parser__ = html_parser or "html5lib" @@ -128,6 +129,8 @@ def __init__(self, doc=None, url=None, html_parser=None): self.__url__ = try_urljoin(self.__url__, poss_base_url) if self.__doc__ is not None: + if document := self.__doc__.find("html"): + self.lang = document.attrs.get("lang") # parse! self.parse() @@ -161,13 +164,15 @@ def handle_microformat( el = backcompat.apply_rules(el, self.__html_parser__) root_class_names = mf2_classes.root(el.get("class", [])) + root_lang = el.attrs.get("lang") + # parse for properties and children for child in get_children(el): ( child_props, child_children, child_parsed_types_aggregation, - ) = parse_props(child) + ) = parse_props(child, root_lang) for key, new_value in child_props.items(): prop_value = properties.get(key, []) prop_value.extend(new_value) @@ -239,9 +244,13 @@ def handle_microformat( else: microformat["value"] = simple_value + if root_lang: + microformat["lang"] = root_lang + elif self.lang: + microformat["lang"] = self.lang return microformat - def parse_props(el): + def parse_props(el, root_lang): """Parse the properties from a single element""" props = {} children = [] @@ -363,7 +372,7 @@ def parse_props(el): embedded_el = copy.copy(embedded_el) temp_fixes.rm_templates(embedded_el) e_value = parse_property.embedded( - embedded_el, base_url=self.__url__ + embedded_el, root_lang, self.lang, base_url=self.__url__ ) if root_class_names: @@ -394,7 +403,7 @@ def parse_props(el): child_properties, child_microformats, child_parsed_types_aggregation, - ) = parse_props(child) + ) = parse_props(child, root_lang) for prop_name in child_properties: v = props.get(prop_name, []) v.extend(child_properties[prop_name]) diff --git a/test/examples/language.html 
b/test/examples/language.html new file mode 100644 index 0000000..79273ee --- /dev/null +++ b/test/examples/language.html @@ -0,0 +1,15 @@ + +
+

Romero

+
+
+

Un titolo italiano

+
With an english summary
+
Con un riassunto italiano
+
+
+

En svensk titel

+
With an english summary
+
Och svensk huvudtext
+
+ diff --git a/test/test_parser.py b/test/test_parser.py index 2dc2f6a..9f2f53a 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -1116,3 +1116,14 @@ def test_all_u_cases(): make_labelled_cmp("all_u_cases_" + str(i))( "http://example.com/test", result["items"][0]["properties"]["url"][i] ) + + +def test_language(): + result = parse_fixture("language.html") + assert result["items"][0]["lang"] == "it" + assert result["items"][1]["lang"] == "it" + assert result["items"][1]["properties"]["content"][0]["lang"] == "en" + assert result["items"][1]["properties"]["content"][1]["lang"] == "it" + assert result["items"][2]["lang"] == "sv" + assert result["items"][2]["properties"]["content"][0]["lang"] == "en" + assert result["items"][2]["properties"]["content"][1]["lang"] == "sv"