wip: tag handler redesign.

weblyzard · Feb 6, 2024 · b4ce0ae · b4ce0ae
1 parent 53c2680
commit b4ce0ae
Show file tree

Hide file tree

Showing 8 changed files with 228 additions and 186 deletions.
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -7,10 +7,27 @@
 from lxml.etree import Comment
 
 from inscriptis.annotation import Annotation
-from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
 from inscriptis.model.canvas import Canvas
 from inscriptis.model.config import ParserConfig
-from inscriptis.model.table import Table, TableCell
+from inscriptis.model.html_document_state import HtmlDocumentState
+from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
+from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler
+from inscriptis.model.tag.br_tag import br_start_handler
+from inscriptis.model.tag.img_tag import img_start_handler
+from inscriptis.model.tag.list_tag import (
+    ul_start_handler,
+    ol_start_handler,
+    li_start_handler,
+    ul_end_handler,
+    ol_end_handler,
+)
+from inscriptis.model.tag.table_tag import (
+    table_start_handler,
+    tr_start_handler,
+    td_start_handler,
+    table_end_handler,
+    td_end_handler,
+)
 
 
 class Inscriptis:
@@ -35,90 +52,75 @@ class Inscriptis:
       text = parser.get_text()
     """
 
-    UL_COUNTER = ("* ", "+ ", "o ", "- ")
-    UL_COUNTER_LEN = len(UL_COUNTER)
-
     def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
         # use the default configuration, if no config object is provided
-        self.html_tree = html_tree
         self.config = config or ParserConfig()
 
         # setup start and end tag call tables
         self.start_tag_handler_dict = {
-            "table": self._start_table,
-            "tr": self._start_tr,
-            "td": self._start_td,
-            "th": self._start_td,
-            "ul": self._start_ul,
-            "ol": self._start_ol,
-            "li": self._start_li,
-            "br": self._newline,
-            "a": self._start_a if self.config.parse_a() else None,
-            "img": self._start_img if self.config.display_images else None,
+            "table": table_start_handler,
+            "tr": tr_start_handler,
+            "td": td_start_handler,
+            "th": td_start_handler,
+            "ul": ul_start_handler,
+            "ol": ol_start_handler,
+            "li": li_start_handler,
+            "br": br_start_handler,
+            "a": a_start_handler if self.config.parse_a() else None,
+            "img": img_start_handler if self.config.display_images else None,
         }
         self.end_tag_handler_dict = {
-            "table": self._end_table,
-            "ul": self._end_ul,
-            "ol": self._end_ol,
-            "td": self._end_td,
-            "th": self._end_td,
-            "a": self._end_a if self.config.parse_a() else None,
+            "table": table_end_handler,
+            "ul": ul_end_handler,
+            "ol": ol_end_handler,
+            "td": td_end_handler,
+            "th": td_end_handler,
+            "a": a_end_handler if self.config.parse_a() else None,
         }
 
-        # instance variables
-        self.canvas = Canvas()
-        self._css = self.config.css
-        self._apply_attributes = self.config.attribute_handler.apply_attributes
-
-        self.tags = [self._css["body"].set_canvas(self.canvas)]
-        self.current_table = []
-        self._li_counter = []
-        self._last_caption = None
-
-        # used if display_links is enabled
-        self._link_target = ""
+        # parse the HTML tree
+        state = HtmlDocumentState(config)
+        self.canvas = self._parse_html_tree(state, html_tree)
 
-    def _parse_html_tree(self, tree):
+    def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
         """Parse the HTML tree.
 
         Args:
             tree: the HTML tree to parse.
         """
         if isinstance(tree.tag, str):
-            self.handle_starttag(tree.tag, tree.attrib)
-            cur = self.tags[-1]
+            self.handle_starttag(state, tree.tag, tree.attrib)
+            cur = state.tags[-1]
             cur.canvas.open_tag(cur)
 
-            self.tags[-1].write(tree.text)
+            state.tags[-1].write(tree.text)
 
             for node in tree:
-                self._parse_html_tree(node)
+                self._parse_html_tree(state, node)
 
-            self.handle_endtag(tree.tag)
-            prev = self.tags.pop()
+            # handle the endtag
+            if handler := self.end_tag_handler_dict.get(tree.tag):
+                handler(state)
+            prev = state.tags.pop()
             prev.canvas.close_tag(prev)
 
             # write the tail text to the element's container
-            self.tags[-1].write(tree.tail)
+            state.tags[-1].write(tree.tail)
 
         elif tree.tag is Comment and tree.tail:
-            self.tags[-1].canvas.write(self.tags[-1], tree.tail)
+            state.tags[-1].canvas.write(state.tags[-1], tree.tail)
+
+        return state.canvas
 
     def get_text(self) -> str:
         """Return the text extracted from the HTML page."""
-        self._parse_html_tree(self.html_tree)
         return self.canvas.get_text()
 
     def get_annotations(self) -> List[Annotation]:
         """Return the annotations extracted from the HTML page."""
-        if not self.canvas.get_text():
-            raise ValueError(
-                "No text to annotate available yet. "
-                "Have you already parsed the page with get_text?"
-            )
         return self.canvas.annotations
 
-    def handle_starttag(self, tag, attrs):
+    def handle_starttag(self, state, tag, attrs, handler):
         """Handle HTML start tags.
 
         Compute the style of the current :class:`HtmlElement`, based on
@@ -135,135 +137,15 @@ def handle_starttag(self, tag, attrs):
           attrs: a dictionary of HTML attributes and their respective values.
         """
         # use the css to handle tags known to it :)
-        cur = self.tags[-1].get_refined_html_element(
-            self._apply_attributes(
+        cur = state.tags[-1].get_refined_html_element(
+            state.apply_attributes(
                 attrs,
-                html_element=self._css.get(tag, DEFAULT_HTML_ELEMENT)
+                html_element=state.css.get(tag, DEFAULT_HTML_ELEMENT)
                 .__copy__()
                 .set_tag(tag),
             )
         )
-        self.tags.append(cur)
+        state.tags.append(cur)
 
-        handler = self.start_tag_handler_dict.get(tag)
         if handler:
             handler(attrs)
-
-    def handle_endtag(self, tag):
-        """Handle HTML end tags.
-
-        Look up the handler for closing the tag in :attr:`end_tag_handler_dict`
-        and execute it, if available.
-
-        Args:
-          tag: the HTML end tag to process.
-        """
-        handler = self.end_tag_handler_dict.get(tag)
-        if handler:
-            handler()
-
-    def _start_ul(self, _):
-        self._li_counter.append(self.get_bullet())
-
-    def _end_ul(self):
-        self._li_counter.pop()
-
-    def _start_img(self, attrs):
-        image_text = attrs.get("alt", "") or attrs.get("title", "")
-        if image_text and not (
-            self.config.deduplicate_captions and image_text == self._last_caption
-        ):
-            self.tags[-1].write(f"[{image_text}]")
-            self._last_caption = image_text
-
-    def _start_a(self, attrs):
-        self._link_target = ""
-        if self.config.display_links:
-            self._link_target = attrs.get("href", "")
-        if self.config.display_anchors:
-            self._link_target = self._link_target or attrs.get("name", "")
-
-        if self._link_target:
-            self.tags[-1].write("[")
-
-    def _end_a(self):
-        if self._link_target:
-            self.tags[-1].write(f"]({self._link_target})")
-
-    def _start_ol(self, _):
-        self._li_counter.append(1)
-
-    def _end_ol(self):
-        self._li_counter.pop()
-
-    def _start_li(self, _):
-        bullet = self._li_counter[-1] if self._li_counter else "* "
-        if isinstance(bullet, int):
-            self._li_counter[-1] += 1
-            self.tags[-1].list_bullet = f"{bullet}. "
-        else:
-            self.tags[-1].list_bullet = bullet
-
-        self.tags[-1].write("")
-
-    def _start_table(self, _):
-        self.tags[-1].set_canvas(Canvas())
-        self.current_table.append(
-            Table(
-                left_margin_len=self.tags[-1].canvas.left_margin,
-                cell_separator=self.config.table_cell_separator,
-            )
-        )
-
-    def _start_tr(self, _):
-        if self.current_table:
-            self.current_table[-1].add_row()
-
-    def _start_td(self, _):
-        if self.current_table:
-            # open td tag
-            table_cell = TableCell(
-                align=self.tags[-1].align, valign=self.tags[-1].valign
-            )
-            self.tags[-1].canvas = table_cell
-            self.current_table[-1].add_cell(table_cell)
-
-    def _end_td(self):
-        if self.current_table:
-            self.tags[-1].canvas.close_tag(self.tags[-1])
-
-    def _end_table(self):
-        if self.current_table:
-            self._end_td()
-        table = self.current_table.pop()
-        # last tag before the table: self.tags[-2]
-        # table tag: self.tags[-1]
-
-        out_of_table_text = self.tags[-1].canvas.get_text().strip()
-        if out_of_table_text:
-            self.tags[-2].write(out_of_table_text)
-            self.tags[-2].canvas.write_newline()
-
-        start_idx = self.tags[-2].canvas.current_block.idx
-        self.tags[-2].write_verbatim_text(table.get_text())
-        self.tags[-2].canvas._flush_inline()
-
-        # transfer annotations from the current tag
-        if self.tags[-1].annotation:
-            end_idx = self.tags[-2].canvas.current_block.idx
-            for a in self.tags[-1].annotation:
-                self.tags[-2].canvas.annotations.append(
-                    Annotation(start_idx, end_idx, a)
-                )
-
-        # transfer in-table annotations
-        self.tags[-2].canvas.annotations.extend(
-            table.get_annotations(start_idx, self.tags[-2].canvas.left_margin)
-        )
-
-    def _newline(self, _):
-        self.tags[-1].canvas.write_newline()
-
-    def get_bullet(self) -> str:
-        """Return the bullet that correspond to the given index."""
-        return Inscriptis.UL_COUNTER[len(self._li_counter) % Inscriptis.UL_COUNTER_LEN]
diff --git a/src/inscriptis/model/html_document_state.py b/src/inscriptis/model/html_document_state.py
@@ -0,0 +1,21 @@
+from inscriptis import ParserConfig
+from inscriptis.model.canvas import Canvas
+
+
+class HtmlDocumentState:
+    """Represents the state of the parsed html document."""
+
+    def __init__(self, config: ParserConfig = None):
+        # instance variables
+        self.canvas = Canvas()
+        self.config = config
+        self.css = config.css
+        self.apply_attributes = config.attribute_handler.apply_attributes
+
+        self.tags = [self.css["body"].set_canvas(self.canvas)]
+        self.current_table = []
+        self.li_counter = []
+        self.last_caption = None
+
+        # used if display_links is enabled
+        self.link_target = ""
diff --git a/src/inscriptis/model/tag/a_tag.py b/src/inscriptis/model/tag/a_tag.py
@@ -0,0 +1,21 @@
+"""Handle the <a> tag."""
+
+from inscriptis.model.html_document_state import HtmlDocumentState
+
+
+def a_start_handler(state: HtmlDocumentState, attrs):
+    """Handle the <a> tag."""
+    state.link_target = ""
+    if state.config.display_links:
+        state.link_target = attrs.get("href", "")
+    if state.config.display_anchors:
+        state.link_target = state.link_target or attrs.get("name", "")
+
+    if state.link_target:
+        state.tags[-1].write("[")
+
+
+def a_end_handler(state: HtmlDocumentState):
+    """Handle the </a> tag."""
+    if state.link_target:
+        state.tags[-1].write(f"]({state.link_target})")
diff --git a/src/inscriptis/model/tag/br_tag.py b/src/inscriptis/model/tag/br_tag.py
@@ -0,0 +1,7 @@
+"""Handle the <br> tag."""
+from inscriptis.model.html_document_state import HtmlDocumentState
+
+
+def br_start_handler(state: HtmlDocumentState, _):
+    """Handle the <br> tag."""
+    state.tags[-1].canvas.write_newline()
diff --git a/src/inscriptis/model/tag/img_tag.py b/src/inscriptis/model/tag/img_tag.py
@@ -0,0 +1,13 @@
+"""Handle the <img> tag."""
+
+from inscriptis.model.html_document_state import HtmlDocumentState
+
+
+def img_start_handler(state: HtmlDocumentState, attrs):
+    """Handle the <img> tag."""
+    image_text = attrs.get("alt", "") or attrs.get("title", "")
+    if image_text and not (
+        state.config.deduplicate_captions and image_text == state.last_caption
+    ):
+        state.tags[-1].write(f"[{image_text}]")
+        state.last_caption = image_text