Skip to content

Commit

Permalink
wip: tag handler redesign.
Browse files Browse the repository at this point in the history
  • Loading branch information
AlbertWeichselbraun committed Feb 6, 2024
1 parent 53c2680 commit b4ce0ae
Show file tree
Hide file tree
Showing 8 changed files with 228 additions and 186 deletions.
230 changes: 56 additions & 174 deletions src/inscriptis/html_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,27 @@
from lxml.etree import Comment

from inscriptis.annotation import Annotation
from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
from inscriptis.model.canvas import Canvas
from inscriptis.model.config import ParserConfig
from inscriptis.model.table import Table, TableCell
from inscriptis.model.html_document_state import HtmlDocumentState
from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler
from inscriptis.model.tag.br_tag import br_start_handler
from inscriptis.model.tag.img_tag import img_start_handler
from inscriptis.model.tag.list_tag import (
ul_start_handler,
ol_start_handler,
li_start_handler,
ul_end_handler,
ol_end_handler,
)
from inscriptis.model.tag.table_tag import (
table_start_handler,
tr_start_handler,
td_start_handler,
table_end_handler,
td_end_handler,
)


class Inscriptis:
Expand All @@ -35,90 +52,75 @@ class Inscriptis:
text = parser.get_text()
"""

UL_COUNTER = ("* ", "+ ", "o ", "- ")
UL_COUNTER_LEN = len(UL_COUNTER)

def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
# use the default configuration, if no config object is provided
self.html_tree = html_tree
self.config = config or ParserConfig()

# setup start and end tag call tables
self.start_tag_handler_dict = {
"table": self._start_table,
"tr": self._start_tr,
"td": self._start_td,
"th": self._start_td,
"ul": self._start_ul,
"ol": self._start_ol,
"li": self._start_li,
"br": self._newline,
"a": self._start_a if self.config.parse_a() else None,
"img": self._start_img if self.config.display_images else None,
"table": table_start_handler,
"tr": tr_start_handler,
"td": td_start_handler,
"th": td_start_handler,
"ul": ul_start_handler,
"ol": ol_start_handler,
"li": li_start_handler,
"br": br_start_handler,
"a": a_start_handler if self.config.parse_a() else None,
"img": img_start_handler if self.config.display_images else None,
}
self.end_tag_handler_dict = {
"table": self._end_table,
"ul": self._end_ul,
"ol": self._end_ol,
"td": self._end_td,
"th": self._end_td,
"a": self._end_a if self.config.parse_a() else None,
"table": table_end_handler,
"ul": ul_end_handler,
"ol": ol_end_handler,
"td": td_end_handler,
"th": td_end_handler,
"a": a_end_handler if self.config.parse_a() else None,
}

# instance variables
self.canvas = Canvas()
self._css = self.config.css
self._apply_attributes = self.config.attribute_handler.apply_attributes

self.tags = [self._css["body"].set_canvas(self.canvas)]
self.current_table = []
self._li_counter = []
self._last_caption = None

# used if display_links is enabled
self._link_target = ""
# parse the HTML tree
state = HtmlDocumentState(config)
self.canvas = self._parse_html_tree(state, html_tree)

def _parse_html_tree(self, tree):
def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
"""Parse the HTML tree.
Args:
tree: the HTML tree to parse.
"""
if isinstance(tree.tag, str):
self.handle_starttag(tree.tag, tree.attrib)
cur = self.tags[-1]
self.handle_starttag(state, tree.tag, tree.attrib)
cur = state.tags[-1]
cur.canvas.open_tag(cur)

self.tags[-1].write(tree.text)
state.tags[-1].write(tree.text)

for node in tree:
self._parse_html_tree(node)
self._parse_html_tree(state, node)

self.handle_endtag(tree.tag)
prev = self.tags.pop()
# handle the endtag
if handler := self.end_tag_handler_dict.get(tree.tag):
handler(state)
prev = state.tags.pop()
prev.canvas.close_tag(prev)

# write the tail text to the element's container
self.tags[-1].write(tree.tail)
state.tags[-1].write(tree.tail)

elif tree.tag is Comment and tree.tail:
self.tags[-1].canvas.write(self.tags[-1], tree.tail)
state.tags[-1].canvas.write(state.tags[-1], tree.tail)

return state.canvas

def get_text(self) -> str:
"""Return the text extracted from the HTML page."""
self._parse_html_tree(self.html_tree)
return self.canvas.get_text()

def get_annotations(self) -> List[Annotation]:
"""Return the annotations extracted from the HTML page."""
if not self.canvas.get_text():
raise ValueError(
"No text to annotate available yet. "
"Have you already parsed the page with get_text?"
)
return self.canvas.annotations

def handle_starttag(self, tag, attrs):
def handle_starttag(self, state, tag, attrs, handler):
"""Handle HTML start tags.
Compute the style of the current :class:`HtmlElement`, based on
Expand All @@ -135,135 +137,15 @@ def handle_starttag(self, tag, attrs):
attrs: a dictionary of HTML attributes and their respective values.
"""
# use the css to handle tags known to it :)
cur = self.tags[-1].get_refined_html_element(
self._apply_attributes(
cur = state.tags[-1].get_refined_html_element(
state.apply_attributes(
attrs,
html_element=self._css.get(tag, DEFAULT_HTML_ELEMENT)
html_element=state.css.get(tag, DEFAULT_HTML_ELEMENT)
.__copy__()
.set_tag(tag),
)
)
self.tags.append(cur)
state.tags.append(cur)

handler = self.start_tag_handler_dict.get(tag)
if handler:
handler(attrs)

def handle_endtag(self, tag):
"""Handle HTML end tags.
Look up the handler for closing the tag in :attr:`end_tag_handler_dict`
and execute it, if available.
Args:
tag: the HTML end tag to process.
"""
handler = self.end_tag_handler_dict.get(tag)
if handler:
handler()

def _start_ul(self, _):
self._li_counter.append(self.get_bullet())

def _end_ul(self):
self._li_counter.pop()

def _start_img(self, attrs):
image_text = attrs.get("alt", "") or attrs.get("title", "")
if image_text and not (
self.config.deduplicate_captions and image_text == self._last_caption
):
self.tags[-1].write(f"[{image_text}]")
self._last_caption = image_text

def _start_a(self, attrs):
self._link_target = ""
if self.config.display_links:
self._link_target = attrs.get("href", "")
if self.config.display_anchors:
self._link_target = self._link_target or attrs.get("name", "")

if self._link_target:
self.tags[-1].write("[")

def _end_a(self):
if self._link_target:
self.tags[-1].write(f"]({self._link_target})")

def _start_ol(self, _):
self._li_counter.append(1)

def _end_ol(self):
self._li_counter.pop()

def _start_li(self, _):
bullet = self._li_counter[-1] if self._li_counter else "* "
if isinstance(bullet, int):
self._li_counter[-1] += 1
self.tags[-1].list_bullet = f"{bullet}. "
else:
self.tags[-1].list_bullet = bullet

self.tags[-1].write("")

def _start_table(self, _):
self.tags[-1].set_canvas(Canvas())
self.current_table.append(
Table(
left_margin_len=self.tags[-1].canvas.left_margin,
cell_separator=self.config.table_cell_separator,
)
)

def _start_tr(self, _):
if self.current_table:
self.current_table[-1].add_row()

def _start_td(self, _):
if self.current_table:
# open td tag
table_cell = TableCell(
align=self.tags[-1].align, valign=self.tags[-1].valign
)
self.tags[-1].canvas = table_cell
self.current_table[-1].add_cell(table_cell)

def _end_td(self):
if self.current_table:
self.tags[-1].canvas.close_tag(self.tags[-1])

def _end_table(self):
if self.current_table:
self._end_td()
table = self.current_table.pop()
# last tag before the table: self.tags[-2]
# table tag: self.tags[-1]

out_of_table_text = self.tags[-1].canvas.get_text().strip()
if out_of_table_text:
self.tags[-2].write(out_of_table_text)
self.tags[-2].canvas.write_newline()

start_idx = self.tags[-2].canvas.current_block.idx
self.tags[-2].write_verbatim_text(table.get_text())
self.tags[-2].canvas._flush_inline()

# transfer annotations from the current tag
if self.tags[-1].annotation:
end_idx = self.tags[-2].canvas.current_block.idx
for a in self.tags[-1].annotation:
self.tags[-2].canvas.annotations.append(
Annotation(start_idx, end_idx, a)
)

# transfer in-table annotations
self.tags[-2].canvas.annotations.extend(
table.get_annotations(start_idx, self.tags[-2].canvas.left_margin)
)

def _newline(self, _):
self.tags[-1].canvas.write_newline()

def get_bullet(self) -> str:
"""Return the bullet that correspond to the given index."""
return Inscriptis.UL_COUNTER[len(self._li_counter) % Inscriptis.UL_COUNTER_LEN]
21 changes: 21 additions & 0 deletions src/inscriptis/model/html_document_state.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from inscriptis import ParserConfig
from inscriptis.model.canvas import Canvas


class HtmlDocumentState:
    """Represents the state of the parsed html document.

    The object bundles everything the tag handlers need to share while the
    HTML tree is traversed, so that the handlers can be plain functions that
    receive the state as their first argument.

    Args:
        config: the parser configuration to use. If ``None`` is passed (or
            the argument is omitted), a default ``ParserConfig()`` is used.
    """

    def __init__(self, config: ParserConfig = None):
        # The signature allows ``None``; fall back to the default
        # configuration instead of failing on ``None.css`` below.
        if config is None:
            config = ParserConfig()

        # instance variables
        self.canvas = Canvas()
        self.config = config
        self.css = config.css
        self.apply_attributes = config.attribute_handler.apply_attributes

        # stack of the currently open elements; seeded with the CSS ``body``
        # element after it has been bound to the document canvas.
        self.tags = [self.css["body"].set_canvas(self.canvas)]
        # presumably stacks used by the table and list handlers
        # (nested tables / nested lists) -- confirm against those handlers.
        self.current_table = []
        self.li_counter = []
        # caption of the last image written; read by the <img> handler when
        # ``deduplicate_captions`` is enabled.
        self.last_caption = None

        # used if display_links is enabled
        self.link_target = ""
21 changes: 21 additions & 0 deletions src/inscriptis/model/tag/a_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Handle the <a> tag."""

from inscriptis.model.html_document_state import HtmlDocumentState


def a_start_handler(state: HtmlDocumentState, attrs):
    """Handle the opening <a> tag.

    Determines the link target from the tag attributes and stores it in
    ``state.link_target``. The ``href`` attribute is only considered if
    ``display_links`` is set; the ``name`` attribute is used as a fallback
    if ``display_anchors`` is set. When a target was found, an opening
    ``[`` is written to the current tag.

    Args:
        state: the document state shared between the tag handlers.
        attrs: mapping of the tag's HTML attributes.
    """
    settings = state.config

    target = attrs.get("href", "") if settings.display_links else ""
    if settings.display_anchors:
        # only fall back to the anchor name if no href has been selected
        target = target or attrs.get("name", "")

    state.link_target = target
    if target:
        state.tags[-1].write("[")


def a_end_handler(state: HtmlDocumentState):
    """Handle the closing </a> tag.

    Writes ``](<target>)`` to the current tag if ``state.link_target``
    holds a non-empty target; otherwise nothing is written.

    Args:
        state: the document state shared between the tag handlers.
    """
    target = state.link_target
    if not target:
        return
    state.tags[-1].write(f"]({target})")
7 changes: 7 additions & 0 deletions src/inscriptis/model/tag/br_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Handle the <br> tag."""
from inscriptis.model.html_document_state import HtmlDocumentState


def br_start_handler(state: HtmlDocumentState, _):
    """Handle the <br> tag.

    Writes a newline to the canvas of the current (innermost) tag. The
    tag's attributes are ignored.

    Args:
        state: the document state shared between the tag handlers.
        _: the tag's HTML attributes (unused).
    """
    innermost = state.tags[-1]
    innermost.canvas.write_newline()
13 changes: 13 additions & 0 deletions src/inscriptis/model/tag/img_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Handle the <img> tag."""

from inscriptis.model.html_document_state import HtmlDocumentState


def img_start_handler(state: HtmlDocumentState, attrs):
    """Handle the <img> tag.

    Uses the ``alt`` attribute (or ``title`` if ``alt`` is missing or empty)
    as the image caption and writes it as ``[caption]`` to the current tag.
    If ``deduplicate_captions`` is set, a caption equal to the previously
    written one is skipped. ``state.last_caption`` is updated whenever a
    caption is written.

    Args:
        state: the document state shared between the tag handlers.
        attrs: mapping of the tag's HTML attributes.
    """
    caption = attrs.get("alt", "") or attrs.get("title", "")
    if not caption:
        return

    # suppress a caption that merely repeats the previous one
    if state.config.deduplicate_captions and caption == state.last_caption:
        return

    state.tags[-1].write(f"[{caption}]")
    state.last_caption = caption
Loading

0 comments on commit b4ce0ae

Please sign in to comment.