-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
53c2680
commit b4ce0ae
Showing
8 changed files
with
228 additions
and
186 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from inscriptis import ParserConfig | ||
from inscriptis.model.canvas import Canvas | ||
|
||
|
||
class HtmlDocumentState:
    """Represents the state of the parsed html document.

    The state object is handed to the individual tag handlers, which read
    and update its attributes while the document is processed.

    Attributes:
        canvas: the :class:`Canvas` created for this document.
        config: the :class:`ParserConfig` in effect.
        css: shortcut to ``config.css``.
        apply_attributes: shortcut to
            ``config.attribute_handler.apply_attributes``.
        tags: stack of open tags; it starts with the ``body`` entry of
            ``css`` bound to ``canvas``.
        current_table: list, initially empty.
        li_counter: list, initially empty.
        last_caption: the most recently written image caption (``None``
            until a caption has been written).
        link_target: target of the currently open link; used if
            display_links is enabled.
    """

    def __init__(self, config: ParserConfig = None):
        """Initialize the document state.

        Args:
            config: the parser configuration to use. When omitted (``None``)
                a default-constructed :class:`ParserConfig` is used.
        """
        # ``None`` is the declared default, so fall back to a default
        # configuration instead of failing on ``config.css`` below.
        if config is None:
            config = ParserConfig()

        # instance variables
        self.canvas = Canvas()
        self.config = config
        self.css = config.css
        self.apply_attributes = config.attribute_handler.apply_attributes

        # the tag stack always starts with the document's body element
        self.tags = [self.css["body"].set_canvas(self.canvas)]
        self.current_table = []
        self.li_counter = []
        self.last_caption = None

        # used if display_links is enabled
        self.link_target = ""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
"""Handle the <a> tag.""" | ||
|
||
from inscriptis.model.html_document_state import HtmlDocumentState | ||
|
||
|
||
def a_start_handler(state: HtmlDocumentState, attrs):
    """Process an opening <a> tag.

    Determines the link target according to the configuration and stores it
    in ``state.link_target``. If a target was found, an opening bracket is
    written to the current tag.

    Args:
        state: the current document state.
        attrs: mapping of the tag's attributes.
    """
    settings = state.config

    # the href attribute is only considered when links are displayed
    target = attrs.get("href", "") if settings.display_links else ""

    # anchors serve as fallback when no href-based target is available
    if settings.display_anchors and not target:
        target = attrs.get("name", "")

    state.link_target = target
    if target:
        state.tags[-1].write("[")
|
||
|
||
def a_end_handler(state: HtmlDocumentState):
    """Process a closing </a> tag.

    Closes the bracket and appends the stored link target in parentheses;
    nothing is written when ``state.link_target`` is empty.

    Args:
        state: the current document state.
    """
    if not state.link_target:
        return

    state.tags[-1].write(f"]({state.link_target})")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
"""Handle the <br> tag.""" | ||
from inscriptis.model.html_document_state import HtmlDocumentState | ||
|
||
|
||
def br_start_handler(state: HtmlDocumentState, _):
    """Process a <br> tag by writing a newline to the current tag's canvas.

    Args:
        state: the current document state.
        _: the tag's attributes (unused).
    """
    current_tag = state.tags[-1]
    current_tag.canvas.write_newline()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
"""Handle the <img> tag.""" | ||
|
||
from inscriptis.model.html_document_state import HtmlDocumentState | ||
|
||
|
||
def img_start_handler(state: HtmlDocumentState, attrs):
    """Process an <img> tag.

    Writes the image's caption (its ``alt`` text or, failing that, its
    ``title``) in square brackets to the current tag and remembers it in
    ``state.last_caption``.

    Args:
        state: the current document state.
        attrs: mapping of the tag's attributes.
    """
    caption = attrs.get("alt", "") or attrs.get("title", "")

    # images without any caption text produce no output
    if not caption:
        return

    # skip captions that merely repeat the previously written one
    if state.config.deduplicate_captions and caption == state.last_caption:
        return

    state.tags[-1].write(f"[{caption}]")
    state.last_caption = caption
Oops, something went wrong.