Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Convert tags, skip_tags, recognized_tags to sets; fix doctests; f-strings #694

Merged
merged 5 commits into from
Jan 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 123 additions & 119 deletions bleach/html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,127 +81,129 @@

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
"a",
"abbr",
"address",
"area",
"article",
"aside",
"audio",
"b",
"base",
"bdi",
"bdo",
"blockquote",
"body",
"br",
"button",
"canvas",
"caption",
"cite",
"code",
"col",
"colgroup",
"data",
"datalist",
"dd",
"del",
"details",
"dfn",
"dialog",
"div",
"dl",
"dt",
"em",
"embed",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hgroup",
"hr",
"html",
"i",
"iframe",
"img",
"input",
"ins",
"kbd",
"keygen",
"label",
"legend",
"li",
"link",
"map",
"mark",
"menu",
"meta",
"meter",
"nav",
"noscript",
"object",
"ol",
"optgroup",
"option",
"output",
"p",
"param",
"picture",
"pre",
"progress",
"q",
"rp",
"rt",
"ruby",
"s",
"samp",
"script",
"section",
"select",
"slot",
"small",
"source",
"span",
"strong",
"style",
"sub",
"summary",
"sup",
"table",
"tbody",
"td",
"template",
"textarea",
"tfoot",
"th",
"thead",
"time",
"title",
"tr",
"track",
"u",
"ul",
"var",
"video",
"wbr",
]
HTML_TAGS = frozenset(
(
"a",
"abbr",
"address",
"area",
"article",
"aside",
"audio",
"b",
"base",
"bdi",
"bdo",
"blockquote",
"body",
"br",
"button",
"canvas",
"caption",
"cite",
"code",
"col",
"colgroup",
"data",
"datalist",
"dd",
"del",
"details",
"dfn",
"dialog",
"div",
"dl",
"dt",
"em",
"embed",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hgroup",
"hr",
"html",
"i",
"iframe",
"img",
"input",
"ins",
"kbd",
"keygen",
"label",
"legend",
"li",
"link",
"map",
"mark",
"menu",
"meta",
"meter",
"nav",
"noscript",
"object",
"ol",
"optgroup",
"option",
"output",
"p",
"param",
"picture",
"pre",
"progress",
"q",
"rp",
"rt",
"ruby",
"s",
"samp",
"script",
"section",
"select",
"slot",
"small",
"source",
"span",
"strong",
"style",
"sub",
"summary",
"sup",
"table",
"tbody",
"td",
"template",
"textarea",
"tfoot",
"th",
"thead",
"time",
"title",
"tr",
"track",
"u",
"ul",
"var",
"video",
"wbr",
)
)


#: Set of block-level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
[
(
"address",
"article",
"aside",
Expand Down Expand Up @@ -235,7 +237,7 @@
"section",
"table",
"ul",
]
)
)


Expand Down Expand Up @@ -476,15 +478,17 @@ class BleachHTMLParser(HTMLParser):

def __init__(self, tags, strip, consume_entities, **kwargs):
"""
:arg tags: list of allowed tags--everything else is either stripped or
:arg tags: set of allowed tags--everything else is either stripped or
escaped; if None, then this doesn't look at tags at all
:arg strip: whether to strip disallowed tags (True) or escape them (False);
if tags=None, then this doesn't have any effect
:arg consume_entities: whether to consume entities (default behavior) or
leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

"""
self.tags = [tag.lower() for tag in tags] if tags is not None else None
self.tags = (
frozenset((tag.lower() for tag in tags)) if tags is not None else None
)
self.strip = strip
self.consume_entities = consume_entities
super().__init__(**kwargs)
Expand Down Expand Up @@ -694,7 +698,7 @@ def escape_base_amp(self, stoken):
# Only leave entities in that are not ambiguous. If they're
# ambiguous, then we escape the ampersand.
if entity is not None and convert_entity(entity) is not None:
yield "&" + entity + ";"
yield f"&{entity};"

# Length of the entity plus 2--one for & at the beginning
# and one for ; at the end
Expand Down
17 changes: 9 additions & 8 deletions bleach/linkifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,18 @@ def __init__(
:arg list callbacks: list of callbacks to run when adjusting tag attributes;
defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

:arg list skip_tags: list of tags that you don't want to linkify the
contents of; for example, you could set this to ``['pre']`` to skip
linkifying contents of ``pre`` tags
:arg set skip_tags: set of tags that you don't want to linkify the
contents of; for example, you could set this to ``{'pre'}`` to skip
linkifying contents of ``pre`` tags; ``None`` means you don't
want linkify to skip any tags

:arg bool parse_email: whether or not to linkify email addresses

:arg url_re: url matching regex

:arg email_re: email matching regex

:arg list recognized_tags: the list of tags that linkify knows about;
:arg set recognized_tags: the set of tags that linkify knows about;
everything else gets escaped

:returns: linkified text as unicode
Expand All @@ -145,7 +146,7 @@ def __init__(
# Create a parser/tokenizer that allows all HTML tags and escapes
# anything not in that list.
self.parser = html5lib_shim.BleachHTMLParser(
tags=recognized_tags,
tags=frozenset(recognized_tags),
strip=False,
consume_entities=False,
namespaceHTMLElements=False,
Expand Down Expand Up @@ -221,8 +222,8 @@ def __init__(
:arg list callbacks: list of callbacks to run when adjusting tag attributes;
defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

:arg list skip_tags: list of tags that you don't want to linkify the
contents of; for example, you could set this to ``['pre']`` to skip
:arg set skip_tags: set of tags that you don't want to linkify the
contents of; for example, you could set this to ``{'pre'}`` to skip
linkifying contents of ``pre`` tags

:arg bool parse_email: whether or not to linkify email addresses
Expand All @@ -235,7 +236,7 @@ def __init__(
super().__init__(source)

self.callbacks = callbacks or []
self.skip_tags = skip_tags or []
self.skip_tags = skip_tags or {}
self.parse_email = parse_email

self.url_re = url_re
Expand Down
Loading