Skip to content

Commit

Permalink
improve striptags performance
Browse files Browse the repository at this point in the history
  • Loading branch information
davidism committed Jan 19, 2024
1 parent 4c397ef commit 750e22b
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 9 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ Version 2.1.4

Unreleased

- Don't use regular expressions for ``striptags``, avoiding a performance
issue. :pr:`413`


Version 2.1.3
-------------
Expand Down
43 changes: 35 additions & 8 deletions src/markupsafe/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import functools
import re
import string
import sys
import typing as t
Expand All @@ -16,9 +15,6 @@ def __html__(self) -> str:

__version__ = "2.1.4.dev"

_strip_comments_re = re.compile(r"<!--.*?-->", re.DOTALL)
_strip_tags_re = re.compile(r"<.*?>", re.DOTALL)


def _simple_escaping_wrapper(func: "t.Callable[_P, str]") -> "t.Callable[_P, Markup]":
@functools.wraps(func)
Expand Down Expand Up @@ -162,10 +158,41 @@ def striptags(self) -> str:
>>> Markup("Main &raquo;\t<em>About</em>").striptags()
'Main » About'
"""
# Use two regexes to avoid ambiguous matches.
value = _strip_comments_re.sub("", self)
value = _strip_tags_re.sub("", value)
value = " ".join(value.split())
# collapse spaces
value = " ".join(self.split())

# Look for comments then tags separately. Otherwise, a comment that
# contains a tag would end early, leaving some of the comment behind.

while True:
# keep finding comment start marks
start = value.find("<!--")

if start == -1:
break

# find a comment end mark beyond the start, otherwise stop
end = value.find("-->", start)

if end == -1:
break

value = f"{value[:start]}{value[end + 3:]}"

# remove tags using the same method
while True:
start = value.find("<")

if start == -1:
break

end = value.find(">", start)

if end == -1:
break

value = f"{value[:start]}{value[end + 1:]}"

return self.__class__(value).unescape()

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tox]
envlist =
py3{12,11,10,9,8,7}
py3{12,11,10,9,8}
pypy310
style
typing
Expand Down

0 comments on commit 750e22b

Please sign in to comment.