diff --git a/CHANGES.rst b/CHANGES.rst
index c2c49745..0d9d458f 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -3,6 +3,9 @@ Version 2.1.4
 
 Unreleased
 
+-   Don't use regular expressions for ``striptags``, avoiding a performance
+    issue. :pr:`413`
+
 
 Version 2.1.3
 -------------
diff --git a/src/markupsafe/__init__.py b/src/markupsafe/__init__.py
index be67b8ba..714b66f7 100644
--- a/src/markupsafe/__init__.py
+++ b/src/markupsafe/__init__.py
@@ -1,5 +1,4 @@
 import functools
-import re
 import string
 import sys
 import typing as t
@@ -16,9 +15,6 @@ def __html__(self) -> str:
 
 __version__ = "2.1.4.dev"
 
-_strip_comments_re = re.compile(r"<!--.*?-->", re.DOTALL)
-_strip_tags_re = re.compile(r"<.*?>", re.DOTALL)
-
 
 def _simple_escaping_wrapper(func: "t.Callable[_P, str]") -> "t.Callable[_P, Markup]":
     @functools.wraps(func)
@@ -162,10 +158,41 @@ def striptags(self) -> str:
         >>> Markup("Main &raquo;\t<em>About</em>").striptags()
         'Main » About'
         """
-        # Use two regexes to avoid ambiguous matches.
-        value = _strip_comments_re.sub("", self)
-        value = _strip_tags_re.sub("", value)
-        value = " ".join(value.split())
+        # collapse spaces
+        value = " ".join(self.split())
+
+        # Look for comments then tags separately. Otherwise, a comment that
+        # contains a tag would end early, leaving some of the comment behind.
+
+        while True:
+            # keep finding comment start marks
+            start = value.find("<!--")
+
+            if start == -1:
+                break
+
+            # find a comment end mark beginning after the start
+            end = value.find("-->", start)
+
+            if end == -1:
+                break
+
+            value = f"{value[:start]}{value[end + 3:]}"
+
+        # remove tags using the same method
+        while True:
+            start = value.find("<")
+
+            if start == -1:
+                break
+
+            end = value.find(">", start)
+
+            if end == -1:
+                break
+
+            value = f"{value[:start]}{value[end + 1:]}"
+
         return self.__class__(value).unescape()
 
     @classmethod
diff --git a/tox.ini b/tox.ini
index be507721..79a91c73 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,6 @@
 [tox]
 envlist =
-    py3{12,11,10,9,8,7}
+    py3{12,11,10,9,8}
     pypy310
     style
     typing