From 524e4daa56ead7e551f2c9c0d5ea9460884ad521 Mon Sep 17 00:00:00 2001 From: Eric Norige <127622562+eanorige@users.noreply.github.com> Date: Thu, 26 Oct 2023 11:54:38 -0700 Subject: [PATCH] Speedup line_offset property (#1392) * Replace dynamic regex with string find operation * Add cache of where each line starts so we don't have quadratic behavior identifying line numbers when importing large chunks of html --- docs/changelog.md | 6 ++++++ markdown/htmlparser.py | 22 +++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 2f9e9250..614177c6 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -8,6 +8,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). See the [Contributing Guide](contributing.md) for details. +## [unreleased] + +### Fixed + +* Fix a performance problem with HTML extraction where large HTML input could trigger quadratic line counting behavior (PR#1392). + ## [3.5] -- 2023-10-06 ### Added diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index bf70b73d..4dbb1587 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -83,6 +83,8 @@ def __init__(self, md, *args, **kwargs): # Block tags that should contain no content (self closing) self.empty_tags = set(['hr']) + self.lineno_start_cache = [0] + # This calls self.reset super().__init__(*args, **kwargs) self.md = md @@ -94,6 +96,8 @@ def reset(self): self.stack = [] # When `inraw==True`, stack contains a list of tags self._cache = [] self.cleandoc = [] + self.lineno_start_cache = [0] + super().reset() def close(self): @@ -114,15 +118,15 @@ def close(self): @property def line_offset(self) -> int: """Returns char index in `self.rawdata` for the start of the current line. """ - if self.lineno > 1 and '\n' in self.rawdata: - m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata) - if m: - return m.end() - else: # pragma: no cover - # Value of `self.lineno` must exceed total number of lines. - # Find index of beginning of last line. - return self.rawdata.rfind('\n') - return 0 + for ii in range(len(self.lineno_start_cache)-1, self.lineno-1): + last_line_start_pos = self.lineno_start_cache[ii] + lf_pos = self.rawdata.find('\n', last_line_start_pos) + if lf_pos == -1: + # No more newlines found. Use end of raw data as start of line beyond end. + lf_pos = len(self.rawdata) + self.lineno_start_cache.append(lf_pos+1) + + return self.lineno_start_cache[self.lineno-1] def at_line_start(self) -> bool: """