Skip to content

Commit

Permalink
Fix quadratic behavior with inline HTML
Browse files Browse the repository at this point in the history
Repeated starting sequences like `<?`, `<!DECL ` or `<![CDATA[` could
lead to quadratic behavior if no matching ending sequence was found.
Separate the inline HTML scanners. Remember if scanning the whole input
for a specific ending sequence failed and skip subsequent scans.

The basic idea is to remove suffixes `>`, `?>` and `]]>` from the
respective regexes. Since these regexes are already constructed to match
lazily, they will stop before an ending sequence. To check whether an
ending sequence was found, we can simply test whether the input buffer
is large enough to hold the match plus a potential suffix. If the regex
doesn't find the ending sequence, it will match so many characters that
this test is guaranteed to fail. In this case, we set a flag to avoid
further attempts to execute the regex.

To check which inline HTML regex to use, we inspect the start of the
text buffer. This allows some fixed characters to be removed from the
start of some regexes. `matchlen` is adjusted with a single addition
that accounts for both the relevant prefix and suffix.

Fixes commonmark#299.
  • Loading branch information
nwellnhof authored and kevinbackhouse committed Oct 12, 2022
1 parent 9d57d8a commit 8d4b76b
Show file tree
Hide file tree
Showing 4 changed files with 14,024 additions and 10,378 deletions.
53 changes: 52 additions & 1 deletion src/inlines.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,14 @@ typedef struct bracket {
bool in_bracket_image1;
} bracket;

// Bit flags stored in subject.flags. Each flag is set when a scan for the
// corresponding inline HTML construct matched but left no room in the input
// for its closing sequence; once set, later scans of that kind are skipped.
#define FLAG_SKIP_HTML_CDATA (1u << 0)
#define FLAG_SKIP_HTML_DECLARATION (1u << 1)
#define FLAG_SKIP_HTML_PI (1u << 2)

typedef struct subject{
cmark_mem *mem;
cmark_chunk input;
unsigned flags;
int line;
bufsize_t pos;
int block_offset;
Expand Down Expand Up @@ -163,6 +168,7 @@ static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset,
int i;
e->mem = mem;
e->input = *chunk;
e->flags = 0;
e->line = line_number;
e->pos = 0;
e->block_offset = block_offset;
Expand Down Expand Up @@ -899,7 +905,52 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
}

// finally, try to match an html tag
matchlen = scan_html_tag(&subj->input, subj->pos);
if (subj->pos + 2 <= subj->input.len) {
int c = subj->input.data[subj->pos];
if (c == '!') {
c = subj->input.data[subj->pos+1];
if (c == '-') {
matchlen = scan_html_comment(&subj->input, subj->pos + 2);
if (matchlen > 0)
matchlen += 2; // prefix "!-"
} else if (c == '[') {
if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) {
matchlen = scan_html_cdata(&subj->input, subj->pos + 2);
if (matchlen > 0) {
// The regex doesn't require the final "]]>". But if we're not at
// the end of input, it must come after the match. Otherwise,
// disable subsequent scans to avoid quadratic behavior.
matchlen += 5; // prefix "![", suffix "]]>"
if (subj->pos + matchlen > subj->input.len) {
subj->flags |= FLAG_SKIP_HTML_CDATA;
matchlen = 0;
}
}
}
} else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) {
matchlen = scan_html_declaration(&subj->input, subj->pos + 1);
if (matchlen > 0) {
matchlen += 2; // prefix "!", suffix ">"
if (subj->pos + matchlen > subj->input.len) {
subj->flags |= FLAG_SKIP_HTML_DECLARATION;
matchlen = 0;
}
}
}
} else if (c == '?') {
if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) {
// Note that we allow an empty match.
matchlen = scan_html_pi(&subj->input, subj->pos + 1);
matchlen += 3; // prefix "?", suffix "?>"
if (subj->pos + matchlen > subj->input.len) {
subj->flags |= FLAG_SKIP_HTML_PI;
matchlen = 0;
}
}
} else {
matchlen = scan_html_tag(&subj->input, subj->pos);
}
}
if (matchlen > 0) {
contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
subj->pos += matchlen;
Expand Down
Loading

0 comments on commit 8d4b76b

Please sign in to comment.