Skip to content

Commit

Permalink
Fix slow HTML elements collector for the pre case
Browse files Browse the repository at this point in the history
```
name                           old time/op    new time/op    delta
ElementsCollectorWriterPre-10    25.2µs ± 1%     3.4µs ± 0%  -86.54%  (p=0.029 n=4+4)

name                           old alloc/op   new alloc/op   delta
ElementsCollectorWriterPre-10      624B ± 0%      142B ± 0%  -77.18%  (p=0.029 n=4+4)

name                           old allocs/op  new allocs/op  delta
ElementsCollectorWriterPre-10      16.0 ± 0%       6.0 ± 0%  -62.50%  (p=0.029 n=4+4)
```

Fixes #10698
  • Loading branch information
bep committed Feb 5, 2023
1 parent 4f4a1c0 commit f9fc0e0
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 8 deletions.
73 changes: 65 additions & 8 deletions publisher/htmlElementsCollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ var (

skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)
endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)

exceptionList = map[string]bool{
"thead": true,
Expand Down Expand Up @@ -312,11 +311,7 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc
if w.r != '>' {
return false
}
m := endTagRe.FindSubmatch(w.buff.Bytes())
if m == nil {
return false
}
return bytes.EqualFold(m[1], tagNameCopy)
return isClosedByTag(w.buff.Bytes(), tagNameCopy)
},
htmlLexStart,
))
Expand Down Expand Up @@ -428,8 +423,9 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
}

// Variants of s
// <body class="b a">
// <div>
//
// <body class="b a">
// <div>
func parseStartTag(s string) string {
spaceIndex := strings.IndexFunc(s, func(r rune) bool {
return unicode.IsSpace(r)
Expand All @@ -441,3 +437,64 @@ func parseStartTag(s string) string {

return s[1:spaceIndex]
}

// isClosedByTag reports whether b ends with a closing tag for tagName,
// for example "</pre>", "< / pre >" or "</PRE>".
//
// b is scanned backwards from its trailing '>' until the nearest '<', so only
// the bytes of the last tag are visited. Whitespace (see isSpace) may
// surround the slash and the name. The name is the first word following the
// slash and is compared with tagName case-insensitively; any further words
// before the '>' are ignored.
//
// It returns false for anything that is not shaped like a closing tag: empty
// input, no trailing '>', no '/', more than one '/', a missing name, or
// non-space bytes between '<' and '/' (as in "<a/b>").
func isClosedByTag(b, tagName []byte) bool {
	if len(b) == 0 || b[len(b)-1] != '>' {
		return false
	}

	var (
		// b[lo:hi] is the most recently scanned word, i.e. the candidate name.
		lo, hi int

		seenSlash bool // the '/' of the closing tag has been passed
		seenOpen  bool // the '<' that starts the tag has been reached
		inWord    bool // the scan is currently inside a run of non-space bytes
	)

LOOP:
	for i := len(b) - 2; i >= 0; i-- {
		c := b[i]
		switch {
		case c == '<':
			if !seenSlash {
				// "<name>" is an opening tag, not a closing one.
				return false
			}
			seenOpen = true
			break LOOP
		case c == '/':
			if seenSlash {
				return false
			}
			seenSlash = true
			if inWord {
				lo = i + 1
				inWord = false
			}
		case isSpace(c):
			if inWord {
				lo = i + 1
				inWord = false
			}
		default:
			if seenSlash {
				// Only whitespace may sit between '<' and '/'. Without this
				// check a word here would move hi below lo and the slice
				// expression at the end would panic.
				return false
			}
			if !inWord {
				hi = i + 1
				inWord = true
			}
		}
	}

	// lo >= hi means no name was found between '/' and '>'.
	if !seenOpen || lo >= hi {
		return false
	}

	return bytes.EqualFold(tagName, b[lo:hi])
}

// isSpace reports whether b is an ASCII whitespace byte: space, tab, line
// feed, form feed or carriage return. This is the same set that \s matches in
// Go regular expressions, so tags broken across CRLF line endings are treated
// like those broken across LF.
func isSpace(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\f', '\r':
		return true
	}
	return false
}
28 changes: 28 additions & 0 deletions publisher/htmlElementsCollector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,34 @@ func TestClassCollector(t *testing.T) {

}

// TestEndsWithTag runs isClosedByTag over a table of inputs and checks
// whether each is reported as ending with a closing tag for tagName.
func TestEndsWithTag(t *testing.T) {
	c := qt.New(t)

	for _, test := range []struct {
		name    string
		s       string
		tagName string
		expect  bool
	}{
		{"empty", "", "div", false},
		{"no match", "foo", "div", false},
		{"no close", "foo<div>", "div", false},
		{"no close 2", "foo/div>", "div", false},
		// Subtest names are unique so a failure identifies exactly one row.
		{"no close 3", "foo//div>", "div", false},
		{"no tag", "foo</>", "div", false},
		{"match", "foo</div>", "div", true},
		{"match space", "foo< / div>", "div", true},
		{"match space 2", "foo< / div \n>", "div", true},
		{"match case", "foo</DIV>", "div", true},
	} {
		c.Run(test.name, func(c *qt.C) {
			got := isClosedByTag([]byte(test.s), []byte(test.tagName))
			c.Assert(got, qt.Equals, test.expect)
		})
	}
}

func BenchmarkElementsCollectorWriter(b *testing.B) {
const benchHTML = `
<!DOCTYPE html>
Expand Down

0 comments on commit f9fc0e0

Please sign in to comment.