diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index 1823a832776..d9479aafaa5 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -64,7 +64,7 @@ type cssClassCollectorWriter struct { buff bytes.Buffer isCollecting bool - dropValue bool + inPreTag string inQuote bool quoteValue byte @@ -90,49 +90,58 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { b := p[i] w.toggleIfQuote(b) if !w.inQuote && b == '>' { - w.endCollecting(false) + w.endCollecting() break } w.buff.WriteByte(b) } if !w.isCollecting { - if w.dropValue { - w.buff.Reset() - } else { - // First check if we have processed this element before. - w.collector.mu.RLock() - - // See https://github.com/dominikh/go-tools/issues/723 - //lint:ignore S1030 This construct avoids memory allocation for the string. - seen := w.collector.elementSet[string(w.buff.Bytes())] - w.collector.mu.RUnlock() - if seen { - w.buff.Reset() - continue + if w.inPreTag != "" { + s := w.buff.String() + if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName { + w.inPreTag = "" } + w.buff.Reset() + continue + } - s := w.buff.String() + // First check if we have processed this element before. + w.collector.mu.RLock() + // See https://github.com/dominikh/go-tools/issues/723 + //lint:ignore S1030 This construct avoids memory allocation for the string. 
+ seen := w.collector.elementSet[string(w.buff.Bytes())] + w.collector.mu.RUnlock() + if seen { w.buff.Reset() + continue + } - if strings.HasPrefix(s, "") { - continue - } + s := w.buff.String() - key := s + w.buff.Reset() - s, tagName := w.insertStandinHTMLElement(s) - el := parseHTMLElement(s) - el.Tag = tagName + if strings.HasPrefix(s, "") { + continue + } - w.collector.mu.Lock() - w.collector.elementSet[key] = true - if el.Tag != "" { - w.collector.elements = append(w.collector.elements, el) - } - w.collector.mu.Unlock() + key := s + + s, tagName := w.insertStandinHTMLElement(s) + el := parseHTMLElement(s) + el.Tag = tagName + if w.isPreFormatted(tagName) { + w.inPreTag = tagName } + + w.collector.mu.Lock() + w.collector.elementSet[key] = true + if el.Tag != "" { + w.collector.elements = append(w.collector.elements, el) + } + w.collector.mu.Unlock() + } } } @@ -140,6 +149,11 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { return } +// isPreFormatted reports whether s is one of the tags (pre, textarea, script) that there is no need to look inside for HTML elements. +func (w *cssClassCollectorWriter) isPreFormatted(s string) bool { + return s == "pre" || s == "textarea" || s == "script" +} + // The net/html parser does not handle single table elements as input, e.g. tbody. // We only care about the element/class/ids, so just store away the original tag name // and pretend it's a
foobar`, f("div pre", "foo preclass", "")}, + {"Textare tags content should be skipped", ``, f("div textarea", "foo textareaclass", "")}, } { c.Run(test.name, func(c *qt.C) { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())