From 530852ae31e6855f83eb605782e8a75531381ba2 Mon Sep 17 00:00:00 2001 From: Paul Smith Date: Thu, 10 Oct 2024 21:17:11 -0500 Subject: [PATCH] parser: Handle self-closing elements and void elements (#127) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `selfClosing` field to `nodeElement` struct to track self-closing tags - Update `match` function to accept multiple token types - Check for self-closing tags in `parseElement` and set `selfClosing` field - Return early for void elements in `parseElement` - Refactor `parseChildren` to handle both start and self-closing tags - Add `isVoidElement` function to check if a tag is a void element - Update tests to cover self-closing and void elements 🌟 These changes improve the HTML parser to properly handle self-closing elements and void elements according to the HTML spec. Fixes #126 --- ast.go | 1 + parser.go | 62 ++++++++++++++++++++++++++++++++++++-------------- parser_test.go | 23 ++++++++++++++++++- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/ast.go b/ast.go index 05fe69b..d2cf661 100644 --- a/ast.go +++ b/ast.go @@ -177,6 +177,7 @@ type nodeElement struct { startTagNodes []node children []node pos span + selfClosing bool } func (e nodeElement) Pos() span { return e.pos } diff --git a/parser.go b/parser.go index 1a34d2d..0793dab 100644 --- a/parser.go +++ b/parser.go @@ -420,23 +420,35 @@ func newTag(tagname []byte, attrs []*attr) tag { return tag{name: string(tagname), attrs: attrs} } -func (p *htmlParser) match(typ html.TokenType) bool { - return p.toktyp == typ +func (p *htmlParser) match(types ...html.TokenType) bool { + for _, typ := range types { + if p.toktyp == typ { + return true + } + } + return false } func (p *htmlParser) parseElement() node { var result *nodeElement - // FIXME(paulsmith): handle self-closing elements - if !p.match(html.StartTagToken) { - p.errorf("expected an HTML element start tag, got %s", p.toktyp) + if 
!p.match(html.StartTagToken, html.SelfClosingTagToken) { + p.errorf("expected an HTML element start or self-closing tag, got %s", p.toktyp) } result = new(nodeElement) result.tag = newTag(p.tagname, p.attrs) result.pos.start = p.parser.offset - len(p.raw) result.pos.end = p.parser.offset + if p.match(html.SelfClosingTagToken) { + result.selfClosing = true + } result.startTagNodes = p.parseStartTag() + + if isVoidElement(result.tag.name) { + return result + } + p.advance() result.children = p.parseChildren() @@ -462,31 +474,28 @@ func (p *htmlParser) parseChildren() []node { var elemStack []*nodeElement loop: for { - switch p.toktyp { + switch tt := p.toktyp; tt { case html.ErrorToken: if p.err == io.EOF { break loop } else { p.errorf("HTML tokenizer: %w", p.err) } - case html.SelfClosingTagToken: - elem := new(nodeElement) - elem.tag = newTag(p.tagname, p.attrs) - elem.pos.start = p.parser.offset - len(p.raw) - elem.pos.end = p.parser.offset - elem.startTagNodes = p.parseStartTag() - p.advance() - result = append(result, elem) - case html.StartTagToken: + case html.StartTagToken, html.SelfClosingTagToken: elem := new(nodeElement) elem.tag = newTag(p.tagname, p.attrs) elem.pos.start = p.parser.offset - len(p.raw) elem.pos.end = p.parser.offset elem.startTagNodes = p.parseStartTag() + if tt == html.SelfClosingTagToken { + elem.selfClosing = true + } p.advance() - elem.children = p.parseChildren() + if !isVoidElement(elem.tag.name) { + elem.children = p.parseChildren() + elemStack = append(elemStack, elem) + } result = append(result, elem) - elemStack = append(elemStack, elem) case html.EndTagToken: if len(elemStack) == 0 { return result @@ -534,6 +543,25 @@ loop: return result } +func isVoidElement(tagname string) bool { + // Per spec: https://html.spec.whatwg.org/multipage/syntax.html#void-elements + voidElements := map[string]bool{"area": true, + "base": true, + "br": true, + "col": true, + "embed": true, + "hr": true, + "img": true, + "input": true, + "link": 
true, + "meta": true, + "source": true, + "track": true, + "wbr": true, + } + return voidElements[tagname] +} + type Optional[T any] struct { value *T } diff --git a/parser_test.go b/parser_test.go index 72aec20..1d19c5d 100644 --- a/parser_test.go +++ b/parser_test.go @@ -405,6 +405,27 @@ func TestParse(t *testing.T) { }, }, }, + { + `^if true { <br/> }`, + &syntaxTree{ + nodes: []node{ + &nodeIf{ + cond: &nodeGoStrExpr{expr: "true", pos: span{start: 4, end: 8}}, + then: &nodeBlock{ + nodes: []node{ + &nodeLiteral{str: " ", pos: span{10, 11}}, + &nodeElement{ + tag: tag{name: "br"}, + startTagNodes: []node{&nodeLiteral{str: "<br/>", pos: span{start: 11, end: 16}}}, + pos: span{11, 16}, + selfClosing: true, + }, + }, + }, + }, + }, + }, + }, } opts := cmp.AllowUnexported(unexported...) for _, test := range tests { @@ -450,7 +471,7 @@ func TestParseSyntaxErrors(t *testing.T) { { `^if true { -}`, 2, 13, +}`, 3, 2, }, // FIXME(paulsmith): add more syntax errors }