From 70745685494b365c866363486aa01b892d62aade Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?=
Date: Thu, 18 Oct 2018 10:21:23 +0200
Subject: [PATCH] hugolib: Integrate new page parser

See #5324
---
 hugolib/page.go                            | 46 ++----------
 hugolib/page_content.go                    | 83 ++++++++++++++++++++++
 hugolib/shortcode.go                       | 14 +---
 hugolib/shortcode_test.go                  | 14 ++--
 parser/pageparser/item.go                  |  2 +
 parser/pageparser/page_tokens_getters.go   | 19 +++++
 parser/pageparser/pagelexer.go             | 44 ++++++------
 parser/pageparser/pageparser.go            | 56 +++++++++++++--
 parser/pageparser/pageparser_intro_test.go | 11 +--
 9 files changed, 195 insertions(+), 94 deletions(-)
 create mode 100644 hugolib/page_content.go
 create mode 100644 parser/pageparser/page_tokens_getters.go

diff --git a/hugolib/page.go b/hugolib/page.go
index e867dd52560..7c3cfccb02f 100644
--- a/hugolib/page.go
+++ b/hugolib/page.go
@@ -141,6 +141,7 @@ type Page struct {
 	contentv        template.HTML
 	summary         template.HTML
 	TableOfContents template.HTML
+
 	// Passed to the shortcodes
 	pageWithoutContent *PageWithoutContent
 
@@ -161,7 +162,6 @@ type Page struct {
 
 	extension   string
 	contentType string
-	renderable  bool
 
 	Layout string
 
@@ -171,13 +171,8 @@ type Page struct {
 
 	linkTitle string
 
-	frontmatter []byte
-
-	// rawContent is the raw content read from the content file.
-	rawContent []byte
-
-	// workContent is a copy of rawContent that may be mutated during site build.
-	workContent []byte
+	// Content items.
+	pageContent
 
 	// whether the content is in a CJK language.
 	isCJKLanguage bool
@@ -1756,39 +1751,6 @@ func (p *Page) shouldRenderTo(f output.Format) bool {
 	return found
 }
 
-func (p *Page) parse(reader io.Reader) error {
-	psr, err := parser.ReadFrom(reader)
-
-	if err != nil {
-		return err
-	}
-
-	p.renderable = psr.IsRenderable()
-	p.frontmatter = psr.FrontMatter()
-	p.rawContent = psr.Content()
-	p.lang = p.Source.File.Lang()
-
-	meta, err := psr.Metadata()
-	if err != nil {
-		return _errors.Wrap(err, "error in front matter")
-	}
-	if meta == nil {
-		// missing frontmatter equivalent to empty frontmatter
-		meta = map[string]interface{}{}
-	}
-
-	if p.s != nil && p.s.owner != nil {
-		gi, enabled := p.s.owner.gitInfo.forPage(p)
-		if gi != nil {
-			p.GitInfo = gi
-		} else if enabled {
-			p.s.Log.WARN.Printf("Failed to find GitInfo for page %q", p.Path())
-		}
-	}
-
-	return p.update(meta)
-}
-
 func (p *Page) RawContent() string {
 	return string(p.rawContent)
 }
@@ -1871,7 +1833,7 @@ func (p *Page) SaveSource() error {
 // TODO(bep) lazy consolidate
 func (p *Page) processShortcodes() error {
 	p.shortcodeState = newShortcodeHandler(p)
-	tmpContent, err := p.shortcodeState.extractShortcodes(p.workContent, p.withoutContent())
+	tmpContent, err := p.shortcodeState.extractShortcodes(p.parsed.Tokens(), p.withoutContent())
 	if err != nil {
 		return err
 	}
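
The page.go side of this change is mostly a field migration: renderable, frontmatter, rawContent and workContent leave Page and reappear in the embedded pageContent struct introduced below. Because the struct is embedded, every existing p.rawContent-style access keeps compiling unchanged. A minimal, self-contained sketch of that promotion mechanic (the field set is trimmed and the types are stand-ins, not Hugo's real ones):

package main

import "fmt"

type pageContent struct {
	renderable bool
	rawContent []byte
}

type Page struct {
	title       string
	pageContent // embedded: its fields are promoted onto Page
}

func main() {
	p := Page{title: "Hello"}
	p.rawContent = []byte("Some **markdown**.") // promoted field access
	p.renderable = true
	fmt.Println(p.title, string(p.rawContent), p.renderable)
}

Embedding rather than renaming the fields is what keeps the rest of this diff so small.
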
diff --git a/hugolib/page_content.go b/hugolib/page_content.go
new file mode 100644
index 00000000000..02b34c773e4
--- /dev/null
+++ b/hugolib/page_content.go
@@ -0,0 +1,83 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hugolib
+
+import (
+	"io"
+
+	"github.com/gohugoio/hugo/parser"
+	"github.com/gohugoio/hugo/parser/pageparser"
+	"github.com/pkg/errors"
+)
+
+// The content related items on a Page.
+type pageContent struct {
+	renderable bool
+
+	frontmatter []byte
+
+	// rawContent is the raw content read from the content file.
+	rawContent []byte
+
+	// workContent is a copy of rawContent that may be mutated during site build.
+	workContent []byte
+
+	// The AST of the parsed page. Contains information about:
+	// shortcodes, front matter, summary indicators.
+	parsed pageparser.Result
+}
+
+func (p *Page) parse(reader io.Reader) error {
+	// TODO(bep) 2errors consolidate when done
+	rs := reader.(io.ReadSeeker)
+
+	psr, err := parser.ReadFrom(rs)
+	if err != nil {
+		return err
+	}
+
+	rs.Seek(0, 0)
+
+	psr2, err := pageparser.ReadFrom(rs)
+	if err != nil {
+		return err
+	}
+
+	p.parsed = psr2
+
+	p.renderable = psr.IsRenderable()
+	p.frontmatter = psr.FrontMatter()
+	p.rawContent = psr.Content()
+	p.lang = p.Source.File.Lang()
+
+	meta, err := psr.Metadata()
+	if err != nil {
+		return errors.Wrap(err, "error in front matter")
+	}
+	if meta == nil {
+		// missing frontmatter equivalent to empty frontmatter
+		meta = map[string]interface{}{}
+	}
+
+	if p.s != nil && p.s.owner != nil {
+		gi, enabled := p.s.owner.gitInfo.forPage(p)
+		if gi != nil {
+			p.GitInfo = gi
+		} else if enabled {
+			p.s.Log.WARN.Printf("Failed to find GitInfo for page %q", p.Path())
+		}
+	}
+
+	return p.update(meta)
+}
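
While the integration is in progress, parse runs both parsers over the same source: the old parser.ReadFrom consumes the reader, the stream is rewound, and pageparser.ReadFrom reads it again. A stripped-down sketch of that read-rewind-read pattern, with plain ReadAll calls standing in for the two parsers (none of this is Hugo's actual API):

package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"strings"
)

func main() {
	var rs io.ReadSeeker = strings.NewReader("+++\nfoo = \"bar\"\n+++\nSome text.\n")

	// First pass: the old parser reads to EOF.
	first, _ := ioutil.ReadAll(rs)

	// Rewind before handing the same bytes to the second parser;
	// this is the rs.Seek(0, 0) in the patch (io.SeekStart == 0).
	if _, err := rs.Seek(0, io.SeekStart); err != nil {
		fmt.Println("seek failed:", err)
		return
	}

	// Second pass: the new pageparser sees the identical input.
	second, _ := ioutil.ReadAll(rs)
	fmt.Println(len(first) == len(second)) // true
}

Note that the reader.(io.ReadSeeker) assertion in the patch panics on a non-seekable reader; together with the TODO(bep) comment, that marks the double parse as temporary scaffolding to be consolidated later.
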
diff --git a/hugolib/shortcode.go b/hugolib/shortcode.go
index a21a10ad242..8fad595c1ca 100644
--- a/hugolib/shortcode.go
+++ b/hugolib/shortcode.go
@@ -615,19 +615,7 @@ Loop:
 
 var shortCodeStart = []byte("{{")
 
-func (s *shortcodeHandler) extractShortcodes(input []byte, p *PageWithoutContent) (string, error) {
-
-	startIdx := bytes.Index(input, shortCodeStart)
-
-	// short cut for docs with no shortcodes
-	if startIdx < 0 {
-		return string(input), nil
-	}
-
-	// the parser takes a string;
-	// since this is an internal API, it could make sense to use the mutable []byte all the way, but
-	// it seems that the time isn't really spent in the byte copy operations, and the impl. gets a lot cleaner
-	pt := pageparser.ParseFrom(input, startIdx)
+func (s *shortcodeHandler) extractShortcodes(pt *pageparser.Tokens, p *PageWithoutContent) (string, error) {
 
 	result := bp.GetBuffer()
 	defer bp.PutBuffer(result)
diff --git a/hugolib/shortcode_test.go b/hugolib/shortcode_test.go
index f8837810c91..5f9e2e3454a 100644
--- a/hugolib/shortcode_test.go
+++ b/hugolib/shortcode_test.go
@@ -365,11 +365,11 @@ func TestExtractShortcodes(t *testing.T) {
 		expectErrorMsg string
 	}{
 		{"text", "Some text.", "map[]", "Some text.", ""},
-		{"invalid right delim", "{{< tag }}", "", false, ":4:.*unrecognized character.*}"},
-		{"invalid close", "\n{{< /tag >}}", "", false, ":5:.*got closing shortcode, but none is open"},
-		{"invalid close2", "\n\n{{< tag >}}{{< /anotherTag >}}", "", false, ":6: closing tag for shortcode 'anotherTag' does not match start tag"},
-		{"unterminated quote 1", `{{< figure src="im caption="S" >}}`, "", false, ":4:.got pos.*"},
-		{"unterminated quote 1", `{{< figure src="im" caption="S >}}`, "", false, ":4:.*unterm.*}"},
+		{"invalid right delim", "{{< tag }}", "", false, ":8:.*unrecognized character.*}"},
+		{"invalid close", "\n{{< /tag >}}", "", false, ":9:.*got closing shortcode, but none is open"},
+		{"invalid close2", "\n\n{{< tag >}}{{< /anotherTag >}}", "", false, ":10: closing tag for shortcode 'anotherTag' does not match start tag"},
+		{"unterminated quote 1", `{{< figure src="im caption="S" >}}`, "", false, ":8:.got pos.*"},
+		{"unterminated quote 2", `{{< figure src="im" caption="S >}}`, "", false, ":8:.*unterm.*}"},
 		{"one shortcode, no markup", "{{< tag >}}", "", testScPlaceholderRegexp, ""},
 		{"one shortcode, markup", "{{% tag %}}", "", testScPlaceholderRegexp, ""},
 		{"one pos param", "{{% tag param1 %}}", `tag([\"param1\"], true){[]}"]`, testScPlaceholderRegexp, ""},
@@ -405,7 +405,7 @@ func TestExtractShortcodes(t *testing.T) {
 			fmt.Sprintf("Hello %sworld%s. And that's it.", testScPlaceholderRegexp, testScPlaceholderRegexp), ""},
 	} {
 
-		p, _ := pageFromString(simplePage, "simple.md", func(templ tpl.TemplateHandler) error {
+		p, _ := pageFromString(simplePage+this.input, "simple.md", func(templ tpl.TemplateHandler) error {
 			templ.AddTemplate("_internal/shortcodes/tag.html", `tag`)
 			templ.AddTemplate("_internal/shortcodes/sc1.html", `sc1`)
 			templ.AddTemplate("_internal/shortcodes/sc2.html", `sc2`)
@@ -424,7 +424,7 @@ func TestExtractShortcodes(t *testing.T) {
 			return fmt.Sprintf("HAHA%s-%dHBHB", shortcodePlaceholderPrefix, counter)
 		}
 
-		content, err := s.extractShortcodes([]byte(this.input), p.withoutContent())
+		content, err := s.extractShortcodes(p.parsed.Tokens(), p.withoutContent())
 
 		if b, ok := this.expect.(bool); ok && !b {
 			if err == nil {
diff --git a/parser/pageparser/item.go b/parser/pageparser/item.go
index 6e93bb696d4..dc4d9f67e84 100644
--- a/parser/pageparser/item.go
+++ b/parser/pageparser/item.go
@@ -21,6 +21,8 @@ type Item struct {
 	Val []byte
 }
 
+type Items []Item
+
 func (i Item) ValStr() string {
 	return string(i.Val)
 }
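
extractShortcodes now receives a *pageparser.Tokens and walks typed tokens instead of scanning the raw bytes for "{{". A toy version of that consumption loop — Item and Items match item.go above, but the token type constants here are illustrative stand-ins, not pageparser's real ones:

package main

import "fmt"

type itemType int

const (
	tText itemType = iota
	tShortcodeStart
	tEOF
)

type Item struct {
	typ itemType
	Val []byte
}

type Items []Item

func main() {
	// A pre-lexed stream such as extractShortcodes now receives.
	tokens := Items{
		{tText, []byte("Hello ")},
		{tShortcodeStart, []byte("{{<")},
		{tEOF, nil},
	}

	// Dispatch on token type — the rough shape of the switch that
	// replaces the old bytes.Index(input, shortCodeStart) scan.
	for _, it := range tokens {
		switch it.typ {
		case tText:
			fmt.Printf("text: %q\n", it.Val)
		case tShortcodeStart:
			fmt.Println("shortcode opens here")
		case tEOF:
			fmt.Println("end of input")
		}
	}
}
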
diff --git a/parser/pageparser/page_tokens_getters.go b/parser/pageparser/page_tokens_getters.go
new file mode 100644
index 00000000000..a08225f3894
--- /dev/null
+++ b/parser/pageparser/page_tokens_getters.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pageparser
+
+// GetFrontMatter returns the unmarshalled frontmatter data.
+func GetFrontMatter(items Items) (map[string]interface{}, error) {
+	return nil, nil
+}
diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go
index c15e977ca31..90cf3ddfc0d 100644
--- a/parser/pageparser/pagelexer.go
+++ b/parser/pageparser/pagelexer.go
@@ -50,14 +50,23 @@ type pageLexer struct {
 	pos     pos // input position
 	start   pos // item start position
 	width   pos // width of last element
-	lastPos pos // position of the last item returned by nextItem
 
 	contentSections int
 
 	lexerShortcodeState
 
 	// items delivered to client
-	items []Item
+	items Items
+}
+
+// Implement the Result interface
+func (l *pageLexer) Tokens() *Tokens {
+	return &Tokens{lexer: l, items: l.items}
+}
+
+func (l *pageLexer) Items() Items {
+	return l.items
+
 }
 
 // note: the input position here is normally 0 (start), but
@@ -79,6 +88,10 @@ func newPageLexer(input []byte, inputPosition pos, stateStart stateFunc) *pageLe
 	return lexer
 }
 
+func (l *pageLexer) newTokens() *Tokens {
+	return &Tokens{lexer: l, items: l.items}
+}
+
 // main loop
 func (l *pageLexer) run() *pageLexer {
 	for l.state = l.stateStart; l.state != nil; {
@@ -160,25 +173,12 @@ func (l *pageLexer) ignore() {
 
 var lf = []byte("\n")
 
-// nice to have in error logs
-func (l *pageLexer) lineNum() int {
-	return bytes.Count(l.input[:l.lastPos], lf) + 1
-}
-
 // nil terminates the parser
 func (l *pageLexer) errorf(format string, args ...interface{}) stateFunc {
 	l.items = append(l.items, Item{tError, l.start, []byte(fmt.Sprintf(format, args...))})
 	return nil
 }
 
-// consumes and returns the next item
-func (l *pageLexer) nextItem() Item {
-	item := l.items[0]
-	l.items = l.items[1:]
-	l.lastPos = item.pos
-	return item
-}
-
 func (l *pageLexer) consumeCRLF() bool {
 	var consumed bool
 	for _, r := range crLf {
@@ -258,15 +258,16 @@ LOOP:
 		case r == '#':
 			return lexFrontMatterOrgMode
 		case !isSpace(r) && !isEndOfLine(r):
+			// No front matter.
 			if r == '<' {
 				l.emit(tHTMLLead)
 				// Not need to look further. Hugo treats this as plain HTML,
 				// no front matter, no shortcodes, no nothing.
 				l.pos = pos(len(l.input))
 				l.emit(tText)
-				break LOOP
+
 			}
-			return l.errorf("failed to detect front matter type; got unknown identifier %q", r)
+			break LOOP
 		}
 	}
 
@@ -366,18 +367,19 @@ LOOP:
 
 }
 
+func (l *pageLexer) printCurrentInput() {
+	fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
+}
+
 // Handle YAML or TOML front matter.
 func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string, delim []byte) stateFunc {
+
 	for i := 0; i < 2; i++ {
 		if r := l.next(); r != delimr {
 			return l.errorf("invalid %s delimiter", name)
 		}
 	}
 
-	if !l.consumeCRLF() {
-		return l.errorf("invalid %s delimiter", name)
-	}
-
 	// We don't care about the delimiters.
 	l.ignore()
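
The lexer keeps the state-function design from Rob Pike's lexing talk cited in pageparser.go's header: each state function does some work and returns the next state, and returning nil (as errorf does) stops the loop in run. A minimal standalone engine in that style, with two invented states for illustration (not Hugo's actual lexer states):

package main

import "fmt"

type lexer struct {
	input string
	pos   int
	items []string
}

// Each state returns the next state; nil stops the loop.
type stateFunc func(*lexer) stateFunc

func lexIntro(l *lexer) stateFunc {
	if len(l.input) >= 3 && l.input[:3] == "---" {
		l.items = append(l.items, "yaml front matter delim")
		l.pos = 3
	}
	// With or without front matter, continue with the body.
	return lexBody
}

func lexBody(l *lexer) stateFunc {
	l.items = append(l.items, "text: "+l.input[l.pos:])
	return nil // terminate, as errorf does in the real lexer
}

func main() {
	l := &lexer{input: "---\ntitle: x\n"}
	for state := stateFunc(lexIntro); state != nil; {
		state = state(l)
	}
	fmt.Println(l.items)
}

This shape is why the hunk above can turn "unknown leading character" from an error into plain text: lexIntroSection just falls through to emitting a text item and breaking the loop instead of returning an error state.
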
diff --git a/parser/pageparser/pageparser.go b/parser/pageparser/pageparser.go
index 948c05edf28..cc5c2ddf3c3 100644
--- a/parser/pageparser/pageparser.go
+++ b/parser/pageparser/pageparser.go
@@ -17,27 +17,69 @@
 // See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
 package pageparser
 
-func Parse(input []byte) *Tokens {
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+
+	"github.com/pkg/errors"
+)
+
+type Result interface {
+	Tokens() *Tokens
+	Items() Items
+}
+
+var _ Result = (*pageLexer)(nil)
+
+func Parse(input []byte) Result {
 	return ParseFrom(input, 0)
 }
 
-func ParseFrom(input []byte, from int) *Tokens {
+// TODO(bep) consolidate (remove superfluous) and have one or two Parse* methods.
+func ReadFrom(r io.Reader) (Result, error) {
+	b, err := ioutil.ReadAll(r)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to read page content")
+	}
+	lexer := newPageLexer(b, 0, lexIntroSection)
+	lexer.run()
+	return lexer, nil
+
+}
+
+func ParseFrom(input []byte, from int) Result {
 	lexer := newPageLexer(input, pos(from), lexMainSection) // TODO(bep) 2errors
 	lexer.run()
-	return &Tokens{lexer: lexer}
+	return lexer
 }
 
 type Tokens struct {
-	lexer *pageLexer
+	lexer   *pageLexer
+	items   Items
+	lastPos pos // position of the last item returned by nextItem
+
 	token     [3]Item // 3-item look-ahead is what we currently need
 	peekCount int
 }
 
+// consumes and returns the next item
+func (l *Tokens) nextItem() Item {
+	item := l.items[0]
+	l.items = l.items[1:]
+	l.lastPos = item.pos
+	return item
+}
+
+func (t *Tokens) Content() []byte {
+	return t.lexer.input
+}
+
 func (t *Tokens) Next() Item {
 	if t.peekCount > 0 {
 		t.peekCount--
 	} else {
-		t.token[0] = t.lexer.nextItem()
+		t.token[0] = t.nextItem()
 	}
 	return t.token[t.peekCount]
 }
@@ -73,7 +115,7 @@ func (t *Tokens) Peek() Item {
 		return t.token[t.peekCount-1]
 	}
 	t.peekCount = 1
-	t.token[0] = t.lexer.nextItem()
+	t.token[0] = t.nextItem()
 	return t.token[0]
 }
@@ -91,5 +133,5 @@ func (t *Tokens) Consume(cnt int) {
 
 // LineNumber returns the current line number. Used for logging.
 func (t *Tokens) LineNumber() int {
-	return t.lexer.lineNum()
+	return bytes.Count(t.lexer.input[:t.lastPos], lf) + 1
 }
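
The look-ahead machinery that moved into Tokens works off a three-item buffer: Next pops an item, Peek fills token[0] without consuming it, and the Backup variants (not shown in this hunk) re-serve items by bumping peekCount. A compact sketch of just that buffer logic, using strings instead of Items:

package main

import "fmt"

type tokens struct {
	items     []string
	token     [3]string // look-ahead buffer, as in Tokens
	peekCount int
}

func (t *tokens) next() string {
	if t.peekCount > 0 {
		t.peekCount-- // serve a previously peeked or backed-up item
	} else {
		t.token[0] = t.items[0]
		t.items = t.items[1:]
	}
	return t.token[t.peekCount]
}

func (t *tokens) peek() string {
	if t.peekCount > 0 {
		return t.token[t.peekCount-1]
	}
	t.peekCount = 1
	t.token[0] = t.items[0]
	t.items = t.items[1:]
	return t.token[0]
}

func (t *tokens) backup() {
	t.peekCount++ // the next call to next re-serves token[0]
}

func main() {
	t := &tokens{items: []string{"a", "b", "c"}}
	fmt.Println(t.peek()) // a (not consumed)
	fmt.Println(t.next()) // a
	fmt.Println(t.next()) // b
	t.backup()
	fmt.Println(t.next()) // b again
}
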
diff --git a/parser/pageparser/pageparser_intro_test.go b/parser/pageparser/pageparser_intro_test.go
index 19e30dc9adb..fc4197682d6 100644
--- a/parser/pageparser/pageparser_intro_test.go
+++ b/parser/pageparser/pageparser_intro_test.go
@@ -33,9 +33,9 @@ func nti(tp itemType, val string) Item {
 var (
 	tstJSON                = `{ "a": { "b": "\"Hugo\"}" } }`
 	tstHTMLLead            = nti(tHTMLLead, " <")
-	tstFrontMatterTOML     = nti(tFrontMatterTOML, "foo = \"bar\"\n")
-	tstFrontMatterYAML     = nti(tFrontMatterYAML, "foo: \"bar\"\n")
-	tstFrontMatterYAMLCRLF = nti(tFrontMatterYAML, "foo: \"bar\"\r\n")
+	tstFrontMatterTOML     = nti(tFrontMatterTOML, "\nfoo = \"bar\"\n")
+	tstFrontMatterYAML     = nti(tFrontMatterYAML, "\nfoo: \"bar\"\n")
+	tstFrontMatterYAMLCRLF = nti(tFrontMatterYAML, "\r\nfoo: \"bar\"\r\n")
 	tstFrontMatterJSON     = nti(tFrontMatterJSON, tstJSON+"\r\n")
 	tstSomeText            = nti(tText, "\nSome text.\n")
 	tstSummaryDivider      = nti(tSummaryDivider, "<!--more-->")
@@ -55,7 +55,9 @@ var crLfReplacer = strings.NewReplacer("\r", "#", "\n", "$")
 
 var frontMatterTests = []lexerTest{
 	{"empty", "", []Item{tstEOF}},
 	{"HTML Document", ` <html> `, []Item{tstHTMLLead, nti(tText, "html> "), tstEOF}},
+	{"No front matter", "\nSome text.\n", []Item{tstSomeText, tstEOF}},
 	{"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []Item{tstFrontMatterYAML, tstSomeText, tstEOF}},
+	{"YAML empty front matter", "---\n---\n\nSome text.\n", []Item{nti(tFrontMatterYAML, "\n"), tstSomeText, tstEOF}},
 	// Note that we keep all bytes as they are, but we need to handle CRLF
 	{"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []Item{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}},
 	{"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstEOF}},
@@ -80,9 +82,10 @@ func TestFrontMatter(t *testing.T) {
 func collect(input []byte, skipFrontMatter bool, stateStart stateFunc) (items []Item) {
 	l := newPageLexer(input, 0, stateStart)
 	l.run()
+	t := l.newTokens()
 
 	for {
-		item := l.nextItem()
+		item := t.nextItem()
 		items = append(items, item)
 		if item.typ == tEOF || item.typ == tError {
 			break
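
With nextItem and lastPos moved onto Tokens, LineNumber no longer asks the lexer; it counts newlines in the input up to the last consumed item's byte offset. The same calculation isolated into a hypothetical free function (not part of the patch):

package main

import (
	"bytes"
	"fmt"
)

// lineNumber returns the 1-based line containing byte offset lastPos.
func lineNumber(input []byte, lastPos int) int {
	return bytes.Count(input[:lastPos], []byte("\n")) + 1
}

func main() {
	src := []byte("---\nfoo: \"bar\"\n---\nSome text.\n")
	fmt.Println(lineNumber(src, 0))  // 1: start of input
	fmt.Println(lineNumber(src, 19)) // 4: offset of the "Some text." line
}
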