Skip to content

Commit

Permalink
hugolib: Integrate new page parser
Browse files Browse the repository at this point in the history
  • Loading branch information
bep committed Oct 18, 2018
1 parent 057b16b commit 7074568
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 94 deletions.
46 changes: 4 additions & 42 deletions hugolib/page.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ type Page struct {
contentv template.HTML
summary template.HTML
TableOfContents template.HTML

// Passed to the shortcodes
pageWithoutContent *PageWithoutContent

Expand All @@ -161,7 +162,6 @@ type Page struct {

extension string
contentType string
renderable bool

Layout string

Expand All @@ -171,13 +171,8 @@ type Page struct {

linkTitle string

frontmatter []byte

// rawContent is the raw content read from the content file.
rawContent []byte

// workContent is a copy of rawContent that may be mutated during site build.
workContent []byte
// Content items.
pageContent

// whether the content is in a CJK language.
isCJKLanguage bool
Expand Down Expand Up @@ -1756,39 +1751,6 @@ func (p *Page) shouldRenderTo(f output.Format) bool {
return found
}

// parse reads the full page source from reader, splits it into front
// matter and content, and applies the decoded metadata to the page.
//
// In order it:
//   - runs the source through parser.ReadFrom;
//   - records renderability, the raw front matter bytes, the raw
//     content bytes and the source language;
//   - decodes the front matter metadata (missing front matter is
//     treated as empty);
//   - attaches Git info for the page when the owning site has it;
//   - delegates to p.update to apply the metadata.
//
// Returns an error if reading or front matter decoding fails.
func (p *Page) parse(reader io.Reader) error {
	psr, err := parser.ReadFrom(reader)

	if err != nil {
		return err
	}

	p.renderable = psr.IsRenderable()
	p.frontmatter = psr.FrontMatter()
	p.rawContent = psr.Content()
	p.lang = p.Source.File.Lang()

	meta, err := psr.Metadata()
	if err != nil {
		return _errors.Wrap(err, "error in front matter")
	}
	if meta == nil {
		// missing frontmatter equivalent to empty frontmatter
		meta = map[string]interface{}{}
	}

	if p.s != nil && p.s.owner != nil {
		gi, enabled := p.s.owner.gitInfo.forPage(p)
		if gi != nil {
			p.GitInfo = gi
		} else if enabled {
			// Git info was requested but none found for this page; warn
			// rather than fail the build.
			p.s.Log.WARN.Printf("Failed to find GitInfo for page %q", p.Path())
		}
	}

	return p.update(meta)
}

// RawContent returns the page's raw, unprocessed content as a string.
func (p *Page) RawContent() string {
	raw := p.rawContent
	return string(raw)
}
Expand Down Expand Up @@ -1871,7 +1833,7 @@ func (p *Page) SaveSource() error {
// TODO(bep) lazy consolidate
func (p *Page) processShortcodes() error {
p.shortcodeState = newShortcodeHandler(p)
tmpContent, err := p.shortcodeState.extractShortcodes(p.workContent, p.withoutContent())
tmpContent, err := p.shortcodeState.extractShortcodes(p.parsed.Tokens(), p.withoutContent())
if err != nil {
return err
}
Expand Down
83 changes: 83 additions & 0 deletions hugolib/page_content.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hugolib

import (
"io"

"github.com/gohugoio/hugo/parser"
"github.com/gohugoio/hugo/parser/pageparser"
"github.com/pkg/errors"
)

// pageContent holds the content-related state of a Page: the raw and
// working byte slices plus the parsed AST of the page source. It is
// embedded in Page.
type pageContent struct {
	// renderable mirrors parser.Result.IsRenderable for this page.
	renderable bool

	// frontmatter holds the raw (not yet decoded) front matter bytes.
	frontmatter []byte

	// rawContent is the raw content read from the content file.
	rawContent []byte

	// workContent is a copy of rawContent that may be mutated during site build.
	workContent []byte

	// The AST of the parsed page. Contains information about:
	// shortcodes, front matter, summary indicators.
	parsed pageparser.Result
}

// parse reads the page source from reader, runs it through both the
// legacy parser and the new pageparser, decodes the front matter
// metadata and applies it to the page via p.update.
//
// reader must also implement io.ReadSeeker so the source can be
// rewound between the two parse passes.
func (p *Page) parse(reader io.Reader) error {
	// TODO(bep) 2errors consolidate when done
	rs, ok := reader.(io.ReadSeeker)
	if !ok {
		// Previously an unconditional type assertion, which would panic
		// for a non-seekable reader; return a proper error instead.
		return errors.New("page parse: reader must implement io.ReadSeeker")
	}

	// First pass: the legacy parser providing front matter, content and
	// renderability.
	psr, err := parser.ReadFrom(rs)
	if err != nil {
		return err
	}

	// Rewind for the second pass. The Seek error was previously
	// ignored; io.SeekStart replaces the bare 0 whence constant.
	if _, err := rs.Seek(0, io.SeekStart); err != nil {
		return errors.Wrap(err, "failed to rewind page content reader")
	}

	// Second pass: the new page parser, producing the AST used by e.g.
	// shortcode extraction.
	psr2, err := pageparser.ReadFrom(rs)
	if err != nil {
		return err
	}

	p.parsed = psr2

	p.renderable = psr.IsRenderable()
	p.frontmatter = psr.FrontMatter()
	p.rawContent = psr.Content()
	p.lang = p.Source.File.Lang()

	meta, err := psr.Metadata()
	if err != nil {
		return errors.Wrap(err, "error in front matter")
	}
	if meta == nil {
		// Missing front matter is equivalent to empty front matter.
		meta = map[string]interface{}{}
	}

	if p.s != nil && p.s.owner != nil {
		gi, enabled := p.s.owner.gitInfo.forPage(p)
		if gi != nil {
			p.GitInfo = gi
		} else if enabled {
			// Git info is enabled but missing for this page; warn rather
			// than fail the build.
			p.s.Log.WARN.Printf("Failed to find GitInfo for page %q", p.Path())
		}
	}

	return p.update(meta)
}
14 changes: 1 addition & 13 deletions hugolib/shortcode.go
Original file line number Diff line number Diff line change
Expand Up @@ -615,19 +615,7 @@ Loop:

var shortCodeStart = []byte("{{")

func (s *shortcodeHandler) extractShortcodes(input []byte, p *PageWithoutContent) (string, error) {

startIdx := bytes.Index(input, shortCodeStart)

// short cut for docs with no shortcodes
if startIdx < 0 {
return string(input), nil
}

// the parser takes a string;
// since this is an internal API, it could make sense to use the mutable []byte all the way, but
// it seems that the time isn't really spent in the byte copy operations, and the impl. gets a lot cleaner
pt := pageparser.ParseFrom(input, startIdx)
func (s *shortcodeHandler) extractShortcodes(pt *pageparser.Tokens, p *PageWithoutContent) (string, error) {

result := bp.GetBuffer()
defer bp.PutBuffer(result)
Expand Down
14 changes: 7 additions & 7 deletions hugolib/shortcode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,11 @@ func TestExtractShortcodes(t *testing.T) {
expectErrorMsg string
}{
{"text", "Some text.", "map[]", "Some text.", ""},
{"invalid right delim", "{{< tag }}", "", false, ":4:.*unrecognized character.*}"},
{"invalid close", "\n{{< /tag >}}", "", false, ":5:.*got closing shortcode, but none is open"},
{"invalid close2", "\n\n{{< tag >}}{{< /anotherTag >}}", "", false, ":6: closing tag for shortcode 'anotherTag' does not match start tag"},
{"unterminated quote 1", `{{< figure src="im caption="S" >}}`, "", false, ":4:.got pos.*"},
{"unterminated quote 1", `{{< figure src="im" caption="S >}}`, "", false, ":4:.*unterm.*}"},
{"invalid right delim", "{{< tag }}", "", false, ":8:.*unrecognized character.*}"},
{"invalid close", "\n{{< /tag >}}", "", false, ":9:.*got closing shortcode, but none is open"},
{"invalid close2", "\n\n{{< tag >}}{{< /anotherTag >}}", "", false, ":10: closing tag for shortcode 'anotherTag' does not match start tag"},
{"unterminated quote 1", `{{< figure src="im caption="S" >}}`, "", false, ":8:.got pos.*"},
{"unterminated quote 1", `{{< figure src="im" caption="S >}}`, "", false, ":8:.*unterm.*}"},
{"one shortcode, no markup", "{{< tag >}}", "", testScPlaceholderRegexp, ""},
{"one shortcode, markup", "{{% tag %}}", "", testScPlaceholderRegexp, ""},
{"one pos param", "{{% tag param1 %}}", `tag([\"param1\"], true){[]}"]`, testScPlaceholderRegexp, ""},
Expand Down Expand Up @@ -405,7 +405,7 @@ func TestExtractShortcodes(t *testing.T) {
fmt.Sprintf("Hello %sworld%s. And that's it.", testScPlaceholderRegexp, testScPlaceholderRegexp), ""},
} {

p, _ := pageFromString(simplePage, "simple.md", func(templ tpl.TemplateHandler) error {
p, _ := pageFromString(simplePage+this.input, "simple.md", func(templ tpl.TemplateHandler) error {
templ.AddTemplate("_internal/shortcodes/tag.html", `tag`)
templ.AddTemplate("_internal/shortcodes/sc1.html", `sc1`)
templ.AddTemplate("_internal/shortcodes/sc2.html", `sc2`)
Expand All @@ -424,7 +424,7 @@ func TestExtractShortcodes(t *testing.T) {
return fmt.Sprintf("HAHA%s-%dHBHB", shortcodePlaceholderPrefix, counter)
}

content, err := s.extractShortcodes([]byte(this.input), p.withoutContent())
content, err := s.extractShortcodes(p.parsed.Tokens(), p.withoutContent())

if b, ok := this.expect.(bool); ok && !b {
if err == nil {
Expand Down
2 changes: 2 additions & 0 deletions parser/pageparser/item.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ type Item struct {
Val []byte
}

// Items is a slice of lexed items.
type Items []Item

// ValStr returns the item's value as a string.
func (i Item) ValStr() string {
	return string(i.Val)
}
Expand Down
19 changes: 19 additions & 0 deletions parser/pageparser/page_tokens_getters.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pageparser

// GetFrontMatter returns the unmarshalled frontmatter data.
//
// NOTE(review): currently an unimplemented stub — it ignores items and
// always returns (nil, nil), so callers receive no front matter until
// the extraction from the parsed items is implemented.
func GetFrontMatter(items Items) (map[string]interface{}, error) {
	return nil, nil
}
44 changes: 23 additions & 21 deletions parser/pageparser/pagelexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,23 @@ type pageLexer struct {
pos pos // input position
start pos // item start position
width pos // width of last element
lastPos pos // position of the last item returned by nextItem

contentSections int

lexerShortcodeState

// items delivered to client
items []Item
items Items
}

// Tokens wraps the lexer's collected items in a *Tokens iterator,
// satisfying the Result interface.
func (l *pageLexer) Tokens() *Tokens {
	t := &Tokens{lexer: l, items: l.items}
	return t
}

// Items returns the slice of items produced by the lexer.
func (l *pageLexer) Items() Items {
	return l.items
}

// note: the input position here is normally 0 (start), but
Expand All @@ -79,6 +88,10 @@ func newPageLexer(input []byte, inputPosition pos, stateStart stateFunc) *pageLe
return lexer
}

// newTokens wraps the lexer's collected items in a *Tokens iterator.
// It delegates to Tokens so the construction is written in exactly one
// place instead of being duplicated verbatim.
func (l *pageLexer) newTokens() *Tokens {
	return l.Tokens()
}

// main loop
func (l *pageLexer) run() *pageLexer {
for l.state = l.stateStart; l.state != nil; {
Expand Down Expand Up @@ -160,25 +173,12 @@ func (l *pageLexer) ignore() {

var lf = []byte("\n")

// lineNum reports the 1-based line number of the last item returned —
// nice to have in error logs.
func (l *pageLexer) lineNum() int {
	seen := l.input[:l.lastPos]
	return 1 + bytes.Count(seen, lf)
}

// errorf appends a tError item carrying the formatted message and
// returns nil, which terminates the parser's state loop.
func (l *pageLexer) errorf(format string, args ...interface{}) stateFunc {
	msg := fmt.Sprintf(format, args...)
	l.items = append(l.items, Item{tError, l.start, []byte(msg)})
	return nil
}

// nextItem consumes and returns the next item, recording its position
// as the last position handed out.
func (l *pageLexer) nextItem() Item {
	next := l.items[0]
	l.lastPos = next.pos
	l.items = l.items[1:]
	return next
}

func (l *pageLexer) consumeCRLF() bool {
var consumed bool
for _, r := range crLf {
Expand Down Expand Up @@ -258,15 +258,16 @@ LOOP:
case r == '#':
return lexFrontMatterOrgMode
case !isSpace(r) && !isEndOfLine(r):
// No front matter.
if r == '<' {
l.emit(tHTMLLead)
// Not need to look further. Hugo treats this as plain HTML,
// no front matter, no shortcodes, no nothing.
l.pos = pos(len(l.input))
l.emit(tText)
break LOOP

}
return l.errorf("failed to detect front matter type; got unknown identifier %q", r)
break LOOP
}
}

Expand Down Expand Up @@ -366,18 +367,19 @@ LOOP:

}

// printCurrentInput dumps the unconsumed remainder of the lexer input
// to stdout — a debugging aid only.
func (l *pageLexer) printCurrentInput() {
	// Terminate with a newline so successive debug dumps don't run
	// together on one line (the original Printf omitted it).
	fmt.Printf("input[%d:]: %q\n", l.pos, string(l.input[l.pos:]))
}

// Handle YAML or TOML front matter.
func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string, delim []byte) stateFunc {

for i := 0; i < 2; i++ {
if r := l.next(); r != delimr {
return l.errorf("invalid %s delimiter", name)
}
}

if !l.consumeCRLF() {
return l.errorf("invalid %s delimiter", name)
}

// We don't care about the delimiters.
l.ignore()

Expand Down
Loading

0 comments on commit 7074568

Please sign in to comment.