From ee0ce98813daa84faf6c4dd589abac1380a58772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Wed, 17 Oct 2018 13:48:55 +0200 Subject: [PATCH] parser/pageparser: Add front matter etc. support See #5324 --- parser/pageparser/item.go | 6 ++- parser/pageparser/pagelexer.go | 68 ++++++++++++++++++---------- parser/pageparser/pageparser_test.go | 34 +++++++++++--- 3 files changed, 76 insertions(+), 32 deletions(-) diff --git a/parser/pageparser/item.go b/parser/pageparser/item.go index ae2f6cbc9c1..fe6087522c7 100644 --- a/parser/pageparser/item.go +++ b/parser/pageparser/item.go @@ -85,6 +85,9 @@ const ( tError itemType = iota tEOF + // page items + tHTMLLead // < + // shortcode items tLeftDelimScNoMarkup tRightDelimScNoMarkup @@ -95,8 +98,7 @@ const ( tScParam tScParamVal - //itemIdentifier - tText // plain text, used for everything outside the shortcodes + tText // plain text // preserved for later - keywords come after this tKeywordMarker diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go index 5267c563453..84085da6209 100644 --- a/parser/pageparser/pagelexer.go +++ b/parser/pageparser/pagelexer.go @@ -44,7 +44,6 @@ type lexerShortcodeState struct { } type pageLexer struct { - name string input string state stateFunc pos pos // input position @@ -52,6 +51,11 @@ type pageLexer struct { width pos // width of last element lastPos pos // position of the last item returned by nextItem + // Set once front matter is read OK. 
+ frontMatterRead bool + // Set when we see a non-whitespace character + noneWhiteSpaceSeen bool + lexerShortcodeState // items delivered to client @@ -63,16 +67,15 @@ func Parse(s string) *Tokens { } func ParseFrom(s string, from int) *Tokens { - lexer := newPageLexer("default", s, pos(from)) + lexer := newPageLexer(s, pos(from)) lexer.run() return &Tokens{lexer: lexer} } // note: the input position here is normally 0 (start), but // can be set if position of first shortcode is known -func newPageLexer(name, input string, inputPosition pos) *pageLexer { +func newPageLexer(input string, inputPosition pos) *pageLexer { lexer := &pageLexer{ - name: name, input: input, pos: inputPosition, lexerShortcodeState: lexerShortcodeState{ @@ -88,7 +91,7 @@ func newPageLexer(name, input string, inputPosition pos) *pageLexer { // main loop func (l *pageLexer) run() *pageLexer { - for l.state = lexTextOutsideShortcodes; l.state != nil; { + for l.state = lexMain; l.state != nil; { l.state = l.state(l) } return l @@ -178,28 +181,45 @@ func (l *pageLexer) nextItem() Item { return item } -// scans until an opening shortcode opening bracket. 
-// if no shortcodes, it will keep on scanning until EOF -func lexTextOutsideShortcodes(l *pageLexer) stateFunc { +func lexMain(l *pageLexer) stateFunc { +LOOP: for { - if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { - if l.pos > l.start { - l.emit(tText) - } - if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) { - l.currLeftDelimItem = tLeftDelimScWithMarkup - l.currRightDelimItem = tRightDelimScWithMarkup - } else { - l.currLeftDelimItem = tLeftDelimScNoMarkup - l.currRightDelimItem = tRightDelimScNoMarkup + // TODO(bep) 2errors split these in 2 somehow + if l.frontMatterRead { + l.noneWhiteSpaceSeen = true + if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { + if l.pos > l.start { + l.emit(tText) + } + if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) { + l.currLeftDelimItem = tLeftDelimScWithMarkup + l.currRightDelimItem = tRightDelimScWithMarkup + } else { + l.currLeftDelimItem = tLeftDelimScNoMarkup + l.currRightDelimItem = tRightDelimScNoMarkup + } + return lexShortcodeLeftDelim } - return lexShortcodeLeftDelim - } - if l.next() == eof { - break + + r := l.next() + switch { + case r == eof: + break LOOP + case r == '<': + if !l.noneWhiteSpaceSeen { + l.emit(tHTMLLead) + // No need to look further. + l.pos = pos(len(l.input)) + l.emit(tText) + break LOOP + } + case !isSpace(r): + l.noneWhiteSpaceSeen = true } + } + // Done! 
if l.pos > l.start { l.emit(tText) @@ -234,14 +254,14 @@ func lexShortcodeComment(l *pageLexer) stateFunc { l.ignore() l.pos += pos(len(l.currentRightShortcodeDelim())) l.emit(tText) - return lexTextOutsideShortcodes + return lexMain } func lexShortcodeRightDelim(l *pageLexer) stateFunc { l.closingState = 0 l.pos += pos(len(l.currentRightShortcodeDelim())) l.emit(l.currentRightShortcodeDelimItem()) - return lexTextOutsideShortcodes + return lexMain } // either: diff --git a/parser/pageparser/pageparser_test.go b/parser/pageparser/pageparser_test.go index ceb439a65a7..148ecfcd5f5 100644 --- a/parser/pageparser/pageparser_test.go +++ b/parser/pageparser/pageparser_test.go @@ -17,7 +17,7 @@ import ( "testing" ) -type shortCodeLexerTest struct { +type lexerTest struct { name string input string items []Item @@ -39,7 +39,7 @@ var ( tstVal = Item{tScParamVal, 0, "Hello World"} ) -var shortCodeLexerTests = []shortCodeLexerTest{ +var shortCodeLexerTests = []lexerTest{ {"empty", "", []Item{tstEOF}}, {"spaces", " \t\n", []Item{{tText, 0, " \t\n"}, tstEOF}}, {"text", `to be or not`, []Item{{tText, 0, "to be or not"}, tstEOF}}, @@ -159,7 +159,7 @@ var shortCodeLexerTests = []shortCodeLexerTest{ func TestShortcodeLexer(t *testing.T) { t.Parallel() for i, test := range shortCodeLexerTests { - items := collect(&test) + items := collect(test.name, test.input, true) if !equal(items, test.items) { t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items) } @@ -170,7 +170,7 @@ func BenchmarkShortcodeLexer(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { for _, test := range shortCodeLexerTests { - items := collect(&test) + items := collect(test.name, test.input, true) if !equal(items, test.items) { b.Errorf("%s: got\n\t%v\nexpected\n\t%v", test.name, items, test.items) } @@ -178,8 +178,30 @@ func BenchmarkShortcodeLexer(b *testing.B) { } } -func collect(t *shortCodeLexerTest) (items []Item) { - l := newPageLexer(t.name, t.input, 0).run() +var ( + 
tstHTMLLead = Item{tHTMLLead, 0, " <"} +) + +var frontMatterTests = []lexerTest{ + {"empty", "", []Item{tstEOF}}, + {"HTML Document", ` `, []Item{tstHTMLLead, Item{tText, 0, "html>"}, tstEOF}}, +} + +func TestFrontMatter(t *testing.T) { + t.Parallel() + for i, test := range frontMatterTests { + items := collect(test.name, test.input, false) + if !equal(items, test.items) { + t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items) + } + } +} + +func collect(name, input string, skipFrontMatter bool) (items []Item) { + l := newPageLexer(input, 0) + l.frontMatterRead = skipFrontMatter + l.run() + for { item := l.nextItem() items = append(items, item)