From ee0ce98813daa84faf6c4dd589abac1380a58772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Wed, 17 Oct 2018 13:48:55 +0200 Subject: [PATCH] parser/pageparser: Add front matter etc. support See #5324 --- parser/pageparser/item.go | 6 ++- parser/pageparser/pagelexer.go | 68 ++++++++++++++++++---------- parser/pageparser/pageparser_test.go | 34 +++++++++++--- 3 files changed, 76 insertions(+), 32 deletions(-) diff --git a/parser/pageparser/item.go b/parser/pageparser/item.go index ae2f6cbc9c1..fe6087522c7 100644 --- a/parser/pageparser/item.go +++ b/parser/pageparser/item.go @@ -85,6 +85,9 @@ const ( tError itemType = iota tEOF + // page items + tHTMLLead // < + // shortcode items tLeftDelimScNoMarkup tRightDelimScNoMarkup @@ -95,8 +98,7 @@ const ( tScParam tScParamVal - //itemIdentifier - tText // plain text, used for everything outside the shortcodes + tText // plain text // preserved for later - keywords come after this tKeywordMarker diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go index 5267c563453..84085da6209 100644 --- a/parser/pageparser/pagelexer.go +++ b/parser/pageparser/pagelexer.go @@ -44,7 +44,6 @@ type lexerShortcodeState struct { } type pageLexer struct { - name string input string state stateFunc pos pos // input position @@ -52,6 +51,11 @@ type pageLexer struct { width pos // width of last element lastPos pos // position of the last item returned by nextItem + // Set once front matter is read OK. 
+ frontMatterRead bool + // Set when we see a non-whitespace character + noneWhiteSpaceSeen bool + lexerShortcodeState // items delivered to client @@ -63,16 +67,15 @@ func Parse(s string) *Tokens { } func ParseFrom(s string, from int) *Tokens { - lexer := newPageLexer("default", s, pos(from)) + lexer := newPageLexer(s, pos(from)) lexer.run() return &Tokens{lexer: lexer} } // note: the input position here is normally 0 (start), but // can be set if position of first shortcode is known -func newPageLexer(name, input string, inputPosition pos) *pageLexer { +func newPageLexer(input string, inputPosition pos) *pageLexer { lexer := &pageLexer{ - name: name, input: input, pos: inputPosition, lexerShortcodeState: lexerShortcodeState{ @@ -88,7 +91,7 @@ func newPageLexer(name, input string, inputPosition pos) *pageLexer { // main loop func (l *pageLexer) run() *pageLexer { - for l.state = lexTextOutsideShortcodes; l.state != nil; { + for l.state = lexMain; l.state != nil; { l.state = l.state(l) } return l @@ -178,28 +181,45 @@ func (l *pageLexer) nextItem() Item { return item } -// scans until an opening shortcode opening bracket. 
-// if no shortcodes, it will keep on scanning until EOF -func lexTextOutsideShortcodes(l *pageLexer) stateFunc { +func lexMain(l *pageLexer) stateFunc { +LOOP: for { - if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { - if l.pos > l.start { - l.emit(tText) - } - if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) { - l.currLeftDelimItem = tLeftDelimScWithMarkup - l.currRightDelimItem = tRightDelimScWithMarkup - } else { - l.currLeftDelimItem = tLeftDelimScNoMarkup - l.currRightDelimItem = tRightDelimScNoMarkup + // TODO(bep) 2errors split these in 2 somehow + if l.frontMatterRead { + l.noneWhiteSpaceSeen = true + if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) { + if l.pos > l.start { + l.emit(tText) + } + if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) { + l.currLeftDelimItem = tLeftDelimScWithMarkup + l.currRightDelimItem = tRightDelimScWithMarkup + } else { + l.currLeftDelimItem = tLeftDelimScNoMarkup + l.currRightDelimItem = tRightDelimScNoMarkup + } + return lexShortcodeLeftDelim } - return lexShortcodeLeftDelim - } - if l.next() == eof { - break + + r := l.next() + switch { + case r == eof: + break LOOP + case r == '<': + if !l.noneWhiteSpaceSeen { + l.emit(tHTMLLead) + // No need to look further. + l.pos = pos(len(l.input)) + l.emit(tText) + break LOOP + } + case !isSpace(r): + l.noneWhiteSpaceSeen = true } + } + // Done! 
if l.pos > l.start { l.emit(tText) @@ -234,14 +254,14 @@ func lexShortcodeComment(l *pageLexer) stateFunc { l.ignore() l.pos += pos(len(l.currentRightShortcodeDelim())) l.emit(tText) - return lexTextOutsideShortcodes + return lexMain } func lexShortcodeRightDelim(l *pageLexer) stateFunc { l.closingState = 0 l.pos += pos(len(l.currentRightShortcodeDelim())) l.emit(l.currentRightShortcodeDelimItem()) - return lexTextOutsideShortcodes + return lexMain } // either: diff --git a/parser/pageparser/pageparser_test.go b/parser/pageparser/pageparser_test.go index ceb439a65a7..148ecfcd5f5 100644 --- a/parser/pageparser/pageparser_test.go +++ b/parser/pageparser/pageparser_test.go @@ -17,7 +17,7 @@ import ( "testing" ) -type shortCodeLexerTest struct { +type lexerTest struct { name string input string items []Item @@ -39,7 +39,7 @@ var ( tstVal = Item{tScParamVal, 0, "Hello World"} ) -var shortCodeLexerTests = []shortCodeLexerTest{ +var shortCodeLexerTests = []lexerTest{ {"empty", "", []Item{tstEOF}}, {"spaces", " \t\n", []Item{{tText, 0, " \t\n"}, tstEOF}}, {"text", `to be or not`, []Item{{tText, 0, "to be or not"}, tstEOF}}, @@ -159,7 +159,7 @@ var shortCodeLexerTests = []shortCodeLexerTest{ func TestShortcodeLexer(t *testing.T) { t.Parallel() for i, test := range shortCodeLexerTests { - items := collect(&test) + items := collect(test.name, test.input, true) if !equal(items, test.items) { t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items) } @@ -170,7 +170,7 @@ func BenchmarkShortcodeLexer(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { for _, test := range shortCodeLexerTests { - items := collect(&test) + items := collect(test.name, test.input, true) if !equal(items, test.items) { b.Errorf("%s: got\n\t%v\nexpected\n\t%v", test.name, items, test.items) } @@ -178,8 +178,30 @@ func BenchmarkShortcodeLexer(b *testing.B) { } } -func collect(t *shortCodeLexerTest) (items []Item) { - l := newPageLexer(t.name, t.input, 0).run() +var ( + 
tstHTMLLead = Item{tHTMLLead, 0, " <"} +) + +var frontMatterTests = []lexerTest{ + {"empty", "", []Item{tstEOF}}, + {"HTML Document", ` `, []Item{tstHTMLLead, Item{tText, 0, "html>"}, tstEOF}}, +} + +func TestFrontMatter(t *testing.T) { + t.Parallel() + for i, test := range frontMatterTests { + items := collect(test.name, test.input, false) + if !equal(items, test.items) { + t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items) + } + } +} + +func collect(name, input string, skipFrontMatter bool) (items []Item) { + l := newPageLexer(input, 0) + l.frontMatterRead = skipFrontMatter + l.run() + for { item := l.nextItem() items = append(items, item)