Skip to content

Commit

Permalink
Delay function body parsing.
Browse files Browse the repository at this point in the history
This patch skips parsing function bodies, instead caching tokens and
attaching the token stream to the AST. After the AST is built, functions
are parsed using cached tokens. This is fairly easy to do thanks to the
recent Lexer/SourceManager refactoring.

This is a critical step toward collapsing the binding and semantic
passes together, without re-introducing a dreaded reparse phase.

A quick estimate shows that the memory cost of caching tokens is pretty small. For
funcommands.sp (the biggest plugin in the test suite), it's about 200KB.
This is again thanks to the recent refactoring. Tokens are now 24 bytes
instead of 96 bytes.

Duplicating the AST would use about 3MB since the AST is so unoptimized,
and culling that down is a big project.
  • Loading branch information
dvander committed Oct 5, 2023
1 parent 92e1831 commit 4fa6332
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 9 deletions.
84 changes: 83 additions & 1 deletion compiler/lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1221,6 +1221,14 @@ Lexer::Lexer(CompileContext& cc)
}
}

// Frees every TokenCache that is still linked into token_caches_. Caches that
// were passed to InjectCachedTokens() have already been unlinked and deleted
// there, so only caches that were never replayed are left to clean up.
Lexer::~Lexer() {
    while (!token_caches_.empty()) {
        // Unlink the head before deleting it so the list never refers to a
        // freed node.
        TokenCache* orphan = *token_caches_.begin();
        token_caches_.remove(orphan);
        delete orphan;
    }
}

void Lexer::Init(std::shared_ptr<SourceFile> sf) {
freading_ = true;
EnterFile(std::move(sf), {});
Expand Down Expand Up @@ -1342,6 +1350,9 @@ int Lexer::lex() {
return current_token()->id;
}

if (!injected_token_stream_.empty())
return LexInjectedToken();

return LexNewToken();
}

Expand Down Expand Up @@ -1390,6 +1401,22 @@ int Lexer::LexNewToken() {
return tok->id;
}

int Lexer::LexInjectedToken() {
auto tok = advance_token_ptr();
*tok = ke::PopFront(&injected_token_stream_);

if (tok->id == tMAYBE_LABEL) {
if (allow_tags_) {
tok->id = tLABEL;
[[maybe_unused]] auto tok = ke::PopFront(&injected_token_stream_);
assert(tok.id == ':');
} else {
tok->id = tSYMBOL;
}
}
return tok->id;
}

void Lexer::FillTokenPos(token_pos_t* pos) {
uint32_t offset = state_.pos - state_.start;
if (!state_.macro)
Expand Down Expand Up @@ -1789,7 +1816,9 @@ void Lexer::LexSymbol(full_token_t* tok, sp::Atom* atom) {
tok->id = tSYMBOL;

if (peek() == ':' && peek2() != ':') {
if (allow_tags_) {
if (caching_tokens_) {
tok->id = tMAYBE_LABEL;
} else if (allow_tags_) {
tok->id = tLABEL;
advance();
} else if (cc_.types()->find(atom)) {
Expand Down Expand Up @@ -2571,3 +2600,56 @@ bool Lexer::IsSameSourceFile(const token_pos_t& a, const token_pos_t& b) {
return true;
return cc_.sources()->IsSameSourceFile(a, b);
}

// Sanity check used before tokens are cached (LexFunctionBody) or replayed
// (InjectCachedTokens). Asserts that each lexer mode flag inspected below has
// the value the caching code expects, and that no injected tokens are still
// pending. These are plain assert()s, so they vanish when NDEBUG is defined.
void Lexer::AssertCleanState() {
assert(allow_keywords_);
assert(allow_substitutions_);
assert(!in_string_continuation_);
assert(allow_tags_);
// A previously injected stream must have been fully consumed.
assert(injected_token_stream_.empty());
}

// Captures the tokens of a brace-delimited body into a new TokenCache.
//
// The current token must be the opening '{'; it is stored as the first cached
// token. Lexing then continues until the matching '}' has been stored, or
// until lex() returns 0 or freading_ becomes false. The require_newdecls and
// need_semicolon settings in effect at the start are recorded in the cache.
//
// The returned cache is linked into token_caches_, so it is freed either by
// InjectCachedTokens() or by the Lexer destructor.
TokenCache* Lexer::LexFunctionBody() {
    auto* body = new TokenCache;
    body->require_newdecls = state_.require_newdecls;
    body->need_semicolon = state_.need_semicolon;

    // Cached tokens are replayed later, so the lexer must be in its expected
    // state now; otherwise the replayed stream could resolve differently.
    AssertCleanState();

    assert(current_token()->id == '{');
    body->tokens.emplace_back(std::move(*current_token()));

    // While this guard is alive, LexSymbol emits tMAYBE_LABEL instead of
    // deciding between a label and a symbol immediately.
    ke::SaveAndSet<bool> enter_caching(&caching_tokens_, true);

    // depth counts unmatched '{' tokens, starting with the one stored above.
    for (int depth = 1; depth > 0 && freading_;) {
        const int id = lex();
        if (id == 0)
            break;
        body->tokens.emplace_back(std::move(*current_token()));

        if (id == '{')
            depth++;
        else if (id == '}')
            depth--;
    }

    body->tokens.shrink_to_fit();
    token_caches_.append(body);
    return body;
}

// Takes ownership of `cache` and queues its tokens for replay: subsequent
// lex() calls return them via LexInjectedToken() until the stream is drained.
// The cache is unlinked from token_caches_ and deleted, so `cache` must not be
// used after this call.
void Lexer::InjectCachedTokens(TokenCache* cache) {
    AssertCleanState();

    token_caches_.remove(cache);

    // Hand the token storage to the lexer; whatever the injected stream held
    // before goes away together with the cache.
    injected_token_stream_.swap(cache->tokens);
    delete cache;

    // Mark the lexer as reading again so the replayed tokens get consumed.
    freading_ = true;
}
31 changes: 29 additions & 2 deletions compiler/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
// 3. This notice may not be removed or altered from any source distribution.
#pragma once

#include <amtl/am-deque.h>
#include <amtl/am-hashtable.h>
#include <amtl/am-string.h>
#include <amtl/am-vector.h>
#include <amtl/am-inlinelist.h>
#include <shared/string-pool.h>

#include "compile-options.h"
Expand Down Expand Up @@ -213,6 +215,7 @@ enum TokenKind {
tEOL, /* newline, only returned by peek_new_line() */
tNEWDECL, /* for declloc() */
tENTERED_MACRO, /* internal lexer command */
tMAYBE_LABEL, /* internal lexer command, followed by ':' */
tLAST_TOKEN_ID
};

Expand Down Expand Up @@ -275,12 +278,19 @@ static constexpr int SKIPMODE = 1; /* bit field in "#if" stack */
static constexpr int PARSEMODE = 2; /* bit field in "#if" stack */
static constexpr int HANDLED_ELSE = 4; /* bit field in "#if" stack */

// A recorded run of tokens (a function body) together with the lexer settings
// that were active when recording started. Instances are linked into
// Lexer::token_caches_ through the InlineListNode base.
struct TokenCache : public ke::InlineListNode<TokenCache> {
    // The recorded tokens, in source order, starting with the opening '{'.
    std::deque<full_token_t> tokens;

    // Snapshots of the lexer's require_newdecls / need_semicolon state. They
    // default to false so a freshly constructed cache never carries
    // indeterminate values; Lexer::LexFunctionBody() overwrites both.
    bool require_newdecls = false;
    bool need_semicolon = false;
};

class Lexer
{
friend class MacroProcessor;

public:
Lexer(CompileContext& cc);
~Lexer();

int lex();
int lex_same_line();
Expand All @@ -306,6 +316,17 @@ class Lexer
void LexDefinedKeyword();
bool HasMacro(sp::Atom* atom);

// Lexer must be at a '{' token. Lexes until it reaches a balanced '}' token,
// and returns a pointer to the cached tokens.
//
// The opening '{' token, even though already lexed, will be re-added to the
// stream. This is to avoid significantly changing parse_stmt.
TokenCache* LexFunctionBody();

// Consumes a TokenCache. The pointer is deleted after. Consumed tokens will
// be replayed by lex().
void InjectCachedTokens(TokenCache* cache);

full_token_t lex_tok() {
lex();
return *current_token();
Expand All @@ -317,7 +338,8 @@ class Lexer
const token_pos_t& pos() { return current_token()->start; }
std::string& deprecate() { return deprecate_; }
bool& allow_tags() { return allow_tags_; }
int& require_newdecls() { return state_.require_newdecls; }
bool& require_newdecls() { return state_.require_newdecls; }
bool& need_semicolon() { return state_.need_semicolon; }
bool freading() const { return freading_; }
int fcurrent() const { return state_.inpf->sources_index(); }
unsigned fline() const { return state_.fline; }
Expand All @@ -335,6 +357,7 @@ class Lexer
void HandleEof();
void HandleSkippedSection();
int LexNewToken();
int LexInjectedToken();
void LexIntoToken(full_token_t* tok);
void LexSymbolOrKeyword(full_token_t* tok);
int LexKeywordImpl(sp::Atom* atom);
Expand All @@ -356,6 +379,7 @@ class Lexer
void SkipUtf8Bom();
void PushLexerState();
bool IsSameSourceFile(const token_pos_t& a, const token_pos_t& b);
void AssertCleanState();

full_token_t* advance_token_ptr();
full_token_t* next_token();
Expand Down Expand Up @@ -470,7 +494,7 @@ class Lexer
int tokline = 0;
bool need_semicolon = false;
bool is_line_start = false;
int require_newdecls = 0;
bool require_newdecls = false;
size_t entry_preproc_if_stack_size = 0;
const unsigned char* start = nullptr;
const unsigned char* end = nullptr;
Expand All @@ -485,4 +509,7 @@ class Lexer

LexerState state_;
tr::vector<LexerState> prev_state_;
ke::InlineList<TokenCache> token_caches_;
std::deque<full_token_t> injected_token_stream_;
bool caching_tokens_ = false;
};
4 changes: 4 additions & 0 deletions compiler/parse-node.h
Original file line number Diff line number Diff line change
Expand Up @@ -1566,6 +1566,9 @@ class FunctionDecl : public Decl
Stmt* body() const { return body_; }
void set_body(Stmt* body) { body_ = body; }

// Returns the cached body tokens for this function, or nullptr if none are
// attached.
TokenCache* tokens() const { return tokens_; }
// Stores the pointer to this function's cached body tokens; pass nullptr to
// detach. Only the pointer is stored; nothing is freed here.
void set_tokens(TokenCache* tokens) { tokens_ = tokens; }

void set_name(sp::Atom* name) { name_ = name; }

// The undecorated name.
Expand Down Expand Up @@ -1631,6 +1634,7 @@ class FunctionDecl : public Decl
ke::Maybe<int> this_tag_;
sp::Atom* alias_ = nullptr;
PoolString* deprecate_ = nullptr;
TokenCache* tokens_ = nullptr;
bool analyzed_ SP_BITFIELD(1);
bool analyze_result_ SP_BITFIELD(1);
bool is_public_ SP_BITFIELD(1);
Expand Down
32 changes: 26 additions & 6 deletions compiler/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,25 @@ Parser::Parse()
add_to_end.pop_front();
}

while (!delayed_functions_.empty()) {
auto fun = ke::PopFront(&delayed_functions_);

auto tokens = fun->tokens();
fun->set_tokens(nullptr);

// Technically this is not good enough, as the lexer state could have
// changed in the middle of a function. But that's fairly complex to
// handle and pretty ridiculous as far as use cases go.
ke::SaveAndSet<bool> change_newdecls(&lexer_->require_newdecls(),
tokens->require_newdecls);
ke::SaveAndSet<bool> change_need_semicolon(&lexer_->need_semicolon(),
tokens->need_semicolon);

lexer_->InjectCachedTokens(tokens);
auto body = parse_stmt(false);
fun->set_body(BlockStmt::WrapStmt(body));
}

auto list = new StmtList(token_pos_t{}, stmts);
return new ParseTree(list);
}
Expand Down Expand Up @@ -1808,15 +1827,16 @@ Parser::parse_function(FunctionDecl* fun, int tokid, bool has_this)
break;
}

if (!lexer_->peek('{'))
if (!lexer_->match('{')) {
report(437);

Stmt* body = parse_stmt(false);
if (!body)
return false;
}

fun->set_body(BlockStmt::WrapStmt(body));
auto cache = lexer_->LexFunctionBody();
fun->set_tokens(cache);
fun->set_end_pos(lexer_->pos());
delayed_functions_.emplace_back(fun);

return true;
}

Expand Down Expand Up @@ -1987,7 +2007,7 @@ Parser::parse_methodmap_method(MethodmapDecl* map)
if (ret_type.type.ident != 0 && !is_static)
has_this = true;

ke::SaveAndSet<int> require_newdecls(&lexer_->require_newdecls(), TRUE);
ke::SaveAndSet<bool> require_newdecls(&lexer_->require_newdecls(), true);
if (!parse_function(fun, is_native ? tMETHODMAP : 0, has_this))
return nullptr;

Expand Down
1 change: 1 addition & 0 deletions compiler/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,4 +136,5 @@ class Parser
std::vector<SymbolScope*> static_scopes_;
std::shared_ptr<Lexer> lexer_;
TypeDictionary* types_ = nullptr;
std::deque<FunctionDecl*> delayed_functions_;
};

0 comments on commit 4fa6332

Please sign in to comment.