Skip to content

Commit

Permalink
Track source locations more accurately.
Browse files Browse the repository at this point in the history
This replaces the "file" member of token_pos_t with a SourceLocation. A
source location is an index plus offset into a list of text streams
representing source files and macro expansions.

In theory, SourceLocation could fully replace token_pos_t. However,
decoding a SourceLocation back to a line number is very expensive, and
we need the line number extremely frequently. So during lexing, we must
retain the line number in addition to the source location.

There are a few major benefits of this approach, versus the old "file,
line, col" tuple. First, SourceLocations are tiny. We can use them in
way more places, which makes the AST easier to annotate. Second, they
are way more accurate. We can now trace a token back to the macro that
defined it, and where that macro was defined, and what line included
that file.

But most importantly, this is a precursor to caching tokens. The
built-in lint checker needs to convert a token position back to the
source text stream, which was not easily possible in the old model.
  • Loading branch information
dvander committed Sep 30, 2023
1 parent 8ad5a00 commit 1fbb265
Show file tree
Hide file tree
Showing 11 changed files with 284 additions and 107 deletions.
2 changes: 1 addition & 1 deletion compiler/compile-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ CompileContext::CompileContext()

reports_ = std::make_unique<ReportManager>(*this);
options_ = std::make_unique<CompileOptions>();
sources_ = std::make_unique<SourceManager>(*this);
sources_ = std::make_unique<sp::SourceManager>(*this);
types_ = std::make_unique<TypeDictionary>(*this);
types_->init();
}
Expand Down
15 changes: 9 additions & 6 deletions compiler/compile-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,15 @@
class Lexer;
class ReportManager;
class SemaContext;
class SourceManager;
class SymbolScope;
class TypeDictionary;
struct CompileOptions;
struct symbol;

namespace sp {
class SourceManager;
} // namespace sp

// The thread-safe successor to scvars.
class CompileContext final
{
Expand All @@ -66,7 +69,7 @@ class CompileContext final
const std::shared_ptr<Lexer>& lexer() const { return lexer_; }
ReportManager* reports() const { return reports_.get(); }
CompileOptions* options() const { return options_.get(); }
SourceManager* sources() const { return sources_.get(); }
sp::SourceManager* sources() const { return sources_.get(); }
TypeDictionary* types() const { return types_.get(); }
sp::StringPool* atoms() { return &atoms_; }

Expand Down Expand Up @@ -102,8 +105,8 @@ class CompileContext final
std::string& outfname() { return outfname_; }
void set_outfname(const std::string& value) { outfname_ = value; }

std::shared_ptr<SourceFile> inpf_org() const { return inpf_org_; }
void set_inpf_org(std::shared_ptr<SourceFile> sf) { inpf_org_ = sf; }
std::shared_ptr<sp::SourceFile> inpf_org() const { return inpf_org_; }
void set_inpf_org(std::shared_ptr<sp::SourceFile> sf) { inpf_org_ = sf; }

bool must_abort() const { return must_abort_; }
void set_must_abort() { must_abort_ = true; }
Expand Down Expand Up @@ -135,8 +138,8 @@ class CompileContext final
std::unique_ptr<CompileOptions> options_;
std::string outfname_;
std::string errfname_;
std::unique_ptr<SourceManager> sources_;
std::shared_ptr<SourceFile> inpf_org_;
std::unique_ptr<sp::SourceManager> sources_;
std::shared_ptr<sp::SourceFile> inpf_org_;
std::unique_ptr<TypeDictionary> types_;
sp::StringPool atoms_;

Expand Down
5 changes: 4 additions & 1 deletion compiler/errors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,10 @@ MessageBuilder::~MessageBuilder()

ErrorReport report;
report.number = number_;
report.fileno = cc.sources()->GetSourceFileIndex(where_);
if (where_.valid())
report.fileno = cc.sources()->GetSourceFileIndex(where_);
else
report.fileno = 0;
report.lineno = std::max(where_.line, 1);
if (report.fileno < 0)
report.fileno = cc.lexer()->fcurrent();
Expand Down
93 changes: 59 additions & 34 deletions compiler/lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,30 +70,32 @@ static constexpr int kLitcharUtf8 = 0x1;
// Do not error, because the characters are being ignored.
static constexpr int kLitcharSkipping = 0x2;

bool
Lexer::PlungeQualifiedFile(const char* name)
{
bool Lexer::PlungeQualifiedFile(const char* name) {
auto fp = OpenFile(name);
if (!fp)
return false;

assert(!IsSkipping());
assert(skiplevel_ == ifstack_.size()); /* these two are always the same when "parsing" */

auto pos = current_token()->start;

state_.entry_preproc_if_stack_size = ifstack_.size();
PushLexerState();
EnterFile(std::move(fp));
EnterFile(std::move(fp), pos);
return true;
}

std::shared_ptr<SourceFile> Lexer::OpenFile(const std::string& name) {
AutoCountErrors detect_errors;

if (auto sf = cc_.sources()->Open(name, pos()))
if (auto sf = cc_.sources()->Open(name))
return sf;

static const std::vector<std::string> extensions = {".inc", ".p", ".pawn"};
for (const auto& extension : extensions) {
auto alt_name = name + extension;
if (auto sf = cc_.sources()->Open(alt_name, pos()))
if (auto sf = cc_.sources()->Open(alt_name))
return sf;
if (!detect_errors.ok())
return nullptr;
Expand Down Expand Up @@ -486,6 +488,7 @@ void Lexer::HandleDirectives() {
{
ke::SaveAndSet<bool> no_macros(&allow_substitutions_, false);
ke::SaveAndSet<bool> no_keywords(&allow_keywords_, false);

if (!needsymbol(&symbol))
break;
}
Expand All @@ -494,6 +497,8 @@ void Lexer::HandleDirectives() {
break;
}

auto macro_pos = current_token()->start;

ke::Maybe<tr::vector<int>> args;
if (match_char('(')) {
ke::SaveAndSet<bool> no_macros(&allow_substitutions_, false);
Expand Down Expand Up @@ -546,11 +551,12 @@ void Lexer::HandleDirectives() {
macro->pattern = symbol;
macro->documentation = std::move(deprecate_);
macro->deprecated = !macro->documentation.empty();
macro->pos = macro_pos;

tr::vector<size_t>* arg_positions = nullptr;
if (macro->args)
arg_positions = &macro->arg_positions;
macro->substitute = SkimUntilEndOfLine(arg_positions);
macro->substitute = cc_.atom(SkimUntilEndOfLine(arg_positions));

macros_[symbol] = std::move(macro);
break;
Expand Down Expand Up @@ -1230,7 +1236,7 @@ Lexer::Lexer(CompileContext& cc)

void Lexer::Init(std::shared_ptr<SourceFile> sf) {
freading_ = true;
EnterFile(std::move(sf));
EnterFile(std::move(sf), {});
}

void Lexer::Start() {
Expand Down Expand Up @@ -1326,8 +1332,7 @@ Lexer::PushSynthesizedToken(TokenKind kind, const token_pos_t& pos)
auto tok = current_token();
tok->id = kind;
tok->atom = nullptr;
tok->start.line = state_.tokline;
tok->start.file_ = state_.inpf->sources_index();
tok->start = token_pos_t(pos, state_.tokline);
lexpush();
return tok;
}
Expand Down Expand Up @@ -1365,6 +1370,9 @@ int Lexer::LexNewToken() {
FillTokenPos(&tok->start);
return tok->id = tEOL;
}

// Always fill a valid location.
FillTokenPos(&tok->start);
return 0;
}

Expand Down Expand Up @@ -1396,8 +1404,12 @@ int Lexer::LexNewToken() {
}

void Lexer::FillTokenPos(token_pos_t* pos) {
pos->line = state_.tokline;
pos->file_ = state_.inpf->sources_index();
uint32_t offset = state_.pos - state_.start;
if (!state_.macro)
*pos = token_pos_t(state_.loc_range.FilePos(offset), state_.tokline);
else
*pos = token_pos_t(state_.loc_range.MacroPos(offset), state_.tokline);
assert(pos->valid());
}

void Lexer::LexIntoToken(full_token_t* tok) {
Expand Down Expand Up @@ -1927,20 +1939,16 @@ void Lexer::NeedTokenError(int token, int got) {

// If the next token is on the current line, return that token. Otherwise,
// return tNEWLINE.
int
Lexer::peek_same_line()
{
int Lexer::peek_same_line() {
// We should not call this without having parsed at least one token.
assert(token_buffer_->num_tokens > 0);

auto sm = cc_.sources();

// If there are tokens pushed back, then |fline| is the line of the furthest
// token parsed. If fline == current token's line, we are guaranteed any
// buffered token is still on the same line.
if (token_buffer_->depth > 0 &&
current_token()->start.line == state_.fline &&
sm->IsSameSourceFile(current_token()->start, next_token()->start))
IsSameSourceFile(current_token()->start, next_token()->start))
{
return next_token()->id ? next_token()->id : tEOL;
}
Expand All @@ -1955,7 +1963,7 @@ Lexer::peek_same_line()
// If the next token starts on the line the last token ends, then the next
// token is considered on the same line.
if (next.start.line == current_token()->start.line &&
sm->IsSameSourceFile(current_token()->start, next_token()->start))
IsSameSourceFile(current_token()->start, next_token()->start))
{
return next.id;
}
Expand Down Expand Up @@ -2237,7 +2245,7 @@ void Lexer::AddMacro(const char* pattern, const char* subst) {
auto atom = cc_.atom(pattern);
auto macro = std::make_shared<MacroEntry>();
macro->pattern = atom;
macro->substitute = subst;
macro->substitute = cc_.atom(subst);
macro->deprecated = false;

macros_[atom] = std::move(macro);
Expand Down Expand Up @@ -2306,11 +2314,11 @@ Lexer::NeedSemicolon()
return state_.need_semicolon;
}

void Lexer::EnterFile(std::shared_ptr<SourceFile>&& sf) {
void Lexer::EnterFile(std::shared_ptr<SourceFile>&& sf, const token_pos_t& from) {
auto& cc = CompileContext::get();

state_.inpf = std::move(sf);
state_.inpf_loc = cc_.sources()->GetLocationRangeEntryForFile(state_.inpf);
state_.loc_range = cc_.sources()->EnterFile(state_.inpf, from);
state_.need_semicolon = cc.options()->need_semicolon;
state_.require_newdecls = cc.options()->require_newdecls;
state_.fline = 1;
Expand Down Expand Up @@ -2424,6 +2432,8 @@ bool Lexer::EnterMacro(std::shared_ptr<MacroEntry> macro) {

ke::SaveAndSet<bool> no_eof(&allow_end_of_file_, false);

auto expansion_pos = current_token()->start;

if (macros_in_use_.count(macro.get()))
return false;

Expand Down Expand Up @@ -2452,20 +2462,26 @@ bool Lexer::EnterMacro(std::shared_ptr<MacroEntry> macro) {

PushLexerState();

Atom* text = nullptr;
if (macro->args) {
state_.pattern = PerformMacroSubstitution(macro.get(), macro_args);
state_.start = reinterpret_cast<const unsigned char*>(state_.pattern.c_str());
state_.end = state_.start + state_.pattern.size();
// Atomization is important here since it keeps the macro text alive
// during lexing, since we do not pre-lex its tokens.
//
// We used to not atomize here, in which case it was stored on the
// lexer state.
text = cc_.atom(PerformMacroSubstitution(macro.get(), macro_args));
} else {
state_.start = reinterpret_cast<const unsigned char*>(macro->substitute.c_str());
state_.end = state_.start + macro->substitute.size();
text = macro->substitute;
}
state_.start = reinterpret_cast<const unsigned char*>(text->chars());
state_.end = state_.start + text->length();
state_.line_start = state_.start;
state_.pos = state_.start;
state_.macro = macro;
state_.inpf = prev_state_.back().inpf;
state_.fline = prev_state_.back().fline;
state_.tokline = prev_state_.back().tokline;
state_.loc_range = cc_.sources()->EnterMacro(macro->pos, expansion_pos, text);

// Save any tokens we peeked ahead.
state_.token_buffer = token_buffer_;
Expand Down Expand Up @@ -2533,29 +2549,38 @@ std::string Lexer::PerformMacroSubstitution(MacroEntry* macro,
std::string out;

size_t last_start = 0;
const auto& substitute = macro->substitute->str();
for (const auto& pos : macro->arg_positions) {
assert(pos >= last_start);
assert(macro->substitute[pos] == '%');
assert(IsDigit(macro->substitute[pos + 1]));
assert(substitute[pos] == '%');
assert(IsDigit(substitute[pos + 1]));

out += macro->substitute.substr(last_start, pos - last_start);
out += substitute.substr(last_start, pos - last_start);
last_start = pos + 2;

char arg_pos = macro->substitute[pos + 1] - '0';
char arg_pos = substitute[pos + 1] - '0';
auto iter = args.find(arg_pos);
if (iter == args.end()) {
out.push_back(macro->substitute[pos]);
out.push_back(macro->substitute[pos + 1]);
out.push_back(substitute[pos]);
out.push_back(substitute[pos + 1]);
continue;
}
out += iter->second;
}

out += macro->substitute.substr(last_start);
out += substitute.substr(last_start);
return out;
}

// Steps the lexer's read position past a UTF-8 byte order mark (the byte
// sequence EF BB BF) when the next three bytes match it; otherwise the
// position is left untouched.
void Lexer::SkipUtf8Bom() {
    const unsigned char* cursor = state_.pos;

    // Bytes are compared left to right and the check stops at the first
    // mismatch, so later bytes are only read when the earlier ones matched.
    if (cursor[0] != 0xef || cursor[1] != 0xbb || cursor[2] != 0xbf)
        return;

    state_.pos += 3;
}

// Returns whether token positions |a| and |b| are in the same source file.
//
// peek_same_line is extremely hot and nearly always asks about the most
// recent location, so the current lexer state's location range is consulted
// first. Only when that range does not own both positions is the question
// forwarded to the source manager.
bool Lexer::IsSameSourceFile(const token_pos_t& a, const token_pos_t& b) {
    const bool both_in_current_range =
        state_.loc_range.owns(a) && state_.loc_range.owns(b);
    if (!both_in_current_range)
        return cc_.sources()->IsSameSourceFile(a, b);
    return true;
}
19 changes: 10 additions & 9 deletions compiler/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,10 @@ class Lexer
void lexpush();
void lexclr(int clreol);

void Init(std::shared_ptr<SourceFile> sf);
void Init(std::shared_ptr<sp::SourceFile> sf);
void Start();
bool PlungeFile(const char* name, int try_currentpath, int try_includepaths);
std::shared_ptr<SourceFile> OpenFile(const std::string& name);
std::shared_ptr<sp::SourceFile> OpenFile(const std::string& name);
bool NeedSemicolon();
void AddMacro(const char* pattern, const char* subst);
void LexStringContinuation();
Expand All @@ -323,7 +323,7 @@ class Lexer
bool freading() const { return freading_; }
int fcurrent() const { return state_.inpf->sources_index(); }
unsigned fline() const { return state_.fline; }
SourceFile* inpf() const { return state_.inpf.get(); }
sp::SourceFile* inpf() const { return state_.inpf.get(); }

unsigned char const* char_stream() const { return state_.pos; }
unsigned char const* line_start() const { return state_.line_start; }
Expand All @@ -348,7 +348,7 @@ class Lexer
full_token_t* PushSynthesizedToken(TokenKind kind, const token_pos_t& pos);
void SynthesizeIncludePathToken();
void SetFileDefines(std::string file);
void EnterFile(std::shared_ptr<SourceFile>&& fp);
void EnterFile(std::shared_ptr<sp::SourceFile>&& fp, const token_pos_t& from);
void FillTokenPos(token_pos_t* pos);
void SkipLineWhitespace();
std::string SkimUntilEndOfLine(tr::vector<size_t>* macro_args = nullptr);
Expand All @@ -357,6 +357,7 @@ class Lexer
void NeedTokenError(int expected, int got);
void SkipUtf8Bom();
void PushLexerState();
bool IsSameSourceFile(const token_pos_t& a, const token_pos_t& b);

full_token_t* advance_token_ptr();
full_token_t* next_token();
Expand Down Expand Up @@ -416,11 +417,12 @@ class Lexer

private:
struct MacroEntry {
sp::Atom* pattern;
sp::Atom* pattern = nullptr;
ke::Maybe<tr::vector<int>> args;
tr::vector<size_t> arg_positions;
std::string substitute;
sp::Atom* substitute = nullptr;
std::string documentation;
token_pos_t pos;
bool deprecated;
};
std::shared_ptr<MacroEntry> FindMacro(sp::Atom* atom);
Expand Down Expand Up @@ -464,8 +466,8 @@ class Lexer
void operator =(const LexerState &) = delete;
LexerState& operator =(LexerState&&) = default;

std::shared_ptr<SourceFile> inpf;
LREntry inpf_loc;
std::shared_ptr<sp::SourceFile> inpf;
sp::LocationRange loc_range;
// Visual line in the file.
int fline = 0;
// Line # for token processing.
Expand All @@ -483,7 +485,6 @@ class Lexer

// Macro specific.
std::shared_ptr<MacroEntry> macro;
std::string pattern;
};

LexerState state_;
Expand Down
Loading

0 comments on commit 1fbb265

Please sign in to comment.