Skip to content

Commit

Permalink
clean up tokenizer options and defaults
Browse files Browse the repository at this point in the history
- Disallow multiline strings.
- Disallow a letter immediately following a number without intervening whitespace.
- Replace distinct bool option flags with a single options int.

PiperOrigin-RevId: 467829817
  • Loading branch information
ericsalo authored and copybara-github committed Aug 16, 2022
1 parent 6861966 commit 922a858
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 56 deletions.
67 changes: 24 additions & 43 deletions upb/io/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,30 +213,9 @@ struct upb_Tokenizer {
upb_String* record_target;
int record_start;

// Options.
bool allow_f_after_float;
bool allow_multiline_strings;
bool require_space_after_number;
bool report_whitespace;
bool report_newlines;
bool comment_style_cpp;
bool comment_style_sh;
int options;
};

static void upb_Tokenizer_Options(upb_Tokenizer* t, int options) {
t->allow_f_after_float =
(options & kUpb_TokenizerOption_AllowFAfterFloat) != 0;
t->allow_multiline_strings =
(options & kUpb_TokenizerOption_AllowMultilineStrings) != 0;
t->require_space_after_number =
(options & kUpb_TokenizerOption_AllowFieldImmediatelyAfterNumber) == 0;
t->report_whitespace = (options & (kUpb_TokenizerOption_ReportWhitespace |
kUpb_TokenizerOption_ReportNewlines)) != 0;
t->report_newlines = (options & kUpb_TokenizerOption_ReportNewlines) != 0;
t->comment_style_sh = (options & kUpb_TokenizerOption_CommentStyleShell) != 0;
t->comment_style_cpp = !t->comment_style_sh;
}

// -------------------------------------------------------------------
// Internal helpers.

Expand Down Expand Up @@ -400,12 +379,8 @@ static void ConsumeString(upb_Tokenizer* t, char delimiter) {
return;

case '\n': {
if (!t->allow_multiline_strings) {
AddError(t, "String literals cannot cross line boundaries.");
return;
}
NextChar(t);
break;
AddError(t, "String literals cannot cross line boundaries.");
return;
}

case '\\': {
Expand Down Expand Up @@ -503,12 +478,12 @@ static upb_TokenType ConsumeNumber(upb_Tokenizer* t, bool started_with_zero,
"\"e\" must be followed by exponent.");
}

if (t->allow_f_after_float && (TryConsume(t, 'f') || TryConsume(t, 'F'))) {
is_float = true;
if (t->options & kUpb_TokenizerOption_AllowFAfterFloat) {
if (TryConsume(t, 'f') || TryConsume(t, 'F')) is_float = true;
}
}

if (LookingAt(t, upb_Tokenizer_IsLetter) && t->require_space_after_number) {
if (LookingAt(t, upb_Tokenizer_IsLetter)) {
AddError(t, "Need space between number and identifier.");
} else if (t->current_char == '.') {
if (is_float) {
Expand Down Expand Up @@ -586,7 +561,10 @@ static void ConsumeBlockComment(upb_Tokenizer* t, upb_String* content) {
// If we're at the start of a new comment, consume it and return what kind
// of comment it is.
static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
if (t->comment_style_cpp && TryConsume(t, '/')) {
const bool style_sh = t->options & kUpb_TokenizerOption_CommentStyleShell;
const bool style_cpp = !style_sh;

if (style_cpp && TryConsume(t, '/')) {
if (TryConsume(t, '/')) {
return kUpb_CommentType_Line;
} else if (TryConsume(t, '*')) {
Expand All @@ -600,7 +578,7 @@ static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
t->current.end_column = t->column;
return kUpb_CommentType_SlashNot;
}
} else if (t->comment_style_sh && TryConsume(t, '#')) {
} else if (style_sh && TryConsume(t, '#')) {
return kUpb_CommentType_Line;
} else {
return kUpb_CommentType_None;
Expand All @@ -610,7 +588,7 @@ static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
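// If we're looking at whitespace, consume it as a kUpb_TokenType_Whitespace
// token. Returns true only if whitespace tokens are being reported (see
// kUpb_TokenizerOption_ReportWhitespace, which ReportNewlines implies).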
static bool TryConsumeWhitespace(upb_Tokenizer* t) {
if (t->report_newlines) {
if (t->options & kUpb_TokenizerOption_ReportNewlines) {
if (TryConsumeOne(t, upb_Tokenizer_IsWhitespaceNoNewline)) {
ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
t->current.type = kUpb_TokenType_Whitespace;
Expand All @@ -621,20 +599,19 @@ static bool TryConsumeWhitespace(upb_Tokenizer* t) {
if (TryConsumeOne(t, upb_Tokenizer_IsWhitespace)) {
ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespace);
t->current.type = kUpb_TokenType_Whitespace;
return t->report_whitespace;
return (t->options & kUpb_TokenizerOption_ReportWhitespace) != 0;
}
return false;
}

// If the kUpb_TokenizerOption_ReportNewlines option is set and we're looking
// at a newline, consume it as a kUpb_TokenType_Newline token and return true.
static bool TryConsumeNewline(upb_Tokenizer* t) {
if (!t->report_whitespace || !t->report_newlines) {
return false;
}
if (TryConsume(t, '\n')) {
t->current.type = kUpb_TokenType_Newline;
return true;
if (t->options & kUpb_TokenizerOption_ReportNewlines) {
if (TryConsume(t, '\n')) {
t->current.type = kUpb_TokenType_Newline;
return true;
}
}
return false;
}
Expand Down Expand Up @@ -1055,13 +1032,17 @@ upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
t->record_target = NULL;
t->record_start = -1;

// ReportNewlines implies ReportWhitespace.
if (options & kUpb_TokenizerOption_ReportNewlines) {
options |= kUpb_TokenizerOption_ReportWhitespace;
}
t->options = options;

t->previous_type = kUpb_TokenType_Start;
t->previous_line = 0;
t->previous_column = 0;
t->previous_end_column = 0;

upb_Tokenizer_Options(t, options);

upb_Token_Init(&t->current, arena);

if (size) {
Expand Down
18 changes: 5 additions & 13 deletions upb/io/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,24 +87,16 @@ typedef enum {
// interpreted as floats. For all other purposes, the 'f' is ignored.
kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,

// If set, allow string literals to span multiple lines.
// Do not use this; for Google-internal cleanup only.
kUpb_TokenizerOption_AllowMultilineStrings = 1 << 1,

// If set, allow a field name to appear immediately after a number without
// requiring any intervening whitespace as a delimiter.
// Do not use this; for Google-internal cleanup only.
kUpb_TokenizerOption_AllowFieldImmediatelyAfterNumber = 1 << 2,

// If set, whitespace tokens are reported by Next().
kUpb_TokenizerOption_ReportWhitespace = 1 << 3,
kUpb_TokenizerOption_ReportWhitespace = 1 << 1,

// If set, newline tokens are reported by Next(). Implies ReportWhitespace.
kUpb_TokenizerOption_ReportNewlines = 1 << 4,
// If set, newline tokens are reported by Next().
// This is a superset of ReportWhitespace.
kUpb_TokenizerOption_ReportNewlines = 1 << 2,

// By default the tokenizer expects C/C++-style (// and /* */) comments.
// If set, it expects shell-style (#) comments instead.
kUpb_TokenizerOption_CommentStyleShell = 1 << 5,
kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
} upb_Tokenizer_Option;

// Abstract interface for an object which collects the errors that occur
Expand Down

0 comments on commit 922a858

Please sign in to comment.