Skip to content

Commit

Permalink
feat(Lezer grammar): Rework grammar with semantic parse tree (PRQL#3588)
Browse files Browse the repository at this point in the history
Co-authored-by: Maximilian Roos <[email protected]>
  • Loading branch information
vanillajonathan and max-sixty authored Oct 2, 2023
1 parent 62e6eed commit 12369c9
Show file tree
Hide file tree
Showing 12 changed files with 232 additions and 102 deletions.
4 changes: 2 additions & 2 deletions grammars/prql-lezer/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
"@lezer/generator": "^1.5.1",
"@rollup/plugin-node-resolve": "^15.2.1",
"mocha": "^10.2.0",
"rollup": "^3.29.2"
"rollup": "^3.29.4"
},
"dependencies": {
"@lezer/highlight": "^1.1.6",
"@lezer/lr": "^1.3.11"
"@lezer/lr": "^1.3.12"
}
}
2 changes: 2 additions & 0 deletions grammars/prql-lezer/src/highlight.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ export const prqlHighlight = styleTags({
case: t.controlKeyword,
in: t.operatorKeyword,
Comment: t.lineComment,
Docblock: t.docString,
Integer: t.integer,
Float: t.float,
String: t.string,
F_string: t.special(t.string),
R_string: t.special(t.string),
S_string: t.special(t.string),
TimeUnit: t.unit,
"( )": t.paren,
"[ ]": t.squareBracket,
"{ }": t.brace,
Expand Down
149 changes: 94 additions & 55 deletions grammars/prql-lezer/src/prql.grammar
Original file line number Diff line number Diff line change
Expand Up @@ -3,114 +3,153 @@
// normal functions?
// - A few small TODOs included below

@precedence {
times @left,
plus @left,
compare @left,
and @left,
or @left
}

@top Query { Statements }
@skip { space | Comment | wrapped_line }

Statements { newline* Query_def? (Def)* Pipeline_stmt? end }
@skip { space | Comment | Docblock | wrappedLine }

Query_def { @specialize<ident_part, "prql"> Named_arg* newline+ }
Statements { newline* QueryDefinition? VariableDeclaration* PipelineStatement? end }

Pipeline_stmt { Pipeline ( ~ambig_newline newline+ | end )}
QueryDefinition { @specialize<identPart, "prql"> NamedArg+ newline+ }

Pipeline { expr_call (pipe+ expr_call)* }
PipelineStatement { Pipeline ( ~ambigNewline newline+ | end )}

pipe { "|" | ~ambig_newline newline }
Pipeline { exprCall (pipe+ exprCall)* | test }

Tuple { "{" newline* tuple_item (("," newline* ) tuple_item)* ","? newline* "}" }
pipe { "|" | ~ambigNewline newline }

tuple_item { Assign_call | expr_call | Case_branch }
TupleExpression { "{" newline* tupleItem (("," newline* ) tupleItem)* ","? newline* "}" }

expr_call { Expr | Func_call }
tupleItem { AssignCall | exprCall | CaseBranch }

exprCall { expression | CallExpression }
// Ideally we would force a space after `Ident` to prevent an invalid s-string
// being parsed as a Func_call, e.g. `s"{a"` -> `s` & `"{a"'. But we
// being parsed as a CallExpression, e.g. `s"{a"` -> `s` & `"{a"`. But we
// can't seem to force a space because it's in our skip, and I can't see
// a way of changing the skip expression to only specialize on a single item
Func_call { Ident ( (Named_arg | Assign | Expr))+ }
CallExpression { Ident ( (NamedArg | Assign | expression))+ }

NamedArg { identPart ":" expression }
Assign { identPart "=" expression }
AssignCall { identPart "=" exprCall }
CaseBranch { expression "=>" expression }
// Possibly we could only accept case branches inside the TupleExpression?
CaseExpression { @specialize<identPart, "case"> TupleExpression }

// !term is required here, otherwise `[x+emp_no, -3]` is parsed as `x` & `+emp_no`
term { !term Ident | number | Tuple | DateTime | Nested_pipeline | op_unary term | String | R_string | S_string | F_string | Range | Array | Case }
NestedPipeline { "(" newline* Pipeline ~ambigNewline newline? ")" }

Expr { term (( Op_bin ) term )* }
commaSep<expr> { newline* expr ("," newline* expr newline*)* ","? }
LogicOp<expr> { expr }

// The name "test" here means equality testing. It is the name used in the Python grammar.
test { testInner }

testInner { binaryTest | unaryTest | expression }

binaryTest[@name="BinaryExpression"] {
testInner !or LogicOp<"||" | "??"> testInner |
testInner !and LogicOp<"&&"> testInner |
testInner !compare (CompareOp | kw<"in">) testInner
}

Named_arg { ident_part ":" Expr }
Assign { ident_part "=" Expr }
Assign_call { ident_part "=" expr_call }
Case_branch { Expr "=>" Expr }
Array { "[" newline* Expr (("," newline* ) Expr)* ","? newline* "]" }
// Possibly we could only accept case branches inside the Tuple?
Case { @specialize<ident_part, "case"> Tuple }
unaryTest[@name="UnaryExpression"] { kw<"!"> testInner }

expression[@isGroup=Expression] {
BinaryExpression |
ArrayExpression |
TupleExpression |
CaseExpression |
DateTime |
RangeExpression |
Ident |
number |
String | F_string | R_string | S_string |
TimeUnit
}

Nested_pipeline { "(" newline* Pipeline ~ambig_newline newline? ")" }
ArrayExpression { "[" commaSep<test | "*" expression>? "]" }

@precedence { term @left }
BinaryExpression {
expression !plus ArithOp { "+" | "-" } expression |
expression !times ArithOp { "*" | "/" | "%" | "//" } expression
}

// Because this is outside tokens, we can't disallow whitespace.
// It's outside tokens because otherwise it conflicts with Ident
Ident { ident_part ( "." (ident_part | "*"))* }
Ident { identPart ( "." (identPart | "*"))* }

number { Integer | Float }

kw<term> { @specialize[@name={term}]<identifier, term> }

// I don't think it's possible to have `Op_bin` & `Op_unary` as tokens — that
// would mean `-` can't be both unary & bin.
Op_bin { op_bin_only | !term op_unary }
VariableDeclaration { @specialize<identPart, "let"> identPart "=" (NestedPipeline (newline+ | end) | Lambda) }

number { Integer time_unit? | Float }
Lambda { LambdaParam* "->" exprCall ( newline+ | end ) }
TypeDefinition { "<" TypeTerm ( "|" TypeTerm)* ">" }
TypeTerm { identPart TypeDefinition? }
LambdaParam { identPart TypeDefinition? (":" expression)? }

@tokens {
CompareOp { "==" | "!=" | "~=" | ">=" | "<=" | ">" | "<" }
date { @digit+ "-" @digit+ "-" @digit+ }
time { @digit+ ":" @digit+ ( ":" @digit+ ( "." @digit+ )? )? }
// We can't seem to set the number of digits, so this will allow any
// combination of digits & hyphens.
DateTime { "@" ( date | time | date "T" time ( "Z" | ( "-" | "+" ) @digit+ ":" @digit+ )? ) }
time_unit { "years" | "months" | "weeks" | "days" | "hours" | "minutes" | "seconds" | "milliseconds" | "microseconds" }
identifier_char { @asciiLetter | $[_\u{a1}-\u{10ffff}] }
ident_part { identifier_char (identifier_char | "_" | @digit )* }
TimeUnit { @digit+ ("years" | "months" | "weeks" | "days" | "hours" | "minutes" | "seconds" | "milliseconds" | "microseconds") }
identifierChar { @asciiLetter | $[_\u{a1}-\u{10ffff}] }
identPart { identifierChar (identifierChar | "_" | @digit )* }
identifier { identPart }
Integer { @digit ( @digit | "_" )* ("e" Integer)? }
Float { @digit ( @digit | "_" )* "." @digit ( @digit | "_" )* ("e" Integer)? }
// TODO: This is not as precise as PRQL, which doesn't allow trailing
// underscores and allows no digit before the decimal point.
space { " "+ }
Docblock { "#!" ![\n]* }
Comment { "#" ![\n]* }
op_bin_only { "*" | "/" | "//" | "%" | "!=" | ">=" | "<=" | "~=" | ">" | "<" | "??" | "&&" | "||" }
op_unary { "-" | "+" | "==" }
//op_bin_only { "*" | "/" | "//" | "%" | "!=" | ">=" | "<=" | "~=" | ">" | "<" | "??" | "&&" | "||" }
//op_unary { "-" | "+" | "==" }
end { @eof }
line_wrap { "\\" }
wrapped_line { newline+ (Comment newline+)* line_wrap }
lineWrap { "\\" }
wrappedLine { newline+ (Comment newline+)* lineWrap }
newline { "\n" }
// TODO: Because this can also be used to compile to BETWEEN, ranges should
// allow any literal, and arguably any expression.
Range { @digit+ ".." @digit+ }
RangeExpression { @digit+ ".." @digit+ }
// Couldn't manage to do these & the interpolated as a template; couldn't
// find how to negate a variable template
String { $["] !["]* $["] | $['] ![']* $['] }
// Matches any character except hamburger. This is a workaround.
String { '"""' ![🍔]* '"""' | "'''" ![🍔]* "'''" | $["] !["]* $["] | $['] ![']* $['] }

// TODO: not getting the interpolations highlighted; it just shows the whole
// string as a string, because these are all within the `@tokens` block. But
// they need to be within this block, because it's not possible to have
// negations (e.g. `![{'}`) outside it.
String_interpolated_single { $['] ( ![{'] | Interpolation_inner_single )* $['] }
String_interpolated_double { $["] ( ![{"] | Interpolation_inner_double )* $["] }
stringInterpolatedSingle { $['] ( ![{'] | interpolationInnerSingle )* $['] }
stringInterpolatedDouble { $["] ( ![{"] | interpolationInnerDouble )* $["] }

Interpolation_inner_single { "{" ![}']* "}" }
Interpolation_inner_double { "{" ![}"]* "}" }
interpolationInnerSingle { "{" ![}']* "}" }
interpolationInnerDouble { "{" ![}"]* "}" }

Interpolated_string<prefix> { prefix ( String_interpolated_double | String_interpolated_single ) }
interpolatedString<prefix> { prefix ( stringInterpolatedDouble | stringInterpolatedSingle ) }

R_string {Interpolated_string<'r'>}
S_string {Interpolated_string<'s'>}
F_string {Interpolated_string<'f'>}
R_string { interpolatedString<'r'> }
S_string { interpolatedString<'s'> }
F_string { interpolatedString<'f'> }

// We need to give precedence to `Op_bin` so we don't get `x+y` as `x` & `+y`.
// R, S & F strings have precedence over idents beginning with r / s / f (we could
// use specialize but I think means we need to redefine `String` for each)
@precedence { R_string, S_string, F_string, op_bin_only, ident_part }

@precedence { Range, Float, Integer, time_unit, ident_part }
@precedence { RangeExpression, R_string, S_string, F_string, Float, TimeUnit, Integer, identPart, identifier, Docblock, Comment }
}

Def { @specialize<ident_part, "let"> ident_part "=" (Nested_pipeline (newline+ | end) | Lambda) }

Lambda { Lambda_param* "->" expr_call ( newline+ | end ) }
Type_def { "<" Type_term ( "|" Type_term)* ">" }
Type_term { ident_part Type_def? }
Lambda_param { ident_part Type_def? (":" Expr)? }

@external propSource prqlHighlight from "./highlight"

@detectDelim
10 changes: 5 additions & 5 deletions grammars/prql-lezer/test/arithmetics.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,36 @@

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Integer,Op_bin,Float)))))
Query(Statements(PipelineStatement(Pipeline(BinaryExpression(Integer,ArithOp,Float)))))

# Minus

10 - 10

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Integer,Op_bin,Integer)))))
Query(Statements(PipelineStatement(Pipeline(BinaryExpression(Integer,ArithOp,Integer)))))

# Multiply

10 * 10

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Integer,Op_bin,Integer)))))
Query(Statements(PipelineStatement(Pipeline(BinaryExpression(Integer,ArithOp,Integer)))))

# Divide

10 / 10

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Integer,Op_bin,Integer)))))
Query(Statements(PipelineStatement(Pipeline(BinaryExpression(Integer,ArithOp,Integer)))))

# Multiple ops

10 + 10 + 10

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Integer,Op_bin,Integer,Op_bin,Integer)))))
Query(Statements(PipelineStatement(Pipeline(BinaryExpression(BinaryExpression(Integer,ArithOp,Integer),ArithOp,Integer)))))
22 changes: 19 additions & 3 deletions grammars/prql-lezer/test/arrays.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Array(Expr(Ident),Expr(Ident),Expr(Ident)))))))
Query(Statements(PipelineStatement(Pipeline(ArrayExpression(Ident,Ident,Ident)))))

# Array on multiple lines

Expand All @@ -16,7 +16,7 @@ Query(Statements(Pipeline_stmt(Pipeline(Expr(Array(Expr(Ident),Expr(Ident),Expr(

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Array(Expr(Ident),Expr(Ident),Expr(Ident)))))))
Query(Statements(PipelineStatement(Pipeline(ArrayExpression(Ident,Ident,Ident)))))

# Array on multiple lines with blank lines

Expand All @@ -32,4 +32,20 @@ Query(Statements(Pipeline_stmt(Pipeline(Expr(Array(Expr(Ident),Expr(Ident),Expr(

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(Array(Expr(Ident),Expr(Ident),Expr(Ident)))))))
Query(Statements(PipelineStatement(Pipeline(ArrayExpression(Ident,Ident,Ident)))))

# Array on integers

[1, 2, 3]

==>

Query(Statements(PipelineStatement(Pipeline(ArrayExpression(Integer,Integer,Integer)))))

# Array on floats

[1.1, 2.2, 3.3]

==>

Query(Statements(PipelineStatement(Pipeline(ArrayExpression(Float,Float,Float)))))
14 changes: 7 additions & 7 deletions grammars/prql-lezer/test/datetime.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,52 +4,52 @@

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(DateTime)))))
Query(Statements(PipelineStatement(Pipeline(DateTime))))

# Time HH:MM

@08:30

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(DateTime)))))
Query(Statements(PipelineStatement(Pipeline(DateTime))))

# Time HH:MM:SS

@12:00:00

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(DateTime)))))
Query(Statements(PipelineStatement(Pipeline(DateTime))))

# Time HH:MM:SS.xxx

@12:00:00.500

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(DateTime)))))
Query(Statements(PipelineStatement(Pipeline(DateTime))))

# Date and time

@1970-01-01T12:00:00

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(DateTime)))))
Query(Statements(PipelineStatement(Pipeline(DateTime))))

# Date and time with timezone

@1970-01-01T12:00:00+01:00

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(DateTime)))))
Query(Statements(PipelineStatement(Pipeline(DateTime))))

# Date and time in UTC

@1970-01-01T12:00:00Z

==>

Query(Statements(Pipeline_stmt(Pipeline(Expr(DateTime)))))
Query(Statements(PipelineStatement(Pipeline(DateTime))))
Loading

0 comments on commit 12369c9

Please sign in to comment.