Skip to content

Commit

Permalink
ESQL: Improve grammar to allow identifiers with . (#100740)
Browse files Browse the repository at this point in the history
Extend the unquoted identifier to contain `.`, not just numbers. Without it,
 the lexer picks the characters up as a decimal literal, which leads to errors.
Additionally, fix a bug in quoting identifiers.

Fix #100312
  • Loading branch information
costin authored Dec 12, 2023
1 parent 1bc8477 commit a8a956f
Show file tree
Hide file tree
Showing 21 changed files with 2,255 additions and 1,711 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/100740.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 100740
summary: "ESQL: Referencing expressions that contain backticks requires <<esql-identifiers,escaping those backticks>>."
area: ES|QL
type: enhancement
issues:
- 100312
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,9 @@ emp_no:integer | birth_date:date | x:date
;

evalDateTruncGrouping
from employees | eval y = date_trunc(1 year, hire_date) | stats count(emp_no) by y | sort y | keep y, count(emp_no) | limit 5;
from employees | eval y = date_trunc(1 year, hire_date) | stats c = count(emp_no) by y | sort y | keep y, c | limit 5;

y:date | count(emp_no):long
y:date | c:long
1985-01-01T00:00:00.000Z | 11
1986-01-01T00:00:00.000Z | 11
1987-01-01T00:00:00.000Z | 15
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -782,3 +782,12 @@ FROM sample_data

median_duration:double | client_ip:ip
;

fieldEscaping#[skip:-8.12.99, reason:Fixed bug in 8.13 of removing the leading/trailing backquotes of an identifier]
FROM sample_data
| stats count(`event_duration`) | keep `count(``event_duration``)`
;

count(`event_duration`):l
7
;
284 changes: 234 additions & 50 deletions x-pack/plugin/esql/src/main/antlr/EsqlBaseLexer.g4
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
lexer grammar EsqlBaseLexer;

DISSECT : 'dissect' -> pushMode(EXPRESSION);
DROP : 'drop' -> pushMode(SOURCE_IDENTIFIERS);
ENRICH : 'enrich' -> pushMode(SOURCE_IDENTIFIERS);
EVAL : 'eval' -> pushMode(EXPRESSION);
EXPLAIN : 'explain' -> pushMode(EXPLAIN_MODE);
FROM : 'from' -> pushMode(SOURCE_IDENTIFIERS);
GROK : 'grok' -> pushMode(EXPRESSION);
INLINESTATS : 'inlinestats' -> pushMode(EXPRESSION);
KEEP : 'keep' -> pushMode(SOURCE_IDENTIFIERS);
LIMIT : 'limit' -> pushMode(EXPRESSION);
MV_EXPAND : 'mv_expand' -> pushMode(SOURCE_IDENTIFIERS);
PROJECT : 'project' -> pushMode(SOURCE_IDENTIFIERS);
RENAME : 'rename' -> pushMode(SOURCE_IDENTIFIERS);
ROW : 'row' -> pushMode(EXPRESSION);
SHOW : 'show' -> pushMode(EXPRESSION);
SORT : 'sort' -> pushMode(EXPRESSION);
STATS : 'stats' -> pushMode(EXPRESSION);
WHERE : 'where' -> pushMode(EXPRESSION);
UNKNOWN_CMD : ~[ \r\n\t[\]/]+ -> pushMode(EXPRESSION);
DISSECT : 'dissect' -> pushMode(EXPRESSION_MODE);
DROP : 'drop' -> pushMode(PROJECT_MODE);
ENRICH : 'enrich' -> pushMode(ENRICH_MODE);
EVAL : 'eval' -> pushMode(EXPRESSION_MODE);
EXPLAIN : 'explain' -> pushMode(EXPLAIN_MODE);
FROM : 'from' -> pushMode(FROM_MODE);
GROK : 'grok' -> pushMode(EXPRESSION_MODE);
INLINESTATS : 'inlinestats' -> pushMode(EXPRESSION_MODE);
KEEP : 'keep' -> pushMode(PROJECT_MODE);
LIMIT : 'limit' -> pushMode(EXPRESSION_MODE);
MV_EXPAND : 'mv_expand' -> pushMode(MVEXPAND_MODE);
PROJECT : 'project' -> pushMode(PROJECT_MODE);
RENAME : 'rename' -> pushMode(RENAME_MODE);
ROW : 'row' -> pushMode(EXPRESSION_MODE);
SHOW : 'show' -> pushMode(SHOW_MODE);
SORT : 'sort' -> pushMode(EXPRESSION_MODE);
STATS : 'stats' -> pushMode(EXPRESSION_MODE);
WHERE : 'where' -> pushMode(EXPRESSION_MODE);
UNKNOWN_CMD : ~[ \r\n\t[\]/]+ -> pushMode(EXPRESSION_MODE);

LINE_COMMENT
: '//' ~[\r\n]* '\r'? '\n'? -> channel(HIDDEN)
Expand All @@ -31,16 +31,20 @@ MULTILINE_COMMENT
WS
: [ \r\n\t]+ -> channel(HIDDEN)
;


//
// Explain
//
mode EXPLAIN_MODE;
EXPLAIN_OPENING_BRACKET : '[' -> type(OPENING_BRACKET), pushMode(DEFAULT_MODE);
EXPLAIN_PIPE : '|' -> type(PIPE), popMode;
EXPLAIN_OPENING_BRACKET : OPENING_BRACKET -> type(OPENING_BRACKET), pushMode(DEFAULT_MODE);
EXPLAIN_PIPE : PIPE -> type(PIPE), popMode;
EXPLAIN_WS : WS -> channel(HIDDEN);
EXPLAIN_LINE_COMMENT : LINE_COMMENT -> channel(HIDDEN);
EXPLAIN_MULTILINE_COMMENT : MULTILINE_COMMENT -> channel(HIDDEN);

mode EXPRESSION;
//
// Expression - used by most commands
//
mode EXPRESSION_MODE;

PIPE : '|' -> popMode;

Expand All @@ -64,6 +68,27 @@ fragment EXPONENT
: [Ee] [+-]? DIGIT+
;

fragment ASPERAND
: '@'
;

fragment BACKQUOTE
: '`'
;

fragment BACKQUOTE_BLOCK
: ~'`'
| '``'
;

fragment UNDERSCORE
: '_'
;

fragment UNQUOTED_ID_BODY
: (LETTER | DIGIT | UNDERSCORE)
;

STRING
: '"' (ESCAPE_SEQUENCE | UNESCAPED_CHARS)* '"'
| '"""' (~[\r\n])*? '"""' '"'? '"'?
Expand Down Expand Up @@ -103,8 +128,6 @@ PARAM: '?';
RLIKE: 'rlike';
RP : ')';
TRUE : 'true';
INFO : 'info';
FUNCTIONS : 'functions';

EQ : '==';
NEQ : '!=';
Expand All @@ -124,19 +147,18 @@ PERCENT : '%';
// mode. Thus, the two popModes on CLOSING_BRACKET. The other way could be as
// the start of a multivalued field constant. To line up with the double pop
// the explain mode needs, we double push when we see that.
OPENING_BRACKET : '[' -> pushMode(EXPRESSION), pushMode(EXPRESSION);
OPENING_BRACKET : '[' -> pushMode(EXPRESSION_MODE), pushMode(EXPRESSION_MODE);
CLOSING_BRACKET : ']' -> popMode, popMode;


UNQUOTED_IDENTIFIER
: LETTER (LETTER | DIGIT | '_')*
: LETTER UNQUOTED_ID_BODY*
// only allow @ at beginning of identifier to keep the option to allow @ as infix operator in the future
// also, single `_` and `@` characters are not valid identifiers
| ('_' | '@') (LETTER | DIGIT | '_')+
| (UNDERSCORE | ASPERAND) UNQUOTED_ID_BODY+
;

QUOTED_IDENTIFIER
: '`' ( ~'`' | '``' )* '`'
: BACKQUOTE BACKQUOTE_BLOCK+ BACKQUOTE
;

EXPR_LINE_COMMENT
Expand All @@ -150,42 +172,204 @@ EXPR_MULTILINE_COMMENT
EXPR_WS
: WS -> channel(HIDDEN)
;
//
// FROM command
//
mode FROM_MODE;
FROM_PIPE : PIPE -> type(PIPE), popMode;
FROM_OPENING_BRACKET : OPENING_BRACKET -> type(OPENING_BRACKET), pushMode(FROM_MODE), pushMode(FROM_MODE);
FROM_CLOSING_BRACKET : CLOSING_BRACKET -> type(CLOSING_BRACKET), popMode, popMode;
FROM_COMMA : COMMA -> type(COMMA);
FROM_ASSIGN : ASSIGN -> type(ASSIGN);

METADATA: 'metadata';

fragment FROM_UNQUOTED_IDENTIFIER_PART
: ~[=`|,[\]/ \t\r\n]
| '/' ~[*/] // allow single / but not followed by another / or * which would start a comment
;

mode SOURCE_IDENTIFIERS;
FROM_UNQUOTED_IDENTIFIER
: FROM_UNQUOTED_IDENTIFIER_PART+
;

FROM_QUOTED_IDENTIFIER
: QUOTED_IDENTIFIER -> type(QUOTED_IDENTIFIER)
;

FROM_LINE_COMMENT
: LINE_COMMENT -> channel(HIDDEN)
;

FROM_MULTILINE_COMMENT
: MULTILINE_COMMENT -> channel(HIDDEN)
;

FROM_WS
: WS -> channel(HIDDEN)
;
//
// DROP, KEEP, PROJECT
//
mode PROJECT_MODE;
PROJECT_PIPE : PIPE -> type(PIPE), popMode;
PROJECT_DOT: DOT -> type(DOT);
PROJECT_COMMA : COMMA -> type(COMMA);

fragment UNQUOTED_ID_BODY_WITH_PATTERN
: (LETTER | DIGIT | UNDERSCORE | ASTERISK)
;

PROJECT_UNQUOTED_IDENTIFIER
: (LETTER | ASTERISK) UNQUOTED_ID_BODY_WITH_PATTERN*
| (UNDERSCORE | ASPERAND) UNQUOTED_ID_BODY_WITH_PATTERN+
;

PROJECT_QUOTED_IDENTIFIER
: QUOTED_IDENTIFIER -> type(QUOTED_IDENTIFIER)
;

PROJECT_LINE_COMMENT
: LINE_COMMENT -> channel(HIDDEN)
;

PROJECT_MULTILINE_COMMENT
: MULTILINE_COMMENT -> channel(HIDDEN)
;

PROJECT_WS
: WS -> channel(HIDDEN)
;
//
// | RENAME a.b AS x, c AS y
//
mode RENAME_MODE;
RENAME_PIPE : PIPE -> type(PIPE), popMode;
RENAME_ASSIGN : ASSIGN -> type(ASSIGN);
RENAME_COMMA : COMMA -> type(COMMA);
RENAME_DOT: DOT -> type(DOT);

SRC_PIPE : '|' -> type(PIPE), popMode;
SRC_OPENING_BRACKET : '[' -> type(OPENING_BRACKET), pushMode(SOURCE_IDENTIFIERS), pushMode(SOURCE_IDENTIFIERS);
SRC_CLOSING_BRACKET : ']' -> popMode, popMode, type(CLOSING_BRACKET);
SRC_COMMA : ',' -> type(COMMA);
SRC_ASSIGN : '=' -> type(ASSIGN);
AS : 'as';
METADATA: 'metadata';
ON : 'on';
WITH : 'with';

SRC_UNQUOTED_IDENTIFIER
: SRC_UNQUOTED_IDENTIFIER_PART+
RENAME_QUOTED_IDENTIFIER
: QUOTED_IDENTIFIER -> type(QUOTED_IDENTIFIER)
;

fragment SRC_UNQUOTED_IDENTIFIER_PART
: ~[=`|,[\]/ \t\r\n]+
| '/' ~[*/] // allow single / but not followed by another / or * which would start a comment
// use the unquoted pattern to let the parser invalidate fields with *
RENAME_UNQUOTED_IDENTIFIER
: PROJECT_UNQUOTED_IDENTIFIER -> type(PROJECT_UNQUOTED_IDENTIFIER)
;

RENAME_LINE_COMMENT
: LINE_COMMENT -> channel(HIDDEN)
;

SRC_QUOTED_IDENTIFIER
: QUOTED_IDENTIFIER
RENAME_MULTILINE_COMMENT
: MULTILINE_COMMENT -> channel(HIDDEN)
;

SRC_LINE_COMMENT
RENAME_WS
: WS -> channel(HIDDEN)
;

// | ENRICH ON key WITH fields
mode ENRICH_MODE;
ENRICH_PIPE : PIPE -> type(PIPE), popMode;

ON : 'on' -> pushMode(ENRICH_FIELD_MODE);
WITH : 'with' -> pushMode(ENRICH_FIELD_MODE);

// use the unquoted pattern to let the parser invalidate fields with *
ENRICH_POLICY_UNQUOTED_IDENTIFIER
: FROM_UNQUOTED_IDENTIFIER -> type(FROM_UNQUOTED_IDENTIFIER)
;

ENRICH_QUOTED_IDENTIFIER
: QUOTED_IDENTIFIER -> type(QUOTED_IDENTIFIER)
;

ENRICH_LINE_COMMENT
: LINE_COMMENT -> channel(HIDDEN)
;

ENRICH_MULTILINE_COMMENT
: MULTILINE_COMMENT -> channel(HIDDEN)
;

ENRICH_WS
: WS -> channel(HIDDEN)
;

// submode for Enrich to allow different lexing between policy identifier (loose) and field identifiers
mode ENRICH_FIELD_MODE;
ENRICH_FIELD_PIPE : PIPE -> type(PIPE), popMode, popMode;
ENRICH_FIELD_ASSIGN : ASSIGN -> type(ASSIGN);
ENRICH_FIELD_COMMA : COMMA -> type(COMMA);
ENRICH_FIELD_DOT: DOT -> type(DOT);

ENRICH_FIELD_WITH : WITH -> type(WITH) ;

ENRICH_FIELD_UNQUOTED_IDENTIFIER
: PROJECT_UNQUOTED_IDENTIFIER -> type(PROJECT_UNQUOTED_IDENTIFIER)
;

ENRICH_FIELD_QUOTED_IDENTIFIER
: QUOTED_IDENTIFIER -> type(QUOTED_IDENTIFIER)
;

ENRICH_FIELD_LINE_COMMENT
: LINE_COMMENT -> channel(HIDDEN)
;

ENRICH_FIELD_MULTILINE_COMMENT
: MULTILINE_COMMENT -> channel(HIDDEN)
;

ENRICH_FIELD_WS
: WS -> channel(HIDDEN)
;

mode MVEXPAND_MODE;
MVEXPAND_PIPE : PIPE -> type(PIPE), popMode;
MVEXPAND_DOT: DOT -> type(DOT);

MVEXPAND_QUOTED_IDENTIFIER
: QUOTED_IDENTIFIER -> type(QUOTED_IDENTIFIER)
;

MVEXPAND_UNQUOTED_IDENTIFIER
: UNQUOTED_IDENTIFIER -> type(UNQUOTED_IDENTIFIER)
;

MVEXPAND_LINE_COMMENT
: LINE_COMMENT -> channel(HIDDEN)
;

MVEXPAND_MULTILINE_COMMENT
: MULTILINE_COMMENT -> channel(HIDDEN)
;

MVEXPAND_WS
: WS -> channel(HIDDEN)
;

//
// SHOW INFO, SHOW FUNCTIONS
//
mode SHOW_MODE;
SHOW_PIPE : PIPE -> type(PIPE), popMode;

INFO : 'info';
FUNCTIONS : 'functions';

SHOW_LINE_COMMENT
: LINE_COMMENT -> channel(HIDDEN)
;

SRC_MULTILINE_COMMENT
SHOW_MULTILINE_COMMENT
: MULTILINE_COMMENT -> channel(HIDDEN)
;

SRC_WS
SHOW_WS
: WS -> channel(HIDDEN)
;
Loading

0 comments on commit a8a956f

Please sign in to comment.