From 805bffceea0bd011507911f02fa227a1733a756a Mon Sep 17 00:00:00 2001 From: Andy C Date: Fri, 13 Jan 2023 21:04:59 -0500 Subject: [PATCH] [osh-language] Compatibility for DQ inside backticks inside DQ The Oil parsing model is smart enough to parse double quotes inside backticks inside double quotes: echo "x `echo "hi"`" Shells aren't, and use this syntax instead: echo "x `echo \"hi\"`" This is issue #1449. --- frontend/id_kind_def.py | 2 +- frontend/lexer_def.py | 8 +++++--- osh/word_parse.py | 38 ++++++++++++++++++++++++++++---------- spec/command-sub.test.sh | 18 ++++++++++-------- test/spec.sh | 2 +- 5 files changed, 45 insertions(+), 23 deletions(-) diff --git a/frontend/id_kind_def.py b/frontend/id_kind_def.py index cfd98bd9c9..705bc23eed 100755 --- a/frontend/id_kind_def.py +++ b/frontend/id_kind_def.py @@ -243,7 +243,7 @@ def AddKinds(spec): # For recognizing \` and \" and \\ within backticks. There's an extra layer # of backslash quoting. - spec.AddKind('Backtick', ['Right', 'Quoted', 'Other']) + spec.AddKind('Backtick', ['Right', 'Quoted', 'DoubleQuote', 'Other']) spec.AddKind('History', ['Op', 'Num', 'Search', 'Other']) diff --git a/frontend/lexer_def.py b/frontend/lexer_def.py index 715fb0e49c..03399d1ed9 100644 --- a/frontend/lexer_def.py +++ b/frontend/lexer_def.py @@ -359,9 +359,11 @@ def R(pat, tok_type): # Preprocessing before ShCommand LEXER_DEF[lex_mode_e.Backtick] = [ C(r'`', Id.Backtick_Right), - # A backslash, and then one of the SAME FOUR escaped chars in the DQ mode. - R(r'\\[$`"\\]', Id.Backtick_Quoted), - R(r'[^`\\\0]+', Id.Backtick_Other), # contiguous run of litera + # A backslash, and then $ or ` or \ + R(r'\\[$`\\]', Id.Backtick_Quoted), + # \" treated specially, depending on whether backticks are double-quoted! 
+ R(r'\\"', Id.Backtick_DoubleQuote), + R(r'[^`\\\0]+', Id.Backtick_Other), # contiguous run of literals R(r'[^\0]', Id.Backtick_Other), # anything else ] diff --git a/osh/word_parse.py b/osh/word_parse.py index d2193e178e..a02d8d2a7a 100644 --- a/osh/word_parse.py +++ b/osh/word_parse.py @@ -447,7 +447,7 @@ def _ParseVarExpr(self, arg_lex_mode, allow_query=False): def ReadBracedVarSub(self, left_token): # type: (Token) -> Tuple[braced_var_sub, Token] """ For Oil expressions like var x = ${x:-"default"}. """ - part = self._ReadBracedVarSub(left_token, False) # not quoted + part = self._ReadBracedVarSub(left_token, d_quoted=False) last_token = self.cur_token return part, last_token @@ -680,10 +680,10 @@ def _ReadDoubleQuotedLeftParts(self): # type: () -> word_part_t """Read substitution parts in a double quoted context.""" if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick): - return self._ReadCommandSub(self.token_type) + return self._ReadCommandSub(self.token_type, d_quoted=True) if self.token_type == Id.Left_DollarBrace: - return self._ReadBracedVarSub(self.cur_token, True) # DQ + return self._ReadBracedVarSub(self.cur_token, d_quoted=True) if self.token_type == Id.Left_DollarDParen: return self._ReadArithSub() @@ -741,10 +741,10 @@ def _ReadUnquotedLeftParts(self, try_triple_quote, triple_out): if self.token_type in ( Id.Left_DollarParen, Id.Left_Backtick, Id.Left_ProcSubIn, Id.Left_ProcSubOut): - return self._ReadCommandSub(self.token_type) + return self._ReadCommandSub(self.token_type, d_quoted=False) if self.token_type == Id.Left_DollarBrace: - return self._ReadBracedVarSub(self.cur_token, False) # not DQ + return self._ReadBracedVarSub(self.cur_token, d_quoted=False) if self.token_type == Id.Left_DollarDParen: return self._ReadArithSub() @@ -920,8 +920,8 @@ def ReadDoubleQuoted(self, left_token, parts): self._ReadLikeDQ(left_token, True, parts) return self.cur_token - def _ReadCommandSub(self, left_id): - # type: (Id_t) -> command_sub + def 
_ReadCommandSub(self, left_id, d_quoted=False): + # type: (Id_t, bool) -> command_sub """ NOTE: This is not in the grammar, because word parts aren't in the grammar! @@ -968,22 +968,40 @@ def _ReadCommandSub(self, left_id): self._Next(lex_mode_e.Backtick) # advance past ` + log("d_quoted %s", d_quoted) + parts = [] # type: List[str] while True: self._Peek() - #print(self.cur_token) + #log("TOK %s", self.cur_token) + if self.token_type == Id.Backtick_Quoted: - parts.append(self.cur_token.val[1:]) # remove leading \ + parts.append(self.cur_token.val[1:]) # Remove leading \ + + elif self.token_type == Id.Backtick_DoubleQuote: + # Compatibility: If backticks are double quoted, then double quotes + # within them have to be \" + # Shells aren't smart enough to match nested " and ` quotes (but OSH + # is) + if d_quoted: + parts.append(self.cur_token.val[1:]) # Remove leading \ + else: + parts.append(self.cur_token.val) + elif self.token_type == Id.Backtick_Other: parts.append(self.cur_token.val) + elif self.token_type == Id.Backtick_Right: break + elif self.token_type == Id.Eof_Real: # Note: this parse error is in the ORIGINAL context. No code_str yet. p_die('Unexpected EOF while looking for closing backtick', token=left_token) + else: raise AssertionError(self.cur_token) + self._Next(lex_mode_e.Backtick) # Calculate right SPID on CommandSub BEFORE re-parsing. @@ -1529,7 +1547,7 @@ def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok): # Users can also use look at ,(*.py|*.sh) if (self.parse_opts.parse_at() and self.token_type == Id.ExtGlob_At and num_parts == 0): - cs_part = self._ReadCommandSub(Id.Left_AtParen) + cs_part = self._ReadCommandSub(Id.Left_AtParen, d_quoted=False) # RARE mutation of tok.id! 
cs_part.left_token.id = Id.Left_AtParen part = cs_part # for type safety diff --git a/spec/command-sub.test.sh b/spec/command-sub.test.sh index 4bff9dda22..52c14294f6 100644 --- a/spec/command-sub.test.sh +++ b/spec/command-sub.test.sh @@ -129,6 +129,16 @@ cat $file " ## END +#### Quoting " within `` +echo 1 `echo \"` +#echo 2 `echo \\"` +#echo 3 `echo \\\"` +#echo 4 `echo \\\\"` + +## STDOUT: +1 " +## END + #### Quoting $ within `` echo 1 `echo $` echo 2 `echo \$` @@ -232,14 +242,6 @@ echo `echo \\"foo\\"` "foo" ## END -# Documented in doc/known-differences.md (and Morbig paper brought up the same -# issue) -## OK osh STDOUT: -"foo" -foo -"foo" -## END - #### More levels of double quotes in backticks # Shells don't agree here, some of them give you form feeds! # There are two levels of processing I don't understand. diff --git a/test/spec.sh b/test/spec.sh index acfa1cc204..0aad181737 100755 --- a/test/spec.sh +++ b/test/spec.sh @@ -297,7 +297,7 @@ osh-only() { # Regress bugs bugs() { - sh-spec spec/bugs.test.sh --osh-failures-allowed 3 \ + sh-spec spec/bugs.test.sh --osh-failures-allowed 2 \ ${REF_SHELLS[@]} $ZSH $BUSYBOX_ASH $OSH_LIST "$@" }