From a5dc98b6d4b4f23a6c31884c6f7edf5905656ba8 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Mon, 30 Dec 2024 16:02:01 +0100 Subject: [PATCH] Better handle heredoc escaping in the parser translator This does a few things: * After a line continuation in a heredoc, emit only a single token * Dedent ` heredocs * Properly handle escapes in single and doublequoted heredocs Some of the heredoc escaping logic should be applied to strings as well, but I'll leave that for a different PR. --- lib/prism/translation/parser/lexer.rb | 95 +++++++++++++++++++-------- test/prism/ruby/parser_test.rb | 6 -- 2 files changed, 68 insertions(+), 33 deletions(-) diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index b453337b1d..81590bc3ae 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -201,7 +201,7 @@ class Lexer ] # Heredocs are complex and require us to keep track of a bit of info to refer to later - HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true) + HeredocData = Struct.new(:identifier, :common_whitespace, :quote, keyword_init: true) private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData @@ -316,7 +316,7 @@ def to_a # the parser gem doesn't simplify strings when its value ends in a newline unless (string_value = next_token.value).end_with?("\n") next_location = token.location.join(next_next_token.location) - value = string_value.gsub("\\\\", "\\") + value = unescape_string(string_value) type = :tSTRING location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) index += 2 @@ -327,19 +327,23 @@ def to_a heredoc = HeredocData.new( identifier: value.match(/<<[-~]?["'`]?(?.*?)["'`]?\z/)[:heredoc_identifier], common_whitespace: 0, + quote: quote, ) if quote == "`" type = :tXSTRING_BEG - value = "<<`" - else - 
# The parser gem trims whitespace from squiggly heredocs. We must record - # the most common whitespace to later remove. - if heredoc_type == "~" - heredoc.common_whitespace = calculate_heredoc_whitespace(index) - end + end - value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}" + # The parser gem trims whitespace from squiggly heredocs. We must record + # the most common whitespace to later remove. + if heredoc_type == "~" || heredoc_type == "`" + heredoc.common_whitespace = calculate_heredoc_whitespace(index) + end + + if quote == "'" || quote == '"' || quote == "`" + value = "<<#{quote}" + else + value = '<<"' end heredoc_stack.push(heredoc) @@ -350,31 +354,43 @@ def to_a is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line # The parser gem only removes indentation when the heredoc is not nested not_nested = heredoc_stack.size == 1 - if is_first_token_on_line && not_nested && (heredoc = heredoc_stack[0]).common_whitespace > 0 + current_heredoc = heredoc_stack.last + if is_first_token_on_line && not_nested && current_heredoc.common_whitespace > 0 value = trim_heredoc_whitespace(value, heredoc) end + if current_heredoc + value = unescape_heredoc(value, heredoc) + end else + # When the parser gem encounters a line continuation inside of a multiline string, + # it emits a single string node. The backslash (and remaining newline) is removed. + current_line = +"" + adjustment = 0 start_offset = offset_cache[token.location.start_offset] - lines.map do |line| - newline = line.end_with?("\r\n") ? "\r\n" : "\n" + emit = false + + lines.each.with_index do |line, index| chomped_line = line.chomp - if match = chomped_line.match(/(?\\+)\z/) - adjustment = match[:backslashes].size / 2 - adjusted_line = chomped_line.delete_suffix("\\" * adjustment) - if match[:backslashes].size.odd? 
- adjusted_line.delete_suffix!("\\") - adjustment += 2 - else - adjusted_line << newline - end + + # When the line ends with an odd number of backslashes, it must be a line continuation. + if chomped_line[/\\{1,}\z/]&.length&.odd? + chomped_line.delete_suffix!("\\") + current_line << chomped_line + adjustment += 2 + # If the string ends with a line continuation emit the remainder + emit = index == lines.count - 1 else - adjusted_line = line - adjustment = 0 + current_line << line + emit = true end - end_offset = start_offset + adjusted_line.bytesize + adjustment - tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] - start_offset = end_offset + if emit + end_offset = start_offset + current_line.bytesize + adjustment + tokens << [:tSTRING_CONTENT, [unescape_string(current_line), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] + start_offset = end_offset + current_line = +"" + adjustment = 0 + end end next end @@ -524,6 +540,31 @@ def trim_heredoc_whitespace(string, heredoc) string[trimmed_characters..] end + + # Naive string escaping handling. Should be closer to the "unescape_heredoc" method + def unescape_string(string) + string.gsub("\\\\", "\\") + end + + # Escape sequences that have special meaning and should appear unescaped in the resulting string. + ESCAPES = { + "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f", + "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t", + "v" => "\v", "\\\\" => "\\" + }.freeze + private_constant :ESCAPES + + # TODO: Does not handle "\u1234" and other longer-form escapes. + def unescape_heredoc(string, heredoc) + # In single-quoted heredocs, everything is taken literally. + return string if heredoc.quote == "'" + + # When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n", + # and "\\\\n" becomes "\\n". Unknown escape sequences, like "\\o", simply become "o". 
+ string.gsub(/\\./) do |match| + ESCAPES[match[1]] || match[1] + end + end end end end diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index e1a41635aa..cbdbcd53fd 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -102,13 +102,9 @@ class ParserTest < TestCase "seattlerb/difficult6__7.txt", "seattlerb/difficult6__8.txt", "seattlerb/dsym_esc_to_sym.txt", - "seattlerb/heredoc__backslash_dos_format.txt", - "seattlerb/heredoc_backslash_nl.txt", "seattlerb/heredoc_unicode.txt", "seattlerb/heredoc_with_carriage_return_escapes_windows.txt", "seattlerb/heredoc_with_carriage_return_escapes.txt", - "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt", - "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt", "seattlerb/module_comments.txt", "seattlerb/parse_line_block_inline_comment_leading_newlines.txt", "seattlerb/parse_line_block_inline_comment.txt", @@ -128,14 +124,12 @@ class ParserTest < TestCase "seattlerb/str_newline_hash_line_number.txt", "seattlerb/TestRubyParserShared.txt", "unparser/corpus/literal/assignment.txt", - "unparser/corpus/literal/dstr.txt", "unparser/corpus/semantic/opasgn.txt", "whitequark/args.txt", "whitequark/beginless_erange_after_newline.txt", "whitequark/beginless_irange_after_newline.txt", "whitequark/bug_ascii_8bit_in_literal.txt", "whitequark/bug_def_no_paren_eql_begin.txt", - "whitequark/dedenting_heredoc.txt", "whitequark/forward_arg_with_open_args.txt", "whitequark/lbrace_arg_after_command_args.txt", "whitequark/multiple_pattern_matches.txt",