Skip to content

Commit

Permalink
Better handle heredoc escaping in the parser translator
Browse files Browse the repository at this point in the history
This does a few things:
* After a line continuation in a heredoc, emit only a single token
* Dedent backtick (`) heredocs
* Properly handle escapes in single- and double-quoted heredocs

Some of the heredoc escaping logic should be applied to strings as well,
but I'll leave that for a different PR.
  • Loading branch information
Earlopain committed Dec 30, 2024
1 parent 7bfd527 commit a5dc98b
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 33 deletions.
95 changes: 68 additions & 27 deletions lib/prism/translation/parser/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ class Lexer
]

# Heredocs are complex and require us to keep track of a bit of info to refer to later
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
HeredocData = Struct.new(:identifier, :common_whitespace, :quote, keyword_init: true)

private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData

Expand Down Expand Up @@ -316,7 +316,7 @@ def to_a
# the parser gem doesn't simplify strings when its value ends in a newline
unless (string_value = next_token.value).end_with?("\n")
next_location = token.location.join(next_next_token.location)
value = string_value.gsub("\\\\", "\\")
value = unescape_string(string_value)
type = :tSTRING
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 2
Expand All @@ -327,19 +327,23 @@ def to_a
heredoc = HeredocData.new(
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
common_whitespace: 0,
quote: quote,
)

if quote == "`"
type = :tXSTRING_BEG
value = "<<`"
else
# The parser gem trims whitespace from squiggly heredocs. We must record
# the most common whitespace to later remove.
if heredoc_type == "~"
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
end
end

value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
# The parser gem trims whitespace from squiggly heredocs. We must record
# the most common whitespace to later remove.
if heredoc_type == "~" || heredoc_type == "`"
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
end

if quote == "'" || quote == '"' || quote == "`"
value = "<<#{quote}"
else
value = '<<"'
end

heredoc_stack.push(heredoc)
Expand All @@ -350,31 +354,43 @@ def to_a
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
# The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1
if is_first_token_on_line && not_nested && (heredoc = heredoc_stack[0]).common_whitespace > 0
current_heredoc = heredoc_stack.last
if is_first_token_on_line && not_nested && current_heredoc.common_whitespace > 0
value = trim_heredoc_whitespace(value, heredoc)
end
if current_heredoc
value = unescape_heredoc(value, heredoc)
end
else
# When the parser gem encounters a line continuation inside of a multiline string,
# it emits a single string node. The backslash (and remaining newline) is removed.
current_line = +""
adjustment = 0
start_offset = offset_cache[token.location.start_offset]
lines.map do |line|
newline = line.end_with?("\r\n") ? "\r\n" : "\n"
emit = false

lines.each.with_index do |line, index|
chomped_line = line.chomp
if match = chomped_line.match(/(?<backslashes>\\+)\z/)
adjustment = match[:backslashes].size / 2
adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
if match[:backslashes].size.odd?
adjusted_line.delete_suffix!("\\")
adjustment += 2
else
adjusted_line << newline
end

# When the line ends with an odd number of backslashes, it must be a line continuation.
if chomped_line[/\\{1,}\z/]&.length&.odd?
chomped_line.delete_suffix!("\\")
current_line << chomped_line
adjustment += 2
# If the string ends with a line continuation emit the remainder
emit = index == lines.count - 1
else
adjusted_line = line
adjustment = 0
current_line << line
emit = true
end

end_offset = start_offset + adjusted_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
start_offset = end_offset
if emit
end_offset = start_offset + current_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [unescape_string(current_line), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
start_offset = end_offset
current_line = +""
adjustment = 0
end
end
next
end
Expand Down Expand Up @@ -524,6 +540,31 @@ def trim_heredoc_whitespace(string, heredoc)

string[trimmed_characters..]
end

# Naive string escaping handling. Should be closer to the "unescape_heredoc" method
def unescape_string(string)
string.gsub("\\\\", "\\")
end

# Escape sequences that have a special meaning and should appear unescaped in the resulting string.
# Each key is the single character that follows the backslash, since that is what
# "unescape_heredoc" looks up (the second character of every `\x` match).
ESCAPES = {
  "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
  "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
  "v" => "\v", "\\" => "\\"
}.freeze
private_constant :ESCAPES

# Resolves backslash escapes in a piece of heredoc content.
#
# `string` is the raw content and `heredoc` is the record describing the
# heredoc it belongs to; only its `quote` is consulted here.
#
# TODO: Does not handle "\u1234" and other longer-form escapes.
def unescape_heredoc(string, heredoc)
  if heredoc.quote == "'"
    # Single-quoted heredocs are raw: the content is handed back untouched.
    string
  else
    # Otherwise a backslash followed by one character is replaced. Known
    # escapes map to their real character ("\\n" becomes "\n"), an escaped
    # backslash yields one backslash ("\\\\n" becomes "\\n"), and unknown
    # escape sequences such as "\\o" are reduced to the bare character "o".
    # A backslash directly before a newline is left alone because `.` does
    # not match newlines.
    string.gsub(/\\./) do |sequence|
      escaped = sequence[1]
      ESCAPES.fetch(escaped, escaped)
    end
  end
end
end
end
end
Expand Down
6 changes: 0 additions & 6 deletions test/prism/ruby/parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,9 @@ class ParserTest < TestCase
"seattlerb/difficult6__7.txt",
"seattlerb/difficult6__8.txt",
"seattlerb/dsym_esc_to_sym.txt",
"seattlerb/heredoc__backslash_dos_format.txt",
"seattlerb/heredoc_backslash_nl.txt",
"seattlerb/heredoc_unicode.txt",
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
"seattlerb/heredoc_with_carriage_return_escapes.txt",
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt",
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt",
"seattlerb/module_comments.txt",
"seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
"seattlerb/parse_line_block_inline_comment.txt",
Expand All @@ -128,14 +124,12 @@ class ParserTest < TestCase
"seattlerb/str_newline_hash_line_number.txt",
"seattlerb/TestRubyParserShared.txt",
"unparser/corpus/literal/assignment.txt",
"unparser/corpus/literal/dstr.txt",
"unparser/corpus/semantic/opasgn.txt",
"whitequark/args.txt",
"whitequark/beginless_erange_after_newline.txt",
"whitequark/beginless_irange_after_newline.txt",
"whitequark/bug_ascii_8bit_in_literal.txt",
"whitequark/bug_def_no_paren_eql_begin.txt",
"whitequark/dedenting_heredoc.txt",
"whitequark/forward_arg_with_open_args.txt",
"whitequark/lbrace_arg_after_command_args.txt",
"whitequark/multiple_pattern_matches.txt",
Expand Down

0 comments on commit a5dc98b

Please sign in to comment.