Skip to content

Commit

Permalink
Allow extracting deeply nested calls in Python
Browse files Browse the repository at this point in the history
Currently, the Python extractor does not support deeply nested gettext
calls (i.e. calls nested deeper than a direct argument of the top-level gettext call).

e.g.
```py
_("Hello %s", _("Person"))
_("Hello %s",
  random_function(", ".join([_("Person 1"), _("Person 2")])))
```

The extraction code was refactored quite a bit to simplify the flow and
support this use-case.

Fixes python-babel#1125
(This also fixes python-babel#1123.)
  • Loading branch information
dylankiss committed Oct 17, 2024
1 parent f91754b commit 71b33d0
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 94 deletions.
198 changes: 117 additions & 81 deletions babel/messages/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,14 +502,6 @@ def extract_python(
:param options: a dictionary of additional options (optional)
:rtype: ``iterator``
"""
funcname = lineno = message_lineno = None
call_stack = -1
buf = []
messages = []
translator_comments = []
in_def = in_translator_comments = False
comment_tag = None

encoding = parse_encoding(fileobj) or options.get('encoding', 'UTF-8')
future_flags = parse_future_flags(fileobj, encoding)
next_line = lambda: fileobj.readline().decode(encoding)
Expand All @@ -520,103 +512,147 @@ def extract_python(
# currently parsing one.
current_fstring_start = None

for tok, value, (lineno, _), _, _ in tokens:
if call_stack == -1 and tok == NAME and value in ('def', 'class'):
# Keep the stack of all function calls and their related contextual variables,
# so we can handle nested gettext calls.
function_stack = []
# Keep the last encountered function/variable name for when we encounter
# an opening parenthesis
last_name = None
# Keep track of whether we're in a class or function definition
in_def = False
# Keep track of whether we're in a block of translator comments
in_translator_comments = False
# Keep track of the last encountered translator comments
translator_comments = []
# Keep track of the (split) strings encountered
message_buffer = []

for token, value, (line_no, _), _, _ in tokens:
if token == NAME and value in ('def', 'class'):
# We're entering a class or function definition
in_def = True
elif tok == OP and value == '(':
if in_def:
# Avoid false positives for declarations such as:
# def gettext(arg='message'):
in_def = False
continue
if funcname:
message_lineno = lineno
call_stack += 1
elif in_def and tok == OP and value == ':':
# End of a class definition without parens
continue

elif in_def and token == OP and value in ('(', ':'):
# We're in a class or function definition and should not do anything
in_def = False
continue
elif call_stack == -1 and tok == COMMENT:

elif token == OP and value == '(' and last_name:
# We're entering a function call
cur_translator_comments = translator_comments
if function_stack and function_stack[-1]['function_line_no'] == line_no:
# If our current function call is on the same line as the previous one,
# copy its translator comments, since they also apply to us.
cur_translator_comments = function_stack[-1]['translator_comments']

# We add all information needed later for the current function call
function_stack.append({
'function_line_no': line_no,
'function_name': last_name,
'message_line_no': None,
'messages': [],
'translator_comments': cur_translator_comments,
})
translator_comments = []
message_buffer.clear()

elif token == COMMENT:
# Strip the comment token from the line
value = value[1:].strip()
if in_translator_comments and \
translator_comments[-1][0] == lineno - 1:
if in_translator_comments and translator_comments[-1][0] == line_no - 1:
# We're already inside a translator comment, continue appending
translator_comments.append((lineno, value))
translator_comments.append((line_no, value))
continue
# If execution reaches this point, let's see if comment line
# starts with one of the comment tags

for comment_tag in comment_tags:
if value.startswith(comment_tag):
# Comment starts with one of the comment tags,
# so let's start capturing it
in_translator_comments = True
translator_comments.append((lineno, value))
translator_comments.append((line_no, value))
break
elif funcname and call_stack == 0:
nested = (tok == NAME and value in keywords)
if (tok == OP and value == ')') or nested:
if buf:
messages.append(''.join(buf))
del buf[:]

elif function_stack and function_stack[-1]['function_name'] in keywords:
# We're inside a translation function call
if token == OP and value == ')':
# The call has ended, so we yield the translatable term(s)
messages = function_stack[-1]['messages']
line_no = (
function_stack[-1]['message_line_no']
or function_stack[-1]['function_line_no']
)
cur_translator_comments = function_stack[-1]['translator_comments']

if message_buffer:
messages.append(''.join(message_buffer))
message_buffer.clear()
else:
messages.append(None)

messages = tuple(messages) if len(messages) > 1 else messages[0]
# Comments don't apply unless they immediately
# precede the message
if translator_comments and \
translator_comments[-1][0] < message_lineno - 1:
translator_comments = []

yield (message_lineno, funcname, messages,
[comment[1] for comment in translator_comments])

funcname = lineno = message_lineno = None
call_stack = -1
messages = []
translator_comments = []
in_translator_comments = False
if nested:
funcname = value
elif tok == STRING:
val = _parse_python_string(value, encoding, future_flags)
if val is not None:
buf.append(val)
if (
cur_translator_comments
and cur_translator_comments[-1][0] < line_no - 1
):
# The translator comments are not immediately preceding the current
# term, so we skip them.
cur_translator_comments = []

yield (
line_no,
function_stack[-1]['function_name'],
messages,
[comment[1] for comment in cur_translator_comments],
)

function_stack.pop()

elif token == STRING:
# We've encountered a string inside a translation function call
string_value = _parse_python_string(value, encoding, future_flags)
if not function_stack[-1]['message_line_no']:
function_stack[-1]['message_line_no'] = line_no
if string_value is not None:
message_buffer.append(string_value)

# Python 3.12+, see https://peps.python.org/pep-0701/#new-tokens
elif tok == FSTRING_START:
elif token == FSTRING_START:
current_fstring_start = value
elif tok == FSTRING_MIDDLE:
elif token == FSTRING_MIDDLE:
if current_fstring_start is not None:
current_fstring_start += value
elif tok == FSTRING_END:
elif token == FSTRING_END:
if current_fstring_start is not None:
fstring = current_fstring_start + value
val = _parse_python_string(fstring, encoding, future_flags)
if val is not None:
buf.append(val)

elif tok == OP and value == ',':
if buf:
messages.append(''.join(buf))
del buf[:]
string_value = _parse_python_string(fstring, encoding, future_flags)
if string_value is not None:
message_buffer.append(string_value)

elif token == OP and value == ',':
# End of a function call argument
if message_buffer:
function_stack[-1]['messages'].append(''.join(message_buffer))
message_buffer.clear()
else:
messages.append(None)
if translator_comments:
# We have translator comments, and since we're on a
# comma(,) user is allowed to break into a new line
# Let's increase the last comment's lineno in order
# for the comment to still be a valid one
old_lineno, old_comment = translator_comments.pop()
translator_comments.append((old_lineno + 1, old_comment))
elif call_stack > 0 and tok == OP and value == ')':
call_stack -= 1
elif funcname and call_stack == -1:
funcname = None
elif tok == NAME and value in keywords:
funcname = value
function_stack[-1]['messages'].append(None)

elif function_stack and token == OP and value == ')':
function_stack.pop()

if in_translator_comments and translator_comments[-1][0] < line_no:
# We have a newline in between the comments, so they don't belong
# together anymore
in_translator_comments = False

if token == NAME:
last_name = value
if function_stack and not function_stack[-1]['message_line_no']:
function_stack[-1]['message_line_no'] = line_no

if (current_fstring_start is not None
and tok not in {FSTRING_START, FSTRING_MIDDLE}
if (
current_fstring_start is not None
and token not in {FSTRING_START, FSTRING_MIDDLE}
):
# In Python 3.12, tokens other than FSTRING_* mean the
# f-string is dynamic, so we don't want to extract it.
Expand Down
46 changes: 33 additions & 13 deletions tests/messages/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ def test_comments_with_calls_that_spawn_multiple_lines(self):
messages = list(extract.extract_python(buf, ('ngettext', '_'), ['NOTE:'],

{'strip_comment_tags': False}))
assert messages[0] == (3, 'ngettext', ('Catalog deleted.', 'Catalogs deleted.', None), ['NOTE: This Comment SHOULD Be Extracted'])
assert messages[0] == (2, 'ngettext', ('Catalog deleted.', 'Catalogs deleted.', None), ['NOTE: This Comment SHOULD Be Extracted'])
assert messages[1] == (6, '_', 'Locale deleted.', ['NOTE: This Comment SHOULD Be Extracted'])
assert messages[2] == (10, 'ngettext', ('Foo deleted.', 'Foos deleted.', None), ['NOTE: This Comment SHOULD Be Extracted'])
assert messages[3] == (15, 'ngettext', ('Bar deleted.', 'Bars deleted.', None), ['NOTE: This Comment SHOULD Be Extracted', 'NOTE: And This One Too'])
assert messages[3] == (14, 'ngettext', ('Bar deleted.', 'Bars deleted.', None), ['NOTE: This Comment SHOULD Be Extracted', 'NOTE: And This One Too'])

def test_declarations(self):
buf = BytesIO(b"""\
Expand Down Expand Up @@ -422,24 +422,44 @@ def test_nested_messages(self):
# NOTE: Third
_(u'Hello, {0} and {1}!', _(u'Heungsub'),
_(u'Armin'))
# NOTE: Fourth
_("Hello %(person)", person=random_function(_("Person")))
# NOTE: Fifth
_("Hello %(people)",
person=random_function(
", ".join([_("Person 1"), _("Person 2")])
)
)
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
assert messages[0][2] == ('Hello, {name}!', None)
assert messages[0][2] == 'Foo Bar'
assert messages[0][3] == ['NOTE: First']
assert messages[1][2] == 'Foo Bar'
assert messages[1][3] == []
assert messages[2][2] == ('Hello, {name1} and {name2}!', None)
assert messages[1][2] == ('Hello, {name}!', None)
assert messages[1][3] == ['NOTE: First']
assert messages[2][2] == 'Heungsub'
assert messages[2][3] == ['NOTE: Second']
assert messages[3][2] == 'Heungsub'
assert messages[3][2] == 'Armin'
assert messages[3][3] == []
assert messages[4][2] == 'Armin'
assert messages[4][3] == []
assert messages[5][2] == ('Hello, {0} and {1}!', None)
assert messages[4][2] == ('Hello, {name1} and {name2}!', None, None)
assert messages[4][3] == ['NOTE: Second']
assert messages[5][2] == 'Heungsub'
assert messages[5][3] == ['NOTE: Third']
assert messages[6][2] == 'Heungsub'
assert messages[6][2] == 'Armin'
assert messages[6][3] == []
assert messages[7][2] == 'Armin'
assert messages[7][3] == []
assert messages[7][2] == ('Hello, {0} and {1}!', None, None)
assert messages[7][3] == ['NOTE: Third']
assert messages[8][2] == 'Person'
assert messages[8][3] == ['NOTE: Fourth']
assert messages[9][2] == ('Hello %(person)', None)
assert messages[9][3] == ['NOTE: Fourth']
assert messages[10][2] == 'Person 1'
assert messages[10][3] == []
assert messages[11][2] == 'Person 2'
assert messages[11][3] == []
assert messages[12][2] == ('Hello %(people)', None)
assert messages[12][3] == ['NOTE: Fifth']


class ExtractTestCase(unittest.TestCase):
Expand Down

0 comments on commit 71b33d0

Please sign in to comment.