From a135a6d2c6d503b186695f01efa7eed65611b04e Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado
Date: Mon, 11 Dec 2023 11:44:22 +0000
Subject: [PATCH] gh-112943: Correctly compute end offsets for multiline
 tokens in the tokenize module (#112949)

---
 Lib/test/test_tokenize.py                        | 10 ++++++++++
 ...023-12-11-00-50-00.gh-issue-112943.RHNZie.rst |  2 ++
 Parser/pegen.c                                   | 16 +++++++++++-----
 Parser/pegen.h                                   |  1 +
 Python/Python-tokenize.c                         |  2 +-
 5 files changed, 25 insertions(+), 6 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 290f4608c5e739..21e8637a7ca905 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -615,6 +615,16 @@ def test_string(self):
     OP         '}'           (3, 0) (3, 1)
     FSTRING_MIDDLE '__'       (3, 1) (3, 3)
     FSTRING_END "'"           (3, 3) (3, 4)
+    """)
+
+        self.check_tokenize("""\
+    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
+    aktualni pracownicy, obecni pracownicy'''
+""", """\
+    INDENT     '    '        (1, 0) (1, 4)
+    STRING     "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n    aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
+    NEWLINE    '\\n'          (2, 45) (2, 46)
+    DEDENT     ''            (3, 0) (3, 0)
     """)
 
     def test_function(self):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst
new file mode 100644
index 00000000000000..4bc2fe7c26d904
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst
@@ -0,0 +1,2 @@
+Correctly compute end column offsets for multiline tokens in the
+:mod:`tokenize` module. Patch by Pablo Galindo
diff --git a/Parser/pegen.c b/Parser/pegen.c
index 0c60394e4f199b..7766253a76066f 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -19,12 +19,8 @@ _PyPegen_interactive_exit(Parser *p)
 }
 
 Py_ssize_t
-_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
 {
-    const char *str = PyUnicode_AsUTF8(line);
-    if (!str) {
-        return -1;
-    }
     Py_ssize_t len = strlen(str);
     if (col_offset > len + 1) {
         col_offset = len + 1;
@@ -39,6 +35,16 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
     return size;
 }
 
+Py_ssize_t
+_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+{
+    const char *str = PyUnicode_AsUTF8(line);
+    if (!str) {
+        return -1;
+    }
+    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
+}
+
 // Here, mark is the start of the node, while p->mark is the end.
 // If node==NULL, they should be the same.
 int
diff --git a/Parser/pegen.h b/Parser/pegen.h
index 424f80acd7be3b..57b45a54c36c57 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -149,6 +149,7 @@ expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
 
 // Error handling functions and APIs
 typedef enum {
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 364fe55d0a05e4..a7891709b3b44a 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -225,7 +225,7 @@ tokenizeriter_next(tokenizeriterobject *it)
         col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
     }
     if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
+        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
    }
 
     if (it->tok->tok_extra_tokens) {
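
Note (illustration only, not part of the patch): a minimal sketch of the
user-visible behaviour this change targets, reusing the input from the new
test case. The offset `token.end - it->tok->line_start` is a byte offset
relative to the tokenizer's current line buffer; the fix converts it to a
character offset against that same buffer instead of the previously built
`line` object, so multiline tokens containing non-ASCII text report correct
end columns.

    import io
    import tokenize

    # Same multiline, non-ASCII input as the new test in test_tokenize.py.
    source = (
        "    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\n"
        "    aktualni pracownicy, obecni pracownicy'''\n"
    )
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == tokenize.STRING:
            # Per the new test, the expected output with this fix is:
            # (1, 4) (2, 45)
            print(tok.start, tok.end)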