From a135a6d2c6d503b186695f01efa7eed65611b04e Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado
Date: Mon, 11 Dec 2023 11:44:22 +0000
Subject: [PATCH] gh-112943: Correctly compute end offsets for multiline
 tokens in the tokenize module (#112949)

---
 Lib/test/test_tokenize.py                        | 10 ++++++++++
 ...023-12-11-00-50-00.gh-issue-112943.RHNZie.rst |  2 ++
 Parser/pegen.c                                   | 16 +++++++++++-----
 Parser/pegen.h                                   |  1 +
 Python/Python-tokenize.c                         |  2 +-
 5 files changed, 25 insertions(+), 6 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 290f4608c5e739..21e8637a7ca905 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -615,6 +615,16 @@ def test_string(self):
     OP         '}'           (3, 0) (3, 1)
     FSTRING_MIDDLE '__'       (3, 1) (3, 3)
     FSTRING_END "'"           (3, 3) (3, 4)
+    """)
+
+        self.check_tokenize("""\
+    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
+    aktualni pracownicy, obecni pracownicy'''
+""", """\
+    INDENT     '    '        (1, 0) (1, 4)
+    STRING     "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n    aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
+    NEWLINE    '\\n'          (2, 45) (2, 46)
+    DEDENT     ''            (3, 0) (3, 0)
     """)
 
     def test_function(self):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst
new file mode 100644
index 00000000000000..4bc2fe7c26d904
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst
@@ -0,0 +1,2 @@
+Correctly compute end column offsets for multiline tokens in the
+:mod:`tokenize` module. Patch by Pablo Galindo
diff --git a/Parser/pegen.c b/Parser/pegen.c
index 0c60394e4f199b..7766253a76066f 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -19,12 +19,8 @@ _PyPegen_interactive_exit(Parser *p)
 }
 
 Py_ssize_t
-_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
 {
-    const char *str = PyUnicode_AsUTF8(line);
-    if (!str) {
-        return -1;
-    }
     Py_ssize_t len = strlen(str);
     if (col_offset > len + 1) {
         col_offset = len + 1;
@@ -39,6 +35,16 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
     return size;
 }
 
+Py_ssize_t
+_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+{
+    const char *str = PyUnicode_AsUTF8(line);
+    if (!str) {
+        return -1;
+    }
+    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
+}
+
 // Here, mark is the start of the node, while p->mark is the end.
 // If node==NULL, they should be the same.
 int
diff --git a/Parser/pegen.h b/Parser/pegen.h
index 424f80acd7be3b..57b45a54c36c57 100644
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -149,6 +149,7 @@ expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
 
 // Error handling functions and APIs
 typedef enum {
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 364fe55d0a05e4..a7891709b3b44a 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -225,7 +225,7 @@ tokenizeriter_next(tokenizeriterobject *it)
         col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
     }
     if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
+        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
    }
 
     if (it->tok->tok_extra_tokens) {
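
Note (illustration only, not part of the patch): a minimal sketch of the
user-visible behaviour this change targets, reusing the input from the new
test case. The offset `token.end - it->tok->line_start` is a byte offset
relative to the tokenizer's current line buffer; the fix converts it to a
character offset against that same buffer instead of the previously built
`line` object, so multiline tokens containing non-ASCII text report correct
end columns.

    import io
    import tokenize

    # Same multiline, non-ASCII input as the new test in test_tokenize.py.
    source = (
        "    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\n"
        "    aktualni pracownicy, obecni pracownicy'''\n"
    )
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == tokenize.STRING:
            # Per the new test, the expected output with this fix is:
            # (1, 4) (2, 45)
            print(tok.start, tok.end)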