Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-787480: fix json load encoding error #1528

Merged
merged 14 commits into from
May 2, 2023
1 change: 1 addition & 0 deletions DESCRIPTION.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne

- v3.0.4(TBD)
- Fixed a bug in which `cursor.execute()` could modify the argument statement_params dictionary object when executing a multistatement query.
- Added the json_result_force_utf8_decoding connection parameter to force decoding JSON content in utf-8 when the result format is JSON.

- v3.0.3(April 20, 2023)
- Fixed a bug that prints error in logs for GET command on GCS.
Expand Down
7 changes: 7 additions & 0 deletions src/snowflake/connector/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ def DefaultConverterClass() -> type:
True,
bool,
), # Whether to log imported packages in telemetry
"json_result_force_utf8_decoding": (
False,
sfc-gh-stan marked this conversation as resolved.
Show resolved Hide resolved
bool,
), # Whether to force the JSON content to be decoded in utf-8, it is only effective when result format is JSON
}

APPLICATION_RE = re.compile(r"[\w\d_]+")
Expand Down Expand Up @@ -265,6 +269,9 @@ class SnowflakeConnection:
enable_connection_diag: when true, clients will generate a connectivity diagnostic report.
connection_diag_log_path: path to location to create diag report with enable_connection_diag.
connection_diag_whitelist_path: path to a whitelist.json file to test with enable_connection_diag.
json_result_force_utf8_decoding: When true, the JSON result will be decoded in utf-8;
when false, the encoding of the content is auto-detected. Default value is false.
This parameter is only effective when the result format is JSON.
"""

OCSP_ENV_LOCK = Lock()
Expand Down
19 changes: 17 additions & 2 deletions src/snowflake/connector/result_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def remote_chunk_info(c: dict[str, Any]) -> RemoteChunkInfo:
schema,
column_converters,
cursor._use_dict_result,
json_result_force_utf8_decoding=cursor._connection._json_result_force_utf8_decoding,
)
for c in chunks
]
Expand Down Expand Up @@ -384,6 +385,8 @@ def __init__(
schema: Sequence[ResultMetadata],
column_converters: Sequence[tuple[str, SnowflakeConverterType]],
use_dict_result: bool,
*,
json_result_force_utf8_decoding: bool = False,
) -> None:
super().__init__(
rowcount,
Expand All @@ -392,6 +395,7 @@ def __init__(
schema,
use_dict_result,
)
self._json_result_force_utf8_decoding = json_result_force_utf8_decoding
self.column_converters = column_converters

@classmethod
Expand Down Expand Up @@ -420,10 +424,21 @@ def _load(self, response: Response) -> list:

Returns:
Whatever ``json.loads`` return, but in a list.
Unfortunately there's not type hint for this.
Unfortunately there's no type hint for this.
For context: https://github.com/python/typing/issues/182
"""
read_data = response.text
# if users specify how to decode the data, we decode the bytes using the specified encoding
if self._json_result_force_utf8_decoding:
try:
read_data = str(response.content, "utf-8", errors="strict")
except Exception as exc:
err_msg = f"failed to decode json result content due to error {exc!r}"
logger.error(err_msg)
raise Error(msg=err_msg)
else:
# note: SNOW-787480 response.apparent_encoding is unreliable: chardet.detect, which is used by
# response.text to decode content, can be wrong; see issue: https://github.com/chardet/chardet/issues/148
read_data = response.text
return json.loads("".join(["[", read_data, "]"]))

def _parse(
Expand Down
27 changes: 27 additions & 0 deletions test/integ/test_cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1618,3 +1618,30 @@ def test_multi_statement_failure(conn_cnx):
CLIENT_VERSION,
(type(None), str),
)


@pytest.mark.skipolddriver
def test_decoding_utf8_for_json_result(conn_cnx):
# SNOW-787480: if utf-8 decoding is not explicitly set, the encoding of the data
# will be detected as windows-1250 by chardet.detect
with conn_cnx() as con, con.cursor() as cur:
cur.execute("alter session set python_connector_query_result_format='JSON'")
sfc-gh-aling marked this conversation as resolved.
Show resolved Hide resolved
ret = cur.execute(
"""select '"",' || '"",' || '"",' || '"",' || '"",' || 'Ofigràfic' || '"",' from TABLE(GENERATOR(ROWCOUNT => 5000)) v;"""
).fetchall()
assert len(ret) == 5000
# This test case is tricky: in most cases the decoding is incorrect and could be different
# on different platforms; however, due to randomness, in rare cases the decoding is indeed utf-8,
# i.e. the backend behavior is flaky
assert ret[0] in (
('"","","","","",OfigrĂ\xa0fic"",',),
('"","","","","",Ofigràfic"",',),
)

with conn_cnx(json_result_force_utf8_decoding=True) as con, con.cursor() as cur:
cur.execute("alter session set python_connector_query_result_format='JSON'")
sfc-gh-aling marked this conversation as resolved.
Show resolved Hide resolved
ret = cur.execute(
"""select '"",' || '"",' || '"",' || '"",' || '"",' || 'Ofigràfic' || '"",' from TABLE(GENERATOR(ROWCOUNT => 5000)) v;"""
sfc-gh-aling marked this conversation as resolved.
Show resolved Hide resolved
).fetchall()
assert len(ret) == 5000
assert ret[0] == ('"","","","","",Ofigràfic"",',)