Skip to content

Commit

Permalink
fix: Support json type in athena2pandas (#2806)
Browse files: browse the repository at this point in the history
  • Loading branch information
DavidKatz-il authored May 7, 2024
1 parent c6dea4e commit 1516516
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 1 deletion.
2 changes: 1 addition & 1 deletion awswrangler/_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def athena2pandas(dtype: str, dtype_backend: str | None = None) -> str: # noqa:
return "decimal" if dtype_backend != "pyarrow" else "double[pyarrow]"
if dtype in ("binary", "varbinary"):
return "bytes" if dtype_backend != "pyarrow" else "binary[pyarrow]"
if any(dtype.startswith(t) for t in ["array", "row", "map", "struct"]):
if any(dtype.startswith(t) for t in ["array", "row", "map", "struct", "json"]):
return "object"
if dtype == "geometry":
return "string"
Expand Down
41 changes: 41 additions & 0 deletions tests/unit/test_athena.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,47 @@ def test_athena_read_list(glue_database):
assert df["col0"].iloc[0] == "[1, 2, 3]"


def test_athena_read_json(glue_database):
    """Query values CAST to JSON and check the text returned for each column.

    Runs a single-row query through ``wr.athena.read_sql_query`` with
    ``ctas_approach=False`` against the database supplied by the
    ``glue_database`` argument, then asserts that every cell equals the
    expected JSON text.
    """
    query = """
WITH dataset AS (
SELECT
CAST('HELLO ATHENA' AS JSON) AS some_str,
CAST(12345 AS JSON) AS some_int,
CAST(MAP(ARRAY['a', 'b'], ARRAY[1,2]) AS JSON) AS some_map
)
SELECT * FROM dataset
"""
    # Column name -> text expected in the first (and only) row.
    expected_cells = {
        "some_str": '"HELLO ATHENA"',
        "some_int": "12345",
        "some_map": '{"a":1,"b":2}',
    }

    result = wr.athena.read_sql_query(sql=query, database=glue_database, ctas_approach=False)

    # Exactly one row, and one column per projected expression.
    assert len(result) == 1
    assert len(result.index) == 1
    assert len(result.columns) == len(expected_cells)

    for column, text in expected_cells.items():
        assert result[column].iloc[0] == text


def test_athena_read_json_extract(glue_database):
    """Query ``json_extract`` results and check the text returned for each column.

    Builds a one-row dataset holding a JSON document as a string, extracts
    ``$.name`` and ``$.projects`` from it, reads the result through
    ``wr.athena.read_sql_query`` with ``ctas_approach=False``, and asserts
    that each cell equals the expected JSON text.
    """
    query = """
WITH dataset AS (
SELECT '{"name": "Susan Smith",
"org": "engineering",
"projects": [{"name":"project1", "completed":false},
{"name":"project2", "completed":true}]}'
AS myblob
)
SELECT
json_extract(myblob, '$.name') AS name,
json_extract(myblob, '$.projects') AS projects
FROM dataset
"""
    # Column name -> text expected in the first (and only) row.
    expected_cells = {
        "name": '"Susan Smith"',
        "projects": '[{"name":"project1","completed":false},{"name":"project2","completed":true}]',
    }

    result = wr.athena.read_sql_query(sql=query, database=glue_database, ctas_approach=False)

    # Exactly one row, and one column per extracted path.
    assert len(result) == 1
    assert len(result.index) == 1
    assert len(result.columns) == len(expected_cells)

    for column, text in expected_cells.items():
        assert result[column].iloc[0] == text


def test_sanitize_dataframe_column_names():
with pytest.warns(UserWarning, match=r"Duplicate*"):
test_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
Expand Down

0 comments on commit 1516516

Please sign in to comment.