Skip to content

Commit

Permalink
fix: edit pyarrow stringify to better handle emojis and accents (#22881)
Browse files Browse the repository at this point in the history
  • Loading branch information
eschutho authored Jan 30, 2023
1 parent c839d0d commit f2b61fc
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 12 deletions.
7 changes: 6 additions & 1 deletion superset/result_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]:
# pandas <NA> type cannot be converted to string
obj[na_obj] = None # type: ignore
else:
obj[...] = stringify(obj) # type: ignore
try:
# for simple string conversions
# this handles odd character types better
obj[...] = obj.astype(str) # type: ignore
except ValueError:
obj[...] = stringify(obj) # type: ignore

return result

Expand Down
4 changes: 2 additions & 2 deletions superset/utils/pandas_postprocessing/boxplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ def boxplot(
"""

def quartile1(series: Series) -> float:
    """
    Return the first quartile (25th percentile) of ``series``, ignoring NaNs.

    Uses the ``midpoint`` estimation method: when the percentile falls
    between two data points, the result is the mean of those two points.

    :param series: numeric values to summarise
    :returns: the 25th percentile of the non-NaN values
    """
    # `method=` supersedes the `interpolation=` keyword, which NumPy
    # deprecated; only one return is kept so no statement is unreachable.
    return np.nanpercentile(series, 25, method="midpoint")

def quartile3(series: Series) -> float:
    """
    Return the third quartile (75th percentile) of ``series``, ignoring NaNs.

    Uses the ``midpoint`` estimation method: when the percentile falls
    between two data points, the result is the mean of those two points.

    :param series: numeric values to summarise
    :returns: the 75th percentile of the non-NaN values
    """
    # `method=` supersedes the `interpolation=` keyword, which NumPy
    # deprecated; only one return is kept so no statement is unreachable.
    return np.nanpercentile(series, 75, method="midpoint")

if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:

Expand Down
4 changes: 2 additions & 2 deletions tests/integration_tests/result_set_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,13 +169,13 @@ def test_nested_types(self):
"id": 4,
"dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
"num_arr": "[1, 2, 3]",
"map_col": '{"chart_name": "scatter"}',
"map_col": "{'chart_name': 'scatter'}",
},
{
"id": 3,
"dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
"num_arr": "[4, 5, 6]",
"map_col": '{"chart_name": "plot"}',
"map_col": "{'chart_name': 'plot'}",
},
],
)
Expand Down
82 changes: 81 additions & 1 deletion tests/unit_tests/dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,87 @@ def test_df_to_records_NaT_type() -> None:

assert df_to_records(df) == [
{"date": None},
{"date": '"2023-01-06T20:50:31.749000+00:00"'},
{"date": "2023-01-06 20:50:31.749000+00:00"},
]


def test_df_to_records_mixed_emoji_type() -> None:
    """
    A varchar column mixing plain text, emoji text and ``NaT`` must come out
    of ``df_to_records`` with the strings unchanged and ``NaT`` as ``None``.
    """
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    rows = [
        ("What's up?", "This is a string text", 1),
        ("What's up?", "This is a string with an 😍 added", 2),
        ("What's up?", NaT, 3),
        ("What's up?", "Last emoji 😁", 4),
    ]
    description: DbapiDescription = [
        ("question", "varchar", None, None, None, None, False),
        ("response", "varchar", None, None, None, None, False),
        ("count", "integer", None, None, None, None, False),
    ]

    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # The NaT in the third row is the only value expected to change (-> None).
    expected_records = [
        {"question": "What's up?", "response": "This is a string text", "count": 1},
        {
            "question": "What's up?",
            "response": "This is a string with an 😍 added",
            "count": 2,
        },
        {"question": "What's up?", "response": None, "count": 3},
        {"question": "What's up?", "response": "Last emoji 😁", "count": 4},
    ]
    assert df_to_records(frame) == expected_records


def test_df_to_records_mixed_accent_type() -> None:
    """
    A varchar column mixing plain text, accented text and ``NaT`` must come
    out of ``df_to_records`` with the strings unchanged and ``NaT`` as ``None``.
    """
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    rows = [
        ("What's up?", "This is a string text", 1),
        ("What's up?", "This is a string with áccent", 2),
        ("What's up?", NaT, 3),
        ("What's up?", "móre áccent", 4),
    ]
    description: DbapiDescription = [
        ("question", "varchar", None, None, None, None, False),
        ("response", "varchar", None, None, None, None, False),
        ("count", "integer", None, None, None, None, False),
    ]

    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # The NaT in the third row is the only value expected to change (-> None).
    expected_records = [
        {"question": "What's up?", "response": "This is a string text", "count": 1},
        {
            "question": "What's up?",
            "response": "This is a string with áccent",
            "count": 2,
        },
        {"question": "What's up?", "response": None, "count": 3},
        {"question": "What's up?", "response": "móre áccent", "count": 4},
    ]
    assert df_to_records(frame) == expected_records


Expand Down
12 changes: 6 additions & 6 deletions tests/unit_tests/result_set_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def test_stringify_with_null_integers():

expected = np.array(
[
array(['"foo"', '"foo"', '"foo"'], dtype=object),
array(['"bar"', '"bar"', '"bar"'], dtype=object),
array(["foo", "foo", "foo"], dtype=object),
array(["bar", "bar", "bar"], dtype=object),
array([None, None, None], dtype=object),
array([None, "true", None], dtype=object),
array([None, "True", None], dtype=object),
]
)

Expand Down Expand Up @@ -132,10 +132,10 @@ def test_stringify_with_null_timestamps():

expected = np.array(
[
array(['"foo"', '"foo"', '"foo"'], dtype=object),
array(['"bar"', '"bar"', '"bar"'], dtype=object),
array(["foo", "foo", "foo"], dtype=object),
array(["bar", "bar", "bar"], dtype=object),
array([None, None, None], dtype=object),
array([None, "true", None], dtype=object),
array([None, "True", None], dtype=object),
]
)

Expand Down

0 comments on commit f2b61fc

Please sign in to comment.