Skip to content

Commit

Permalink
Handle categorical values
Browse files Browse the repository at this point in the history
Fixes old errors in production
  • Loading branch information
adamhooper committed Sep 26, 2019
1 parent 899aad1 commit 6421b0b
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 1 deletion.
18 changes: 18 additions & 0 deletions groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,16 @@ def groupby(
# (hopefully) the least computationally-intense.
agg_sets = "size"

# Got categoricals? Make their categories ordered, so min/max work.
# (pandas refuses min/max on an unordered Categorical.)
category_colnames = {
    agg.colname
    for agg in aggregations
    if agg.operation in {Operation.MIN, Operation.MAX}
    # BUG FIX: was `table[colname]` — `colname` is not the comprehension's
    # loop variable, so it resolved to an outer-scope name (NameError, or a
    # stale value from a previous loop). The column under test is agg.colname.
    and hasattr(table[agg.colname], "cat")
}
for colname in category_colnames:
    table[colname] = table[colname].cat.as_ordered()

if group_specs:
# aggs: DataFrame indexed by group
# out: just the group colnames, no values yet (we'll add them later)
Expand Down Expand Up @@ -336,6 +346,14 @@ def groupby(
except AttributeError:
out[outname] = series

# Remember those category colnames we converted to ordered? Now we need to
# undo that (and remove newly-unused categories), so output categoricals
# come back looking the way the caller supplied them.
for colname in out.columns:
    column = out[colname]
    # Only ordered categoricals are touched — those are (presumably) the ones
    # ordered above for min/max. NOTE(review): this also un-orders any column
    # that was ordered in the *input*; confirm that is acceptable.
    # NOTE(review): `inplace=True` on the `.cat` accessor is deprecated and
    # removed in newer pandas — this relies on mutating `out` in place.
    if hasattr(column, "cat") and column.cat.ordered:
        column.cat.remove_unused_categories(inplace=True)
        column.cat.as_unordered(inplace=True)

return out


Expand Down
56 changes: 55 additions & 1 deletion test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def test_aggregate_numbers(self):
),
)

def test_aggregate_strings(self):
def test_aggregate_text_values(self):
result = groupby(
pd.DataFrame({"A": [1, 1, 1], "B": ["a", "b", "a"]}),
[Group("A", None)],
Expand All @@ -367,6 +367,60 @@ def test_aggregate_strings(self):
),
)

def test_aggregate_text_category_values(self):
    """Aggregating a categorical column keeps category dtype in min/max/first."""
    table = pd.DataFrame(
        {"A": [1, 1, 1], "B": pd.Series(["a", "b", "a"], dtype="category")}
    )
    aggregations = [
        Aggregation(Operation.SIZE, "B", "size"),
        Aggregation(Operation.NUNIQUE, "B", "nunique"),
        Aggregation(Operation.MIN, "B", "min"),
        Aggregation(Operation.MAX, "B", "max"),
        Aggregation(Operation.FIRST, "B", "first"),
    ]
    result = groupby(table, [Group("A", None)], aggregations)
    expected = pd.DataFrame(
        {
            "A": [1],
            "size": [3],
            "nunique": [2],
            "min": pd.Series(["a"], dtype="category"),
            "max": pd.Series(["b"], dtype="category"),
            "first": pd.Series(["a"], dtype="category"),
        }
    )
    assert_frame_equal(result, expected)

def test_aggregate_text_category_values_empty_still_has_object_dtype(self):
    """An all-null categorical groups to an empty frame; dtypes must survive."""
    table = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    groups = [Group("A", None)]
    aggregations = [
        Aggregation(Operation.SIZE, "A", "size"),
        Aggregation(Operation.NUNIQUE, "A", "nunique"),
        Aggregation(Operation.MIN, "A", "min"),
        Aggregation(Operation.MAX, "A", "max"),
        Aggregation(Operation.FIRST, "A", "first"),
    ]
    result = groupby(table, groups, aggregations)
    empty_category = pd.Series([], dtype=str).astype("category")
    expected = pd.DataFrame(
        {
            "A": empty_category,
            "size": pd.Series([], dtype=int),
            "nunique": pd.Series([], dtype=int),
            "min": empty_category,
            "max": empty_category,
            "first": empty_category,
        }
    )
    assert_frame_equal(result, expected)

def test_aggregate_datetime_no_granularity(self):
result = groupby(
pd.DataFrame({"A": [dt(2018, 1, 4), dt(2018, 1, 5), dt(2018, 1, 4)]}),
Expand Down

0 comments on commit 6421b0b

Please sign in to comment.