Skip to content

Commit

Permalink
feat(python,rust): add drop_first parameter for to_dummies (issue #8246) (#9143)
Browse files Browse the repository at this point in the history
  • Loading branch information
EdmundsEcho authored Jun 23, 2023
1 parent c670555 commit 373a99a
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 28 deletions.
12 changes: 7 additions & 5 deletions polars/polars-ops/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub trait DataFrameOps: IntoDf {
/// "code" => &["X1", "X2", "X3", "X3", "X2", "X2", "X1", "X1"]
/// }.unwrap();
///
/// let dummies = df.to_dummies().unwrap();
/// let dummies = df.to_dummies(None, false).unwrap();
/// dbg!(dummies);
/// # }
/// ```
Expand Down Expand Up @@ -73,24 +73,26 @@ pub trait DataFrameOps: IntoDf {
/// +------+------+------+--------+--------+--------+---------+---------+---------+
/// ```
#[cfg(feature = "to_dummies")]
fn to_dummies(&self, separator: Option<&str>) -> PolarsResult<DataFrame> {
self._to_dummies(None, separator)
fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PolarsResult<DataFrame> {
self._to_dummies(None, separator, drop_first)
}

#[cfg(feature = "to_dummies")]
fn columns_to_dummies(
&self,
columns: Vec<&str>,
separator: Option<&str>,
drop_first: bool,
) -> PolarsResult<DataFrame> {
self._to_dummies(Some(columns), separator)
self._to_dummies(Some(columns), separator, drop_first)
}

#[cfg(feature = "to_dummies")]
fn _to_dummies(
&self,
columns: Option<Vec<&str>>,
separator: Option<&str>,
drop_first: bool,
) -> PolarsResult<DataFrame> {
let df = self.to_df();

Expand All @@ -101,7 +103,7 @@ pub trait DataFrameOps: IntoDf {
df.get_columns()
.par_iter()
.map(|s| match set.contains(s.name()) {
true => s.to_dummies(separator),
true => s.to_dummies(separator, drop_first),
false => Ok(s.clone().into_frame()),
})
.collect::<PolarsResult<Vec<_>>>()
Expand Down
12 changes: 6 additions & 6 deletions polars/polars-ops/src/series/ops/to_dummies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,19 @@ type DummyType = i32;
type DummyCa = Int32Chunked;

pub trait ToDummies {
fn to_dummies(&self, separator: Option<&str>) -> PolarsResult<DataFrame>;
fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PolarsResult<DataFrame>;
}

impl ToDummies for Series {
fn to_dummies(&self, separator: Option<&str>) -> PolarsResult<DataFrame> {
fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PolarsResult<DataFrame> {
let sep = separator.unwrap_or("_");
let col_name = self.name();
let groups = self.group_tuples(true, false)?;
let groups = self.group_tuples(true, drop_first)?;

// safety: groups are in bounds
let columns = unsafe { self.agg_first(&groups) }
.iter()
.zip(groups.iter())
let columns = unsafe { self.agg_first(&groups) };
let columns = columns.iter().zip(groups.iter()).skip(drop_first as usize);
let columns = columns
.map(|(av, group)| {
// strings are formatted with extra \" \" in polars, so we
// extract the string
Expand Down
10 changes: 8 additions & 2 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7697,7 +7697,11 @@ def quantile(
return self._from_pydf(self._df.quantile(quantile, interpolation))

def to_dummies(
self, columns: str | Sequence[str] | None = None, *, separator: str = "_"
self,
columns: str | Sequence[str] | None = None,
*,
separator: str = "_",
drop_first: bool = False,
) -> Self:
"""
Convert categorical variables into dummy/indicator variables.
Expand All @@ -7709,6 +7713,8 @@ def to_dummies(
If set to ``None`` (default), convert all columns.
separator
Separator/delimiter used when generating column names.
drop_first
Remove the first category from the variables being encoded.
Examples
--------
Expand All @@ -7733,7 +7739,7 @@ def to_dummies(
"""
if isinstance(columns, str):
columns = [columns]
return self._from_pydf(self._df.to_dummies(columns, separator))
return self._from_pydf(self._df.to_dummies(columns, separator, drop_first))

def unique(
self,
Expand Down
12 changes: 8 additions & 4 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1235,16 +1235,20 @@ impl PyDataFrame {
Ok(df.into())
}

#[pyo3(signature = (columns, separator, drop_first=false))]
pub fn to_dummies(
&self,
columns: Option<Vec<String>>,
separator: Option<&str>,
drop_first: bool,
) -> PyResult<Self> {
let df = match columns {
Some(cols) => self
.df
.columns_to_dummies(cols.iter().map(|x| x as &str).collect(), separator),
None => self.df.to_dummies(separator),
Some(cols) => self.df.columns_to_dummies(
cols.iter().map(|x| x as &str).collect(),
separator,
drop_first,
),
None => self.df.to_dummies(separator, drop_first),
}
.map_err(PyPolarsErr::from)?;
Ok(df.into())
Expand Down
5 changes: 3 additions & 2 deletions py-polars/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -538,10 +538,11 @@ impl PySeries {
Ok(s.into())
}

fn to_dummies(&self, separator: Option<&str>) -> PyResult<PyDataFrame> {
#[pyo3(signature = (separator, drop_first=false))]
fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PyResult<PyDataFrame> {
let df = self
.series
.to_dummies(separator)
.to_dummies(separator, drop_first)
.map_err(PyPolarsErr::from)?;
Ok(df.into())
}
Expand Down
39 changes: 30 additions & 9 deletions py-polars/tests/unit/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,14 +818,6 @@ def test_shift() -> None:
assert_frame_equal(a, b)


def test_to_dummies() -> None:
df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
dummies = df.to_dummies()
assert dummies["A_a"].to_list() == [1, 0, 0]
assert dummies["A_b"].to_list() == [0, 1, 0]
assert dummies["A_c"].to_list() == [0, 0, 1]


def test_custom_groupby() -> None:
df = pl.DataFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
out = df.groupby("b", maintain_order=True).agg(
Expand Down Expand Up @@ -877,9 +869,17 @@ def test_arg_where() -> None:
assert_series_equal(pl.arg_where(s, eager=True).cast(int), pl.Series([0, 2]))


def test_to_dummies2() -> None:
def test_to_dummies() -> None:
df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
dummies = df.to_dummies()

assert dummies["A_a"].to_list() == [1, 0, 0]
assert dummies["A_b"].to_list() == [0, 1, 0]
assert dummies["A_c"].to_list() == [0, 0, 1]

df = pl.DataFrame({"a": [1, 2, 3]})
res = df.to_dummies()

expected = pl.DataFrame(
{"a_1": [1, 0, 0], "a_2": [0, 1, 0], "a_3": [0, 0, 1]}
).with_columns(pl.all().cast(pl.UInt8))
Expand All @@ -906,6 +906,27 @@ def test_to_dummies2() -> None:
) == {"x_0": [1, 0, 0], "x_1": [0, 1, 0], "x_2": [0, 0, 1]}


def test_to_dummies_drop_first() -> None:
df = pl.DataFrame(
{
"foo": [0, 1, 2],
"bar": [3, 4, 5],
"baz": ["x", "y", "z"],
}
)
dm = df.to_dummies()
dd = df.to_dummies(drop_first=True)

assert dd.columns == ["foo_1", "foo_2", "bar_4", "bar_5", "baz_y", "baz_z"]
assert set(dm.columns) - set(dd.columns) == {"foo_0", "bar_3", "baz_x"}
assert dm.select(dd.columns).frame_equal(dd)
assert dd.rows() == [
(0, 0, 0, 0, 0, 0),
(1, 0, 1, 0, 1, 0),
(0, 1, 0, 1, 0, 1),
]


def test_to_pandas(df: pl.DataFrame) -> None:
# pyarrow cannot deal with unsigned dictionary integer yet.
# pyarrow cannot convert a time64 w/ non-zero nanoseconds
Expand Down

0 comments on commit 373a99a

Please sign in to comment.