column order in print with by or datagrid()

vincentarelbundock · Oct 1, 2023 · 54d08d6 · 54d08d6
1 parent f8e7382
commit 54d08d6
Show file tree

Hide file tree

Showing 9 changed files with 53 additions and 14 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
 # dev
 
 * `hypothesis` accepts a float or integer to specify a different null hypothesis.
+* Better column order in printout when using `datagrid()` or `by`.
 
 # 0.0.5
 

diff --git a/marginaleffects/classes.py b/marginaleffects/classes.py
@@ -1,11 +1,15 @@
 import polars as pl
 
 class MarginaleffectsDataFrame(pl.DataFrame):
-    def __init__(self, data=None, by=None, conf_level=0.95):
+    def __init__(self, data=None, by=None, conf_level=0.95, newdata=None):
         if isinstance(data, pl.DataFrame):
             self._df = data._df
             self.by = by
             self.conf_level = conf_level
+            if hasattr(newdata, "datagrid_explicit"):
+                self.datagrid_explicit = newdata.datagrid_explicit
+            else:
+                self.datagrid_explicit = []
             return
         super().__init__(data)
 
@@ -42,6 +46,9 @@ def __str__(self):
                 raise ValueError("by must be None or a string or a list of strings")
         else:
             valid = list(mapping.keys())
+
+        valid = self.datagrid_explicit + valid
+
         valid = [x for x in valid if x in self.columns]
         mapping = {key: mapping[key] for key in mapping if key in valid}
         tmp = self.select(valid).rename(mapping)

diff --git a/marginaleffects/comparisons.py b/marginaleffects/comparisons.py
@@ -273,9 +273,9 @@ def outer(x):
 
     out = get_transform(out, transform=transform)
     out = get_equivalence(out, equivalence=equivalence, df=np.inf)
-    out = sort_columns(out, by=by)
+    out = sort_columns(out, by=by, newdata=newdata)
 
-    out = MarginaleffectsDataFrame(out, by=by, conf_level=conf_level)
+    out = MarginaleffectsDataFrame(out, by=by, conf_level=conf_level, newdata=newdata)
     return out
 
 

diff --git a/marginaleffects/datagrid.py b/marginaleffects/datagrid.py
@@ -89,6 +89,8 @@ def datagrid(
 
     out = reduce(lambda x, y: x.join(y, how="cross"), out.values())
 
+    out.datagrid_explicit = list(kwargs.keys())
+
     return out
 
 
@@ -132,4 +134,6 @@ def datagridcf(model=None, newdata=None, **kwargs):
     # Create rowid and rowidcf
     result = result.with_columns(pl.Series(range(result.shape[0])).alias("rowidcf"))
 
+    result.datagrid_explicit = list(kwargs.keys())
+
     return result
diff --git a/marginaleffects/predictions.py b/marginaleffects/predictions.py
@@ -115,27 +115,43 @@ def predictions(
     y, exog = patsy.dmatrices(model.model.formula, newdata.to_pandas())
 
     # estimands
-    def fun(x):
+    def inner(x):
         out = get_predictions(model, np.array(x), exog)
+
+        if out.shape[0] == newdata.shape[0]:
+            cols = [x for x in newdata.columns if x not in out.columns]
+            out = pl.concat([out, newdata.select(cols)], how="horizontal")
+
+        # group
+        elif "group" in out.columns:
+            meta = newdata.join(out.select("group").unique(), how="cross")
+            cols = [x for x in meta.columns if x in out.columns]
+            out = meta.join(out, on=cols, how="left")
+
+        # unexpected shape: row count differs from newdata and there is no "group" column
+        else:
+            raise ValueError("Something went wrong")
+
         out = get_by(model, out, newdata=newdata, by=by, wts=wts)
         out = get_hypothesis(out, hypothesis=hypothesis)
         return out
 
-    out = fun(model.params)
+    out = inner(model.params)
+
     if vcov is not None:
-        J = get_jacobian(fun, model.params)
+        J = get_jacobian(inner, model.params)
         se = get_se(J, V)
         out = out.with_columns(pl.Series(se).alias("std_error"))
         out = get_z_p_ci(out, model, conf_level=conf_level, hypothesis_null=hypothesis_null)
     out = get_transform(out, transform=transform)
     out = get_equivalence(out, equivalence=equivalence)
-    out = sort_columns(out, by=by)
+    out = sort_columns(out, by=by, newdata=newdata)
 
     # unpad
     if "rowid" in out.columns and pad.shape[0] > 0:
         out = out[:-pad.shape[0]:]
 
-    out = MarginaleffectsDataFrame(out, by=by, conf_level=conf_level)
+    out = MarginaleffectsDataFrame(out, by=by, conf_level=conf_level, newdata=newdata)
     return out
 
 

diff --git a/marginaleffects/sanity.py b/marginaleffects/sanity.py
@@ -49,7 +49,7 @@ def sanitize_by(by):
     return by
 
 
-def sanitize_newdata(model, newdata, wts, by = []):
+def sanitize_newdata(model, newdata, wts, by=[]):
     modeldata = get_modeldata(model)
 
     if newdata is None:
@@ -63,10 +63,14 @@ def sanitize_newdata(model, newdata, wts, by = []):
 
     elif isinstance(newdata, pd.DataFrame):
         out = pl.from_pandas(newdata)
-        
+
     else:
         out = newdata
 
+    datagrid_explicit = None
+    if isinstance(out, pl.DataFrame) and hasattr(out, "datagrid_explicit"):
+        datagrid_explicit = out.datagrid_explicit
+
     if isinstance(by, list) and len(by) > 0:
         by = [x for x in by if x in out.columns]
         if len(by) > 0:
@@ -90,8 +94,10 @@ def sanitize_newdata(model, newdata, wts, by = []):
     if any([isinstance(out[x], pl.Categorical) for x in out.columns]):
         raise ValueError("Categorical type columns are not supported in `newdata`.")
 
-    return out
+    if datagrid_explicit is not None:
+        out.datagrid_explicit = datagrid_explicit
 
+    return out
 
 def sanitize_comparison(comparison, by, wts=None):
     out = comparison

diff --git a/marginaleffects/utils.py b/marginaleffects/utils.py
@@ -16,7 +16,7 @@ def get_modeldata(fit):
     return out
 
 
-def sort_columns(df, by=None):
+def sort_columns(df, by=None, newdata=None):
     cols = [
         "rowid",
         "group",
@@ -30,11 +30,16 @@ def sort_columns(df, by=None):
         "conf_low",
         "conf_high",
     ] + df.columns
+
     if by is not None:
         if isinstance(by, list):
             cols = by + cols
         else:
             cols = [by] + cols
+
+    if isinstance(newdata, pl.DataFrame) and hasattr(newdata, "datagrid_explicit"):
+        cols = newdata.datagrid_explicit + cols
+
     cols = [x for x in cols if x in df.columns]
     cols_unique = []
     for item in cols:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marginaleffects"
-version = "0.0.5.9001"
+version = "0.0.5.9002"
 description = ""
 authors = ["Vincent Arel-Bundock <[email protected]>"]
 readme = "README.md"

diff --git a/tests/test_by.py b/tests/test_by.py
@@ -3,7 +3,7 @@
 from pytest import approx
 import polars as pl
 from marginaleffects import *
-from .utilities import *
+# from .utilities import *
 import statsmodels.formula.api as smf
 
 Guerry = pl.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv", null_values = "NA").drop_nulls()