From 902bf2a5e99728a70cabc75260261591adcf249c Mon Sep 17 00:00:00 2001 From: Vincent Arel-Bundock Date: Sat, 23 Dec 2023 14:19:28 -0500 Subject: [PATCH] rye --- .flake8 | 4 - .python-version | 1 + docs/api/comparisons.md | 4 - docs/api/hypotheses.md | 4 - docs/api/predictions.md | 4 - docs/get_started.qmd | 291 ---------------------------------------- docs/index.md | 17 --- docs/reference.md | 5 - mkdocs.yml | 19 --- pyproject.toml | 70 +++++----- pyproject.toml.bak | 43 ++++++ requirements-dev.lock | 34 +++++ requirements.lock | 27 ++++ 13 files changed, 138 insertions(+), 385 deletions(-) delete mode 100644 .flake8 create mode 100644 .python-version delete mode 100644 docs/api/comparisons.md delete mode 100644 docs/api/hypotheses.md delete mode 100644 docs/api/predictions.md delete mode 100644 docs/get_started.qmd delete mode 100644 docs/index.md delete mode 100644 docs/reference.md delete mode 100644 mkdocs.yml create mode 100644 pyproject.toml.bak create mode 100644 requirements-dev.lock create mode 100644 requirements.lock diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 6c70207..0000000 --- a/.flake8 +++ /dev/null @@ -1,4 +0,0 @@ -[flake8] -ignore = E501, E722, W503, E203 -exclude = ".git, __pycache__, .github" -per-file-ignores = __init__.py:F401, sanity.py:W605 \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..57ef42f --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +cpython-x86_64-linux@3.12.0 diff --git a/docs/api/comparisons.md b/docs/api/comparisons.md deleted file mode 100644 index ed92f7d..0000000 --- a/docs/api/comparisons.md +++ /dev/null @@ -1,4 +0,0 @@ -# `comparisons` - -::: marginaleffects.comparisons - :docstring: diff --git a/docs/api/hypotheses.md b/docs/api/hypotheses.md deleted file mode 100644 index 086fc8d..0000000 --- a/docs/api/hypotheses.md +++ /dev/null @@ -1,4 +0,0 @@ -# `hypotheses` - -::: marginaleffects.hypotheses - :docstring: diff --git a/docs/api/predictions.md b/docs/api/predictions.md deleted file mode 100644 index fef6939..0000000 --- a/docs/api/predictions.md +++ /dev/null @@ -1,4 +0,0 @@ -# `predictions` - -::: marginaleffects.predictions - :docstring: diff --git a/docs/get_started.qmd b/docs/get_started.qmd deleted file mode 100644 index 71ed2c2..0000000 --- a/docs/get_started.qmd +++ /dev/null @@ -1,291 +0,0 @@ ---- -title: "`marginaleffects` for Python" -format: gfm ---- - -The `marginaleffects` package allows `Python` users to compute and plot three principal quantities of interest: (1) predictions, (2) comparisons, and (3) slopes. In addition, the package includes a convenience function to compute a fourth estimand, "marginal means", which is a special case of averaged predictions. `marginaleffects` can also average (or "marginalize") unit-level (or "conditional") estimates of all those quantities, and conduct hypothesis tests on them. - -## WARNING - -This is an *alpha* version of the package, released to gather feedback, feature requests, and bug reports from potential users. This version includes known bugs. There are also known discrepancies between the numerical results produced in Python and R. Please report any issues you encounter here: https://github.com/vincentarelbundock/pymarginaleffects/issues - - -## Supported models - -There is a good chance that this package will work with (nearly) all the models supported by [the `statsmodels` formula API,](https://www.statsmodels.org/stable/api.html#statsmodels-formula-api) ex: `ols`, `probit`, `logit`, `mnlogit`, `quantreg`, `poisson`, `negativebinomial`, `mixedlm`, `rlm`, etc. However, the package has only been tested with a subset of those, and some weirdness remains. Again: this is *alpha* software; it should not be used in critical applications yet. - - -## Installation - -Install the latest PyPi release: - -```{python} -#| eval: false -pip install marginaleffects -``` - -## Estimands: Predictions, Comparisons, and Slopes - - -## Definitions - -[_Predictions_:](predictions.html) - -> The outcome predicted by a fitted model on a specified scale for a given combination of values of the predictor variables, such as their observed values, their means, or factor levels. a.k.a. Fitted values, adjusted predictions. `predictions()`, `avg_predictions()`, `plot_predictions()`. - -[_Comparisons_:](comparisons.html) - -> Compare the predictions made by a model for different regressor values (e.g., college graduates vs. others): contrasts, differences, risk ratios, odds, etc. `comparisons()`, `avg_comparisons()`, `plot_comparisons()`. - -[_Slopes_:](slopes.html) - -> Partial derivative of the regression equation with respect to a regressor of interest. a.k.a. Marginal effects, trends. `slopes()`, `avg_slopes()`, `plot_slopes()`. - -[Hypothesis and Equivalence Tests:](hypothesis.html) - -> Hypothesis and equivalence tests can be conducted on linear or non-linear functions of model coefficients, or on any of the quantities computed by the `marginaleffects` packages (predictions, slopes, comparisons, marginal means, etc.). Uncertainy estimates can be obtained via the delta method (with or without robust standard errors), bootstrap, or simulation. - -Predictions, comparisons, and slopes are fundamentally unit-level (or "conditional") quantities. Except in the simplest linear case, estimates will typically vary based on the values of all the regressors in a model. Each of the observations in a dataset is thus associated with its own prediction, comparison, and slope estimates. Below, we will see that it can be useful to marginalize (or "average over") unit-level estimates to report an "average prediction", "average comparison", or "average slope". - -We now apply `marginaleffects` functions to compute each of the estimands described above. First, we fit a linear regression model with multiplicative interactions: - -#### Predictions - -```{python} -import numpy as np -import polars as pl -from marginaleffects import * -import statsmodels.formula.api as smf -mtcars = pl.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv") -mod = smf.ols("mpg ~ hp * wt * am", data = mtcars).fit() - -print(mod.summary().as_text()) -``` - -```{python} -#| include: false -pl.Config( - tbl_formatting="ASCII_MARKDOWN", - tbl_hide_column_data_types=True, - tbl_hide_dataframe_shape=True, -) -pl.Config.set_tbl_cols(8) -``` - -Then, we call the `predictions()` function. As noted above, predictions are unit-level estimates, so there is one specific prediction per observation. By default, the `predictions()` function makes one prediction per observation in the dataset that was used to fit the original model. Since `mtcars` has 32 rows, the `predictions()` outcome also has 32 rows: - -```{python} -pre = predictions(mod) - -pre.shape - -print(pre.head()) -``` - -#### Comparisons: Differences, Ratios, Log-Odds, Lift, etc. - -Now, we use the `comparisons()` function to compute the difference in predicted outcome when each of the predictors is incremented by 1 unit (one predictor at a time, holding all others constant). Once again, comparisons are unit-level quantities. And since there are 3 predictors in the model and our data has 32 rows, we obtain 96 comparisons: - -```{python} -cmp = comparisons(mod) - -cmp.shape - -print(cmp.head()) -``` - -The `comparisons()` function allows customized queries. For example, what happens to the predicted outcome when the `hp` variable increases from 100 to 120? - -```{python} -cmp = comparisons(mod, variables = {"hp": [120, 100]}) -print(cmp) -``` - -What happens to the predicted outcome when the `wt` variable increases by 1 standard deviation about its mean? - -```{python} -cmp = comparisons(mod, variables = {"hp": "sd"}) -print(cmp) -``` - -The `comparisons()` function also allows users to specify arbitrary functions of predictions, with the `comparison` argument. For example, what is the average ratio between predicted Miles per Gallon after an increase of 50 units in Horsepower? - -```{python} -cmp = comparisons( - mod, - variables = {"hp": 50}, - comparison = "ratioavg") -print(cmp) -``` - -#### Slopes: Derivatives and elasticities - -Consider a logistic regression model with a single predictor: - -```{python} -url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv" -mtcars = pl.read_csv(url) -mod = smf.logit("am ~ mpg", data = mtcars).fit() -``` - -We can estimate the slope of the prediction function with respect to the `mpg` variable at any point in the data space. For example, what is the slope of the prediction function at `mpg = 24`? - -```{python} -mfx = slopes(mod, newdata = datagrid(mpg = 24, newdata = mtcars)) -print(mfx) -``` - -This is equivalent to the result we obtain by taking the analytical derivative using the chain rule: - -```{python} -from scipy.stats import logistic -beta_0 = mod.params.iloc[0] -beta_1 = mod.params.iloc[1] -print(beta_1 * logistic.pdf(beta_0 + beta_1 * 24)) -``` - -This computes a "marginal effect (or slope) at the mean" or "at the median", that is, when all covariates are held at their mean or median values: - -```{python} -mfx = slopes(mod, newdata = "mean") -print(mfx) -``` -```{python} -mfx = slopes(mod, newdata = "median") -print(mfx) -``` - -We can also compute an "average slope" or "average marginaleffects" - -```{python} -mfx = avg_slopes(mod) -print(mfx) -``` - -Which again is equivalent to the analytical result: - -```{python} -np.mean(beta_1 * logistic.pdf(beta_0 + beta_1 * mtcars["mpg"])) -``` - -## Grid - -Predictions, comparisons, and slopes are typically "conditional" quantities which depend on the values of all the predictors in the model. By default, `marginaleffects` functions estimate quantities of interest for the empirical distribution of the data (i.e., for each row of the original dataset). However, users can specify the exact values of the predictors they want to investigate by using the `newdata` argument. - -`newdata` accepts data frames like this: - -```{python} -pre = predictions(mod, newdata = mtcars.tail(2)) -print(pre) -``` - -The [`datagrid` function gives us a powerful way to define a grid of predictors.](https://vincentarelbundock.github.io/marginaleffects/reference/datagrid.html) All the variables not mentioned explicitly in `datagrid()` are fixed to their mean or mode: - -```{python} -pre = predictions( - mod, - newdata = datagrid( - newdata = mtcars, - am = [0, 1], - wt = [mtcars["wt"].max(), mtcars["wt"].min()])) - -print(pre) -``` - -## Averaging - -Since predictions, comparisons, and slopes are conditional quantities, they can be a bit unwieldy. Often, it can be useful to report a one-number summary instead of one estimate per observation. Instead of presenting "conditional" estimates, some methodologists recommend reporting "marginal" estimates, that is, an average of unit-level estimates. - -(This use of the word "marginal" as "averaging" should not be confused with the term "marginal effect" which, in the econometrics tradition, corresponds to a partial derivative, or the effect of a "small/marginal" change.) - -To marginalize (average over) our unit-level estimates, we can use the `by` argument or the one of the convenience functions: `avg_predictions()`, `avg_comparisons()`, or `avg_slopes()`. For example, both of these commands give us the same result: the average predicted outcome in the `mtcars` dataset: - -```{python} -pre = avg_predictions(mod) -print(pre) -``` - -This is equivalent to manual computation by: - -```{python} -np.mean(mod.predict()) -``` - -The main `marginaleffects` functions all include a `by` argument, which allows us to marginalize within sub-groups of the data. For example, - -```{python} -cmp = avg_comparisons(mod, by = "am") -print(cmp) -``` - -Marginal Means are a special case of predictions, which are marginalized (or averaged) across a balanced grid of categorical predictors. To illustrate, we estimate a new model with categorical predictors: - -```{python} -dat = mtcars \ - .with_columns( - pl.col("am").cast(pl.Boolean), - pl.col("cyl").cast(pl.Utf8) - ) -mod_cat = smf.ols("mpg ~ am + cyl + hp", data = dat).fit() -``` - -We can compute marginal means manually using the functions already described: - -```{python} -#| eval: false -pre = avg_predictions( - mod_cat, - newdata = datagrid( - newdata = dat, - cyl = dat["cyl"].unique(), - am = dat["am"].unique()), - by = "am") - -print(pre) -``` - -```{python} -cmp = avg_comparisons(mod_cat) -print(cmp) -``` - - -## Hypothesis and equivalence tests - -The `hypotheses()` function and the `hypothesis` argument can be used to conduct linear and non-linear hypothesis tests on model coefficients, or on any of the quantities computed by the functions introduced above. - -Consider this model: - -```{python} -mod = smf.ols("mpg ~ qsec * drat", data = mtcars).fit() -mod.params -``` - -Can we reject the null hypothesis that the `drat` coefficient is 2 times the size of the `qsec` coefficient? - -```{python} -hyp = hypotheses(mod, "b3 = 2 * b2") -print(hyp) -``` - -The main functions in `marginaleffects` all have a `hypothesis` argument, which means that we can do complex model testing. For example, consider two slope estimates: - -```{python} -range = lambda x: [x.max(), x.min()] -cmp = comparisons( - mod, - variables = "drat", - newdata = datagrid(newdata = mtcars, qsec = range(mtcars["qsec"]))) -print(cmp) -``` - -Are these two contrasts significantly different from one another? To test this, we can use the `hypothesis` argument: - -```{python} -cmp = comparisons( - mod, - hypothesis = "b1 = b2", - variables = "drat", - newdata = datagrid(newdata = mtcars, qsec = range(mtcars["qsec"]))) -print(cmp) -``` \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 000ea34..0000000 --- a/docs/index.md +++ /dev/null @@ -1,17 +0,0 @@ -# Welcome to MkDocs - -For full documentation visit [mkdocs.org](https://www.mkdocs.org). - -## Commands - -* `mkdocs new [dir-name]` - Create a new project. -* `mkdocs serve` - Start the live-reloading docs server. -* `mkdocs build` - Build the documentation site. -* `mkdocs -h` - Print help message and exit. - -## Project layout - - mkdocs.yml # The configuration file. - docs/ - index.md # The documentation homepage. - ... # Other markdown pages, images and other files. diff --git a/docs/reference.md b/docs/reference.md deleted file mode 100644 index df51fec..0000000 --- a/docs/reference.md +++ /dev/null @@ -1,5 +0,0 @@ -# API documentation - -* [`predictions`](../api/predictions/index.html) -* [`comparisons`](../api/comparisons/index.html) -* [`hypotheses`](../api/hypotheses/index.html) diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 4d0efb9..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,19 +0,0 @@ -site_name: marginaleffects for Python -nav: - - Home: index.md - - Getting Started: get_started.md - - Functions API: - - predictions: api/predictions.md - - comparisons: api/comparisons.md - - hypotheses: api/hypotheses.md - -theme: - name: "material" - -plugins: - - mkquartodocs - -markdown_extensions: - - admonition - - codehilite - - mkautodoc diff --git a/pyproject.toml b/pyproject.toml index 9914677..14ccdfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,44 +1,40 @@ -[tool.poetry] +[project] name = "marginaleffects" -version = "0.0.6" -description = "" -authors = ["Vincent Arel-Bundock "] +version = "0.0.7" +description = "Compute and plot predictions, slopes, marginal means, and comparisons (contrasts, risk ratios, odds, etc.) for many classes of statistical models in Python. Conduct linear and non-linear hypothesis tests, or equivalence tests. Calculate uncertainty estimates using the delta method." +authors = [ + { name = "Vincent Arel-Bundock", email = "vincent.arel-bundock@umontreal.ca" } +] +dependencies = [ + "pandas>=2.0.3", + "numpy>=1.24.4", + "patsy>=0.5.4", + "polars>=0.20.2", + "pyarrow>=14.0.2", + "scipy>=1.10.1", + "matplotlib>=3.7.4", +] readme = "README.md" +requires-python = ">= 3.9" -[tool.poetry.dependencies] -python = "^3.9" -pandas = "^2.0.2" -numpy = "^1.25.0" -patsy = ">0.5.0" -polars = ">0.18.3" -pyarrow = ">14.0.0" -scipy = "^1.10.0" -matplotlib = "^3.7.2" - -[tool.poetry.group.dev.dependencies] -pytest = "^7.4.0" -jupyter = "^1.0.0" -mkdocs = "^1.4.3" -mkdocs-material = "^9.1.17" -mkautodoc = ">=0.2.0" -matplotlib = "^3.7.1" -statsmodels = ">0.14.0" -typing-extensions = "^4.7.0" -pytest-xdist = "^3.3.1" -bandit = "^1.7.5" -ruff = ">0.0.283" - -[tool.ruff] -ignore-init-module-imports = true - -[tool.ruff.extend-per-file-ignores] -"__init__.py" = ["F401"] +[project.scripts] +hello = "pymarginaleffects:hello" [build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +requires = ["hatchling"] +build-backend = "hatchling.build" -[tool.pytest.ini_options] -filterwarnings = [ - "ignore::DeprecationWarning", +[tool.rye] +managed = true +dev-dependencies = [ + "pytest>=7.4.3", + "pytest-xdist>=3.5.0", + "ruff>=0.1.9", + "statsmodels>=0.14.1", ] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/pymarginaleffects"] diff --git a/pyproject.toml.bak b/pyproject.toml.bak new file mode 100644 index 0000000..245c572 --- /dev/null +++ b/pyproject.toml.bak @@ -0,0 +1,43 @@ +[tool.poetry] +name = "marginaleffects" +version = "0.0.6" +description = "" +authors = ["Vincent Arel-Bundock "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.9" +pandas = "^2.0.2" +numpy = "^1.25.0" +patsy = ">0.5.0" +polars = ">0.18.3" +pyarrow = ">14.0.0" +scipy = "^1.10.0" +matplotlib = "^3.7.2" + +[tool.poetry.group.dev.dependencies] +pytest = "^7.4.0" +jupyter = "^1.0.0" +mkdocs = "^1.4.3" +mkdocs-material = "^9.1.17" +mkautodoc = ">=0.2.0" +statsmodels = ">0.14.0" +typing-extensions = "^4.7.0" +pytest-xdist = "^3.3.1" +bandit = "^1.7.5" +ruff = ">0.0.283" + +[tool.ruff] +ignore-init-module-imports = true + +[tool.ruff.extend-per-file-ignores] +"__init__.py" = ["F401"] + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +filterwarnings = [ + "ignore::DeprecationWarning", +] diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 0000000..a9346b1 --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,34 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false + +-e file:. +contourpy==1.2.0 +cycler==0.12.1 +execnet==2.0.2 +fonttools==4.47.0 +iniconfig==2.0.0 +kiwisolver==1.4.5 +matplotlib==3.8.2 +numpy==1.26.2 +packaging==23.2 +pandas==2.1.4 +patsy==0.5.4 +pillow==10.1.0 +pluggy==1.3.0 +polars==0.20.2 +pyarrow==14.0.2 +pyparsing==3.1.1 +pytest==7.4.3 +pytest-xdist==3.5.0 +python-dateutil==2.8.2 +pytz==2023.3.post1 +ruff==0.1.9 +scipy==1.11.4 +six==1.16.0 +statsmodels==0.14.1 +tzdata==2023.3 diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 0000000..a000b16 --- /dev/null +++ b/requirements.lock @@ -0,0 +1,27 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false + +-e file:. +contourpy==1.2.0 +cycler==0.12.1 +fonttools==4.47.0 +kiwisolver==1.4.5 +matplotlib==3.8.2 +numpy==1.26.2 +packaging==23.2 +pandas==2.1.4 +patsy==0.5.4 +pillow==10.1.0 +polars==0.20.2 +pyarrow==14.0.2 +pyparsing==3.1.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +scipy==1.11.4 +six==1.16.0 +tzdata==2023.3