ENH AnnData integration (#66)

* refactor: switching to AnnData up to genewise dispersion estimation (WIP) * refactor: switch to AnnData up to fit_dispersion_trend (WIP) * refactor: switch to AnnData up to calculate_cooks (WIP) * refactor: switch to AnnData up to refit_cooks (WIP) * refactor: porting DeseqStats to anndata (wip) * refactor: port DeseqStats to anndata (wip) * refactor: adapting _cooks_filtering to anndata (wip) * refactor(DeseqStats): port _cooks_filtering to AnnData, now running on synthetic outlier-less dataset (wip) * refactor: step-by-step pipeline running bug-free with anndata * fix(anndata transition): debug behavior when one or more genes have zeros everywhere * fix(anndata transition): debug behavior when there are Cooks outliers * refactor: update tests for anndata compatibility * refactor: DeseqDataSet now extends AnnData * refactor: remove counts / clinical index checks, which are performed by AnnData * refactor: update pytests now that DeseqDataSet extends Anndata * ci: remove docstring tests for methods that are inherited from AnnData * docs: remove obsolete docstring fields * docs: update docstrings * refactor(DeseqDataSet): improve handling of all-zero genes * refactor(DeseqStats): clean up comments * refactor: require anndata at installation * fix: typo * ci: update pytest exceptions to reflect file name change (DeseqDataSet.py -> dds.py) * ci: update pytest exceptions to reflect file name change (DeseqDataSet.py -> dds.py) * docs: disable auto-generation * docs: update sphinx examples to reflect anndata integration * docs: update docstrings * docs: remove __init__ method from API docs * docs: update docstrings * docs: update getting started example * refactor: clean up inline comments * refactor: clean up inline comments * refactor: build list of non-zero gene indices instead of relying on anndata views as it leads to speed improvements * fix: fix bugs due to data not being assigned in ArrayViews * docs: add recursive option to autosummary * test: pull the list of 
anndata attributes to ignore in tests automatically * refactor: set `non_zero` as a varm field * refactor: set the varm "non_zero" key before calling `_fit_MoM_dispersions()` * refactor: rename "nz_data" to "nonzero_data" * refactor: rename "X" variable to "design_matrix" to lift ambiguity * refactor: avoid duplicating variables when calling np.array * style: split non-zero cooks update
owkin · Feb 16, 2023 · f9653d2 · f9653d2
1 parent 8ab5026
commit f9653d2
Show file tree

Hide file tree

Showing 17 changed files with 666 additions and 427 deletions.
diff --git a/docs/source/api/docstrings/pydeseq2.dds.DeseqDataSet.rst b/docs/source/api/docstrings/pydeseq2.dds.DeseqDataSet.rst
@@ -0,0 +1,21 @@
+pydeseq2.dds.DeseqDataSet
+=========================
+
+.. currentmodule:: pydeseq2.dds
+
+.. autoclass:: pydeseq2.dds.DeseqDataSet
+
+    .. rubric:: Methods
+
+    .. autosummary::
+
+        ~DeseqDataSet.calculate_cooks
+        ~DeseqDataSet.deseq2
+        ~DeseqDataSet.fit_LFC
+        ~DeseqDataSet.fit_MAP_dispersions
+        ~DeseqDataSet.fit_dispersion_prior
+        ~DeseqDataSet.fit_dispersion_trend
+        ~DeseqDataSet.fit_genewise_dispersions
+        ~DeseqDataSet.fit_size_factors
+        ~DeseqDataSet.refit
+
diff --git a/docs/source/api/docstrings/pydeseq2.ds.DeseqStats.rst b/docs/source/api/docstrings/pydeseq2.ds.DeseqStats.rst
@@ -0,0 +1,20 @@
+pydeseq2.ds.DeseqStats
+======================
+
+.. currentmodule:: pydeseq2.ds
+
+.. autoclass:: DeseqStats
+
+   .. rubric:: Methods
+
+   .. autosummary::
+
+      ~DeseqStats.lfc_shrink
+      ~DeseqStats.run_wald_test
+      ~DeseqStats.summary
+
+
+
+
+
+
diff --git a/docs/source/api/docstrings/pydeseq2.grid_search.rst b/docs/source/api/docstrings/pydeseq2.grid_search.rst
@@ -0,0 +1,32 @@
+pydeseq2.grid\_search
+=====================
+
+.. automodule:: pydeseq2.grid_search
+
+
+
+
+
+
+
+   .. rubric:: Functions
+
+   .. autosummary::
+
+      grid_fit_alpha
+      grid_fit_beta
+      grid_fit_shrink_beta
+      vec_nb_nll
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/source/api/docstrings/pydeseq2.preprocessing.rst b/docs/source/api/docstrings/pydeseq2.preprocessing.rst
@@ -0,0 +1,29 @@
+pydeseq2.preprocessing
+======================
+
+.. automodule:: pydeseq2.preprocessing
+
+
+
+
+
+
+
+   .. rubric:: Functions
+
+   .. autosummary::
+
+      deseq2_norm
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/source/api/docstrings/pydeseq2.utils.rst b/docs/source/api/docstrings/pydeseq2.utils.rst
@@ -0,0 +1,47 @@
+pydeseq2.utils
+==============
+
+.. automodule:: pydeseq2.utils
+
+
+
+
+
+
+
+   .. rubric:: Functions
+
+   .. autosummary::
+
+      build_design_matrix
+      dispersion_trend
+      dnb_nll
+      fit_alpha_mle
+      fit_lin_mu
+      fit_moments_dispersions
+      fit_rough_dispersions
+      get_num_processes
+      irls_solver
+      load_example_data
+      nb_nll
+      nbinomFn
+      nbinomGLM
+      robust_method_of_moments_disp
+      test_valid_counts
+      trimmed_cell_variance
+      trimmed_mean
+      trimmed_variance
+      wald_test
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
@@ -9,8 +9,9 @@ PyDESeq2
     :toctree: docstrings
     :recursive:
 
-    dds.DeseqDataSet
-    ds.DeseqStats
+    ~dds.DeseqDataSet
+    ~ds.DeseqStats
     ~utils
     ~grid_search
-    ~preprocessing
+    ~preprocessing
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -62,24 +62,37 @@
     "python": ("https://docs.python.org/3", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "pandas": ("https://pandas.pydata.org/docs/", None),
+    "anndata": ("https://anndata.readthedocs.io/en/latest/", None),
 }
 
 autosectionlabel_prefix_document = True
 
 # autodoc settings
 autodoc_default_options = {
-    "show-inheritance": True,
+    "show-inheritance": False,
+    "inherited-members": False,
     "members": True,
 }
 
 add_module_names = False
 
 autoclass_content = "both"
 autodoc_typehints = "both"
-autosummary_generate = True
+autosummary_generate = False
 autodoc_member_order = "groupwise"
 autodoc_docstring_signature = True
 
+# # This is the expected signature of the handler for this event, cf doc
+# def autodoc_skip_member_handler(app, what, name, obj, skip, options):
+#     # Basic approach; you might want a regex instead
+#     return name.endswith("__")
+#
+#
+# # Automatically called by sphinx at startup
+# def setup(app):
+#     # Connect the autodoc-skip-member event from apidoc to the callback
+#     app.connect("autodoc-skip-member", autodoc_skip_member_handler)
+#
 
 # Bibliography
 bibtex_bibfiles = ["refs.bib"]
@@ -205,6 +218,8 @@
 # The following elements are the link that auto doc were not able to do
 nitpick_ignore = [
     ("py:class", "pd.Series"),
+    #    ("py:class", "anndata.AnnData"),
+    #    ("py:class", "anndata._core.anndata.AnnData"),
     ("py:class", "ndarray"),
     ("py:class", "pydantic.main.BaseModel"),
     ("py:class", "torch.nn.modules.module.Module"),

diff --git a/examples/plot_minimal_pydeseq2_pipeline.py b/examples/plot_minimal_pydeseq2_pipeline.py
@@ -10,6 +10,7 @@
     :depth: 3
 
 We start by importing required packages and setting up an optional path to save results.
+
 """
 
 import os
@@ -175,13 +176,33 @@
         pkl.dump(dds, f)
 
 # %%
-# If needed, we may now access the fitted dispersions and LFCs (in natural log scale):
+# The :class:`DeseqDataSet` class extends the
+# :class:`AnnData <anndata.AnnData>`
+# class.
+
+print(dds)
+
+# %%
+# Hence, parameters are stored according to the :class:`AnnData <anndata.AnnData>` data
+# structure, with key-based data fields. In particular,
+#
+# - ``X`` stores the count data,
+# - ``obs`` stores design factors,
+# - ``obsm`` stores sample-level data, such as ``"design_matrix"`` and
+#   ``"size_factors"``,
+# - ``varm`` stores gene-level data, such as ``"dispersions"`` and ``"LFC"``.
+#
+#
+# As an example, here is how we would access dispersions and LFCs
+# (in natural log scale):
+
+# %%
 
-print(dds.dispersions)
+print(dds.varm["dispersions"])
 
 # %%
 
-print(dds.LFCs)
+print(dds.varm["LFC"])
 
 # %%
 # .. currentmodule:: pydeseq2.ds
@@ -301,7 +322,7 @@
 # Now, if we print log fold changes, we will have two columns in addition to the
 # intercept: one corresponding to the ``group`` variable, and the other to ``condition``.
 
-print(dds.LFCs)
+print(dds.varm["LFC"])
 
 # %%
 # .. currentmodule:: pydeseq2.ds

diff --git a/examples/plot_step_by_step.py b/examples/plot_step_by_step.py
@@ -102,30 +102,32 @@
 
 dds.fit_size_factors()
 
-dds.size_factors
+dds.obsm["size_factors"]
 
 # %%
 # Fit genewise dispersions
 # ^^^^^^^^^^^^^^^^^^^^^^^^
 
 dds.fit_genewise_dispersions()
 
-dds.genewise_dispersions
+dds.varm["genewise_dispersions"]
 
 # %%
 # Fit dispersion trend coefficients
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 dds.fit_dispersion_trend()
-dds.trend_coeffs
-dds.fitted_dispersions
+dds.uns["trend_coeffs"]
+dds.varm["fitted_dispersions"]
 
 # %%
 # Dispersion priors
 # ^^^^^^^^^^^^^^^^^
 
 dds.fit_dispersion_prior()
-print(f"logres_prior={dds._squared_logres}, sigma_prior={dds.prior_disp_var}")
+print(
+    f"logres_prior={dds.uns['_squared_logres']}, sigma_prior={dds.uns['prior_disp_var']}"
+)
 
 # %%
 # MAP Dispersions
@@ -138,8 +140,8 @@
 # stored in `dds.varm["dispersions"]`.
 
 dds.fit_MAP_dispersions()
-dds.MAP_dispersions
-dds.dispersions
+dds.varm["MAP_dispersions"]
+dds.varm["dispersions"]
 
 # %%
 # Fit log fold changes
@@ -150,7 +152,7 @@
 # `DeseqStats` displays LFCs in log2 scale (see later on).
 
 dds.fit_LFC()
-dds.LFCs
+dds.varm["LFC"]
 
 # %%
 # Calculate Cooks distances and refit