diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml
index 9f4f44d749..2086c20aa4 100644
--- a/.azure-pipelines.yml
+++ b/.azure-pipelines.yml
@@ -4,7 +4,7 @@ trigger:
 
 variables:
   python.version: '3.12'
-  PYTEST_ADDOPTS: '-v --color=yes --nunit-xml=test-data/test-results.xml'
+  PYTEST_ADDOPTS: '-v --color=yes --internet-tests --nunit-xml=test-data/test-results.xml'
   TEST_EXTRA: 'test-full'
   DEPENDENCIES_VERSION: "latest"  # |"pre-release" | "minimum-version"
   TEST_TYPE: "standard" # | "coverage"
@@ -44,11 +44,19 @@ jobs:
     inputs:
       key: '"python $(python.version)" | "$(Agent.OS)" | pyproject.toml'
       restoreKeys: |
-        python | "$(Agent.OS)"
-        python
+        "python" | "$(Agent.OS)"
+        "python"
       path: $(uv_cache_dir)
     displayName: Cache pip packages
 
+
+  - task: Cache@2
+    inputs:
+      key: '"pytest"'
+      restoreKeys: '"pytest"'
+      path: $(System.DefaultWorkingDirectory)/.pytest_cache/d
+    displayName: Cache pytest data
+
   - script: |
       export MPLBACKEND="agg"
       echo $MPLBACKEND
diff --git a/docs/extensions/patch_myst_nb.py b/docs/extensions/patch_myst_nb.py
new file mode 100644
index 0000000000..f370acaf5e
--- /dev/null
+++ b/docs/extensions/patch_myst_nb.py
@@ -0,0 +1,30 @@
+"""Extension to patch https://github.com/executablebooks/MyST-NB/pull/599."""
+
+# TODO once MyST-NB 1.1.1/1.2.0 is out, this can be removed.
+
+from __future__ import annotations
+
+from copy import copy
+from typing import TYPE_CHECKING
+
+from myst_nb.core.render import MditRenderMixin
+
+if TYPE_CHECKING:
+    from sphinx.application import Sphinx
+
+
+get_orig = MditRenderMixin.get_cell_level_config
+
+
+def get_cell_level_config(
+    self: MditRenderMixin,
+    field: str,
+    cell_metadata: dict[str, object],
+    line: int | None = None,
+):
+    rv = get_orig(self, field, cell_metadata, line)
+    return copy(rv)
+
+
+def setup(app: Sphinx):
+    MditRenderMixin.get_cell_level_config = get_cell_level_config
diff --git a/docs/references.bib b/docs/references.bib
index 561537cd92..d484922b9d 100644
--- a/docs/references.bib
+++ b/docs/references.bib
@@ -99,6 +99,21 @@ @article{Blondel2008
   pages     = {P10008},
 }
 
+@article{Burczynski2006,
+  author    = {Burczynski, Michael E. and Peterson, Ron L. and Twine, Natalie C. and Zuberek, Krystyna A. and Brodeur, Brendan J. and Casciotti, Lori and Maganti, Vasu and Reddy, Padma S. and Strahs, Andrew and Immermann, Fred and Spinelli, Walter and Schwertschlag, Ulrich and Slager, Anna M. and Cotreau, Monette M. and Dorner, Andrew J.},
+  title     = {Molecular Classification of Crohn’s Disease and Ulcerative Colitis Patients Using Transcriptional Profiles in Peripheral Blood Mononuclear Cells},
+  volume    = {8},
+  issn      = {1525-1578},
+  url       = {https://doi.org/10.2353/jmoldx.2006.050079},
+  doi       = {10.2353/jmoldx.2006.050079},
+  number    = {1},
+  journal   = {The Journal of Molecular Diagnostics},
+  publisher = {Elsevier BV},
+  year      = {2006},
+  month     = {feb},
+  pages     = {51--61},
+}
+
 @article{Butler2018,
   author    = {Butler, Andrew and Hoffman, Paul and Smibert, Peter and Papalexi, Efthymia and Satija, Rahul},
   title     = {Integrating single-cell transcriptomic data across different conditions, technologies, and species},
diff --git a/docs/release-notes/1.10.2.md b/docs/release-notes/1.10.2.md
index 6f1650be55..391fb2b90b 100644
--- a/docs/release-notes/1.10.2.md
+++ b/docs/release-notes/1.10.2.md
@@ -12,6 +12,7 @@
 * Fixed incorrect instructions in "testing" dev docs {pr}`2994` {smaller}`I Virshup`
 * Update marsilea tutorial to use `group_` methods {pr}`3001` {smaller}`I Virshup`
 * Fixed citations {pr}`3032` {smaller}`P Angerer`
+* Improve dataset documentation {pr}`3060` {smaller}`P Angerer`
 
 ```{rubric} Bug fixes
 ```
diff --git a/scanpy/_utils/_doctests.py b/scanpy/_utils/_doctests.py
index 6a339ab7f8..6a08099a24 100644
--- a/scanpy/_utils/_doctests.py
+++ b/scanpy/_utils/_doctests.py
@@ -26,3 +26,10 @@ def decorator(func: F) -> F:
         return func
 
     return decorator
+
+
+def doctest_internet(func: F) -> F:
+    """Mark function so doctest gets the internet mark."""
+
+    func._doctest_internet = True
+    return func
diff --git a/scanpy/datasets/_datasets.py b/scanpy/datasets/_datasets.py
index fb75119cb9..ccbc9a3bb3 100644
--- a/scanpy/datasets/_datasets.py
+++ b/scanpy/datasets/_datasets.py
@@ -4,14 +4,14 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-import anndata as ad
 import numpy as np
 import pandas as pd
+from anndata import AnnData
 
 from .. import _utils
-from .. import logging as logg
 from .._compat import old_positionals
 from .._settings import settings
+from .._utils._doctests import doctest_internet, doctest_needs
 from ..readwrite import read, read_visium
 from ._utils import check_datasetdir_exists, filter_oldformatwarning
 
@@ -20,6 +20,37 @@
 
     from .._utils import AnyRandom
 
+    VisiumSampleID = Literal[
+        "V1_Breast_Cancer_Block_A_Section_1",
+        "V1_Breast_Cancer_Block_A_Section_2",
+        "V1_Human_Heart",
+        "V1_Human_Lymph_Node",
+        "V1_Mouse_Kidney",
+        "V1_Adult_Mouse_Brain",
+        "V1_Mouse_Brain_Sagittal_Posterior",
+        "V1_Mouse_Brain_Sagittal_Posterior_Section_2",
+        "V1_Mouse_Brain_Sagittal_Anterior",
+        "V1_Mouse_Brain_Sagittal_Anterior_Section_2",
+        "V1_Human_Brain_Section_1",
+        "V1_Human_Brain_Section_2",
+        "V1_Adult_Mouse_Brain_Coronal_Section_1",
+        "V1_Adult_Mouse_Brain_Coronal_Section_2",
+        # spaceranger version 1.2.0
+        "Targeted_Visium_Human_Cerebellum_Neuroscience",
+        "Parent_Visium_Human_Cerebellum",
+        "Targeted_Visium_Human_SpinalCord_Neuroscience",
+        "Parent_Visium_Human_SpinalCord",
+        "Targeted_Visium_Human_Glioblastoma_Pan_Cancer",
+        "Parent_Visium_Human_Glioblastoma",
+        "Targeted_Visium_Human_BreastCancer_Immunology",
+        "Parent_Visium_Human_BreastCancer",
+        "Targeted_Visium_Human_OvarianCancer_Pan_Cancer",
+        "Targeted_Visium_Human_OvarianCancer_Immunology",
+        "Parent_Visium_Human_OvarianCancer",
+        "Targeted_Visium_Human_ColorectalCancer_GeneSignature",
+        "Parent_Visium_Human_ColorectalCancer",
+    ]
+
 HERE = Path(__file__).parent
 
 
@@ -33,7 +64,7 @@ def blobs(
     cluster_std: float = 1.0,
     n_observations: int = 640,
     random_state: AnyRandom = 0,
-) -> ad.AnnData:
+) -> AnnData:
     """\
     Gaussian Blobs.
 
@@ -55,6 +86,13 @@ def blobs(
     -------
     Annotated data matrix containing a observation annotation 'blobs' that
     indicates cluster identity.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.blobs()
+    AnnData object with n_obs × n_vars = 640 × 11
+        obs: 'blobs'
     """
     import sklearn.datasets
 
@@ -65,38 +103,43 @@ def blobs(
         cluster_std=cluster_std,
         random_state=random_state,
     )
-    return ad.AnnData(X, obs=dict(blobs=y.astype(str)))
+    return AnnData(X, obs=dict(blobs=y.astype(str)))
 
 
+@doctest_internet
 @check_datasetdir_exists
-def burczynski06() -> ad.AnnData:
+def burczynski06() -> AnnData:
     """\
-    Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).
+    Bulk data with conditions ulcerative colitis (UC) and Crohn’s disease (CD) :cite:p:`Burczynski2006`.
 
     The study assesses transcriptional profiles in peripheral blood mononuclear
     cells from 42 healthy individuals, 59 CD patients, and 26 UC patients by
     hybridization to microarrays interrogating more than 22,000 sequences.
 
-    Reference
-    ---------
-    Burczynski et al., "Molecular classification of Crohn's disease and
-    ulcerative colitis patients using transcriptional profiles in peripheral
-    blood mononuclear cells"
-    J Mol Diagn 8, 51 (2006). PMID:16436634.
+    Returns
+    -------
+    Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.burczynski06()
+    AnnData object with n_obs × n_vars = 127 × 22283
+        obs: 'groups'
     """
     filename = settings.datasetdir / "burczynski06/GDS1615_full.soft.gz"
     url = "ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz"
-    adata = read(filename, backup_url=url)
-    return adata
+    return read(filename, backup_url=url)
 
 
-def krumsiek11() -> ad.AnnData:
+def krumsiek11() -> AnnData:
     """\
     Simulated myeloid progenitors :cite:p:`Krumsiek2011`.
 
     The literature-curated boolean network from :cite:t:`Krumsiek2011` was used to
-    simulate the data. It describes development to four cell fates: 'monocyte',
-    'erythrocyte', 'megakaryocyte' and 'neutrophil'.
+    simulate the data. It describes development to four cell fates annotated in
+    :attr:`~anndata.AnnData.obs`\\ `["cell_type"]`:
+    “monocyte” (`Mo`), “erythrocyte” (`Ery`), “megakaryocyte” (`Mk`) and “neutrophil” (`Neu`).
 
     See also the discussion of this data in :cite:t:`Wolf2019`.
 
@@ -105,14 +148,23 @@ def krumsiek11() -> ad.AnnData:
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.krumsiek11()
+    UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
+        utils.warn_names_duplicates("obs")
+    AnnData object with n_obs × n_vars = 640 × 11
+        obs: 'cell_type'
+        uns: 'iroot', 'highlights'
     """
-    filename = HERE / "krumsiek11.txt"
     with settings.verbosity.override("error"):  # suppress output...
-        adata = read(filename, first_column_names=True)
+        adata = read(HERE / "krumsiek11.txt", first_column_names=True)
     adata.uns["iroot"] = 0
     fate_labels = {0: "Stem", 159: "Mo", 319: "Ery", 459: "Mk", 619: "Neu"}
     adata.uns["highlights"] = fate_labels
-    cell_type = np.array(["progenitor" for i in range(adata.n_obs)])
+    cell_type = pd.array(["progenitor"]).repeat(adata.n_obs)
     cell_type[80:160] = "Mo"
     cell_type[240:320] = "Ery"
     cell_type[400:480] = "Mk"
@@ -122,14 +174,32 @@ def krumsiek11() -> ad.AnnData:
     return adata
 
 
+@doctest_internet
+@doctest_needs("openpyxl")
 @check_datasetdir_exists
-def moignard15() -> ad.AnnData:
+def moignard15() -> AnnData:
     """\
     Hematopoiesis in early mouse embryos :cite:p:`Moignard2015`.
 
+    The data was obtained using qRT–PCR.
+    :attr:`~anndata.AnnData.X` contains the normalized dCt values from supp. table 7 of the publication.
+
+    :attr:`~anndata.AnnData.obs`\\ `["exp_groups"]` contains the stages derived by
+    flow sorting and GFP marker status:
+    “primitive streak” (`PS`), “neural plate” (`NP`), “head fold” (`HF`),
+    “four somite” blood/GFP⁺ (`4SG`), and “four somite” endothelial/GFP⁻ (`4SFG`).
+
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.moignard15()
+    AnnData object with n_obs × n_vars = 3934 × 42
+        obs: 'exp_groups'
+        uns: 'iroot', 'exp_groups_colors'
     """
     filename = settings.datasetdir / "moignard15/nbt.3154-S3.xlsx"
     backup_url = "https://static-content.springer.com/esm/art%3A10.1038%2Fnbt.3154/MediaObjects/41587_2015_BFnbt3154_MOESM4_ESM.xlsx"
@@ -160,25 +230,30 @@ def moignard15() -> ad.AnnData:
     return adata
 
 
+@doctest_internet
 @check_datasetdir_exists
-def paul15() -> ad.AnnData:
+def paul15() -> AnnData:
     """\
     Development of Myeloid Progenitors :cite:p:`Paul2015`.
 
     Non-logarithmized raw data.
 
     The data has been sent out by Email from the Amit Lab. An R version for
-    loading the data can be found here
-    https://github.com/theislab/scAnalysisTutorial
+    loading the data can be found `here
+    <https://github.com/theislab/scAnalysisTutorial>`_.
 
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.paul15()
+    AnnData object with n_obs × n_vars = 2730 × 3451
+        obs: 'paul15_clusters'
+        uns: 'iroot'
     """
-    logg.warning(
-        "In Scanpy 0.*, this returned logarithmized data. "
-        "Now it returns non-logarithmized data."
-    )
     import h5py
 
     filename = settings.datasetdir / "paul15/paul15.h5"
@@ -193,9 +268,9 @@ def paul15() -> ad.AnnData:
         clusters = f["cluster.id"][()].flatten().astype(int)
         infogenes_names = f["info.genes_strings"][()].astype(str)
     # each row has to correspond to a observation, therefore transpose
-    adata = ad.AnnData(X.transpose())
+    adata = AnnData(X.transpose())
     adata.var_names = gene_names
-    adata.row_names = cell_names
+    adata.obs_names = cell_names
     # names reflecting the cell type identifications from the paper
     cell_type = 6 * ["Ery"]
     cell_type += "MEP Mk GMP GMP DC Baso Baso Mo Mo Neu Neu Eos Lymph".split()
@@ -216,7 +291,7 @@ def paul15() -> ad.AnnData:
     return adata
 
 
-def toggleswitch() -> ad.AnnData:
+def toggleswitch() -> AnnData:
     """\
     Simulated toggleswitch.
 
@@ -227,6 +302,15 @@ def toggleswitch() -> ad.AnnData:
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.toggleswitch()
+    UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
+        utils.warn_names_duplicates("obs")
+    AnnData object with n_obs × n_vars = 200 × 2
+        uns: 'iroot'
     """
     filename = HERE / "toggleswitch.txt"
     adata = read(filename, first_column_names=True)
@@ -235,23 +319,38 @@ def toggleswitch() -> ad.AnnData:
 
 
 @filter_oldformatwarning
-def pbmc68k_reduced() -> ad.AnnData:
+def pbmc68k_reduced() -> AnnData:
     """\
     Subsampled and processed 68k PBMCs.
 
-    10x PBMC 68k dataset from
-    https://support.10xgenomics.com/single-cell-gene-expression/datasets
+    `PBMC 68k dataset`_ from 10x Genomics.
 
-    The original PBMC 68k dataset was preprocessed using scanpy and was saved
-    keeping only 724 cells and 221 highly variable genes.
+    The original PBMC 68k dataset was preprocessed with steps including
+    :func:`~scanpy.pp.normalize_total`\\ [#norm]_ and :func:`~scanpy.pp.scale`.
+    It was saved keeping only 700 cells and 765 highly variable genes.
 
     The saved file contains the annotation of cell types (key: `'bulk_labels'`),
     UMAP coordinates, louvain clustering and gene rankings based on the
     `bulk_labels`.
 
+    .. [#norm] Back when the dataset was created, :func:`~scanpy.pp.normalize_per_cell` was used instead.
+    .. _PBMC 68k dataset: https://www.10xgenomics.com/datasets/fresh-68-k-pbm-cs-donor-a-1-standard-1-1-0
+
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.pbmc68k_reduced()
+    AnnData object with n_obs × n_vars = 700 × 765
+        obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain'
+        var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
+        uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
+        obsm: 'X_pca', 'X_umap'
+        varm: 'PCs'
+        obsp: 'distances', 'connectivities'
     """
 
     filename = HERE / "10x_pbmc68k_reduced.h5ad"
@@ -260,24 +359,25 @@ def pbmc68k_reduced() -> ad.AnnData:
         return read(filename)
 
 
+@doctest_internet
 @filter_oldformatwarning
 @check_datasetdir_exists
-def pbmc3k() -> ad.AnnData:
+def pbmc3k() -> AnnData:
     """\
     3k PBMCs from 10x Genomics.
 
     The data consists in 3k PBMCs from a Healthy Donor and is freely available
-    from 10x Genomics (`here
-    <https://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz>`__
-    from this `webpage
-    <https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k>`__).
+    from 10x Genomics (file_ from this webpage_).
 
-    The exact same data is also used in Seurat's
-    `basic clustering tutorial <https://satijalab.org/seurat/articles/pbmc3k_tutorial.html>`__.
+    The exact same data is also used in Seurat’s `basic clustering tutorial`_.
 
-    .. note::
+    .. _file: https://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz
+    .. _webpage: https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k
+    .. _basic clustering tutorial: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html
 
-        This downloads 5.9 MB of data upon the first call of the function and stores it in `./data/pbmc3k_raw.h5ad`.
+    .. note::
+       This downloads 5.9 MB of data upon the first call of the function and stores it in
+       :attr:`~scanpy._settings.ScanpyConfig.datasetdir`\\ `/pbmc3k_raw.h5ad`.
 
     The following code was run to produce the file.
 
@@ -298,38 +398,70 @@ def pbmc3k() -> ad.AnnData:
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.pbmc3k()
+    AnnData object with n_obs × n_vars = 2700 × 32738
+        var: 'gene_ids'
     """
     url = "https://falexwolf.de/data/pbmc3k_raw.h5ad"
     adata = read(settings.datasetdir / "pbmc3k_raw.h5ad", backup_url=url)
     return adata
 
 
+@doctest_internet
 @filter_oldformatwarning
 @check_datasetdir_exists
-def pbmc3k_processed() -> ad.AnnData:
-    """Processed 3k PBMCs from 10x Genomics.
+def pbmc3k_processed() -> AnnData:
+    """\
+    Processed 3k PBMCs from 10x Genomics.
 
     Processed using the basic tutorial :doc:`/tutorials/basics/clustering-2017`.
 
+    For preprocessing, cells are filtered out that have few gene counts or too high a `percent_mito`.
+    The counts are logarithmized and only genes marked by :func:`~scanpy.pp.highly_variable_genes` are retained.
+    The :attr:`~anndata.AnnData.obs` variables `n_counts` and `percent_mito` are corrected for
+    using :func:`~scanpy.pp.regress_out`, and values are scaled and clipped by :func:`~scanpy.pp.scale`.
+    Finally, :func:`~scanpy.pp.pca` and :func:`~scanpy.pp.neighbors` are calculated.
+
+    As analysis steps, the embeddings :func:`~scanpy.tl.tsne` and :func:`~scanpy.tl.umap` are computed.
+    Communities are identified using :func:`~scanpy.tl.louvain` and marker genes using :func:`~scanpy.tl.rank_genes_groups`.
+
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> sc.datasets.pbmc3k_processed()
+    AnnData object with n_obs × n_vars = 2638 × 1838
+        obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain'
+        var: 'n_cells'
+        uns: 'draw_graph', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
+        obsm: 'X_pca', 'X_tsne', 'X_umap', 'X_draw_graph_fr'
+        varm: 'PCs'
+        obsp: 'distances', 'connectivities'
     """
+    url = "https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad"
+
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
-        return read(
-            settings.datasetdir / "pbmc3k_processed.h5ad",
-            backup_url="https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad",
-        )
+        return read(settings.datasetdir / "pbmc3k_processed.h5ad", backup_url=url)
 
 
 def _download_visium_dataset(
-    sample_id: str,
-    spaceranger_version: str,
+    sample_id: VisiumSampleID,
+    spaceranger_version: Literal["1.1.0", "1.2.0"],
+    *,
     base_dir: Path | None = None,
     download_image: bool = False,
-):
-    """
+) -> Path:
+    """\
+    Download Visium spatial data from 10x Genomics’ database.
+
     Params
     ------
     sample_id
@@ -344,7 +476,7 @@ def _download_visium_dataset(
     if base_dir is None:
         base_dir = settings.datasetdir
 
-    url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}/"
+    url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}"
 
     sample_dir = base_dir / sample_id
     sample_dir.mkdir(exist_ok=True)
@@ -353,9 +485,10 @@ def _download_visium_dataset(
     tar_filename = f"{sample_id}_spatial.tar.gz"
     tar_pth = sample_dir / tar_filename
     _utils.check_presence_download(
-        filename=tar_pth, backup_url=url_prefix + tar_filename
+        filename=tar_pth, backup_url=f"{url_prefix}/{tar_filename}"
     )
     with tarfile.open(tar_pth) as f:
+        f.extraction_filter = tarfile.data_filter
         for el in f:
             if not (sample_dir / el.name).exists():
                 f.extract(el, sample_dir)
@@ -363,79 +496,59 @@ def _download_visium_dataset(
     # Download counts
     _utils.check_presence_download(
         filename=sample_dir / "filtered_feature_bc_matrix.h5",
-        backup_url=url_prefix + f"{sample_id}_filtered_feature_bc_matrix.h5",
+        backup_url=f"{url_prefix}/{sample_id}_filtered_feature_bc_matrix.h5",
     )
 
     # Download image
     if download_image:
         _utils.check_presence_download(
             filename=sample_dir / "image.tif",
-            backup_url=url_prefix + f"{sample_id}_image.tif",
+            backup_url=f"{url_prefix}/{sample_id}_image.tif",
         )
 
+    return sample_dir
+
 
+@doctest_internet
 @check_datasetdir_exists
 def visium_sge(
-    sample_id: Literal[
-        "V1_Breast_Cancer_Block_A_Section_1",
-        "V1_Breast_Cancer_Block_A_Section_2",
-        "V1_Human_Heart",
-        "V1_Human_Lymph_Node",
-        "V1_Mouse_Kidney",
-        "V1_Adult_Mouse_Brain",
-        "V1_Mouse_Brain_Sagittal_Posterior",
-        "V1_Mouse_Brain_Sagittal_Posterior_Section_2",
-        "V1_Mouse_Brain_Sagittal_Anterior",
-        "V1_Mouse_Brain_Sagittal_Anterior_Section_2",
-        "V1_Human_Brain_Section_1",
-        "V1_Human_Brain_Section_2",
-        "V1_Adult_Mouse_Brain_Coronal_Section_1",
-        "V1_Adult_Mouse_Brain_Coronal_Section_2",
-        # spaceranger version 1.2.0
-        "Targeted_Visium_Human_Cerebellum_Neuroscience",
-        "Parent_Visium_Human_Cerebellum",
-        "Targeted_Visium_Human_SpinalCord_Neuroscience",
-        "Parent_Visium_Human_SpinalCord",
-        "Targeted_Visium_Human_Glioblastoma_Pan_Cancer",
-        "Parent_Visium_Human_Glioblastoma",
-        "Targeted_Visium_Human_BreastCancer_Immunology",
-        "Parent_Visium_Human_BreastCancer",
-        "Targeted_Visium_Human_OvarianCancer_Pan_Cancer",
-        "Targeted_Visium_Human_OvarianCancer_Immunology",
-        "Parent_Visium_Human_OvarianCancer",
-        "Targeted_Visium_Human_ColorectalCancer_GeneSignature",
-        "Parent_Visium_Human_ColorectalCancer",
-    ] = "V1_Breast_Cancer_Block_A_Section_1",
+    sample_id: VisiumSampleID = "V1_Breast_Cancer_Block_A_Section_1",
     *,
     include_hires_tiff: bool = False,
-) -> ad.AnnData:
+) -> AnnData:
     """\
-    Processed Visium Spatial Gene Expression data from 10x Genomics.
-    Database: https://support.10xgenomics.com/spatial-gene-expression/datasets
+    Processed Visium Spatial Gene Expression data from 10x Genomics’ database.
+
+    The database_ can be browsed online to find the ``sample_id`` you want.
+
+    .. _database: https://support.10xgenomics.com/spatial-gene-expression/datasets
 
     Parameters
     ----------
     sample_id
         The ID of the data sample in 10x’s spatial database.
     include_hires_tiff
-        Download and include the high-resolution tissue image (tiff) in `adata.uns["spatial"][sample_id]["metadata"]["source_image_path"]`.
+        Download and include the high-resolution tissue image (tiff) in
+        `adata.uns["spatial"][sample_id]["metadata"]["source_image_path"]`.
 
     Returns
     -------
     Annotated data matrix.
+
+    Examples
+    --------
+
+    >>> import scanpy as sc
+    >>> sc.datasets.visium_sge(sample_id='V1_Breast_Cancer_Block_A_Section_1')
+    AnnData object with n_obs × n_vars = 3798 × 36601
+        obs: 'in_tissue', 'array_row', 'array_col'
+        var: 'gene_ids', 'feature_types', 'genome'
+        uns: 'spatial'
+        obsm: 'spatial'
     """
-    if "V1_" in sample_id:
-        spaceranger_version = "1.1.0"
-    else:
-        spaceranger_version = "1.2.0"
-    _download_visium_dataset(
+    spaceranger_version = "1.1.0" if "V1_" in sample_id else "1.2.0"
+    sample_dir = _download_visium_dataset(
         sample_id, spaceranger_version, download_image=include_hires_tiff
     )
-    if include_hires_tiff:
-        adata = read_visium(
-            settings.datasetdir / sample_id,
-            source_image_path=settings.datasetdir / sample_id / "image.tif",
-        )
-    else:
-        adata = read_visium(settings.datasetdir / sample_id)
-    return adata
+    source_image_path = sample_dir / "image.tif" if include_hires_tiff else None
+    return read_visium(sample_dir, source_image_path=source_image_path)
diff --git a/scanpy/datasets/_ebi_expression_atlas.py b/scanpy/datasets/_ebi_expression_atlas.py
index 2b8b6f06ff..9f3bcb81ad 100644
--- a/scanpy/datasets/_ebi_expression_atlas.py
+++ b/scanpy/datasets/_ebi_expression_atlas.py
@@ -12,6 +12,7 @@
 
 from .. import logging as logg
 from .._settings import settings
+from .._utils._doctests import doctest_internet
 from ..readwrite import _download
 from ._utils import check_datasetdir_exists
 
@@ -98,30 +99,40 @@ def read_expression_from_archive(archive: ZipFile) -> anndata.AnnData:
     return adata
 
 
+@doctest_internet
 def ebi_expression_atlas(
     accession: str, *, filter_boring: bool = False
 ) -> anndata.AnnData:
     """\
-    Load a dataset from the `EBI Single Cell Expression Atlas
-    <https://www.ebi.ac.uk/gxa/sc/experiments>`__
+    Load a dataset from the EBI Single Cell Expression Atlas.
 
+    The atlas_ can be browsed online to find the ``accession`` you want.
     Downloaded datasets are saved in the directory specified by
     :attr:`~scanpy._settings.ScanpyConfig.datasetdir`.
 
+    .. _atlas: https://www.ebi.ac.uk/gxa/sc/experiments
+
     Params
     ------
     accession
         Dataset accession. Like ``E-GEOD-98816`` or ``E-MTAB-4888``.
-        This can be found in the url on the datasets page, for example
-        https://www.ebi.ac.uk/gxa/sc/experiments/E-GEOD-98816/results/tsne.
+        This can be found in the url on the datasets page, for example E-GEOD-98816_.
+
+        .. _E-GEOD-98816: https://www.ebi.ac.uk/gxa/sc/experiments/E-GEOD-98816/results/tsne
     filter_boring
         Whether boring labels in `.obs` should be automatically removed, such as
         labels with a single or :attr:`~anndata.AnnData.n_obs` distinct values.
 
+    Returns
+    -------
+    Annotated data matrix.
+
     Example
     -------
     >>> import scanpy as sc
-    >>> adata = sc.datasets.ebi_expression_atlas("E-MTAB-4888")
+    >>> sc.datasets.ebi_expression_atlas("E-MTAB-4888")  # doctest: +ELLIPSIS
+    AnnData object with n_obs × n_vars = 2261 × 23899
+        obs: 'Sample Characteristic[organism]', 'Sample Characteristic Ontology Term[organism]', ..., 'Factor Value[cell type]', 'Factor Value Ontology Term[cell type]'
     """
     experiment_dir = settings.datasetdir / accession
     dataset_path = experiment_dir / f"{accession}.h5ad"
diff --git a/scanpy/datasets/_utils.py b/scanpy/datasets/_utils.py
index 91337a40bc..fb0c609102 100644
--- a/scanpy/datasets/_utils.py
+++ b/scanpy/datasets/_utils.py
@@ -2,29 +2,37 @@
 
 import warnings
 from functools import wraps
+from typing import TYPE_CHECKING
 
 import anndata as ad
 from packaging.version import Version
 
 from .._settings import settings
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from typing import ParamSpec, TypeVar
 
-def check_datasetdir_exists(f):
+    P = ParamSpec("P")
+    R = TypeVar("R")
+
+
+def check_datasetdir_exists(f: Callable[P, R]) -> Callable[P, R]:
     @wraps(f)
-    def wrapper(*args, **kwargs):
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         settings.datasetdir.mkdir(exist_ok=True)
         return f(*args, **kwargs)
 
     return wrapper
 
 
-def filter_oldformatwarning(f):
+def filter_oldformatwarning(f: Callable[P, R]) -> Callable[P, R]:
     """
     Filters anndata.OldFormatWarning from being thrown by the wrapped function.
     """
 
     @wraps(f)
-    def wrapper(*args, **kwargs):
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
         with warnings.catch_warnings():
             if Version(ad.__version__).release >= (0, 8):
                 warnings.filterwarnings(
diff --git a/scanpy/preprocessing/_deprecated/__init__.py b/scanpy/preprocessing/_deprecated/__init__.py
index f0b3947fd9..c2c363af01 100644
--- a/scanpy/preprocessing/_deprecated/__init__.py
+++ b/scanpy/preprocessing/_deprecated/__init__.py
@@ -55,12 +55,10 @@ def normalize_per_cell_weinreb16_deprecated(
 
 def zscore_deprecated(X: np.ndarray) -> np.ndarray:
     """\
-    Z-score standardize each variable/gene in X.
+    Z-score standardize each variable/gene in X :cite:p:`Weinreb2017`.
 
     Use `scale` instead.
 
-    Reference: Weinreb et al. (2017).
-
     Parameters
     ----------
     X
diff --git a/scanpy/preprocessing/_deprecated/highly_variable_genes.py b/scanpy/preprocessing/_deprecated/highly_variable_genes.py
index 404ac8f1fd..ff29536ac8 100644
--- a/scanpy/preprocessing/_deprecated/highly_variable_genes.py
+++ b/scanpy/preprocessing/_deprecated/highly_variable_genes.py
@@ -252,10 +252,7 @@ def filter_genes_fano_deprecated(X, Ecutoff, Vcutoff):
 
 
 def _filter_genes(X, e_cutoff, v_cutoff, meth):
-    """\
-    See `filter_genes_dispersion`.
-
-    Reference: Weinreb et al. (2017)."""
+    """See `filter_genes_dispersion` :cite:p:`Weinreb2017`."""
     if issparse(X):
         raise ValueError("Not defined for sparse input. See `filter_genes_dispersion`.")
     mean_filter = np.mean(X, axis=0) > e_cutoff
diff --git a/scanpy/tests/conftest.py b/scanpy/tests/conftest.py
index 71f38ba022..52bc61168a 100644
--- a/scanpy/tests/conftest.py
+++ b/scanpy/tests/conftest.py
@@ -144,25 +144,3 @@ def plt():
     from matplotlib import pyplot as plt
 
     return plt
-
-
-@pytest.fixture
-def tmp_dataset_dir(tmp_path_factory):
-    import scanpy
-
-    new_dir = tmp_path_factory.mktemp("scanpy_data")
-    old_dir = scanpy.settings.datasetdir
-    scanpy.settings.datasetdir = new_dir  # Set up
-    yield scanpy.settings.datasetdir
-    scanpy.settings.datasetdir = old_dir  # Tear down
-
-
-@pytest.fixture
-def tmp_write_dir(tmp_path_factory):
-    import scanpy
-
-    new_dir = tmp_path_factory.mktemp("scanpy_write")
-    old_dir = scanpy.settings.writedir
-    scanpy.settings.writedir = new_dir  # Set up
-    yield scanpy.settings.writedir
-    scanpy.settings.writedir = old_dir  # Tear down
diff --git a/scanpy/tests/test_datasets.py b/scanpy/tests/test_datasets.py
index d1230413a2..7e69821ac9 100644
--- a/scanpy/tests/test_datasets.py
+++ b/scanpy/tests/test_datasets.py
@@ -6,52 +6,62 @@
 
 import subprocess
 import warnings
+from collections import defaultdict
 from pathlib import Path
+from textwrap import dedent
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pytest
 from anndata.tests.helpers import assert_adata_equal
 
 import scanpy as sc
+from testing.scanpy._pytest.marks import needs
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
 
-@pytest.fixture(scope="module")
-def tmp_dataset_dir(tmp_path_factory):
-    new_dir = tmp_path_factory.mktemp("scanpy_data")
-    old_dir = sc.settings.datasetdir
-    sc.settings.datasetdir = new_dir  # Set up
-    yield sc.settings.datasetdir
-    sc.settings.datasetdir = old_dir  # Tear down
+    from anndata import AnnData
 
 
 @pytest.mark.internet
-def test_burczynski06(tmp_dataset_dir):
-    adata = sc.datasets.burczynski06()
+def test_burczynski06():
+    with pytest.warns(UserWarning, match=r"Variable names are not unique"):
+        adata = sc.datasets.burczynski06()
     assert adata.shape == (127, 22283)
     assert not (adata.X == 0).any()
 
 
 @pytest.mark.internet
-def test_moignard15(tmp_dataset_dir):
-    adata = sc.datasets.moignard15()
+@needs.openpyxl
+def test_moignard15():
+    with warnings.catch_warnings():
+        # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2051
+        warnings.filterwarnings(
+            "ignore",
+            r"datetime\.datetime\.utcnow\(\) is deprecated",
+            category=DeprecationWarning,
+            module="openpyxl",
+        )
+        adata = sc.datasets.moignard15()
     assert adata.shape == (3934, 42)
 
 
 @pytest.mark.internet
-def test_paul15(tmp_dataset_dir):
+def test_paul15():
     sc.datasets.paul15()
 
 
 @pytest.mark.internet
-def test_pbmc3k(tmp_dataset_dir):
+def test_pbmc3k():
     adata = sc.datasets.pbmc3k()
     assert adata.shape == (2700, 32738)
     assert "CD8A" in adata.var_names
 
 
 @pytest.mark.internet
-def test_pbmc3k_processed(tmp_dataset_dir):
-    with pytest.warns(None) as records:
+def test_pbmc3k_processed():
+    with warnings.catch_warnings(record=True) as records:
         adata = sc.datasets.pbmc3k_processed()
     assert adata.shape == (2638, 1838)
     assert adata.raw.shape == (2638, 13714)
@@ -60,19 +70,18 @@ def test_pbmc3k_processed(tmp_dataset_dir):
 
 
 @pytest.mark.internet
-def test_ebi_expression_atlas(tmp_dataset_dir):
+def test_ebi_expression_atlas():
     adata = sc.datasets.ebi_expression_atlas("E-MTAB-4888")
-    assert adata.shape == (2315, 24051)  # This changes sometimes
+    # The shape changes sometimes
+    assert 2261 <= adata.shape[0] <= 2315
+    assert 23899 <= adata.shape[1] <= 24051
 
 
-def test_krumsiek11(tmp_dataset_dir):
+def test_krumsiek11():
     with pytest.warns(UserWarning, match=r"Observation names are not unique"):
         adata = sc.datasets.krumsiek11()
     assert adata.shape == (640, 11)
-    assert all(
-        np.unique(adata.obs["cell_type"])
-        == np.array(["Ery", "Mk", "Mo", "Neu", "progenitor"])
-    )
+    assert set(adata.obs["cell_type"]) == {"Ery", "Mk", "Mo", "Neu", "progenitor"}
 
 
 def test_blobs():
@@ -94,20 +103,33 @@ def test_pbmc68k_reduced():
 
 
 @pytest.mark.internet
-def test_visium_datasets(tmp_dataset_dir, tmpdir):
-    # Tests that reading/ downloading works and is does not have global effects
-    hheart = sc.datasets.visium_sge("V1_Human_Heart")
-    mbrain = sc.datasets.visium_sge("V1_Adult_Mouse_Brain")
-    hheart_again = sc.datasets.visium_sge("V1_Human_Heart")
+def test_visium_datasets():
+    """Test that reading/downloading works and does not have global effects."""
+    with pytest.warns(UserWarning, match=r"Variable names are not unique"):
+        hheart = sc.datasets.visium_sge("V1_Human_Heart")
+    with pytest.warns(UserWarning, match=r"Variable names are not unique"):
+        hheart_again = sc.datasets.visium_sge("V1_Human_Heart")
     assert_adata_equal(hheart, hheart_again)
 
-    # Test that changing the dataset dir doesn't break reading
-    sc.settings.datasetdir = Path(tmpdir)
-    mbrain_again = sc.datasets.visium_sge("V1_Adult_Mouse_Brain")
+
+@pytest.mark.internet
+def test_visium_datasets_dir_change(tmp_path: Path):
+    """Test that changing the dataset dir doesn't break reading."""
+    with pytest.warns(UserWarning, match=r"Variable names are not unique"):
+        mbrain = sc.datasets.visium_sge("V1_Adult_Mouse_Brain")
+    sc.settings.datasetdir = tmp_path
+    with pytest.warns(UserWarning, match=r"Variable names are not unique"):
+        mbrain_again = sc.datasets.visium_sge("V1_Adult_Mouse_Brain")
     assert_adata_equal(mbrain, mbrain_again)
 
+
+@pytest.mark.internet
+def test_visium_datasets_images():
+    """Test that image download works and does not have global effects."""
+
     # Test that downloading tissue image works
-    mbrain = sc.datasets.visium_sge("V1_Adult_Mouse_Brain", include_hires_tiff=True)
+    with pytest.warns(UserWarning, match=r"Variable names are not unique"):
+        mbrain = sc.datasets.visium_sge("V1_Adult_Mouse_Brain", include_hires_tiff=True)
     expected_image_path = sc.settings.datasetdir / "V1_Adult_Mouse_Brain" / "image.tif"
     image_path = Path(
         mbrain.uns["spatial"]["V1_Adult_Mouse_Brain"]["metadata"]["source_image_path"]
@@ -130,3 +152,44 @@ def test_download_failure():
 
     with pytest.raises(HTTPError):
         sc.datasets.ebi_expression_atlas("not_a_real_accession")
+
+
+# These are tested via doctest
+DS_INCLUDED = frozenset({"krumsiek11", "toggleswitch", "pbmc68k_reduced"})
+# These have parameters that affect shape and so on
+DS_DYNAMIC = frozenset({"ebi_expression_atlas"})
+# Additional marks for datasets besides “internet”
+DS_MARKS = defaultdict(list, moignard15=[needs.openpyxl])
+
+
+@pytest.mark.parametrize(
+    "ds_name",
+    [
+        pytest.param(
+            ds,
+            id=ds,
+            marks=[
+                *(() if ds in DS_INCLUDED else [pytest.mark.internet]),
+                *DS_MARKS[ds],
+            ],
+        )
+        # Sort: set iteration order varies between processes (hash
+        # randomization), but pytest-xdist workers must collect the same
+        # tests in the same order.
+        for ds in sorted(set(sc.datasets.__all__) - DS_DYNAMIC)
+    ],
+)
+def test_doc_shape(ds_name):
+    """Check that each dataset function's docstring contains its result's repr."""
+    dataset_fn: Callable[[], AnnData] = getattr(sc.datasets, ds_name)
+    assert dataset_fn.__doc__, "No docstring"
+    docstring = dedent(dataset_fn.__doc__)
+    with warnings.catch_warnings():
+        # Non-unique name warnings are irrelevant here; only the repr matters.
+        warnings.filterwarnings(
+            "ignore",
+            r"(Observation|Variable) names are not unique",
+            category=UserWarning,
+        )
+        dataset = dataset_fn()
+    assert repr(dataset) in docstring
diff --git a/scanpy/tests/test_queries.py b/scanpy/tests/test_queries.py
index 25e1469f51..d25df9d331 100644
--- a/scanpy/tests/test_queries.py
+++ b/scanpy/tests/test_queries.py
@@ -5,9 +5,11 @@
 
 import scanpy as sc
 from testing.scanpy._helpers.data import pbmc68k_reduced
+from testing.scanpy._pytest.marks import needs
 
 
 @pytest.mark.internet
+@needs.gprofiler
 def test_enrich():
     pbmc = pbmc68k_reduced()
     sc.tl.rank_genes_groups(pbmc, "louvain", n_genes=pbmc.shape[1])
@@ -32,6 +34,7 @@ def test_enrich():
 
 
 @pytest.mark.internet
+@needs.pybiomart
 def test_mito_genes():
     pbmc = pbmc68k_reduced()
     mt_genes = sc.queries.mitochondrial_genes("hsapiens")
diff --git a/scanpy/tests/test_sim.py b/scanpy/tests/test_sim.py
index df908d977f..1097051816 100644
--- a/scanpy/tests/test_sim.py
+++ b/scanpy/tests/test_sim.py
@@ -6,7 +6,7 @@
 import scanpy as sc
 
 
-def test_sim_toggleswitch(tmp_write_dir):
+def test_sim_toggleswitch():
     with pytest.warns(UserWarning, match=r"Observation names are not unique"):
         adata = sc.tl.sim("toggleswitch")
         np.allclose(adata.X, sc.datasets.toggleswitch().X, np.finfo(np.float32).eps)
diff --git a/src/testing/scanpy/_pytest/__init__.py b/src/testing/scanpy/_pytest/__init__.py
index 12fa07b72d..0403be07d8 100644
--- a/src/testing/scanpy/_pytest/__init__.py
+++ b/src/testing/scanpy/_pytest/__init__.py
@@ -17,7 +17,10 @@
 
 # Defining it here because it’s autouse.
 @pytest.fixture(autouse=True)
-def _global_test_context(request: pytest.FixtureRequest) -> Generator[None, None, None]:
+def _global_test_context(
+    request: pytest.FixtureRequest,
+    tmp_path_factory: pytest.TempPathFactory,
+) -> Generator[None, None, None]:
     """Switch to agg backend, reset settings, and close all figures at teardown."""
     # make sure seaborn is imported and did its thing
     import seaborn as sns  # noqa: F401
@@ -30,6 +33,8 @@ def _global_test_context(request: pytest.FixtureRequest) -> Generator[None, None
     sc.settings.logfile = sys.stderr
     sc.settings.verbosity = "hint"
     sc.settings.autoshow = True
+    sc.settings.datasetdir = tmp_path_factory.mktemp("scanpy_data")
+    sc.settings.writedir = tmp_path_factory.mktemp("scanpy_write")
 
     if isinstance(request.node, pytest.DoctestItem):
         _modify_doctests(request)
@@ -101,6 +106,10 @@ def _modify_doctests(request: pytest.FixtureRequest) -> None:
     skip_reason: str | None
     if skip_reason := getattr(func, "_doctest_skip_reason", None):
         pytest.skip(reason=skip_reason)
+    if getattr(func, "_doctest_internet", False) and not request.config.getoption(
+        "--internet-tests"
+    ):
+        pytest.skip(reason="need --internet-tests option to run")
 
 
 def pytest_itemcollected(item: pytest.Item) -> None:
diff --git a/src/testing/scanpy/_pytest/marks.py b/src/testing/scanpy/_pytest/marks.py
index b94b079b36..5695009a40 100644
--- a/src/testing/scanpy/_pytest/marks.py
+++ b/src/testing/scanpy/_pytest/marks.py
@@ -37,6 +37,7 @@ class needs(QuietMarkDecorator, Enum):
     gprofiler = "gprofiler-official"
     leidenalg = auto()
     louvain = auto()
+    openpyxl = auto()
     igraph = auto()
     pybiomart = auto()
     skimage = "scikit-image"