From f09b68bf4130a5e8234fde79fa7b74957f9c110b Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Fri, 23 Feb 2024 14:49:16 -0500 Subject: [PATCH] DOC: Automatic flowchart --- docs/source/.gitignore | 1 + docs/source/features/gen_steps.py | 145 +++++++++++++++++- docs/source/features/overview.md | 53 ------- docs/source/v1.6.md.inc | 1 + .../preprocessing/_05_regress_artifact.py | 5 +- .../steps/preprocessing/_06a_run_ica.py | 9 +- .../steps/preprocessing/_06b_run_ssp.py | 3 +- .../steps/preprocessing/_08a_apply_ica.py | 3 +- .../steps/preprocessing/_08b_apply_ssp.py | 3 +- .../steps/preprocessing/_09_ptp_reject.py | 4 +- 10 files changed, 152 insertions(+), 75 deletions(-) delete mode 100644 docs/source/features/overview.md diff --git a/docs/source/.gitignore b/docs/source/.gitignore index 77afb012b..ce1332a62 100644 --- a/docs/source/.gitignore +++ b/docs/source/.gitignore @@ -1 +1,2 @@ features/steps.md +features/overview.md diff --git a/docs/source/features/gen_steps.py b/docs/source/features/gen_steps.py index 86ea6283f..2b3cc3bd7 100755 --- a/docs/source/features/gen_steps.py +++ b/docs/source/features/gen_steps.py @@ -6,20 +6,92 @@ from mne_bids_pipeline._config_utils import _get_step_modules -pre = """\ -# Detailed lis of processing steps +autogen_header = f"""\ +[//]: # (AUTO-GENERATED, TO CHANGE EDIT {'/'.join(Path(__file__).parts[-4:])}) +""" + +steps_pre = f"""\ +{autogen_header} + +# Detailed list of processing steps The following table provides a concise summary of each processing step. The step names can be used to run individual steps or entire groups of steps by passing their name(s) to `mne_bids_pipeline` via the `steps=...` argument. +""" # noqa: E501 + +overview_pre = f"""\ +{autogen_header} + +MNE-BIDS-Pipeline processes your data in a sequential manner, i.e., one step +at a time. The next step is only run after the previous steps have been +successfully completed. 
There are, of course, exceptions; for example, if you +chose not to apply ICA, the respective steps will simply be omitted and we'll +directly move to the subsequent steps. The following flow chart aims to give +you a brief overview of which steps are included in the pipeline, in which +order they are run, and how we group them together. + +!!! info + All intermediate results are saved to disk for later + inspection, and an **extensive report** is generated. + +!!! info + Analyses are conducted on individual (per-subject) as well as group level. """ +icon_map = { + "Filesystem initialization and dataset inspection": ":open_file_folder:", + "Preprocessing": ":broom:", + "Sensor-space analysis": ":satellite:", + "Source-space analysis": ":brain:", + "FreeSurfer-related processing": ":person_surfing:", +} +out_dir = Path(__file__).parent + print("Generating steps …") step_modules = _get_step_modules() +char_start = ord("A") + +# In principle we could try to sort this out based on naming, but for now let's just +# set our hierarchy manually and update it when we move files around since that's easy +# (and rare) enough to do. 
+manual_order = { + "Preprocessing": ( + ("01", "02"), + ("02", "03"), + ("03", "04"), + ("04", "05"), + ("05", "06a"), + ("05", "06b"), + ("05", "07"), + # technically we could have the raw data flow here, but it doesn't really help + # ("05", "08a"), + # ("05", "08b"), + ("06a", "08a"), + ("07", "08a"), + # Force the artifact-fitting and epoching steps on the same level, in this order + """\ + subgraph Z[" "] + direction LR + B06a + B07 + B06b + end + style Z fill:#0000,stroke-width:0px +""", + ("06b", "08b"), + ("07", "08b"), + ("08a", "09"), + ("08b", "09"), + ), +} # Construct the lines of steps.md -lines = [pre] +lines = [steps_pre] +overview_lines = [overview_pre] +used_titles = set() for di, (dir_, modules) in enumerate(step_modules.items(), 1): + # Steps if dir_ == "all": continue # this is an alias dir_module = importlib.import_module(f"mne_bids_pipeline.steps.{dir_}") @@ -29,7 +101,9 @@ dir_body = dir_body[1].strip() else: dir_body = "" - lines.append(f"## {di}. {dir_header}\n") + icon = icon_map[dir_header] + module_header = f"{di}. {icon} {dir_header}" + lines.append(f"## {module_header}\n") if dir_body: lines.append(f"{dir_body}\n") lines.append("| Step name | Description |") @@ -42,5 +116,64 @@ step_title = module.__doc__.split("\n")[0] lines.append(f"`{step_name}` | {step_title} |") lines.append("") -with open(Path(__file__).parent / "steps.md", "w") as fid: - fid.write("\n".join(lines)) + + # Overview + overview_lines.append( + f"""\ +## {module_header} + +```mermaid +flowchart TD""" + ) + chr_pre = chr(char_start + di - 1) # A, B, C, ... + start = None + prev_idx = None + title_map = {} + for mi, module in enumerate(modules, 1): + step_title = module.__doc__.split("\n")[0].rstrip(".") + idx = module.__name__.split(".")[-1].split("_")[1] # 01, 05a, etc. 
+ # Need to quote the title to deal with parens, and sanitize quotes + step_title = step_title.replace('"', "'") + assert step_title not in used_titles, f"Redundant title: {step_title}" + used_titles.add(step_title) + this_block = f'{chr_pre}{idx}["{step_title}"]' + # special case: manual order + title_map[idx] = step_title + if dir_header in manual_order: + continue + if mi == 1: + start = this_block + assert prev_idx is None + continue + if start is not None: + assert mi == 2, mi + overview_lines.append(f" {start} --> {this_block}") + start = None + else: + overview_lines.append(f" {chr_pre}{prev_idx} --> {this_block}") + prev_idx = idx + if dir_header in manual_order: + mapped = set() + for a_b in manual_order[dir_header]: + if isinstance(a_b, str): # insert directly + overview_lines.append(a_b) + continue + assert isinstance(a_b, tuple), type(a_b) + a_b = list(a_b) # allow modification + for ii, idx in enumerate(a_b): + assert idx in title_map, (dir_header, sorted(title_map)) + if idx not in mapped: + mapped.add(idx) + a_b[ii] = f'{idx}["{title_map[idx]}"]' + overview_lines.append(f" {chr_pre}{a_b[0]} --> {chr_pre}{a_b[1]}") + all_steps = set( + sum( + [a_b for a_b in manual_order[dir_header] if not isinstance(a_b, str)], + (), + ) + ) + assert mapped == all_steps, all_steps.symmetric_difference(mapped) + overview_lines.append("```\n") + +(out_dir / "steps.md").write_text("\n".join(lines), encoding="utf8") +(out_dir / "overview.md").write_text("\n".join(overview_lines), encoding="utf8") diff --git a/docs/source/features/overview.md b/docs/source/features/overview.md deleted file mode 100644 index 9fe044038..000000000 --- a/docs/source/features/overview.md +++ /dev/null @@ -1,53 +0,0 @@ -MNE-BIDS-Pipeline processes your data in a sequential manner, i.e., one step -at a time. The next step is only run after the previous steps have been -successfully completed. 
There are, of course, exceptions; for example, if you -chose not to apply ICA, the respective steps will simply be omitted and we'll -directly move to the subsequent steps. The following flow chart aims to give -you a brief overview of which steps are included in the pipeline, in which -order they are run, and how we group them together. - -!!! info - All intermediate results are saved to disk for later - inspection, and an **extensive report** is generated. - -!!! info - Analyses are conducted on individual (per-subject) as well as group level. - - -## :open_file_folder: Filesystem initialization and dataset inspection -```mermaid -flowchart TD - A1[initialize the target directories] --> A2[locate empty-room recordings] -``` - -## :broom: Preprocessing -```mermaid - flowchart TD - B1[Noisy & flat channel detection] --> B2[Maxwell filter] - B2 --> B3[Frequency filter] - B3 --> B4[Epoch creation] - B4 --> B5[SSP or ICA fitting] - B5 --> B6[Artifact removal via SSP or ICA] - B6 --> B7[Amplitude-based epoch rejection] -``` - -## :satellite: Sensor-space processing -```mermaid - flowchart TD - C1[ERP / ERF calculation] --> C2[MVPA: full epochs] - C2 --> C3[MVPA: time-by-time decoding] - C3 --> C4[Time-frequency decomposition] - C4 --> C5[MVPA: CSP] - C5 --> C6[Noise covariance estimation] - C6 --> C7[Grand average] -``` - -## :brain: Source-space processing -```mermaid - flowchart TD - D1[BEM surface creation] --> D2[BEM solution] - D2 --> D3[Source space creation] - D3 --> D4[Forward model creation] - D4 --> D5[Inverse solution] - D5 --> D6[Grand average] -``` diff --git a/docs/source/v1.6.md.inc b/docs/source/v1.6.md.inc index 3abc9a081..5ef48dcb8 100644 --- a/docs/source/v1.6.md.inc +++ b/docs/source/v1.6.md.inc @@ -36,3 +36,4 @@ - The package build backend has been switched from `setuptools` to `hatchling`. 
(#825 by @hoechenberger) - Code formatting now uses `ruff format` instead of `black` (#834, #838 by @larsoner) - Code caching is now tested using GitHub Actions (#836 by @larsoner) +- Steps in the documentation are now automatically parsed into flowcharts (#859 by @larsoner) diff --git a/mne_bids_pipeline/steps/preprocessing/_05_regress_artifact.py b/mne_bids_pipeline/steps/preprocessing/_05_regress_artifact.py index cb31df04d..9fce737cc 100644 --- a/mne_bids_pipeline/steps/preprocessing/_05_regress_artifact.py +++ b/mne_bids_pipeline/steps/preprocessing/_05_regress_artifact.py @@ -1,7 +1,4 @@ -"""Run Signal Subspace Projections (SSP) for artifact correction. - -These are often also referred to as PCA vectors. -""" +"""Temporal regression for artifact removal.""" from types import SimpleNamespace from typing import Optional diff --git a/mne_bids_pipeline/steps/preprocessing/_06a_run_ica.py b/mne_bids_pipeline/steps/preprocessing/_06a_run_ica.py index 7bfef3c56..fb6f1b089 100644 --- a/mne_bids_pipeline/steps/preprocessing/_06a_run_ica.py +++ b/mne_bids_pipeline/steps/preprocessing/_06a_run_ica.py @@ -1,14 +1,13 @@ -"""Run Independent Component Analysis (ICA) for artifact correction. +"""Fit ICA. -This fits ICA on epoched data filtered with 1 Hz highpass, -for this purpose only using fastICA. Separate ICAs are fitted and stored for -MEG and EEG data. +This fits Independent Component Analysis (ICA) on raw data filtered with 1 Hz highpass, +temporarily creating task-related epochs. Before performing ICA, we reject epochs based on peak-to-peak amplitude above the 'ica_reject' to filter massive non-biological artifacts. To actually remove designated ICA components from your data, you will have to -run 05a-apply_ica.py. +run the apply_ica step. 
""" from collections.abc import Iterable diff --git a/mne_bids_pipeline/steps/preprocessing/_06b_run_ssp.py b/mne_bids_pipeline/steps/preprocessing/_06b_run_ssp.py index 7ec75ef91..1580836ca 100644 --- a/mne_bids_pipeline/steps/preprocessing/_06b_run_ssp.py +++ b/mne_bids_pipeline/steps/preprocessing/_06b_run_ssp.py @@ -1,5 +1,6 @@ -"""Run Signal Subspace Projections (SSP) for artifact correction. +"""Compute SSP. +Signal subspace projections (SSP) vectors are computed from EOG and ECG signals. These are often also referred to as PCA vectors. """ diff --git a/mne_bids_pipeline/steps/preprocessing/_08a_apply_ica.py b/mne_bids_pipeline/steps/preprocessing/_08a_apply_ica.py index 8fcc8141c..fba8099f2 100644 --- a/mne_bids_pipeline/steps/preprocessing/_08a_apply_ica.py +++ b/mne_bids_pipeline/steps/preprocessing/_08a_apply_ica.py @@ -1,4 +1,4 @@ -"""Apply ICA and obtain the cleaned epochs and raw data. +"""Apply ICA. Blinks and ECG artifacts are automatically detected and the corresponding ICA components are removed from the data. @@ -8,7 +8,6 @@ make sure you did not re-run the ICA in the meantime. Otherwise (especially if the random state was not set, or you used a different machine, the component order might differ). - """ from types import SimpleNamespace diff --git a/mne_bids_pipeline/steps/preprocessing/_08b_apply_ssp.py b/mne_bids_pipeline/steps/preprocessing/_08b_apply_ssp.py index e6fad4b8f..3f6d6576a 100644 --- a/mne_bids_pipeline/steps/preprocessing/_08b_apply_ssp.py +++ b/mne_bids_pipeline/steps/preprocessing/_08b_apply_ssp.py @@ -1,8 +1,7 @@ -"""Apply SSP projections and obtain the cleaned epochs and raw data. +"""Apply SSP. Blinks and ECG artifacts are automatically detected and the corresponding SSP projections components are removed from the data. 
- """ from types import SimpleNamespace diff --git a/mne_bids_pipeline/steps/preprocessing/_09_ptp_reject.py b/mne_bids_pipeline/steps/preprocessing/_09_ptp_reject.py index d08469b3c..434b235ec 100644 --- a/mne_bids_pipeline/steps/preprocessing/_09_ptp_reject.py +++ b/mne_bids_pipeline/steps/preprocessing/_09_ptp_reject.py @@ -1,6 +1,6 @@ -"""Remove epochs based on peak-to-peak (PTP) amplitudes. +"""Remove epochs based on PTP amplitudes. -Epochs containing peak-to-peak above the thresholds defined +Epochs containing peak-to-peak (PTP) above the thresholds defined in the 'reject' parameter are removed from the data. This step will drop epochs containing non-biological artifacts