ME-ICA · tsalo · Jan 22, 2021 · Jan 21, 2021 · Jan 21, 2021 · Jan 21, 2021
diff --git a/docs/approach.rst b/docs/approach.rst
@@ -201,9 +201,9 @@ Here we can see time series for some example components (we don't really care ab
 
 These components are subjected to component selection, the specifics of which
 vary according to algorithm.
-Specifically, ``tedana`` offers two different approaches that perform this step.
+Specifically, ``tedana`` offers three different approaches that perform this step.
 
-The simplest approach (the default ``mdl``, ``aic`` and ``kic`` options for
+The recommended approach (the default ``mdl`` option, along with the ``aic`` and ``kic`` options, for
 ``--tedpca``) is based on a moving average (stationary Gaussian) process
 proposed by `Li et al (2007)`_ and used primarily in the Group ICA of fMRI Toolbox (GIFT).
 A moving average process is the output of a linear system (which, in this case, is
@@ -229,8 +229,21 @@ to select the PCA components based on three widely-used model selection criteria
     option ``mdl`` might not yield perfect results on your data. We suggest you explore the ``kic``
     and ``aic`` options if running ``tedana`` with ``mdl`` returns less components than expected.
 
-In addition to the moving average process-based options described above, we also support a
-decision tree-based selection method (similar to the one in the :ref:`TEDICA` section below).
+The simplest approach uses a user-supplied threshold applied to the cumulative variance explained
+by the PCA.
+In this approach, the user provides a value to ``--tedpca`` between 0 and 1.
+That value corresponds to the percent of variance that must be explained by the components.
+For example, if a value of 0.9 is provided, then PCA components
+(ordered by decreasing variance explained)
+cumulatively explaining up to 90% of the variance will be retained.
+Components explaining more than that threshold
+(except for the component that crosses the threshold)
+will be excluded.
+
+In addition to the moving average process-based options and the variance explained threshold
+described above,
+we also support a decision tree-based selection method
+(similar to the one in the :ref:`TEDICA` section below).
 This method involves applying a decision tree to identify and discard PCA components which,
 in addition to not explaining much variance, are also not significantly TE-dependent (i.e.,
 have low Kappa) or TE-independent (i.e., have low Rho).

diff --git a/tedana/decomposition/pca.py b/tedana/decomposition/pca.py
@@ -3,6 +3,7 @@
 """
 import logging
 import os.path as op
+from numbers import Number
 
 import numpy as np
 import pandas as pd
@@ -73,11 +74,14 @@ def tedpca(data_cat, data_oc, combmode, mask, adaptive_mask, t2sG,
         Reference image to dictate how outputs are saved to disk
     tes : :obj:`list`
         List of echo times associated with `data_cat`, in milliseconds
-    algorithm : {'kundu', 'kundu-stabilize', 'mdl', 'aic', 'kic'}, optional
-        Method with which to select components in TEDPCA. Default is 'mdl'. PCA
+    algorithm : {'kundu', 'kundu-stabilize', 'mdl', 'aic', 'kic', float}, optional
+        Method with which to select components in TEDPCA. PCA
         decomposition with the mdl, kic and aic options are based on a Moving Average
-        (stationary Gaussian) process and are ordered from most to least aggresive.
-        See (Li et al., 2007).
+        (stationary Gaussian) process and are ordered from most to least aggressive
+        (see Li et al., 2007).
+        If a float is provided, then it is assumed to represent percentage of variance
+        explained (0-1) to retain from PCA.
+        Default is 'mdl'.
     kdaw : :obj:`float`, optional
         Dimensionality augmentation weight for Kappa calculations. Must be a
         non-negative float, or -1 (a special value). Default is 10.
@@ -90,6 +94,7 @@ def tedpca(data_cat, data_oc, combmode, mask, adaptive_mask, t2sG,
         Whether to output files from fitmodels_direct or not. Default: False
     low_mem : :obj:`bool`, optional
         Whether to use incremental PCA (for low-memory systems) or not.
+        This is only compatible with the "kundu" or "kundu-stabilize" algorithms.
         Default: False
 
     Returns
@@ -166,6 +171,10 @@ def tedpca(data_cat, data_oc, combmode, mask, adaptive_mask, t2sG,
                     "connectivity mapping using multiecho fMRI. Proceedings "
                     "of the National Academy of Sciences, 110(40), "
                     "16187-16192.")
+    elif isinstance(algorithm, Number):
+        alg_str = (
+            "in which the number of components was determined based on a "
+            "variance explained threshold")
     else:
         alg_str = ("based on the PCA component estimation with a Moving Average"
                    "(stationary Gaussian) process (Li et al., 2007)")
@@ -191,6 +200,14 @@ def tedpca(data_cat, data_oc, combmode, mask, adaptive_mask, t2sG,
         mask_img = io.new_nii_like(ref_img, mask.astype(int))
         voxel_comp_weights, varex, varex_norm, comp_ts = ma_pca.ma_pca(
             data_img, mask_img, algorithm)
+    elif isinstance(algorithm, Number):
+        ppca = PCA(copy=False, n_components=algorithm, svd_solver="full")
+        ppca.fit(data_z)
+        comp_ts = ppca.components_.T
+        varex = ppca.explained_variance_
+        voxel_comp_weights = np.dot(np.dot(data_z, comp_ts),
+                                    np.diag(1. / varex))
+        varex_norm = varex / varex.sum()
     elif low_mem:
         voxel_comp_weights, varex, comp_ts = low_mem_pca(data_z)
         varex_norm = varex / varex.sum()
@@ -227,9 +244,10 @@ def tedpca(data_cat, data_oc, combmode, mask, adaptive_mask, t2sG,
         comptable = kundu_tedpca(comptable, n_echos, kdaw, rdaw, stabilize=False)
     elif algorithm == 'kundu-stabilize':
         comptable = kundu_tedpca(comptable, n_echos, kdaw, rdaw, stabilize=True)
-    elif algorithm in ['mdl', 'aic', 'kic']:
+    else:
+        alg_str = "variance explained-based" if isinstance(algorithm, Number) else algorithm
         LGR.info('Selected {0} components with {1} dimensionality '
-                 'detection'.format(comptable.shape[0], algorithm))
+                 'detection'.format(comptable.shape[0], alg_str))
         comptable['classification'] = 'accepted'
         comptable['rationale'] = ''
 

diff --git a/tedana/tests/data/nih_five_echo_outputs_verbose.txt b/tedana/tests/data/nih_five_echo_outputs_verbose.txt
@@ -96,22 +96,3 @@ figures/comp_048.png
 figures/comp_049.png
 figures/comp_050.png
 figures/comp_051.png
-figures/comp_052.png
-figures/comp_053.png
-figures/comp_054.png
-figures/comp_055.png
-figures/comp_056.png
-figures/comp_057.png
-figures/comp_058.png
-figures/comp_059.png
-figures/comp_060.png
-figures/comp_061.png
-figures/comp_062.png
-figures/comp_063.png
-figures/comp_064.png
-figures/comp_065.png
-figures/comp_066.png
-figures/comp_067.png
-figures/comp_068.png
-figures/comp_069.png
-figures/comp_070.png
diff --git a/tedana/tests/test_integration.py b/tedana/tests/test_integration.py
@@ -92,7 +92,7 @@ def test_integration_five_echo(skip_integration):
         data=datalist,
         tes=echo_times,
         out_dir=out_dir,
-        tedpca='aic',
+        tedpca=0.95,
         fittype='curvefit',
         tedort=True,
         verbose=True)

diff --git a/tedana/tests/test_workflows_parser_utils.py b/tedana/tests/test_workflows_parser_utils.py
@@ -0,0 +1,17 @@
+"""Test workflow parser utility functions."""
+import argparse
+import pytest
+
+from tedana.workflows.parser_utils import string_or_float
+
+
+def test_string_or_float():
+    """Test the string_or_float function."""
+    with pytest.raises(argparse.ArgumentTypeError):
+        string_or_float("hello")
+
+    with pytest.raises(argparse.ArgumentTypeError):
+        string_or_float(1.5)
+
+    assert string_or_float(0.95) == 0.95
+    assert string_or_float("mdl") == "mdl"
diff --git a/tedana/workflows/parser_utils.py b/tedana/workflows/parser_utils.py
@@ -4,6 +4,26 @@
 import os.path as op
 import logging
 
+import argparse
+
+
+def string_or_float(string):
+    """Check if argument is a float or one of a list of strings."""
+    valid_options = ("mdl", "aic", "kic", "kundu", "kundu-stabilize")
+    if string not in valid_options:
+        try:
+            string = float(string)
+        except ValueError:
+            msg = "Argument must be a float or one of: {}".format(
+                ", ".join(valid_options)
+            )
+            raise argparse.ArgumentTypeError(msg)
+
+        if not (0 <= string <= 1):
+            msg = "Argument must be between 0 and 1."
+            raise argparse.ArgumentTypeError(msg)
+    return string
+
 
 def is_valid_file(parser, arg):
     """

diff --git a/tedana/workflows/tedana.py b/tedana/workflows/tedana.py
@@ -8,6 +8,7 @@
 import logging
 import datetime
 from glob import glob
+from numbers import Number
 
 import argparse
 import numpy as np
@@ -20,7 +21,7 @@
                     reporting, selection, utils)
 import tedana.gscontrol as gsc
 from tedana.stats import computefeats2
-from tedana.workflows.parser_utils import is_valid_file, ContextFilter
+from tedana.workflows.parser_utils import is_valid_file, string_or_float, ContextFilter
 
 LGR = logging.getLogger(__name__)
 RepLGR = logging.getLogger('REPORT')
@@ -98,12 +99,15 @@ def _get_parser():
                           default='t2s')
     optional.add_argument('--tedpca',
                           dest='tedpca',
+                          type=string_or_float,
                           help=('Method with which to select components in TEDPCA. '
                                 'PCA decomposition with the mdl, kic and aic options '
                                 'is based on a Moving Average (stationary Gaussian) '
-                                'process and are ordered from most to least aggresive. '
-                                'Default=\'mdl\'.'),
-                          choices=['kundu', 'kundu-stabilize', 'mdl', 'aic', 'kic'],
+                                'process and are ordered from most to least aggressive. '
+                                "Users may also provide a float from 0 to 1, "
+                                "in which case components will be selected based on the "
+                                "cumulative variance explained. "
+                                "Default='mdl'."),
                           default='mdl')
     optional.add_argument('--seed',
                           dest='fixed_seed',
@@ -262,8 +266,11 @@ def tedana_workflow(data, tes, out_dir='.', mask=None,
         Default is 'loglin'.
     combmode : {'t2s'}, optional
         Combination scheme for TEs: 't2s' (Posse 1999, default).
-    tedpca : {'kundu', 'kundu-stabilize', 'mdl', 'aic', 'kic'}, optional
-        Method with which to select components in TEDPCA. Default is 'mdl'.
+    tedpca : {'mdl', 'aic', 'kic', 'kundu', 'kundu-stabilize', float}, optional
+        Method with which to select components in TEDPCA.
+        If a float is provided, then it is assumed to represent percentage of variance
+        explained (0-1) to retain from PCA.
+        Default is 'mdl'.
     tedort : :obj:`bool`, optional
         Orthogonalize rejected components w.r.t. accepted ones prior to
         denoising. Default is False.
@@ -387,6 +394,10 @@ def tedana_workflow(data, tes, out_dir='.', mask=None,
     if not isinstance(gscontrol, list):
         gscontrol = [gscontrol]
 
+    # Check value of tedpca *if* it is a float
+    if isinstance(tedpca, Number) and not (0 <= tedpca <= 1):
+        raise ValueError("If a float is provided, tedpca must be between 0 and 1.")
+
     LGR.info('Loading input data: {}'.format([f for f in data]))
     catd, ref_img = io.load_data(data, n_echos=n_echos)
     n_samp, n_echos, n_vols = catd.shape