From f2cdb4ed756996609a0467ac22279605a2346edb Mon Sep 17 00:00:00 2001
From: Circle
Date: Wed, 1 Nov 2023 12:26:51 +1100
Subject: [PATCH] Incorporate the review comments on adding robustica

---
 .idea/inspectionProfiles/profiles_settings.xml |   6 +
 .idea/vcs.xml                                  |   6 +
 .idea/workspace.xml                            |  56 ++
 tedana/config.py                               |   4 +
 tedana/decomposition/ica.py                    | 191 ++++-
 tedana/resources/references.bib                |  10 +
 tedana/tests/test_integrity_robustica.py       | 667 ++++++++++++++++++
 tedana/workflows/tedana.py                     | 167 +++--
 8 files changed, 1006 insertions(+), 101 deletions(-)
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/vcs.xml
 create mode 100644 .idea/workspace.xml
 create mode 100644 tedana/config.py
 create mode 100644 tedana/tests/test_integrity_robustica.py

diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 000000000..105ce2da2
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 000000000..94a25f7f4
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 000000000..f8901d75c
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,56 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+      1698800913661
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tedana/config.py b/tedana/config.py
new file mode 100644
index 000000000..9bca5b873
--- /dev/null
+++ b/tedana/config.py
@@ -0,0 +1,4 @@
+DEFAULT_ICA_METHOD = "robustica"
+DEFAULT_N_ROBUST_RUNS = 30
+DEFAULT_N_MAX_ITER = 500
+DEFAULT_N_MAX_RESTART = 10
\ No newline at end of file
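The new tedana/config.py gives the ICA knobs a single home: the parser defaults in tedana/workflows/tedana.py and the keyword defaults in tedana/decomposition/ica.py below both import these constants, so the CLI and the Python API cannot drift apart. A minimal sketch of the pattern, where the DEFAULT_* names are real (from tedana/config.py above) but the wrapper function is hypothetical:

    # Sketch: one source of truth for defaults shared by the CLI and the API.
    from tedana.config import DEFAULT_ICA_METHOD, DEFAULT_N_ROBUST_RUNS


    def run_ica(ica_method=DEFAULT_ICA_METHOD, n_robust_runs=DEFAULT_N_ROBUST_RUNS):
        # Editing tedana/config.py changes the default everywhere at once.
        print(f"method={ica_method}, robust runs={n_robust_runs}")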
diff --git a/tedana/decomposition/ica.py b/tedana/decomposition/ica.py
index 5a233b42e..abb9d2c82 100644
--- a/tedana/decomposition/ica.py
+++ b/tedana/decomposition/ica.py
@@ -1,23 +1,34 @@
-"""
-ICA and related signal decomposition methods for tedana
-"""
+"""ICA and related signal decomposition methods for tedana."""
 import logging
 import warnings
-import sys
-
 import numpy as np
+from robustica import RobustICA
 from scipy import stats
 from sklearn.decomposition import FastICA
-from robustica import RobustICA ####BTBTBT
+
+from tedana.config import (
+    DEFAULT_ICA_METHOD,
+    DEFAULT_N_MAX_ITER,
+    DEFAULT_N_MAX_RESTART,
+    DEFAULT_N_ROBUST_RUNS,
+)

 LGR = logging.getLogger("GENERAL")
 RepLGR = logging.getLogger("REPORT")


-def tedica(data, n_components, fixed_seed, ica_method="robustica", n_robust_runs=30, maxit=500, maxrestart=10): ####BTBTBTB
+def tedica(
+    data,
+    n_components,
+    fixed_seed,
+    ica_method=DEFAULT_ICA_METHOD,
+    n_robust_runs=DEFAULT_N_ROBUST_RUNS,
+    maxit=DEFAULT_N_MAX_ITER,
+    maxrestart=DEFAULT_N_MAX_RESTART,
+):
     """
-    Perform ICA on `data` and returns mixing matrix
+    Perform ICA on `data` with the user-selected ICA method and return the mixing matrix.

     Parameters
     ----------
@@ -25,9 +36,13 @@ def tedica(data, n_components, fixed_seed, ica_method="robustica", n_robust_runs
         Dimensionally reduced optimally combined functional data, where `S` is
         samples and `T` is time
     n_components : :obj:`int`
-        Number of components retained from PCA decomposition
+        Number of components retained from PCA decomposition.
     fixed_seed : :obj:`int`
-        Seed for ensuring reproducibility of ICA results
+        Seed for ensuring reproducibility of ICA results.
+    ica_method : :obj:`str`, optional
+        Selected ICA method; can be 'fastica' or 'robustica'. Default is 'robustica'.
+    n_robust_runs : :obj:`int`, optional
+        Selected number of robust runs when robustica is used. Default is 30.
     maxit : :obj:`int`, optional
         Maximum number of iterations for ICA. Default is 500.
     maxrestart : :obj:`int`, optional
         Maximum number of attempted decompositions to perform with different
         random seeds. ICA will stop running if there is convergence prior to
         reaching this limit. Default is 10.

     Returns
     -------
     mmix : (T x C) :obj:`numpy.ndarray`
         Z-scored mixing matrix for converting input data to component space,
         where `C` is components and `T` is the same as in `data`
     fixed_seed : :obj:`int`
         Random seed from final decomposition.

-    Notes
-    -----
-    Uses `sklearn` implementation of FastICA for decomposition
     """
     warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
     RepLGR.info(
         "Independent component analysis was then used to "
         "decompose the dimensionally reduced dataset."
     )

-    if ica_method=='robustica':
-        mmix, Iq = r_ica(data, n_components, n_robust_runs, maxit)
-        fixed_seed=-99999
-    elif ica_method=='fastica':
-        mmix, fixed_seed=f_ica(data, n_components, fixed_seed, maxit=500, maxrestart=10)
-        Iq = 0
+    ica_method = ica_method.lower()
+
+    if ica_method == "robustica":
+        mmix, fixed_seed = r_ica(
+            data,
+            n_components=n_components,
+            fixed_seed=fixed_seed,
+            n_robust_runs=n_robust_runs,
+            max_it=maxit,
+        )
+    elif ica_method == "fastica":
+        mmix, fixed_seed = f_ica(
+            data,
+            n_components=n_components,
+            fixed_seed=fixed_seed,
+            maxit=maxit,
+            maxrestart=maxrestart,
+        )
     else:
-        LGR.warning("The selected ICA method is invalid!")
-        sys.exit()
-
-    return mmix, fixed_seed
+        raise ValueError(f"The selected ICA method '{ica_method}' is invalid!")
+
+    return mmix, fixed_seed


-def r_ica(data, n_components, n_robust_runs, max_it): ####BTBTBTB:
-
-    if n_robust_runs>100:
-        LGR.warning("The selected n_robust_runs is a very big number!")
-
-    RepLGR.info(
-        "RobustICA package was used for ICA decomposition \\citep{Anglada2022}."
-    )
-    rica0 = RobustICA(n_components=n_components, robust_runs=n_robust_runs, whiten='arbitrary-variance',max_iter= max_it,
-                      robust_dimreduce=False, fun='logcosh')
-    S0, mmix = rica0.fit_transform(data)
-
-    q0 = rica0.evaluate_clustering(rica0.S_all, rica0.clustering.labels_, rica0.signs_, rica0.orientation_)
-
-    Iq0 = np.array(np.mean(q0.iq))
-
+def r_ica(data, n_components, fixed_seed, n_robust_runs, max_it):
+    """
+    Perform robust ICA on `data` by running FastICA multiple times (n_robust_runs)
+    and return the mixing matrix.
+
+    Parameters
+    ----------
+    data : (S x T) :obj:`numpy.ndarray`
+        Dimensionally reduced optimally combined functional data, where `S` is
+        samples and `T` is time
+    n_components : :obj:`int`
+        Number of components retained from PCA decomposition.
+    fixed_seed : :obj:`int`
+        Seed for ensuring reproducibility of ICA results.
+    n_robust_runs : :obj:`int`
+        Selected number of robust runs when robustica is used. Default is 30.
+    max_it : :obj:`int`, optional
+        Maximum number of iterations for ICA. Default is 500.
+
+    Returns
+    -------
+    mmix : (T x C) :obj:`numpy.ndarray`
+        Z-scored mixing matrix for converting input data to component space,
+        where `C` is components and `T` is the same as in `data`
+    fixed_seed : :obj:`int`
+        Random seed from final decomposition.
+
+    """
+    if n_robust_runs > 200:
+        LGR.warning(
+            "The selected n_robust_runs is a very big number! The process will take a long time!"
+        )
+
+    RepLGR.info("RobustICA package was used for ICA decomposition \\citep{Anglada2022}.")
+
+    if fixed_seed == -1:
+        fixed_seed = np.random.randint(low=1, high=1000)
+
+    try:
+        rica = RobustICA(
+            n_components=n_components,
+            robust_runs=n_robust_runs,
+            whiten="arbitrary-variance",
+            max_iter=max_it,
+            random_state=fixed_seed,
+            robust_dimreduce=False,
+            fun="logcosh",
+            robust_method="DBSCAN",
+        )
+
+        S, mmix = rica.fit_transform(data)
+        q = rica.evaluate_clustering(
+            rica.S_all, rica.clustering.labels_, rica.signs_, rica.orientation_
+        )
+    except Exception:
+        # Fall back to agglomerative clustering when DBSCAN-based clustering fails
+        rica = RobustICA(
+            n_components=n_components,
+            robust_runs=n_robust_runs,
+            whiten="arbitrary-variance",
+            max_iter=max_it,
+            random_state=fixed_seed,
+            robust_dimreduce=False,
+            fun="logcosh",
+            robust_method="AgglomerativeClustering",
+        )
+
+        S, mmix = rica.fit_transform(data)
+        q = rica.evaluate_clustering(
+            rica.S_all, rica.clustering.labels_, rica.signs_, rica.orientation_
+        )
+
+    iq = np.array(np.mean(q[q["cluster_id"] >= 0].iq))  # The cluster labeled -1 is noise
+
+    if iq < 0.6:
+        LGR.warning(
+            "The resultant mean Index Quality is low. It is recommended to rerun the "
+            "process with a different seed."
+        )
+
+    mmix = mmix[:, q["cluster_id"] >= 0]
     mmix = stats.zscore(mmix, axis=0)

     LGR.info(
-        "RobustICA with {0} robust runs was used \n"
-        "The mean index quality is {1}".format(n_robust_runs, Iq0)
+        f"RobustICA with {n_robust_runs} robust runs and seed {fixed_seed} was used. "
+        f"The mean Index Quality is {iq}."
     )
-    return mmix, Iq0
+    return mmix, fixed_seed


 def f_ica(data, n_components, fixed_seed, maxit, maxrestart):
+    """
+    Perform FastICA on `data` and return the mixing matrix.
+
+    Parameters
+    ----------
+    data : (S x T) :obj:`numpy.ndarray`
+        Dimensionally reduced optimally combined functional data, where `S` is
+        samples and `T` is time
+    n_components : :obj:`int`
+        Number of components retained from PCA decomposition
+    fixed_seed : :obj:`int`
+        Seed for ensuring reproducibility of ICA results
+    maxit : :obj:`int`, optional
+        Maximum number of iterations for ICA. Default is 500.
+    maxrestart : :obj:`int`, optional
+        Maximum number of attempted decompositions to perform with different
+        random seeds. ICA will stop running if there is convergence prior to
+        reaching this limit. Default is 10.
+
+    Returns
+    -------
+    mmix : (T x C) :obj:`numpy.ndarray`
+        Z-scored mixing matrix for converting input data to component space,
+        where `C` is components and `T` is the same as in `data`
+    fixed_seed : :obj:`int`
+        Random seed from final decomposition.
+
+    Notes
+    -----
+    Uses `sklearn` implementation of FastICA for decomposition
+    """
     if fixed_seed == -1:
         fixed_seed = np.random.randint(low=1, high=1000)
@@ -135,4 +248,4 @@ def f_ica(data, n_components, fixed_seed, maxit, maxrestart):
     mmix = ica.mixing_
     mmix = stats.zscore(mmix, axis=0)

-    return mmix, fixed_seed
+    return mmix, fixed_seed
\ No newline at end of file
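With the ica.py changes in place, the decomposition entry point can be exercised directly. A minimal usage sketch on synthetic data; only the tedica signature comes from the patch, while the array shape, component count, and run count are illustrative:

    import numpy as np

    from tedana.decomposition import tedica

    # S x T: 1000 hypothetical samples, 150 time points
    data = np.random.default_rng(0).standard_normal((1000, 150))
    mmix, seed_used = tedica(
        data,
        n_components=10,
        fixed_seed=42,
        ica_method="robustica",
        n_robust_runs=30,
    )
    # mmix is T x C and z-scored; with robustica, C can be smaller than
    # n_components because components in the noise cluster are dropped.
    print(mmix.shape, seed_used)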
diff --git a/tedana/resources/references.bib b/tedana/resources/references.bib
index 006f2f779..180e0a827 100644
--- a/tedana/resources/references.bib
+++ b/tedana/resources/references.bib
@@ -313,3 +313,13 @@ @Article{Hunter:2007
   doi = {10.1109/MCSE.2007.55},
   year = 2007
 }
+
+@Article{Anglada2022,
+  Author = {Anglada-Girotto, Miquel and Miravet-Verde, Samuel and Serrano, Luis and Head, Sarah},
+  Title = {robustica: customizable robust independent component analysis},
+  Journal = {BMC Bioinformatics},
+  Volume = {23},
+  Number = {519},
+  doi = {10.1186/s12859-022-05043-9},
+  year = 2022
+}
\ No newline at end of file
diff --git a/tedana/tests/test_integrity_robustica.py b/tedana/tests/test_integrity_robustica.py
new file mode 100644
index 000000000..88646d51f
--- /dev/null
+++ b/tedana/tests/test_integrity_robustica.py
@@ -0,0 +1,667 @@
+"""Integration tests for "real" data."""
+
+import glob
+import json
+import logging
+import os
+import os.path as op
+import re
+import shutil
+import subprocess
+import tarfile
+from datetime import datetime
+from gzip import GzipFile
+from io import BytesIO
+
+import pandas as pd
+import pytest
+import requests
+from pkg_resources import resource_filename
+
+from tedana.io import InputHarvester
+from tedana.workflows import t2smap as t2smap_cli
+from tedana.workflows import tedana as tedana_cli
+from tedana.workflows.ica_reclassify import ica_reclassify_workflow
+
+# Need to see if a no BOLD warning occurred
+LOGGER = logging.getLogger(__name__)
+# Added a testing logger to output whether or not testing data were downloaded
+TestLGR = logging.getLogger("TESTING")
+
+
+def check_integration_outputs(fname, outpath, n_logs=1):
+    """
+    Check the outputs of integration tests.
+
+    Parameters
+    ----------
+    fname : str
+        Path to file with expected outputs
+    outpath : str
+        Path to output directory generated from integration tests
+    """
+
+    # Gets filepaths generated by integration test
+    found_files = [
+        os.path.relpath(f, outpath)
+        for f in glob.glob(os.path.join(outpath, "**"), recursive=True)[1:]
+    ]
+
+    # Checks for log file
+    log_regex = "^tedana_[12][0-9]{3}-[0-9]{2}-[0-9]{2}T[0-9]{2}[0-9]{2}[0-9]{2}.tsv$"
+    logfiles = [out for out in found_files if re.match(log_regex, out)]
+    assert len(logfiles) == n_logs
+
+    # Removes logfiles from list of existing files
+    for log in logfiles:
+        found_files.remove(log)
+
+    # Compares remaining files with those expected
+    with open(fname, "r") as f:
+        expected_files = f.read().splitlines()
+    expected_files = [os.path.normpath(path) for path in expected_files]
+
+    if sorted(found_files) != sorted(expected_files):
+        expected_not_found = sorted(list(set(expected_files) - set(found_files)))
+        found_not_expected = sorted(list(set(found_files) - set(expected_files)))
+
+        msg = ""
+        if expected_not_found:
+            msg += "\nExpected but not found:\n\t"
+            msg += "\n\t".join(expected_not_found)
+
+        if found_not_expected:
+            msg += "\nFound but not expected:\n\t"
+            msg += "\n\t".join(found_not_expected)
+        raise ValueError(msg)
+
+
+def data_for_testing_info(test_dataset: str):
+    """
+    Get the path and download link for each dataset used for testing.
+
+    Also create the base directories into which the data and output
+    directories are written.
+
+    Parameters
+    ----------
+    test_dataset : str
+        References one of the datasets to download. It can be:
+        three-echo
+        three-echo-reclassify
+        four-echo
+        five-echo
+
+    Returns
+    -------
+    test_data_path : str
+        The path to the local directory where the data will be downloaded
+    osf_id : str
+        The ID for the OSF file.
+        Data download link would be https://osf.io/osf_id/download
+        Metadata download link would be https://osf.io/osf_id/metadata/?format=datacite-json
+    """
+
+    tedana_path = os.path.dirname(tedana_cli.__file__)
+    base_data_path = os.path.abspath(os.path.join(tedana_path, "../../.testing_data_cache"))
+    os.makedirs(base_data_path, exist_ok=True)
+    os.makedirs(os.path.join(base_data_path, "outputs"), exist_ok=True)
+    if test_dataset == "three-echo":
+        test_data_path = os.path.join(base_data_path, "three-echo/TED.three-echo")
+        osf_id = "rqhfc"
+        os.makedirs(os.path.join(base_data_path, "three-echo"), exist_ok=True)
+        os.makedirs(os.path.join(base_data_path, "outputs/three-echo"), exist_ok=True)
+    elif test_dataset == "three-echo-reclassify":
+        test_data_path = os.path.join(base_data_path, "reclassify")
+        osf_id = "f6g45"
+        os.makedirs(os.path.join(base_data_path, "outputs/reclassify"), exist_ok=True)
+    elif test_dataset == "four-echo":
+        test_data_path = os.path.join(base_data_path, "four-echo/TED.four-echo")
+        osf_id = "gnj73"
+        os.makedirs(os.path.join(base_data_path, "four-echo"), exist_ok=True)
+        os.makedirs(os.path.join(base_data_path, "outputs/four-echo"), exist_ok=True)
+    elif test_dataset == "five-echo":
+        test_data_path = os.path.join(base_data_path, "five-echo/TED.five-echo")
+        osf_id = "9c42e"
+        os.makedirs(os.path.join(base_data_path, "five-echo"), exist_ok=True)
+        os.makedirs(os.path.join(base_data_path, "outputs/five-echo"), exist_ok=True)
+    else:
+        raise ValueError(f"{test_dataset} is not a valid dataset string for data_for_testing_info")
+
+    return test_data_path, osf_id
+
+
+def download_test_data(osf_id, test_data_path):
+    """
+    If the current data are not already available, download the tar.gz data
+    stored at `https://osf.io/osf_id/download` and unpack it into
+    `test_data_path`.
+
+    Parameters
+    ----------
+    osf_id : str
+        The ID for the OSF file.
+    test_data_path : str
+        Path to the directory where OSF data should be extracted
+    """
+
+    try:
+        datainfo = requests.get(f"https://osf.io/{osf_id}/metadata/?format=datacite-json")
+    except Exception:
+        if len(os.listdir(test_data_path)) == 0:
+            raise ConnectionError(
+                f"Cannot access https://osf.io/{osf_id} and testing data "
+                "are not yet downloaded"
+            )
+        else:
+            TestLGR.warning(
+                f"Cannot access https://osf.io/{osf_id}. "
+                f"Using local copy of testing data in {test_data_path} "
+                "but cannot validate that local copy is up-to-date"
+            )
+        return
+    datainfo.raise_for_status()
+    metadata = json.loads(datainfo.content)
+    # 'dates' is a list with all updates to the file; the last item in the list
+    # is the most recent, and its 'date' field is the date of the last update.
+    osf_filedate = metadata["dates"][-1]["date"]
+
+    # Find the file with the most recent date for comparison with
+    # the last-updated date for the OSF file
+    if os.path.exists(test_data_path):
+        filelist = glob.glob(f"{test_data_path}/*")
+        most_recent_file = max(filelist, key=os.path.getctime)
+        if os.path.exists(most_recent_file):
+            local_filedate = os.path.getmtime(most_recent_file)
+            local_filedate_str = str(datetime.fromtimestamp(local_filedate).date())
+            local_data_exists = True
+        else:
+            local_data_exists = False
+    else:
+        local_data_exists = False
+    if local_data_exists:
+        if local_filedate_str == osf_filedate:
+            TestLGR.info(
+                f"Downloaded and up-to-date data already in {test_data_path}. Not redownloading"
+            )
+            return
+        else:
+            TestLGR.info(
+                f"Downloaded data in {test_data_path} was last modified on "
+                f"{local_filedate_str}. Data on https://osf.io/{osf_id} "
+                f"was last updated on {osf_filedate}. Deleting and redownloading"
+            )
+            shutil.rmtree(test_data_path)
+    req = requests.get(f"https://osf.io/{osf_id}/download")
+    req.raise_for_status()
+    t = tarfile.open(fileobj=GzipFile(fileobj=BytesIO(req.content)))
+    os.makedirs(test_data_path, exist_ok=True)
+    t.extractall(test_data_path)
+
+
+def reclassify_raw() -> str:
+    test_data_path, _ = data_for_testing_info("three-echo-reclassify")
+    return os.path.join(test_data_path, "TED.three-echo")
+
+
+def reclassify_raw_registry() -> str:
+    return os.path.join(reclassify_raw(), "desc-tedana_registry.json")
+
+
+def guarantee_reclassify_data() -> str:
+    """Ensure that the reclassify data exist at the expected path and return the path."""
+
+    test_data_path, osf_id = data_for_testing_info("three-echo-reclassify")
+
+    # Should now be checking and not downloading for each test so don't see if statement here
+    # if not os.path.exists(reclassify_raw_registry()):
+    download_test_data(osf_id, test_data_path)
+    # else:
+    #     Path exists, be sure that everything in registry exists
+    ioh = InputHarvester(reclassify_raw_registry())
+    all_present = True
+    for _, v in ioh.registry.items():
+        if not isinstance(v, list):
+            if not os.path.exists(os.path.join(reclassify_raw(), v)):
+                all_present = False
+                break
+    if not all_present:
+        # Something was removed, need to re-download
+        shutil.rmtree(reclassify_raw())
+        guarantee_reclassify_data()
+    return test_data_path
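The helpers above establish the fetch-or-reuse contract that every test below leans on; a condensed sketch of the pattern, using a dataset key from the list documented in data_for_testing_info:

    test_data_path, osf_id = data_for_testing_info("three-echo")
    download_test_data(osf_id, test_data_path)  # returns early if the cached copy matches OSF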
+
+
+def test_integration_five_echo(skip_integration):
+    """Integration test of the full tedana workflow using five-echo test data."""
+
+    if skip_integration:
+        pytest.skip("Skipping five-echo integration test")
+
+    test_data_path, osf_id = data_for_testing_info("five-echo")
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../../outputs/five-echo"))
+    # out_dir_manual = f"{out_dir}-manual"
+
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    # if os.path.exists(out_dir_manual):
+    #     shutil.rmtree(out_dir_manual)
+
+    # download data and run the test
+    download_test_data(osf_id, test_data_path)
+    prepend = f"{test_data_path}/p06.SBJ01_S09_Task11_e"
+    suffix = ".sm.nii.gz"
+    datalist = [prepend + str(i + 1) + suffix for i in range(5)]
+    echo_times = [15.4, 29.7, 44.0, 58.3, 72.6]
+    tedana_cli.tedana_workflow(
+        data=datalist,
+        tes=echo_times,
+        ica_method="robustica",
+        n_robust_runs=5,
+        out_dir=out_dir,
+        tedpca=0.95,
+        fittype="curvefit",
+        fixed_seed=49,
+        tedort=True,
+        verbose=True,
+        prefix="sub-01",
+    )
+
+    # Just a check on the component table pending a unit test of load_comptable
+    comptable = os.path.join(out_dir, "sub-01_desc-tedana_metrics.tsv")
+    df = pd.read_table(comptable)
+    assert isinstance(df, pd.DataFrame)
+
+
+def test_integration_four_echo(skip_integration):
+    """Integration test of the full tedana workflow using four-echo test data."""
+
+    if skip_integration:
+        pytest.skip("Skipping four-echo integration test")
+
+    test_data_path, osf_id = data_for_testing_info("four-echo")
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../../outputs/four-echo"))
+    out_dir_manual = f"{out_dir}-manual"
+
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    if os.path.exists(out_dir_manual):
+        shutil.rmtree(out_dir_manual)
+
+    # download data and run the test
+    download_test_data(osf_id, test_data_path)
+    prepend = f"{test_data_path}/sub-PILOT_ses-01_task-localizerDetection_run-01_echo-"
+    suffix = "_space-sbref_desc-preproc_bold+orig.HEAD"
+    datalist = [prepend + str(i + 1) + suffix for i in range(4)]
+    tedana_cli.tedana_workflow(
+        data=datalist,
+        tes=[11.8, 28.04, 44.28, 60.52],
+        out_dir=out_dir,
+        ica_method="robustica",
+        n_robust_runs=6,
+        tedpca="aic",
+        gscontrol=["gsr", "mir"],
+        png_cmap="bone",
+        prefix="sub-01",
+        debug=True,
+        verbose=True,
+    )
+
+
+def test_integration_three_echo(skip_integration):
+    """Integration test of the full tedana workflow using three-echo test data."""
+
+    if skip_integration:
+        pytest.skip("Skipping three-echo integration test")
+
+    test_data_path, osf_id = data_for_testing_info("three-echo")
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../../outputs/three-echo"))
+    out_dir_manual = f"{out_dir}-rerun"
+
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    if os.path.exists(out_dir_manual):
+        shutil.rmtree(out_dir_manual)
+
+    # download data and run the test
+    download_test_data(osf_id, test_data_path)
+
+    tedana_cli.tedana_workflow(
+        data=f"{test_data_path}/three_echo_Cornell_zcat.nii.gz",
+        tes=[14.5, 38.5, 62.5],
+        ica_method="robustica",
+        n_robust_runs=30,
+        out_dir=out_dir,
+        low_mem=True,
+        tedpca="aic",
+    )
+
+    # Test re-running, but use the CLI
+    args = [
+        "-d",
+        f"{test_data_path}/three_echo_Cornell_zcat.nii.gz",
+        "-e",
+        "14.5",
+        "38.5",
+        "62.5",
+        "--out-dir",
+        out_dir_manual,
+        "--debug",
+        "--verbose",
+        "-f",
+    ]
+    tedana_cli._main(args)
+
+
+def test_integration_reclassify_insufficient_args(skip_integration):
+    if skip_integration:
+        pytest.skip("Skipping reclassify insufficient args")
+
+    guarantee_reclassify_data()
+
+    args = [
+        "ica_reclassify",
+        reclassify_raw_registry(),
+    ]
+
+    result = subprocess.run(args, capture_output=True)
+    assert b"ValueError: Must manually accept or reject" in result.stderr
+    assert result.returncode != 0
+
+
+def test_integration_reclassify_quiet_csv(skip_integration):
+    if skip_integration:
+        pytest.skip("Skip reclassify quiet csv")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/quiet"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    # Make some files that have components to manually accept and reject
+    to_accept = [i for i in range(3)]
+    to_reject = [i for i in range(4, 7)]
+    acc_df = pd.DataFrame(data=to_accept, columns=["Components"])
+    rej_df = pd.DataFrame(data=to_reject, columns=["Components"])
+    acc_csv_fname = os.path.join(reclassify_raw(), "accept.csv")
+    rej_csv_fname = os.path.join(reclassify_raw(), "reject.csv")
+    acc_df.to_csv(acc_csv_fname)
+    rej_df.to_csv(rej_csv_fname)
+
+    args = [
+        "ica_reclassify",
+        "--manacc",
+        acc_csv_fname,
+        "--manrej",
+        rej_csv_fname,
+        "--out-dir",
+        out_dir,
+        reclassify_raw_registry(),
+    ]
+
+    results = subprocess.run(args, capture_output=True)
+    assert results.returncode == 0
+    fn = resource_filename("tedana", "tests/data/reclassify_quiet_out.txt")
+    check_integration_outputs(fn, out_dir)
+
+
+def test_integration_reclassify_quiet_spaces(skip_integration):
+    if skip_integration:
+        pytest.skip("Skip reclassify quiet space-delimited integers")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/quiet"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    args = [
+        "ica_reclassify",
+        "--manacc",
+        "1",
+        "2",
+        "3",
+        "--manrej",
+        "4",
+        "5",
+        "6",
+        "--out-dir",
+        out_dir,
+        reclassify_raw_registry(),
+    ]
+
+    results = subprocess.run(args, capture_output=True)
+    assert results.returncode == 0
+    fn = resource_filename("tedana", "tests/data/reclassify_quiet_out.txt")
+    check_integration_outputs(fn, out_dir)
+
+
+def test_integration_reclassify_quiet_string(skip_integration):
+    if skip_integration:
+        pytest.skip("Skip reclassify quiet string of integers")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/quiet"))
+
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    args = [
+        "ica_reclassify",
+        "--manacc",
+        "1,2,3",
+        "--manrej",
+        "4,5,6,",
+        "--out-dir",
+        out_dir,
+        reclassify_raw_registry(),
+    ]
+
+    results = subprocess.run(args, capture_output=True)
+    assert results.returncode == 0
+    fn = resource_filename("tedana", "tests/data/reclassify_quiet_out.txt")
+    check_integration_outputs(fn, out_dir)
+
+
+def test_integration_reclassify_debug(skip_integration):
+    if skip_integration:
+        pytest.skip("Skip reclassify debug")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/debug"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    args = [
+        "ica_reclassify",
+        "--manacc",
+        "1",
+        "2",
+        "3",
+        "--prefix",
+        "sub-testymctestface",
+        "--convention",
+        "orig",
+        "--tedort",
+        "--mir",
+        "--no-reports",
+        "--out-dir",
+        out_dir,
+        "--debug",
+        reclassify_raw_registry(),
+    ]
+
+    results = subprocess.run(args, capture_output=True)
+    assert results.returncode == 0
+    fn = resource_filename("tedana", "tests/data/reclassify_debug_out.txt")
+    check_integration_outputs(fn, out_dir)
+
+
+def test_integration_reclassify_both_rej_acc(skip_integration):
+    if skip_integration:
+        pytest.skip("Skip reclassify both rejected and accepted")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/both_rej_acc"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    with pytest.raises(
+        ValueError,
+        match=r"The following components were both accepted and",
+    ):
+        ica_reclassify_workflow(
+            reclassify_raw_registry(),
+            accept=[1, 2, 3],
+            reject=[1, 2, 3],
+            out_dir=out_dir,
+        )
+
+
+def test_integration_reclassify_run_twice(skip_integration):
+    if skip_integration:
+        pytest.skip("Skip reclassify run twice")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/run_twice"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    ica_reclassify_workflow(
+        reclassify_raw_registry(),
+        accept=[1, 2, 3],
+        out_dir=out_dir,
+        no_reports=True,
+    )
+    ica_reclassify_workflow(
+        reclassify_raw_registry(),
+        accept=[1, 2, 3],
+        out_dir=out_dir,
+        overwrite=True,
+        no_reports=True,
+    )
+    fn = resource_filename("tedana", "tests/data/reclassify_run_twice.txt")
+    check_integration_outputs(fn, out_dir, n_logs=2)
+
+
+def test_integration_reclassify_no_bold(skip_integration, caplog):
+    if skip_integration:
+        pytest.skip("Skip reclassify no bold")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/no_bold"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    ioh = InputHarvester(reclassify_raw_registry())
+    comptable = ioh.get_file_contents("ICA metrics tsv")
+    to_accept = [i for i in range(len(comptable))]
+
+    ica_reclassify_workflow(
+        reclassify_raw_registry(),
+        reject=to_accept,
+        out_dir=out_dir,
+        no_reports=True,
+    )
+    assert "No accepted components remaining after manual classification!" in caplog.text
+
+    fn = resource_filename("tedana", "tests/data/reclassify_no_bold.txt")
+    check_integration_outputs(fn, out_dir)
+
+
+def test_integration_reclassify_accrej_files(skip_integration, caplog):
+    if skip_integration:
+        pytest.skip("Skip reclassify accept/reject files")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/no_bold"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    ioh = InputHarvester(reclassify_raw_registry())
+    comptable = ioh.get_file_contents("ICA metrics tsv")
+    to_accept = [i for i in range(len(comptable))]
+
+    ica_reclassify_workflow(
+        reclassify_raw_registry(),
+        reject=to_accept,
+        out_dir=out_dir,
+        no_reports=True,
+    )
+    assert "No accepted components remaining after manual classification!" in caplog.text
+
+    fn = resource_filename("tedana", "tests/data/reclassify_no_bold.txt")
+    check_integration_outputs(fn, out_dir)
+
+
+def test_integration_reclassify_index_failures(skip_integration):
+    if skip_integration:
+        pytest.skip("Skip reclassify index failures")
+
+    test_data_path = guarantee_reclassify_data()
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../outputs/reclassify/index_failures"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    with pytest.raises(
+        ValueError,
+        match=r"_parse_manual_list expected a list of integers, but the input is",
+    ):
+        ica_reclassify_workflow(
+            reclassify_raw_registry(),
+            accept=[1, 2.5, 3],
+            out_dir=out_dir,
+            no_reports=True,
+        )
+
+    with pytest.raises(
+        ValueError,
+        match=r"_parse_manual_list expected integers or a filename, but the input is",
+    ):
+        ica_reclassify_workflow(
+            reclassify_raw_registry(),
+            accept=[2.5],
+            out_dir=out_dir,
+            no_reports=True,
+        )
+
+
+def test_integration_t2smap(skip_integration):
+    """Integration test of the full t2smap workflow using five-echo test data."""
+    if skip_integration:
+        pytest.skip("Skipping t2smap integration test")
+    test_data_path, osf_id = data_for_testing_info("five-echo")
+    out_dir = os.path.abspath(os.path.join(test_data_path, "../../outputs/t2smap_five-echo"))
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+
+    # download data and run the test
+    download_test_data(osf_id, test_data_path)
+    prepend = f"{test_data_path}/p06.SBJ01_S09_Task11_e"
+    suffix = ".sm.nii.gz"
+    datalist = [prepend + str(i + 1) + suffix for i in range(5)]
+    echo_times = [15.4, 29.7, 44.0, 58.3, 72.6]
+    args = (
+        ["-d"]
+        + datalist
+        + ["-e"]
+        + [str(te) for te in echo_times]
+        + ["--out-dir", out_dir, "--fittype", "curvefit"]
+    )
+    t2smap_cli._main(args)
+
+    # compare the generated output files
+    fname = resource_filename("tedana", "tests/data/nih_five_echo_outputs_t2smap.txt")
+    # Gets filepaths generated by integration test
+    found_files = [
+        os.path.relpath(f, out_dir)
+        for f in glob.glob(os.path.join(out_dir, "**"), recursive=True)[1:]
+    ]
+
+    # Compares remaining files with those expected
+    with open(fname, "r") as f:
+        expected_files = f.read().splitlines()
+    assert sorted(expected_files) == sorted(found_files)
\ No newline at end of file
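One plausible way to exercise a single test from this file locally, assuming the repository's usual pytest setup (the first run needs network access to download the OSF data):

    pytest tedana/tests/test_integrity_robustica.py::test_integration_three_echo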
diff --git a/tedana/workflows/tedana.py b/tedana/workflows/tedana.py
index ee567dfef..6411ce44d 100644
--- a/tedana/workflows/tedana.py
+++ b/tedana/workflows/tedana.py
@@ -1,6 +1,4 @@
-"""
-Run the "canonical" TE-Dependent ANAlysis workflow.
-"""
+"""Run the "canonical" TE-Dependent ANAlysis workflow."""
 import argparse
 import datetime
 import json
@@ -8,6 +6,7 @@
 import os
 import os.path as op
 import shutil
+import sys
 from glob import glob

 import numpy as np
@@ -29,6 +28,12 @@
     utils,
 )
 from tedana.bibtex import get_description_references
+from tedana.config import (
+    DEFAULT_ICA_METHOD,
+    DEFAULT_N_MAX_ITER,
+    DEFAULT_N_MAX_RESTART,
+    DEFAULT_N_ROBUST_RUNS,
+)
 from tedana.stats import computefeats2
 from tedana.workflows.parser_utils import check_tedpca_value, is_valid_file

@@ -37,8 +42,7 @@


 def _get_parser():
-    """
-    Parses command line inputs for tedana
+    """Parse command line inputs for tedana.

     Returns
     -------
@@ -46,7 +50,7 @@ def _get_parser():
     """
     from tedana import __version__

-    verstr = "tedana v{}".format(__version__)
+    verstr = f"tedana v{__version__}"
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     # Argument parser follow template provided by RalphyZ
     # https://stackoverflow.com/a/43456577
@@ -150,7 +154,6 @@ def _get_parser():
             "in which case the specified number of components will be "
             "selected."
         ),
-        choices=["mdl", "kic", "aic"],
         default="aic",
     )
     optional.add_argument(
@@ -165,45 +168,44 @@ def _get_parser():
         ),
         default="kundu",
     )
-    optional.add_argument(#####BTBTBT
+    optional.add_argument(
         "--ica_method",
         dest="ica_method",
         help=(
             "The applied ICA method. If set to fastica the FastICA "
             "from sklearn library will be run once with the seed value. "
-            "robustica will run FastICA n_robust_runs times and and uses "
+            "robustica will run FastICA n_robust_runs times and use "
             "clustering methods to overcome the randomness of the FastICA "
-            "algorithm. If set to robustica the seed value will be ignored."
-            "If set to fastica n_robust_runs will not be effective."
+            "algorithm. When set to fastica, n_robust_runs has no effect."
         ),
         choices=["robustica", "fastica"],
+        type=str.lower,
-        default="robustica",
+        default=DEFAULT_ICA_METHOD,
     )
     optional.add_argument(
         "--seed",
         dest="fixed_seed",
         metavar="INT",
         type=int,
-        help=(  ##BTBTBT
-            "Value used for random initialization of ICA "
-            "algorithm when ica_mthods is set to fastica. Set to an integer value for "
-            "reproducible ICA results with fastica. Set to -1 for "
-            "varying results across ICA calls. "
+        help=(
+            "Value used for random initialization of the ICA "
+            "algorithm. Set to an integer value for "
+            "reproducible ICA results (fastica/robustica). Set to -1 for "
+            "varying results across ICA (fastica/robustica) calls. "
         ),
         default=42,
     )
-    optional.add_argument(#####BTBTBT
+    optional.add_argument(
         "--n_robust_runs",
         dest="n_robust_runs",
         type=int,
         help=(
-            "The number of times robustica will run."
-            "This is only effective when ica_mthods is "
+            "The number of times robustica will run. "
+            "This is only effective when ica_method is "
             "set to robustica."
         ),
-        ##choices=range(2,100),
-        default=30,
+        choices=range(5, 500),
+        default=DEFAULT_N_ROBUST_RUNS,
     )
@@ -211,7 +213,7 @@ def _get_parser():
     optional.add_argument(
         "--maxit",
         dest="maxit",
         metavar="INT",
         type=int,
         help=("Maximum number of iterations for ICA."),
-        default=500,
+        default=DEFAULT_N_MAX_ITER,
     )
@@ -225,7 +227,7 @@ def _get_parser():
     optional.add_argument(
         "--maxrestart",
         dest="maxrestart",
         metavar="INT",
         type=int,
         help=(
             "Maximum number of attempts for ICA. If ICA "
             "fails to converge, the fixed seed will be "
             "updated and ICA will be run again. If "
             "convergence is achieved before maxrestart "
             "attempts, ICA will finish early."
         ),
-        default=10,
+        default=DEFAULT_N_MAX_RESTART,
     )
     optional.add_argument(
         "--tedort",
@@ -351,12 +353,12 @@ def tedana_workflow(
     fittype="loglin",
     combmode="t2s",
     tree="kundu",
-    ica_method="robustica", ########BTBTAdded
-    n_robust_runs=30,
+    ica_method=DEFAULT_ICA_METHOD,
+    n_robust_runs=DEFAULT_N_ROBUST_RUNS,
     tedpca="aic",
     fixed_seed=42,
-    maxit=500,
-    maxrestart=10,
+    maxit=DEFAULT_N_MAX_ITER,
+    maxrestart=DEFAULT_N_MAX_RESTART,
     tedort=False,
     gscontrol=None,
     no_reports=False,
@@ -368,9 +370,9 @@ def tedana_workflow(
     overwrite=False,
     t2smap=None,
     mixm=None,
+    tedana_command=None,
 ):
-    """
-    Run the "canonical" TE-Dependent ANAlysis workflow.
+    """Run the "canonical" TE-Dependent ANAlysis workflow.

     Please remember to cite :footcite:t:`dupre2021te`.
@@ -412,15 +414,27 @@ def tedana_workflow(
         accepts and rejects some distinct components compared to kundu. Testing
         to better understand the effects of the differences is ongoing.
         Default is 'kundu'.
+    ica_method : {'robustica', 'fastica'}, optional
+        The applied ICA method. If set to 'fastica', FastICA from the sklearn
+        library will be run once with the given seed value. 'robustica' will run
+        FastICA n_robust_runs times and use clustering methods to overcome
+        the randomness of the FastICA algorithm. When set to 'fastica',
+        n_robust_runs has no effect.
+        Default is 'robustica'.
+    n_robust_runs : :obj:`int`, optional
+        The number of times robustica will run. This is only effective when 'ica_method' is
+        set to 'robustica'.
     tedpca : {'mdl', 'aic', 'kic', 'kundu', 'kundu-stabilize', float, int}, optional
         Method with which to select components in TEDPCA.
         If a float is provided, then it is assumed to represent percentage of variance
-        explained (0-1) to retain from PCA.
+        explained (0-1) to retain from PCA. If an int is provided, it will output
+        a fixed number of components defined by the integer between 1 and the
+        number of time points.
         Default is 'aic'.
     fixed_seed : :obj:`int`, optional
         Value passed to ``mdp.numx_rand.seed()``.
-        Set to a positive integer value for reproducible ICA results;
-        otherwise, set to -1 for varying results across calls.
+        Set to a positive integer value for reproducible ICA results (fastica/robustica);
+        otherwise, set to -1 for varying results across ICA (fastica/robustica) calls.
     maxit : :obj:`int`, optional
         Maximum number of iterations for ICA. Default is 500.
     maxrestart : :obj:`int`, optional
@@ -457,6 +471,9 @@ def tedana_workflow(
         If True, suppresses logging/printing of messages. Default is False.
     overwrite : :obj:`bool`, optional
         If True, force overwriting of files. Default is False.
+    tedana_command : :obj:`str`, optional
+        If the command-line interface was used, this is the command that was
+        run. Default is None.

     Notes
     -----
@@ -473,10 +490,11 @@ def tedana_workflow(
         os.mkdir(out_dir)

     # boilerplate
-    basename = "report"
+    prefix = io._infer_prefix(prefix)
+    basename = f"{prefix}report"
     extension = "txt"
     repname = op.join(out_dir, (basename + "." + extension))
-    bibtex_file = op.join(out_dir, "references.bib")
+    bibtex_file = op.join(out_dir, f"{prefix}references.bib")
     repex = op.join(out_dir, (basename + "*"))
     previousreps = glob(repex)
     previousreps.sort(reverse=True)
@@ -492,7 +510,20 @@ def tedana_workflow(
     logname = op.join(out_dir, (basename + start_time + "." + extension))
     utils.setup_loggers(logname, repname, quiet=quiet, debug=debug)

-    LGR.info("Using output directory: {}".format(out_dir))
+    # Save command into sh file, if the command-line interface was used
+    # TODO: use io_generator to save command
+    if tedana_command is not None:
+        command_file = open(os.path.join(out_dir, "tedana_call.sh"), "w")
+        command_file.write(tedana_command)
+        command_file.close()
+    else:
+        # Get variables passed to function if the tedana command is None
+        variables = ", ".join(f"{name}={value}" for name, value in locals().items())
+        # From variables, remove everything after ", tedana_command"
+        variables = variables.split(", tedana_command")[0]
+        tedana_command = f"tedana_workflow({variables})"
+
+    LGR.info(f"Using output directory: {out_dir}")

     # ensure tes are in appropriate format
     tes = [float(te) for te in tes]
@@ -510,7 +541,7 @@ def tedana_workflow(
     if isinstance(data, str):
         data = [data]

-    LGR.info("Loading input data: {}".format([f for f in data]))
+    LGR.info(f"Loading input data: {[f for f in data]}")
     catd, ref_img = io.load_data(data, n_echos=n_echos)

     io_generator = io.OutputGenerator(
@@ -527,8 +558,13 @@ def tedana_workflow(
     # TODO: turn this into an IOManager since this isn't really output
     io_generator.register_input(data)

+    # Save system info to json
+    info_dict = utils.get_system_info()
+    info_dict["Python"] = sys.version
+    info_dict["Command"] = tedana_command
+
     n_samp, n_echos, n_vols = catd.shape
-    LGR.debug("Resulting data shape: {}".format(catd.shape))
+    LGR.debug(f"Resulting data shape: {catd.shape}")

     # check if TR is 0
     img_t_r = io_generator.reference_img.header.get_zooms()[-1]
@@ -600,7 +636,7 @@ def tedana_workflow(
         getsum=True,
         threshold=1,
     )
-    LGR.debug("Retaining {}/{} samples for denoising".format(mask_denoise.sum(), n_samp))
+    LGR.debug(f"Retaining {mask_denoise.sum()}/{n_samp} samples for denoising")
     io_generator.save_file(masksum_denoise, "adaptive mask img")

     # Create an adaptive mask with at least 3 good echoes, for classification
@@ -614,7 +650,7 @@ def tedana_workflow(
         "(restricted to voxels with good data in at least the first three echoes) was used for "
         "the component classification procedure."
     )
-    LGR.debug("Retaining {}/{} samples for classification".format(mask_clf.sum(), n_samp))
+    LGR.debug(f"Retaining {mask_clf.sum()}/{n_samp} samples for classification")

     if t2smap is None:
         LGR.info("Computing T2* map")
@@ -625,7 +661,7 @@ def tedana_workflow(
         # set a hard cap for the T2* map
         # anything that is 10x higher than the 99.5 %ile will be reset to 99.5 %ile
         cap_t2s = stats.scoreatpercentile(t2s_full.flatten(), 99.5, interpolation_method="lower")
-        LGR.debug("Setting cap on T2* map at {:.5f}s".format(utils.millisec2sec(cap_t2s)))
+        LGR.debug(f"Setting cap on T2* map at {utils.millisec2sec(cap_t2s):.5f}s")
         t2s_full[t2s_full > cap_t2s * 10] = cap_t2s
         io_generator.save_file(utils.millisec2sec(t2s_full), "t2star img")
         io_generator.save_file(s0_full, "s0 img")
@@ -642,23 +678,20 @@ def tedana_workflow(
         catd, data_oc = gsc.gscontrol_raw(catd, data_oc, n_echos, io_generator)

     fout = io_generator.save_file(data_oc, "combined img")
-    LGR.info("Writing optimally combined data set: {}".format(fout))
+    LGR.info(f"Writing optimally combined data set: {fout}")

     if mixm is None:
         # Identify and remove thermal noise from data
         dd, n_components = decomposition.tedpca(
             catd,
             data_oc,
-            combmode,
             mask_clf,
             masksum_clf,
-            t2s_full,
             io_generator,
             tes=tes,
             algorithm=tedpca,
             kdaw=10.0,
             rdaw=1.0,
-            verbose=verbose,
             low_mem=low_mem,
         )
         if verbose:
@@ -667,12 +700,17 @@ def tedana_workflow(

         # Perform ICA, calculate metrics, and apply decision tree
         # Restart when ICA fails to converge or too few BOLD components found
         keep_restarting = True
         n_restarts = 0
         seed = fixed_seed
         while keep_restarting:
             mmix, seed = decomposition.tedica(
-                dd, n_components, seed, ica_method, n_robust_runs, maxit, maxrestart=(maxrestart - n_restarts)
+                dd,
+                n_components,
+                seed,
+                ica_method,
+                n_robust_runs,
+                maxit,
+                maxrestart=(maxrestart - n_restarts),
             )
             seed += 1
             n_restarts = seed - fixed_seed
@@ -706,17 +744,13 @@ def tedana_workflow(
             )
             ica_selector = selection.automatic_selection(comptable, n_echos, n_vols, tree=tree)
             n_likely_bold_comps = ica_selector.n_likely_bold_comps
-
-            if ica_method=='robustica': #########BTBTBT
+            if (n_restarts < maxrestart) and (n_likely_bold_comps == 0):
+                LGR.warning("No BOLD components found. Re-attempting ICA.")
+            elif n_likely_bold_comps == 0:
+                LGR.warning("No BOLD components found, but maximum number of restarts reached.")
+                keep_restarting = False
+            else:
                 keep_restarting = False
-            else:
-                if (n_restarts < maxrestart) and (n_likely_bold_comps == 0):
-                    LGR.warning("No BOLD components found. Re-attempting ICA.")
-                elif n_likely_bold_comps == 0:
-                    LGR.warning("No BOLD components found, but maximum number of restarts reached.")
-                    keep_restarting = False
-                else:
-                    keep_restarting = False

             # If we're going to restart, temporarily allow force overwrite
             if keep_restarting:
@@ -825,7 +859,6 @@ def tedana_workflow(
             mask=mask_denoise,
             comptable=comptable,
             mmix=mmix,
-            n_vols=n_vols,
             io_generator=io_generator,
         )
@@ -852,6 +885,16 @@ def tedana_workflow(
                     "of non-BOLD noise from multi-echo fMRI data."
                 ),
                 "CodeURL": "https://github.com/ME-ICA/tedana",
+                "Node": {
+                    "Name": info_dict["Node"],
+                    "System": info_dict["System"],
+                    "Machine": info_dict["Machine"],
+                    "Processor": info_dict["Processor"],
+                    "Release": info_dict["Release"],
+                    "Version": info_dict["Version"],
+                },
+                "Python": info_dict["Python"],
+                "Command": info_dict["Command"],
             }
         ],
     }
@@ -908,22 +951,22 @@ def tedana_workflow(
     )

     LGR.info("Generating dynamic report")
-    reporting.generate_report(io_generator, tr=img_t_r)
+    reporting.generate_report(io_generator)

     LGR.info("Workflow completed")
     utils.teardown_loggers()


 def _main(argv=None):
-    """Tedana entry point"""
+    """Run the tedana workflow."""
+    tedana_command = "tedana " + " ".join(sys.argv[1:])
     options = _get_parser().parse_args(argv)
     kwargs = vars(options)
     n_threads = kwargs.pop("n_threads")
     n_threads = None if n_threads == -1 else n_threads
     with threadpool_limits(limits=n_threads, user_api=None):
-        tedana_workflow(**kwargs)
+        tedana_workflow(**kwargs, tedana_command=tedana_command)


 if __name__ == "__main__":
-    _main()
-
+    _main()
\ No newline at end of file
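Taken together, the workflow changes surface as two new command-line knobs alongside the existing seed control. A sketch of an invocation exercising them; the `--ica_method`, `--n_robust_runs`, and `--seed` flags come from this patch, while the file names and echo times below are placeholders:

    tedana -d echo-1.nii.gz echo-2.nii.gz echo-3.nii.gz \
        -e 14.5 38.5 62.5 \
        --ica_method robustica --n_robust_runs 30 --seed 42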