Decoupler Pathway Inference (#308)

* adds decoupler_pathway_inference and its Galaxy wrapper --------- Co-authored-by: Nicola Soranzo <[email protected]> Co-authored-by: Pablo Moreno <[email protected]>
ebi-gene-expression-group · Mar 15, 2024 · 1034a45 · 1034a45
1 parent 0264c35
commit 1034a45
Show file tree

Hide file tree

Showing 5 changed files with 419 additions and 0 deletions.
diff --git a/tools/tertiary-analysis/decoupler/decoupler_pathway_inference.py b/tools/tertiary-analysis/decoupler/decoupler_pathway_inference.py
@@ -0,0 +1,132 @@
+# import the necessary packages
+import argparse
+
+import anndata as ad
+import decoupler as dc
+import pandas as pd
+
+# define arguments for the script
+parser = argparse.ArgumentParser()
+
+# add AnnData input file option
+parser.add_argument(
+    "-i", "--input_anndata", help="AnnData input file", required=True
+)
+
+# add network input file option
+parser.add_argument(
+    "-n", "--input_network", help="Network input file", required=True
+)
+
+# output file prefix
+parser.add_argument(
+    "-o", "--output",
+    help="output files prefix",
+    default=None,
+)
+
+# path to save Activities AnnData file
+parser.add_argument(
+    "-a", "--activities_path", help="Path to save Activities AnnData file", default=None
+)
+
+# Column name in net with source nodes
+parser.add_argument(
+    "-s", "--source", help="Column name in net with source nodes.", default="source"
+)
+
+# Column name in net with target nodes
+parser.add_argument(
+    "-t", "--target", help="Column name in net with target nodes.", default="target"
+)
+
+# Column name in net with weights.
+parser.add_argument(
+    "-w", "--weight", help="Column name in net with weights.", default="weight"
+)
+
+# add boolean argument for use_raw
+parser.add_argument(
+    "--use_raw", action="store_true", default=False, help="Whether to use the raw part of the AnnData object"
+)
+
+# add argument for min_cells
+parser.add_argument(
+    "--min_n", help="Minimum of targets per source. If less, sources are removed.", default=5, type=int
+)
+
+# add activity inference method option
+parser.add_argument(
+    "-m", "--method", help="Activity inference method", default="mlm", required=True
+)
+args = parser.parse_args()
+
+# check that either -o or --output is specified
+if args.output is None:
+    raise ValueError("Please specify either -o or --output")
+
+# read in the AnnData input file
+adata = ad.read_h5ad(args.input_anndata)
+
+# read in the input file network input file
+network = pd.read_csv(args.input_network, sep='\t')
+
+if (
+    args.source not in network.columns
+    or args.target not in network.columns
+    or args.weight not in network.columns
+):
+    raise ValueError(
+        "Source, target, and weight columns are not present in the network"
+    )
+
+
+print(type(args.min_n))
+
+if args.method == "mlm":
+    dc.run_mlm(
+        mat=adata,
+        net=network,
+        source=args.source,
+        target=args.target,
+        weight=args.weight,
+        verbose=True,
+        min_n=args.min_n,
+        use_raw=args.use_raw 
+    )
+
+    if args.output is not None:
+        # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the output network files
+        combined_df = pd.concat([adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1)
+
+        # Save the combined dataframe to a file
+        combined_df.to_csv(args.output + ".tsv", sep="\t")
+
+    # if args.activities_path is specified, generate the activities AnnData and save the AnnData object to the specified path
+    if args.activities_path is not None:
+        acts = dc.get_acts(adata, obsm_key="mlm_estimate")
+        acts.write_h5ad(args.activities_path)
+
+elif args.method == "ulm":
+    dc.run_ulm(
+        mat=adata,
+        net=network,
+        source=args.source,
+        target=args.target,
+        weight=args.weight,
+        verbose=True,
+        min_n=args.min_n,
+        use_raw=args.use_raw 
+    )
+
+    if args.output is not None:
+        # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the output network files
+        combined_df = pd.concat([adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1)
+
+        # Save the combined dataframe to a file
+        combined_df.to_csv(args.output + ".tsv", sep="\t")
+
+    # if args.activities_path is specified, generate the activities AnnData and save the AnnData object to the specified path
+    if args.activities_path is not None:
+        acts = dc.get_acts(adata, obsm_key="ulm_estimate")
+        acts.write_h5ad(args.activities_path)
diff --git a/tools/tertiary-analysis/decoupler/decoupler_pathway_inference.xml b/tools/tertiary-analysis/decoupler/decoupler_pathway_inference.xml
@@ -0,0 +1,129 @@
+<tool id="decoupler_pathway_inference" name="Decoupler Pathway Inference" version="1.4.0+galaxy0" profile="20.05" license="MIT">
+    <description>
+        of functional genesets/pathways for scRNA-seq data.
+    </description>
+    <requirements>
+        <requirement type="package" version="1.4.0">decoupler</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/decoupler_pathway_inference.py'
+            -i '$input_anndata'
+            -n '$input_network_file'
+            --min_n '$min_n'
+            --method '$method'
+            $use_raw
+            ## Column names are quoted, and only passed when non-empty, so that
+            ## the defaults of the script apply when a field is left blank.
+            #if str($source).strip()
+            --source '$source'
+            #end if
+            #if str($target).strip()
+            --target '$target'
+            #end if
+            #if str($weight).strip()
+            --weight '$weight'
+            #end if
+            --output 'inference'
+            $write_activities_path
+    ]]></command>
+    <inputs>
+        <param name="input_anndata" type="data" format="h5ad" label="Input AnnData file" />
+        <param name="input_network_file" type="data" format="tabular" label="Input Network file" help="Tabular file with columns Source, Target and Weight. A source gene/pathway regulates/contains a target gene, weights can be either positive or negative. The source element needs to be part of the network, the target is a gene in the network and in the dataset" />
+        <param name="min_n" type="integer" min="0" value="5" label="Minimum targets per source." help="Sources with fewer targets than this minimum are removed" />
+        <param name="method" type="select" label="Activity inference method">
+            <option value="mlm" selected="true">Multivariate linear model (MLM)</option>
+            <option value="ulm">Univariate linear model (ULM)</option>
+        </param>
+        <param name="use_raw" type="boolean" truevalue="--use_raw" falsevalue="" checked="false" label="Use the raw part of the AnnData object" />
+        <!-- The truevalue carries both the flag and the file name that the output_ad output collects from the working directory. -->
+        <param name="write_activities_path" type="boolean" truevalue="--activities_path anndata_activities_path.h5ad" falsevalue="" checked="true" label="Write the activities AnnData object (contains the MLM/ULM activity results for each pathway and each cell in the main matrix, it is not a replacement of the original AnnData provided as input)." />
+        <param name="source" type="text" value="source" label="Column name in network with source nodes." help="Defaults to 'source'." />
+        <param name="target" type="text" value="target" label="Column name in network with target nodes." help="Defaults to 'target'." />
+        <param name="weight" type="text" value="weight" label="Column name in network with weight." help="Defaults to 'weight'." />
+    </inputs>
+    <outputs>
+        <!-- Activities AnnData: collected from the file name given in the truevalue of write_activities_path, and only created when that option is checked. -->
+        <data name="output_ad" format="h5ad" from_work_dir="anndata_activities_path.h5ad" label="${tool.name} on ${on_string}: Regulators/Pathways activity AnnData file">
+            <filter>write_activities_path</filter>
+        </data>
+        <!-- Estimates and p-values table: the command passes the output prefix "inference" and the script appends ".tsv". -->
+        <data name="output_table" format="tabular" from_work_dir="inference.tsv" label="${tool.name} on ${on_string}: Output estimate table" />
+    </outputs>
+    <tests>
+        <!-- MLM run: activities AnnData and estimate table are both expected. -->
+        <test expect_num_outputs="2">
+            <param name="input_anndata" value="pbmc3k_processed.h5ad"/>
+            <param name="input_network_file" value="progeny_test.tsv"/>
+            <param name="min_n" value="0"/>
+            <param name="method" value="mlm"/>
+            <param name="use_raw" value="false"/>
+            <param name="write_activities_path" value="true"/>
+            <param name="source" value="source"/>
+            <param name="target" value="target"/>
+            <param name="weight" value="weight"/>
+            <output name="output_ad">
+                <assert_contents>
+                    <has_h5_keys keys="obsm/mlm_estimate"/>
+                </assert_contents>
+            </output>
+            <output name="output_table">
+                <assert_contents>
+                    <has_n_columns n="5"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- ULM run: same outputs expected as for the MLM run. -->
+        <test expect_num_outputs="2">
+            <param name="input_anndata" value="pbmc3k_processed.h5ad"/>
+            <param name="input_network_file" value="progeny_test_2.tsv"/>
+            <param name="min_n" value="0"/>
+            <param name="method" value="ulm"/>
+            <param name="use_raw" value="false"/>
+            <param name="write_activities_path" value="true"/>
+            <param name="source" value="source"/>
+            <param name="target" value="target"/>
+            <param name="weight" value="weight"/>
+            <output name="output_ad">
+                <assert_contents>
+                    <has_h5_keys keys="obsm/ulm_estimate"/>
+                </assert_contents>
+            </output>
+            <output name="output_table">
+                <assert_contents>
+                    <has_n_columns n="5"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Usage
+.....
+
+
+**Description**
+
+This tool extracts pathway activity inference using decoupler.
+
+**Input** 
+
+The input file should be an AnnData object in H5AD format. The tool accepts an H5AD file containing raw or normalized data.
+
+The tool also takes a network file containing a collection of pathways and their target genes, with weights for each interaction.
+        Example:
+        ```
+                source    target    weight
+            0    T1    G01    1.0
+            1    T1    G02    1.0
+            2    T1    G03    0.7
+            3    T2    G04    1.0
+            4    T2    G06    -0.5
+        ```
+
+You can also choose to use the raw data in the AnnData object instead of the X matrix with the "use_raw" parameter, and set the minimum number of targets per source with "min_n".
+
+
+**Output**
+
+The tool outputs an activities AnnData object containing the scores in the "obsm" field, and a tab-separated text file containing the estimates and p-values for each cell.
+
+If the "write_activities_path" parameter is set to "true", the tool will write the activities AnnData object to an H5AD file; it does not replace the AnnData provided as input.
+The tab-separated text file containing the scores for each cell is always written.
+
+
+
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioadv/vbac016</citation>
+    </citations>
+</tool>
diff --git a/tools/tertiary-analysis/decoupler/get_test_data.sh b/tools/tertiary-analysis/decoupler/get_test_data.sh
@@ -19,3 +19,19 @@ function get_data {
 mkdir -p test-data
 pushd test-data
 get_data $MTX_LINK $BASENAME_FILE
+
+
+# Download input anndata for decoupler-pathway_inference
+BASENAME_FILE='pbmc3k_processed.h5ad'
+
+MTX_LINK='https://zenodo.org/records/3752813/files/pbmc3k_processed.h5ad'
+
+# Quote the expansions so the URL and file name are each passed as one argument
+get_data "$MTX_LINK" "$BASENAME_FILE"
+
+# Download output anndata for decoupler-pathway_inference
+BASENAME_FILE='test.h5ad'
+
+MTX_LINK='https://zenodo.org/records/10401958/files/test.h5ad'
+
+get_data "$MTX_LINK" "$BASENAME_FILE"
+
diff --git a/tools/tertiary-analysis/decoupler/test-data/progeny_test.tsv b/tools/tertiary-analysis/decoupler/test-data/progeny_test.tsv
@@ -0,0 +1,71 @@
+	source	target	weight	p_value
+0	Androgen	TMPRSS2	11.490631	0.0
+1	Androgen	NKX3-1	10.622551	2.2e-44
+2	Androgen	MBOAT2	10.472733	4.6e-44
+3	Androgen	KLK2	10.176186	1.94441e-40
+4	Androgen	SARG	11.386852	2.79021e-40
+5	EGFR	LZTFL1	-1.8738769	2.0809955e-18
+6	EGFR	PHLDA2	3.5051384	2.0530624e-17
+7	EGFR	DUSP6	12.6293125	6.537324e-17
+8	EGFR	DUSP5	7.9430394	6.86669e-17
+9	EGFR	PHLDA1	6.619626	3.4106933e-16
+10	Estrogen	GREB1	17.240173	0.0
+11	Estrogen	RET	10.718027	0.0
+12	Estrogen	TFF1	14.430255	0.0
+13	Estrogen	HEY2	11.482369	3.1e-44
+14	Estrogen	RAPGEFL1	10.544896	5.2e-43
+15	Hypoxia	FAM162A	8.335551	0.0
+16	Hypoxia	NDRG1	22.08712	0.0
+17	Hypoxia	ENO2	14.32694	0.0
+18	Hypoxia	PDK1	13.120449	0.0
+19	Hypoxia	ANKRD37	8.484976	0.0
+20	JAK-STAT	OAS1	15.028714	1.058e-41
+21	JAK-STAT	HERC6	8.769676	1.3450407e-38
+22	JAK-STAT	OAS3	10.618842	1.2143582e-37
+23	JAK-STAT	PLSCR1	8.481604	8.955206e-37
+24	JAK-STAT	DDX60	12.198234	9.150971e-36
+25	MAPK	DUSP6	16.859016	0.0
+26	MAPK	SPRED2	3.5018346	0.0
+27	MAPK	SPRY2	9.481585	9.19e-43
+28	MAPK	ETV5	5.9887094	6.7425e-41
+29	MAPK	EPHA2	6.3140125	3.7492e-40
+30	NFkB	NFKB1	9.513637	0.0
+31	NFkB	CXCL3	22.946114	0.0
+32	NFkB	NFKB2	5.5155754	0.0
+33	NFkB	NFKBIA	11.444533	0.0
+34	NFkB	BCL2A1	14.416924	0.0
+35	PI3K	MLANA	-9.985743	1.84e-43
+36	PI3K	PMEL	-6.5903482	6.8747866e-36
+37	PI3K	FAXDC2	-12.421274	3.297515e-34
+38	PI3K	HSD17B8	-8.601571	9.948224e-34
+39	PI3K	CTSF	-9.172143	1.0235212e-31
+40	TGFb	LINC00312	4.428987	2.0074443e-17
+41	TGFb	TSPAN2	5.502326	3.1451768e-16
+42	TGFb	SMAD7	7.6311436	7.3087106e-16
+43	TGFb	NOX4	5.913813	3.8292238e-15
+44	TGFb	COL4A1	6.3374896	9.052501e-15
+45	TNFa	CSF2	8.35548	0.0
+46	TNFa	CXCL5	10.0813675	0.0
+47	TNFa	NFKBIE	10.356205	0.0
+48	TNFa	TNFAIP3	35.40072	0.0
+49	TNFa	EFNA1	18.63111	0.0
+50	Trail	FRMPD1	-2.2346141	9.378505e-07
+51	Trail	WT1-AS	2.2251053	2.0316747e-06
+52	Trail	WNT8A	-1.8469616	3.795469e-05
+53	Trail	GPR18	3.240805	6.1090715e-05
+54	Trail	TEC	2.0513217	6.32898e-05
+55	VEGF	CRACD	-4.87119	6.7185365e-25
+56	VEGF	VWA8	-3.6068044	1.4495265e-18
+57	VEGF	NLGN1	-5.618075	2.6587072e-18
+58	VEGF	NRG3	-5.823747	1.0848074e-16
+59	VEGF	KCNK10	2.8833063	1.8129868e-16
+60	WNT	BMP4	5.936831	2.511717e-10
+61	WNT	SIGLEC6	2.0207362	2.347858e-09
+62	WNT	NPY2R	1.3872339	8.666917e-09
+63	WNT	CSF3R	1.9323153	3.0219417e-07
+64	WNT	KRT23	4.1216116	5.463989e-07
+65	p53	GLS2	6.452465	7.444302e-37
+66	p53	MDM2	8.193488	2.1194304e-35
+67	p53	ZNF79	4.020263	4.5987433e-34
+68	p53	FDXR	11.994496	5.589482e-32
+69	p53	LCE1B	11.813737	7.8095406e-30