Skip to content

Commit

Permalink
Decoupler Pathway Inference (#308)
Browse files Browse the repository at this point in the history
* adds decoupler_pathway_inference and its Galaxy wrapper

---------

Co-authored-by: Nicola Soranzo <[email protected]>
Co-authored-by: Pablo Moreno <[email protected]>
  • Loading branch information
3 people authored Mar 15, 2024
1 parent 0264c35 commit 1034a45
Show file tree
Hide file tree
Showing 5 changed files with 419 additions and 0 deletions.
132 changes: 132 additions & 0 deletions tools/tertiary-analysis/decoupler/decoupler_pathway_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# import the necessary packages
import argparse

import anndata as ad
import decoupler as dc
import pandas as pd

# Command-line driver: run a decoupler activity-inference method on an AnnData
# file, export estimates and p-values as a table and, optionally, export the
# activities as a separate AnnData file.

# Inference methods selectable with -m/--method. For a method ``<name>`` the
# script calls ``dc.run_<name>`` and afterwards reads ``<name>_estimate`` and
# ``<name>_pvals`` from ``adata.obsm``.
SUPPORTED_METHODS = ("mlm", "ulm")


def build_parser():
    """Return the argument parser describing the script's options."""
    parser = argparse.ArgumentParser()

    # add AnnData input file option
    parser.add_argument(
        "-i", "--input_anndata", help="AnnData input file", required=True
    )

    # add network input file option
    parser.add_argument(
        "-n", "--input_network", help="Network input file", required=True
    )

    # output file prefix (".tsv" is appended when the table is written)
    parser.add_argument(
        "-o", "--output",
        help="output files prefix",
        default=None,
    )

    # path to save Activities AnnData file
    parser.add_argument(
        "-a", "--activities_path",
        help="Path to save Activities AnnData file",
        default=None,
    )

    # Column name in net with source nodes
    parser.add_argument(
        "-s", "--source",
        help="Column name in net with source nodes.",
        default="source",
    )

    # Column name in net with target nodes
    parser.add_argument(
        "-t", "--target",
        help="Column name in net with target nodes.",
        default="target",
    )

    # Column name in net with weights.
    parser.add_argument(
        "-w", "--weight",
        help="Column name in net with weights.",
        default="weight",
    )

    # add boolean argument for use_raw
    parser.add_argument(
        "--use_raw",
        action="store_true",
        default=False,
        help="Whether to use the raw part of the AnnData object",
    )

    # add argument for min_n
    parser.add_argument(
        "--min_n",
        help="Minimum of targets per source. If less, sources are removed.",
        default=5,
        type=int,
    )

    # add activity inference method option. It is optional (the default is
    # "mlm") and restricted to the implemented methods, so that a typo fails
    # loudly instead of silently producing no output.
    parser.add_argument(
        "-m", "--method",
        help="Activity inference method",
        default="mlm",
        choices=SUPPORTED_METHODS,
    )
    return parser


def check_network_columns(network, source, target, weight):
    """Raise ValueError unless *network* has the three requested columns.

    Args:
        network: DataFrame read from the network file.
        source: Name of the column with source nodes.
        target: Name of the column with target nodes.
        weight: Name of the column with weights.

    Raises:
        ValueError: If at least one of the columns is absent; the message
            lists the missing names.
    """
    missing = [
        column for column in (source, target, weight)
        if column not in network.columns
    ]
    if missing:
        raise ValueError(
            "Source, target, and weight columns are not present in the network"
            f" (missing: {', '.join(missing)})"
        )


def run_inference(adata, network, args):
    """Run the selected decoupler method on *adata* using *network*.

    The decoupler function is looked up by name (``run_mlm`` / ``run_ulm``).
    The results are read back from ``adata.obsm`` by ``write_outputs``.
    """
    run_method = getattr(dc, f"run_{args.method}")
    run_method(
        mat=adata,
        net=network,
        source=args.source,
        target=args.target,
        weight=args.weight,
        verbose=True,
        min_n=args.min_n,
        use_raw=args.use_raw,
    )


def write_outputs(adata, method, output, activities_path=None):
    """Write the estimate/p-value table and, if requested, the activities.

    Args:
        adata: Object whose ``obsm`` holds ``<method>_estimate`` and
            ``<method>_pvals``.
        method: Name of the inference method that was run.
        output: Prefix of the table file; ``.tsv`` is appended.
        activities_path: Where to save the activities AnnData, or ``None``
            to skip it.
    """
    estimate_key = f"{method}_estimate"
    pvals_key = f"{method}_pvals"

    # estimates and p-values side by side, one row per observation
    combined_df = pd.concat(
        [adata.obsm[estimate_key], adata.obsm[pvals_key]], axis=1
    )
    combined_df.to_csv(output + ".tsv", sep="\t")

    # generate the activities AnnData only when a path was given
    if activities_path is not None:
        acts = dc.get_acts(adata, obsm_key=estimate_key)
        acts.write_h5ad(activities_path)


def main(argv=None):
    """Parse *argv* (default: ``sys.argv[1:]``) and run the whole workflow."""
    args = build_parser().parse_args(argv)

    # the output prefix is mandatory even though argparse lists it as optional
    if args.output is None:
        raise ValueError("Please specify either -o or --output")

    # read in the AnnData input file
    adata = ad.read_h5ad(args.input_anndata)

    # read in the tab-separated network input file
    network = pd.read_csv(args.input_network, sep="\t")
    check_network_columns(network, args.source, args.target, args.weight)

    run_inference(adata, network, args)
    write_outputs(adata, args.method, args.output, args.activities_path)


if __name__ == "__main__":
    main()
129 changes: 129 additions & 0 deletions tools/tertiary-analysis/decoupler/decoupler_pathway_inference.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
<tool id="decoupler_pathway_inference" name="Decoupler Pathway Inference" version="1.4.0+galaxy0" profile="20.05" license="MIT">
<description>
of functional genesets/pathways for scRNA-seq data.
</description>
<requirements>
<requirement type="package" version="1.4.0">decoupler</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
    ## Run the wrapper script shipped next to this tool definition.
    python '$__tool_directory__/decoupler_pathway_inference.py'
        -i '$input_anndata'
        -n '$input_network_file'
        --min_n '$min_n'
        --method '$method'
        $use_raw
        ## Column names are free text: quote them, and omit the flag when the
        ## field is left empty so that the script falls back to its own default.
        #if str($source).strip()
            --source '$source'
        #end if
        #if str($target).strip()
            --target '$target'
        #end if
        #if str($weight).strip()
            --weight '$weight'
        #end if
        ## The script appends ".tsv" to this prefix (collected as inference.tsv).
        --output 'inference'
        $write_activities_path
]]></command>
<inputs>
    <!-- Expression data and the prior-knowledge network to score it with. -->
    <param name="input_anndata" type="data" format="h5ad" label="Input AnnData file" />
    <param name="input_network_file" type="data" format="tabular" label="Input Network file" help="Tabular file with source, target and weight columns (column names are matched exactly and can be set below). A source gene/pathway regulates/contains a target gene; weights can be either positive or negative. The source element needs to be part of the network; the target is a gene in the network and in the dataset." />
    <param name="min_n" type="integer" min="0" value="5" label="Minimum targets per source" help="Sources with fewer targets than this minimum are removed." />
    <param name="method" type="select" label="Activity inference method">
        <option value="mlm" selected="true">Multivariate linear model (MLM)</option>
        <option value="ulm">Univariate linear model (ULM)</option>
    </param>
    <!-- Boolean parameters expand directly to the command-line flag they stand for. -->
    <param name="use_raw" type="boolean" truevalue="--use_raw" falsevalue="" checked="false" label="Use the raw part of the AnnData object" />
    <param name="write_activities_path" type="boolean" truevalue="--activities_path anndata_activities_path.h5ad" falsevalue="" checked="true" label="Write the activities AnnData object" help="Contains the MLM/ULM activity results for each pathway and each cell in the main matrix; it is not a replacement of the original AnnData provided as input." />
    <!-- Column names of the network file. -->
    <param name="source" type="text" value="source" label="Column name in network with source nodes" help="Defaults to 'source'." />
    <param name="target" type="text" value="target" label="Column name in network with target nodes" help="Defaults to 'target'." />
    <param name="weight" type="text" value="weight" label="Column name in network with weights" help="Defaults to 'weight'." />
</inputs>
<!-- Both outputs are collected from fixed file names in the job working directory. -->
<outputs>
<!-- Activities AnnData; only collected when the write_activities_path option is enabled,
     which is also what makes the command pass this file name to the script. -->
<data name="output_ad" format="h5ad" from_work_dir="anndata_activities_path.h5ad" label="${tool.name} on ${on_string}: Regulators/Pathways activity AnnData file">
<filter>write_activities_path</filter>
</data>
<!-- Table of estimates and p-values; the script appends ".tsv" to the "inference" prefix
     given on the command line. -->
<data name="output_table" format="tabular" from_work_dir="inference.tsv" label="${tool.name} on ${on_string}: Output estimate table" />
</outputs>
<tests>
    <!-- MLM run: both the activities AnnData and the estimate table are expected. -->
    <test expect_num_outputs="2">
        <param name="input_anndata" value="pbmc3k_processed.h5ad"/>
        <param name="input_network_file" value="progeny_test.tsv"/>
        <param name="min_n" value="0"/>
        <param name="method" value="mlm"/>
        <param name="use_raw" value="false"/>
        <param name="write_activities_path" value="true"/>
        <param name="source" value="source"/>
        <param name="target" value="target"/>
        <param name="weight" value="weight"/>
        <output name="output_ad">
            <assert_contents>
                <has_h5_keys keys="obsm/mlm_estimate"/>
            </assert_contents>
        </output>
        <output name="output_table">
            <assert_contents>
                <has_n_columns n="5"/>
            </assert_contents>
        </output>
    </test>
    <!-- ULM run: same expectations as above, checked against the ULM result key. -->
    <test expect_num_outputs="2">
        <param name="input_anndata" value="pbmc3k_processed.h5ad"/>
        <param name="input_network_file" value="progeny_test_2.tsv"/>
        <param name="min_n" value="0"/>
        <param name="method" value="ulm"/>
        <param name="use_raw" value="false"/>
        <param name="write_activities_path" value="true"/>
        <param name="source" value="source"/>
        <param name="target" value="target"/>
        <param name="weight" value="weight"/>
        <output name="output_ad">
            <assert_contents>
                <has_h5_keys keys="obsm/ulm_estimate"/>
            </assert_contents>
        </output>
        <output name="output_table">
            <assert_contents>
                <has_n_columns n="5"/>
            </assert_contents>
        </output>
    </test>
</tests>
<help>
**What it does**

Usage
.....


**Description**

This tool infers pathway (or regulator) activities for each cell of a single-cell RNA-seq dataset using decoupler.

**Input**

The input file should be an AnnData object in H5AD format. The tool accepts an H5AD file containing raw or normalized data.

The tool also takes a network file containing a collection of pathways and their target genes, with a weight for each interaction.

Example::

    	source	target	weight
    0	T1	G01	1.0
    1	T1	G02	1.0
    2	T1	G03	0.7
    3	T2	G04	1.0
    4	T2	G06	-0.5

You can also choose to use the raw data stored in the AnnData object instead of the X matrix (the "use_raw" parameter) and set the minimum number of targets per source (the "min_n" parameter).


**Output**

The tool always writes a tab-separated text file containing the activity estimates and their p-values for each cell.

If the "write_activities_path" parameter is set to "true", the tool also writes an activities AnnData object to an H5AD file. Its main matrix contains the inferred activity of each pathway in each cell; it is not a replacement for the AnnData provided as input.



</help>
<!-- Publication describing decoupler; the DOI must not carry surrounding whitespace. -->
<citations>
    <citation type="doi">10.1093/bioadv/vbac016</citation>
</citations>
</tool>
16 changes: 16 additions & 0 deletions tools/tertiary-analysis/decoupler/get_test_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,19 @@ function get_data {
# Download every fixture into test-data/. MTX_LINK and BASENAME_FILE are
# re-assigned before each call; expansions are quoted so that a URL or file
# name containing spaces or glob characters is passed as a single argument.
mkdir -p test-data
pushd test-data
get_data "$MTX_LINK" "$BASENAME_FILE"


# Download input anndata for decoupler-pathway_inference
BASENAME_FILE='pbmc3k_processed.h5ad'

MTX_LINK='https://zenodo.org/records/3752813/files/pbmc3k_processed.h5ad'

get_data "$MTX_LINK" "$BASENAME_FILE"

# Download output anndata for decoupler-pathway_inference
BASENAME_FILE='test.h5ad'

MTX_LINK='https://zenodo.org/records/10401958/files/test.h5ad'

get_data "$MTX_LINK" "$BASENAME_FILE"

71 changes: 71 additions & 0 deletions tools/tertiary-analysis/decoupler/test-data/progeny_test.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
source target weight p_value
0 Androgen TMPRSS2 11.490631 0.0
1 Androgen NKX3-1 10.622551 2.2e-44
2 Androgen MBOAT2 10.472733 4.6e-44
3 Androgen KLK2 10.176186 1.94441e-40
4 Androgen SARG 11.386852 2.79021e-40
5 EGFR LZTFL1 -1.8738769 2.0809955e-18
6 EGFR PHLDA2 3.5051384 2.0530624e-17
7 EGFR DUSP6 12.6293125 6.537324e-17
8 EGFR DUSP5 7.9430394 6.86669e-17
9 EGFR PHLDA1 6.619626 3.4106933e-16
10 Estrogen GREB1 17.240173 0.0
11 Estrogen RET 10.718027 0.0
12 Estrogen TFF1 14.430255 0.0
13 Estrogen HEY2 11.482369 3.1e-44
14 Estrogen RAPGEFL1 10.544896 5.2e-43
15 Hypoxia FAM162A 8.335551 0.0
16 Hypoxia NDRG1 22.08712 0.0
17 Hypoxia ENO2 14.32694 0.0
18 Hypoxia PDK1 13.120449 0.0
19 Hypoxia ANKRD37 8.484976 0.0
20 JAK-STAT OAS1 15.028714 1.058e-41
21 JAK-STAT HERC6 8.769676 1.3450407e-38
22 JAK-STAT OAS3 10.618842 1.2143582e-37
23 JAK-STAT PLSCR1 8.481604 8.955206e-37
24 JAK-STAT DDX60 12.198234 9.150971e-36
25 MAPK DUSP6 16.859016 0.0
26 MAPK SPRED2 3.5018346 0.0
27 MAPK SPRY2 9.481585 9.19e-43
28 MAPK ETV5 5.9887094 6.7425e-41
29 MAPK EPHA2 6.3140125 3.7492e-40
30 NFkB NFKB1 9.513637 0.0
31 NFkB CXCL3 22.946114 0.0
32 NFkB NFKB2 5.5155754 0.0
33 NFkB NFKBIA 11.444533 0.0
34 NFkB BCL2A1 14.416924 0.0
35 PI3K MLANA -9.985743 1.84e-43
36 PI3K PMEL -6.5903482 6.8747866e-36
37 PI3K FAXDC2 -12.421274 3.297515e-34
38 PI3K HSD17B8 -8.601571 9.948224e-34
39 PI3K CTSF -9.172143 1.0235212e-31
40 TGFb LINC00312 4.428987 2.0074443e-17
41 TGFb TSPAN2 5.502326 3.1451768e-16
42 TGFb SMAD7 7.6311436 7.3087106e-16
43 TGFb NOX4 5.913813 3.8292238e-15
44 TGFb COL4A1 6.3374896 9.052501e-15
45 TNFa CSF2 8.35548 0.0
46 TNFa CXCL5 10.0813675 0.0
47 TNFa NFKBIE 10.356205 0.0
48 TNFa TNFAIP3 35.40072 0.0
49 TNFa EFNA1 18.63111 0.0
50 Trail FRMPD1 -2.2346141 9.378505e-07
51 Trail WT1-AS 2.2251053 2.0316747e-06
52 Trail WNT8A -1.8469616 3.795469e-05
53 Trail GPR18 3.240805 6.1090715e-05
54 Trail TEC 2.0513217 6.32898e-05
55 VEGF CRACD -4.87119 6.7185365e-25
56 VEGF VWA8 -3.6068044 1.4495265e-18
57 VEGF NLGN1 -5.618075 2.6587072e-18
58 VEGF NRG3 -5.823747 1.0848074e-16
59 VEGF KCNK10 2.8833063 1.8129868e-16
60 WNT BMP4 5.936831 2.511717e-10
61 WNT SIGLEC6 2.0207362 2.347858e-09
62 WNT NPY2R 1.3872339 8.666917e-09
63 WNT CSF3R 1.9323153 3.0219417e-07
64 WNT KRT23 4.1216116 5.463989e-07
65 p53 GLS2 6.452465 7.444302e-37
66 p53 MDM2 8.193488 2.1194304e-35
67 p53 ZNF79 4.020263 4.5987433e-34
68 p53 FDXR 11.994496 5.589482e-32
69 p53 LCE1B 11.813737 7.8095406e-30
Loading

0 comments on commit 1034a45

Please sign in to comment.