Merge pull request #82 from kundajelab/python-workflows
Python workflows
Showing 226 changed files with 40,411 additions and 82,862 deletions.
@@ -1,8 +1,4 @@
 include requirements.txt
-include step1_download_bams_and_peaks.sh
-include step2_make_bigwigs_from_bams.sh
-include step3_get_background_regions.sh
-include step4_train_bias_model.sh
-include step5_interpret_bias_model.sh
-include step6_train_chrombpnet_model.sh
-include step7_interpret_chrombpnet_model.sh
+include workflows/*
+include tests/*
+include chrombpnet/data/*
@@ -0,0 +1,180 @@
import chrombpnet.parsers as parsers
import os
from chrombpnet.data import DefaultDataFile, get_default_data_path
from chrombpnet.data import print_meme_motif_file
import chrombpnet.pipelines as pipelines
import copy
import pandas as pd
import logging
logging.getLogger('matplotlib.font_manager').disabled = True


# invoke pipeline modules based on command

def main():
    args = parsers.read_parser()

    if args.cmd == "pipeline" or args.cmd == "train":
        os.makedirs(os.path.join(args.output_dir, "logs"), exist_ok=False)
        os.makedirs(os.path.join(args.output_dir, "auxiliary"), exist_ok=False)
        os.makedirs(os.path.join(args.output_dir, "models"), exist_ok=False)
        os.makedirs(os.path.join(args.output_dir, "evaluation"), exist_ok=False)

        pipelines.chrombpnet_train_pipeline(args)

    elif args.cmd == "qc":
        os.makedirs(os.path.join(args.output_dir, "auxiliary"), exist_ok=False)
        os.makedirs(os.path.join(args.output_dir, "evaluation"), exist_ok=False)

        pipelines.chrombpnet_qc(args)

    elif args.cmd == "bias":
        if args.cmd_bias == "pipeline" or args.cmd_bias == "train":
            os.makedirs(os.path.join(args.output_dir, "logs"), exist_ok=False)
            os.makedirs(os.path.join(args.output_dir, "auxiliary"), exist_ok=False)
            os.makedirs(os.path.join(args.output_dir, "models"), exist_ok=False)
            os.makedirs(os.path.join(args.output_dir, "evaluation"), exist_ok=False)

            pipelines.train_bias_pipeline(args)

        elif args.cmd_bias == "qc":
            os.makedirs(os.path.join(args.output_dir, "auxiliary"), exist_ok=False)
            os.makedirs(os.path.join(args.output_dir, "evaluation"), exist_ok=False)

            pipelines.bias_model_qc(args)

        else:
            print("Command not found")

    elif args.cmd == "pred_bw":
        # at least one of the three model arguments must be supplied
        assert (args.bias_model is None) + (args.chrombpnet_model is None) + (args.chrombpnet_model_nb is None) < 3, "No input model provided!"
        import chrombpnet.evaluation.make_bigwigs.predict_to_bigwig as predict_to_bigwig

        predict_to_bigwig.main(args)

    elif args.cmd == "contribs_bw":
        import chrombpnet.evaluation.interpret.interpret as interpret
        interpret.main(args)

        # convert the contribution-score hdf5 outputs to bigwigs
        import chrombpnet.evaluation.make_bigwigs.importance_hdf5_to_bigwig as importance_hdf5_to_bigwig
        if "counts" in args.profile_or_counts:
            args_copy = copy.deepcopy(args)
            args_copy.hdf5 = args_copy.output_prefix + ".counts_scores.h5"
            args_copy.output_prefix = args.output_prefix + ".counts_scores"

            importance_hdf5_to_bigwig.main(args_copy)
        if "profile" in args.profile_or_counts:
            args_copy = copy.deepcopy(args)
            args_copy.hdf5 = args_copy.output_prefix + ".profile_scores.h5"
            args_copy.output_prefix = args.output_prefix + ".profile_scores"

            importance_hdf5_to_bigwig.main(args_copy)

    elif args.cmd == "footprints":
        import chrombpnet.evaluation.marginal_footprints.marginal_footprinting as marginal_footprinting
        marginal_footprinting.main(args)

    elif args.cmd == "snp_score":
        import chrombpnet.evaluation.variant_effect_prediction.snp_scoring as snp_scoring
        snp_scoring.main(args)

    elif args.cmd == "modisco_motifs":
        import chrombpnet
        chrombpnet_src_dir = os.path.dirname(chrombpnet.__file__)
        meme_file = get_default_data_path(DefaultDataFile.motifs_meme)

        modisco_command = "modisco motifs -i {} -n 50000 -o {} -w 500".format(args.h5py, args.output_prefix + "_modisco.h5")
        os.system(modisco_command)
        modisco_command = "modisco report -i {} -o {} -m {}".format(args.output_prefix + "_modisco.h5", args.output_prefix + "_reports", meme_file)
        os.system(modisco_command)

        import chrombpnet.evaluation.modisco.convert_html_to_pdf as convert_html_to_pdf
        convert_html_to_pdf.main(args.output_prefix + "_reports/motifs.html", args.output_prefix + "_reports/motifs.pdf")

    elif args.cmd == "prep":

        if args.cmd_prep == "nonpeaks":

            assert args.inputlen % 2 == 0  # input length should be a multiple of 2

            os.makedirs(args.output_prefix + "_auxiliary/", exist_ok=False)

            # bin the genome into candidate regions annotated with GC content
            from chrombpnet.helpers.make_gc_matched_negatives.get_genomewide_gc_buckets.get_genomewide_gc_bins import get_genomewide_gc
            get_genomewide_gc(args.genome, args.output_prefix + "_auxiliary/genomewide_gc.bed", args.inputlen, args.stride)

            # get gc content in peaks
            import chrombpnet.helpers.make_gc_matched_negatives.get_gc_content as get_gc_content
            args_copy = copy.deepcopy(args)
            args_copy.input_bed = args_copy.peaks
            args_copy.output_prefix = args.output_prefix + "_auxiliary/foreground.gc"
            get_gc_content.main(args_copy)

            # prepare candidate negatives: exclude peaks (and blacklist regions, if given) padded by half the input length
            exclude_bed = pd.read_csv(args.peaks, sep="\t", header=None)
            os.system("bedtools slop -i {peaks} -g {chrom_sizes} -b {flank_size} > {output}".format(peaks=args.peaks,
                      chrom_sizes=args.chrom_sizes,
                      flank_size=args.inputlen // 2,
                      output=args.output_prefix + "_auxiliary/peaks_slop.bed"))
            exclude_bed = pd.read_csv(args.output_prefix + "_auxiliary/peaks_slop.bed", sep="\t", header=None, usecols=[0, 1, 2])

            if args.blacklist_regions:
                os.system("bedtools slop -i {blacklist} -g {chrom_sizes} -b {flank_size} > {output}".format(blacklist=args.blacklist_regions,
                          chrom_sizes=args.chrom_sizes,
                          flank_size=args.inputlen // 2,
                          output=args.output_prefix + "_auxiliary/blacklist_slop.bed"))

                exclude_bed = pd.concat([exclude_bed, pd.read_csv(args.output_prefix + "_auxiliary/blacklist_slop.bed", sep="\t", header=None, usecols=[0, 1, 2])])

            exclude_bed.to_csv(args.output_prefix + "_auxiliary/exclude_unmerged.bed", sep="\t", header=False, index=False)
            os.system("bedtools sort -i {inputb} | bedtools merge -i stdin > {output}".format(inputb=args.output_prefix + "_auxiliary/exclude_unmerged.bed",
                      output=args.output_prefix + "_auxiliary/exclude.bed"))

            # candidate negatives are genomewide bins that do not overlap the exclusion regions
            bedtools_command = "bedtools intersect -v -a {genomewide_gc} -b {exclude_bed} > {candidate_bed}".format(
                genomewide_gc=args.output_prefix + "_auxiliary/genomewide_gc.bed",
                exclude_bed=args.output_prefix + "_auxiliary/exclude.bed",
                candidate_bed=args.output_prefix + "_auxiliary/candidates.bed")
            os.system(bedtools_command)

            # get final negatives
            import chrombpnet.helpers.make_gc_matched_negatives.get_gc_matched_negatives as get_gc_matched_negatives
            args_copy = copy.deepcopy(args)
            args_copy.candidate_negatives = args.output_prefix + "_auxiliary/candidates.bed"
            args_copy.foreground_gc_bed = args.output_prefix + "_auxiliary/foreground.gc.bed"
            args_copy.output_prefix = args.output_prefix + "_auxiliary/negatives"

            get_gc_matched_negatives.main(args_copy)

            # pad the negatives out to 10 columns (narrowPeak-like) with placeholder values
            negatives = pd.read_csv(args.output_prefix + "_auxiliary/negatives.bed", sep="\t", header=None)
            negatives[3] = "."
            negatives[4] = "."
            negatives[5] = "."
            negatives[6] = "."
            negatives[7] = "."
            negatives[8] = "."
            negatives[9] = 1057
            negatives.to_csv(args.output_prefix + "_negatives.bed", sep="\t", header=False, index=False)

        elif args.cmd_prep == "splits":
            import chrombpnet.helpers.make_chr_splits.splits as splits
            splits.main(args)

        else:
            print("Command not found")

    else:
        print("Command not found")


if __name__ == "__main__":
    main()
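One design note on the dispatcher above: each evaluation module is imported inside its own elif branch, so a subcommand only pays the import cost of the code path it actually runs. As an illustration, here is a minimal, self-contained sketch of the same lazy-import pattern; the module paths are the ones referenced above, while the dispatch helper itself is illustrative and not part of chrombpnet:

import importlib

# Subcommand -> module path (paths taken from the branches above).
# importlib.import_module defers loading until the command is chosen.
_COMMANDS = {
    "footprints": "chrombpnet.evaluation.marginal_footprints.marginal_footprinting",
    "snp_score": "chrombpnet.evaluation.variant_effect_prediction.snp_scoring",
}

def dispatch(cmd, args):
    module = importlib.import_module(_COMMANDS[cmd])  # imported lazily
    return module.main(args)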
@@ -1,2 +0,0 @@
print("WARNING: IF upgrading from v1.0 or v1.1 to v1.2, note that chrombpnet has undergone linting to generate a modular structure for release on pypi."
      "Hard-coded script paths are no longer necessary. Please refer to the updated README to ensure your script calls are compatible with v1.2")
@@ -0,0 +1,22 @@
from importlib import resources
from enum import Enum


class DefaultDataFile(Enum):
    atac_ref_motifs = "ATAC.ref.motifs.txt"
    dnase_ref_motifs = "DNASE.ref.motifs.txt"
    motif_to_pwm_atac = "motif_to_pwm.ATAC.tsv"
    motif_to_pwm_dnase = "motif_to_pwm.DNASE.tsv"
    motif_to_pwm_tf = "motif_to_pwm.TF.tsv"
    motifs_meme = "motifs.meme.txt"


def get_default_data_path(default_data_file_entry):
    with resources.path("chrombpnet.data", default_data_file_entry.value) as f:
        data_file_path = f
    return data_file_path


def print_meme_motif_file():
    with resources.path("chrombpnet.data", DefaultDataFile.motifs_meme.value) as f:
        data_file_path = f
    print(data_file_path)
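For reference, a minimal usage sketch for the helpers above (assuming chrombpnet is installed so that importlib.resources can locate the packaged data files):

from chrombpnet.data import DefaultDataFile, get_default_data_path, print_meme_motif_file

# Resolve the path of the bundled MEME motif file.
meme_path = get_default_data_path(DefaultDataFile.motifs_meme)
print(meme_path)

# Convenience helper that prints the same path.
print_meme_motif_file()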