minor clean up and README update

aberthel · Apr 6, 2021 · 9f626d7 · 9f626d7
1 parent 6f31e5f
commit 9f626d7
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 21 deletions.
diff --git a/PIPELINE_aggregate_over_genomic_regions.mk b/PIPELINE_aggregate_over_genomic_regions.mk
@@ -53,9 +53,8 @@ all:$(PREFIX)_hap1_allele_ratios.region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cn
 
 
 # todo: this seems to work, but the way it deals with paths, filenames, etc needs to be cleaned up
-# currently, keeping JC's betabinomial scripts with as little modifications as possible
-# here it is even worse
-# fix columns
+# currently, keeping alleleDB betabinomial scripts with as little modifications as possible
+# fix columns and all the workaround with the .tmp files
 $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
 	awk '{print $$1"\t"$$2"\t"$$3"\t"$$2"\t"$$3"\t0\t0\t"$$2+$$3"\t"$$4"\t1.0"}' $< | \
 		sed 's/hap1_count\thap2_count\t0\t0\t0/cA\tcC\tcG\tcT\tsum_ref_n_alt_cnts/' | \
@@ -76,7 +75,6 @@ $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(
 	#rm $(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-tot.$(Cntthresh_min)-min.tsv.tmp \
 	#	$(PREFIX)_region_FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tmp
 
-# todo fix colnames
 $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
 	python $(PL)/FalsePos_regions.py $< $(FDR_SIMS) $(FDR_CUTOFF) > $(PREFIX)_region_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt
 	cat $< | python $(PL)/filter_regions_by_pval.py $(PREFIX)_region_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt > $@
@@ -111,10 +109,6 @@ $(PREFIX)_region_raw_counts_mmap.tsv: hets_regions_hap2.bed
 $(PREFIX)_hap1_allele_ratios.region_raw_counts.pdf: $(PREFIX)_region_raw_counts_uniq.tsv
 	Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) region_raw_counts hap1_allele_ratio
 
-# allowing multiple UNIQ_ALN_FILES files, but probably slower (since became -b instead of -a), also changes to intersect2counts.py to accomodate output file format
-#$(PREFIX)_region_counts.tsv: hets_regions_hap1.bed hets_regions_hap2.bed
-#        $(BEDTOOLS_intersectBed) -a $(UNIQ_ALN_FILES) -b hets_regions_hap1.bed hets_regions_hap2.bed -split -wb -bed | \
-#                python $(PL)/intersect2counts.py > $@
 
 
 # -split -- not counting reads that splice over a het

diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ make -f makePersonalGenome.mk \
 
 
 
-## (1) Calling AS+ hetSNVs from a single sample
+## Calling AS+ hetSNVs from a single sample
 ### Makefile options (can be specified in PIPELINE.mk or as command-line arguments):
 #### Dependencies:
 ##### python2 
@@ -88,7 +88,7 @@ ENCFF337ZBN_ENCFF481IQE_interestingHets.FDR-0.10.betabinom.chrs1-22.6-tot_0-min_
 
 
 
-## (2) Pool replicates / tissues
+## (2) Pool replicates/tissues
 
 ### Makefile options (can be specified in PIPELINE_aggregated_counts.mk or as command-line arguments)
 #### Dependencies, system parameters/paths:  
@@ -101,10 +101,10 @@ ENCFF337ZBN_ENCFF481IQE_interestingHets.FDR-0.10.betabinom.chrs1-22.6-tot_0-min_
 ``PREFIX``: prefix for output file names   
 ``Cntthresh_tot``: threshold for the total number of reads mapped to hetSNV    
 ``Cntthresh_min``: threshold for the minimal number of reads mapped to each allele   
+``FDR_CUTOFF``: FDR threshold  
 
-
+### Example:
 ```
-
 make -f PIPELINE_aggregated_counts.mk \
         PGENOME_DIR=pgenome_ENC-003 \
         INPUT_UNIQ_READS_PILEUP_FILES="../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_hap1_uniqreads.mpileup    ../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_hap2_uniqreads.mpileup ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_hap1_uniqreads.mpileup ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_hap2_uniqreads.mpileup" \
@@ -118,19 +118,32 @@ The main output file: ENCSR238ZZD_interestingHets.FDR-0.10.binom.chrs1-22.6-tot_
 
 
 
-## (3) Aggregate across genomic elements, e.g. genes. ### 
+## (3) Aggregate across genomic elements, e.g. genes.
 
+### Makefile options (can be specified in PIPELINE_aggregated_counts.mk or as command-line arguments)
+#### Dependencies, system parameters/paths:  
+``PL``: path to AlleleSeq2   
+``BEDTOOLS_intersectBed``: Bedtools intersectBed   
+``SAMTOOLS``: Samtools  
+#### Other options: 
+``PREFIX``: prefix for output file names   
+``Cntthresh_tot``: threshold for the total number of reads mapped to hetSNV     
+``Cntthresh_min``: threshold for the minimal number of reads mapped to each allele     
+``FDR_CUTOFF``: FDR threshold    
+``REGIONS_FILE``: bath to a .bed file with genomic elments (e.g. gene, cCRE) coordinates  
+``UNIQ_ALN_FILES``: .bam(s) with uniquely mapping reads generated by (1) or (2)   
+``MMAP_ALN_FILES``: .bam(s) with multimapping reads generated by (1) or (2)  
+
+
+### Example:
 ```
-make -f ~/bin/AlleleSeq2/PIPELINE_aggregate_over_genomic_regions.mk \
+make -f   PIPELINE_aggregate_over_genomic_regions.mk \
 	  PREFIX=ENCSR238ZZD \
-	  REGIONS_FILE="../../../../pgenomes_20191127/gencode.v24_all_genes.bed" \
-	  COUNTS_FILE=../ENCSR238ZZD_combined/ENCSR238ZZD_filtered_counts.tsv \
-	  UNIQ_ALN_FILES='../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam' \
-	  MMAP_ALN_FILES='../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam' \
+	  REGIONS_FILE="gencode.v24_all_genes.bed" \
+	  COUNTS_FILE=ENCSR238ZZD_filtered_counts.tsv \
+	  UNIQ_ALN_FILES='ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam' \
+	  MMAP_ALN_FILES='ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam' \
 	  FDR_CUTOFF=0.10  
 ```
 
-
-All input files should be produced from (1) or (2)   
-REGIONS_FILE: must be a bed file: with gene(region) coordinates and id    
 The main output file with ASE genes: ENCSR238ZZD_interesting_regions.FDR-0.10.betabinom.chrs1-22.6-tot_0-min_cnt.tsv