Skip to content

Commit

Permalink
minor clean up and README update
Browse files Browse the repository at this point in the history
  • Loading branch information
trgaleev committed Apr 6, 2021
1 parent 6f31e5f commit 9f626d7
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 21 deletions.
10 changes: 2 additions & 8 deletions PIPELINE_aggregate_over_genomic_regions.mk
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,8 @@ all:$(PREFIX)_hap1_allele_ratios.region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cn


# todo: this seems to work, but the way it deals with paths, filenames, etc needs to be cleaned up
# currently, keeping JC's betabinomial scripts with as little modifications as possible
# here it is even worse
# fix columns
# currently, keeping alleleDB betabinomial scripts with as little modifications as possible
# fix columns and all the workaround with the .tmp files
$(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
awk '{print $$1"\t"$$2"\t"$$3"\t"$$2"\t"$$3"\t0\t0\t"$$2+$$3"\t"$$4"\t1.0"}' $< | \
sed 's/hap1_count\thap2_count\t0\t0\t0/cA\tcC\tcG\tcT\tsum_ref_n_alt_cnts/' | \
Expand All @@ -76,7 +75,6 @@ $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(
#rm $(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-tot.$(Cntthresh_min)-min.tsv.tmp \
# $(PREFIX)_region_FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tmp

# todo fix colnames
$(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
python $(PL)/FalsePos_regions.py $< $(FDR_SIMS) $(FDR_CUTOFF) > $(PREFIX)_region_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt
cat $< | python $(PL)/filter_regions_by_pval.py $(PREFIX)_region_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt > $@
Expand Down Expand Up @@ -111,10 +109,6 @@ $(PREFIX)_region_raw_counts_mmap.tsv: hets_regions_hap2.bed
$(PREFIX)_hap1_allele_ratios.region_raw_counts.pdf: $(PREFIX)_region_raw_counts_uniq.tsv
Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) region_raw_counts hap1_allele_ratio

# allowing multiple UNIQ_ALN_FILES files, but probably slower (since became -b instead of -a), also changes to intersect2counts.py to accomodate output file format
#$(PREFIX)_region_counts.tsv: hets_regions_hap1.bed hets_regions_hap2.bed
# $(BEDTOOLS_intersectBed) -a $(UNIQ_ALN_FILES) -b hets_regions_hap1.bed hets_regions_hap2.bed -split -wb -bed | \
# python $(PL)/intersect2counts.py > $@


# -split -- not counting reads that splice over a het
Expand Down
39 changes: 26 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ make -f makePersonalGenome.mk \



## (1) Calling AS+ hetSNVs from a single sample
## Calling AS+ hetSNVs from a single sample
### Makefile options (can be specified in PIPELINE.mk or as command-line arguments):
#### Dependencies:
##### python2
Expand Down Expand Up @@ -88,7 +88,7 @@ ENCFF337ZBN_ENCFF481IQE_interestingHets.FDR-0.10.betabinom.chrs1-22.6-tot_0-min_



## (2) Pool replicates / tissues
## (2) Pool replicates/tissues

### Makefile options (can be specified in PIPELINE_aggregated_counts.mk or as command-line arguments)
#### Dependencies, system parameters/paths:
Expand All @@ -101,10 +101,10 @@ ENCFF337ZBN_ENCFF481IQE_interestingHets.FDR-0.10.betabinom.chrs1-22.6-tot_0-min_
``PREFIX``: prefix for output file names
``Cntthresh_tot``: threshold for the total number of reads mapped to hetSNV
``Cntthresh_min``: threshold for the minimal number of reads mapped to each allele
``FDR_CUTOFF``: FDR threshold


### Example:
```
make -f PIPELINE_aggregated_counts.mk \
PGENOME_DIR=pgenome_ENC-003 \
INPUT_UNIQ_READS_PILEUP_FILES="../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_hap1_uniqreads.mpileup ../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_hap2_uniqreads.mpileup ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_hap1_uniqreads.mpileup ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_hap2_uniqreads.mpileup" \
Expand All @@ -118,19 +118,32 @@ The main output file: ENCSR238ZZD_interestingHets.FDR-0.10.binom.chrs1-22.6-tot_



## (3) Aggregate across genomic elements, e.g. genes. ###
## (3) Aggregate across genomic elements, e.g. genes.

### Makefile options (can be specified in PIPELINE_aggregated_counts.mk or as command-line arguments)
#### Dependencies, system parameters/paths:
``PL``: path to AlleleSeq2
``BEDTOOLS_intersectBed``: Bedtools intersectBed
``SAMTOOLS``: Samtools
#### Other options:
``PREFIX``: prefix for output file names
``Cntthresh_tot``: threshold for the total number of reads mapped to hetSNV
``Cntthresh_min``: threshold for the minimal number of reads mapped to each allele
``FDR_CUTOFF``: FDR threshold
``REGIONS_FILE``: bath to a .bed file with genomic elments (e.g. gene, cCRE) coordinates
``UNIQ_ALN_FILES``: .bam(s) with uniquely mapping reads generated by (1) or (2)
``MMAP_ALN_FILES``: .bam(s) with multimapping reads generated by (1) or (2)


### Example:
```
make -f ~/bin/AlleleSeq2/PIPELINE_aggregate_over_genomic_regions.mk \
make -f PIPELINE_aggregate_over_genomic_regions.mk \
PREFIX=ENCSR238ZZD \
REGIONS_FILE="../../../../pgenomes_20191127/gencode.v24_all_genes.bed" \
COUNTS_FILE=../ENCSR238ZZD_combined/ENCSR238ZZD_filtered_counts.tsv \
UNIQ_ALN_FILES='../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam' \
MMAP_ALN_FILES='../ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam ../ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam' \
REGIONS_FILE="gencode.v24_all_genes.bed" \
COUNTS_FILE=ENCSR238ZZD_filtered_counts.tsv \
UNIQ_ALN_FILES='ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_uniqreads_over_hetSNVs.bam' \
MMAP_ALN_FILES='ENCSR238ZZD_ENCFF719MSG_1_ENCFF120MML_2_1_1/ENCFF719MSG_ENCFF120MML_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam ENCSR238ZZD_ENCFF337ZBN_1_ENCFF481IQE_2_1_1/ENCFF337ZBN_ENCFF481IQE_ASE-params_crdsorted_mmapreads_over_hetSNVs.bam' \
FDR_CUTOFF=0.10
```


All input files should be produced from (1) or (2)
REGIONS_FILE: must be a bed file: with gene(region) coordinates and id
The main output file with ASE genes: ENCSR238ZZD_interesting_regions.FDR-0.10.betabinom.chrs1-22.6-tot_0-min_cnt.tsv

0 comments on commit 9f626d7

Please sign in to comment.