Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/macs3/narrowpeak #645

Merged
merged 39 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
e8f80c1
use narrowPeak instead of gappedPeak
philippadoherty May 2, 2024
1bac081
correct 'open' start/end pos
philippadoherty May 2, 2024
9212f58
deal with nuc-open-nuc-open-nuc type regions
philippadoherty May 2, 2024
cf5603d
update hmmratac results
taoliu May 7, 2024
bd77e33
require nuc-open-nuc region len > openregion_minlen
philippadoherty May 8, 2024
3e90221
clarify the descriptions of __close_peak and __close_peak2 functions
taoliu May 14, 2024
4f0ae45
clarify the descriptions of enforce_peakyness function
taoliu May 14, 2024
781f5b5
enable test for SignalProcessing
taoliu May 14, 2024
eb9d659
fix the way to call __close_peak() -- there is no smoothlen
taoliu May 14, 2024
9276ace
fix the way to call __close_peak() -- there is no smoothlen
taoliu May 14, 2024
aaa2557
fix some description
taoliu May 14, 2024
2467241
update instructions for conda install
taoliu Mar 28, 2024
856a86f
update hmmratac
taoliu Mar 28, 2024
f1f44d6
add cutoff-analysis-steps to hmmratac
taoliu Apr 9, 2024
ff61e5e
cutoff-analysis-steps option for bdgpeakcall and hmmratac
taoliu Apr 10, 2024
7151ca9
fix a missing url
taoliu Apr 10, 2024
a227a6b
#637 -O3 replaces -Ofast
taoliu Apr 10, 2024
e388627
update docs
taoliu Apr 13, 2024
93f89ea
reorg md files
taoliu Apr 13, 2024
fd0493e
add more format related files
taoliu Apr 13, 2024
3d68904
update
taoliu Apr 13, 2024
22ef76b
update docs
taoliu Apr 25, 2024
0e98008
update docs
taoliu Apr 26, 2024
a2597b8
reorg docs
taoliu Apr 26, 2024
97a3a4b
reorg docs
taoliu Apr 26, 2024
3dd01c3
add option --cutoff-analysis-max for hmmratac
taoliu Apr 26, 2024
16de42b
update changelog
taoliu Apr 27, 2024
84a1d2c
edit adv tutorial. Add empty files for file formats.
taoliu May 3, 2024
c6d20d8
cutoff analysis
taoliu May 3, 2024
52f121c
handle error correctly #643
taoliu May 7, 2024
b798015
add two index md files
taoliu May 8, 2024
1e94870
cutoff-analysis-max for bdgpeakcall;output rows with 0 peak in cutoff…
taoliu May 14, 2024
7b26297
still skip cutoffs with 0 peak since avepeak need to be computed
taoliu May 14, 2024
52ce9e7
update hmmratac narrowpeak output since we now allow some short peaks
taoliu May 14, 2024
3d32000
skip the test on SignalProcessing
taoliu May 14, 2024
0ae471c
move IO functions for bedGraph to BedGraphIO class
taoliu May 15, 2024
769b8e1
new function to find peak summit and score by giving peak regions to …
taoliu May 15, 2024
3b9c17c
let hmmratac output narrowpeak with summit and score from foldchange …
taoliu May 16, 2024
e03da87
Merge branch 'master' into feat/macs3/narrowpeak
taoliu May 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions MACS3/Commands/bdgbroadcall_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-12-03 14:26:29 Tao Liu>
# Time-stamp: <2024-05-15 10:41:43 Tao Liu>

"""Description: Fine-tuning script to call broad peaks from a single
bedGraph track for scores.
Expand Down Expand Up @@ -41,7 +41,7 @@
def run( options ):
info("Read and build bedGraph...")
bio = BedGraphIO.bedGraphIO(options.ifile)
btrack = bio.build_bdgtrack(baseline_value=0)
btrack = bio.read_bedGraph(baseline_value=0)

info("Call peaks from bedGraph...")

Expand Down
9 changes: 5 additions & 4 deletions MACS3/Commands/bdgcmp_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-30 13:49:34 Tao Liu>
# Time-stamp: <2024-05-15 10:13:23 Tao Liu>

"""Description: compare bdg files

Expand Down Expand Up @@ -39,11 +39,11 @@ def run( options ):

info("Read and build treatment bedGraph...")
tbio = BedGraphIO.bedGraphIO(options.tfile)
tbtrack = tbio.build_bdgtrack()
tbtrack = tbio.read_bedGraph()

info("Read and build control bedGraph...")
cbio = BedGraphIO.bedGraphIO(options.cfile)
cbtrack = cbio.build_bdgtrack()
cbtrack = cbio.read_bedGraph()

info("Build ScoreTrackII...")
sbtrack = tbtrack.make_ScoreTrackII_for_macs( cbtrack, depth1 = pseudo_depth, depth2 = pseudo_depth )
Expand Down Expand Up @@ -86,6 +86,7 @@ def run( options ):
raise Exception("Can't reach here!")

info("Write bedGraph of scores...")
ofhd = open(ofile,"w")
ofhd = open( ofile, "w" )
# write_bedGraph function for ScoreTrack
sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper()), column = 3)
info("Finished '%s'! Please check '%s'!" % (method, ofile))
10 changes: 5 additions & 5 deletions MACS3/Commands/bdgdiff_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-26 17:04:26 Tao Liu>
# Time-stamp: <2024-05-15 10:42:27 Tao Liu>

"""Description: Naive call differential peaks from 4 bedGraph tracks for scores.

Expand Down Expand Up @@ -47,19 +47,19 @@ def run( options ):

info("Read and build treatment 1 bedGraph...")
t1bio = BedGraphIO.bedGraphIO(options.t1bdg)
t1btrack = t1bio.build_bdgtrack()
t1btrack = t1bio.read_bedGraph()

info("Read and build control 1 bedGraph...")
c1bio = BedGraphIO.bedGraphIO(options.c1bdg)
c1btrack = c1bio.build_bdgtrack()
c1btrack = c1bio.read_bedGraph()

info("Read and build treatment 2 bedGraph...")
t2bio = BedGraphIO.bedGraphIO(options.t2bdg)
t2btrack = t2bio.build_bdgtrack()
t2btrack = t2bio.read_bedGraph()

info("Read and build control 2 bedGraph...")
c2bio = BedGraphIO.bedGraphIO(options.c2bdg)
c2btrack = c2bio.build_bdgtrack()
c2btrack = c2bio.read_bedGraph()

depth1 = options.depth1
depth2 = options.depth2
Expand Down
11 changes: 5 additions & 6 deletions MACS3/Commands/bdgopt_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-24 16:46:33 Tao Liu>
# Time-stamp: <2024-05-15 11:15:48 Tao Liu>

"""Description: Modify bedGraph file

Expand Down Expand Up @@ -40,7 +40,7 @@ def run( options ):

info("Read and build bedGraph...")
bio = BedGraphIO.bedGraphIO(options.ifile)
btrack = bio.build_bdgtrack(baseline_value=0)
btrack = bio.read_bedGraph(baseline_value=0)

info("Modify bedGraph...")
if options.method.lower() == "p2q":
Expand All @@ -57,11 +57,10 @@ def run( options ):
elif options.method.lower() == "min":
btrack.apply_func( lambda x: x if x< extraparam else extraparam )

ofile = os.path.join( options.outdir, options.ofile )
ofile = BedGraphIO.bedGraphIO( os.path.join( options.outdir, options.ofile ), data = btrack )
info("Write bedGraph of modified scores...")
ofhd = open(ofile,"w")
btrack.write_bedGraph(ofhd,name="%s_modified_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
ofile.write_bedGraph(name="%s_modified_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile.bedGraph_filename))



6 changes: 3 additions & 3 deletions MACS3/Commands/bdgpeakcall_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2023-11-02 11:52:48 Tao Liu>
# Time-stamp: <2024-05-15 10:43:03 Tao Liu>

"""Description: Naive call peaks from a single bedGraph track for
scores.
Expand Down Expand Up @@ -39,11 +39,11 @@
def run( options ):
info("Read and build bedGraph...")
bio = BedGraphIO.bedGraphIO(options.ifile)
btrack = bio.build_bdgtrack(baseline_value=0)
btrack = bio.read_bedGraph(baseline_value=0)

if options.cutoff_analysis:
info("Analyze cutoff vs number of peaks/total length of peaks/average length of peak")
cutoff_analysis_result = btrack.cutoff_analysis( int(options.maxgap), int(options.minlen), min_score = btrack.minvalue, max_score = btrack.maxvalue, steps = int(options.cutoff_analysis_steps) )
cutoff_analysis_result = btrack.cutoff_analysis( int(options.maxgap), int(options.minlen), min_score = btrack.minvalue, max_score = int(options.cutoff_analysis_max), steps = int(options.cutoff_analysis_steps) )
info("Write report...")
if options.ofile:
fhd = open( os.path.join( options.outdir, options.ofile ), 'w' )
Expand Down
14 changes: 7 additions & 7 deletions MACS3/Commands/cmbreps_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-24 16:47:11 Tao Liu>
# Time-stamp: <2024-05-15 11:16:04 Tao Liu>

"""Description: combine replicates

Expand Down Expand Up @@ -38,16 +38,16 @@ def run( options ):
i = 1
for ifile in options.ifile:
info("Read file #%d" % i)
reps.append( BedGraphIO.bedGraphIO( ifile ).build_bdgtrack( ) )
reps.append( BedGraphIO.bedGraphIO( ifile ).read_bedGraph() )
i += 1

# first two reps

info("combining tracks 1-%i with method '%s'" % (i - 1, options.method))
cmbtrack = reps[ 0 ].overlie( [reps[ j ] for j in range(1, i - 1)], func=options.method )
ofile = os.path.join( options.outdir, options.ofile )

# now output
ofile = BedGraphIO.bedGraphIO( os.path.join( options.outdir, options.ofile ), data = cmbtrack )
info("Write bedGraph of combined scores...")
ofhd = open(ofile,"w")
cmbtrack.write_bedGraph(ofhd,name="%s_combined_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
ofile.write_bedGraph(name="%s_combined_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile.bedGraph_filename))

103 changes: 49 additions & 54 deletions MACS3/Commands/hmmratac_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Time-stamp: <2024-04-26 15:46:03 Tao Liu>
# Time-stamp: <2024-05-15 19:40:45 Tao Liu>


"""Description: Main HMMR command

Expand Down Expand Up @@ -27,13 +28,13 @@
from MACS3.Utilities.Constants import *
from MACS3.Utilities.OptValidator import opt_validate_hmmratac
from MACS3.IO.PeakIO import PeakIO
from MACS3.IO.PeakIO import BroadPeakIO
from MACS3.IO.Parser import BAMPEParser, BEDPEParser #BAMaccessor
from MACS3.Signal.HMMR_EM import HMMR_EM
from MACS3.Signal.HMMR_Signal_Processing import generate_weight_mapping, generate_digested_signals, extract_signals_from_regions
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_predict, hmm_model_init, hmm_model_save
from MACS3.Signal.Region import Regions
from MACS3.Signal.BedGraph import bedGraphTrackI
from MACS3.IO.BedGraphIO import bedGraphIO

#from MACS3.IO.BED import BEDreader # this hasn't been implemented yet.

Expand Down Expand Up @@ -61,15 +62,21 @@ def run( args ):
mono_bdgfile = os.path.join( options.outdir, options.name+"_digested_mono.bdg" )
di_bdgfile = os.path.join( options.outdir, options.name+"_digested_di.bdg" )
tri_bdgfile = os.path.join( options.outdir, options.name+"_digested_tri.bdg" )

training_region_bedfile = os.path.join( options.outdir, options.name+"_training_regions.bed" )
training_datafile = os.path.join( options.outdir, options.name+"_training_data.txt" )
training_datalengthfile = os.path.join( options.outdir, options.name+"_training_lengths.txt" )

hmm_modelfile = os.path.join( options.outdir, options.name+"_model.json" )

open_state_bdgfile = os.path.join( options.outdir, options.name+"_open.bdg" )
nuc_state_bdgfile = os.path.join( options.outdir, options.name+"_nuc.bdg" )
bg_state_bdgfile = os.path.join( options.outdir, options.name+"_bg.bdg" )

states_file = os.path.join( options.outdir, options.name+"_states.bed" )
accessible_file = os.path.join( options.outdir, options.name+"_accessible_regions.gappedPeak" )

accessible_file = os.path.join( options.outdir, options.name+"_accessible_regions.narrowPeak" )

cutoffanalysis_file = os.path.join( options.outdir, options.name+"_cutoff_analysis.tsv" )

#############################################
Expand Down Expand Up @@ -158,18 +165,17 @@ def run( args ):

# save three types of signals if needed
if options.save_digested:
fhd = open(short_bdgfile,"w")
digested_atac_signals[ 0 ].write_bedGraph(fhd, "short","short")
fhd.close()
fhd = open(mono_bdgfile,"w")
digested_atac_signals[ 1 ].write_bedGraph(fhd, "mono","mono")
fhd.close()
fhd = open(di_bdgfile,"w")
digested_atac_signals[ 2 ].write_bedGraph(fhd, "di","di")
fhd.close()
fhd = open(tri_bdgfile,"w")
digested_atac_signals[ 3 ].write_bedGraph(fhd, "tri","tri")
fhd.close()
bdgshort = BedGraphIO( short_bdgfile, data = digested_atac_signals[ 0 ] )
bdgshort.write_bedGraph("short","short")

bdgmono = BedGraphIO( mono_bdgfile, data = digested_atac_signals[ 1 ] )
bdgmono.write_bedGraph("mono", "mono")

bdgdi = BedGraphIO( di_bdgfile, data = digested_atac_signals[ 2 ] )
bdgdi.write_bedGraph("di", "di")

bdgtri = BedGraphIO( tri_bdgfile, data = digested_atac_signals[ 3 ] )
bdgtri.write_bedGraph("tri", "tri")

minlen = int(petrack.average_template_length)
# if options.pileup_short is on, we pile up only the short fragments to identify training
Expand Down Expand Up @@ -389,10 +395,9 @@ def run( args ):
# Note: we implement in a way that we will decode the candidate regions 10000 regions at a time so 1. we can make it running in parallel in the future; 2. we can reduce the memory usage.
options.info( f"# Use HMM to predict states")
n = 0
#predicted_proba_file_name = "predicted_proba.csv"
#predicted_proba_file = open(predicted_proba_file_name, "wb")

# we create a temporary file to save the proba predicted from hmm
predicted_proba_file = tempfile.TemporaryFile(mode="w+b")
# predicted_proba_file = open("predicted_proba.csv", "w+b")

while candidate_regions.total != 0:
n += 1
Expand All @@ -414,11 +419,8 @@ def run( args ):
cr_data_lengths = []
cr_bins = []
prob_data = []

#options.debug( "# clean up complete")
gc.collect()

#predicted_proba_file.close()
predicted_proba_file.seek(0) # reset
options.info( f"# predicted_proba files written...")

Expand Down Expand Up @@ -449,25 +451,30 @@ def run( args ):
# # Generate states path:
states_path = generate_states_path( predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region )
options.info( f"# finished generating states path")
predicted_proba_file.close() #kill the tempfile
predicted_proba_file.close() #kill the temp file
# Save states path if needed
# PS: we need to implement extra feature to include those regions NOT in candidate_bins and assign them as 'background state'.
if options.save_states:
options.info( f"# Write states assignments in a BED file: {options.name}_states.bed" )
with open( states_file, "w" ) as f:
save_states_bed( states_path, f )

options.info( f"# Write accessible regions in a gappedPeak file: {options.name}_accessible_regions.gappedPeak")
options.info( f"# Write accessible regions in a narrowPeak file: {options.name}_accessible_regions.narrowPeak")
with open( accessible_file, "w" ) as ofhd:
save_accessible_regions( states_path, ofhd, options.openregion_minlen )
save_accessible_regions( states_path, ofhd, options.openregion_minlen, fc_bdg )

options.info( f"# Finished")

def save_proba_to_bedGraph( predicted_proba_file, binsize, open_state_bdg_file, nuc_state_bdg_file, bg_state_bdg_file, i_open, i_nuc, i_bg ):
open_state_bdg = bedGraphTrackI( baseline_value = 0 )
nuc_state_bdg = bedGraphTrackI( baseline_value = 0 )
bg_state_bdg = bedGraphTrackI( baseline_value = 0 )

open_state_bdg_file = bedGraphIO( open_state_bdg_file )
nuc_state_bdg_file = bedGraphIO( nuc_state_bdg_file )
bg_state_bdg_file = bedGraphIO( bg_state_bdg_file )

open_state_bdg = open_state_bdg_file.data
nuc_state_bdg = nuc_state_bdg_file.data
bg_state_bdg = bg_state_bdg_file.data

prev_chrom_name = None
prev_bin_end = None

Expand Down Expand Up @@ -501,9 +508,9 @@ def save_proba_to_bedGraph( predicted_proba_file, binsize, open_state_bdg_file,
bg_state_bdg.add_loc( chrname, start_pos, end_pos, pp_bg )
prev_bin_end = end_pos

open_state_bdg.write_bedGraph( open_state_bdg_file, "Open States", "Likelihoods of being Open States" )
nuc_state_bdg.write_bedGraph( nuc_state_bdg_file, "Nucleosomal States", "Likelihoods of being Nucleosomal States" )
bg_state_bdg.write_bedGraph( bg_state_bdg_file, "Background States", "Likelihoods of being Background States" )
open_state_bdg_file.write_bedGraph( "Open States", "Likelihoods of being Open States", trackline = False )
nuc_state_bdg_file.write_bedGraph( "Nucleosomal States", "Likelihoods of being Nucleosomal States", trackline = False )
bg_state_bdg_file.write_bedGraph( "Background States", "Likelihoods of being Background States", trackline = False )
return

def save_states_bed( states_path, states_bedfile ):
Expand All @@ -515,6 +522,7 @@ def save_states_bed( states_path, states_bedfile ):
return

def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg):
# predicted_proba_file is a temporary file
ret_states_path = []
labels_list = ["open", "nuc", "bg"]

Expand Down Expand Up @@ -558,7 +566,7 @@ def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg):
prev_bin_end = end_pos
return ret_states_path

def save_accessible_regions(states_path, accessible_region_file, openregion_minlen):
def save_accessible_regions(states_path, accessible_region_file, openregion_minlen, bdgscore):
# Function to add regions to the list
def add_regions(i, regions):
for j in range(i, i+3):
Expand All @@ -572,31 +580,18 @@ def add_regions(i, regions):
for i in range(len(states_path)-2):
if (states_path[i][3] == 'nuc' and states_path[i+1][3] == 'open' and states_path[i+2][3] == 'nuc' and
states_path[i][2] == states_path[i+1][1] and states_path[i+1][2] == states_path[i+2][1] and
states_path[i+1][2] - states_path[i+1][1] > openregion_minlen):
states_path[i+2][2] - states_path[i][1] > openregion_minlen): # require nuc-open-nuc entire region start/endpos > openregion_minlen
accessible_regions = add_regions(i, accessible_regions)
# Group states by region
list_of_groups = []
one_group = [accessible_regions[0]]
for j in range(1, len(accessible_regions)):
if accessible_regions[j][1] == accessible_regions[j-1][2]:
one_group.append(accessible_regions[j])
else:
list_of_groups.append(one_group)
one_group = [accessible_regions[j]]
accessible_regions = list_of_groups

# remove 'nuc' regions:
accessible_regions = [tup for tup in accessible_regions if tup[3] != 'nuc']

# Generate broadpeak object
broadpeak = BroadPeakIO()
openpeak = PeakIO()
for region in accessible_regions[:-1]:
block_num = sum('open' in tup for tup in region)
block_sizes = ','.join(str(region[j][2] - region[j][1]) for j in range(1, len(region) - 1, 2))
block_starts = ','.join(str(region[j][1] - region[0][1]) for j in range(1, len(region) - 1, 2))
broadpeak.add(region[1][0], region[0][1], region[-1][2],
#bytes(region[1][0], encoding="raw_unicode_escape"), region[0][1], region[-1][2],
thickStart=bytes(str(region[1][1]), encoding="raw_unicode_escape"),
thickEnd=bytes(str(region[-2][2]), encoding="raw_unicode_escape"),
blockNum=block_num,
blockSizes=bytes(block_sizes, encoding="raw_unicode_escape"),
blockStarts=bytes(block_starts, encoding="raw_unicode_escape"))
broadpeak.write_to_gappedPeak(accessible_region_file)
openpeak.add(chromosome=region[0], start=region[1], end=region[2])

# refine peak summit and score using bedGraphTrackI with scores
openpeak = bdgscore.refine_peaks( openpeak )
openpeak.write_to_narrowPeak(accessible_region_file)
return
Loading