macs3-project · JohnUrban · Jun 4, 2014 · Jun 4, 2014 · Jun 4, 2014 · Jun 14, 2014
diff --git a/.gitignore b/.gitignore
@@ -29,3 +29,5 @@ MACS2/mnumpy.pxd
 MACS2/numpy.pxd
 
 test/NOSPMR.ppois.bdg_ppois.bdg
+
+johnChanges
diff --git a/MACS2/OptValidator.py b/MACS2/OptValidator.py
@@ -185,8 +185,8 @@ def opt_validate ( options ):
             options.argtxt += "# Larger dataset will be scaled towards smaller dataset.\n"
 
     if options.ratio != 1.0:
-	options.argtxt += "# Using a custom scaling factor: %.2e\n" % (options.ratio)
-	
+       options.argtxt += "# Using a custom scaling factor: %.2e\n" % (options.ratio)
+
     if options.cfile:
         options.argtxt += "# Range for calculating regional lambda is: %d bps and %d bps\n" % (options.smalllocal,options.largelocal)
     else:
@@ -804,3 +804,124 @@ def opt_validate_bdgcmp ( options ):
 
     return options
 
+
+
+def opt_validate_calmaxtag ( options ):
+    """Validate options from a OptParser object.
+
+    Ret: Validated options object.
+    """
+    # gsize
+    try:
+        options.gsize = efgsize[options.gsize]
+    except:
+        try:
+            options.gsize = float(options.gsize)
+        except:
+            logging.error("Error when interpreting --gsize option: %s" % options.gsize)
+            logging.error("Available shortcuts of effective genome sizes are %s" % ",".join(efgsize.keys()))
+            sys.exit(1)
+
+    # format
+
+    options.gzip_flag = False           # if the input is gzip file
+
+    options.format = options.format.upper()
+    if options.format == "ELAND":
+        options.parser = ELANDResultParser
+    elif options.format == "BED":
+        options.parser = BEDParser
+    elif options.format == "ELANDMULTI":
+        options.parser = ELANDMultiParser
+    elif options.format == "ELANDEXPORT":
+        options.parser = ELANDExportParser
+    elif options.format == "SAM":
+        options.parser = SAMParser
+    elif options.format == "BAM":
+        options.parser = BAMParser
+        options.gzip_flag = True
+    elif options.format == "BOWTIE":
+        options.parser = BowtieParser
+    elif options.format == "AUTO":
+        options.parser = guess_parser
+    else:
+        logging.error("Format \"%s\" cannot be recognized!" % (options.format))
+        sys.exit(1)
+
+    # uppercase the format string 
+    options.format = options.format.upper()
+
+    # logging object
+    logging.basicConfig(level=10,
+                        format='%(levelname)-5s @ %(asctime)s: %(message)s ',
+                        datefmt='%a, %d %b %Y %H:%M:%S',
+                        stream=sys.stderr,
+                        filemode="w"
+                        )
+
+    options.error   = logging.critical        # function alias
+    options.warn    = logging.warning
+    options.debug   = logging.debug
+    options.info    = logging.info
+
+    return options
+
+def opt_validate_calmaxtag ( options ):
+    """Validate options from a OptParser object.
+
+    Ret: Validated options object.
+    """
+    # gsize
+    try:
+        options.gsize = efgsize[options.gsize]
+    except:
+        try:
+            options.gsize = float(options.gsize)
+        except:
+            logging.error("Error when interpreting --gsize option: %s" % options.gsize)
+            logging.error("Available shortcuts of effective genome sizes are %s" % ",".join(efgsize.keys()))
+            sys.exit(1)
+
+    # format
+
+    options.gzip_flag = False           # if the input is gzip file
+
+    options.format = options.format.upper()
+    if options.format == "ELAND":
+        options.parser = ELANDResultParser
+    elif options.format == "BED":
+        options.parser = BEDParser
+    elif options.format == "ELANDMULTI":
+        options.parser = ELANDMultiParser
+    elif options.format == "ELANDEXPORT":
+        options.parser = ELANDExportParser
+    elif options.format == "SAM":
+        options.parser = SAMParser
+    elif options.format == "BAM":
+        options.parser = BAMParser
+        options.gzip_flag = True
+    elif options.format == "BOWTIE":
+        options.parser = BowtieParser
+    elif options.format == "AUTO":
+        options.parser = guess_parser
+    else:
+        logging.error("Format \"%s\" cannot be recognized!" % (options.format))
+        sys.exit(1)
+
+    # uppercase the format string 
+    options.format = options.format.upper()
+
+    # logging object
+    logging.basicConfig(level=10,
+                        format='%(levelname)-5s @ %(asctime)s: %(message)s ',
+                        datefmt='%a, %d %b %Y %H:%M:%S',
+                        stream=sys.stderr,
+                        filemode="w"
+                        )
+
+    options.error   = logging.critical        # function alias
+    options.warn    = logging.warning
+    options.debug   = logging.debug
+    options.info    = logging.info
+
+    return options
diff --git a/MACS2/calmaxtag.py b/MACS2/calmaxtag.py
@@ -0,0 +1,101 @@
+# Time-stamp: <2014-06-14 10:59:30 Tao Liu/JohnUrban>
+
+"""Description: Calculate maximum duplicate (redundant) reads allowed depending on sequencing depth and genome size.
+
+Copyright (c) 2011 Tao Liu <[email protected]>
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD License (see the file COPYING included
+with the distribution).
+
+@status: release candidate
+@version: $Id$
+@author:  Yong Zhang, Tao Liu
+@contact: [email protected]
+"""
+
+# ------------------------------------
+# python modules
+# ------------------------------------
+
+import os
+import sys
+import logging
+
+# ------------------------------------
+# own python modules
+# ------------------------------------
+from MACS2.OptValidator import opt_validate_calmaxtag as opt_validate
+from MACS2.cProb import binomial_cdf_inv
+from MACS2.Constants import *
+# ------------------------------------
+# Main function
+# ------------------------------------
+def run( o_options ):
+    """The calculation based on the binomial distribution for how many tags to allow at one site.
+
+    """
+    # Parse options...
+    options = opt_validate( o_options )
+    # end of parsing commandline options
+    info = options.info
+    warn = options.warn
+    debug = options.debug
+    error = options.error
+
+    if options.outputfile != "stdout":
+        outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" )
+    else:
+        outfhd = sys.stdout
+
+    #1 Read tag files
+    if options.ifile:
+	if not options.quiet:
+            info("counting tags from input files...")
+        fwtrack = load_tag_files_options (options)
+        t0 = fwtrack.total
+	if not options.quiet:
+            info(" total tags in alignment file: %d" % (t0))
+            info("tag size = %d" % options.tsize)
+        fwtrack.fw = options.tsize
+    elif options.numTags:
+	t0 = options.numTags
+
+    if not options.quiet:
+    	info("calculate max duplicate tags in single position based on binomal distribution...")
+    max_dup_tags = cal_max_dup_tags(options.gsize,t0)
+
+    if not options.quiet:    
+	info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
+    else:
+	print max_dup_tags
+
+
+def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
+    """Calculate the maximum duplicated tag number based on genome
+    size, total tag number and a p-value based on binomial
+    distribution. Brute force algorithm to calculate reverse CDF no
+    more than MAX_LAMBDA(100000).
+
+    """
+    return binomial_cdf_inv(1-p,tags_number,1.0/genome_size)
+
+def load_tag_files_options ( options ):
+    """From the options, load alignment tags.
+
+    """
+    if not options.quiet: 
+	options.info("read alignment tags...")
+    tp = options.parser(options.ifile)
+
+    if not options.tsize:           # override tsize if user specified --tsize
+        ttsize = tp.tsize()
+        options.tsize = ttsize
+
+    treat = tp.build_fwtrack()
+    treat.sort()
+
+    if not options.quiet:
+	options.info("tag size is determined as %d bps" % options.tsize)
+    return treat
+
diff --git a/bin/macs2 b/bin/macs2
@@ -94,6 +94,10 @@ def main():
         # pileup alignment results with a given extension method
         from MACS2.pileup import run
         run( args )
+    elif subcommand == "calmaxtag":
+        # user only wants to calculate max tags given input paramters
+        from MACS2.calmaxtag import run
+        run( args )
 
 def prepare_argparser ():
     """Prepare optparser object. New options will be added in this
@@ -141,6 +145,9 @@ def prepare_argparser ():
 
     # command for 'refinepeak'
     add_refinepeak_parser( subparsers )
+
+    # command for 'calmaxtag'
+    add_calmaxtag_parser( subparsers )
 
     return argparser
 
@@ -553,7 +560,31 @@ def add_pileup_parser( subparsers ):
                                       help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
     return
 
-
+def add_calmaxtag_parser( subparsers ):
+    argparser_calmaxtag = subparsers.add_parser("calmaxtag",
+						help = "Simply calculate maximum duplicate tags allowed at a given site with given input paramters" )
+    input = argparser_calmaxtag.add_mutually_exclusive_group()
+    input.add_argument( "-i", dest = "ifile", type = str, required = False,
+                                      help = "Sequencing alignment file. Provide reads file or number of reads with -n." )
+    input.add_argument( "-n", dest = "numTags", type = int, required = False,
+                                      help = "Number of reads in input file (this allows macs2 to skip counting). Provide number of reads or reads file -i." )
+    argparser_calmaxtag.add_argument( "-f", "--format", dest = "format", type = str,
+                                      choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE"),
+                                      help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
+                                      default = "AUTO" )
+    argparser_calmaxtag.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs", required=True,
+                                      help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
+    argparser_calmaxtag.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
+                                      help = "Pvalue cutoff for binomial distribution test. DEFAULT:1e-5" )
+    argparser_calmaxtag.add_argument( "-s", "--tsize", dest = "tsize", type = int,
+                                      help = "Tag size. This will overide the auto detected tag size. DEFAULT: Not set" )
+    argparser_calmaxtag.add_argument( "-q", "--quiet", dest = "quiet", action = "store_true", 
+                                      help = "When using -n, this flag will suppress all messages except answer. When using -i, it will suppress most messages except answer. DEFAULT: Not set" )
+    add_outdir_option( argparser_calmaxtag )
+    argparser_calmaxtag.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
+                                      help = "Output BED file name. If not specified, will write to standard output. DEFAULT: stdout",
+                                      default = "stdout" )
+    return
 if __name__ == '__main__':
     try:
         main()