pfpebwt

#!/usr/bin/env python3

import sys, time, argparse, subprocess, os.path

Description = """
Tool to compute the eBWT and the GCA of a string collection.
"""

def main():
    parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('input', help='input fasta file name', type=str)
    parser.add_argument('-w', '--wsize', help='sliding window size (def. 10)', default=10, type=int)
    parser.add_argument('-p', '--mod', help='hash modulus (def. 100)', default=100, type=int)
    parser.add_argument('-t', help='number of helper threads (def. None)', default=0, type=int)
    parser.add_argument('-n', help='number of different primes when using --reads (def. 1)', default=1, type=int)
    parser.add_argument('--rle', help='store eBWT in RLE format (def. False)', action='store_true')
    parser.add_argument('--samples', help='compute the GCA samples (def. False)', action='store_true')
    parser.add_argument('--GCA', help='compute the complete GCA (def. False)', action='store_true')
    parser.add_argument('--reads', help='process input ad a reads multiset (def. False)', action='store_true')
    parser.add_argument('--remainders',  help='use multiple remainders instead of multiple primes (def. False)',action='store_true')
    parser.add_argument('--period', help='remove periodic sequences (def. False)', action='store_true')
    parser.add_argument('--invert', help='invert the eBWT (def. False)', action='store_true')
    parser.add_argument('--keep',  help='keep auxiliary files (debug only)',action='store_false')
    parser.add_argument('--parsing',  help='stop after the parsing phase (debug only)',action='store_true')
    parser.add_argument('--verbose',  help='verbose (def. False)',action='store_true')
    args = parser.parse_args()

    dirname = os.path.dirname(os.path.abspath(__file__))
    parse_exe         =  os.path.join(dirname, "include/PFP-algos/circpfp.x")
    parseNT_exe       =  os.path.join(dirname, "include/PFP-algos/circpfpNT.x")
    parseReads_exe    =  os.path.join(dirname, "include/PFP-algos/circpfpr.x")
    parseReadsNT_exe  =  os.path.join(dirname, "include/PFP-algos/circpfprNT.x")
    parseReadsD_exe   =  os.path.join(dirname, "include/PFP-algos/circpfpd.x")
    parseReadsDNT_exe =  os.path.join(dirname, "include/PFP-algos/circpfpdNT.x")
    parsebwtNT_exe    =  os.path.join(dirname, "include/eBWT-algos/parsebwtNT.x")
    parsebwtNT64_exe  =  os.path.join(dirname, "include/eBWT-algos/parsebwtNT64.x")
    bebwtNT_exe       =  os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca.x")
    bebwtNTp64_exe    =  os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca_p64.x")
    bebwtNTd64_exe    =  os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca_d64.x")
    bebwtNT64_exe     =  os.path.join(dirname, "include/eBWT-algos/build-ebwt-gca_64.x")
    invertNT_exe      =  os.path.join(dirname, "include/eBWT-algos/invertNT.x")

    if args.samples or args.GCA:
        parseNT_exe       = os.path.join(dirname, "include/PFP-algos/circpfp-gca.x")
        parsebwtNT_exe    =  os.path.join(dirname, "include/eBWT-algos/parsebwtgcaNT.x")
        parsebwtNT64_exe  =  os.path.join(dirname, "include/eBWT-algos/parsebwtgcaNT64.x")
        if args.reads or args.t > 0:
            print("GCA computation not yet implemented for multithreading and short reads mode")
            exit(1)

    if args.samples and args.GCA:
        args.samples = False
        print("Warning --GCA flag overrides --samples flag.")

    logfile_name = args.input + ".log"
    # get main bigbwt directory
    args.bigbwt_dir = os.path.split(sys.argv[0])[0]
    print("Sending logging messages to file:", logfile_name)
    with open(logfile_name,"a") as logfile:

        # ---------- Parsing of the input file
        start0 = start = time.time()

        if args.reads:
            # Input is a short sequences multiset
            if args.remainders:
                # Use different remainders
                if args.t>0:
                    command = "{exe} {file} -w {wsize} -p {modulus} -t {th} -n {wnumb}".format(
                            exe = os.path.join(args.bigbwt_dir,parseReadsD_exe),
                            wsize=args.wsize, modulus = args.mod, th=args.t, wnumb=args.n, file=args.input)
                else:
                    command = "{exe} {file} -w {wsize} -p {modulus} -n {wnumb}".format(
                            exe = os.path.join(args.bigbwt_dir,parseReadsDNT_exe),
                            wsize=args.wsize, modulus = args.mod, wnumb = args.n, file=args.input)
            else:
                # Use different primes
                if args.t>0:
                    command = "{exe} {file} -w {wsize} -p {modulus} -t {th} -n {wnumb}".format(
                            exe = os.path.join(args.bigbwt_dir,parseReads_exe),
                            wsize=args.wsize, modulus = args.mod, th=args.t, wnumb=args.n, file=args.input)
                else:
                    command = "{exe} {file} -w {wsize} -p {modulus} -n {wnumb}".format(
                            exe = os.path.join(args.bigbwt_dir,parseReadsNT_exe),
                            wsize=args.wsize, modulus = args.mod, wnumb = args.n, file=args.input)
        else:
            # Input is a long sequences multiset
            if args.t>0:
                command = "{exe} {file} -w {wsize} -p {modulus} -t {th}".format(
                        exe = os.path.join(args.bigbwt_dir,parse_exe),
                        wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input)
            else:
                command = "{exe} {file} -w {wsize} -p {modulus}".format(
                        exe = os.path.join(args.bigbwt_dir,parseNT_exe),
                        wsize=args.wsize, modulus = args.mod, file=args.input)

        # Verbose
        if args.verbose: command += " -v"
        # Remove periodic sequences
        if args.period: command += " -c"

        print("==== Parsing. Command:", command)
        if(execute_command(command,logfile,logfile_name)!=True):
            return
        print("Elapsed time: {0:.4f}".format(time.time()-start))
        if args.parsing:
            # delete temporary parsing files 
            command = "rm -f {file}.eparse_old {file}.start {file}.offset {file}.fchar {file}.eocc".format(file=args.input)
            if(execute_command(command,logfile,logfile_name)!=True):
                return
            for i in range(args.t):
                command = "rm -f {file}.{i}.eparse_old {file}.{i}.offset_old ".format(file=args.input, i=i)
                if(execute_command(command,logfile,logfile_name)!=True):
                    return
            print("==== Stopping after the parsing phase as requested")      
            return

        # ----------- Computing the eBWT and Inverted List of the parse
        start = time.time()
        print("==== Computing Inverted List of parse's eBWT.")
        parse_size = os.path.getsize(args.input+".eparse")/4
        print("Parse contains " + str(parse_size) + " words.")
        if(parse_size >= (2**32-1)):
            print("IL creation running in 64 bit mode")
            command = "{exe} {file} -w {wsize}".format(
                    exe = os.path.join(args.bigbwt_dir,parsebwtNT64_exe), wsize=args.wsize, file=args.input)
        else:
            print("IL creation running in 32 bit mode")
            command = "{exe} {file} -w {wsize}".format(
                     exe = os.path.join(args.bigbwt_dir,parsebwtNT_exe), wsize=args.wsize, file=args.input)

        print("Command:", command)
        if(execute_command(command,logfile,logfile_name)!=True):
            return
        print("Elapsed time: {0:.4f}".format(time.time()-start))

        # ----------- Computing the eBWT of the text
        start = time.time()
        print("==== Computing the eBWT of the text.")
        dict_size = os.path.getsize(args.input+".edict")
        print("Dictionary contains " + str(dict_size) + " characters.")
        if(dict_size >=  (2**31-1)):
            print("Dict SA running in 64 bit mode")
            if(parse_size >= (2**32-1)):
                command = "{exe} {file} -w {wsize}".format(
                        exe = os.path.join(args.bigbwt_dir,bebwtNT64_exe), wsize=args.wsize, file=args.input)
            else:
                command = "{exe} {file} -w {wsize}".format(
                        exe = os.path.join(args.bigbwt_dir,bebwtNTd64_exe), wsize=args.wsize, file=args.input)
        else:
            print("Dict SA running in 32 bit mode")
            if(parse_size >= (2**32-1)):
                command = "{exe} {file} -w {wsize}".format(
                        exe = os.path.join(args.bigbwt_dir,bebwtNTp64_exe), wsize=args.wsize, file=args.input)
            else:
                command = "{exe} {file} -w {wsize}".format(
                        exe = os.path.join(args.bigbwt_dir,bebwtNT_exe), wsize=args.wsize, file=args.input)

        if(args.rle): command += " -r" # Activate rle 
        if(args.samples): command += " -s" # Activate samples computation
        if(args.GCA): command += " -c" # Activate full GCA computation
        print("==== Computing eBWT and GCA of the text. Command:", command)
        if(execute_command(command,logfile,logfile_name)!=True):
            return
        print("Elapsed time: {0:.4f}".format(time.time()-start));
        print("Total construction time: {0:.4f}".format(time.time()-start0))
        if args.keep:
            print("Deleting auxiliary files")
            command = "rm -f {file}.eparse_old {file}.offset_old {file}.eparse {file}.edict {file}.offset {file}.eocc {file}.fchar {file}.start {file}.sdsl {file}.last {file}.spos {file}.slast".format(file=args.input)
            if(execute_command(command,logfile,logfile_name)!=True):
                return
            for i in range(args.t):
                command = "rm -f {file}.{i}.eparse_old {file}.{i}.offset_old ".format(file=args.input, i=i)
                if(execute_command(command,logfile,logfile_name)!=True):
                    return
        if args.invert:
            print("Inverting the eBWT")
            command = "{exe} {file}".format(exe = os.path.join(args.bigbwt_dir,invertNT_exe), file=args.input)
            if(execute_command(command,logfile,logfile_name)!=True):
                return

# execute command: return True is everything OK, False otherwise
def execute_command(command,logfile,logfile_name,env=None):
  try:
    #subprocess.run(command.split(),stdout=logfile,stderr=logfile,check=True,env=env)
    subprocess.check_call(command.split(),stdout=logfile,stderr=logfile,env=env)
  except subprocess.CalledProcessError:
    print("Error executing command line:")
    print("\t"+ command)
    print("Check log file: " + logfile_name)
    return False
  return True
  

if __name__ == '__main__':
    main()