-
Notifications
You must be signed in to change notification settings - Fork 0
/
final.py
209 lines (193 loc) · 7.48 KB
/
final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import multiprocessing
from Comb_fastq import combine
from utils import *
import sys
import gzip
import os
def cut(step, length_bin, link, i):
    """Return the i-th sliding-window fragment of *link*.

    Windows start every *step* characters and are *length_bin* characters
    long; the last window may be clipped by the end of the string.  A
    clipped fragment is only kept while it is still longer than
    ``length_bin - step`` (i.e. it covers ground the previous window did
    not already cover).

    Returns (True, fragment) when a usable fragment exists at offset
    ``i * step``, otherwise (False, '').
    """
    lo = i * step
    hi = min(lo + length_bin, len(link))
    # Reject empty windows and clipped windows that add no new coverage.
    usable = lo < hi and hi - lo > length_bin - step
    if not usable:
        return False, ''
    return True, link[lo:hi]
def writefiles(UnmappedReads, step, length_bin, max_length, outputname):
    """Append sliding-window fragments of every buffered read to
    per-offset fastq part files.

    For each window index (up to *max_length*), one fastq record is
    written per read that still yields a usable fragment at that offset
    (see cut()).  Part files are opened in append mode because the
    caller flushes its read buffer in batches.  Iteration stops at the
    first window index that produces no records, since later windows
    cannot produce any either.

    Returns the list of part-file names touched by this call.
    """
    part_names = []
    for idx in range(max_length):
        records = []
        for name, pair in UnmappedReads.items():
            ok, frag = cut(step, length_bin, pair[0], idx)
            if not ok:
                continue
            # Quality string is cut identically, so it always matches.
            _, qual = cut(step, length_bin, pair[1], idx)
            records.append(f"{name}\n{frag}\n+\n{qual}\n")
        if not records:
            break
        fname = f"{outputname}.part{idx + 1}.fastq"
        part_names.append(fname)
        with open(fname, 'a') as out:
            out.writelines(records)
    print(part_names)
    return part_names
def do_process(l):
    """Process one config line: align the fastq file(s) with bsmap, then
    rescue unmapped mates by re-aligning sliding-window fragments of their
    reads and merging everything into <outputname>_combine.bam.

    l: one whitespace-separated line from config.txt -- either
       "reads.fastq" (single-end) or "reads_1.fastq reads_2.fastq"
       (paired-end).  Inputs may be gzipped.

    Side effects: runs bsmap/samtools through Pshell (from utils) and
    writes several intermediate files next to the inputs.  Exits the
    process on a malformed config line.
    """
    #print(l+'in')
    temp = l.strip().split()
    length = len(temp)
    # Accept exactly one (single-end) or two (paired-end) fastq paths.
    if length<=0 or length>2:
        print("Parameter error in "+l)
        sys.exit()
    # NOTE(review): reference genome path is hard-coded; swap with the
    # commented line below for human data.
    refpath='/data/dsun/ref/mouseigenome/mm9.fa'
    #refpath = '/data/dsun/ref/humanigenome/hg19.fa'
    # Derive the output prefix by stripping .gz / .fq / .fastq extensions
    # (lower-cased copy used only to measure how much to strip).
    tempname=temp[0].lower()
    if tempname.endswith(".gz"):
        tempname = tempname[:-3]
    if tempname.endswith(".fq"):
        tempname = tempname[:-3]
    if tempname.endswith(".fastq"):
        tempname = tempname[:-6]
    # Keep the original casing, truncated to the stripped length.
    outputname = temp[0][:len(tempname)]
    #print(outputname)
    phred=33  # quality-score offset passed to bsmap -z
    # First-pass alignment (paired-end when two files were given).
    if length==2 :
        commend='bsmap -a '+temp[0]+' -b '+temp[1]+' -z '+str(phred)+' -d '+refpath+' -o '+outputname+'.bam -n 1 -r 0'
    else:
        commend='bsmap -a '+temp[0]+' -z '+str(phred)+' -d '+refpath+' -o '+outputname+'.bam -n 1 -r 0'
    # Pshell presumably wraps a shell invocation -- defined in utils.
    First_try = Pshell(commend)
    First_try.process()
    # Coordinate-sort the first-pass bam, then rename it back in place.
    command='samtools sort -@ 4 '+outputname+'.bam'+' -o '+outputname+'.sorted.bam'
    First_try.change(command)
    First_try.process()
    command='mv '+outputname+'.sorted.bam '+outputname+'.bam'
    First_try.change(command)
    First_try.process()
    # NOTE(review): the sorted.bam was just renamed away, so this rm targets
    # a file that no longer exists (dead command; harmless if rm -f-like).
    command='rm '+outputname+'.sorted.bam'
    First_try.change(command)
    First_try.process()
    #Test1 done
    inputfileinfo=l.strip().split()
    # Dump the alignments to SAM so mapped read names can be collected.
    commend = 'samtools view '+outputname+'.bam > '+outputname+'.sam'
    BamFileReader = Pshell(commend)
    BamFileReader.process()
    with open(outputname+".sam") as sam:
        #second column in sam file: 64, mate 1; 128, mate 2;
        samlines = sam.readlines()
    # set_sam maps read name -> which mate(s) appeared in the alignment:
    # 0 = unpaired, 1 = mate 1, 2 = mate 2, 3 = name seen more than once.
    set_sam = {}
    for line in samlines:
        temp = line.strip().split()
        m1 = (int(temp[1]) & 64)    # SAM FLAG bit 64: first in pair
        m2 = (int(temp[1]) & 128)   # SAM FLAG bit 128: second in pair
        # print(temp[1],m1,m2)
        if m1>0: mate = 1
        elif m2>0: mate = 2
        else: mate = 0
        if temp[0] in set_sam: set_sam[temp[0]]=3
        else: set_sam[temp[0]]=mate
        # print(mate)
    # for k in set_sam:
    #     print(k)
    #     break
    del samlines
    # Scan the original fastq file(s) for reads whose mate mapped but
    # which did not map themselves; buffer them for fragmentation.
    UnmappedReads = {}
    o=0   # 1-based index of the fastq file currently being read
    step = 5          # window start spacing used by cut()/writefiles()
    length_bin = 30#30    # fragment length
    max_length = 24#50    # maximum number of windows per read
    Part_Fastq_Filename=[]
    for filename in inputfileinfo:
        o+=1
        gzmark=False
        if filename.endswith('.gz'):
            f = gzip.open(filename)
            gzmark=True
        else:
            f = open(filename)
        if f:
            while 1:
                # Read one 4-line fastq record (decoding when gzipped).
                if gzmark:
                    line1 = f.readline().decode()
                else:
                    line1 = f.readline()
                if not line1:
                    break
                if gzmark:
                    line2 = f.readline().decode().strip()
                    line3 = f.readline().decode()
                    line4 = f.readline().decode().strip()
                else:
                    line2 = f.readline().strip()
                    line3 = f.readline()
                    line4 = f.readline().strip()
                line1 = line1.strip().split()
                line1[0] = line1[0]
                # print(line1[0][1:])
                # line1[0][1:] strips the leading '@' of the fastq header.
                if (line1[0][1:] in set_sam):
                    # Which mate this record is: default to file order,
                    # override with the '1'/'2' tag in the second header
                    # field when present.
                    string_mark = o
                    # NOTE(review): assumes the header has a second field
                    # whose first char is the mate number -- raises
                    # IndexError on single-field headers; verify against
                    # the sequencer's read-name format.
                    if line1[1][0]>='1' and line1[1][0]<='2':
                        string_mark = int(line1[1][0])
                    # Skip names that mapped unpaired or where both mates
                    # were seen in the alignment ...
                    if set_sam[line1[0][1:]]==0 or set_sam[line1[0][1:]]==3 : continue
                    # ... and skip the mate that itself mapped; keep only
                    # the unmapped partner of a mapped mate.
                    if set_sam[line1[0][1:]]==string_mark : continue
                    temp = line1[0]
                    if length>1: temp+='_'+line1[1][0]
                    #Maybe the mate search method is buggy. Cuz there are different structures of reads name generated by different sequencing machine.
                    #fastqlines[i] = line1[0]+'_'+line1[1][0]+' '+line1[1]
                    UnmappedReads[temp]=[line2,line4]
                    # Flush in batches to bound memory; writefiles opens
                    # the part files in append mode.
                    if len(UnmappedReads)>10000000:
                        pfn = writefiles(UnmappedReads,step,length_bin,max_length,outputname)
                        UnmappedReads={}
                        if len(pfn)>len(Part_Fastq_Filename):
                            Part_Fastq_Filename=pfn
            #We've got a dictionary named UnmappedReads = {readsname:[line1,line2,line3,line4]}
            #Change cut funtion into cut(setp,length_bin,string,fileorder), return Available(T/F), reads_fraction
            # NOTE(review): UnmappedReads is not cleared after this
            # per-file flush, so with two input files the first file's
            # leftovers may be appended again after the second file --
            # confirm against the intended output.
            if len(UnmappedReads)>0:
                pfn=writefiles(UnmappedReads,step,length_bin,max_length,outputname)
                if len(pfn)>len(Part_Fastq_Filename):
                    Part_Fastq_Filename=pfn
            print('finish')
            f.close()
    del UnmappedReads
    #We've got the splited fastq file, filename is stored in Part_Fastq_Filename
    # p = multiprocessing.Pool(processes=7)
    # Re-align every fragment part file.
    for i in range(len(Part_Fastq_Filename)):
        commend = 'bsmap -a '+Part_Fastq_Filename[i]+' -z '+str(phred)+' -d '+refpath+' -o '+Part_Fastq_Filename[i]+'.bam -n 1 -r 0 -R'
        Batch_try = Pshell(commend)
        Batch_try.process()
    #run bsmap and get bam files named as Part_Fastq_Filename[i].bam
    #import combine to generate the finalfastq
    # Stitch the mapped fragments back into whole reads (Comb_fastq.combine).
    combine(outputname,Part_Fastq_Filename,step,length_bin)
    # Align the reconstructed reads.
    commend = 'bsmap -a '+outputname+'_finalfastq.fastq -d '+refpath+' -z '+str(phred)+' -o '+outputname+'_split.bam -n 1 -r 0'
    Bam = Pshell(commend)
    Bam.process()
    splitfilename = outputname+'_split.bam'
    header = outputname+'.header'
    # Keep only rescued alignments whose sequence (SAM field $10) is longer
    # than split_length: save the header, filter the body with awk, rebuild.
    command='samtools view -H '+splitfilename+' > '+header
    # NOTE(review): 'filter' shadows the builtin filter() for the rest of
    # this function.
    filter = Pshell(command)
    filter.process()
    split_length=40
    command='samtools view '+splitfilename+"| awk '{if (length($10)>"+str(split_length)+") print}' > "+splitfilename+'.sam'
    filter.change(command)
    filter.process()
    command='cat '+header+' '+splitfilename+'.sam | samtools view -bS - > '+splitfilename+'.bam'
    filter.change(command)
    filter.process()
    command='samtools sort -@ 4 '+splitfilename+'.bam'+' -o '+splitfilename+'.sorted.bam'
    filter.change(command)
    filter.process()
    command='mv '+splitfilename+'.sorted.bam '+splitfilename
    filter.change(command)
    filter.process()
    command='rm '+splitfilename+'.bam '+splitfilename+'.sam'
    filter.change(command)
    filter.process()
    # Merge first-pass and rescued alignments into the final bam.
    m=Pshell('samtools merge '+outputname+'_combine.bam '+outputname+'.bam '+outputname+'_split.bam')
    m.process()
    print("Merge done!\nCreated final bam file called "+outputname+'_combine.bam')
if __name__=="__main__":
    # One config line per sample: "reads1.fastq [reads2.fastq]".
    with open("config.txt") as f:
        lines = f.readlines()
    # multiprocessing is already imported at the top of the file; the
    # previous duplicate import here was redundant.
    pool = multiprocessing.Pool(processes=2)
    # Keep the AsyncResult handles: apply_async swallows worker
    # exceptions unless .get() is called on each result.
    results = [pool.apply_async(do_process, (l,)) for l in lines]
    #do_process(l) #pass file name to do_process
    pool.close()
    pool.join()
    for r in results:
        r.get()  # re-raise any exception that occurred in a worker