
Commit

Merge pull request #21 from ythuang0522/hotfix/update_meta
Hotfix/update meta
J-I-P authored Nov 27, 2020
2 parents 1264403 + 214207f commit 5ddd65c
Showing 8 changed files with 445 additions and 162 deletions.
Binary file added modules/.DS_Store
5 changes: 5 additions & 0 deletions modules/align2df.py
@@ -115,5 +115,10 @@ def todf(draft, db_np, path, truth_np=None):
df_dict.update({'label': label})
df_path = path + '/{}.feather'.format(record.id)
df = pd.DataFrame(df_dict)

if df.empty:
print('DataFrame is empty!')
return False

df.to_feather(df_path)
return df_path
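
With this change, todf can return False instead of a feather path, so callers have to branch on the result. A minimal sketch of that handling (the calling context and variable names are hypothetical, not part of this commit):

# Hypothetical caller: skip a contig whose pileup yielded no rows.
df_path = todf(draft, db_np, path)
if df_path is False:
    print('No features generated for this contig, skipping.')
else:
    print('Feature table written to', df_path)
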
40 changes: 24 additions & 16 deletions modules/alignment.py
@@ -1,7 +1,10 @@
import os
import sys
import numpy as np
import time
from Bio import SeqIO
from modules.utils.TextColor import TextColor
from modules.utils.FileManager import FileManager

LONG_DELETION_LENGTH = 50

@@ -104,26 +107,31 @@ def pileup(paf, genome_size):
start_pos+=1
return arr, coverage, ins

def align(draft, minimap_args, threads, db, path,reference=None):
def make_output_dir(type, output_dir, contig_id=None):
if type=='contig':
contig_output_dir = output_dir + '/' + contig_id
contig_output_dir = FileManager.handle_output_directory(contig_output_dir)
return contig_output_dir
else:
output_dir_debug = output_dir + '/' + type
output_dir_debug = FileManager.handle_output_directory(output_dir_debug)
return output_dir_debug

def align(draft, minimap_args, threads, db, path, reference=None):

t_start=time.time()

record = SeqIO.read(draft, "fasta")
genome_size = len(record)


if reference:
npz = '{}/truth.npz'.format(path)
paf = '{}/truth.paf'.format(path)
minimap2_cmd= 'minimap2 -cx asm5 --cs=long -t {thread} {draft} {reference} > {paf}'.format(thread=threads, draft=draft, reference=reference,paf=paf)
else:
npz='{}/{}.npz'.format(path, record.id)
paf = '{}/{}.paf'.format(path, record.id)
paf = '{}/contig.paf'.format(path)
minimap2_cmd = 'minimap2 -cx {asm} --cs=long -t {thread} {draft} {db} > {paf}'\
.format(asm=minimap_args, thread=threads, draft=draft, db=db, paf=paf)
.format(asm=minimap_args, thread=threads, draft=draft, db=db, paf=paf)

os.system(minimap2_cmd)
t_end = time.time()
print(t_end-t_start)

return paf


os.system(minimap2_cmd)
if os.stat(paf).st_size == 0: #minimap2 can't align return false
return False
arr, coverage, ins = pileup(paf, genome_size)
np.savez(npz, arr=arr, coverage=coverage, ins=ins)
return npz
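
align now returns False when minimap2 produces an empty PAF and otherwise saves the pileup arrays to an .npz file, while make_output_dir creates a per-contig (or per-stage) output directory. A minimal sketch of how the two might be combined (record and output_dir are assumed to come from the surrounding pipeline and are not shown in this diff):

# Hypothetical usage: set up the contig directory, align, and bail out on an empty PAF.
contig_output_dir = make_output_dir('contig', output_dir, contig_id=record.id)
npz = align(draft, minimap_args, threads, db, contig_output_dir)
if npz is False:
    print('minimap2 could not align anything for this contig, skipping.')
else:
    print('Pileup arrays saved to', npz)
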
88 changes: 69 additions & 19 deletions modules/download.py
@@ -4,6 +4,11 @@
import requests
import multiprocessing
from modules.utils.FileManager import FileManager
import time
import socket


#socket.setdefaulttimeout(30)


def checkInternetRequests(url='http://www.google.com/', timeout=3):
@@ -14,36 +19,63 @@ def checkInternetRequests(url='http://www.google.com/', timeout=3):
print(ex)
return False


def run_process(id, url, path):
path = path + '/'
print("Downloaded " + id)

if 'ftp' in url:
filename = '{}{}.fna.gz'.format(path, id)
filename = '{}{}'.format(path, id)
try:
urllib.request.urlretrieve(url, filename)
pass

except urllib.error.URLError as e:
print(e)
sys.exit(0)
#print(e)
for i in range(1, 5):

if os.path.isfile(filename):
break
else:
print("URL ERROR: time out !!!!!!!!")
print("sleep 10 sec...")
time.sleep(10)
print("Download again............"+str(i))
urllib.request.urlretrieve(url,filename)


except urllib.error.ContentTooShortError as e:
print('Network conditions are not good. Reloading...')
run_process(id, url, path)

else:
try:
r = requests.get(url, allow_redirects=True)
r.raise_for_status()
open('{}{}.fasta'.format(path, id), 'wb').write(r.content)

pass

except requests.exceptions.RequestException as e:
print(e)
sys.exit(0)
open('{}{}.fasta'.format(path, id), 'wb').write(r.content)
print("sleep 10 sec...")
time.sleep(10)
return (id, url)



def parser_url(ncbi_id):
def parser_url(ncbi_id): #GCF_002060415.1_ASM206041v1_genomic.fna.gz
url_list = []
for filename in ncbi_id:
if '_genomic.fna.gz' in filename: #download chromosome
id = filename.split('_genomic.fna.gz')[0]
gcf = id.split('_')[0]
second = id.split('_')[1]
number = second.split('.')[0]
letter = '/'.join(number[i:i+3] for i in range(0, len(number), 3))
id = filename.split('_genomic.fna.gz')[0] #GCF_002060415.1_ASM206041v1
gcf = id.split('_')[0] #GCF
second = id.split('_')[1] #002060415.1
number = second.split('.')[0] #002060415
letter = '/'.join(number[i:i+3] for i in range(0, len(number), 3)) #002/060/415/
url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/{gcf}/{letter}/{id}/{filename}".format(gcf=gcf, letter=letter, id=id, filename=filename)
url_list.append(url)
else: #download plasmid
else: #download plasmid
url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={id}&rettype=fasta'.format(id=filename)
url_list.append(url)
return url_list
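
For reference, the new comments describe how a RefSeq filename is turned into its FTP URL: the accession prefix, the nine-digit number grouped in threes, and the full assembly id are recombined into the path. A standalone illustration that mirrors the chromosome branch above:

# Mirrors the chromosome branch of parser_url for a single filename.
filename = 'GCF_002060415.1_ASM206041v1_genomic.fna.gz'
id = filename.split('_genomic.fna.gz')[0]                            # GCF_002060415.1_ASM206041v1
gcf = id.split('_')[0]                                               # GCF
number = id.split('_')[1].split('.')[0]                              # 002060415
letter = '/'.join(number[i:i+3] for i in range(0, len(number), 3))   # 002/060/415
url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/{gcf}/{letter}/{id}/{filename}".format(
    gcf=gcf, letter=letter, id=id, filename=filename)
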
@@ -67,7 +99,9 @@ def parser_genus_species(genus_species, download_contig_nums=None):
element = line[i].split("\t")

if genus_species in element[7]:

filename = element[19].split('/')[-1]
#print(filename)
ftp = element[19] + "/" + filename + "_genomic.fna.gz"

if len(ncbi_id) < int(download_contig_nums):
Expand All @@ -77,6 +111,7 @@ def parser_genus_species(genus_species, download_contig_nums=None):
else:
break

# add homologous sequences (same genus) quantity up to download_contig_num
if len(ncbi_id) < int(download_contig_nums):
genus_species = genus_species.split(" ")

Expand All @@ -94,21 +129,36 @@ def parser_genus_species(genus_species, download_contig_nums=None):
else:
break

if len(ncbi_id) < 5: # Won't polish if there are fewer than 5 closely-related genomes
sys.stderr.write(TextColor.PURPLE + "Fewer than 5 closely-related genomes, skipping polishing...\n" + TextColor.END)
return

return ncbi_id, url_list



def download(path, ncbi_id, url_list):
db_dir = path + '/homologous_sequences/'
db_dir = FileManager.handle_output_directory(db_dir)
max_pool_size = 3 #API rate limit exceeded, can't go higher
cpus = multiprocessing.cpu_count()
pool = multiprocessing.Pool(cpus if cpus < max_pool_size else max_pool_size)
for id, url in zip(ncbi_id, url_list):
pool.apply_async(run_process, args=(id, url, db_dir))
pool.close()
pool.join()

cnt = 0

results = [(ncbi_id[i], url_list[i]) for i in range(0, len(ncbi_id))]
loss = []
with multiprocessing.Pool(cpus if cpus < max_pool_size else max_pool_size) as pool:
for pair in results:
cnt += 1
pool.apply_async(run_process, args=(pair[0], pair[1], db_dir))
pool.close()
pool.join()

results = list(filter(None, results))


file_path = db_dir + '/*'
db_path = path + '/All_homologous_sequences.fna.gz'
db_path = path + '/All_homologous_sequences.fasta.gz'

os.system('cat {} > {}'.format(file_path, db_path))
print('')
return db_path
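
The download step now retries transient FTP timeouts inside run_process instead of exiting, and concatenates whatever was fetched into one gzipped FASTA. A rough sketch of the call order around it, assuming ncbi_id came from the mash step (the driver code here is hypothetical):

# Hypothetical driver: build URLs, fetch homologous genomes, then align the draft against them.
url_list = parser_url(ncbi_id)
db_path = download(contig_output_dir, ncbi_id, url_list)
npz = align(draft, minimap_args, threads, db_path, contig_output_dir)
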
82 changes: 71 additions & 11 deletions modules/mash.py
@@ -1,22 +1,29 @@
import os
import sys
from modules.utils.TextColor import TextColor

def screen(contig_name, sketch_path, threads, output_dir, mash_threshold, download_contig_nums, contig_id):
def screen(contig_name, sketch_path, threads, output_dir, mash_threshold, contig_id=None, download_contig_nums=None,):
mash_out = '{}/{}.sort.tab'.format(output_dir, contig_id)
mash_cmd = 'mash screen -p {thread} -i {ratio} {db} {draft} > temp.tab'\
.format(thread=threads, ratio=mash_threshold, db=sketch_path, draft=contig_name)
sort_cmd = 'sort -gr temp.tab > temp.sort.tab'
head_cmd = 'head -n {num} temp.sort.tab > {out}'.format(num=download_contig_nums, out=mash_out)
.format(thread=threads, ratio=mash_threshold, db=sketch_path, draft=contig_name)
os.system(mash_cmd)
os.system(sort_cmd)
os.system(head_cmd)

if contig_id=="meta":
sort_cmd = 'sort -gr temp.tab > {out}'.format(out=mash_out)
os.system(sort_cmd)
else:
sort_cmd = 'sort -gr temp.tab > temp.sort.tab'
os.system(sort_cmd)
head_cmd = 'head -n {num} temp.sort.tab > {out}'.format(num=download_contig_nums, out=mash_out)
os.system(head_cmd)
os.remove('temp.sort.tab')

os.remove('temp.tab')
os.remove('temp.sort.tab')

return mash_out
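
With the new signature, screen keeps the whole sorted table in metagenomic mode (contig_id == "meta") and only the top download_contig_nums hits otherwise. A minimal sketch of the two call patterns (the argument values are hypothetical):

# Hypothetical calls: meta mode keeps every hit, single-contig mode keeps the top N.
meta_tab = screen(contig_name, sketch_path, threads, output_dir, mash_threshold, contig_id='meta')
top_tab = screen(contig_name, sketch_path, threads, output_dir, mash_threshold,
                 contig_id=record.id, download_contig_nums=20)
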


def dist(contig_name, sketch_path, threads, output_dir,mash_threshold ,download_contig_nums, contig_id):

def dist(contig_name, sketch_path, threads, output_dir, mash_threshold ,download_contig_nums, contig_id):
ratios=1- float (mash_threshold)
mash_out = '{}/{}.sort.tab'.format(output_dir, contig_id)
mash_cmd = 'mash dist -p {thread} -d {ratio} {db} {draft} > temp.tab'\
@@ -29,16 +36,69 @@ def dist(contig_name, sketch_path, threads, output_dir,mash_threshold ,download_
os.remove('temp.tab')
os.remove('temp.sort.tab')
return mash_out


def get_ncbi_id(mashfile, mash_screen=None):
ncbi_id = []

with open(mashfile,'r') as f:
for line in f:
line = line.split('\t')
if mash_screen:
ncbi_id.append(line[4]) #Use mash screen
else:
ncbi_id.append(line[0]) #Use mash dist
return ncbi_id
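
get_ncbi_id reads column 4 of a mash screen table and column 0 of a mash dist table, so the flag has to match whichever tool produced the file. A minimal sketch (mash_out is assumed to be the path returned by screen or dist above):

# Hypothetical usage: pick the accession column that matches the mash command used.
ncbi_id = get_ncbi_id(mash_out, mash_screen=True)   # table came from mash screen
ncbi_id = get_ncbi_id(mash_out)                     # table came from mash dist
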

def meta_get_ncbi_id(mashfile, download_contig_nums):
download_2dlist = []

with open(mashfile, 'r') as f:
for line in f:
line = line.split('\t')
test = line[5].split(" ")
genus = ""

chromosome_type = False
if "_genomic.fna.gz" in line[4]: # chromosome
chromosome_type = True
if line[5][0] != "[":
if test[2]=='sp.':
continue
genus = test[1] + " " + test[2]
else:
if test[4]=='sp.':
continue
genus = test[3] + " " + test[4]


else: # plasmid
genus = test[0] + " " + test[1]
continue

if genus[-1] == ',':
genus = genus[:-1]

chk = False
for i, x in enumerate(download_2dlist):
if genus in x:
chk = True
if chromosome_type == True and download_2dlist[i][1]<int(download_contig_nums):
download_2dlist[i][1] += 1
download_2dlist[i].append(line[4])
elif chromosome_type == False and download_2dlist[i][2]<int(download_contig_nums):
download_2dlist[i][2] += 1
download_2dlist[i].append(line[4])

if chk == False:
download_2dlist.append([genus, 1, 0])
download_2dlist[-1].append(line[4])


ncbi_id = []
for i, x in enumerate(download_2dlist):
if x[1] >= 5:
print(x[0]+" "+str(x[1]))
for i in range(3, len(x)):
ncbi_id.append(x[i])

return ncbi_id
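
meta_get_ncbi_id accumulates one entry per genus of the form [genus, chromosome_count, plasmid_count, accession, ...] and only returns accessions for genera with at least five chromosome-level hits. A small illustration of that accumulator (the accessions and counts are purely illustrative):

# Hypothetical accumulator state after parsing a mash screen table.
download_2dlist = [
    ['Escherichia coli', 5, 0, 'GCF_000005845.2_ASM584v2_genomic.fna.gz'],   # >= 5 chromosomes: kept (accessions truncated here)
    ['Shigella flexneri', 2, 0, 'GCF_000006925.2_ASM692v2_genomic.fna.gz'],  # < 5 chromosomes: dropped
]
ncbi_id = [acc for entry in download_2dlist if entry[1] >= 5 for acc in entry[3:]]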