-
Notifications
You must be signed in to change notification settings - Fork 111
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
running version calls compleasm and generates hints file for Augustus
- Loading branch information
1 parent
cc41cd0
commit d0b1b6c
Showing
1 changed file
with
162 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
Description: This script runs compleasm on a genome with a given BUSCO partition | ||
and parses the output table for complete BUSCOs to generate a hints | ||
file for AUGUSTUS/BRAKER/GALBA. | ||
Author: Katharina J. Hoff | ||
Email: [email protected] | ||
Date: November 27th, 2023 | ||
Copyright (C) 2023, Katharina J. Hoff, University of Greifswald | ||
This program is free software; you can redistribute it and/or modify | ||
it under the terms of the Artistic License. | ||
""" | ||
|
||
# install instructions for sepp that will be needed for containers: | ||
# git clone https://github.com/smirarab/sepp.git | ||
# sudo apt-get install default-jre | ||
# python3 setup.py config -c | ||
# sudo python3 setup.py install | ||
# other than that, the script depends on compleasm.py and its dependencies | ||
|
||
|
||
import argparse | ||
import re | ||
import shutil | ||
import os | ||
import csv | ||
import subprocess | ||
from inspect import currentframe, getframeinfo | ||
|
||
''' Function that runs a subprocess with arguments ''' | ||
|
||
|
||
def run_simple_process(args_lst): | ||
try: | ||
# bed files need sorting with LC_COLLATE=C | ||
myenv = os.environ.copy() | ||
myenv['LC_COLLATE'] = 'C' | ||
print("Trying to execute the following command:") | ||
print(" ".join(args_lst)) | ||
result = subprocess.run( | ||
args_lst, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=myenv) | ||
print("Suceeded in executing command.") | ||
if(result.returncode == 0): | ||
return(result) | ||
else: | ||
frameinfo = getframeinfo(currentframe()) | ||
print('Error in file ' + frameinfo.filename + ' at line ' + | ||
str(frameinfo.lineno) + ': ' + "Return code of subprocess was " + | ||
str(result.returncode) + str(result.args)) | ||
quit(1) | ||
except subprocess.CalledProcessError as grepexc: | ||
frameinfo = getframeinfo(currentframe()) | ||
print('Error in file ' + frameinfo.filename + ' at line ' + | ||
str(frameinfo.lineno) + ': ' + "Failed executing: ", | ||
" ".join(grepexec.args)) | ||
print("Error code: ", grepexc.returncode, grepexc.output) | ||
quit(1) | ||
|
||
|
||
def extract_tx_ids_from_tsv(tsv_file): | ||
busco_ids = {} | ||
with open(tsv_file, newline='') as csvfile: | ||
reader = csv.DictReader(csvfile, delimiter='\t') | ||
for row in reader: | ||
if (row['Status'] == 'Single' or row['Status'] == 'Duplicated') and (row['Frameshift events'] == '0'): | ||
busco_ids[row['Best gene']] = True | ||
return busco_ids | ||
|
||
def read_and_filter_gff(gff_file, id_dict): | ||
gff_list = [] | ||
try: | ||
with open(gff_file, 'r') as gff_handle: | ||
for line in gff_handle: | ||
if not line.startswith("#"): | ||
if re.search(r'Target=', line): | ||
bid = re.search(r'Target=(\S+)', line).group(1) | ||
if bid in id_dict.keys(): | ||
gff_list.append(line) | ||
except IOError: | ||
print("Could not open file ", gff_file) | ||
return gff_list | ||
|
||
def miniprot_to_hints(gff_lines): | ||
hints_lines = [] | ||
for line in gff_lines: | ||
line = line.rstrip() | ||
line_fields = line.split("\t") | ||
if re.search(r'\tCDS\t.*Parent=([^;]+)', line): | ||
if (int(line_fields[4])-3) - (int(line_fields[3])+3) + 1 > 0: | ||
regex_result = re.search(r'\tCDS\t.*Parent=([^;]+)', line) | ||
hint = line_fields[0] + "\t" + "c2h" + "\tCDSpart\t" + str(int(line_fields[3])+3) + "\t" + str(int(line_fields[4])-3) + "\t" + "." + "\t" + line_fields[6] + "\t" + line_fields[7] + "\t" + "src=M;grp=" + regex_result.group(1) + ";pri=4\n" | ||
hints_lines.append(hint) | ||
return hints_lines | ||
|
||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser(description="Run compleasm and generate a gtf file with complete BUSCO genes.") | ||
|
||
# Mandatory input arguments | ||
parser.add_argument("-g", "--genome", required=True, help="Genome file in fasta format") | ||
parser.add_argument("-d", "--database", required=True, help="BUSCO database to use") | ||
parser.add_argument("-p", "--compleasm", required=False, help="Location of compleasm binary incl. binary name") | ||
parser.add_argument("-t", "--threads", required=False, help="Number of threads to use, default is 1") | ||
parser.add_argument("-o", "--output", required=True, help="Output file name, file is in GTF format.") | ||
parser.add_argument("-s", "--scratch_dir", required=False, help="Temporary directory for compleasm output, default is current directory, must be writable.") | ||
args = parser.parse_args() | ||
|
||
if args.compleasm is None and shutil.which('compleasm.py'): | ||
# Try to find 'compleasm' in the system PATH | ||
args.compleasm = shutil.which('compleasm') | ||
elif args.compleasm is None and os.environ.get('COMPLEASM_PATH'): | ||
# If 'compleasm' is not in PATH, check for an environment variable | ||
args.compleasm = str(os.environ.get('COMPLEASM_PATH')) + '/compleasm.py' | ||
elif args.compleasm is None: | ||
# If still not found, raise an error | ||
raise FileNotFoundError("compleasm is not in PATH and COMPLEASM_PATH is not set") | ||
if args.compleasm is not None: | ||
# check whether provided compleasm is executable | ||
if not os.access(args.compleasm, os.X_OK): | ||
raise FileNotFoundError("compleasm is not executable") | ||
|
||
# apply compleasm to genome file with run_subprocess and the database | ||
if args.scratch_dir is None: | ||
args.scratch_dir = "compleasm_genome_out" | ||
run_simple_process([args.compleasm, 'run', '-l', args.database, '-a', args.genome, '-t', args.threads, '-o', args.scratch_dir]) | ||
|
||
# parse compleasm output table for complete BUSCOs without frameshifts | ||
busco_ids = extract_tx_ids_from_tsv(args.scratch_dir + '/' + args.database + '/full_table.tsv') | ||
|
||
# filter the miniprot alignments for those that have no frame shifts | ||
gff_lines = read_and_filter_gff(args.scratch_dir + '/' + args.database + '/miniprot_output.gff', busco_ids) | ||
|
||
# convert the miniprot lines to CDSpart hints | ||
hints_lines = miniprot_to_hints(gff_lines) | ||
|
||
# gff_lines is not compatible with getAnnoFastaFromJoingenes, we need to fix this! | ||
try: | ||
with open(args.output, "w") as out_handle: | ||
for line in hints_lines: | ||
out_handle.write(line) | ||
except IOError: | ||
print("Failed to open file", args.out) | ||
|
||
# print the BUSCO scores for genome level statistics to STDOUT | ||
print("The following BUSCOs were found in the genome:") | ||
try: | ||
with open(args.scratch_dir + '/' + 'summary.txt', 'r') as busco_summary: | ||
for line in busco_summary: | ||
line = line.strip() | ||
print(line) | ||
except IOError: | ||
print("Failed to open file", args.scratch_dir + '/' + 'summary.txt') | ||
|
||
# delete the temporary directory | ||
shutil.rmtree(args.scratch_dir) | ||
|
||
if __name__ == "__main__": | ||
main() |