From d0b1b6cedd6a360a915f2f514caf1cb8987389e1 Mon Sep 17 00:00:00 2001
From: KatharinaHoff <katharina.hoff@gmail.com>
Date: Mon, 27 Nov 2023 15:35:59 +0100
Subject: [PATCH] running version calls compleasm and generates hints file for
 Augustus

---
 scripts/compleasm_to_hints.py | 162 ++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100755 scripts/compleasm_to_hints.py

diff --git a/scripts/compleasm_to_hints.py b/scripts/compleasm_to_hints.py
new file mode 100755
index 00000000..d6faeff8
--- /dev/null
+++ b/scripts/compleasm_to_hints.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+"""
+Description: This script runs compleasm on a genome with a given BUSCO partition
+             and parses the output table for complete BUSCOs to generate a hints
+             file for AUGUSTUS/BRAKER/GALBA.
+Author: Katharina J. Hoff
+Email: katharina.hoff@uni-greifswald.de
+Date: November 27th, 2023
+
+Copyright (C) 2023, Katharina J. Hoff, University of Greifswald
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the Artistic License.
+"""
+
+# install instructions for sepp that will be needed for containers:
+# git clone https://github.com/smirarab/sepp.git
+# sudo apt-get install default-jre
+# python3 setup.py config -c
+# sudo python3 setup.py install
+# other than that, the script depends on compleasm.py and its dependencies
+
+
+import argparse
+import re
+import shutil
+import os
+import csv
+import subprocess
+from inspect import currentframe, getframeinfo
+
+''' Function that runs a subprocess with arguments '''
+
+
+def run_simple_process(args_lst):
+    try:
+        # bed files need sorting with LC_COLLATE=C
+        myenv = os.environ.copy()
+        myenv['LC_COLLATE'] = 'C'
+        print("Trying to execute the following command:")
+        print(" ".join(args_lst))
+        result = subprocess.run(
+            args_lst, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=myenv)
+        print("Suceeded in executing command.")
+        if(result.returncode == 0):
+            return(result)
+        else:
+            frameinfo = getframeinfo(currentframe())
+            print('Error in file ' + frameinfo.filename + ' at line ' +
+                  str(frameinfo.lineno) + ': ' + "Return code of subprocess was " +
+                  str(result.returncode) + str(result.args))
+            quit(1)
+    except subprocess.CalledProcessError as grepexc:
+        frameinfo = getframeinfo(currentframe())
+        print('Error in file ' + frameinfo.filename + ' at line ' +
+              str(frameinfo.lineno) + ': ' + "Failed executing: ",
+              " ".join(grepexec.args))
+        print("Error code: ", grepexc.returncode, grepexc.output)
+        quit(1)
+
+
+def extract_tx_ids_from_tsv(tsv_file):
+    busco_ids = {}
+    with open(tsv_file, newline='') as csvfile:
+        reader = csv.DictReader(csvfile, delimiter='\t')
+        for row in reader:
+            if (row['Status'] == 'Single' or row['Status'] == 'Duplicated') and (row['Frameshift events'] == '0'):
+                busco_ids[row['Best gene']] = True
+    return busco_ids
+
+def read_and_filter_gff(gff_file, id_dict):
+    gff_list = []
+    try:
+        with open(gff_file, 'r') as gff_handle:
+            for line in gff_handle:
+                if not line.startswith("#"):
+                    if re.search(r'Target=', line):
+                        bid = re.search(r'Target=(\S+)', line).group(1)
+                        if bid in id_dict.keys():
+                            gff_list.append(line)
+    except IOError:
+        print("Could not open file ", gff_file)
+    return gff_list
+
+def miniprot_to_hints(gff_lines):
+    hints_lines = []
+    for line in gff_lines:
+        line = line.rstrip()
+        line_fields = line.split("\t")
+        if re.search(r'\tCDS\t.*Parent=([^;]+)', line):
+            if (int(line_fields[4])-3) - (int(line_fields[3])+3) + 1 > 0:
+                regex_result = re.search(r'\tCDS\t.*Parent=([^;]+)', line)
+                hint = line_fields[0] + "\t" + "c2h" + "\tCDSpart\t" + str(int(line_fields[3])+3) + "\t" + str(int(line_fields[4])-3) + "\t" + "." + "\t" + line_fields[6] + "\t" + line_fields[7] + "\t" + "src=M;grp=" + regex_result.group(1) + ";pri=4\n"
+                hints_lines.append(hint)
+    return hints_lines
+
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run compleasm and generate a gtf file with complete BUSCO genes.")
+    
+    # Mandatory input arguments
+    parser.add_argument("-g", "--genome", required=True, help="Genome file in fasta format")
+    parser.add_argument("-d", "--database", required=True, help="BUSCO database to use")
+    parser.add_argument("-p", "--compleasm", required=False, help="Location of compleasm binary incl. binary name")
+    parser.add_argument("-t", "--threads", required=False, help="Number of threads to use, default is 1")
+    parser.add_argument("-o", "--output", required=True, help="Output file name, file is in GTF format.")
+    parser.add_argument("-s", "--scratch_dir", required=False, help="Temporary directory for compleasm output, default is current directory, must be writable.")
+    args = parser.parse_args()
+
+    if args.compleasm is None and shutil.which('compleasm.py'):
+        # Try to find 'compleasm' in the system PATH
+        args.compleasm = shutil.which('compleasm')
+    elif args.compleasm is None and os.environ.get('COMPLEASM_PATH'):
+        # If 'compleasm' is not in PATH, check for an environment variable
+        args.compleasm = str(os.environ.get('COMPLEASM_PATH')) + '/compleasm.py'
+    elif args.compleasm is None:
+        # If still not found, raise an error
+        raise FileNotFoundError("compleasm is not in PATH and COMPLEASM_PATH is not set")
+    if args.compleasm is not None:
+        # check whether provided compleasm is executable
+        if not os.access(args.compleasm, os.X_OK):
+            raise FileNotFoundError("compleasm is not executable")
+
+    # apply compleasm to genome file with run_subprocess and the database
+    if args.scratch_dir is None:
+        args.scratch_dir = "compleasm_genome_out"
+    run_simple_process([args.compleasm, 'run', '-l', args.database, '-a', args.genome, '-t', args.threads, '-o', args.scratch_dir])
+
+    # parse compleasm output table for complete BUSCOs without frameshifts
+    busco_ids = extract_tx_ids_from_tsv(args.scratch_dir + '/' + args.database + '/full_table.tsv')
+    
+    # filter the miniprot alignments for those that have no frame shifts
+    gff_lines = read_and_filter_gff(args.scratch_dir + '/' + args.database + '/miniprot_output.gff', busco_ids)
+
+    # convert the miniprot lines to CDSpart hints
+    hints_lines = miniprot_to_hints(gff_lines)
+
+    # gff_lines is not compatible with getAnnoFastaFromJoingenes, we need to fix this!
+    try:
+        with open(args.output, "w") as out_handle:
+            for line in hints_lines:
+                out_handle.write(line)
+    except IOError:
+        print("Failed to open file", args.out)
+
+    # print the BUSCO scores for genome level statistics to STDOUT
+    print("The following BUSCOs were found in the genome:")
+    try:
+        with open(args.scratch_dir + '/' + 'summary.txt', 'r') as busco_summary:
+            for line in busco_summary:
+                line = line.strip()
+                print(line)
+    except IOError:
+        print("Failed to open file", args.scratch_dir + '/' + 'summary.txt')
+
+    # delete the temporary directory
+    shutil.rmtree(args.scratch_dir)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file