Skip to content

Commit

Permalink
added gz support and geneID prefix
Browse files Browse the repository at this point in the history
  • Loading branch information
LarsGab committed Dec 16, 2024
1 parent a870d29 commit ea8ca5c
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 20 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@ The following Python libraries are required:
- biopython
- bcbio-gff
- requests
- gzip
- bz2

They can be installed with:
```
pip install pyBigWig bio scikit-learn biopython bcbio-gff requests
pip install pyBigWig bio scikit-learn biopython bcbio-gff requests gzip bz2
```
Tensorflow should be installed with GPU support. If you are using conda, you can install Tensorflow 2.10 with these [instructions](docs/install_tensorflow.md).

Expand Down
35 changes: 21 additions & 14 deletions bin/genome_fasta.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import gzip, bz2

class GenomeSequences:
def __init__(self, fasta_file='', np_file='', chunksize=20000, overlap=1000):
Expand Down Expand Up @@ -27,21 +28,27 @@ def __init__(self, fasta_file='', np_file='', chunksize=20000, overlap=1000):
#self.encode_sequences()

def read_fasta(self):
"""Read genome sequences from the specified FASTA file.
"""Read genome sequences from a FASTA file, it can be compressed with gz or bz2.
"""

with open(self.fasta_file, "r") as file:
lines = file.readlines()
current_sequence = ""
for line in lines:
if line.startswith(">"):
if current_sequence:
self.sequences.append(current_sequence)
current_sequence = ""
self.sequence_names.append(line[1:].strip())
else:
current_sequence += line.strip()
self.sequences.append(current_sequence)
if self.fasta_file.endswith('.gz'):
with gzip.open(self.fasta_file, 'rt') as file:
lines = file.readlines()
elif self.fasta_file.endswith('.bz2'):
with bz2.open(self.fasta_file, 'rt') as file:
lines = file.readlines()
else:
with open(self.fasta_file, "r") as file:
lines = file.readlines()
current_sequence = ""
for line in lines:
if line.startswith(">"):
if current_sequence:
self.sequences.append(current_sequence)
current_sequence = ""
self.sequence_names.append(line[1:].strip())
else:
current_sequence += line.strip()
self.sequences.append(current_sequence)

def encode_sequences(self, seq=None):
"""One-hot encode the sequences and store in self.one_hot_encoded.
Expand Down
23 changes: 18 additions & 5 deletions bin/tiberius.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Running Tiberius for single genome prediction
# ==============================================================

import sys, json, os, re, sys, csv, argparse, requests, time, logging, warnings, math
import sys, json, os, re, sys, csv, argparse, requests, time, logging, warnings, math, gzip, bz2
script_dir = os.path.dirname(os.path.realpath(__file__))
import subprocess as sp
import numpy as np
Expand Down Expand Up @@ -59,10 +59,9 @@ def assemble_transcript(exons, sequence, strand):
exon_seq = sequence.seq[exon[0]-1:exon[1]]
if strand == '-':
exon_seq = exon_seq.reverse_complement()
parts.append(str(exon_seq)) # Convert Seq object to string here
parts.append(str(exon_seq))

coding_seq = Seq("".join(parts))
# print(len(coding_seq))
if len(coding_seq) > 0 and len(coding_seq)%3==0:
prot_seq = coding_seq.translate()
if prot_seq[-1] == '*':
Expand Down Expand Up @@ -117,6 +116,18 @@ def extract_tar_gz(file_path, dest_dir):
def is_writable(file_path):
return os.access(file_path, os.W_OK)

def load_genome(genome_path):
if genome_path.endswith(".gz"):
with gzip.open(genome_path, "rt") as file:
genome = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
elif genome_path.endswith(".bz2"):
with bz2.open(genome_path, "rt") as file:
genome = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
else:
with open(genome_path, "r") as file:
genome = SeqIO.to_dict(SeqIO.parse(file, "fasta"))
return genome

def main():
args = parseCmd()

Expand Down Expand Up @@ -272,7 +283,7 @@ def main():
filt=False)

# Load the genome sequence from the FASTA file
genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
genome = load_genome(genome_path)
anno_outp = Anno('', f'anno')
out_tx = {}
for tx_id, tx in anno.transcripts.items():
Expand All @@ -293,7 +304,7 @@ def main():
anno_outp.add_transcripts(out_tx, f'anno')
anno_outp.norm_tx_format()
anno_outp.find_genes()
anno_outp.rename_tx_ids()
anno_outp.rename_tx_ids(args.id_prefix)
anno_outp.write_anno(gtf_out)

prot_seq_out = ""
Expand Down Expand Up @@ -379,6 +390,8 @@ def parseCmd():
help='Length of sub-sequences used for parallelizing the prediction.', default=500004)
parser.add_argument('--batch_size', type=int,
help='Number of sub-sequences per batch.', default=16)
parser.add_argument('--id_prefix', type=str,
help='Prefix for gene and transcript IDs in output GTF file.', default='')

return parser.parse_args()

Expand Down

0 comments on commit ea8ca5c

Please sign in to comment.