Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(IPVC-2276): skip accessions from gff that are not in the tx_info file, log missing acs to file #18

Merged
merged 10 commits into from
Apr 5, 2024
60 changes: 32 additions & 28 deletions sbin/exonset-to-seqinfo
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
#!/usr/bin/env python

"""write SeqInfo files from fasta"""
"""Writes SeqInfo files from fasta to stdout. Writes transcript accessions that are missing from
SeqRepo to a file called exonset_missing_accessions.txt in the same directory as the input files."""

import argparse
import configparser as ConfigParser
import gzip
import itertools
import logging
import logging.config
import pkg_resources
import os
import re
import sys

from bioutils.digests import seq_md5
import configparser as ConfigParser
import pkg_resources
from biocommons.seqrepo import SeqRepo
# from multifastadb import MultiFastaDB
from bioutils.digests import seq_md5

from uta.formats.exonset import ExonSetReader
from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
Expand Down Expand Up @@ -49,6 +49,8 @@ if __name__ == "__main__":
ac_re = re.compile("[NX][CGMPR]_")

opts = parse_args(sys.argv[1:])
input_dir = os.path.dirname(opts.FILES[0])

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add an output directory to the supported arguments. So we can have this file directed to the directory the user chooses?

missing_accessions_file_path = os.path.join(input_dir, "exonset_missing_accessions.txt")

cf = ConfigParser.ConfigParser()
for conf_fn in opts.conf:
Expand All @@ -72,25 +74,27 @@ if __name__ == "__main__":
# this is just a fancy way to make a set of all tx_ac and alt_ac accessions
acs = sorted(
set(itertools.chain.from_iterable((es.tx_ac, es.alt_ac) for es in esr)))

acs_not_found = set()
for ac in acs:
try:
seq = str(sr[ac])
except KeyError:
logging.warning("Sequence not found: " + ac)
acs_not_found.update([ac])
continue

si = SeqInfo(
ac=ac,
descr=None,
len=len(seq),
md5=seq_md5(seq),
origin=opts.origin,
seq=None,
)
siw.write(si)

if acs_not_found:
raise RuntimeError("Sequences for {} accessions not found".format(len(acs_not_found)))
missing_acs_count = 0
with open(missing_accessions_file_path, "w") as f:
f.write("tx_ac\n")
for ac in acs:
try:
seq = str(sr[ac])
except KeyError:
logging.warning("Sequence not found: " + ac)
missing_acs_count += 1
f.write(ac + "\n")
continue

si = SeqInfo(
ac=ac,
descr=None,
len=len(seq),
md5=seq_md5(seq),
origin=opts.origin,
seq=None,
)
siw.write(si)

# Log the number of missing accessions written to the file
logging.info(f"{missing_acs_count} accessions were missing from Seqrepo. See {missing_accessions_file_path}.")
4 changes: 2 additions & 2 deletions sbin/ncbi_parse_genomic_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def _get_exon_number_from_id(alignment_id: str) -> int:
return int(alignment_id.split("-")[-1])


def parse_gff_file(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the name to reflect the function takes a list of files

tx_data = defaultdict(list)
for file_path in file_paths:
with open_file(file_path) as f:
Expand Down Expand Up @@ -152,7 +152,7 @@ def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str:
gff_files = args.gff_files
esw = ExonSetWriter(sys.stdout)

transcript_alignments = parse_gff_file(gff_files)
transcript_alignments = parse_gff_files(gff_files)
logger.info(
f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}"
)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_ncbi_parse_genomic_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sbin.ncbi_parse_genomic_gff import (
get_zero_based_exon_ranges,
GFFRecord,
parse_gff_file,
parse_gff_files,
parse_gff_record,
)

Expand Down Expand Up @@ -160,7 +160,7 @@ def test_parse_gff_record_raises_unparseable_id(self):
def test_parse_gff_file(self):
# Test parsing the entire uncompressed GFF file
expected_result = {"rna-NR_046018.2": self.gff_records}
parsed_result = parse_gff_file([self.temp_gff.name])
parsed_result = parse_gff_files([self.temp_gff.name])
self.assertEqual(parsed_result, expected_result)

def test_parse_gff_file_accepts_gzipped_files(self):
Expand All @@ -171,7 +171,7 @@ def test_parse_gff_file_accepts_gzipped_files(self):

# Test parsing the gzipped GFF file
expected_result = {"rna-NR_046018.2": self.gff_records}
parsed_result = parse_gff_file([self.temp_gff.name + ".gz"])
parsed_result = parse_gff_files([self.temp_gff.name + ".gz"])
self.assertEqual(parsed_result, expected_result)

def test_get_zero_based_exon_ranges(self):
Expand Down