Skip to content

Commit

Permalink
Bugfix v0.1.2 (#9)
Browse files Browse the repository at this point in the history
Fix a few bugs that appeared when testing v0.1.1 of the commec package.

* The nucleotide search was erroneously pointing at `nt_dir` instead of `nt_db`
* Errors would appear if the `#` character was present in a FASTA ID
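* Two pandas `FutureWarning`s (from `pd.concat` on possibly-empty frames and from chained positional indexing `iloc[0][0]`) would appear in the file output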
  • Loading branch information
alexanian authored Jul 2, 2024
1 parent 68e4493 commit 3e9d943
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
9 changes: 7 additions & 2 deletions commec/check_reg_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def main():
sys.stdout.write("\t\t Species: %s (taxid(s): %s) (%s percent identity to query)\n" % (species_list, taxid_list, percent_ids))
sys.stdout.write("\t\t Description: %s\n" % (desc))
# could explicitly list which are and aren't regulated?
# otherwise, raise a flag and say which superkingdom the flag belongs to
# otherwise, raise a flag and say which superkingdom the flag belongs to
elif (n_reg == n_total):
if subset['superkingdom'][0] == "Viruses":
reg_vir = 1
Expand All @@ -133,7 +133,12 @@ def main():
if subset['superkingdom'][0] == "Eukaryota":
org = "eukaryote"
reg_fung = 1
hits = pd.concat([hits, subset[['q. start', 'q. end']]])

new_hits = subset[['q. start', 'q. end']].dropna()
if not new_hits.empty and not hits.empty:
hits = pd.concat([hits, new_hits], ignore_index=True)
elif not new_hits.empty:
hits = new_hits.copy()
sys.stdout.write("\t\t --> Best match to sequence(s) %s at bases %s found in only regulated organisms: FLAG (%s)\n" % (gene_names, coordinates, org))
sys.stdout.write("\t\t Species: %s (taxid(s): %s) (%s percent identity to query)\n" % (species_list, taxid_list, percent_ids))
sys.stdout.write("\t\t Description: %s\n" % (desc))
Expand Down
2 changes: 1 addition & 1 deletion commec/fetch_nc_bits.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def fetch_sequences(seqid, nc_bits, f_file, outfile):
elif nc_bits == []: # if the entire sequence, save regions <50 bases, is covered with protein, skip nt scan
sys.stdout.write("\t\t --> no noncoding regions >= 50 bases found, skipping nt scan\n")
else:
seqid = blast.iloc[0][0]
seqid = blast.iloc[0, 0]
fetch_sequences(seqid, nc_bits, f_file, outfile)


Expand Down
7 changes: 4 additions & 3 deletions commec/screen.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def get_output_prefix(input_file, prefix_arg=""):

def get_cleaned_fasta(input_file, out_prefix):
"""
Return a FASTA where whitespace (including non-breaking spaces) is replaced with underscores.
Return a FASTA where whitespace (including non-breaking spaces) and illegal characters are
replaced with underscores.
"""
cleaned_file = f"{out_prefix}.cleaned.fasta"
with (
Expand All @@ -129,7 +130,7 @@ def get_cleaned_fasta(input_file, out_prefix):
for line in fin:
line = line.strip()
modified_line = "".join(
"_" if c.isspace() or c == "\xc2\xa0" else c for c in line
"_" if c.isspace() or c == "\xc2\xa0" or c == "#" else c for c in line
)
fout.write(f"{modified_line}{os.linesep}")
return cleaned_file
Expand Down Expand Up @@ -273,7 +274,7 @@ def screen_nucleotides(
"-query",
noncoding_fasta,
"-db",
screen_dbs.nt_dir,
screen_dbs.nt_db,
"-out",
nt_output,
"-outfmt",
Expand Down

0 comments on commit 3e9d943

Please sign in to comment.