Bugfix v0.1.2 (#9)

Fix a few bugs that appeared when testing v0.1.1 of the commec package:

* The nucleotide search was erroneously pointing at `nt_dir` instead of `nt_db`.
* Errors would appear if the `#` sign was present in a FASTA ID.
* Two `FutureWarning`s would appear in the file output.
ibbis-bio · Jul 2, 2024 · 3e9d943 · 3e9d943
1 parent 68e4493
commit 3e9d943
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 6 deletions.
diff --git a/commec/check_reg_path.py b/commec/check_reg_path.py
@@ -121,7 +121,7 @@ def main():
                     sys.stdout.write("\t\t     Species: %s (taxid(s): %s) (%s percent identity to query)\n" % (species_list, taxid_list, percent_ids))
                     sys.stdout.write("\t\t     Description: %s\n" % (desc))
                     # could explicitly list which are and aren't regulated?
-                    # otherwise, raise a flag and say which superkingdom the flag belongs to
+                # otherwise, raise a flag and say which superkingdom the flag belongs to
                 elif (n_reg == n_total):
                     if subset['superkingdom'][0] == "Viruses":
                         reg_vir = 1
@@ -133,7 +133,12 @@ def main():
                         if subset['superkingdom'][0] == "Eukaryota":
                             org = "eukaryote"
                             reg_fung = 1
-                    hits = pd.concat([hits, subset[['q. start', 'q. end']]])
+
+                    new_hits = subset[['q. start', 'q. end']].dropna()
+                    if not new_hits.empty and not hits.empty:
+                         hits = pd.concat([hits, new_hits], ignore_index=True)
+                    elif not new_hits.empty:
+                        hits = new_hits.copy()
                     sys.stdout.write("\t\t --> Best match to sequence(s) %s at bases %s found in only regulated organisms: FLAG (%s)\n" % (gene_names, coordinates, org))
                     sys.stdout.write("\t\t     Species: %s (taxid(s): %s) (%s percent identity to query)\n" % (species_list, taxid_list, percent_ids))
                     sys.stdout.write("\t\t     Description: %s\n" % (desc))

diff --git a/commec/fetch_nc_bits.py b/commec/fetch_nc_bits.py
@@ -78,7 +78,7 @@ def fetch_sequences(seqid, nc_bits, f_file, outfile):
 elif nc_bits == []: # if the entire sequence, save regions <50 bases, is covered with protein, skip nt scan
     sys.stdout.write("\t\t --> no noncoding regions >= 50 bases found, skipping nt scan\n")
 else: 
-    seqid = blast.iloc[0][0]
+    seqid = blast.iloc[0, 0]
     fetch_sequences(seqid, nc_bits, f_file, outfile)
 
 

diff --git a/commec/screen.py b/commec/screen.py
@@ -119,7 +119,8 @@ def get_output_prefix(input_file, prefix_arg=""):
 
 def get_cleaned_fasta(input_file, out_prefix):
     """
-    Return a FASTA where whitespace (including non-breaking spaces) is replaced with underscores.
+    Return a FASTA where whitespace (including non-breaking spaces) and illegal characters are
+    replaced with underscores.
     """
     cleaned_file = f"{out_prefix}.cleaned.fasta"
     with (
@@ -129,7 +130,7 @@ def get_cleaned_fasta(input_file, out_prefix):
         for line in fin:
             line = line.strip()
             modified_line = "".join(
-                "_" if c.isspace() or c == "\xc2\xa0" else c for c in line
+                "_" if c.isspace() or c == "\xc2\xa0" or c == "#" else c for c in line
             )
             fout.write(f"{modified_line}{os.linesep}")
     return cleaned_file
@@ -273,7 +274,7 @@ def screen_nucleotides(
             "-query",
             noncoding_fasta,
             "-db",
-            screen_dbs.nt_dir,
+            screen_dbs.nt_db,
             "-out",
             nt_output,
             "-outfmt",