sourmash-bio · luizirber · Dec 14, 2019 · Dec 14, 2019 · Dec 14, 2019
diff --git a/sourmash/lca/command_rankinfo.py b/sourmash/lca/command_rankinfo.py
@@ -6,12 +6,12 @@
 import sys
 from collections import defaultdict
 
-from ..logging import error, debug, set_quiet
+from ..logging import error, debug, set_quiet, notify
 from . import lca_utils
 from ..sourmash_args import SourmashArgumentParser
 
 
-def make_lca_counts(dblist):
+def make_lca_counts(dblist, min_num=0):
     """
     Collect counts of all the LCAs in the list of databases.
 
@@ -22,10 +22,14 @@ def make_lca_counts(dblist):
     assignments = defaultdict(set)
     for lca_db in dblist:
         for hashval, idx_list in lca_db.hashval_to_idx.items():
+            if min_num and len(idx_list) < min_num:
+                continue
+
             for idx in idx_list:
-                lid = lca_db.idx_to_lid[idx]
-                lineage = lca_db.lid_to_lineage[lid]
-                assignments[hashval].add(lineage)
+                lid = lca_db.idx_to_lid.get(idx)
+                if lid is not None:
+                    lineage = lca_db.lid_to_lineage[lid]
+                    assignments[hashval].add(lineage)
 
     # now convert to trees -> do LCA & counts
     counts = defaultdict(int)
@@ -55,6 +59,8 @@ def rankinfo_main(args):
                    help='suppress non-error output')
     p.add_argument('-d', '--debug', action='store_true',
                    help='output debugging output')
+    p.add_argument('--minimum-num', type=int, default=0,
+                   help='Minimum number of different lineages a k-mer must be in to be counted')
     args = p.parse_args(args)
 
     if not args.db:
@@ -70,7 +76,7 @@ def rankinfo_main(args):
     dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
 
     # count all the LCAs across these databases
-    counts = make_lca_counts(dblist)
+    counts = make_lca_counts(dblist, args.minimum_num)
 
     # collect counts across all ranks
     counts_by_rank = defaultdict(int)
@@ -81,9 +87,12 @@ def rankinfo_main(args):
 
     # output!
     total = float(sum(counts_by_rank.values()))
-    for rank in lca_utils.taxlist():
-        count = counts_by_rank.get(rank, 0)
-        print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.))
+    if total == 0:
+        notify("(no hashvals with lineages found)")
+    else:
+        for rank in lca_utils.taxlist():
+            count = counts_by_rank.get(rank, 0)
+            print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.))
 
 
 if __name__ == '__main__':

diff --git a/tests/test_lca.py b/tests/test_lca.py
@@ -795,6 +795,54 @@ def test_rankinfo_on_single():
         assert not lines
 
 
+def test_rankinfo_no_tax():
+    """Index one signature with delmont-1.csv, then run 'lca rankinfo' on it."""
+    with utils.TempDirectory() as location:
+        lca_db = os.path.join(location, 'delmont-1.lca.json')
+        taxcsv = utils.get_test_data('lca/delmont-1.csv')
+        input_sig = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
+        cmd = ['lca', 'index', taxcsv, lca_db, input_sig]
+        status, out, err = utils.runscript('sourmash', cmd)
+        for captured in (cmd, out, err):
+            print(captured)
+
+        assert os.path.exists(lca_db)
+        for note in (
+            "** assuming column 'MAGs' is identifiers in spreadsheet",
+            "** assuming column 'Domain' is superkingdom in spreadsheet",
+            '1 identifiers used out of 1 distinct identifiers in spreadsheet.',
+        ):
+            assert note in err
+
+        # NOTE(review): nothing from this run is asserted on -- confirm runscript fails on error
+        status, out, err = utils.runscript('sourmash', ['lca', 'rankinfo', lca_db])
+
+
+def test_rankinfo_with_min():
+    """Run 'lca rankinfo --minimum-num 1' on two databases; match every output line."""
+    with utils.TempDirectory() as location:
+        databases = [utils.get_test_data('lca/dir1.lca.json'),
+                     utils.get_test_data('lca/dir2.lca.json')]
+        cmd = ['lca', 'rankinfo'] + databases + ['--minimum-num', '1']
+        status, out, err = utils.runscript('sourmash', cmd)
+        for captured in (cmd, out, err):
+            print(captured)
+
+        remaining = out.splitlines()
+        for expected in (
+            'superkingdom: 0 (0.0%)',
+            'phylum: 464 (12.8%)',
+            'class: 533 (14.7%)',
+            'order: 1050 (29.0%)',
+            'family: 695 (19.2%)',
+            'genus: 681 (18.8%)',
+            'species: 200 (5.5%)',
+            'strain: 0 (0.0%)',
+        ):
+            remaining.remove(expected)
+        assert not remaining
+
+
 def test_compare_csv():
     with utils.TempDirectory() as location:
         a = utils.get_test_data('lca/classify-by-both.csv')