Merge pull request #32 from microbial-pangenomes-lab/memory-leak
panfeed-get-kmers: avoid memory leaks with a lot of clusters
haneubau authored Aug 22, 2024
2 parents 3a91350 + 7952e5e commit b773fa4
Showing 1 changed file with 33 additions and 15 deletions.
48 changes: 33 additions & 15 deletions panfeed/get_kmers.py
@@ -69,6 +69,13 @@ def get_options():
                         default=False,
                         help="Only output passing k-mers (default is all)")
 
+    parser.add_argument("--clusters-per-iteration",
+                        type=int,
+                        default=15,
+                        help="Number of clusters to be considered in each iteration, "
+                             "a higher number means faster execution "
+                             "but higher memory usage (default %(default)d)")
+
     parser.add_argument("-v", action='count',
                         default=0,
                         help='Increase verbosity level')
@@ -110,21 +117,32 @@ def main():
     logger.info(f"Found {len(clusters)} gene clusters")
     logger.info(f"Found {len(kmers)} k-mers")
 
-    # load k-mers table piece-wise to reduce memory footprint
-    # thanks to SO: https://stackoverflow.com/a/13653490/1237531
-    iter_k = pd.read_csv(args.kmers, sep='\t',
-                         iterator=True, chunksize=100_000)
-    k = pd.concat([x[x['cluster'].isin(clusters)]
-                   for x in iter_k]).set_index(['cluster', 'k-mer'])
-
-    a = a.join(h, how='inner')
-    if args.only_passing:
-        how = 'left'
-    else:
-        how = 'right'
-    a = a.reset_index().set_index(['cluster', 'k-mer']).join(k, how=how)
-
-    a.to_csv(sys.stdout, sep='\t')
+    first = True
+    clusters = list(clusters)
+    # only look for a limited number of clusters
+    # to avoid memory spikes when the number of clusters is too big
+    # the k-mers file has to be parsed multiple times unfortunately
+    for i, idx in enumerate(range(0, len(clusters), args.clusters_per_iteration)):
+        bunch = clusters[idx: idx + args.clusters_per_iteration]
+
+        logger.info(f"Searching for k-mers for {len(bunch)} clusters (iteration {i+1})")
+        # load k-mers table piece-wise to reduce memory footprint
+        # thanks to SO: https://stackoverflow.com/a/13653490/1237531
+        iter_k = pd.read_csv(args.kmers, sep='\t',
+                             iterator=True, chunksize=100_000)
+        k = pd.concat([x[x['cluster'].isin(bunch)]
+                       for x in iter_k]).set_index(['cluster', 'k-mer'])
+
+        b = a.join(h, how='inner')
+        if args.only_passing:
+            how = 'left'
+        else:
+            how = 'right'
+        b = b.reset_index().set_index(['cluster', 'k-mer']).join(k, how=how)
+
+        b.to_csv(sys.stdout, sep='\t', header=first)
+
+        first = False
 
 
 if __name__ == "__main__":
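
For reference, the batching pattern this patch introduces can be distilled into a self-contained sketch. This is illustrative only: filter_kmers_in_batches, the input file name, and the example cluster names are hypothetical and not part of panfeed; the column names (cluster, k-mer) and the chunked pd.read_csv call mirror the diff above.

import sys

import pandas as pd


def filter_kmers_in_batches(kmers_path, clusters, batch_size=15,
                            chunksize=100_000):
    """Yield the k-mer rows matching each batch of clusters.

    The TSV is re-read once per batch, so peak memory is bounded by
    one chunk plus the rows matching batch_size clusters, rather than
    the rows matching every cluster at once.
    """
    clusters = list(clusters)
    for idx in range(0, len(clusters), batch_size):
        bunch = set(clusters[idx: idx + batch_size])
        # stream the file in fixed-size chunks, keeping only the rows
        # that belong to the current batch of clusters
        iter_k = pd.read_csv(kmers_path, sep='\t',
                             iterator=True, chunksize=chunksize)
        yield pd.concat([chunk[chunk['cluster'].isin(bunch)]
                         for chunk in iter_k])


if __name__ == "__main__":
    # hypothetical usage: write one concatenated table to stdout,
    # emitting the header only for the first batch
    first = True
    for batch in filter_kmers_in_batches("kmers.tsv",
                                         ["cluster_1", "cluster_2"]):
        batch.to_csv(sys.stdout, sep='\t', header=first, index=False)
        first = False

The design choice is the same as in the patch: repeated parses of the k-mers file are traded for a bounded working set, and writing the header only on the first batch keeps the concatenated output a single valid table.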
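The help text for the new flag describes a speed/memory tradeoff, and the loop makes it concrete: every iteration re-parses the entire k-mers file, so the number of full passes is ceil(n_clusters / clusters_per_iteration). A back-of-the-envelope check (the cluster count below is made up for illustration):

import math

n_clusters = 3000  # hypothetical number of gene clusters
for clusters_per_iteration in (15, 50, 150):
    passes = math.ceil(n_clusters / clusters_per_iteration)
    print(f"{clusters_per_iteration:4d} clusters/iteration -> {passes} file passes")
# 15 (the default) gives 200 passes: slowest, lowest memory;
# 150 gives 20 passes: faster, but roughly 10x the per-batch memory.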