Merge pull request #32 from microbial-pangenomes-lab/memory-leak
panfeed-get-kmers: avoid memory leaks with a lot of clusters
haneubau authored Aug 22, 2024
2 parents 3a91350 + 7952e5e commit b773fa4
Showing 1 changed file with 33 additions and 15 deletions.
48 changes: 33 additions & 15 deletions panfeed/get_kmers.py
@@ -69,6 +69,13 @@ def get_options():
                         default=False,
                         help="Only output passing k-mers (default is all)")
 
+    parser.add_argument("--clusters-per-iteration",
+                        type=int,
+                        default=15,
+                        help="Number of clusters to be considered in each iteration, "
+                             "a higher number means faster execution "
+                             "but higher memory usage (default %(default)d)")
+
     parser.add_argument("-v", action='count',
                         default=0,
                         help='Increase verbosity level')
@@ -110,21 +117,32 @@ def main():
     logger.info(f"Found {len(clusters)} gene clusters")
     logger.info(f"Found {len(kmers)} k-mers")
 
-    # load k-mers table piece-wise to reduce memory footprint
-    # thanks to SO: https://stackoverflow.com/a/13653490/1237531
-    iter_k = pd.read_csv(args.kmers, sep='\t',
-                         iterator=True, chunksize=100_000)
-    k = pd.concat([x[x['cluster'].isin(clusters)]
-                   for x in iter_k]).set_index(['cluster', 'k-mer'])
-
-    a = a.join(h, how='inner')
-    if args.only_passing:
-        how = 'left'
-    else:
-        how = 'right'
-    a = a.reset_index().set_index(['cluster', 'k-mer']).join(k, how=how)
-
-    a.to_csv(sys.stdout, sep='\t')
+    first = True
+    clusters = list(clusters)
+    # only look for a limited number of clusters
+    # to avoid memory spikes when the number of clusters is too big
+    # the k-mers file has to be parsed multiple times unfortunately
+    for i, idx in enumerate(range(0, len(clusters), args.clusters_per_iteration)):
+        bunch = clusters[idx: idx + args.clusters_per_iteration]
+
+        logger.info(f"Searching for k-mers for {len(bunch)} clusters (iteration {i+1})")
+        # load k-mers table piece-wise to reduce memory footprint
+        # thanks to SO: https://stackoverflow.com/a/13653490/1237531
+        iter_k = pd.read_csv(args.kmers, sep='\t',
+                             iterator=True, chunksize=100_000)
+        k = pd.concat([x[x['cluster'].isin(bunch)]
+                       for x in iter_k]).set_index(['cluster', 'k-mer'])
+
+        b = a.join(h, how='inner')
+        if args.only_passing:
+            how = 'left'
+        else:
+            how = 'right'
+        b = b.reset_index().set_index(['cluster', 'k-mer']).join(k, how=how)
+
+        b.to_csv(sys.stdout, sep='\t', header=first)
+
+        first = False
 
 
 if __name__ == "__main__":
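
For reference, the batching pattern this patch introduces can be distilled into a self-contained sketch. This is illustrative only: filter_kmers_in_batches, the input file name, and the example cluster names are hypothetical and not part of panfeed; the column names (cluster, k-mer) and the chunked pd.read_csv call mirror the diff above.

import sys

import pandas as pd


def filter_kmers_in_batches(kmers_path, clusters, batch_size=15,
                            chunksize=100_000):
    """Yield the k-mer rows matching each batch of clusters.

    The TSV is re-read once per batch, so peak memory is bounded by
    one chunk plus the rows matching batch_size clusters, rather than
    the rows matching every cluster at once.
    """
    clusters = list(clusters)
    for idx in range(0, len(clusters), batch_size):
        bunch = set(clusters[idx: idx + batch_size])
        # stream the file in fixed-size chunks, keeping only the rows
        # that belong to the current batch of clusters
        iter_k = pd.read_csv(kmers_path, sep='\t',
                             iterator=True, chunksize=chunksize)
        yield pd.concat([chunk[chunk['cluster'].isin(bunch)]
                         for chunk in iter_k])


if __name__ == "__main__":
    # hypothetical usage: write one concatenated table to stdout,
    # emitting the header only for the first batch
    first = True
    for batch in filter_kmers_in_batches("kmers.tsv",
                                         ["cluster_1", "cluster_2"]):
        batch.to_csv(sys.stdout, sep='\t', header=first, index=False)
        first = False

The design choice is the same as in the patch: repeated parses of the k-mers file are traded for a bounded working set, and writing the header only on the first batch keeps the concatenated output a single valid table.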
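The help text for the new flag describes a speed/memory tradeoff, and the loop makes it concrete: every iteration re-parses the entire k-mers file, so the number of full passes is ceil(n_clusters / clusters_per_iteration). A back-of-the-envelope check (the cluster count below is made up for illustration):

import math

n_clusters = 3000  # hypothetical number of gene clusters
for clusters_per_iteration in (15, 50, 150):
    passes = math.ceil(n_clusters / clusters_per_iteration)
    print(f"{clusters_per_iteration:4d} clusters/iteration -> {passes} file passes")
# 15 (the default) gives 200 passes: slowest, lowest memory;
# 150 gives 20 passes: faster, but roughly 10x the per-batch memory.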