-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #25 from masakistan/counter_dtype_generalize
changed from int to uint16_t for counts
- Loading branch information
Showing
8 changed files
with
104 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,11 @@ | ||
#pragma once | ||
|
||
#include <stdint.h> | ||
|
||
// Compile-time sizing constants. | ||
// NOTE(review): CAPACITY and NHASHES are not described anywhere in this view; | ||
// presumably a per-container capacity and the number of hash functions - confirm. | ||
#define CAPACITY 4096 | ||
#define NHASHES 12 | ||
#define HASHSIZE 512 // HASHSIZE % 32 must be 0 | ||
|
||
|
||
// Integer type used for counts (this commit switches counts from int to uint16_t). | ||
typedef uint16_t count_dtype; | ||
// Largest value a count_dtype can represent; keep in sync with the typedef above. | ||
#define MAXCOUNT UINT16_MAX |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/usr/bin/env python
"""Time parallel k-mer counting over a FASTA file with kcollections.

Usage: <script> K THREADS FASTA [CHECK]

    K        k-mer length passed to ``kcollections.Kcounter``.
    THREADS  value passed to ``Kcounter.parallel_add_init``.
    FASTA    path of the FASTA file whose sequences are added.
    CHECK    optional; when any fourth argument is present, the FASTA file
             is read a second time and every k-mer is verified to be a
             member of the counter.

The script prints one timing line per sequence added, the total elapsed
time, the number of k-mers stored, every (k-mer, count) pair, and finally
the number of k-mers verified.
"""

import kcollections
import sys
import time
from tqdm import tqdm  # NOTE: not referenced by this script; import kept as-is.


def report(*fields):
    """Print *fields* separated by single spaces on one line.

    Uses a single-argument ``print(...)`` so the output is identical under
    the Python 2 print statement and the Python 3 print function.
    """
    print(' '.join(str(field) for field in fields))


def read_fasta_sequences(lines):
    """Yield each non-empty sequence found in an iterable of FASTA lines.

    A line starting with ``>`` ends the current record; every other line is
    stripped of surrounding whitespace and appended to the current record.
    Records that end up empty are skipped.
    """
    parts = []
    for line in lines:
        if line.startswith('>'):
            seq = ''.join(parts)
            if seq:
                yield seq
            parts = []
        else:
            # Collect pieces and join once per record: repeated ``+=`` on a
            # growing string is quadratic for long sequences.
            parts.append(line.strip())
    # The last record has no following header line to terminate it.
    seq = ''.join(parts)
    if seq:
        yield seq


def iter_kmers(seq, k):
    """Yield every length-*k* substring of *seq*, left to right."""
    for start in range(len(seq) - k + 1):
        yield seq[start:start + k]


def main(argv):
    """Run the benchmark described in the module docstring.

    Raises:
        SystemExit: when fewer than three positional arguments are given.
        AssertionError: when CHECK is requested and a k-mer from the input
            is not found in the counter.
    """
    if len(argv) < 4:
        sys.exit('usage: %s K THREADS FASTA [CHECK]' % argv[0])

    k = int(argv[1])
    threads = int(argv[2])
    fasta_path = argv[3]

    ks = kcollections.Kcounter(k)
    # NOTE(review): presumably starts the worker threads that the later
    # parallel_add_seq calls feed - confirm against the kcollections docs.
    ks.parallel_add_init(threads)

    start_time = time.time()

    with open(fasta_path, 'r') as fh:
        for index, seq in enumerate(read_fasta_sequences(fh)):
            tstart_time = time.time()
            ks.parallel_add_seq(seq, len(seq))
            telapsed_time = time.time() - tstart_time
            report(index, '\tadded seq of len', len(seq), telapsed_time)
            # Flush so progress is visible while long inputs are processed.
            sys.stdout.flush()

    ks.parallel_add_join()

    report('elapsed time:', time.time() - start_time)

    report(len(ks), 'kmers')
    report('done!')
    report('checking correctness')

    for kmer, count in ks.iteritems():
        report(kmer, count)

    checked = 0
    if len(argv) > 4:
        with open(fasta_path, 'r') as fh:
            for seq in read_fasta_sequences(fh):
                for kmer in iter_kmers(seq, k):
                    # Raise explicitly instead of ``assert`` so the check is
                    # not stripped when Python runs with -O.
                    if kmer not in ks:
                        raise AssertionError("not find: " + kmer)
                    checked += 1
    report('checked', checked, 'kmers')

    del ks


if __name__ == '__main__':
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters