Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Devel #92

Merged
merged 51 commits into from
Jul 1, 2023
Merged
Changes from 1 commit
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
c99cd82
Clean up bustools count
Yenaled Jan 22, 2023
26999ad
Preliminary untested count split matrix option
Yenaled Jan 23, 2023
53b0acc
fix some typos
Yenaled Jan 23, 2023
0d8cec3
fix another typo
Yenaled Jan 23, 2023
a2f3837
change unordered_map
Yenaled Jan 23, 2023
5ca38bf
Change typedef to macro for map
Yenaled Jan 23, 2023
2a74dc3
fix bustools inspect
Yenaled Jan 23, 2023
41f631e
fix mash
Yenaled Jan 24, 2023
07eeb66
fix undefined reference
Yenaled Jan 24, 2023
bf16198
better hashing
Yenaled Jan 24, 2023
7dd1eb8
fix roaring
Yenaled Jan 24, 2023
19b3674
ecmapinv: attempt to use bitmap
Yenaled Jan 24, 2023
097f991
fix
Yenaled Jan 24, 2023
67ebc46
added -s count option to main
Yenaled Feb 6, 2023
3f38e15
fix split
Yenaled Feb 6, 2023
73bf616
updates to make 3-matrix sparse
Yenaled Feb 7, 2023
2ac7d9f
mask length in bustools correct
Yenaled Feb 28, 2023
3147f45
fix len_mask
Yenaled Feb 28, 2023
3d3df88
fix len_mask again
Yenaled Feb 28, 2023
e3b6ff4
undo bitmap and unordered map
Yenaled Mar 1, 2023
7d4a558
Revert "undo bitmap and unordered map"
Yenaled Mar 1, 2023
82959c0
back to unordered map
Yenaled Mar 1, 2023
86bd1bc
style add parentheses
Yenaled Mar 1, 2023
ae96146
try again to undo bitmap
Yenaled Mar 5, 2023
eb58f13
adds multicore sorting
pmelsted Mar 14, 2023
b762a52
better partition function
pmelsted Mar 14, 2023
679f84b
Add priority option
Yenaled Mar 15, 2023
2aea2ee
updated common for count_mtx_priority
Yenaled Mar 15, 2023
3f287d5
batches writes
pmelsted Mar 15, 2023
a7af47a
Priority rules for mtx types
Yenaled Mar 17, 2023
59fcc9e
Merge pull request #90 from BUStools/sort_mc
Yenaled Mar 17, 2023
7ef8da3
update count to output barcode prefix
Yenaled Mar 22, 2023
5dc2bcf
cleanup len_mask in bustools correct
Yenaled Mar 22, 2023
44d724c
fix count_mtx_priority
Yenaled Mar 22, 2023
f5e9de4
fix count_mtx_priority w/ UMI collision logic
Yenaled Mar 27, 2023
c31d2f1
Try multicomponent barcodes
Yenaled Apr 11, 2023
49d69ad
fixed stuff with multicomponent barcodes
Yenaled Apr 11, 2023
8274a6f
more fixes
Yenaled Apr 11, 2023
8e70f28
another fix
Yenaled Apr 11, 2023
6a18f28
some final fixes (hopefully)
Yenaled Apr 11, 2023
5d09ac1
cleanup
Yenaled Apr 11, 2023
88ffe8d
make multipart barcodes more lax/flexible
Yenaled Apr 11, 2023
d04a222
Update bustools_correct.cpp
Yenaled Apr 11, 2023
58d5477
bustools correct --replace: initial implementation
Yenaled Apr 24, 2023
61f9883
bustools correct: more features for replace
Yenaled Apr 24, 2023
f4fd12a
fix minor bug
Yenaled Apr 24, 2023
01f1ac5
fix bustools correct/replace
Yenaled Apr 25, 2023
676f106
fix bustools correct replace
Yenaled Apr 27, 2023
816564d
Merge pull request #91 from BUStools/dlist
Yenaled May 31, 2023
31b90d8
version bump
pmelsted May 31, 2023
7a11c5a
update bustools count prefix to always be len 16
Yenaled Jun 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
better hashing
Yenaled committed Jan 24, 2023
commit bf16198a9aafba5e93a3bef1d229723e83a8ba44
24 changes: 22 additions & 2 deletions src/Common.hpp
Original file line number Diff line number Diff line change
@@ -10,6 +10,8 @@
#include <unordered_map>
#include <sstream>
#include "robin_hood.h"
#include "roaring.h"
#include "hash.hpp"

#define BUSTOOLS_VERSION "0.42.0"

@@ -160,14 +162,32 @@ struct SortedVectorHasher
int i = 0;
for (auto x : v)
{
uint64_t t = std::hash<int32_t>{}(x);
uint64_t t;
MurmurHash3_x64_64(&x,sizeof(x), 0,&t);
t = (x >> i) | (x << (64 - i));
r = r ^ t;
i = (i + 1) % 64;
i = (i+1)&63;
}
return r;
}
};

// Hash functor for Roaring bitmaps, used as the hasher of EcMapInv.
//
// Every element visited while iterating the bitmap is hashed with
// MurmurHash3_x64_64 (seed 0). The digest is rotated by a counter that
// advances with each element (wrapping at 64) and XOR-ed into the result, so
// the contribution of an element depends on its position in iteration order.
// An empty bitmap hashes to 0.
struct RoaringHasher {
  size_t operator()(const Roaring& rr) const {
    uint64_t r = 0;
    int i = 0;
    for (auto x : rr) {
      uint64_t t;
      MurmurHash3_x64_64(&x, sizeof(x), 0, &t);
      // Rotate the 64-bit Murmur digest (not the raw element, which would
      // throw the digest away). Masking the complementary shift keeps it in
      // [0, 63], so i == 0 never produces an undefined 64-bit shift.
      t = (t >> i) | (t << ((64 - i) & 63));
      r ^= t;
      i = (i + 1) & 63; // same as (i + 1) % 64
    }
    return r;
  }
};
// Map keyed by a Roaring bitmap with int32_t values, hashed with RoaringHasher.
// NOTE(review): u_map_ is not defined in the visible part of this header —
// presumably the project's hash-map macro/alias; confirm.
typedef u_map_<Roaring, int32_t, RoaringHasher> EcMapInv;

// Helpers operating on vectors of int32_t; only declared here, the definitions
// live outside this header.
// NOTE(review): judging by the names these compute set intersection / union of
// their inputs — confirm ordering and uniqueness requirements at the definitions.
std::vector<int32_t> intersect(std::vector<int32_t> &u, std::vector<int32_t> &v);
std::vector<int32_t> union_vectors(const std::vector<std::vector<int32_t>> &v);
std::vector<int32_t> intersect_vectors(const std::vector<std::vector<int32_t>> &v);
194 changes: 194 additions & 0 deletions src/hash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#include <stdint.h>
#include <cstring>
#include "hash.hpp"

// Rotates `value` left by `amount` bit positions within 64 bits.
// The shift count is reduced modulo 64, so amounts of 0 or 64 return `value`
// unchanged instead of performing an undefined full-width shift.
uint64_t inline _rotl64(uint64_t value, int8_t amount) {
  const unsigned left = static_cast<unsigned>(amount) & 63u;
  const unsigned right = (64u - left) & 63u;
  return (value << left) | (value >> right);
}

// Computes a 32-bit hash of the `len` bytes starting at `data`.
// Returns 0 when `data` is NULL or `len` is not positive.
// The input is consumed four bytes at a time through get16bits; the 1-3
// bytes left over are folded in separately before a final mixing sequence.
uint32_t SuperFastHash (const char *data, int len) {
  if (data == NULL || len <= 0) {
    return 0;
  }

  uint32_t acc = len;            // the state starts out as the byte length
  const int leftover = len & 3;  // bytes that do not fill a 4-byte group
  int quads = len >> 2;          // number of complete 4-byte groups

  /* Complete 4-byte groups */
  while (quads > 0) {
    acc += get16bits (data);
    const uint32_t mixed = (get16bits (data+2) << 11) ^ acc;
    acc = (acc << 16) ^ mixed;
    acc += acc >> 11;
    data += 2*sizeof (uint16_t);
    --quads;
  }

  /* Remaining 1-3 bytes */
  if (leftover == 3) {
    acc += get16bits (data);
    acc ^= acc << 16;
    acc ^= data[sizeof (uint16_t)] << 18;
    acc += acc >> 11;
  } else if (leftover == 2) {
    acc += get16bits (data);
    acc ^= acc << 11;
    acc += acc >> 17;
  } else if (leftover == 1) {
    acc += *data;
    acc ^= acc << 10;
    acc += acc >> 1;
  }

  /* Final mixing of the accumulated state */
  acc ^= acc << 3;
  acc += acc >> 5;
  acc ^= acc << 4;
  acc += acc >> 17;
  acc ^= acc << 25;
  acc += acc >> 6;

  return acc;
}




//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

// Reads the i-th 64-bit block of the sequence starting at `p`.
// The bytes are copied out with memcpy rather than dereferenced directly,
// because callers obtain `p` by casting an arbitrary byte pointer and the
// address is therefore not guaranteed to be 8-byte aligned. The value is the
// same native-endian word a direct load would produce.
inline uint64_t getblock ( const uint64_t *p, int i ) {
  uint64_t block;
  std::memcpy(&block, p + i, sizeof(block));
  return block;
}

//----------
// Block mix - combine the key bits with the hash bits and scramble everything

// Mixes one pair of key words (k1, k2) into the running hash state (h1, h2).
// All six arguments are modified in place: the key words are scrambled, the
// hash words absorb them, and the multipliers c1/c2 are advanced so that the
// next call uses different constants.
inline void bmix64 ( uint64_t& h1, uint64_t& h2, uint64_t& k1, uint64_t& k2, uint64_t& c1, uint64_t& c2 ) {
  // First lane: scramble k1 and fold it into h1.
  k1 = _rotl64(k1 * c1, 23) * c2;
  h1 = (h1 ^ k1) + h2;

  h2 = _rotl64(h2, 41);

  // Second lane: same pattern with the multipliers swapped.
  k2 = _rotl64(k2 * c2, 23) * c1;
  h2 = (h2 ^ k2) + h1;

  // Stir the hash state.
  h1 *= 3;
  h1 += 0x52dce729;
  h2 *= 3;
  h2 += 0x38495ab5;

  // Step the multipliers for the following block.
  c1 *= 5;
  c1 += 0x7b7d159c;
  c2 *= 5;
  c2 += 0x6bce6396;
}

//----------
// Finalization mix - avalanches all bits to within 0.05% bias

// Finalization mix for one 64-bit word: two rounds of "xor with the value
// shifted right by 33, then multiply by an odd constant", followed by one
// last xor-shift. Returns the mixed word; 0 maps to 0.
inline uint64_t fmix64 ( uint64_t k ) {
  const uint64_t multipliers[2] = { 0xff51afd7ed558ccd, 0xc4ceb9fe1a85ec53 };

  for (const uint64_t m : multipliers) {
    k = (k ^ (k >> 33)) * m;
  }

  return k ^ (k >> 33);
}

// Computes a 128-bit hash of `len` bytes at `key` and writes it to `out`.
//
//   key  - start of the input bytes
//   len  - number of input bytes
//   seed - XOR-ed into both initial state words
//   out  - receives 16 bytes: h1 followed by h2, each in native byte order.
//          It may be any writable 16-byte buffer; no particular alignment or
//          element type is required because the result is stored via memcpy.
void MurmurHash3_x64_128 ( const void *key, const int len, const uint32_t seed, void *out ) {
  const uint8_t *data = (const uint8_t *)key;
  const int nblocks = len / 16;

  uint64_t h1 = 0x9368e53c2f6af274 ^ seed;
  uint64_t h2 = 0x586dcd208f7cd3fd ^ seed;

  uint64_t c1 = 0x87c37b91114253d5;
  uint64_t c2 = 0x4cf5ad432745937f;

  //----------
  // body: consume the input 16 bytes (two 64-bit words) at a time

  const uint64_t *blocks = (const uint64_t *)(data);

  for(int i = 0; i < nblocks; i++) {
    uint64_t k1 = getblock(blocks,i*2+0);
    uint64_t k2 = getblock(blocks,i*2+1);

    bmix64(h1,h2,k1,k2,c1,c2);
  }

  //----------
  // tail: the 0-15 bytes that do not fill a whole block

  const uint8_t *tail = (const uint8_t *)(data + nblocks*16);

  uint64_t k1 = 0;
  uint64_t k2 = 0;

  // Every case deliberately falls through to the ones below it, so a tail of
  // N bytes assembles bytes N-1 .. 0. The mix at the end of case 1 therefore
  // runs for any non-empty tail and is skipped when the tail is empty.
  switch(len & 15) {
    case 15: k2 ^= uint64_t(tail[14]) << 48; // fall through
    case 14: k2 ^= uint64_t(tail[13]) << 40; // fall through
    case 13: k2 ^= uint64_t(tail[12]) << 32; // fall through
    case 12: k2 ^= uint64_t(tail[11]) << 24; // fall through
    case 11: k2 ^= uint64_t(tail[10]) << 16; // fall through
    case 10: k2 ^= uint64_t(tail[ 9]) << 8;  // fall through
    case  9: k2 ^= uint64_t(tail[ 8]) << 0;  // fall through

    case  8: k1 ^= uint64_t(tail[ 7]) << 56; // fall through
    case  7: k1 ^= uint64_t(tail[ 6]) << 48; // fall through
    case  6: k1 ^= uint64_t(tail[ 5]) << 40; // fall through
    case  5: k1 ^= uint64_t(tail[ 4]) << 32; // fall through
    case  4: k1 ^= uint64_t(tail[ 3]) << 24; // fall through
    case  3: k1 ^= uint64_t(tail[ 2]) << 16; // fall through
    case  2: k1 ^= uint64_t(tail[ 1]) << 8;  // fall through
    case  1: k1 ^= uint64_t(tail[ 0]) << 0;
             bmix64(h1,h2,k1,k2,c1,c2);
  };

  //----------
  // finalization: fold in the length, then cross-mix and avalanche both words

  h2 ^= len;

  h1 += h2;
  h2 += h1;

  h1 = fmix64(h1);
  h2 = fmix64(h2);

  h1 += h2;
  h2 += h1;

  // Store with memcpy: callers hand in buffers of differing element types
  // (e.g. uint32_t[4]), so writing through a uint64_t* would break alignment
  // and aliasing rules. The bytes written are identical to two uint64_t stores.
  const uint64_t digest[2] = { h1, h2 };
  std::memcpy(out, digest, sizeof(digest));
}

//-----------------------------------------------------------------------------
// If we need a smaller hash value, it's faster to just use a portion of the
// 128-bit hash

// 32-bit variant: runs MurmurHash3_x64_128 and writes the first four bytes of
// its 16-byte result to `out`.
//
//   key, len, seed - forwarded unchanged to MurmurHash3_x64_128
//   out            - receives 4 bytes; stored via memcpy, so no alignment is
//                    required of the destination
void MurmurHash3_x64_32 ( const void *key, int len, uint32_t seed, void *out ) {
  // The 128-bit routine produces two 64-bit words, so give it a buffer with
  // 64-bit element type and alignment.
  uint64_t temp[2];

  MurmurHash3_x64_128(key,len,seed,temp);

  std::memcpy(out, temp, sizeof(uint32_t));
}

//----------

// 64-bit variant: runs MurmurHash3_x64_128 on the same arguments and stores
// the first 64-bit word of its result into the uint64_t that `out` points to.
void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out ) {
  uint64_t digest[2];
  MurmurHash3_x64_128(key, len, seed, digest);

  uint64_t *const dest = static_cast<uint64_t *>(out);
  *dest = digest[0];
}

//-----------------------------------------------------------------------------

22 changes: 22 additions & 0 deletions src/hash.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifndef HASH_H
#define HASH_H

#include <stdint.h> /* fixed-width integer types: uint8_t, uint16_t, uint32_t, uint64_t */

/*
 * get16bits(d): reads the two bytes at address d as one 16-bit value.
 * Any earlier definition is discarded first so that the one selected below
 * is always the one in effect.
 */
#undef get16bits
/* On the compilers/targets listed here the value is loaded directly through a
   uint16_t pointer. */
#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
|| defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
#define get16bits(d) (*((const uint16_t *) (d)))
#endif

/* Everywhere else the value is assembled from individual bytes:
   byte 0 is the low half, byte 1 the high half. */
#if !defined (get16bits)
#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+(uint32_t)(((const uint8_t *)(d))[0]) )
#endif

// Returns a 32-bit hash of the `len` bytes starting at `data`.
// Yields 0 when `data` is NULL or `len` is not positive.
uint32_t SuperFastHash (const char *data, int len);

// The 32-bit Murmur variant is defined in hash.cpp but intentionally not exported here.
//void MurmurHash3_x64_32 ( const void * key, int len, uint32_t seed, void * out );

// Hashes the `len` bytes at `key` with the given `seed` and stores a 64-bit
// result in the uint64_t that `out` points to.
void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out );

#endif