diff --git a/src/BUSData.cpp b/src/BUSData.cpp
index c97f80f..397e6dd 100644
--- a/src/BUSData.cpp
+++ b/src/BUSData.cpp
@@ -258,7 +258,7 @@ bool writeECs(const std::string &filename, const BUSHeader &header) {
   return true;
 }
 
-bool writeGenes(const std::string &filename, const std::unordered_map<std::string, int32_t>  &genenames) {
+bool writeGenes(const std::string &filename, const u_map_<std::string, int32_t>  &genenames) {
   std::ofstream outf;
   outf.open(filename.c_str(), std::ios::out);
 
@@ -279,7 +279,7 @@ bool writeGenes(const std::string &filename, const std::unordered_map<std::strin
   return true;
 }
 
-bool parseTranscripts(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames) {
+bool parseTranscripts(const std::string &filename, u_map_<std::string, int32_t> &txnames) {
   std::ifstream inf(filename.c_str());
 
   int i = 0;
@@ -291,7 +291,7 @@ bool parseTranscripts(const std::string &filename, std::unordered_map<std::strin
   return true;
 }
 
-bool parseTxCaptureList(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures) {
+bool parseTxCaptureList(const std::string &filename, u_map_<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures) {
   std::ifstream inf(filename.c_str());
 
   std::string txp;
@@ -318,7 +318,7 @@ bool parseBcUmiCaptureList(const std::string &filename, std::unordered_set<uint6
   return true;
 }
 
-bool parse_ProjectMap(const std::string &filename, std::unordered_map<uint64_t, uint64_t> &project_map) {
+bool parse_ProjectMap(const std::string &filename, u_map_<uint64_t, uint64_t> &project_map) {
   // This function occurs in 3 places: here, BUSData.h, and bustools_project.cpp
   std::ifstream inf(filename.c_str());
 
@@ -346,7 +346,7 @@ bool parseFlagsCaptureList(const std::string &filename, std::unordered_set<uint6
   return true;
 }
 
-bool parseGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, std::unordered_map<std::string, int32_t> &genenames) {
+bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, u_map_<std::string, int32_t> &genenames) {
   std::ifstream inf(filename.c_str());
 
   std::string line, t;
diff --git a/src/BUSData.h b/src/BUSData.h
index 5c697f7..227f889 100644
--- a/src/BUSData.h
+++ b/src/BUSData.h
@@ -7,6 +7,7 @@
 #include <unordered_set>
 #include <stdint.h>
 #include <fstream>
+#include "Common.hpp"
 
 const uint32_t BUSFORMAT_VERSION = 1;
 
@@ -66,15 +67,15 @@ int identifyParseHeader(std::istream &inf, BUSHeader &header, compressed_BUSHead
 bool parseECs_stream(std::istream &in, BUSHeader &header);
 bool parseECs(const std::string &filename, BUSHeader &header);
 bool writeECs(const std::string &filename, const BUSHeader &header);
-bool writeGenes(const std::string &filename, const std::unordered_map<std::string, int32_t>  &genenames);
-bool parseGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, std::unordered_map<std::string, int32_t> &genenames);
+bool writeGenes(const std::string &filename, const u_map_<std::string, int32_t>  &genenames);
+bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, u_map_<std::string, int32_t> &genenames);
 bool parseGenesList(const std::string& filename, std::vector<std::string>& geneNames);
-bool parseTxCaptureList(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures);
+bool parseTxCaptureList(const std::string &filename, u_map_<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures);
 bool parseBcUmiCaptureList(const std::string &filename, std::unordered_set<uint64_t> &captures);
 bool parseFlagsCaptureList(const std::string &filename, std::unordered_set<uint64_t> &captures);
-bool parseTranscripts(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames);
+bool parseTranscripts(const std::string &filename, u_map_<std::string, int32_t> &txnames);
 
-bool parse_ProjectMap(const std::string &filename, std::unordered_map<uint64_t, uint64_t> &project_map);
+bool parse_ProjectMap(const std::string &filename, u_map_<uint64_t, uint64_t> &project_map);
 
 uint64_t stringToBinary(const std::string &s, uint32_t &flag);
 uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag);
diff --git a/src/Common.cpp b/src/Common.cpp
index ad96c96..58c169c 100644
--- a/src/Common.cpp
+++ b/src/Common.cpp
@@ -73,7 +73,7 @@ std::vector<int32_t> intersect_vectors(const std::vector<std::vector<int32_t>> &
   return std::move(u);
 }
 
-int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes) {
+int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes) {
   if (ecs.empty()) {
     return -1;
   }
@@ -91,13 +91,13 @@ int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u,
   for (size_t i = 0; i< v.size(); i++) {
     u.push_back(v[i]);
   }
-
+  
   for (size_t i = 1; i < ecs.size(); i++) {
     if (ecs[i] < 0 || ecs[i] >= ecmap.size()) {
       return -1;
     }
     const auto &v = ecmap[ecs[i]];
-
+    
     int j = 0;
     int k = 0;
     int l = 0;
@@ -212,7 +212,7 @@ void intersect_genes_of_ecs(const std::vector<int32_t> &ecs, const  std::vector<
 }
 
 
-int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty) {
+int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty) {
   
   std::vector<std::vector<int32_t>> gu; // per gene transcript results
   std::vector<int32_t> u; // final list of transcripts
@@ -327,9 +327,82 @@ void create_ec2genes(const std::vector<std::vector<int32_t>> &ecmap, const std::
   }
 }
 
+COUNT_MTX_TYPE intersect_ecs_with_subset_txs(int32_t ec, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split) {
+  if (tx_split.size() == 0) return COUNT_DEFAULT;
+  std::vector<int32_t> ecs;
+  ecs.push_back(ec);
+  return intersect_ecs_with_subset_txs(ecs, ecmap, tx_split);
+}
+
+COUNT_MTX_TYPE intersect_ecs_with_subset_txs(const std::vector<int32_t>& ecs, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split) {
+  // Note: tx_split indices are tx ids and values are 1 (exists in split) or 0 (does not exist in split)
+  if (tx_split.size() == 0) return COUNT_DEFAULT;
+  if (ecs.size() == 0) return COUNT_AMBIGUOUS; // Shouldn't happen
+  std::vector<int32_t> u;
+  u.resize(0);
+  auto &v = ecmap[ecs[0]]; // copy
+  for (size_t i = 0; i< v.size(); i++) {
+    u.push_back(v[i]);
+  }
+  for (size_t i = 1; i < ecs.size(); i++) {
+    const auto &v = ecmap[ecs[i]];
+    
+    int j = 0;
+    int k = 0;
+    int l = 0;
+    int n = u.size();
+    int m = v.size();
+    // u and v are sorted, j,k,l = 0
+    while (j < n && l < m) {
+      // invariant: u[:k] is the intersection of u[:j] and v[:l], j <= n, l <= m
+      //            u[:j] <= u[j:], v[:l] <= v[l:], u[j:] is sorted, v[l:] is sorted, u[:k] is sorted
+      if (u[j] < v[l]) {
+        j++;
+      } else if (u[j] > v[l]) {
+        l++;
+      } else {
+        // match
+        if (k < j) {
+          std::swap(u[k], u[j]);
+        }
+        k++;
+        j++;
+        l++;
+      }
+    }
+    if (k < n) {
+      u.resize(k);
+    }
+  }
+  size_t n_1 = 0;
+  size_t n_2 = 0;
+  for (auto t : u) {
+      if(tx_split[t]) {
+        n_2++;
+      } else {
+        n_1++;
+      }
+      if (n_1 > 0 && n_2 > 0) break; // Stop searching
+  }
+  return (n_1 > 0 && n_2 > 0 ? COUNT_AMBIGUOUS : (n_1 > 0 ? COUNT_DEFAULT : COUNT_SPLIT));
+  /*for (auto ec : ecs) { // We still need to optimize this
+    for (auto t: ecmap[ec]) {
+      if(std::find(tx_split.begin(), tx_split.end(), t) != tx_split.end()) {
+        n_2++;
+      } else {
+        n_1++;
+      }
+      if (n_1 > 0 && n_2 > 0) break; // Stop searching
+    }
+    if (n_1 > 0 && n_2 > 0) break; // Stop searching
+  }*/
+  //return (n_1 > 0 && n_2 > 0 ? COUNT_AMBIGUOUS : (n_1 > 0 ? COUNT_DEFAULT : COUNT_SPLIT));
+}
+
+
 void copy_file(std::string src, std::string dest) {
 	std::ifstream  isrc(src, std::ios::binary);
 	std::ofstream  idest(dest, std::ios::binary);
 
 	idest << isrc.rdbuf();
-}
\ No newline at end of file
+}
diff --git a/src/Common.hpp b/src/Common.hpp
index f67ee5c..badd7c4 100644
--- a/src/Common.hpp
+++ b/src/Common.hpp
@@ -9,9 +9,12 @@
 #include <string>
 #include <unordered_map>
 #include <sstream>
+#include "roaring.hh"
+#include "hash.hpp"
 
-#define BUSTOOLS_VERSION "0.42.0"
+#define BUSTOOLS_VERSION "0.43.0"
 
+#define u_map_ std::unordered_map
 enum CAPTURE_TYPE : char
 {
   CAPTURE_NONE = 0,
@@ -35,6 +38,12 @@ enum PROJECT_TYPE : char
   PROJECT_TX,
   PROJECT_F
 };
+enum COUNT_MTX_TYPE : char
+{
+  COUNT_DEFAULT = 0,
+    COUNT_SPLIT,
+    COUNT_AMBIGUOUS
+};
 
 struct Bustools_opt
 {
@@ -62,6 +71,7 @@ struct Bustools_opt
   std::string count_genes;
   std::string count_ecs;
   std::string count_txp;
+  std::string count_split;
   bool count_em = false;
   bool count_cm = false;
   bool count_collapse = false;
@@ -75,6 +85,7 @@ struct Bustools_opt
   std::string dump;
   bool dump_bool = false;
   bool split_correct = false;
+  bool barcode_replacement = false;
 
   /* predict */
   std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this
@@ -98,6 +109,7 @@ struct Bustools_opt
   /* text */
   bool text_dumpflags = false;
   bool text_dumppad = false;
+  bool text_showall = false;
 
   /* linker */
   int start, end;
@@ -151,22 +163,42 @@ struct SortedVectorHasher
     int i = 0;
     for (auto x : v)
     {
-      uint64_t t = std::hash<int32_t>{}(x);
+      uint64_t t;
+      MurmurHash3_x64_64(&x,sizeof(x), 0,&t);
       t = (x >> i) | (x << (64 - i));
       r = r ^ t;
-      i = (i + 1) % 64;
+      i = (i+1)&63;
     }
     return r;
   }
 };
+
+struct RoaringHasher {
+  size_t operator()(const Roaring& rr) const {
+    uint64_t r = 0;
+    int i=0;
+    for (auto x : rr) {
+      uint64_t t;
+      MurmurHash3_x64_64(&x, sizeof(x), 0, &t);
+      t = (x>>i) | (x<<(64-i));
+      r ^= t;
+      i = (i+1)&63; // (i+1)%64
+    }
+    return r;
+  }
+};
+typedef u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> EcMapInv;
+
 std::vector<int32_t> intersect(std::vector<int32_t> &u, std::vector<int32_t> &v);
 std::vector<int32_t> union_vectors(const std::vector<std::vector<int32_t>> &v);
 std::vector<int32_t> intersect_vectors(const std::vector<std::vector<int32_t>> &v);
-int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes);
+int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes);
 void vt2gene(const std::vector<int32_t> &v, const std::vector<int32_t> &genemap, std::vector<int32_t> &glist);
 void intersect_genes_of_ecs(const std::vector<int32_t> &ecs, const std::vector<std::vector<int32_t>> &ec2genes, std::vector<int32_t> &glist);
-int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty = true);
+int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty = true);
 void create_ec2genes(const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ec2gene);
+COUNT_MTX_TYPE intersect_ecs_with_subset_txs(int32_t ec, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split);
+COUNT_MTX_TYPE intersect_ecs_with_subset_txs(const std::vector<int32_t>& ecs, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split);
 
 void copy_file(std::string src, std::string dest);
 
diff --git a/src/bustools_capture.cpp b/src/bustools_capture.cpp
index d03e1fd..a90ff2e 100644
--- a/src/bustools_capture.cpp
+++ b/src/bustools_capture.cpp
@@ -12,11 +12,11 @@ void bustools_capture(Bustools_opt &opt) {
 
   std::unordered_set<uint64_t> captures;
   std::vector<std::vector<int32_t>> ecmap;
-  std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
+  u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
 
   if (opt.type == CAPTURE_TX) {
     // parse ecmap and capture list
-    std::unordered_map<std::string, int32_t> txnames;
+    u_map_<std::string, int32_t> txnames;
     std::cerr << "Parsing transcripts .. "; std::cerr.flush();
     parseTranscripts(opt.count_txp, txnames);
     std::cerr << "done" << std::endl;
diff --git a/src/bustools_clusterhist.cpp b/src/bustools_clusterhist.cpp
index 2bbb667..591eeeb 100644
--- a/src/bustools_clusterhist.cpp
+++ b/src/bustools_clusterhist.cpp
@@ -18,13 +18,13 @@ void bustools_clusterhist(Bustools_opt& opt) {
 
 	// read and parse the equivelence class files
 
-	std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
+	u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
 	std::vector<std::vector<int32_t>> ecmap;
 
-	std::unordered_map<std::string, int32_t> txnames;
+	u_map_<std::string, int32_t> txnames;
 	parseTranscripts(opt.count_txp, txnames);
 	std::vector<int32_t> genemap(txnames.size(), -1);
-	std::unordered_map<std::string, int32_t> genenames;
+	u_map_<std::string, int32_t> genenames;
 	parseGenes(opt.count_genes, txnames, genemap, genenames);
 	parseECs(opt.count_ecs, h);
 	ecmap = std::move(h.ecs);
@@ -52,7 +52,7 @@ void bustools_clusterhist(Bustools_opt& opt) {
 
 	//Read the cluster file
 	std::vector<std::string> clusterNames;
-	std::unordered_map<uint64_t, size_t> bcClusters;
+	u_map_<uint64_t, size_t> bcClusters;
 	{
 		std::ifstream ifs(opt.cluster_input_file);
 		uint32_t flag = 0;
diff --git a/src/bustools_collapse.cpp b/src/bustools_collapse.cpp
index 2111c9b..120d0ef 100644
--- a/src/bustools_collapse.cpp
+++ b/src/bustools_collapse.cpp
@@ -17,13 +17,13 @@ void bustools_collapse(Bustools_opt &opt) {
 
   // read and parse the equivelence class files
   
-  std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
+  u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
   std::vector<std::vector<int32_t>> ecmap;
 
-  std::unordered_map<std::string, int32_t> txnames;
+  u_map_<std::string, int32_t> txnames;
   parseTranscripts(opt.count_txp, txnames);
   std::vector<int32_t> genemap(txnames.size(), -1);
-  std::unordered_map<std::string, int32_t> genenames;
+  u_map_<std::string, int32_t> genenames;
   parseGenes(opt.count_genes, txnames, genemap, genenames);
   parseECs(opt.count_ecs, h);
   ecmap = std::move(h.ecs);
diff --git a/src/bustools_correct.cpp b/src/bustools_correct.cpp
index 588b744..ef50271 100644
--- a/src/bustools_correct.cpp
+++ b/src/bustools_correct.cpp
@@ -3,6 +3,7 @@
 #include <algorithm>
 
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "Common.hpp"
@@ -258,6 +259,7 @@ void bustools_split_correct(Bustools_opt &opt)
     }
 
     int rc = 0;
+    uint64_t len_mask = ((1ULL << (2*bclen)) - 1); // Only include n least significant bits where n=2*bclen
     while (true)
     {
       in.read((char *)p, N * sizeof(BUSData));
@@ -276,7 +278,7 @@ void bustools_split_correct(Bustools_opt &opt)
 
         bd = p[i];
 
-        uint64_t b = bd.barcode;
+        uint64_t b = bd.barcode & len_mask;
         uint64_t bc12 = b & mask_12;
         uint64_t bc34 = (b >> (2 * len_12)) & mask_34;
 
@@ -366,14 +368,14 @@ void bustools_split_correct(Bustools_opt &opt)
 
             if (dump_bool)
             {
-              if (bd.barcode != old_barcode)
+              if ((bd.barcode & len_mask) != old_barcode)
               {
-                of << binaryToString(bd.barcode, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n";
-                old_barcode = bd.barcode;
+                of << binaryToString(bd.barcode & len_mask, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n";
+                old_barcode = bd.barcode & len_mask;
               }
             }
 
-            bd.barcode = b_corrected;
+            bd.barcode = b_corrected | (bd.barcode & ~len_mask);
             bus_out.write((char *)&bd, sizeof(bd));
 
             if (corrected_12_flag && corrected_34_flag)
@@ -415,10 +417,268 @@ void bustools_split_correct(Bustools_opt &opt)
   p = nullptr;
 }
 
-void bustools_correct(Bustools_opt &opt)
-{
+void bustools_correct_replace(Bustools_opt &opt) {
   uint32_t bclen = 0;
+  uint32_t umilen = 0;
+  uint32_t rplen = 0;
   uint32_t wc_bclen = 0;
+  BUSHeader h;
+  size_t nr = 0;
+  size_t N = 100000;
+  BUSData *p = new BUSData[N];
+  char magic[4];
+  uint32_t version = 0;
+  size_t stat_white = 0;
+  size_t stat_uncorr = 0;
+  uint64_t old_barcode;
+  enum replacement_type { bc_record, msb_meta, lsb_meta, msb_bus, lsb_bus };
+  
+  // There are five replacement types:
+  // bc_record: simply replace the barcode of length 'bclen' with another barcode of length 'bclen' (while preserving metadata)
+  // msb_meta: put replacement (format: NNNN<) into most significant bits of metadata in BUS record (preserve barcode record)
+  // lsb_meta: put replacement (format: <NNNN) into least significant bits of metadata in BUS record (existing metadata shifted left; preserve barcode record)
+  // msb_bus: put replacement (format: NNNN*) into most significant bits of barcode (replacing existing bits of barcode)
+  // lsb_bus: put replacement (format: *NNNN) into least significant bits of barcode (replacing existing bits of barcode)
+
+  
+  bool dump_bool = opt.dump_bool;
+  std::ofstream of;
+  if (dump_bool) {
+    of.open(opt.dump);
+  }
+  
+  std::ifstream wf(opt.whitelist, std::ios::in);
+  std::string line;
+  line.reserve(100);
+  std::unordered_map<uint64_t,uint64_t> rp_map; // Replacement map (key = onlisted bc; value = replacement bc)
+  replacement_type rtype = bc_record;
+  uint32_t f = 0;
+  
+  while (std::getline(wf, line)) {
+    std::stringstream ss(line);
+    std::string barcode;
+    std::string replacement;
+    ss >> barcode >> replacement;
+    if (barcode.empty() || replacement.empty()) continue;
+    if (!barcode.empty() && replacement.empty()) { 
+      std::cerr << "Error: replacement file malformed; no replacement found for barcode: " << barcode << std::endl;
+      exit(1);
+    }
+    std::transform(barcode.begin(), barcode.end(), barcode.begin(), ::toupper);
+    std::transform(replacement.begin(), replacement.end(), replacement.begin(), ::toupper);
+    replacement_type rtype_ = bc_record;
+    if (replacement[0] == '<') rtype_ = lsb_meta;
+    if (replacement[0] == '*') rtype_ = lsb_bus;
+    if (rtype_ != bc_record) {
+      replacement = replacement.substr(1);
+    } else {
+      if (replacement[replacement.length()-1] == '<') rtype_ = msb_meta;
+      if (replacement[replacement.length()-1] == '*') rtype_ = msb_bus;
+      if (rtype_ != bc_record) {
+        replacement = replacement.substr(0, replacement.length()-1);
+      }
+    }
+    if (replacement.length() == 0) continue;
+    
+    if (wc_bclen == 0) {
+      rtype = rtype_;
+      wc_bclen = barcode.length();
+      rplen = replacement.length();
+    }
+    if (rtype != rtype_) {
+      std::cerr << "Error: Replacement types not consistent in file" << std::endl;
+      exit(1);
+    }
+    if (wc_bclen != barcode.length()) {
+      std::cerr << "Error: Barcode lengths not consistent in file" << std::endl;
+      exit(1);
+    }
+
+    uint64_t rl = replacement.length();
+    uint64_t bc = stringToBinary(barcode, f);
+    uint64_t rp = stringToBinary(replacement, f);
+    if (rp_map.find(bc) != rp_map.end()) {
+      std::cerr << "Error: Duplicate entries found: " << barcode << std::endl;
+      exit(1);
+    }
+    if (rplen != rl) {
+      std::cerr << "Error: replacement length in list inconsistent" << std::endl;
+      exit(1);
+    }
+    rp_map.insert(std::make_pair(bc, rp));
+  }
+  wf.close();
+  
+  if (rp_map.size() == 0) {
+    std::cerr << "Error: replacement file malformed; no barcodes found" <<std::endl;
+    exit(1);
+  }
+  
+  size_t n_barcodes_in_whitelist = rp_map.size();
+  std::cerr << "Found " << n_barcodes_in_whitelist << " barcodes in the replacement list" << std::endl;
+  
+  std::streambuf *buf = nullptr;
+  std::ofstream busf_out;
+  
+  if (!opt.stream_out) {
+    busf_out.open(opt.output, std::ios::out | std::ios::binary);
+    buf = busf_out.rdbuf();
+  }
+  else {
+    buf = std::cout.rdbuf();
+  }
+  std::ostream bus_out(buf);
+  
+  bool outheader_written = false;
+  
+  nr = 0;
+  BUSData bd;
+  for (const auto &infn : opt.files) {
+    std::streambuf *inbuf;
+    std::ifstream inf;
+    if (!opt.stream_in) {
+      inf.open(infn.c_str(), std::ios::binary);
+      inbuf = inf.rdbuf();
+    } else {
+      inbuf = std::cin.rdbuf();
+    }
+    std::istream in(inbuf);
+    parseHeader(in, h);
+    
+    if (bclen == 0) {
+      bclen = h.bclen;
+      // See if we need to adjust the barcode length in BUS file header
+      if (rtype == bc_record && bclen > rplen) {
+        h.bclen = rplen;
+      }
+    }
+    if (umilen == 0) {
+      umilen = h.umilen;
+    }
+    
+    if (!outheader_written) {
+      writeHeader(bus_out, h);
+      outheader_written = true;
+    }
+    
+    int rc = 0;
+    uint64_t len_mask = ((1ULL << (2*bclen)) - 1); // Only include n least significant bits where n=2*bclen
+    while (true) {
+      in.read((char *)p, N * sizeof(BUSData));
+      size_t rc = in.gcount() / sizeof(BUSData);
+      if (rc == 0) {
+        break;
+      }
+      nr += rc;
+      
+      for (size_t i = 0; i < rc; i++) {
+        bd = p[i];
+        uint64_t b = bd.barcode & len_mask;
+        uint64_t correction = 0;
+        uint64_t b_lookup = b;
+        if (rtype == lsb_bus || rtype == msb_meta || rtype == lsb_meta) {
+          // For these, look up based off LSBs (off of wc_bclen)
+          b_lookup = b_lookup & ((1ULL << (2*wc_bclen)) - 1);
+        }
+        if (rtype == msb_bus) {
+          // For this, look up based off MSBs (from beginning of barcode)
+          if (bclen >= rplen) {
+            b_lookup = b_lookup & (~(1ULL << (2*(bclen-rplen))));
+          }
+        }
+        auto it = rp_map.find(b_lookup);
+        if (it != rp_map.end()) {
+          stat_white++;
+          correction = it->second;
+          switch (rtype) {
+            case bc_record: // This is the only option where we'll allow replacement to be shorter than barcode
+            {
+              uint64_t len_mask2 = ((1ULL << (2*std::max(rplen, bclen))) - 1); // n least significant bits where n=2*max(rplen,bclen) [if rplen > bclen, overwrite based on rplen]
+              bd.barcode = bd.barcode & ~len_mask2; // Preserve the metadata bits outside barcode length (or overwrites the part where the barcode length exceeds it)
+              if (rplen < bclen) {
+                bd.barcode = bd.barcode >> (2*(bclen-rplen));
+                bd.barcode = bd.barcode & ~((1ULL << (2*(rplen))) - 1); // Delete everything within rplen (i.e. where the correction will eventually be)
+              }
+              bd.barcode = correction | (bd.barcode & ~len_mask2); // Correction
+              break;
+            }
+            case msb_meta:
+            {
+              uint64_t clen = 32-rplen; // 32 minus correction length
+              bd.barcode = bd.barcode & ((1ULL << (2*clen)) - 1); // Unset the MSBs
+              bd.barcode = (correction << (2*clen) ) | bd.barcode; // Shift the corrected sequence into the MSBs
+              break;
+            }
+            case lsb_meta:
+            {
+              uint64_t original_bc = bd.barcode & len_mask; // The original barcode sequence (no metadata)
+              bd.barcode = bd.barcode << (2*rplen); // Shift the barcode+metadata to the left based on rplen
+              uint64_t mlen = (rplen+bclen); // How much space the new metadata plus the original barcode will take up
+              bd.barcode = bd.barcode & (~((1ULL << (2*mlen)) - 1)); // Preserve only the bits containing the (shifted) metadata
+              bd.barcode = bd.barcode | original_bc; // Throw the original barcode back in
+              bd.barcode = (correction << (2*bclen) ) | bd.barcode; // Throw the new metadata in
+              break;
+            }
+            case msb_bus:
+            {
+              if (bclen >= rplen) { // Only do the substitution if barcode encapsulates the replacement
+                uint64_t mdata = bd.barcode & (~((1ULL << (2*bclen)) - 1)); // Preserve only the bits containing the metadata (not the barcode)
+                uint64_t bdata = bd.barcode & ((1ULL << (2*(bclen-rplen))) - 1); // Preserve only the bits containing the part of the barcode we want to keep
+                bd.barcode = (mdata | bdata) | (correction << (2*(bclen-rplen))); // Merge everything together
+              }
+              break;
+            }
+            case lsb_bus:
+            {
+              bd.barcode = correction | (bd.barcode & ~((1ULL << (2*rplen)) - 1)); // Set 2*rplen LSBs to 0, and put the new replacement in
+              break;
+            }
+          }
+          bus_out.write((char *)&bd, sizeof(bd));
+          if (dump_bool) {
+            if (b != old_barcode) {
+              of << binaryToString(b, bclen) << "\t" << binaryToString(correction, bclen) << "\n";
+              old_barcode = b;
+            }
+          }
+        } else {
+          // No correction; except shift metadata right if necessary
+          if (rtype == bc_record && rplen < bclen) {
+            uint64_t shifted_bc = bd.barcode >> (2*(bclen-rplen));
+            shifted_bc = shifted_bc & ~((1ULL << (2*(rplen))) - 1); // Delete everything within rplen (i.e. where the replacement would be)
+            bd.barcode = (bd.barcode & ((1ULL << (2*(rplen))) - 1)); // Preserve only the LSB rlen stuff
+            bd.barcode = shifted_bc | bd.barcode; // Merge
+          }
+          bus_out.write((char *)&bd, sizeof(bd)); 
+          stat_uncorr++;
+        }
+      }
+    }
+  }
+  
+  std::cerr << "Processed " << nr << " BUS records" << std::endl
+            << "Replaced = " << stat_white << std::endl
+            << "Not replaced = " << stat_uncorr << std::endl;
+  
+  if (!opt.stream_out) {
+    busf_out.close();
+  }
+  
+  if (opt.dump_bool) {
+    of.close(); // if of is open
+  }
+  
+  delete[] p;
+  p = nullptr;
+}
+
+void bustools_correct(Bustools_opt &opt) {
+  if (opt.barcode_replacement) { // Run in replacement mode
+    bustools_correct_replace(opt);
+    return;
+  }
+  uint32_t bclen = 0;
+  std::vector<uint32_t> wc_bclen;
   uint32_t umilen = 0;
   BUSHeader h;
   size_t nr = 0;
@@ -434,57 +694,97 @@ void bustools_correct(Bustools_opt &opt)
   bool dump_bool = opt.dump_bool;
 
   std::ofstream of;
-  if (dump_bool)
-  {
+  if (dump_bool) {
     of.open(opt.dump);
   }
 
   std::ifstream wf(opt.whitelist, std::ios::in);
   std::string line;
   line.reserve(100);
-  std::unordered_set<uint64_t> wbc;
-  wbc.reserve(100000);
+  std::vector<std::unordered_set<uint64_t> > wbc; // Each set contains whitelist
   uint32_t f = 0;
-  while (std::getline(wf, line))
-  {
-    if (wc_bclen == 0)
-    {
-      wc_bclen = line.size();
+  bool first_line = true;
+  while (std::getline(wf, line)) {
+    std::stringstream ss(line);
+    std::string barcode;
+    int i = 0;
+    while (ss >> barcode) {
+      std::transform(barcode.begin(), barcode.end(), barcode.begin(), ::toupper);
+      uint64_t bc = stringToBinary(barcode, f);
+      if (first_line) { // First line establishes all the barcode sets (can't have any empty barcodes here)
+        std::unordered_set<uint64_t> bc_set;
+        bc_set.insert(bc);
+        wbc.push_back(bc_set);
+        wbc[i].reserve(100000);
+        wc_bclen.push_back(barcode.size());
+      } else if (barcode == "-") {
+        i++;
+        continue; // Empty barcode
+      } else if (i >= wbc.size()) { // Too many barcodes in this line
+        std::cerr << "Error: on-list file malformed; encountered " << (i+1)
+                  << " barcodes on a line while " << wbc.size() << " barcodes on a previous line"
+                  << std::endl;
+        exit(1);
+      } else if (barcode.length() != wc_bclen[i]) {
+        std::cerr << "Error: on-list file malformed; encountered barcode length " << wc_bclen[i]
+                  << " on a line but barcode length " << barcode.length() << " on another line"
+                  << std::endl;
+        exit(1);
+      } else {
+        wbc[i].insert(bc);
+      }
+      i++;
     }
-    uint64_t bc = stringToBinary(line, f);
-    wbc.insert(bc);
+    if (i == 0) continue; // empty line
+    first_line = false;
   }
   wf.close();
 
-  std::cerr << "Found " << wbc.size() << " barcodes in the whitelist" << std::endl;
-
-  // split barcode into upper and lower half
-  size_t bc2 = (wc_bclen + 1) / 2;
-
-  std::vector<std::pair<Roaring, Roaring>> correct(1ULL << (2 * bc2)); // 4^(bc/2) possible barcodes
-
-  uint64_t mask_size = (1ULL << (2 * bc2));
-  uint64_t lower_mask = (1ULL << (2 * bc2)) - 1;
-  uint64_t upper_mask = (1ULL << (2 * (wc_bclen - bc2))) - 1;
-  for (uint64_t b : wbc)
-  {
-    uint64_t lb = b & lower_mask;
-    uint64_t ub = (b >> (2 * bc2)) & upper_mask;
-
-    correct[ub].second.add(lb);
-    correct[lb].first.add(ub);
+  if (wbc.size() == 0) {
+    std::cerr << "Error: on-list file malformed; no barcodes found" <<std::endl;
+    exit(1);
+  }
+  size_t n_barcodes_in_whitelist = 0;
+  for (auto nbc : wbc) {
+    if (n_barcodes_in_whitelist < nbc.size()) {
+      n_barcodes_in_whitelist = nbc.size();
+    }
+  }
+  std::cerr << "Found " << n_barcodes_in_whitelist << " barcodes in the on-list" << std::endl;
+  if (wbc.size() > 1) {
+    std::cerr << "Found " << wbc.size() << " barcode sets" << std::endl;
+  }
+  
+  // split barcode into upper and lower half (across all barcodes in a barcode set)
+  std::vector<std::vector<std::pair<Roaring, Roaring>>> correct_vec; // size of vector = how many barcode sets there are
+  std::vector<std::pair<uint64_t,uint64_t> > lower_upper_mask_vec; // size of vector = how many barcode sets there are
+  std::vector<size_t > bc2_vec; // size of vector = how many barcode sets there are
+  for (int i = 0; i < wc_bclen.size(); i++) {
+    auto bclen2 = wc_bclen[i]; // i = index of current barcode set
+    size_t bc2 = (bclen2 + 1) / 2;
+    std::vector<std::pair<Roaring, Roaring>> correct(1ULL << (2 * bc2)); // 4^(bc/2) possible barcodes
+    uint64_t mask_size = (1ULL << (2 * bc2));
+    uint64_t lower_mask = (1ULL << (2 * bc2)) - 1;
+    uint64_t upper_mask = (1ULL << (2 * (bclen2 - bc2))) - 1;
+    for (uint64_t b : wbc[i]) { // Iterate through barcodes of current barcode set
+      uint64_t lb = b & lower_mask;
+      uint64_t ub = (b >> (2 * bc2)) & upper_mask;
+      correct[ub].second.add(lb);
+      correct[lb].first.add(ub);
+    }
+    correct_vec.push_back(std::move(correct));
+    lower_upper_mask_vec.push_back(std::make_pair(lower_mask, upper_mask));
+    bc2_vec.push_back(bc2);
   }
 
   std::streambuf *buf = nullptr;
   std::ofstream busf_out;
 
-  if (!opt.stream_out)
-  {
+  if (!opt.stream_out) {
     busf_out.open(opt.output, std::ios::out | std::ios::binary);
     buf = busf_out.rdbuf();
   }
-  else
-  {
+  else {
     buf = std::cout.rdbuf();
   }
   std::ostream bus_out(buf);
@@ -493,111 +793,121 @@ void bustools_correct(Bustools_opt &opt)
 
   nr = 0;
   BUSData bd;
-  for (const auto &infn : opt.files)
-  {
+  for (const auto &infn : opt.files) {
     std::streambuf *inbuf;
     std::ifstream inf;
-    if (!opt.stream_in)
-    {
+    if (!opt.stream_in) {
       inf.open(infn.c_str(), std::ios::binary);
       inbuf = inf.rdbuf();
-    }
-    else
-    {
+    } else {
       inbuf = std::cin.rdbuf();
     }
     std::istream in(inbuf);
     parseHeader(in, h);
 
-    if (!outheader_written)
-    {
+    if (!outheader_written) {
       writeHeader(bus_out, h);
       outheader_written = true;
     }
 
-    if (bclen == 0)
-    {
+    if (bclen == 0) {
       bclen = h.bclen;
+      size_t final_wc_bclen = 0;
+      
+      for (auto l : wc_bclen) {
+        final_wc_bclen += l;
+      }
 
-      if (bclen != wc_bclen)
-      {
-        std::cerr << "Error: barcode length and whitelist length differ, barcodes = " << bclen << ", whitelist = " << wc_bclen << std::endl
-                  << "       check that your whitelist matches the technology used" << std::endl;
+      if (bclen != final_wc_bclen) {
+        std::cerr << "Error: barcode length and on-list length differ, barcodes = " << bclen << ", on-list = " << final_wc_bclen << std::endl
+                  << "       check that your on-list matches the technology used" << std::endl;
 
         exit(1);
       }
     }
-    if (umilen == 0)
-    {
+    if (umilen == 0) {
       umilen = h.umilen;
     }
 
     int rc = 0;
+    uint64_t len_mask = ((1ULL << (2*bclen)) - 1); // Only include n least significant bits where n=2*bclen
     while (true)
     {
       in.read((char *)p, N * sizeof(BUSData));
       size_t rc = in.gcount() / sizeof(BUSData);
-      if (rc == 0)
-      {
+      if (rc == 0) {
         break;
       }
       nr += rc;
 
-      for (size_t i = 0; i < rc; i++)
-      {
+      for (size_t i = 0; i < rc; i++) {
         bd = p[i];
-        auto it = wbc.find(bd.barcode);
-        if (it != wbc.end())
-        {
-          stat_white++;
-          bus_out.write((char *)&bd, sizeof(bd));
-        }
-        else
-        {
-          uint64_t b = bd.barcode;
-          uint64_t lb = b & lower_mask;
-          uint64_t ub = (b >> (2 * bc2)) & upper_mask;
-          uint64_t lbc = 0, ubc = 0;
-          int correct_lower = search_for_mismatch(correct[ub].second, bc2, lb, lbc);
-          int correct_upper = search_for_mismatch(correct[lb].first, wc_bclen - bc2, ub, ubc);
-          int nc = correct_lower + correct_upper;
-          if (nc != 1)
-          {
-            stat_uncorr++;
+        uint64_t b = bd.barcode & len_mask;
+        uint64_t running_len = 0;
+        size_t stat_white_ = 0;
+        size_t stat_uncorr_ = 0;
+        size_t stat_corr_ = 0;
+        uint64_t correction = 0;
+        for (int j = wbc.size()-1; j >= 0; j--) { // Iterate through all the barcode sets
+          auto bclen2 = wc_bclen[j];
+          running_len += bclen2;
+          uint64_t shift_len = 2*(running_len-bclen2); // used for masking out the least significant bits up to the current barcode
+          uint64_t len_mask2 = 0; // This mask = Only consider these bits (based on each barcode set)
+          len_mask2 = ((1ULL << (2*running_len)) - 1);
+          if (shift_len != 0) {
+            len_mask2 &= (~((1ULL << (shift_len)) - 1));
           }
-          else if (nc == 1)
-          {
-            if (correct_lower == 1)
-            {
-              uint64_t b_corrected = (ub << (2 * bc2)) | lbc;
-              if (dump_bool)
-              {
-                if (bd.barcode != old_barcode)
-                {
-                  of << binaryToString(bd.barcode, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n";
-                  old_barcode = bd.barcode;
-                }
+          len_mask2 &= ((1ULL << (2*running_len)) - 1);
+          len_mask2 &= len_mask; // not necessary
+          uint64_t b_ = b & len_mask2; // The barcode alone in the location that it appears in
+          uint64_t b_shifted = b_ >> shift_len; // The barcode shifted into the least significant bits location
+          auto it = wbc[j].find(b_shifted);
+          if (it != wbc[j].end()) { // Barcode is in the whitelist
+            stat_white_++;
+            correction |= b_;
+          } else {
+            auto lower_mask = lower_upper_mask_vec[j].first;
+            auto upper_mask = lower_upper_mask_vec[j].second;
+            auto bc2 = bc2_vec[j];
+            auto& correct = correct_vec[j];
+            uint64_t lb = b_shifted & lower_mask;
+            uint64_t ub = (b_shifted >> (2 * bc2)) & upper_mask;
+            uint64_t lbc = 0, ubc = 0;
+            int correct_lower = search_for_mismatch(correct[ub].second, bc2, lb, lbc);
+            int correct_upper = search_for_mismatch(correct[lb].first, bclen2 - bc2, ub, ubc);
+            int nc = correct_lower + correct_upper;
+            if (nc != 1) { // Uncorrected
+              stat_uncorr_++;
+              break;
+            } else if (nc == 1) {
+              stat_corr_++;
+              if (correct_lower == 1) {
+                uint64_t b_corrected = (ub << (2 * bc2)) | lbc;
+                b_corrected = b_corrected << shift_len; // We have the corrected barcode in the correct location
+                b_corrected &= len_mask2;
+                correction |= b_corrected; // Add onto existing correction
+              } else if (correct_upper == 1) {
+                uint64_t b_corrected = (ubc << (2 * bc2)) | lb;
+                b_corrected = b_corrected << shift_len; // We have the corrected barcode in the correct location
+                b_corrected &= len_mask2;
+                correction |= b_corrected; // Add onto existing correction
               }
-
-              bd.barcode = b_corrected;
-              bus_out.write((char *)&bd, sizeof(bd));
-              stat_corr++;
             }
-            else if (correct_upper == 1)
-            {
-              uint64_t b_corrected = (ubc << (2 * bc2)) | lb;
-              if (dump_bool)
-              {
-                if (bd.barcode != old_barcode)
-                {
-                  of << binaryToString(bd.barcode, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n";
-                  old_barcode = bd.barcode;
-                }
-              }
-
-              bd.barcode = b_corrected;
-              bus_out.write((char *)&bd, sizeof(bd));
-              stat_corr++;
+          }
+        }
+        if (stat_uncorr_ > 0) {
+          stat_uncorr++; // Uncorrected; do not write BUS record
+        } else if (stat_white_ == wbc.size()) {
+          stat_white++;
+          bus_out.write((char *)&bd, sizeof(bd)); // No correction; just write BUS record as-is
+        } else if (stat_corr_ > 0) {
+          stat_corr++; // Corrected; and write it out
+          bd.barcode = correction | (bd.barcode & ~len_mask); // Correction plus preserve the metadata bits outside barcode length
+          bus_out.write((char *)&bd, sizeof(bd));
+          if (dump_bool) {
+            if (b != old_barcode) {
+              of << binaryToString(b, bclen) << "\t" << binaryToString(correction, bclen) << "\n";
+              old_barcode = b;
             }
           }
         }
@@ -606,20 +916,18 @@ void bustools_correct(Bustools_opt &opt)
   }
 
   std::cerr << "Processed " << nr << " BUS records" << std::endl
-            << "In whitelist = " << stat_white << std::endl
+            << "In on-list = " << stat_white << std::endl
             << "Corrected    = " << stat_corr << std::endl
             << "Uncorrected  = " << stat_uncorr << std::endl;
 
-  if (!opt.stream_out)
-  {
+  if (!opt.stream_out) {
     busf_out.close();
   }
 
-  if (opt.dump_bool)
-  {
+  if (opt.dump_bool) {
     of.close(); // if of is open
   }
 
   delete[] p;
   p = nullptr;
-}
\ No newline at end of file
+}
diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp
index e5961e5..8ccb6a8 100644
--- a/src/bustools_count.cpp
+++ b/src/bustools_count.cpp
@@ -18,13 +18,18 @@ void bustools_count(Bustools_opt &opt) {
 
   // read and parse the equivalence class files
 
-  std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
+  EcMapInv ecmapinv;
   std::vector<std::vector<int32_t>> ecmap;
 
-  std::unordered_map<std::string, int32_t> txnames;
+  u_map_<std::string, int32_t> txnames;
+  auto txnames_split = txnames; // copy
+  std::vector<int32_t> tx_split; // Store transcript names for split
+  std::vector<int32_t> tx_split_lookup; // Map transcript IDs to mtx status
+  bool count_split = !opt.count_split.empty();
+  int count_mtx_priority = !opt.count_gene_multimapping && count_split ? 1 : 0; // 1 = when something in tx_split overlaps something not in tx_split, prioritize the latter (useful for dealing in cases when introns of one gene overlap exons of another gene [we prioritize the exons]
   parseTranscripts(opt.count_txp, txnames);
   std::vector<int32_t> genemap(txnames.size(), -1);
-  std::unordered_map<std::string, int32_t> genenames;
+  u_map_<std::string, int32_t> genenames;
   parseGenes(opt.count_genes, txnames, genemap, genenames);
   parseECs(opt.count_ecs, h);
   ecmap = std::move(h.ecs);
@@ -32,19 +37,23 @@ void bustools_count(Bustools_opt &opt) {
   for (int32_t ec = 0; ec < ecmap.size(); ec++) {
     ecmapinv.insert({ecmap[ec], ec});
   }
-  std::vector<std::vector<int32_t>> ec2genes;        
+  std::vector<std::vector<int32_t>> ec2genes, ec2genes_priority;
   create_ec2genes(ecmap, genemap, ec2genes);
 
 
   std::ofstream of;
+  std::ofstream of_2;
+  std::ofstream of_A;
   std::string mtx_ofn = opt.output + ".mtx";
+  std::string mtx_ofn_split_2 = opt.output + ".2.mtx";
+  std::string mtx_ofn_split_A = opt.output + ".ambiguous.mtx";
   std::string barcodes_ofn = opt.output + ".barcodes.txt";
+  std::string barcodes_prefix_ofn = opt.output + ".barcodes.prefix.txt";
   std::string ec_ofn = opt.output + ".ec.txt";
   std::string gene_ofn = opt.output + ".genes.txt";
   std::string hist_ofn = opt.output + ".hist.txt";
   std::string cu_per_cell_ofn = opt.output + ".CUPerCell.txt";
   std::string cu_ofn = opt.output + ".cu.txt";
-  of.open(mtx_ofn);
 
   // write out the initial header
   // keep the number of newlines constant, this way it will work for both Windows and Linux
@@ -54,12 +63,46 @@ void bustools_count(Bustools_opt &opt) {
   ssHeader << headerComments;
   ssHeader << std::string(66, '%') << '\n';
   size_t headerLength = ssHeader.str().length();
+  // If we need to split matrix
+  if (count_split) {
+    parseTranscripts(opt.count_split, txnames_split); // subset of txnames
+    tx_split.reserve(txnames_split.size());
+    tx_split_lookup.resize(txnames.size(), -1);
+    for (auto x : txnames_split) {
+        if (txnames.count(x.first)) tx_split.push_back(txnames[x.first]);
+    }
+    of_2.open(mtx_ofn_split_2);
+    of_A.open(mtx_ofn_split_A);
+    of_2 << ssHeader.str();
+    of_A << ssHeader.str();
+    auto ecmap_ = ecmap; // copy
+    for (int ec = 0; ec < ecmap.size(); ec++) { // Get new ecmap based on split
+      for (auto tx : ecmap[ec]) {
+        auto &new_ec = ecmap_[ec];
+	bool found = std::find(tx_split.begin(), tx_split.end(), tx) != tx_split.end();
+	tx_split_lookup[tx] = found;
+        // Remove transcripts depending on whether they're found in tx_split
+        // Note: It is possible for one of the new ECs to be empty, in which case intersect_genes_of_ecs will result in the empty set for glist
+        // Essentially, we are removing all tx's that belong to (or not belong to) tx_split in the equivalence classes
+        // This handles instances in which a read maps to exon of one gene but intron of another (likely overlapping) gene to avoid discarding the record
+        // This is done at read-level (not UMI-level) so if one UMI maps to one gene's exon but another UMI maps to the other gene's intron, we still discard it
+        if (count_mtx_priority == 1 && !found)
+          new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end());
+        else if (count_mtx_priority == 2 && found)
+          new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end());
+      }
+    }
+    if (count_mtx_priority != 0)
+    create_ec2genes(ecmap_, genemap, ec2genes_priority); // Note: Some ECs may not be associated with any genes (i.e. empty vector)
+  }
+  of.open(mtx_ofn);
   of << ssHeader.str();
 
-
   size_t n_cols = 0;
   size_t n_rows = 0;
   size_t n_entries = 0;
+  size_t n_entries_2 = 0;
+  size_t n_entries_A = 0;
   std::vector<BUSData> v;
   v.reserve(N);
   uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL;
@@ -70,7 +113,7 @@ void bustools_count(Bustools_opt &opt) {
   std::vector<int32_t> u;
   u.reserve(100);
   std::vector<int32_t> column_v;
-  std::vector<std::pair<int32_t, double>> column_vp;
+  std::vector<std::pair<int32_t, std::pair<double, COUNT_MTX_TYPE>>> column_vp; // gene, {count, matrix type}
   if (!opt.count_collapse) {
     column_v.reserve(N); 
   } else {
@@ -131,6 +174,7 @@ void bustools_count(Bustools_opt &opt) {
       
       if (opt.umi_gene_collapse) {
         intersect_genes_of_ecs(ecs,ec2genes, glist);
+	if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist);
       }
       if (opt.umi_gene_collapse && glist.size() == 0) {
         // Gene-intersection zero, check for UMI collision
@@ -213,8 +257,11 @@ void bustools_count(Bustools_opt &opt) {
         }
       }
       double val = j-i;
-      of << n_rows << " " << (column_v[i]+1) << " " << val << "\n";
-      n_entries++;
+      auto which_mtx = intersect_ecs_with_subset_txs(column_v[i], ecmap, tx_split_lookup);
+      auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A);
+      auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A);
+      of_ << n_rows << " " << (column_v[i]+1) << " " << val << "\n";
+      n_entries_++;
       
       i = j; // increment
     }
@@ -235,7 +282,7 @@ void bustools_count(Bustools_opt &opt) {
 
     std::vector<std::vector<int32_t>> ambiguous_genes;
 
-    for (size_t i = 0; i < n; ) {
+    if (!opt.count_cm) for (size_t i = 0; i < n; ) { // Entire loop is for !opt.count_cm
       size_t j = i+1;
       for (; j < n; j++) {
         if (v[i].UMI != v[j].UMI) {
@@ -252,6 +299,7 @@ void bustools_count(Bustools_opt &opt) {
       }
 
       intersect_genes_of_ecs(ecs,ec2genes, glist);
+      if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist);
       int gn = glist.size();
       if (opt.count_downsampling_factor != 1.0) {
         uint32_t newCounts = 0;
@@ -266,9 +314,10 @@ void bustools_count(Bustools_opt &opt) {
         }
       }
       if (gn > 0) {
+        auto which_mtx = intersect_ecs_with_subset_txs(ecs, ecmap, tx_split_lookup);
         if (opt.count_gene_multimapping) {
           for (auto x : glist) {
-            column_vp.push_back({x, (opt.count_raw_counts ? counts : 1.0)/gn});
+            column_vp.push_back({x, {(opt.count_raw_counts ? counts : 1.0)/gn, which_mtx}});
           }
           //Fill in histograms for prediction.
           if (opt.count_gen_hist) {
@@ -284,7 +333,7 @@ void bustools_count(Bustools_opt &opt) {
           }
         } else {
           if (gn==1) {
-            column_vp.push_back({glist[0],opt.count_raw_counts ? counts : 1.0});
+            column_vp.push_back({glist[0],{opt.count_raw_counts ? counts : 1.0, which_mtx}});
             //Fill in histograms for prediction.
             if (opt.count_gen_hist) {
               if (glist[0] < n_genes) { //crasches with an invalid gene file otherwise
@@ -327,16 +376,18 @@ void bustools_count(Bustools_opt &opt) {
           }
           if (glist.size() == 0) {
             intersect_genes_of_ecs(ecs_within_molecule, ec2genes, glist);
+	    if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs_within_molecule, ec2genes_priority, glist);
           }
           gn = glist.size();
           if (gn > 0) {
+            auto which_mtx = intersect_ecs_with_subset_txs(ecs_within_molecule, ecmap, tx_split_lookup);
             if (opt.count_gene_multimapping) {
               for (auto x : glist) {
-                column_vp.push_back({x, 1.0/gn});
+                column_vp.push_back({x, {1.0/gn, which_mtx}});
               }
             } else {
               if (gn==1) {
-                column_vp.push_back({glist[0],1.0});
+                column_vp.push_back({glist[0],{1.0, which_mtx}});
               } else if (opt.count_em) {
                 ambiguous_genes.push_back(std::move(glist));
               }
@@ -345,32 +396,72 @@ void bustools_count(Bustools_opt &opt) {
         }
       }
       i = j; // increment
+    } else for (size_t i = 0; i < n; i++) { // Entire loop is for opt.count_cm
+      ecs.resize(0);
+      ecs.push_back(v[i].ec);
+      
+      intersect_genes_of_ecs(ecs, ec2genes, glist);
+      if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist);
+      int gn = glist.size();
+      if (gn > 0) {
+        auto which_mtx = intersect_ecs_with_subset_txs(ecs, ecmap, tx_split_lookup);
+        if (opt.count_gene_multimapping) {
+          for (auto x : glist) {
+            column_vp.push_back({x, {v[i].count/gn, which_mtx}});
+          }
+        } else {
+          if (gn==1) {
+            column_vp.push_back({glist[0],{v[i].count, which_mtx}});
+          } 
+        }
+      }
     }
     std::sort(column_vp.begin(), column_vp.end());
     size_t m = column_vp.size();
-    std::unordered_map<int32_t, double> col_map(m);
+    u_map_<int32_t, double> col_map(m);
+    auto col_map_2 = col_map; // copy
+    auto col_map_A = col_map; // copy
     std::vector<int32_t> cols;
 
     for (size_t i = 0; i < m; ) {
       size_t j = i+1;
-      double val = column_vp[i].second;
+      double val = 0;
+      double val_2 = 0;
+      double val_A = 0;
+      auto mtx_type = column_vp[i].second.second;
+      if (mtx_type == COUNT_DEFAULT) val = column_vp[i].second.first;
+      else if (mtx_type == COUNT_SPLIT) val_2 = column_vp[i].second.first;
+      else val_A = column_vp[i].second.first;
       for (; j < m; j++) {
         if (column_vp[i].first != column_vp[j].first) {
           break;
         }
-        val += column_vp[j].second;
+        auto mtx_type = column_vp[j].second.second;
+        if (mtx_type == COUNT_DEFAULT) val += column_vp[j].second.first;
+        else if (mtx_type == COUNT_SPLIT) val_2 += column_vp[j].second.first;
+        else val_A += column_vp[j].second.first;
+      }
+      if (!count_split || val != 0) col_map.insert({column_vp[i].first,val});
+      if (count_split) {
+        if (val_2 != 0) col_map_2.insert({column_vp[i].first,val_2});
+        if (val_A != 0) col_map_A.insert({column_vp[i].first,val_A});
       }
-      col_map.insert({column_vp[i].first,val});
       cols.push_back(column_vp[i].first);
 
-      n_entries++;
+      if (count_split) {
+        if (val != 0) n_entries++;
+        if (val_2 != 0) n_entries_2++;
+        if (val_A != 0) n_entries_A++;
+      } else {
+        n_entries++;
+      }
       
       i = j; // increment
     }
 
     if (opt.count_em) {
       //std::cerr << "Running EM algorithm" << std::endl;
-      std::unordered_map<int32_t, double> c1,c2;
+      u_map_<int32_t, double> c1,c2;
       // initialize with unique counts
       for (const auto &x : cols) {
         double val = 0;
@@ -426,19 +517,77 @@ void bustools_count(Bustools_opt &opt) {
 
     }
 
-
-
     for (const auto &x : cols) {
       double val = 0;
       auto it = col_map.find(x);
-      if (it != col_map.end()) {
-        val = it->second;
+      if (!count_split) {
+        if (it != col_map.end()) val = it->second;
+        of << n_rows << " " << (x+1) << " " << val << "\n";
+      } else {
+        if (it != col_map.end()) {
+          val = it->second;
+          of << n_rows << " " << (x+1) << " " << val << "\n";
+        }
+        it = col_map_2.find(x);
+        if (it != col_map_2.end()) {
+          val = it->second;
+          of_2 << n_rows << " " << (x+1) << " " << val << "\n";
+        }
+        it = col_map_A.find(x);
+        if (it != col_map_A.end()) {
+          val = it->second;
+          of_A << n_rows << " " << (x+1) << " " << val << "\n";
+        }
+      }
+    }
+  };
+  
+  auto write_barcode_matrix_mult = [&](const std::vector<BUSData> &v) {
+    if(v.empty()) {
+      return;
+    }
+    column_vp.resize(0);
+    n_rows+= 1;
+    
+    barcodes.push_back(v[0].barcode);
+    double val = 0.0;
+    size_t n = v.size();
+    
+    for (size_t i = 0; i < n; i++) {
+      int32_t ec = v[i].ec;
+      if (!opt.count_gene_multimapping) {
+        ecs.resize(0);
+        ecs.push_back(ec);
+        intersect_genes_of_ecs(ecs, ec2genes, glist);
+	if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist);
+        int gn = glist.size();
+        if (gn != 1) {
+          continue;
+        }
       }
-      of << n_rows << " " << (x+1) << " " << val << "\n";
+      column_vp.push_back({ec,{v[i].count,COUNT_DEFAULT}});
+    }
+    std::sort(column_vp.begin(), column_vp.end());
+    size_t m = column_vp.size();
+    for (size_t i = 0; i < m; ) {
+      size_t j = i+1;
+      double val = column_vp[i].second.first;
+      for (; j < m; j++) {
+        if (column_vp[i].first != column_vp[j].first) {
+          break;
+        }
+        val += column_vp[j].second.first;
+      }
+      auto which_mtx = intersect_ecs_with_subset_txs(column_vp[i].first, ecmap, tx_split_lookup);
+      auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A);
+      auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A);
+      of_ << n_rows << " " << (column_vp[i].first+1) << " " << val << "\n";
+      n_entries_++;
+      i = j; // increment
     }
   };
 
-  for (const auto& infn : opt.files) { 
+  for (const auto& infn : opt.files) {
     std::streambuf *inbuf;
     std::ifstream inf;
     if (!opt.stream_in) {
@@ -467,9 +616,10 @@ void bustools_count(Bustools_opt &opt) {
           // output whatever is in v
           if (!v.empty()) {
             if (!opt.count_collapse) {
-              write_barcode_matrix(v);
+              if (!opt.count_cm) write_barcode_matrix(v);
+              else write_barcode_matrix_mult(v);
             } else {
-              write_barcode_matrix_collapsed(v);
+              write_barcode_matrix_collapsed(v); // Same signature for count_cm and !count_cm
             }
           }
           v.clear();
@@ -481,9 +631,10 @@ void bustools_count(Bustools_opt &opt) {
     }
     if (!v.empty()) {
       if (!opt.count_collapse) {
-        write_barcode_matrix(v);
+        if (!opt.count_cm) write_barcode_matrix(v);
+        else write_barcode_matrix_mult(v);
       } else {
-        write_barcode_matrix_collapsed(v);
+        write_barcode_matrix_collapsed(v); // Same signature for count_cm and !count_cm
       }
     }
 
@@ -500,9 +651,13 @@ void bustools_count(Bustools_opt &opt) {
   }
 
   of.close();
+  if (count_split) {
+    of_2.close();
+    of_A.close();
+  }
   
   //Rewrite header in a way that works for both Windows and Linux
-  std::stringstream ss;
+  std::stringstream ss, ss_2, ss_A;
   ss << n_rows << " " << n_cols << " " << n_entries;
   std::string header = ss.str();
   int hlen = header.size();
@@ -510,6 +665,22 @@ void bustools_count(Bustools_opt &opt) {
   of.open(mtx_ofn, std::ios::in | std::ios::out);
   of << headerComments << header;
   of.close();
+  if (count_split) {
+    ss_2 << n_rows << " " << n_cols << " " << n_entries_2;
+    header = ss_2.str();
+    hlen = header.size();
+    header = header + std::string(66 - hlen, ' ') + '\n';
+    of_2.open(mtx_ofn_split_2, std::ios::in | std::ios::out);
+    of_2 << headerComments << header;
+    of_2.close();
+    ss_A << n_rows << " " << n_cols << " " << n_entries_A;
+    header = ss_A.str();
+    hlen = header.size();
+    header = header + std::string(66 - hlen, ' ') + '\n';
+    of_A.open(mtx_ofn_split_A, std::ios::in | std::ios::out);
+    of_A << headerComments << header;
+    of_A.close();
+  }
 
   // write updated ec file
   h.ecs = std::move(ecmap);
@@ -519,11 +690,22 @@ void bustools_count(Bustools_opt &opt) {
     writeGenes(gene_ofn, genenames);
   }
   // write barcode file
+  bool write_prefix = false;
   std::ofstream bcof;
   bcof.open(barcodes_ofn);
+  uint64_t len_mask = ((1ULL << (2*bclen)) - 1);
   for (const auto &x : barcodes) {
+    if (x != (x & len_mask)) write_prefix = true;
     bcof << binaryToString(x, bclen) << "\n";
   }
+  if (write_prefix) {
+    std::ofstream bcprefixof;
+    bcprefixof.open(barcodes_prefix_ofn);
+    for (const auto &x : barcodes) {
+      bcprefixof << binaryToString(x >> (2*bclen), 16) << "\n"; // Always make prefix length 16
+    }
+    bcprefixof.close();
+  }
   bcof.close();
 
   //write histogram file
@@ -612,274 +794,5 @@ void bustools_count(Bustools_opt &opt) {
 }
 
 void bustools_count_mult(Bustools_opt &opt) {
-  BUSHeader h;
-  size_t nr = 0;
-  size_t N = 100000;
-  uint32_t bclen = 0;
-  BUSData* p = new BUSData[N];
-
-  // read and parse the equivalence class files
-
-  std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
-  std::vector<std::vector<int32_t>> ecmap;
-
-  std::unordered_map<std::string, int32_t> txnames;
-  parseTranscripts(opt.count_txp, txnames);
-  std::vector<int32_t> genemap(txnames.size(), -1);
-  std::unordered_map<std::string, int32_t> genenames;
-  parseGenes(opt.count_genes, txnames, genemap, genenames);
-  parseECs(opt.count_ecs, h);
-  ecmap = std::move(h.ecs);
-  ecmapinv.reserve(ecmap.size());
-  for (int32_t ec = 0; ec < ecmap.size(); ec++) {
-    ecmapinv.insert({ecmap[ec], ec});
-  }
-  std::vector<std::vector<int32_t>> ec2genes;        
-  create_ec2genes(ecmap, genemap, ec2genes);
-
-
-  std::ofstream of;
-  std::string mtx_ofn = opt.output + ".mtx";
-  std::string barcodes_ofn = opt.output + ".barcodes.txt";
-  std::string ec_ofn = opt.output + ".ec.txt";
-  std::string gene_ofn = opt.output + ".genes.txt";
-  of.open(mtx_ofn); 
-
-  // write out the initial header
-  of << "%%MatrixMarket matrix coordinate real general\n%\n";
-  // number of genes
-  auto mat_header_pos = of.tellp();
-  std::string dummy_header(66, '\n');
-  for (int i = 0; i < 33; i++) {
-    dummy_header[2*i] = '%';
-  }
-  of.write(dummy_header.c_str(), dummy_header.size());
-
-
-  size_t n_cols = 0;
-  size_t n_rows = 0;
-  size_t n_entries = 0;
-  std::vector<BUSData> v;
-  v.reserve(N);
-  uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL;
-  //temporary data
-  std::vector<int32_t> ecs;
-  std::vector<int32_t> glist;
-  ecs.reserve(100);
-  std::vector<int32_t> u;
-  u.reserve(100);
-  std::vector<int32_t> column_v;
-  std::vector<std::pair<int32_t, double>> column_vp;
-  if (!opt.count_collapse) {
-    column_vp.reserve(N); 
-  } else {
-    column_vp.reserve(N);
-    glist.reserve(100);
-  }
-  //barcodes 
-  std::vector<uint64_t> barcodes;
-  int bad_count = 0;
-  int compacted = 0;
-  int rescued = 0;
-
-
-  auto write_barcode_matrix = [&](const std::vector<BUSData> &v) {
-    if(v.empty()) {
-      return;
-    }
-    column_vp.resize(0);
-    n_rows+= 1;
-    
-    barcodes.push_back(v[0].barcode);
-    double val = 0.0;
-    size_t n = v.size();
-    
-    for (size_t i = 0; i < n; i++) {
-      int32_t ec = v[i].ec;
-      if (!opt.count_gene_multimapping) {
-        ecs.resize(0);
-        ecs.push_back(ec);
-        intersect_genes_of_ecs(ecs, ec2genes, glist);
-        int gn = glist.size();
-        if (gn != 1) {
-          continue;
-        }
-      }
-      column_vp.push_back({ec,v[i].count});
-    }
-    std::sort(column_vp.begin(), column_vp.end());
-    size_t m = column_vp.size();
-    for (size_t i = 0; i < m; ) {
-      size_t j = i+1;
-      double val = column_vp[i].second;
-      for (; j < m; j++) {
-        if (column_vp[i].first != column_vp[j].first) {
-          break;
-        }
-        val += column_vp[j].second;
-      }
-      n_entries++;
-      of << n_rows << " " << (column_vp[i].first+1) << " " << val << "\n";
-      i = j; // increment
-    }
-  };
-  
-  auto write_barcode_matrix_collapsed = [&](const std::vector<BUSData> &v) {
-    if(v.empty()) {
-      return;
-    }
-    column_vp.resize(0);
-    n_rows+= 1;
-    
-    barcodes.push_back(v[0].barcode);
-    double val = 0.0;
-    size_t n = v.size();
-
-    for (size_t i = 0; i < n; i++) {
-      ecs.resize(0);
-      ecs.push_back(v[i].ec);
-      
-      intersect_genes_of_ecs(ecs, ec2genes, glist);
-      int gn = glist.size();
-      if (gn > 0) {
-        if (opt.count_gene_multimapping) {
-          for (auto x : glist) {
-            column_vp.push_back({x, v[i].count/gn});
-          }
-        } else {
-          if (gn==1) {
-            column_vp.push_back({glist[0],v[i].count});
-          } 
-        }
-      }
-    }
-
-    std::sort(column_vp.begin(), column_vp.end());
-    size_t m = column_vp.size();
-    std::unordered_map<int32_t, double> col_map(m);
-    std::vector<int32_t> cols;
-
-    for (size_t i = 0; i < m; ) {
-      size_t j = i+1;
-      double val = column_vp[i].second;
-      for (; j < m; j++) {
-        if (column_vp[i].first != column_vp[j].first) {
-          break;
-        }
-        val += column_vp[j].second;
-      }
-      col_map.insert({column_vp[i].first,val});
-      cols.push_back(column_vp[i].first);
-
-      n_entries++;
-      
-      i = j; // increment
-    }
-
-
-
-    for (const auto &x : cols) {
-      double val = 0;
-      auto it = col_map.find(x);
-      if (it != col_map.end()) {
-        val = it->second;
-      }
-      of << n_rows << " " << (x+1) << " " << val << "\n";
-    }
-
-  };
-
-  for (const auto& infn : opt.files) { 
-    std::streambuf *inbuf;
-    std::ifstream inf;
-    if (!opt.stream_in) {
-      inf.open(infn.c_str(), std::ios::binary);
-      inbuf = inf.rdbuf();
-    } else {
-      inbuf = std::cin.rdbuf();
-    }
-    std::istream in(inbuf); 
-
-    parseHeader(in, h);
-    bclen = h.bclen;
-    
-    int rc = 0;
-    while (true) {
-      in.read((char*)p, N*sizeof(BUSData));
-      size_t rc = in.gcount() / sizeof(BUSData);
-      nr += rc;
-      if (rc == 0) {
-        break;
-      }
-
-      
-      for (size_t i = 0; i < rc; i++) {
-        if (p[i].barcode != current_bc) {                 
-          // output whatever is in v
-          if (!v.empty()) {
-            if (!opt.count_collapse) {
-              write_barcode_matrix(v);
-              } else {
-                write_barcode_matrix_collapsed(v);
-              }
-            }
-          v.clear();
-          current_bc = p[i].barcode;
-        }
-        v.push_back(p[i]);
-
-      }            
-    }
-    if (!v.empty()) {
-      if (!opt.count_collapse) {
-        write_barcode_matrix(v);
-      } else {
-        write_barcode_matrix_collapsed(v);
-      }
-    }
-
-    if (!opt.stream_in) {
-      inf.close();
-    }
-  }
-  delete[] p; p = nullptr;
-
-  if (!opt.count_collapse) {
-    n_cols = ecmap.size();
-  } else {
-    n_cols = genenames.size();
-  }
-
-  of.close();
-  
-  std::stringstream ss;
-  ss << n_rows << " " << n_cols << " " << n_entries << "\n";
-  std::string header = ss.str();
-  int hlen = header.size();
-  assert(hlen < 66);
-  of.open(mtx_ofn, std::ios::binary | std::ios::in | std::ios::out);
-  of.seekp(mat_header_pos);
-  of.write("%",1);
-  of.write(std::string(66-hlen-2,' ').c_str(),66-hlen-2);
-  of.write("\n",1);
-  of.write(header.c_str(), hlen);
-  of.close();
-
-  // write updated ec file
-  h.ecs = std::move(ecmap);
-  if (!opt.count_collapse) {
-    writeECs(ec_ofn, h);
-  } else {
-    writeGenes(gene_ofn, genenames);
-  }
-  // write barcode file
-  std::ofstream bcof;
-  bcof.open(barcodes_ofn);
-  for (const auto &x : barcodes) {
-    bcof << binaryToString(x, bclen) << "\n";
-  }
-  bcof.close();
-  //std::cerr << "bad counts = " << bad_count <<", rescued  =" << rescued << ", compacted = " << compacted << std::endl;
-
-  //std::cerr << "Read in " << nr << " BUS records" << std::endl;
+  bustools_count(opt);
 }
diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp
index ab954a8..fe85131 100644
--- a/src/bustools_main.cpp
+++ b/src/bustools_main.cpp
@@ -328,7 +328,7 @@ void parse_ProgramOptions_capture(int argc, char **argv, Bustools_opt &opt)
 
 void parse_ProgramOptions_count(int argc, char **argv, Bustools_opt &opt)
 {
-  const char *opt_string = "o:g:e:t:md:";
+  const char *opt_string = "o:g:e:t:md:s:";
   int gene_flag = 0;
   int umigene_flag = 0;
   int em_flag = 0;
@@ -348,6 +348,7 @@ void parse_ProgramOptions_count(int argc, char **argv, Bustools_opt &opt)
     {"hist", no_argument, &hist_flag, 1},
     {"downsample", required_argument, 0, 'd'},
     {"rawcounts", no_argument, &rawcounts_flag, 1},
+    {"split", required_argument, 0, 's'},
     {0, 0, 0, 0}};
   
   int option_index = 0, c;
@@ -375,24 +376,23 @@ void parse_ProgramOptions_count(int argc, char **argv, Bustools_opt &opt)
     case 'm':
       opt.count_gene_multimapping = true;
       break;
+    case 's':
+      opt.count_split = optarg;
+      break;
     default:
       break;
     }
   }
-  if (gene_flag)
-  {
+  if (gene_flag) {
     opt.count_collapse = true;
   }
-  if (umigene_flag)
-  {
+  if (umigene_flag) {
     opt.umi_gene_collapse = true;
   }
-  if (em_flag)
-  {
+  if (em_flag) {
     opt.count_em = true;
   }
-  if (cm_flag)
-  {
+  if (cm_flag) {
     opt.count_cm = true;
   }
   if (hist_flag) {
@@ -493,13 +493,14 @@ void parse_ProgramOptions_predict(int argc, char **argv, Bustools_opt& opt) {
 void parse_ProgramOptions_dump(int argc, char **argv, Bustools_opt &opt)
 {
   
-  const char *opt_string = "o:pfd";
+  const char *opt_string = "o:pfda";
   
   static struct option long_options[] = {
     {"output", required_argument, 0, 'o'},
     {"pipe", no_argument, 0, 'p'},
     {"flags", no_argument, 0, 'f'},
     {"pad", no_argument, 0, 'd'},
+    {"showAll", no_argument, 0, 'a'},
     {0, 0, 0, 0}};
   
   int option_index = 0, c;
@@ -521,6 +522,9 @@ void parse_ProgramOptions_dump(int argc, char **argv, Bustools_opt &opt)
     case 'd':
       opt.text_dumppad = true;
       break;
+    case 'a':
+      opt.text_showall = true;
+      break;
     default:
       break;
     }
@@ -576,13 +580,14 @@ void parse_ProgramOptions_fromtext(int argc, char **argv, Bustools_opt& opt) {
 void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
 {
   
-  const char *opt_string = "o:w:d:sp";
+  const char *opt_string = "o:w:d:spr";
   static struct option long_options[] = {
     {"output", required_argument, 0, 'o'},
     {"whitelist", required_argument, 0, 'w'},
     {"dump", required_argument, 0, 'd'},
     {"split", no_argument, 0, 's'},
     {"pipe", no_argument, 0, 'p'},
+    {"replace", no_argument, 0, 'r'},
     {0, 0, 0, 0}};
   
   int option_index = 0, c;
@@ -608,6 +613,9 @@ void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
     case 'p':
       opt.stream_out = true;
       break;
+    case 'r':
+      opt.barcode_replacement = true;
+      break;
     default:
       break;
     }
@@ -1604,7 +1612,7 @@ bool check_ProgramOptions_correct(Bustools_opt &opt)
   
   if (opt.whitelist.size() == 0)
   {
-    std::cerr << "Error: Missing whitelist file" << std::endl;
+    std::cerr << "Error: Missing on-list file" << std::endl;
     ret = false;
   }
   else
@@ -1632,95 +1640,75 @@ bool check_ProgramOptions_count(Bustools_opt &opt)
   bool ret = true;
   
   // check for output directory
-  if (opt.output.empty())
-  {
+  if (opt.output.empty()) {
     std::cerr << "Error: Missing output directory" << std::endl;
     ret = false;
   }
-  else
-  {
+  else {
     bool isDir = false;
-    if (checkDirectoryExists(opt.output))
-    {
+    if (checkDirectoryExists(opt.output)) {
       isDir = true;
     }
-    else
-    {
-      if (opt.output.at(opt.output.size() - 1) == '/')
-      {
-        if (my_mkdir(opt.output.c_str(), 0777) == -1)
-        {
+    else {
+      if (opt.output.at(opt.output.size() - 1) == '/') {
+        if (my_mkdir(opt.output.c_str(), 0777) == -1) {
           std::cerr << "Error: could not create directory " << opt.output << std::endl;
           ret = false;
         }
-        else
-        {
+        else {
           isDir = true;
         }
       }
     }
     
-    if (isDir)
-    {
+    if (isDir) {
       opt.output += "output";
     }
   }
   
-  if (opt.count_em && opt.count_gene_multimapping)
-  {
+  if (opt.count_em && opt.count_gene_multimapping) {
     std::cerr << "Error: EM algorithm and counting multimapping reads are incompatible" << std::endl;
     ret = false;
   }
   
-  if (opt.count_em && opt.count_cm)
-  {
+  if (opt.count_em && opt.count_cm) {
     std::cerr << "Error: EM algorithm and counting multiplicites are incompatible" << std::endl;
     ret = false;
   }
   
-  if (opt.umi_gene_collapse && opt.count_cm)
-  {
+  if (opt.umi_gene_collapse && opt.count_cm) {
     std::cerr << "Error: Gene-level collapsing of UMIs and counting multiplicites are incompatible" << std::endl;
     ret = false;
   }
   
-  if (opt.umi_gene_collapse && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0))
-  {
+  if (opt.umi_gene_collapse && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0)) {
     std::cerr << "Error: Gene-level collapsing of UMIs is currently incompatible with --hist, --downsample, or --rawcounts" << std::endl;
     ret = false;
   }
   
-  if (opt.count_cm && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0))
-  {
+  if (opt.count_cm && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0)) {
     std::cerr << "Error: Counting multiplicites is incompatible with --hist, --downsample, or --rawcounts" << std::endl;
     ret = false;
   }
   
-  if (opt.count_raw_counts && opt.count_em) 
-  {
+  if (opt.count_raw_counts && opt.count_em)  {
     std::cerr << "Error: Counting raw counts are not supported for the EM algorithm" << std::endl;
     ret = false;
   }
   
-  if (opt.count_raw_counts && !opt.count_collapse) 
-  {
+  if (opt.count_raw_counts && !opt.count_collapse)  {
     std::cerr << "Error: Raw counts are currently only supported for gene counting, not ec counting." << std::endl;
     ret = false;
   }
   
-  if (opt.files.size() == 0)
-  {
+  if (opt.files.size() == 0) {
     std::cerr << "Error: Missing BUS input files" << std::endl;
     ret = false;
   }
-  else
-  {
-    if (!opt.stream_in)
-    {
-      for (const auto &it : opt.files)
-      {
-        if (!checkFileExists(it))
-        {
+  else {
+    if (!opt.stream_in) {
+      for (const auto &it : opt.files) {
+        if (!checkFileExists(it)) {
           std::cerr << "Error: File not found, " << it << std::endl;
           ret = false;
         }
@@ -1728,13 +1716,11 @@ bool check_ProgramOptions_count(Bustools_opt &opt)
     }
   }
   
-  if (opt.count_genes.size() == 0)
-  {
+  if (opt.count_genes.size() == 0) {
     std::cerr << "Error: missing gene mapping file" << std::endl;
     ret = false;
   }
-  else
-  {
+  else {
     if (!checkFileExists(opt.count_genes))
     {
       std::cerr << "Error: File not found " << opt.count_genes << std::endl;
@@ -1742,13 +1728,11 @@ bool check_ProgramOptions_count(Bustools_opt &opt)
     }
   }
   
-  if (opt.count_ecs.size() == 0)
-  {
+  if (opt.count_ecs.size() == 0) {
     std::cerr << "Error: missing equivalence class mapping file" << std::endl;
     ret = false;
   }
-  else
-  {
+  else {
     if (!checkFileExists(opt.count_ecs))
     {
       std::cerr << "Error: File not found " << opt.count_ecs << std::endl;
@@ -1756,20 +1740,28 @@ bool check_ProgramOptions_count(Bustools_opt &opt)
     }
   }
   
-  if (opt.count_txp.size() == 0)
-  {
+  if (opt.count_txp.size() == 0) {
     std::cerr << "Error: missing transcript name file" << std::endl;
     ret = false;
   }
-  else
-  {
-    if (!checkFileExists(opt.count_txp))
-    {
+  else {
+    if (!checkFileExists(opt.count_txp)) {
       std::cerr << "Error: File not found " << opt.count_txp << std::endl;
       ret = false;
     }
   }
   
+  if (opt.count_split.size() != 0) {
+    if (!checkFileExists(opt.count_split)) {
+      std::cerr << "Error: File not found " << opt.count_split << std::endl;
+      ret = false;
+    }
+    if (opt.count_em) {
+      std::cerr << "Cannot use -s with --em" << std::endl;
+      ret = false;
+    }
+  }
+  
   return ret;
 }
 
@@ -2619,6 +2611,7 @@ void Bustools_dump_Usage()
             << "-f, --flags           Write the flag column" << std::endl
             << "-d, --pad             Write the pad column" << std::endl
             << "-p, --pipe            Write to standard output" << std::endl
+            << "-a, --showAll         Show hidden metadata in barcodes" << std::endl
             << std::endl;
 }
 
@@ -2637,10 +2630,10 @@ void Bustools_correct_Usage()
             << std::endl
             << "Options: " << std::endl
             << "-o, --output          File for corrected bus output" << std::endl
-            << "-w, --whitelist       File of whitelisted barcodes to correct to" << std::endl
+            << "-w, --whitelist       File of on-list barcodes to correct to" << std::endl
             << "-p, --pipe            Write to standard output" << std::endl
             << "-d, --dump            Dump uncorrected to corrected barcodes (optional)" << std::endl
-            << "-s, --split           Split the whitelist and correct each half independently (optional)" << std::endl
+            << "-r, --replace         The file of on-list barcodes is a barcode replacement file" << std::endl
             << std::endl;
 }
 
@@ -2657,6 +2650,7 @@ void Bustools_count_Usage()
             << "    --umi-gene        Perform gene-level collapsing of UMIs" << std::endl
             << "    --em              Estimate gene abundances using EM algorithm" << std::endl
             << "    --cm              Count multiplicites instead of UMIs" << std::endl
+            << "-s, --split           Split output matrix in two (plus ambiguous) based on transcripts supplied in this file" << std::endl
             << "-m, --multimapping    Include bus records that pseudoalign to multiple genes" << std::endl
             << "    --hist            Output copy per UMI histograms for all genes" << std::endl 
             << "-d  --downsample      Specify a factor between 0 and 1 specifying how much to downsample" << std::endl 
diff --git a/src/bustools_mash.cpp b/src/bustools_mash.cpp
index d9c3d17..1df9702 100644
--- a/src/bustools_mash.cpp
+++ b/src/bustools_mash.cpp
@@ -12,7 +12,7 @@
 
 #include "bustools_merge.h"
 
-inline std::vector<int32_t> get_tids(const BUSHeader &oh, const std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, const int32_t &eid)
+inline std::vector<int32_t> get_tids(const BUSHeader &oh, const u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, const int32_t &eid)
 {
 
     std::vector<int32_t> tids = oh.ecs[eid];
@@ -56,7 +56,7 @@ void bustools_mash(const Bustools_opt &opt)
     std::cerr << "[info] parsed output.bus files" << std::endl;
 
     // parse the transcripts.txt
-    std::unordered_map<std::string, int32_t> txn_tid;
+    u_map_<std::string, int32_t> txn_tid;
     std::vector<std::vector<int32_t>> tids_per_file; // list of tids as they occur for each file
     std::vector<int32_t> tids;                       // a vector of tids
     int32_t tid = 0;
@@ -101,12 +101,14 @@ void bustools_mash(const Bustools_opt &opt)
     oh.bclen = vh[0].bclen;
     oh.umilen = vh[0].umilen;
 
-    std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv; // set{tids} (ec) to eid it came from
+    u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv; // set{tids} (ec) to eid it came from
 
     for (int32_t i = 0; i < tid; i++)
     {
-        oh.ecs.push_back({i});
-        ecmapinv.insert({{i}, i});
+        std::vector<int32_t> tmp_vec;
+        tmp_vec.push_back(i);
+        oh.ecs.push_back(tmp_vec);
+        ecmapinv.insert({tmp_vec, i});
     }
 
     std::vector<std::vector<int32_t>> eids_per_file;
diff --git a/src/bustools_merge.cpp b/src/bustools_merge.cpp
index f330efe..c89eae3 100644
--- a/src/bustools_merge.cpp
+++ b/src/bustools_merge.cpp
@@ -57,7 +57,7 @@ void bustools_merge_different_index(const Bustools_opt &opt)
   std::ifstream ifn(opt.count_txp);
   std::string txn;
   int32_t tid;
-  std::unordered_map<std::string, int32_t> txn_tid;
+  u_map_<std::string, int32_t> txn_tid;
   std::vector<int32_t> tids;
 
   // insert tids into a vector
@@ -81,7 +81,7 @@ void bustools_merge_different_index(const Bustools_opt &opt)
   BUSHeader h, bh;
   parseECs(opt.count_ecs, h);
   // put the ecs into a ecmap inv
-  std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
+  u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
 
   for (std::size_t ec = 0; ec < h.ecs.size(); ec++)
   {
diff --git a/src/bustools_project.cpp b/src/bustools_project.cpp
index c5b1979..6d1e195 100644
--- a/src/bustools_project.cpp
+++ b/src/bustools_project.cpp
@@ -20,7 +20,7 @@ void bustools_project(Bustools_opt &opt) {
   size_t stat_map = 0;
   size_t stat_unmap = 0;
 
-  std::unordered_map<uint64_t, uint64_t> project_map;
+  u_map_<uint64_t, uint64_t> project_map;
 
   /* Load the map into project_map variable
   parse bus records and map each object (barcode, umi) with project_map
@@ -175,18 +175,18 @@ void bustools_project(Bustools_opt &opt) {
   }
   if (opt.type == PROJECT_TX) {
     std::ofstream of;
-    std::unordered_map<std::string, int32_t> txnames;
+    u_map_<std::string, int32_t> txnames;
     parseTranscripts(opt.count_txp, txnames);
 
     std::vector<int32_t> genemap(txnames.size(), -1);
-    std::unordered_map<std::string, int32_t> genenames;
+    u_map_<std::string, int32_t> genenames;
     parseGenes(opt.map, txnames, genemap, genenames);
     std::vector<std::string> genenamesinv(genenames.size(), "");
     for (const auto &gene : genenames) {
       genenamesinv[gene.second] = gene.first;
     }
 
-    std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
+    u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
     std::vector<std::vector<int32_t>> ecmap;
     parseECs(opt.count_ecs, h);
     ecmap = std::move(h.ecs);
@@ -199,7 +199,7 @@ void bustools_project(Bustools_opt &opt) {
     create_ec2genes(ecmap, genemap, ec2genes);
 
     std::vector<std::vector<int32_t>> geneEc2genes = ec2genes;
-    std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> geneEc2genesinv;
+    u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> geneEc2genesinv;
     std::sort(geneEc2genes.begin(), geneEc2genes.end());
     auto firstNonempty = geneEc2genes.begin();
     while (firstNonempty->size() == 0 && firstNonempty != geneEc2genes.end()) {
@@ -284,7 +284,7 @@ void bustools_project(Bustools_opt &opt) {
     BUSData *p = new BUSData[N];
     BUSData currRec;
     // Gene EC --> counts for current barcode/UMI pair
-    std::unordered_map<uint32_t, uint32_t> counts;
+    u_map_<uint32_t, uint32_t> counts;
     
     while (true) {
       in.read((char*) p, N * sizeof(BUSData));
diff --git a/src/bustools_sort.cpp b/src/bustools_sort.cpp
index 0f2323c..adc5e7e 100644
--- a/src/bustools_sort.cpp
+++ b/src/bustools_sort.cpp
@@ -4,11 +4,14 @@
 #include <algorithm>
 #include <queue>
 #include <functional>
+#include <thread>
 
 #include "Common.hpp"
 #include "BUSData.h"
 #include "bustools_sort.h"
 
+#include <time.h>
+
 #define TP std::pair<BUSData, int>
 
 //This code is for automatically creating the tmp directory supplied if it doesn't exist
@@ -16,9 +19,61 @@
 //#include <filesystem> //once filesystem is acceptable for minGW, switch to that
 #include "windows.h" //Needed for CreateDirectory
 
+
+  void EnsureWindowsTempDirectoryExists(const Bustools_opt &opt) {
+    //Make sure to create the tmp directory if it doesn't exist - writing temporary files fails otherwise in Windows
+    //First get the directory - in theory, opt.temp_files can look like "tmp/x_" or just "x_" (or even nothing)
+    //so we should find the last slash and make sure that directory exists
+
+    std::size_t ind = opt.temp_files.rfind('/');
+    std::size_t ind2 = opt.temp_files.rfind('\\');
+    if (ind == std::string::npos)
+    {
+    ind = ind2;
+    }
+    else if (ind2 != std::string::npos)
+    {
+    //both valid, take the largest value (representing the last slash)
+    ind = std::max(ind, ind2);
+    }
+    if (ind != std::string::npos)
+    {
+      auto dirName = opt.temp_files.substr(0, ind);
+    //When our MinGW builds support c++17, change to std::filesystem
+  
+      //std::filesystem::path filepath = dirName;
+      //if (!std::filesystem::is_directory(filepath)) 
+      //{
+      //    std::filesystem::create_directory(filepath);
+      //}
+    CreateDirectory(dirName.c_str(), NULL); //This will do nothing if the directory exists already
+    }
+  }
+
+    //There is a bug in Windows, where bustools sort fails. The problem is that 
+  //gcount for some reason fails here if too much is read and returns 0, even though
+  //it succeeds. Could perhaps be a 32 bit issue somewhere, does size_t become 32 bits?
+  //Anyway, this is a workaround that fixes the issue - does the same as the flag -m 100000000.
+  //An interesting observation is that opt.max_memory is set to 1 << 32, which will become exactly
+  //zero if truncated to 32 bits...
+  size_t WindowsMaxMemory(size_t mem) {
+
+    const size_t win_mem_max = 1e8;
+    if (mem > win_mem_max) {
+    mem = win_mem_max;
+    }
+    return mem;
+  }
+#else
+  void EnsureWindowsTempDirectoryExists(const Bustools_opt &opt) {}
+  size_t WindowsMaxMemory(size_t mem) {return mem;}
+
 #endif
 
 
+
+
+
 inline bool cmp1(const BUSData &a, const BUSData &b)
 {
   if (a.barcode == b.barcode)
@@ -328,22 +383,132 @@ inline bool ncmp5(const TP &a, const TP &b)
   }
 };
 
+
+void sort_bus_array(BUSData* busdata, size_t N, const int t, bool (*cmp)(const BUSData &, const BUSData &)) {
+  //std::sort(busdata, busdata + N, cmp);
+  if (t > 1 && N > 100000) {
+    const size_t s = 256;
+    std::vector<BUSData> samples, pivots;
+
+    // samples = drawn from 0, s, 2s, ... , t*s
+    samples.reserve(s*t);
+    for (int i = 0; i < s*t; ++i) {
+      samples.push_back(busdata[i * (N / (s*t))]);
+    }
+    std::sort(samples.begin(), samples.end(), cmp);
+
+    pivots.reserve(t-1);
+    // piviots are samples s, 2s, ... , (t-1)*s
+    for (int i = 1; i < t; ++i) {
+      pivots.push_back(samples[i * s]);
+      
+      //std::cerr << "pivot " << i << " = " << binaryToString(pivots[i-1].barcode, 16) << std::endl;
+    }
+
+    // buckets are locations of pivots after partitioning
+    // partition i is between buckets[i] and buckets[i+1]
+    std::vector<size_t> buckets(t+1, 0);
+    buckets[0] = 0;
+    buckets[t] = N;
+
+
+    double partition_time = 0;
+    clock_t start, end;
+    start = clock();
+    /*
+    for (int i = 0; i < t-1; i++) {
+      BUSData p = pivots[i];
+      //std::cerr << "partitioning around " << binaryToString(p.barcode, 16) << std::endl;
+      auto mid =  std::partition(busdata + buckets[i], busdata + N, [&p, &cmp](const BUSData &a) { return cmp(a, p); }) - busdata;
+      buckets[i+1] = mid;
+      //std::cerr << "bucket " << i << " has " << buckets[i+1] - buckets[i] << " elements, mid =  " << mid << std::endl;
+    }
+    //std::cerr << "bucket " << t-1 << " has " << buckets[t] - buckets[t-1] << " elements" << std::endl;
+    */
+
+    // partition the busdata based on the middle pivot
+    std::function<void(int,int)> mid_partition = [&](int i, int j) { 
+      if (j-i <= 1) {
+        return;
+      }
+      size_t k = i + (j-i)/2;
+      //std::cerr << "partitioning " << i << " to " << j << " with middle " << k << std::endl;
+      //std::cerr << "buckets i and j are " << buckets[i] << " and " << buckets[j] << std::endl;
+      BUSData p = pivots[k-1];
+      //std::cerr << "pivot element is " << binaryToString(p.barcode, 16) << std::endl;
+      buckets[k] = std::partition(busdata + buckets[i], busdata + buckets[j], [&p, &cmp](const BUSData &a) { return cmp(a, p); }) - busdata;
+      //std::cerr << "bucket " << k << " is " <<  buckets[k] << std::endl;
+      mid_partition(i, k);
+      mid_partition(k, j);
+    };
+
+    mid_partition(0, t);
+    
+
+    /*
+    // verify that the pivots are sorted
+    for (int i = 0; i < t-2; i++) {
+      if (!cmp(pivots[i], pivots[i+1])) {
+        std::cerr << "pivot " << i << " is not smaller than pivot " << i+1 << std::endl;
+        exit(1);
+      }
+    }
+    
+    for (int i = 0; i < t; i++) {
+      std::cerr << "bucket " << i << " at " << buckets[i] <<  " has " << buckets[i+1] - buckets[i] << " elements" << std::endl;
+    }
+    
+    //verify that each partition is smaller than the pivot
+    for (int i = 0; i < t; i++) {
+      for (size_t j = buckets[i]; j < buckets[i+1]; j++) {
+        if (i < t-1 && !cmp(busdata[j], pivots[i])) {
+          std::cerr << "partition " << i << " has an element larger than the pivot" << std::endl;
+          std::cerr << "element " << j << " = " << binaryToString(busdata[j].barcode, 16) << std::endl;
+          std::cerr << "pivot " << i << " = " << binaryToString(pivots[i].barcode, 16) << std::endl;
+
+          exit(1);
+        }
+        if (i > 0 && cmp(busdata[j], pivots[i-1])) {
+          std::cerr << "partition " << i << " has an element smaller than the next pivot" << std::endl;
+          std::cerr << "element " << j << " = " << binaryToString(busdata[j].barcode, 16) << std::endl;
+          std::cerr << "pivot " << i-1 << " = " << binaryToString(pivots[i-1].barcode, 16) << std::endl;
+          exit(1);
+        }
+      }
+    }
+    */
+
+    
+    end = clock();
+    partition_time  += ((double) (end - start)) / CLOCKS_PER_SEC;
+    std::cerr << "partition time: " << partition_time << "s" << std::endl;
+
+
+
+    // sort each bucket
+    std::vector<std::thread> workers;
+    for (int i = 0; i < t; ++i) {
+      workers.push_back(std::thread([&busdata, &buckets, &cmp, i]() {
+        //std::cerr << "sorting bucket " << i << " with " << buckets[i] <<  " to " << buckets[i+1]<< std::endl;
+        std::sort(busdata + buckets[i], busdata + buckets[i+1], cmp);
+      }));
+    }
+
+    for (auto &w : workers) {
+      w.join();
+    }
+
+
+  } else {
+    std::sort(busdata, busdata + N, cmp);
+  }
+
+}
+
 void bustools_sort(const Bustools_opt &opt)
 {
-  auto mem = opt.max_memory;
-  //There is a bug in Windows, where bustools sort fails. The problem is that 
-  //gcount for some reason fails here if too much is read and returns 0, even though
-  //it succeeds. Could perhaps be a 32 bit issue somewhere, does size_t become 32 bits?
-  //Anyway, this is a workaround that fixes the issue - does the same as the flag -m 100000000.
-  //An interesting observation is that opt.max_memory is set to 1 << 32, which will become exactly
-  //zero if truncated to 32 bits...
-#if defined(__MINGW32__) || defined(_MSC_VER)
-  const size_t win_mem_max = 1e8;
-  if (mem > win_mem_max)
-  {
-	mem = win_mem_max;
-  }
-#endif
+  auto mem = WindowsMaxMemory(opt.max_memory);
+
   BUSHeader h;
   size_t N = mem / sizeof(BUSData);
   BUSData *p = new BUSData[N];
@@ -381,49 +546,77 @@ void bustools_sort(const Bustools_opt &opt)
     exit(1);
   }
 
-#if defined(__MINGW32__) || defined(_MSC_VER)
-  //Make sure to create the tmp directory if it doesn't exist - writing temporary files fails otherwise in Windows
-  //First get the directory - in theory, opt.temp_files can look like "tmp/x_" or just "x_" (or even nothing)
-  //so we should find the last slash and make sure that directory exists
 
-  std::size_t ind = opt.temp_files.rfind('/');
-  std::size_t ind2 = opt.temp_files.rfind('\\');
-  if (ind == std::string::npos)
-  {
-	ind = ind2;
-  }
-  else if (ind2 != std::string::npos)
-  {
-	//both valid, take the largest value (representing the last slash)
-	ind = std::max(ind, ind2);
-  }
-  if (ind != std::string::npos)
-  {
-    auto dirName = opt.temp_files.substr(0, ind);
-	//When our MinGW builds support c++17, change to std::filesystem
- 
-    //std::filesystem::path filepath = dirName;
-    //if (!std::filesystem::is_directory(filepath)) 
-    //{
-    //    std::filesystem::create_directory(filepath);
-    //}
-	CreateDirectory(dirName.c_str(), NULL); //This will do nothing if the directory exists already
-  }
-#endif
+  
 
-  size_t sc = 0;
+  size_t sc = 0; // number of records read
+  double sorting_time = 0;
   int tmp_file_no = 0;
-  for (const auto &infn : opt.files)
-  {
+
+  // only use a single buffer if we are reading from stdin or if we have a single file
+  bool all_in_buffer = opt.stream_in || opt.files.size() == 1;
+
+
+  const auto collapse_and_write = [&](BUSData *p, size_t rc, std::ostream &outf) {
+    size_t batch = 1<<20;
+    std::vector<BUSData> v;
+    v.reserve(batch);
+
+    for (size_t i = 0; i < rc;) {
+      size_t j = i + 1;
+      uint32_t c = p[i].count;
+      auto ec = p[i].ec;
+      for (; j < rc; j++) {
+        if (p[i].barcode == p[j].barcode && p[i].UMI == p[j].UMI && p[i].ec == p[j].ec && p[i].flags == p[j].flags && p[i].pad == p[j].pad) {
+          c += p[j].count;
+        } else {
+          break;
+        }
+      }
+      // merge identical things
+      p[i].count = c;
+
+      // push back p to the vector
+      v.push_back(p[i]);
+
+      if (v.size() >= batch) {
+        outf.write((char *)v.data(), v.size() * sizeof(BUSData));
+        v.clear();
+      }
+
+      //outf.write((char *)(&(p[i])), sizeof(p[i]));
+      // increment
+      i = j;
+    }
+    if (v.size() > 0) {
+      outf.write((char *)v.data(), v.size() * sizeof(BUSData));
+      v.clear();
+    }
+  };
+
+  // open the correct output stream
+  std::ofstream of;
+  std::streambuf *buf = nullptr;
+  if (!opt.stream_out) {
+      of.open(opt.output, std::ios::out | std::ios::binary);
+      buf = of.rdbuf();
+  } else {
+      buf = std::cout.rdbuf();
+  }
+  std::ostream busf_out(buf);
+
+  // measure time spent reading input
+  clock_t start,end;
+  double reading_time = 0;
+  double writing_time = 0;
+
+  for (const auto &infn : opt.files) {
     std::streambuf *inbuf;
     std::ifstream inf;
-    if (!opt.stream_in)
-    {
+    if (!opt.stream_in) {
       inf.open(infn.c_str(), std::ios::binary);
       inbuf = inf.rdbuf();
-    }
-    else
-    {
+    } else {
       inbuf = std::cin.rdbuf();
     }
     std::istream in(inbuf);
@@ -432,163 +625,163 @@ void bustools_sort(const Bustools_opt &opt)
 
     int rc = 1;
 
-    while (in.good())
-    {
-      // read as much as we can
+
+    while (in.good()) {
+      
+      start = clock();
       in.read((char *)p, N * sizeof(BUSData));
       size_t rc = in.gcount() / sizeof(BUSData);
-      if (rc == 0)
-      {
+      end = clock();
+      reading_time += ((double) (end - start)) / CLOCKS_PER_SEC;
+
+      // no records read, we are done
+      if (rc == 0) {
         break;
       }
+
+      // records did not fit in buffer
+      if (rc >= N) {
+        all_in_buffer = false;
+      }
+      
       // now sort the data
-      std::sort(p, p + rc, cmp);
-      sc += rc;
+      start = clock();
+      //std::sort(p, p + rc, cmp);
+      sort_bus_array(p, rc, opt.threads, cmp);
+      end = clock();
+      sorting_time += ((double) (end - start)) / CLOCKS_PER_SEC;
 
-      // write the output
-      std::ofstream outf(opt.temp_files + std::to_string(tmp_file_no), std::ios::binary);
-      writeHeader(outf, h);
+      sc += rc;
 
-      for (size_t i = 0; i < rc;)
-      {
-        size_t j = i + 1;
-        uint32_t c = p[i].count;
-        auto ec = p[i].ec;
-        for (; j < rc; j++)
-        {
-          if (p[i].barcode == p[j].barcode && p[i].UMI == p[j].UMI && p[i].ec == p[j].ec && p[i].flags == p[j].flags && p[i].pad == p[j].pad)
-          {
-            c += p[j].count;
-          }
-          else
-          {
-            break;
-          }
-        }
-        // merge identical things
-        p[i].count = c;
-        outf.write((char *)(&(p[i])), sizeof(p[i]));
-        // increment
-        i = j;
+      if (all_in_buffer) {
+        std::cerr << " all fits in buffer" << std::endl;
+        // single file or stream, all data fits in buffer, write directly to output
+        start = clock();
+        writeHeader(busf_out, h);
+        collapse_and_write(p, rc, busf_out);
+        end = clock();
+        writing_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+      } else {
+        // need to sort in chunks
+        // write the output
+        std::ofstream outf(opt.temp_files + std::to_string(tmp_file_no), std::ios::binary);
+        writeHeader(outf, h);
+
+        collapse_and_write(p, rc, outf);
+    
+        outf.close();
+        tmp_file_no++;
       }
-
-      outf.close();
-      tmp_file_no++;
+      
     }
   }
   delete[] p;
   p = nullptr;
 
   std::cerr << "Read in " << sc << " BUS records" << std::endl;
+  
+  std::cerr << "reading time " << reading_time << "s" << std::endl;
+  std::cerr << "sorting time " << sorting_time << "s" << std::endl;
+  std::cerr << "writing time " << writing_time << "s" << std::endl;
 
-  std::streambuf *buf = nullptr;
-  std::ofstream of;
 
-  if (!opt.stream_out)
-  {
-    of.open(opt.output, std::ios::out | std::ios::binary);
-    buf = of.rdbuf();
-  }
-  else
-  {
-    buf = std::cout.rdbuf();
-  }
-  std::ostream busf_out(buf);
 
-  writeHeader(busf_out, h);
+  
+  if (!all_in_buffer) {
+    writeHeader(busf_out, h);
 
-  // todo: skip writing to disk if it fits in memory
-  if (tmp_file_no == 1)
-  {
-    size_t M = N / 8;
-    p = new BUSData[M];
-    std::ifstream in(opt.temp_files + "0", std::ios::binary);
-    BUSHeader tmp;
-    parseHeader(in, tmp);
-    while (in.good())
+    if (tmp_file_no == 1)
     {
-      // read as much as we can
-      in.read((char *)p, M * sizeof(BUSData));
-      size_t rc = in.gcount() / sizeof(BUSData);
-      if (rc == 0)
+      size_t M = N / 8;
+      p = new BUSData[M];
+      std::ifstream in(opt.temp_files + "0", std::ios::binary);
+      BUSHeader tmp;
+      parseHeader(in, tmp);
+      while (in.good())
       {
-        break;
+        // read as much as we can
+        in.read((char *)p, M * sizeof(BUSData));
+        size_t rc = in.gcount() / sizeof(BUSData);
+        if (rc == 0)
+        {
+          break;
+        }
+        busf_out.write((char *)p, rc * sizeof(BUSData));
       }
-      busf_out.write((char *)p, rc * sizeof(BUSData));
-    }
-    in.close();
-    std::remove((opt.temp_files + "0").c_str());
-  }
-  else
-  {
-    // TODO: test if replacing with k-way merge is better
-    // adapted from https://github.com/arq5x/kway-mergesort/blob/master/kwaymergesort.h
-    int k = tmp_file_no;
-    size_t M = N / (k);
-    //std::memset(p, 0, N*sizeof(BUSData));
-    std::vector<std::ifstream> bf(k);
-    for (int i = 0; i < k; i++)
-    {
-      bf[i].open((opt.temp_files + std::to_string(i)).c_str(), std::ios::binary);
-      BUSHeader tmp;
-      parseHeader(bf[i], tmp);
+      in.close();
+      std::remove((opt.temp_files + "0").c_str());
     }
-
-    std::priority_queue<TP, std::vector<TP>, std::function<bool(const TP &a, const TP &b)>> pq(ncmp);
-    BUSData t;
-    for (int i = 0; i < k; i++)
-    {
-      bf[i].read((char *)&t, sizeof(t));
-      pq.push({t, i});
-    }
-
-    BUSData curr = pq.top().first;
-    curr.count = 0; // we'll count this again in the first loop
-    while (!pq.empty())
+    else
     {
-      TP min = pq.top();
+      // TODO: test if replacing with k-way merge is better
+      // adapted from https://github.com/arq5x/kway-mergesort/blob/master/kwaymergesort.h
+      int k = tmp_file_no;
+      size_t M = N / (k);
+      //std::memset(p, 0, N*sizeof(BUSData));
+      std::vector<std::ifstream> bf(k);
+      for (int i = 0; i < k; i++)
+      {
+        bf[i].open((opt.temp_files + std::to_string(i)).c_str(), std::ios::binary);
+        BUSHeader tmp;
+        parseHeader(bf[i], tmp);
+      }
 
-      pq.pop();
-      // process the data
-      BUSData &m = min.first;
-      int i = min.second;
-      if (m.barcode == curr.barcode && m.UMI == curr.UMI && m.ec == curr.ec && m.flags == curr.flags && m.pad == curr.pad)
+      std::priority_queue<TP, std::vector<TP>, std::function<bool(const TP &a, const TP &b)>> pq(ncmp);
+      BUSData t;
+      for (int i = 0; i < k; i++)
       {
-        // same data, increase count
-        curr.count += m.count;
+        bf[i].read((char *)&t, sizeof(t));
+        pq.push({t, i});
       }
-      else
+
+      BUSData curr = pq.top().first;
+      curr.count = 0; // we'll count this again in the first loop
+      while (!pq.empty())
       {
+        TP min = pq.top();
 
-        // new data let's output curr, new curr is m
-        if (curr.count != 0)
+        pq.pop();
+        // process the data
+        BUSData &m = min.first;
+        int i = min.second;
+        if (m.barcode == curr.barcode && m.UMI == curr.UMI && m.ec == curr.ec && m.flags == curr.flags && m.pad == curr.pad)
         {
-          busf_out.write((char *)&curr, sizeof(curr));
+          // same data, increase count
+          curr.count += m.count;
         }
-        curr = m;
-      }
-      // read next from stream
-      if (bf[i].good())
-      {
-        bf[i].read((char *)&t, sizeof(t));
-        if (bf[i].gcount() > 0)
+        else
+        {
+
+          // new data let's output curr, new curr is m
+          if (curr.count != 0)
+          {
+            busf_out.write((char *)&curr, sizeof(curr));
+          }
+          curr = m;
+        }
+        // read next from stream
+        if (bf[i].good())
         {
-          pq.push({t, i});
+          bf[i].read((char *)&t, sizeof(t));
+          if (bf[i].gcount() > 0)
+          {
+            pq.push({t, i});
+          }
         }
       }
-    }
 
-    if (curr.count > 0)
-    {
-      // write out remaining straggler
-      busf_out.write((char *)&curr, sizeof(curr));
-    }
+      if (curr.count > 0)
+      {
+        // write out remaining straggler
+        busf_out.write((char *)&curr, sizeof(curr));
+      }
 
-    // remove intermediary files
-    for (int i = 0; i < k; i++)
-    {
-      bf[i].close();
-      std::remove((opt.temp_files + std::to_string(i)).c_str());
+      // remove intermediary files
+      for (int i = 0; i < k; i++)
+      {
+        bf[i].close();
+        std::remove((opt.temp_files + std::to_string(i)).c_str());
+      }
     }
   }
 
diff --git a/src/bustools_text.cpp b/src/bustools_text.cpp
index 2c86d3e..9eff0dc 100644
--- a/src/bustools_text.cpp
+++ b/src/bustools_text.cpp
@@ -46,6 +46,9 @@ void bustools_text(const Bustools_opt& opt) {
 		parseHeader(in, h);
 		uint32_t bclen = h.bclen;
 		uint32_t umilen = h.umilen;
+		if (opt.text_showall) {
+		  bclen = 32;
+		}
 		int rc = 0;
 		while (true) {
 			in.read((char*)p, N * sizeof(BUSData));
diff --git a/src/bustools_umicorrect.cpp b/src/bustools_umicorrect.cpp
index 6efdadb..a6facbb 100644
--- a/src/bustools_umicorrect.cpp
+++ b/src/bustools_umicorrect.cpp
@@ -231,13 +231,13 @@ void bustools_umicorrect(const Bustools_opt& opt) {
 
 	// read and parse the equivelence class files
 
-	std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
+	u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
 	std::vector<std::vector<int32_t>> ecmap;
 
-	std::unordered_map<std::string, int32_t> txnames;
+	u_map_<std::string, int32_t> txnames;
 	parseTranscripts(opt.count_txp, txnames);
 	std::vector<int32_t> genemap(txnames.size(), -1);
-	std::unordered_map<std::string, int32_t> genenames;
+	u_map_<std::string, int32_t> genenames;
 	parseGenes(opt.count_genes, txnames, genemap, genenames);
 	parseECs(opt.count_ecs, h);
 	ecmap = std::move(h.ecs);
diff --git a/src/hash.cpp b/src/hash.cpp
new file mode 100644
index 0000000..b2d18e1
--- /dev/null
+++ b/src/hash.cpp
@@ -0,0 +1,194 @@
+#include <stdint.h>
+#include <cstring>
+#include "hash.hpp"
+
+uint64_t inline _rotl64(uint64_t value, int8_t amount) {
+  return ((value) << (amount)) | ((value) >> (64 - (amount)));
+}
+
+uint32_t SuperFastHash (const char *data, int len) {
+  uint32_t hash = len, tmp;
+  int rem;
+
+  if (len <= 0 || data == NULL) { return 0; }
+
+  rem = len & 3;
+  len >>= 2;
+
+  /* Main loop */
+  for (; len > 0; len--) {
+    hash  += get16bits (data);
+    tmp    = (get16bits (data+2) << 11) ^ hash;
+    hash   = (hash << 16) ^ tmp;
+    data  += 2*sizeof (uint16_t);
+    hash  += hash >> 11;
+  }
+
+  /* Handle end cases */
+  switch (rem) {
+  case 3: hash += get16bits (data);
+    hash ^= hash << 16;
+    hash ^= data[sizeof (uint16_t)] << 18;
+    hash += hash >> 11;
+    break;
+  case 2: hash += get16bits (data);
+    hash ^= hash << 11;
+    hash += hash >> 17;
+    break;
+  case 1: hash += *data;
+    hash ^= hash << 10;
+    hash += hash >> 1;
+  }
+
+  /* Force "avalanching" of final 127 bits */
+  hash ^= hash << 3;
+  hash += hash >> 5;
+  hash ^= hash << 4;
+  hash += hash >> 17;
+  hash ^= hash << 25;
+  hash += hash >> 6;
+
+  return hash;
+}
+
+
+
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+inline uint64_t getblock ( const uint64_t *p, int i ) {
+  return p[i];
+}
+
+//----------
+// Block mix - combine the key bits with the hash bits and scramble everything
+
+inline void bmix64 ( uint64_t& h1, uint64_t& h2, uint64_t& k1, uint64_t& k2, uint64_t& c1, uint64_t& c2 ) {
+  k1 *= c1;
+  k1  = _rotl64(k1,23);
+  k1 *= c2;
+  h1 ^= k1;
+  h1 += h2;
+
+  h2 = _rotl64(h2,41);
+
+  k2 *= c2;
+  k2  = _rotl64(k2,23);
+  k2 *= c1;
+  h2 ^= k2;
+  h2 += h1;
+
+  h1 = h1*3+0x52dce729;
+  h2 = h2*3+0x38495ab5;
+
+  c1 = c1*5+0x7b7d159c;
+  c2 = c2*5+0x6bce6396;
+}
+
+//----------
+// Finalization mix - avalanches all bits to within 0.05% bias
+
+inline uint64_t fmix64 ( uint64_t k ) {
+  k ^= k >> 33;
+  k *= 0xff51afd7ed558ccd;
+  k ^= k >> 33;
+  k *= 0xc4ceb9fe1a85ec53;
+  k ^= k >> 33;
+
+  return k;
+}
+
+void MurmurHash3_x64_128 ( const void *key, const int len, const uint32_t seed, void *out ) {
+  const uint8_t *data = (const uint8_t *)key;
+  const int nblocks = len / 16;
+
+  uint64_t h1 = 0x9368e53c2f6af274 ^ seed;
+  uint64_t h2 = 0x586dcd208f7cd3fd ^ seed;
+
+  uint64_t c1 = 0x87c37b91114253d5;
+  uint64_t c2 = 0x4cf5ad432745937f;
+
+  //----------
+  // body
+
+  const uint64_t *blocks = (const uint64_t *)(data);
+
+  for(int i = 0; i < nblocks; i++) {
+    uint64_t k1 = getblock(blocks,i*2+0);
+    uint64_t k2 = getblock(blocks,i*2+1);
+
+    bmix64(h1,h2,k1,k2,c1,c2);
+  }
+
+  //----------
+  // tail
+
+  const uint8_t *tail = (const uint8_t *)(data + nblocks*16);
+
+  uint64_t k1 = 0;
+  uint64_t k2 = 0;
+
+  switch(len & 15) {
+  case 15: k2 ^= uint64_t(tail[14]) << 48;
+  case 14: k2 ^= uint64_t(tail[13]) << 40;
+  case 13: k2 ^= uint64_t(tail[12]) << 32;
+  case 12: k2 ^= uint64_t(tail[11]) << 24;
+  case 11: k2 ^= uint64_t(tail[10]) << 16;
+  case 10: k2 ^= uint64_t(tail[ 9]) << 8;
+  case  9: k2 ^= uint64_t(tail[ 8]) << 0;
+
+  case  8: k1 ^= uint64_t(tail[ 7]) << 56;
+  case  7: k1 ^= uint64_t(tail[ 6]) << 48;
+  case  6: k1 ^= uint64_t(tail[ 5]) << 40;
+  case  5: k1 ^= uint64_t(tail[ 4]) << 32;
+  case  4: k1 ^= uint64_t(tail[ 3]) << 24;
+  case  3: k1 ^= uint64_t(tail[ 2]) << 16;
+  case  2: k1 ^= uint64_t(tail[ 1]) << 8;
+  case  1: k1 ^= uint64_t(tail[ 0]) << 0;
+    bmix64(h1,h2,k1,k2,c1,c2);
+  };
+
+  //----------
+  // finalization
+
+  h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  h2 += h1;
+
+  ((uint64_t *)out)[0] = h1;
+  ((uint64_t *)out)[1] = h2;
+}
+
+//-----------------------------------------------------------------------------
+// If we need a smaller hash value, it's faster to just use a portion of the
+// 128-bit hash
+
+void MurmurHash3_x64_32 ( const void *key, int len, uint32_t seed, void *out ) {
+  uint32_t temp[4];
+
+  MurmurHash3_x64_128(key,len,seed,temp);
+
+  *(uint32_t *)out = temp[0];
+}
+
+//----------
+
+void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out ) {
+  uint64_t temp[2];
+
+  MurmurHash3_x64_128(key,len,seed,temp);
+
+  *(uint64_t *)out = temp[0];
+}
+
+//-----------------------------------------------------------------------------
+
diff --git a/src/hash.hpp b/src/hash.hpp
new file mode 100644
index 0000000..ab2c0d1
--- /dev/null
+++ b/src/hash.hpp
@@ -0,0 +1,22 @@
+#ifndef HASH_H
+#define HASH_H
+
+#include <stdint.h> /* Replace with <stdint.h> if appropriate */
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
+  || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#endif
+
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+                       +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+
+uint32_t SuperFastHash (const char *data, int len);
+
+//void MurmurHash3_x64_32 ( const void * key, int len, uint32_t seed, void * out );
+void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out );
+
+#endif
+