Merge branch 'develop'

Merge in changes for the 1.2.0 release
COMBINE-lab · Apr 10, 2020 · 9434ead · 9434ead
2 parents 7c5e864 + 6c75426
commit 9434ead
Show file tree

Hide file tree

Showing 50 changed files with 1,635 additions and 3,079 deletions.
diff --git a/.drone/build.sh b/.drone/build.sh
@@ -12,7 +12,7 @@ cd build
 
 echo "[Drone build] cmake configuration"
 
-cmake -DDO_QUIET_MAKE=TRUE -DBOOST_ROOT=/usr -DNO_IPO=TRUE ..
+cmake -DDO_QUIET_MAKE=TRUE -DBOOST_ROOT=/usr -DNO_IPO=TRUE -DCMAKE_BUILD_TYPE=RELEASE ..
 
 echo "[Drone build] making salmon and installing locally (this could take a while)"
 

diff --git a/.travis.yml b/.travis.yml
@@ -45,6 +45,7 @@ script:
   # VERBOSE=1 to show the input commands in generatd Makefile for debug.
   - |
     cmake \
+      -DCMAKE_BUILD_TYPE=RELEASE \
       -DFETCH_BOOST=TRUE \
       -DNO_RTM=TRUE \
       .. && \

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -42,8 +42,8 @@ message("version: ${CPACK_PACKAGE_VERSION}")
 set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION})
 set(CPACK_GENERATOR "TGZ")
 set(CPACK_SOURCE_GENERATOR "TGZ")
-set(CPACK_PACKAGE_VENDOR "Stony Brook University")
-set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Salmon - Wicked-fast RNA-seq isoform quantification using lightweight mapping")
+set(CPACK_PACKAGE_VENDOR "University of Maryland")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Salmon - Wicked-fast RNA-seq isoform quantification using selective alignment")
 set(CPACK_PACKAGE_NAME
   "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
 set(CPACK_SOURCE_PACKAGE_FILE_NAME
@@ -59,10 +59,12 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
       STRING "Choose the type of build." FORCE)
   # Set the possible values of build type for cmake-gui
-  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
-    "Debug" "Release")
+  #set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+  #  "Debug" "Release")
 endif()
 
+message(STATUS "CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}")
+
 ## Set the standard required compile flags
 # Nov 18th --- removed -DHAVE_CONFIG_H
 set(REMOVE_WARNING_FLAGS "-Wno-unused-function;-Wno-unused-local-typedefs")
@@ -288,8 +290,6 @@ if(NOT FETCHED_PUFFERFISH)
   set(FETCHED_PUFFERFISH TRUE CACHE BOOL "Has pufferfish been fetched?" FORCE)
 endif()
 
-
-
 ##
 # Super-secret override
 ##
@@ -459,23 +459,25 @@ elseif(FETCH_BOOST)
     --with-timer)
   set(BOOST_WILL_RECONFIGURE TRUE)
   set(FETCH_BOOST FALSE)
+  set(BOOST_FETCHED_VERSION "1_72_0")
   message("Build system will fetch and build Boost")
   message("==================================================================")
   externalproject_add(libboost
     DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    DOWNLOAD_COMMAND curl -k -L https://dl.bintray.com/boostorg/release/1.71.0/source/boost_1_71_0.tar.gz -o boost_1_71_0.tar.gz &&
-      ${SHASUM} 96b34f7468f26a141f6020efb813f1a2f3dfb9797ecf76a7d7cbd843cc95f5bd boost_1_71_0.tar.gz &&
-      tar xzf boost_1_71_0.tar.gz
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0
+    DOWNLOAD_COMMAND curl -k -L https://sourceforge.net/projects/boost/files/boost/1.72.0/boost_1_72_0.tar.gz/download -o boost_1_72_0.tar.gz &&
+      #${SHASUM} 96b34f7468f26a141f6020efb813f1a2f3dfb9797ecf76a7d7cbd843cc95f5bd boost_1_71_0.tar.gz &&
+      ${SHASUM} c66e88d5786f2ca4dbebb14e06b566fb642a1a6947ad8cc9091f9f445134143f boost_${BOOST_FETCHED_VERSION}.tar.gz &&
+      tar xzf boost_${BOOST_FETCHED_VERSION}.tar.gz 
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     #PATCH_COMMAND patch -p2 < ${CMAKE_CURRENT_SOURCE_DIR}/external/boost156.patch
-    CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/bootstrap.sh ${BOOST_CONFIGURE_TOOLSET} ${BOOST_BUILD_LIBS} --prefix=<INSTALL_DIR>
+    CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/bootstrap.sh ${BOOST_CONFIGURE_TOOLSET} ${BOOST_BUILD_LIBS} --prefix=<INSTALL_DIR>
     add_custom_command(
-      OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/tools/build/src/user-config.jam
+      OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/tools/build/src/user-config.jam
       PRE_BUILD
       COMMAND echo "using gcc : ${CC_VERSION} : ${CMAKE_CXX_COMPILER} ;"
     )
-    BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/b2 -d0 -j${BOOST_BUILD_THREADS} ${BOOST_LIB_SUBSET} toolset=${BOOST_TOOLSET} ${BOOST_EXTRA_FLAGS} cxxflags=${BOOST_CXX_FLAGS} link=static install
+    BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/b2 -d0 -j${BOOST_BUILD_THREADS} ${BOOST_LIB_SUBSET} toolset=${BOOST_TOOLSET} ${BOOST_EXTRA_FLAGS} cxxflags=${BOOST_CXX_FLAGS} link=static install
       BUILD_IN_SOURCE 1
       INSTALL_COMMAND ""
   )
@@ -610,7 +612,7 @@ endif()
 message("Build system will fetch and build Intel Threading Building Blocks")
 message("==================================================================")
 # These are useful for the custom install step we'll do later
-set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2019_U8)
+set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/oneTBB-2019_U8)
 set(TBB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
 
 if("${TBB_COMPILER}" STREQUAL "gcc")
@@ -624,9 +626,9 @@ set(TBB_CXXFLAGS "${TBB_CXXFLAGS} ${CXXSTDFLAG}")
 externalproject_add(libtbb
   DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
   DOWNLOAD_COMMAND curl -k -L https://github.com/intel/tbb/archive/2019_U8.tar.gz -o tbb-2019_U8.tgz &&
-    ${SHASUM} 7b1fd8caea14be72ae4175896510bf99c809cd7031306a1917565e6de7382fba tbb-2019_U8.tgz &&
+    ${SHASUM} 6b540118cbc79f9cbc06a35033c18156c21b84ab7b6cf56d773b168ad2b68566 tbb-2019_U8.tgz &&
     tar -xzvf tbb-2019_U8.tgz
-  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2019_U8
+  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/oneTBB-2019_U8 
   INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
   PATCH_COMMAND "${TBB_PATCH_STEP}"
   CONFIGURE_COMMAND ""

diff --git a/doc/source/salmon.rst b/doc/source/salmon.rst
@@ -16,33 +16,32 @@ step, obviously, is specific to the set of RNA-seq reads and is thus run more
 frequently. For a more complete description of all available options in Salmon,
 see below.
 
-.. note:: Mapping validation in mapping-based mode
-
-   Selective alignment, enabled by the ``--validateMappings`` flag, is a major
-   feature enhancement introduced in recent versions of salmon. When salmon is
-   run with selective alignment, it adopts a considerably more sensitive scheme
-   that we have developed for finding the potential mapping loci of a read, and
-   score potential mapping loci using the chaining algorithm introdcued in
-   minimap2 [#minimap2]_. It scores and validates these mappings using
-   the score-only, SIMD, dynamic programming algorithm of ksw2 [#ksw2]_.
-   Finally, we recommend using selective alignment with a *decoy-aware* transcriptome,
-   to mitigate potential spurious mapping of reads that actually arise from some
-   unannotated genomic locus that is sequence-similar to an annotated transcriptome.
-   The selective-alignment algorithm, the use of a decoy-aware transcriptome, and
-   the influence of running salmon with different mapping and alignment strategies
-   is covered in detail in the paper `Alignment and mapping methodology influence transcript abundance estimation <https://www.biorxiv.org/content/10.1101/657874v1>`_.
+.. note:: Selective alignment
+
+   Selective alignment, first introduced by the ``--validateMappings`` flag
+   in salmon, and now the default mapping strategy (in version 1.0.0
+   forward), is a major feature enhancement introduced in recent versions of
+   salmon. When salmon is run with selective alignment, it adopts a
+   considerably more sensitive scheme that we have developed for finding the
+   potential mapping loci of a read, and scores potential mapping loci using
+   the chaining algorithm introduced in minimap2 [#minimap2]_. It scores and
+   validates these mappings using the score-only, SIMD, dynamic programming
+   algorithm of ksw2 [#ksw2]_. Finally, we recommend using selective
+   alignment with a *decoy-aware* transcriptome, to mitigate potential
+   spurious mapping of reads that actually arise from some unannotated
+   genomic locus that is sequence-similar to an annotated transcriptome. The
+   selective-alignment algorithm, the use of a decoy-aware transcriptome, and
+   the influence of running salmon with different mapping and alignment
+   strategies are covered in detail in the paper `Alignment and mapping methodology influence transcript abundance estimation <https://www.biorxiv.org/content/10.1101/657874v1>`_.
 
    The use of selective alignment implies the use of range factorization, as mapping
    scores become very meaningful with this option. Selective alignment can
    improve the accuracy, sometimes considerably, over the faster, but
-   less-precise default mapping algorithm. As of salmon v0.13.1, we highly
-   recommend all users adopt selective alignment unless they have a specific
-   reason to avoid it. It is likely that this option will be enabled by default
-   in a future release. Also, there are a number of options and flags that allow
-   the user to control details about how the scoring is carried out, including
-   setting match, mismatch, and gap scores, and choosing the minimum score
-   below which an alignment will be considered invalid, and therefore not
-   used for the purposes of quantification. 
+   less-precise mapping algorithm that was previously used.  Also, there are a number of 
+   options and flags that allow the user to control details about how the scoring is 
+   carried out, including setting match, mismatch, and gap scores, and choosing the minimum 
+   score below which an alignment will be considered invalid, and therefore not used for the
+   purposes of quantification. 
 
 The **alignment**-based mode of Salmon does not require indexing.  Rather, you can 
 simply provide Salmon with a FASTA file of the transcripts and a SAM/BAM file
@@ -105,21 +104,33 @@ Preparing transcriptome indices (mapping-based mode)
 
 One of the novel and innovative features of Salmon is its ability to accurately
 quantify transcripts without having previously aligned the reads using its fast,
-built-in mapping algorithms (either *quasi-mapping* or *selective alignment*).
-These approaches are typically **much** faster to compute than traditional (or
-full) alignments. Further details about the selective alignment algorithm can be
-found `here <https://www.biorxiv.org/content/10.1101/657874v1>`_ and more
-details about quasi-mapping can be found `in this paper <http://bioinformatics.oxfordjournals.org/content/32/12/i192.full>`_.
+built-in selective-alignment mapping algorithm. Further details about the selective alignment algorithm can be
+found `here <https://www.biorxiv.org/content/10.1101/657874v1>`_.
 
 If you want to use Salmon in mapping-based mode, then you first have to build a
 salmon index for your transcriptome. Assume that ``transcripts.fa`` contains the
 set of transcripts you wish to quantify. We generally recommend that you build a
-*decoy-aware* transcriptome file and do quantification using selective alignment, which can be done with the
-`generateDecoyTranscriptome.sh
-<https://github.com/COMBINE-lab/SalmonTools/blob/master/scripts/generateDecoyTranscriptome.sh>`_
-script, whose instructions you can find `in this README 
-<https://github.com/COMBINE-lab/SalmonTools/blob/master/README.md>`_. First, you
-run the salmon indexer:
+*decoy-aware* transcriptome file. 
+
+There are two options for generating a decoy-aware transcriptome:
+
+- The first is to compute a set of decoy sequences by mapping the annotated transcripts you wish to index
+  against a hard-masked version of the organism's genome.  This can be done with e.g. 
+  `MashMap2  <https://github.com/marbl/MashMap>`_, and we provide some simple scripts to 
+  greatly simplify this whole process.  Specifically, you can use the 
+  `generateDecoyTranscriptome.sh <https://github.com/COMBINE-lab/SalmonTools/blob/master/scripts/generateDecoyTranscriptome.sh>`_
+  script, whose instructions you can find `in this README <https://github.com/COMBINE-lab/SalmonTools/blob/master/README.md>`_. 
+
+- The second is to use the entire genome of the organism as the decoy sequence. This can be 
+  done by concatenating the genome to the end of the transcriptome you want to index and populating 
+  the `decoys.txt` file with the chromosome names.  Detailed instructions on how to prepare this 
+  type of decoy sequence are available `here <https://combine-lab.github.io/alevin-tutorial/2019/selective-alignment/>`_.
+  This scheme provides a more comprehensive set of decoys, but, obviously, requires considerably more memory to build the index.
+
+Finally, pre-built versions of both the *partial* decoy and *full* decoy (i.e. using the whole genome) salmon indices 
+for some common organisms are available via refgenie `here <http://refgenomes.databio.org/>`_.
+
+If you are not using a pre-computed index, you run the salmon indexer as follows:
 
 ::
     
@@ -136,12 +147,6 @@ improve sensitivity even more when using selective alignment (enabled via the `-
 if you are seeing a smaller mapping rate than you might expect, consider building
 the index with a slightly smaller `k`.  
 
-.. note:: Decoy-augmented transcriptomes and quasi-mapping
-   Currently, the use of decoy-augmented transcriptomes is only supported in 
-   conjunction with selective-alignment (via the `--validateMappings`, `--mimicBT2`
-   or `--mimicStrictBT2` flags.  For the time being, if you wish to quantify using 
-   quasi-mapping, you should not build a decoy-augmented index.
-
 Quantifying in mapping-based mode
 ---------------------------------------
 

diff --git a/doc/steps_to_prepare_release.md b/doc/steps_to_prepare_release.md
@@ -12,4 +12,4 @@
  9. Add release notes for the tagged master version.
  10. Upload the pre-compiled linux binary (from the CI server) to GitHub.
  11. Place a new version file on the website and update the old one.
- 12. (not technically part of release) Reset the relevant changes (steps 1,2) on the develop branch so they now point to a non-tagged RapMap.
+ 12. (not technically part of release) Reset the relevant changes (steps 1,2) on the develop branch so they now point to a non-tagged pufferfish.
diff --git a/include/AlevinOpts.hpp b/include/AlevinOpts.hpp
@@ -19,6 +19,7 @@ struct AlevinOpts {
   AlevinOpts(): numParsingThreads(1),
                 numConsumerThreads(2),
                 useVBEM{false},
+                numNoMapCB(0),
                 initUniform{false}{}
 
   //IUPAC code for the cell-barcodes
@@ -114,6 +115,7 @@ struct AlevinOpts {
   uint32_t intelligentCutoff;
   uint32_t totalLowConfidenceCBs;
   uint32_t numFeatures;
+  uint32_t numNoMapCB;
 
   uint32_t eqReads;
   uint32_t noisyUmis;

diff --git a/include/AlevinUtils.hpp b/include/AlevinUtils.hpp
@@ -74,14 +74,19 @@ namespace alevin{
 
     template <typename ProtocolT>
     bool processAlevinOpts(AlevinOpts<ProtocolT>& aopt,
-                             SalmonOpts& sopt,
-                             boost::program_options::variables_map& vm);
+                           SalmonOpts& sopt, bool noTgMap,
+                           boost::program_options::variables_map& vm);
 
     template <typename ProtocolT>
     bool extractUMI(std::string& read,
                     ProtocolT& pt,
                     std::string& umi);
 
+    template <typename ProtocolT>
+    void getReadSequence(ProtocolT& pt,
+                         std::string& seq,
+                         std::string& subseq);
+
     template <typename ProtocolT>
     nonstd::optional<std::string> extractBarcode(std::string& read, ProtocolT& pt);
 
@@ -106,7 +111,8 @@ namespace alevin{
                          const std::string& t2gFile, const std::string& refNamesFile,
                          const std::string& refLengthFile,
                          const std::string& headerFile,
-                         std::shared_ptr<spdlog::logger>& jointLog);
+                         std::shared_ptr<spdlog::logger>& jointLog,
+                         bool noTgMap);
 
     bool checkSetCoverage(std::vector<std::vector<uint32_t>>& tgroup,
                           std::vector<uint32_t> txps);

diff --git a/include/AlignmentLibrary.hpp b/include/AlignmentLibrary.hpp
@@ -504,6 +504,10 @@ for (auto& txp : transcripts_) {
     return numDecoys_;
   }
 
+  salmon::utils::DuplicateTargetStatus index_retains_duplicates() const { 
+    return salmon::utils::DuplicateTargetStatus::UNKNOWN; 
+  }
+
 private:
 
   void setTranscriptLengthClasses_(std::vector<uint32_t>& lengths,