Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
Merge in changes for the 1.2.0 release
  • Loading branch information
Rob Patro committed Apr 10, 2020
2 parents 7c5e864 + 6c75426 commit 9434ead
Show file tree
Hide file tree
Showing 50 changed files with 1,635 additions and 3,079 deletions.
2 changes: 1 addition & 1 deletion .drone/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ cd build

echo "[Drone build] cmake configuration"

cmake -DDO_QUIET_MAKE=TRUE -DBOOST_ROOT=/usr -DNO_IPO=TRUE ..
cmake -DDO_QUIET_MAKE=TRUE -DBOOST_ROOT=/usr -DNO_IPO=TRUE -DCMAKE_BUILD_TYPE=RELEASE ..

echo "[Drone build] making salmon and installing locally (this could take a while)"

Expand Down
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ script:
# VERBOSE=1 to show the input commands in the generated Makefile for debugging.
- |
cmake \
-DCMAKE_BUILD_TYPE=RELEASE \
-DFETCH_BOOST=TRUE \
-DNO_RTM=TRUE \
.. && \
Expand Down
34 changes: 18 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ message("version: ${CPACK_PACKAGE_VERSION}")
set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_GENERATOR "TGZ")
set(CPACK_SOURCE_GENERATOR "TGZ")
set(CPACK_PACKAGE_VENDOR "Stony Brook University")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Salmon - Wicked-fast RNA-seq isoform quantification using lightweight mapping")
set(CPACK_PACKAGE_VENDOR "University of Maryland")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Salmon - Wicked-fast RNA-seq isoform quantification using selective alignment")
set(CPACK_PACKAGE_NAME
"${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
set(CPACK_SOURCE_PACKAGE_FILE_NAME
Expand All @@ -59,10 +59,12 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
STRING "Choose the type of build." FORCE)
# Set the possible values of build type for cmake-gui
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
"Debug" "Release")
#set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
# "Debug" "Release")
endif()

message(STATUS "CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}")

## Set the standard required compile flags
# Nov 18th --- removed -DHAVE_CONFIG_H
set(REMOVE_WARNING_FLAGS "-Wno-unused-function;-Wno-unused-local-typedefs")
Expand Down Expand Up @@ -288,8 +290,6 @@ if(NOT FETCHED_PUFFERFISH)
set(FETCHED_PUFFERFISH TRUE CACHE BOOL "Has pufferfish been fetched?" FORCE)
endif()



##
# Super-secret override
##
Expand Down Expand Up @@ -459,23 +459,25 @@ elseif(FETCH_BOOST)
--with-timer)
set(BOOST_WILL_RECONFIGURE TRUE)
set(FETCH_BOOST FALSE)
set(BOOST_FETCHED_VERSION "1_72_0")
message("Build system will fetch and build Boost")
message("==================================================================")
externalproject_add(libboost
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
DOWNLOAD_COMMAND curl -k -L https://dl.bintray.com/boostorg/release/1.71.0/source/boost_1_71_0.tar.gz -o boost_1_71_0.tar.gz &&
${SHASUM} 96b34f7468f26a141f6020efb813f1a2f3dfb9797ecf76a7d7cbd843cc95f5bd boost_1_71_0.tar.gz &&
tar xzf boost_1_71_0.tar.gz
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0
DOWNLOAD_COMMAND curl -k -L https://sourceforge.net/projects/boost/files/boost/1.72.0/boost_1_72_0.tar.gz/download -o boost_1_72_0.tar.gz &&
#${SHASUM} 96b34f7468f26a141f6020efb813f1a2f3dfb9797ecf76a7d7cbd843cc95f5bd boost_1_71_0.tar.gz &&
${SHASUM} c66e88d5786f2ca4dbebb14e06b566fb642a1a6947ad8cc9091f9f445134143f boost_${BOOST_FETCHED_VERSION}.tar.gz &&
tar xzf boost_${BOOST_FETCHED_VERSION}.tar.gz
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
#PATCH_COMMAND patch -p2 < ${CMAKE_CURRENT_SOURCE_DIR}/external/boost156.patch
CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/bootstrap.sh ${BOOST_CONFIGURE_TOOLSET} ${BOOST_BUILD_LIBS} --prefix=<INSTALL_DIR>
CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/bootstrap.sh ${BOOST_CONFIGURE_TOOLSET} ${BOOST_BUILD_LIBS} --prefix=<INSTALL_DIR>
add_custom_command(
OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/tools/build/src/user-config.jam
OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/tools/build/src/user-config.jam
PRE_BUILD
COMMAND echo "using gcc : ${CC_VERSION} : ${CMAKE_CXX_COMPILER} ;"
)
BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_1_71_0/b2 -d0 -j${BOOST_BUILD_THREADS} ${BOOST_LIB_SUBSET} toolset=${BOOST_TOOLSET} ${BOOST_EXTRA_FLAGS} cxxflags=${BOOST_CXX_FLAGS} link=static install
BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/external/boost_${BOOST_FETCHED_VERSION}/b2 -d0 -j${BOOST_BUILD_THREADS} ${BOOST_LIB_SUBSET} toolset=${BOOST_TOOLSET} ${BOOST_EXTRA_FLAGS} cxxflags=${BOOST_CXX_FLAGS} link=static install
BUILD_IN_SOURCE 1
INSTALL_COMMAND ""
)
Expand Down Expand Up @@ -610,7 +612,7 @@ endif()
message("Build system will fetch and build Intel Threading Building Blocks")
message("==================================================================")
# These are useful for the custom install step we'll do later
set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2019_U8)
set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/oneTBB-2019_U8)
set(TBB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install)

if("${TBB_COMPILER}" STREQUAL "gcc")
Expand All @@ -624,9 +626,9 @@ set(TBB_CXXFLAGS "${TBB_CXXFLAGS} ${CXXSTDFLAG}")
externalproject_add(libtbb
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
DOWNLOAD_COMMAND curl -k -L https://github.com/intel/tbb/archive/2019_U8.tar.gz -o tbb-2019_U8.tgz &&
${SHASUM} 7b1fd8caea14be72ae4175896510bf99c809cd7031306a1917565e6de7382fba tbb-2019_U8.tgz &&
${SHASUM} 6b540118cbc79f9cbc06a35033c18156c21b84ab7b6cf56d773b168ad2b68566 tbb-2019_U8.tgz &&
tar -xzvf tbb-2019_U8.tgz
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2019_U8
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/oneTBB-2019_U8
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
PATCH_COMMAND "${TBB_PATCH_STEP}"
CONFIGURE_COMMAND ""
Expand Down
85 changes: 45 additions & 40 deletions doc/source/salmon.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,32 @@ step, obviously, is specific to the set of RNA-seq reads and is thus run more
frequently. For a more complete description of all available options in Salmon,
see below.

.. note:: Mapping validation in mapping-based mode

Selective alignment, enabled by the ``--validateMappings`` flag, is a major
feature enhancement introduced in recent versions of salmon. When salmon is
run with selective alignment, it adopts a considerably more sensitive scheme
that we have developed for finding the potential mapping loci of a read, and
scores potential mapping loci using the chaining algorithm introduced in
minimap2 [#minimap2]_. It scores and validates these mappings using
the score-only, SIMD, dynamic programming algorithm of ksw2 [#ksw2]_.
Finally, we recommend using selective alignment with a *decoy-aware* transcriptome,
to mitigate potential spurious mapping of reads that actually arise from some
unannotated genomic locus that is sequence-similar to an annotated transcriptome.
The selective-alignment algorithm, the use of a decoy-aware transcriptome, and
the influence of running salmon with different mapping and alignment strategies
is covered in detail in the paper `Alignment and mapping methodology influence transcript abundance estimation <https://www.biorxiv.org/content/10.1101/657874v1>`_.
.. note:: Selective alignment

Selective alignment, first introduced by the ``--validateMappings`` flag
in salmon, and now the default mapping strategy (in version 1.0.0
forward), is a major feature enhancement introduced in recent versions of
salmon. When salmon is run with selective alignment, it adopts a
considerably more sensitive scheme that we have developed for finding the
potential mapping loci of a read, and scores potential mapping loci using
the chaining algorithm introduced in minimap2 [#minimap2]_. It scores and
validates these mappings using the score-only, SIMD, dynamic programming
algorithm of ksw2 [#ksw2]_. Finally, we recommend using selective
alignment with a *decoy-aware* transcriptome, to mitigate potential
spurious mapping of reads that actually arise from some unannotated
genomic locus that is sequence-similar to an annotated transcriptome. The
selective-alignment algorithm, the use of a decoy-aware transcriptome, and
the influence of running salmon with different mapping and alignment
strategies are covered in detail in the paper `Alignment and mapping methodology influence transcript abundance estimation <https://www.biorxiv.org/content/10.1101/657874v1>`_.

The use of selective alignment implies the use of range factorization, as mapping
scores become very meaningful with this option. Selective alignment can
improve the accuracy, sometimes considerably, over the faster, but
less-precise default mapping algorithm. As of salmon v0.13.1, we highly
recommend all users adopt selective alignment unless they have a specific
reason to avoid it. It is likely that this option will be enabled by default
in a future release. Also, there are a number of options and flags that allow
the user to control details about how the scoring is carried out, including
setting match, mismatch, and gap scores, and choosing the minimum score
below which an alignment will be considered invalid, and therefore not
used for the purposes of quantification.
less-precise mapping algorithm that was previously used. Also, there are a number of
options and flags that allow the user to control details about how the scoring is
carried out, including setting match, mismatch, and gap scores, and choosing the minimum
score below which an alignment will be considered invalid, and therefore not used for the
purposes of quantification.

The **alignment**-based mode of Salmon does not require indexing. Rather, you can
simply provide Salmon with a FASTA file of the transcripts and a SAM/BAM file
Expand Down Expand Up @@ -105,21 +104,33 @@ Preparing transcriptome indices (mapping-based mode)

One of the novel and innovative features of Salmon is its ability to accurately
quantify transcripts without having previously aligned the reads using its fast,
built-in mapping algorithms (either *quasi-mapping* or *selective alignment*).
These approaches are typically **much** faster to compute than traditional (or
full) alignments. Further details about the selective alignment algorithm can be
found `here <https://www.biorxiv.org/content/10.1101/657874v1>`_ and more
details about quasi-mapping can be found `in this paper <http://bioinformatics.oxfordjournals.org/content/32/12/i192.full>`_.
built-in selective-alignment mapping algorithm. Further details about the selective alignment algorithm can be
found `here <https://www.biorxiv.org/content/10.1101/657874v1>`_.

If you want to use Salmon in mapping-based mode, then you first have to build a
salmon index for your transcriptome. Assume that ``transcripts.fa`` contains the
set of transcripts you wish to quantify. We generally recommend that you build a
*decoy-aware* transcriptome file and do quantification using selective alignment, which can be done with the
`generateDecoyTranscriptome.sh
<https://github.com/COMBINE-lab/SalmonTools/blob/master/scripts/generateDecoyTranscriptome.sh>`_
script, whose instructions you can find `in this README
<https://github.com/COMBINE-lab/SalmonTools/blob/master/README.md>`_. First, you
run the salmon indexer:
*decoy-aware* transcriptome file.

There are two options for generating a decoy-aware transcriptome:

- The first is to compute a set of decoy sequences by mapping the annotated transcripts you wish to index
against a hard-masked version of the organism's genome. This can be done with e.g.
`MashMap2 <https://github.com/marbl/MashMap>`_, and we provide some simple scripts to
greatly simplify this whole process. Specifically, you can use the
`generateDecoyTranscriptome.sh <https://github.com/COMBINE-lab/SalmonTools/blob/master/scripts/generateDecoyTranscriptome.sh>`_
script, whose instructions you can find `in this README <https://github.com/COMBINE-lab/SalmonTools/blob/master/README.md>`_.

- The second is to use the entire genome of the organism as the decoy sequence. This can be
done by concatenating the genome to the end of the transcriptome you want to index and populating
the `decoys.txt` file with the chromosome names. Detailed instructions on how to prepare this
type of decoy sequence are available `here <https://combine-lab.github.io/alevin-tutorial/2019/selective-alignment/>`_.
This scheme provides a more comprehensive set of decoys, but, obviously, requires considerably more memory to build the index.

Finally, pre-built versions of both the *partial* decoy and *full* decoy (i.e. using the whole genome) salmon indices
for some common organisms are available via refgenie `here <http://refgenomes.databio.org/>`_.

If you are not using a pre-computed index, you can run the salmon indexer as follows:

::
Expand All @@ -136,12 +147,6 @@ improve sensitivity even more when using selective alignment (enabled via the `-
if you are seeing a smaller mapping rate than you might expect, consider building
the index with a slightly smaller `k`.

.. note:: Decoy-augmented transcriptomes and quasi-mapping
Currently, the use of decoy-augmented transcriptomes is only supported in
conjunction with selective-alignment (via the `--validateMappings`, `--mimicBT2`
or `--mimicStrictBT2` flags). For the time being, if you wish to quantify using
quasi-mapping, you should not build a decoy-augmented index.

Quantifying in mapping-based mode
---------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion doc/steps_to_prepare_release.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
9. Add release notes for the tagged master version.
10. Upload the pre-compiled linux binary (from the CI server) to GitHub.
11. Place a new version file on the website and update the old one.
12. (not technically part of release) Reset the relevant changes (steps 1,2) on the develop branch so they now point to a non-tagged RapMap.
12. (not technically part of release) Reset the relevant changes (steps 1,2) on the develop branch so they now point to a non-tagged pufferfish.
2 changes: 2 additions & 0 deletions include/AlevinOpts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ struct AlevinOpts {
AlevinOpts(): numParsingThreads(1),
numConsumerThreads(2),
useVBEM{false},
numNoMapCB(0),
initUniform{false}{}

//IUPAC code for the cell-barcodes
Expand Down Expand Up @@ -114,6 +115,7 @@ struct AlevinOpts {
uint32_t intelligentCutoff;
uint32_t totalLowConfidenceCBs;
uint32_t numFeatures;
uint32_t numNoMapCB;

uint32_t eqReads;
uint32_t noisyUmis;
Expand Down
12 changes: 9 additions & 3 deletions include/AlevinUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,19 @@ namespace alevin{

template <typename ProtocolT>
bool processAlevinOpts(AlevinOpts<ProtocolT>& aopt,
SalmonOpts& sopt,
boost::program_options::variables_map& vm);
SalmonOpts& sopt, bool noTgMap,
boost::program_options::variables_map& vm);

template <typename ProtocolT>
bool extractUMI(std::string& read,
ProtocolT& pt,
std::string& umi);

template <typename ProtocolT>
void getReadSequence(ProtocolT& pt,
std::string& seq,
std::string& subseq);

template <typename ProtocolT>
nonstd::optional<std::string> extractBarcode(std::string& read, ProtocolT& pt);

Expand All @@ -106,7 +111,8 @@ namespace alevin{
const std::string& t2gFile, const std::string& refNamesFile,
const std::string& refLengthFile,
const std::string& headerFile,
std::shared_ptr<spdlog::logger>& jointLog);
std::shared_ptr<spdlog::logger>& jointLog,
bool noTgMap);

bool checkSetCoverage(std::vector<std::vector<uint32_t>>& tgroup,
std::vector<uint32_t> txps);
Expand Down
4 changes: 4 additions & 0 deletions include/AlignmentLibrary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,10 @@ for (auto& txp : transcripts_) {
return numDecoys_;
}

// Reports the duplicate-target status as seen by this class. This
// implementation unconditionally returns DuplicateTargetStatus::UNKNOWN;
// it inspects no member state and is const.
salmon::utils::DuplicateTargetStatus index_retains_duplicates() const {
return salmon::utils::DuplicateTargetStatus::UNKNOWN;
}

private:

void setTranscriptLengthClasses_(std::vector<uint32_t>& lengths,
Expand Down
Loading

0 comments on commit 9434ead

Please sign in to comment.