From ef45d8513d83bc96cf6f0806b6e90500b1ae22fb Mon Sep 17 00:00:00 2001 From: Dongze He <32473855+DongzeHE@users.noreply.github.com> Date: Thu, 5 May 2022 16:11:10 -0400 Subject: [PATCH 01/10] adding slash between $base and file pattern --- scripts/v1_10x/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/v1_10x/run.sh b/scripts/v1_10x/run.sh index c0a5bb344..fd6fc462d 100755 --- a/scripts/v1_10x/run.sh +++ b/scripts/v1_10x/run.sh @@ -13,8 +13,8 @@ p2="$tmpdir/p2.fa" mkfifo $p1 mkfifo $p2 -i1=`ls $base*I1*` -wrapper <(cat $base*I1*) <(cat $base*RA*) >> $p1 2>> $p2 & +i1=`ls $base/*I1*` +wrapper <(cat $base/*I1*) <(cat $base/*RA*) >> $p1 2>> $p2 & echo "Running command [${new_cmd} -1 $p1 -2 $p2 -r $i1]" ${new_cmd} -1 $p1 -2 $p2 -r $i1 From f1328ec66c3d88201ae4196503c49973e26ceb33 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 9 Aug 2022 16:53:01 -0400 Subject: [PATCH 02/10] refactor: get rid of redundant compact_vector, propagate property --- CMakeLists.txt | 11 ++++++++++- include/Transcript.hpp | 3 +-- scripts/fetchPufferfish.sh | 7 ++++--- src/CMakeLists.txt | 6 +++++- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cf2061e7a..da0e4180e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.15) if(DEFINED ENV{CC}) set(CC $ENV{CC}) @@ -906,11 +906,20 @@ set(CPACK_SOURCE_IGNORE_FILES message("CPACK_SOURCE_IGNORE_FILES = ${CPACK_SOURCE_IGNORE_FILES}") +# we will use this property later +define_property(TARGET PROPERTY COMPACT_VECTOR_DIR INHERITED + BRIEF_DOCS "the path to the directory containing the compact_vector include tree" + FULL_DOCS "the path to the directory containing the compact_vector include tree") + # Recurse into pufferfish source directory # and build the library set(BUILD_PUFF_FOR_SALMON TRUE) add_subdirectory(external/pufferfish) +# make sure we know the path to compact_vector +get_property(COMPACT_VECTOR_INCLUDE_PATH TARGET graphdump PROPERTY COMPACT_VECTOR_DIR) +message("fetched path for compact_vector as [${COMPACT_VECTOR_INCLUDE_PATH}]") + # and then the main salmon source directory add_subdirectory(src) diff --git a/include/Transcript.hpp b/include/Transcript.hpp index df19890cd..42f646015 100644 --- a/include/Transcript.hpp +++ b/include/Transcript.hpp @@ -14,8 +14,7 @@ #include #include -//#include "rapmap/bit_array.h" -#include "pufferfish/compact_vector/compact_vector.hpp" +#include "compact_vector/compact_vector.hpp" #include "pufferfish/rank9b.hpp" class Transcript { diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index 4c63ef5c5..8c737f088 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -23,8 +23,8 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then rm -fr ${INSTALL_DIR}/src/pufferfish fi -SVER=salmon-v1.9.0 -#SVER=develop +#SVER=salmon-v1.9.0 +SVER=develop #SVER=sketch-mode EXPECTED_SHA256=2a862daeff95a19c9b188bc26a5d02fc0ecc5b9c9ae5523a67c9d14e62551c5d @@ -92,7 +92,8 @@ cp ${EXTERNAL_DIR}/pufferfish/include/BinWriter.hpp ${INSTALL_DIR}/include/puffe cp -r ${EXTERNAL_DIR}/pufferfish/include/libdivide ${INSTALL_DIR}/include/pufferfish cp -r ${EXTERNAL_DIR}/pufferfish/include/ksw2pp ${INSTALL_DIR}/include/pufferfish -cp -r ${EXTERNAL_DIR}/pufferfish/include/compact_vector ${INSTALL_DIR}/include/pufferfish +# this is now automatically tracked and inherited via twopaco (on which libpuffer depends) +# cp -r ${EXTERNAL_DIR}/pufferfish/include/compact_vector ${INSTALL_DIR}/include/pufferfish cp -r ${EXTERNAL_DIR}/pufferfish/include/metro ${INSTALL_DIR}/include/pufferfish cp -r ${EXTERNAL_DIR}/pufferfish/include/itlib ${INSTALL_DIR}/include/pufferfish cp -r ${EXTERNAL_DIR}/pufferfish/include/sparsepp ${INSTALL_DIR}/include/pufferfish diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bec29f8ea..dafffedd8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -141,7 +141,6 @@ else() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) endif() - set (TGT_RELEASE_FLAGS "${TGT_COMPILE_FLAGS};${TGT_WARN_FLAGS}") set (TGT_DEBUG_FLAGS "-g;${TGT_COMPILE_FLAGS};${TGT_WARN_FLAGS}") @@ -155,6 +154,8 @@ HAVE_SSTREAM=1 STX_NO_STD_STRING_VIEW=1 span_FEATURE_MAKE_SPAN_TO_STD=14 ) +target_include_directories(salmon_core PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH}) + if (USE_ARM) target_compile_definitions(salmon_core PUBLIC KSW_USE_ARM=1) @@ -176,6 +177,7 @@ HAVE_ANSI_TERM=1 HAVE_SSTREAM=1 span_FEATURE_MAKE_SPAN_TO_STD=14 ) +target_include_directories(alevin_core PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH}) if (USE_ARM) target_compile_definitions(alevin_core PUBLIC KSW_USE_ARM=1) @@ -190,6 +192,7 @@ endif() # Build the salmon executable add_executable(salmon ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS}) +target_include_directories(salmon PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH}) if(HAS_IPO AND (NOT NO_IPO)) set_property(TARGET salmon PROPERTY INTERPROCEDURAL_OPTIMIZATION True) @@ -202,6 +205,7 @@ target_compile_options(UnitTestsMain PUBLIC "$<$:${TGT_RELEASE_F add_executable(unitTests ${UNIT_TESTS_INDIVIDUAL_SRCS} ${GAT_SOURCE_DIR}/tests/catch.hpp) target_compile_options(unitTests PUBLIC "$<$:${TGT_DEBUG_FLAGS}>") target_compile_options(unitTests PUBLIC "$<$:${TGT_RELEASE_FLAGS}>") +target_include_directories(unitTests PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH}) #add_executable(salmon-read ${SALMON_READ_SRCS}) #set_target_properties(salmon-read PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_LIBPTHREAD -D_PBGZF_USE -fopenmp" From d674aa2871eaecd771cce8142951340ff7785edf Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Mon, 5 Sep 2022 15:27:10 -0400 Subject: [PATCH 03/10] some refactoring of sc mapping code to prepare for feature merges --- include/AlevinOpts.hpp | 4 + include/SalmonMappingUtils.hpp | 1097 +++++++++++++++++++++++++------- src/SalmonAlevin.cpp | 410 ++---------- 3 files changed, 924 insertions(+), 587 deletions(-) diff --git a/include/AlevinOpts.hpp b/include/AlevinOpts.hpp index 05e0a593d..68400149b 100644 --- a/include/AlevinOpts.hpp +++ b/include/AlevinOpts.hpp @@ -13,6 +13,10 @@ enum class BarcodeEnd { FIVE = 5, THREE = 3 }; enum class Sequence { BARCODE, UMI }; +// refers to which reads to use for mapping; use both for 5' libraries +enum class ReadsToMap { USE_BOTH, USE_FIRST, USE_SECOND }; +// refers to which reads map to transcripts and to which transcripts they map to +enum class PairingStatus { UNPAIRED_LEFT, UNPAIRED_RIGHT, PAIRED_FR, PAIRED_RF }; template struct AlevinOpts { diff --git a/include/SalmonMappingUtils.hpp b/include/SalmonMappingUtils.hpp index 7e143dfdc..66810430d 100644 --- a/include/SalmonMappingUtils.hpp +++ b/include/SalmonMappingUtils.hpp @@ -22,7 +22,6 @@ #ifndef __SALMON_MAPPING_UTILS__ #define __SALMON_MAPPING_UTILS__ - #include #include #include @@ -39,107 +38,122 @@ // Future C++ convenience classes #include "core/range.hpp" -#include "SalmonDefaults.hpp" -#include "SalmonMath.hpp" -#include "SalmonUtils.hpp" -#include "Transcript.hpp" +#include "AlevinOpts.hpp" #include "AlignmentGroup.hpp" #include "ProgramOptionsGenerator.hpp" #include "ReadExperiment.hpp" +#include "SalmonDefaults.hpp" +#include "SalmonMath.hpp" #include "SalmonOpts.hpp" +#include "SalmonUtils.hpp" +#include "Transcript.hpp" -#include "pufferfish/Util.hpp" -#include "pufferfish/MemCollector.hpp" +#include "parallel_hashmap/phmap.h" #include "pufferfish/MemChainer.hpp" -#include "pufferfish/SAMWriter.hpp" +#include "pufferfish/MemCollector.hpp" #include "pufferfish/PuffAligner.hpp" -#include "pufferfish/ksw2pp/KSW2Aligner.hpp" -#include "pufferfish/metro/metrohash64.h" +#include "pufferfish/SAMWriter.hpp" #include "pufferfish/SelectiveAlignmentUtils.hpp" +#include "pufferfish/Util.hpp" #include "pufferfish/itlib/small_vector.hpp" -#include "parallel_hashmap/phmap.h" +#include "pufferfish/ksw2pp/KSW2Aligner.hpp" +#include "pufferfish/metro/metrohash64.h" namespace salmon { - namespace mapping_utils { +namespace mapping_utils { - using MateStatus = pufferfish::util::MateStatus; - constexpr const int32_t invalid_score_ = std::numeric_limits::min(); - constexpr const int32_t invalid_index_ = std::numeric_limits::min(); - constexpr const size_t static_vec_size = 32; +using MateStatus = pufferfish::util::MateStatus; +constexpr const int32_t invalid_score_ = std::numeric_limits::min(); +constexpr const int32_t invalid_index_ = std::numeric_limits::min(); +constexpr const size_t static_vec_size = 32; - class MappingScoreInfo { - public: - MappingScoreInfo() - : bestScore(invalid_score_), secondBestScore(invalid_score_), - bestDecoyScore(invalid_score_), decoyThresh(1.0), collect_decoy_info_(false) {} +class MappingScoreInfo { +public: + MappingScoreInfo() + : bestScore(invalid_score_), secondBestScore(invalid_score_), + bestDecoyScore(invalid_score_), decoyThresh(1.0), + collect_decoy_info_(false) {} - MappingScoreInfo(double decoyThreshIn) : MappingScoreInfo() { - decoyThresh = decoyThreshIn; - } + MappingScoreInfo(double decoyThreshIn) : MappingScoreInfo() { + decoyThresh = decoyThreshIn; + } - void collect_decoys(bool do_collect) { collect_decoy_info_ = do_collect; } - bool collect_decoys() const { return collect_decoy_info_; } - - // clear everything but the decoy threshold - void clear(size_t num_hits) { - bestScore = invalid_score_; - secondBestScore = invalid_score_; - bestDecoyScore = invalid_score_; - scores_.clear(); scores_.resize(num_hits, invalid_score_); - bestScorePerTranscript_.clear(); - perm_.clear(); - // NOTE: we do _not_ reset decoyThresh here - if (collect_decoy_info_) { - bool revert_to_static = best_decoy_hits.size() > static_vec_size; - best_decoy_hits.clear(); - if (revert_to_static) { best_decoy_hits.revert_to_static(); } - } - } + void collect_decoys(bool do_collect) { collect_decoy_info_ = do_collect; } + bool collect_decoys() const { return collect_decoy_info_; } - bool haveOnlyDecoyMappings() const { - // if the best non-decoy mapping has score less than decoyThresh * - // bestDecoyScore and if the bestDecoyScore is a valid value, then we - // have no valid non-decoy mappings. - return (bestScore < static_cast(decoyThresh * bestDecoyScore)) and - (bestDecoyScore > std::numeric_limits::min()); + // clear everything but the decoy threshold + void clear(size_t num_hits) { + bestScore = invalid_score_; + secondBestScore = invalid_score_; + bestDecoyScore = invalid_score_; + scores_.clear(); + scores_.resize(num_hits, invalid_score_); + bestScorePerTranscript_.clear(); + perm_.clear(); + // NOTE: we do _not_ reset decoyThresh here + if (collect_decoy_info_) { + bool revert_to_static = best_decoy_hits.size() > static_vec_size; + best_decoy_hits.clear(); + if (revert_to_static) { + best_decoy_hits.revert_to_static(); } - - inline bool update_decoy_mappings(int32_t hitScore, size_t idx, uint32_t tid) { - const bool better_score = hitScore > bestDecoyScore; - if (hitScore > bestDecoyScore) { - bestDecoyScore = hitScore; - if (collect_decoy_info_) { - best_decoy_hits.clear(); - best_decoy_hits.push_back(std::make_pair(static_cast(idx), static_cast(tid))); - } - } else if (collect_decoy_info_ and (hitScore == bestDecoyScore)){ - best_decoy_hits.push_back(std::make_pair(static_cast(idx), static_cast(tid))); - } - return better_score; + } + } + + bool haveOnlyDecoyMappings() const { + // if the best non-decoy mapping has score less than decoyThresh * + // bestDecoyScore and if the bestDecoyScore is a valid value, then we + // have no valid non-decoy mappings. + return (bestScore < static_cast(decoyThresh * bestDecoyScore)) and + (bestDecoyScore > + std::numeric_limits::min()); + } + + inline bool update_decoy_mappings(int32_t hitScore, size_t idx, + uint32_t tid) { + const bool better_score = hitScore > bestDecoyScore; + if (hitScore > bestDecoyScore) { + bestDecoyScore = hitScore; + if (collect_decoy_info_) { + best_decoy_hits.clear(); + best_decoy_hits.push_back(std::make_pair(static_cast(idx), + static_cast(tid))); } + } else if (collect_decoy_info_ and (hitScore == bestDecoyScore)) { + best_decoy_hits.push_back( + std::make_pair(static_cast(idx), static_cast(tid))); + } + return better_score; + } - int32_t bestScore; - int32_t secondBestScore; - int32_t bestDecoyScore; - double decoyThresh; - itlib::small_vector> best_decoy_hits; - bool collect_decoy_info_; - std::vector scores_; - phmap::flat_hash_map> bestScorePerTranscript_; - std::vector> perm_; - }; + int32_t bestScore; + int32_t secondBestScore; + int32_t bestDecoyScore; + double decoyThresh; + itlib::small_vector> best_decoy_hits; + bool collect_decoy_info_; + std::vector scores_; + phmap::flat_hash_map> + bestScorePerTranscript_; + std::vector> perm_; +}; template -inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector& memCollector, ksw2pp::KSW2Aligner& aligner, - pufferfish::util::AlignmentConfig& aconf, pufferfish::util::MappingConstraintPolicy& mpol) { +inline bool +initMapperSettings(SalmonOpts& salmonOpts, MemCollector& memCollector, + ksw2pp::KSW2Aligner& aligner, + pufferfish::util::AlignmentConfig& aconf, + pufferfish::util::MappingConstraintPolicy& mpol) { memCollector.configureMemClusterer(salmonOpts.maxOccsPerHit); - double consensusFraction = (salmonOpts.consensusSlack == 0.0) ? 1.0 : (1.0 - salmonOpts.consensusSlack); + double consensusFraction = (salmonOpts.consensusSlack == 0.0) + ? 1.0 + : (1.0 - salmonOpts.consensusSlack); memCollector.setConsensusFraction(consensusFraction); memCollector.setHitFilterPolicy(salmonOpts.hitFilterPolicy); memCollector.setAltSkip(salmonOpts.mismatchSeedSkip); memCollector.setChainSubOptThresh(salmonOpts.pre_merge_chain_sub_thresh); - //Initialize ksw aligner + // Initialize ksw aligner ksw2pp::KSW2Config config; config.dropoff = -1; config.gapo = salmonOpts.gapOpenPenalty; @@ -166,14 +180,17 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector& mem aconf.mimicBT2 = salmonOpts.mimicBT2; aconf.mimicBT2Strict = salmonOpts.mimicStrictBT2; aconf.allowOverhangSoftclip = salmonOpts.softclipOverhangs; - aconf.allowSoftclip = salmonOpts.softclip; + aconf.allowSoftclip = salmonOpts.softclip; aconf.useAlignmentCache = !salmonOpts.disableAlignmentCache; aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::SCORE_ONLY; - // we actually care about the softclips in the cigar string - // if we are writing output and softclipping (or softclipping of overhangs) is enabled - if ( (!salmonOpts.qmFileName.empty()) and (salmonOpts.softclip or salmonOpts.softclipOverhangs) ) { - aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR; + // we actually care about the softclips in the cigar string + // if we are writing output and softclipping (or softclipping of overhangs) is + // enabled + if ((!salmonOpts.qmFileName.empty()) and + (salmonOpts.softclip or salmonOpts.softclipOverhangs)) { + aconf.alignmentMode = + pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR; } mpol.noOrphans = !salmonOpts.allowOrphans; @@ -189,25 +206,26 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector& mem mpol.setPostMergeChainSubThresh(salmonOpts.post_merge_chain_sub_thresh); mpol.setOrphanChainSubThresh(salmonOpts.orphan_chain_sub_thresh); - + return true; } - -inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, size_t idx, - const std::vector& transcripts, - int32_t invalidScore, - salmon::mapping_utils::MappingScoreInfo& msi) { +inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, + size_t idx, + const std::vector& transcripts, + int32_t invalidScore, + salmon::mapping_utils::MappingScoreInfo& msi) { auto& scores = msi.scores_; scores[idx] = hitScore; auto& t = transcripts[tid]; bool isDecoy = t.isDecoy(); - double decoyCutoff = static_cast(msi.decoyThresh * msi.bestDecoyScore); + double decoyCutoff = + static_cast(msi.decoyThresh * msi.bestDecoyScore); - //if (hitScore < decoyCutoff or (hitScore == invalidScore)) { } + // if (hitScore < decoyCutoff or (hitScore == invalidScore)) { } if (isDecoy) { - // NOTE: decide here if we need to process any of this if the + // NOTE: decide here if we need to process any of this if the // current score is < the best (non-decoy) score. I think not. // if this is a decoy and its score is better than the best decoy score @@ -215,10 +233,10 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz (void)did_update; return; } else if (hitScore < decoyCutoff or (hitScore == invalidScore)) { - // if the current score is to a valid target but doesn't + // if the current score is to a valid target but doesn't // exceed the necessary decoy threshold, then skip it. return; - } + } // otherwise, we have a "high-scoring" hit to a non-decoy auto& perm = msi.perm_; @@ -230,7 +248,8 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz // this is the current best bestScorePerTranscript[tid].first = hitScore; bestScorePerTranscript[tid].second = idx; - } else if ((hitScore > it->second.first) or (hitScore == it->second.first and isCompat)) { + } else if ((hitScore > it->second.first) or + (hitScore == it->second.first and isCompat)) { // otherwise, if we had an alignment for this transcript and it's // better than the current best, then set the best score to this // alignment's score, and invalidate the previous alignment @@ -241,7 +260,7 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz // otherwise, there is already a better mapping for this transcript. scores[idx] = invalidScore; } - + if (hitScore > msi.bestScore) { msi.secondBestScore = msi.bestScore; msi.bestScore = hitScore; @@ -249,44 +268,50 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz perm.push_back(std::make_pair(idx, tid)); } - inline void filterAndCollectAlignments( - std::vector& jointHits, - uint32_t readLen, - uint32_t mateLen, - bool singleEnd, - bool tryAlign, - bool hardFilter, - double scoreExp, - double minAlnProb, - salmon::mapping_utils::MappingScoreInfo& msi, - std::vector& jointAlignments) { + std::vector& jointHits, uint32_t readLen, + uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter, + double scoreExp, double minAlnProb, + salmon::mapping_utils::MappingScoreInfo& msi, + std::vector& jointAlignments) { auto invalidScore = std::numeric_limits::min(); - if (msi.bestDecoyScore == invalidScore) { msi.bestDecoyScore = invalidScore + 1 ;} + if (msi.bestDecoyScore == invalidScore) { + msi.bestDecoyScore = invalidScore + 1; + } - int32_t decoyThreshold = static_cast(msi.decoyThresh * msi.bestDecoyScore); + int32_t decoyThreshold = static_cast( + msi.decoyThresh * msi.bestDecoyScore); - //auto filterScore = (bestDecoyScore < secondBestScore) ? secondBestScore : bestDecoyScore; + // auto filterScore = (bestDecoyScore < secondBestScore) ? secondBestScore : + // bestDecoyScore; auto& scores = msi.scores_; auto& perm = msi.perm_; // throw away any pairs for which we should not produce valid alignments : // ====== - // If we are doing soft-filtering (default), we remove those not exceeding the bestDecoyScore - // If we are doing hard-filtering, we remove those less than the bestScore - perm.erase(std::remove_if(perm.begin(), perm.end(), - [&scores, hardFilter, &msi, decoyThreshold, invalidScore](const std::pair& idxtid) -> bool { - return !hardFilter ? scores[idxtid.first] < decoyThreshold : scores[idxtid.first] < msi.bestScore; - //return !hardFilter ? scores[idxtid.first] < filterScore : scores[idxtid.first] < bestScore; - }), perm.end()); - - // Unlike RapMap, pufferfish doesn't guarantee the hits computed above are in order - // by transcript, so we find the permutation of indices that puts things in transcript - // order. - std::sort(perm.begin(), perm.end(), [](const std::pair& p1, - const std::pair& p2) { - return p1.second < p2.second;}); + // If we are doing soft-filtering (default), we remove those not exceeding the + // bestDecoyScore If we are doing hard-filtering, we remove those less than + // the bestScore + perm.erase(std::remove_if( + perm.begin(), perm.end(), + [&scores, hardFilter, &msi, decoyThreshold, invalidScore]( + const std::pair& idxtid) -> bool { + return !hardFilter ? scores[idxtid.first] < decoyThreshold + : scores[idxtid.first] < msi.bestScore; + // return !hardFilter ? scores[idxtid.first] < filterScore : + // scores[idxtid.first] < bestScore; + }), + perm.end()); + + // Unlike RapMap, pufferfish doesn't guarantee the hits computed above are in + // order by transcript, so we find the permutation of indices that puts things + // in transcript order. + std::sort(perm.begin(), perm.end(), + [](const std::pair& p1, + const std::pair& p2) { + return p1.second < p2.second; + }); // moving our alinged / score jointMEMs over to QuasiAlignment objects double bestScoreD = static_cast(msi.bestScore); @@ -298,27 +323,33 @@ inline void filterAndCollectAlignments( double currScore = scores[ctr]; double v = bestScoreD - currScore; // why -1? - double estAlnProb = hardFilter ? -1.0 : std::exp(- scoreExp * v ); + double estAlnProb = hardFilter ? -1.0 : std::exp(-scoreExp * v); // skip any alignment with aln prob < minAlnProb - if (!hardFilter and (estAlnProb < minAlnProb)) { continue; } + if (!hardFilter and (estAlnProb < minAlnProb)) { + continue; + } if (singleEnd or jointHit.isOrphan()) { readLen = jointHit.isLeftAvailable() ? readLen : mateLen; - jointAlignments.emplace_back(tid, // reference id - jointHit.orphanClust()->getTrFirstHitPos(), // reference pos - jointHit.orphanClust()->isFw, // fwd direction - readLen, // read length - jointHit.orphanClust()->cigar, // cigar string - jointHit.fragmentLen, // fragment length - false); - auto &qaln = jointAlignments.back(); + jointAlignments.emplace_back( + tid, // reference id + jointHit.orphanClust()->getTrFirstHitPos(), // reference pos + jointHit.orphanClust()->isFw, // fwd direction + readLen, // read length + jointHit.orphanClust()->cigar, // cigar string + jointHit.fragmentLen, // fragment length + false); + auto& qaln = jointAlignments.back(); // NOTE : score should not be filled in from a double - qaln.score = !tryAlign ? static_cast(jointHit.orphanClust()->coverage):jointHit.alignmentScore; + qaln.score = !tryAlign + ? static_cast(jointHit.orphanClust()->coverage) + : jointHit.alignmentScore; qaln.estAlnProb(estAlnProb); // NOTE : wth is numHits? - qaln.numHits = static_cast(jointHits.size());//orphanClust()->coverage; + qaln.numHits = + static_cast(jointHits.size()); // orphanClust()->coverage; qaln.mateStatus = jointHit.mateStatus; - if(singleEnd) { + if (singleEnd) { qaln.mateLen = readLen; qaln.mateCigar.clear(); qaln.matePos = 0; @@ -327,116 +358,732 @@ inline void filterAndCollectAlignments( qaln.mateStatus = MateStatus::SINGLE_END; } } else { - jointAlignments.emplace_back(tid, // reference id - jointHit.leftClust->getTrFirstHitPos(), // reference pos - jointHit.leftClust->isFw, // fwd direction - readLen, // read length - jointHit.leftClust->cigar, // cigar string - jointHit.fragmentLen, // fragment length - true); // properly paired + jointAlignments.emplace_back( + tid, // reference id + jointHit.leftClust->getTrFirstHitPos(), // reference pos + jointHit.leftClust->isFw, // fwd direction + readLen, // read length + jointHit.leftClust->cigar, // cigar string + jointHit.fragmentLen, // fragment length + true); // properly paired // Fill in the mate info - auto &qaln = jointAlignments.back(); + auto& qaln = jointAlignments.back(); qaln.mateLen = mateLen; qaln.mateCigar = jointHit.rightClust->cigar; - qaln.matePos = static_cast(jointHit.rightClust->getTrFirstHitPos()); + qaln.matePos = + static_cast(jointHit.rightClust->getTrFirstHitPos()); qaln.mateIsFwd = jointHit.rightClust->isFw; qaln.mateStatus = MateStatus::PAIRED_END_PAIRED; // NOTE : wth is numHits? - qaln.numHits = static_cast(jointHits.size()); + qaln.numHits = static_cast(jointHits.size()); // NOTE : score should not be filled in from a double - qaln.score = !tryAlign ? static_cast(jointHit.leftClust->coverage):jointHit.alignmentScore; + qaln.score = !tryAlign + ? static_cast(jointHit.leftClust->coverage) + : jointHit.alignmentScore; qaln.estAlnProb(estAlnProb); - qaln.mateScore = !tryAlign ? static_cast(jointHit.rightClust->coverage):jointHit.mateAlignmentScore; + qaln.mateScore = !tryAlign + ? static_cast(jointHit.rightClust->coverage) + : jointHit.mateAlignmentScore; } } // done moving our alinged / score jointMEMs over to QuasiAlignment objects } - inline void filterAndCollectAlignmentsDecoy( - std::vector& jointHits, - uint32_t readLen, - uint32_t mateLen, - bool singleEnd, - bool tryAlign, - bool hardFilter, - double scoreExp, - double minAlnProb, - salmon::mapping_utils::MappingScoreInfo& msi, - std::vector& jointAlignments) { -// NOTE: this function should only be called in the case that we have valid decoy mappings to report. -// Currently, this happens only when there are *no valid non-decoy* mappings. -// Further, this function will only add equally *best* decoy mappings to the output jointAlignments object -// regardless of the the status of hardFilter (i.e. no sub-optimal decoy mappings will be reported). -(void) hardFilter; -(void) minAlnProb; -(void) scoreExp; -double estAlnProb = 1.0; //std::exp(-scoreExp * 0.0); -for (auto& idxTxp : msi.best_decoy_hits) { - int32_t ctr = idxTxp.first; - int32_t tid = idxTxp.second; - auto& jointHit = jointHits[ctr]; - - if (singleEnd or jointHit.isOrphan()) { - readLen = jointHit.isLeftAvailable() ? readLen : mateLen; - jointAlignments.emplace_back( - tid, // reference id - jointHit.orphanClust()->getTrFirstHitPos(), // reference pos - jointHit.orphanClust()->isFw, // fwd direction - readLen, // read length - jointHit.orphanClust()->cigar, // cigar string - jointHit.fragmentLen, // fragment length - false); - auto& qaln = jointAlignments.back(); - // NOTE : score should not be filled in from a double - qaln.score = !tryAlign - ? static_cast(jointHit.orphanClust()->coverage) - : jointHit.alignmentScore; - qaln.estAlnProb(estAlnProb); - // NOTE : wth is numHits? - qaln.numHits = - static_cast(jointHits.size()); // orphanClust()->coverage; - qaln.mateStatus = jointHit.mateStatus; - if (singleEnd) { - qaln.mateLen = readLen; - qaln.mateCigar.clear(); - qaln.matePos = 0; - qaln.mateIsFwd = true; - qaln.mateScore = 0; - qaln.mateStatus = MateStatus::SINGLE_END; + std::vector& jointHits, uint32_t readLen, + uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter, + double scoreExp, double minAlnProb, + salmon::mapping_utils::MappingScoreInfo& msi, + std::vector& jointAlignments) { + // NOTE: this function should only be called in the case that we have valid + // decoy mappings to report. Currently, this happens only when there are *no + // valid non-decoy* mappings. Further, this function will only add equally + // *best* decoy mappings to the output jointAlignments object regardless of + // the the status of hardFilter (i.e. no sub-optimal decoy mappings will be + // reported). + (void)hardFilter; + (void)minAlnProb; + (void)scoreExp; + double estAlnProb = 1.0; // std::exp(-scoreExp * 0.0); + for (auto& idxTxp : msi.best_decoy_hits) { + int32_t ctr = idxTxp.first; + int32_t tid = idxTxp.second; + auto& jointHit = jointHits[ctr]; + + if (singleEnd or jointHit.isOrphan()) { + readLen = jointHit.isLeftAvailable() ? readLen : mateLen; + jointAlignments.emplace_back( + tid, // reference id + jointHit.orphanClust()->getTrFirstHitPos(), // reference pos + jointHit.orphanClust()->isFw, // fwd direction + readLen, // read length + jointHit.orphanClust()->cigar, // cigar string + jointHit.fragmentLen, // fragment length + false); + auto& qaln = jointAlignments.back(); + // NOTE : score should not be filled in from a double + qaln.score = !tryAlign + ? static_cast(jointHit.orphanClust()->coverage) + : jointHit.alignmentScore; + qaln.estAlnProb(estAlnProb); + // NOTE : wth is numHits? + qaln.numHits = + static_cast(jointHits.size()); // orphanClust()->coverage; + qaln.mateStatus = jointHit.mateStatus; + if (singleEnd) { + qaln.mateLen = readLen; + qaln.mateCigar.clear(); + qaln.matePos = 0; + qaln.mateIsFwd = true; + qaln.mateScore = 0; + qaln.mateStatus = MateStatus::SINGLE_END; + } + } else { + jointAlignments.emplace_back( + tid, // reference id + jointHit.leftClust->getTrFirstHitPos(), // reference pos + jointHit.leftClust->isFw, // fwd direction + readLen, // read length + jointHit.leftClust->cigar, // cigar string + jointHit.fragmentLen, // fragment length + true); // properly paired + // Fill in the mate info + auto& qaln = jointAlignments.back(); + qaln.mateLen = mateLen; + qaln.mateCigar = jointHit.rightClust->cigar; + qaln.matePos = + static_cast(jointHit.rightClust->getTrFirstHitPos()); + qaln.mateIsFwd = jointHit.rightClust->isFw; + qaln.mateStatus = MateStatus::PAIRED_END_PAIRED; + // NOTE : wth is numHits? + qaln.numHits = static_cast(jointHits.size()); + // NOTE : score should not be filled in from a double + qaln.score = !tryAlign + ? static_cast(jointHit.leftClust->coverage) + : jointHit.alignmentScore; + qaln.estAlnProb(estAlnProb); + qaln.mateScore = !tryAlign + ? static_cast(jointHit.rightClust->coverage) + : jointHit.mateAlignmentScore; } - } else { - jointAlignments.emplace_back( - tid, // reference id - jointHit.leftClust->getTrFirstHitPos(), // reference pos - jointHit.leftClust->isFw, // fwd direction - readLen, // read length - jointHit.leftClust->cigar, // cigar string - jointHit.fragmentLen, // fragment length - true); // properly paired - // Fill in the mate info - auto& qaln = jointAlignments.back(); - qaln.mateLen = mateLen; - qaln.mateCigar = jointHit.rightClust->cigar; - qaln.matePos = - static_cast(jointHit.rightClust->getTrFirstHitPos()); - qaln.mateIsFwd = jointHit.rightClust->isFw; - qaln.mateStatus = MateStatus::PAIRED_END_PAIRED; - // NOTE : wth is numHits? - qaln.numHits = static_cast(jointHits.size()); - // NOTE : score should not be filled in from a double - qaln.score = !tryAlign ? static_cast(jointHit.leftClust->coverage) - : jointHit.alignmentScore; - qaln.estAlnProb(estAlnProb); - qaln.mateScore = !tryAlign - ? static_cast(jointHit.rightClust->coverage) - : jointHit.mateAlignmentScore; + } // end for over best decoy hits +} + +namespace pasc { + +constexpr int32_t invalid_frag_len = std::numeric_limits::min(); +constexpr int32_t invalid_mate_pos = std::numeric_limits::min(); + +/* from pesc +struct simple_hit { + bool is_fw{false}; + bool mate_is_fw{false}; + int32_t pos{-1}; + float score{0.0}; + uint32_t num_hits{0}; + uint32_t tid{std::numeric_limits::max()}; + int32_t mate_pos{std::numeric_limits::max()}; + int32_t fragment_length{std::numeric_limits::max()}; + inline bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) +{ int32_t signed_txp_len = static_cast(txp_len); return (pos > +-max_over) and ((pos + read_len) < (signed_txp_len + max_over)); + } + inline bool has_mate() const { return mate_pos != invalid_mate_pos; } + inline bool mate_is_mapped() const { return mate_pos != invalid_mate_pos; } + inline int32_t frag_len() const { + return (fragment_length != invalid_frag_len) ? fragment_length : 0; + } +}; +*/ +struct simple_hit { + bool is_fw{false}; + int32_t pos{-1}; + float score{0.0}; + uint32_t num_hits{0}; + uint32_t tid{std::numeric_limits::max()}; + // int32_t mate_pos{std::numeric_limits::max()}; + // int32_t fragment_length{std::numeric_limits::max()}; + + // UNPAIRED_LEFT, UNPAIRED_RIGHT, PAIRED_FR, PAIRED_RF + PairingStatus pairing_status{PairingStatus::UNPAIRED_RIGHT}; + + bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) { + int32_t signed_txp_len = static_cast(txp_len); + return (pos > -max_over) and + ((pos + read_len) < (signed_txp_len + max_over)); + } + + bool operator<(const simple_hit& hit2) { + if (tid != hit2.tid) { // hit1 and hit2 are on different transcripts + return tid < hit2.tid; + } else { // hit1 and hit2 are on the same transcript + // NOTE: Is this what we actually want? + if (is_fw) { // fw < rc + return true; + } else { + return false; + } + } + } +}; + +enum class HitDirection : uint8_t { FW, RC, BOTH }; + +/* +struct sketch_hit_info { + // add a hit to the current target that occurs in the forward + // orientation with respect to the target. + bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, + int32_t max_stretch, float score_inc) { + (void)rl; + (void)k; + bool added{false}; + + // since hits are collected by moving _forward_ in the + // read, if this is a fw hit, it should be moving + // forward in the reference. Only add it if this is + // the case. This ensure that we don't + // double-count a k-mer that might occur twice on + // this target. + if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) { + if (last_read_pos_fw == -1) { + approx_pos_fw = ref_pos - read_pos; + } else { + if ((ref_pos - approx_pos_fw) > max_stretch) { + return false; + } + } + // if (last_ref_pos_fw > -1 and (ref_pos > last_ref_pos_fw + 15)) { return + // false; } + last_ref_pos_fw = ref_pos; + last_read_pos_fw = read_pos; + fw_score += score_inc; + ++fw_hits; + added = true; + } + return added; + } + + // add a hit to the current target that occurs in the forward + // orientation with respect to the target. + bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, + int32_t max_stretch, float score_inc) { + + bool added{false}; + // since hits are collected by moving _forward_ in the + // read, if this is an rc hit, it should be moving + // backwards in the reference. Only add it if this is + // the case. + // This ensures that we don't double-count a k-mer that + // might occur twice on this target. + if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) { + approx_pos_rc = (ref_pos - (rl - (read_pos + k))); + if (last_read_pos_rc == -1) { + approx_end_pos_rc = ref_pos + read_pos; + } else { + if (approx_end_pos_rc - approx_pos_rc > max_stretch) { + return false; + } + } + // if (last_ref_pos_rc > -1 and ref_pos < last_ref_pos_rc - 15) { return + // false; } + last_ref_pos_rc = ref_pos; + last_read_pos_rc = read_pos; + rc_score += score_inc; + ++rc_hits; + added = true; + } + return added; } -} // end for over best decoy hits + + inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); } + + // true if forward, false if rc + // second element is score + inline HitDirection best_hit_direction() { + int32_t fw_minus_rc = + static_cast(fw_hits) - static_cast(rc_hits); + return (fw_minus_rc > 0) + ? HitDirection::FW + : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH); + } + + inline simple_hit get_fw_hit(PairingStatus ps) { + return simple_hit{true, + approx_pos_fw, + fw_score, + fw_hits, + std::numeric_limits::max(), + ps}; + } + + inline simple_hit get_rc_hit(PairingStatus ps) { + return simple_hit{false, + approx_pos_rc, + rc_score, + rc_hits, + std::numeric_limits::max(), + ps}; + } + + inline std::string to_string() { + std::stringstream ss; + ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score + << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits + << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc; + return ss.str(); + } + + int32_t last_read_pos_fw{-1}; + int32_t last_read_pos_rc{-1}; + + int32_t last_ref_pos_fw{-1}; + int32_t last_ref_pos_rc{std::numeric_limits::max()}; + + int32_t approx_pos_fw{-1}; + int32_t approx_pos_rc{-1}; + int32_t approx_end_pos_rc{-1}; + + uint32_t fw_hits{0}; + uint32_t rc_hits{0}; + float fw_score{0.0}; + float rc_score{0.0}; +};*/ + +struct sketch_hit_info { + // add a hit to the current target that occurs in the forward + // orientation with respect to the target. + bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, + float score_inc) { + (void)rl; + (void)k; + bool added{false}; + + // since hits are collected by moving _forward_ in the + // read, if this is a fw hit, it should be moving + // forward in the reference. Only add it if this is + // the case. This ensure that we don't + // double-count a k-mer that might occur twice on + // this target. + if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) { + if (last_read_pos_fw == -1) { + approx_pos_fw = ref_pos - read_pos; + } else { + if ((ref_pos - approx_pos_fw) > max_stretch) { return false; } + } + last_ref_pos_fw = ref_pos; + last_read_pos_fw = read_pos; + fw_score += score_inc; + ++fw_hits; + added = true; + } + return added; + } + + // add a hit to the current target that occurs in the forward + // orientation with respect to the target. + bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, + float score_inc) { + bool added{false}; + // since hits are collected by moving _forward_ in the + // read, if this is an rc hit, it should be moving + // backwards in the reference. + // In general, we only add the hit if this is the case. + // This ensures that we don't double-count a k-mer that + // might occur twice on this target. + + // we have a special case here; what if the same exact + // k-mer (i.e. not just the same sequence but same position + // on the query) occurs more than one time on this refernece? + // + // In that case, the GENERAL case code will already have + // processed and seen a k-mer with the read position + // equal to `read_pos`. In the case below, we see + // a hit with the *same* read pos again (one or more times). + // + // Here, we swap out the previous hit having this read_pos + // if the position of the current hit on the read is + // the same and the position on the reference is greater + // (this is a heuristic to help in the case of tandem repeats or + // highly-repetitive subsequence). + // NOTE: consider if a similar heuristic should be + // adopted for the forward case. + if ((read_pos == last_read_pos_rc) and (ref_pos > last_ref_pos_rc) and + (ref_pos < rightmost_bound_rc)) { + + last_ref_pos_rc = ref_pos; + // if the read_pos was the same as the first read pos + // then also update the approx_end_pos_rc accordingly + // NOTE: for the time being don't mess with this position + // empirically this does better, but if we really want + // to optimize this for accuracy we need a better general + // heuristic. + // if (read_pos == first_read_pos_rc) { + // approx_end_pos_rc = ref_pos + read_pos; + //} + + return added; + } + + // GENERAL case + if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) { + approx_pos_rc = (ref_pos - (rl - (read_pos + k))); + if (last_read_pos_rc == -1) { + approx_end_pos_rc = ref_pos + read_pos; + first_read_pos_rc = read_pos; + } else { + if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false; } + } + rc_score += score_inc; + ++rc_hits; + + // new + rightmost_bound_rc = last_ref_pos_rc; + + last_ref_pos_rc = ref_pos; + last_read_pos_rc = read_pos; + added = true; + } + return added; + } + + inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); } + + // true if forward, false if rc + // second element is score + inline HitDirection best_hit_direction() { + int32_t fw_minus_rc = static_cast(fw_hits) - static_cast(rc_hits); + return (fw_minus_rc > 0) ? HitDirection::FW + : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH); + } + + inline simple_hit get_fw_hit(PairingStatus ps) { + return simple_hit{true, approx_pos_fw, + fw_score, fw_hits, std::numeric_limits::max(), ps}; + } + + inline simple_hit get_rc_hit(PairingStatus ps) { + return simple_hit{false, approx_pos_rc, + rc_score, rc_hits, std::numeric_limits::max(), ps}; + } + + inline std::string to_string() { + std::stringstream ss; + ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score + << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits + << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc; + return ss.str(); + } + + int32_t last_read_pos_fw{-1}; + int32_t last_read_pos_rc{-1}; + int32_t rightmost_bound_rc{std::numeric_limits::max()}; + + // marks the read position (key) of the + // first hit we see in the rc direction + int32_t first_read_pos_rc{-1}; + + int32_t last_ref_pos_fw{-1}; + int32_t last_ref_pos_rc{std::numeric_limits::max()}; + + int32_t approx_pos_fw{-1}; + int32_t approx_pos_rc{-1}; + int32_t approx_end_pos_rc{-1}; + + uint32_t fw_hits{0}; + uint32_t rc_hits{0}; + float fw_score{0.0}; + float rc_score{0.0}; +}; + +/** + * This contains the information necessary to collect hits and perform + * mapping on a read using PASC. + **/ +template struct mapping_cache_info { +public: + mapping_cache_info(IndexT* midx, size_t max_occ_default_in, + size_t max_occ_recover_in, size_t max_read_occ_in) + : idx(midx), k(midx->k()), hit_searcher(midx), + max_occ_default(max_occ_default_in), + max_occ_recover(max_occ_recover_in), + attempt_occ_recover(max_occ_recover > max_occ_default), + max_read_occ(max_read_occ_in), alt_max_occ(max_read_occ_in) {} + + inline void clear() { + map_type = salmon::utils::MappingType::UNMAPPED; + hit_searcher.clear(); + hit_map.clear(); + accepted_hits.clear(); + alt_max_occ = max_read_occ; + has_matching_kmers = false; + } + + // will store how the read mapped + salmon::utils::MappingType map_type{salmon::utils::MappingType::UNMAPPED}; + + // map from reference id to hit info + phmap::flat_hash_map hit_map; + + // where the mappings that pass will be stored + std::vector accepted_hits; + + // map to recall the number of unmapped reads we see + // for each barcode + phmap::flat_hash_map unmapped_bc_map; + + // pointer to the underlying index + IndexT* idx{nullptr}; + + // the k-mer size of our index + size_t k{0}; + + // implements the PASC algorithm + MemCollector hit_searcher; + + // the number of occurrences of a hit above which we + // won't consider it + size_t max_occ_default; + + // the number of occurences of a hit above which we + // won't consider it, even in recovery mode + size_t max_occ_recover; + + // will we attempt recovery if there are no hits with + // frequency below max_occ_default? + bool attempt_occ_recover; + + // maximum number of places a read can occur and still + // be considered. + size_t max_read_occ; + + size_t alt_max_occ; + + // to perform queries + pufferfish::util::QueryCache qc; + + // regardless of having full mappings, did any k-mers match + bool has_matching_kmers{false}; +}; + +/** + * This function will map the read given by `read_seq` + * using PASC. The relevant parameters (e.g. maximum ambiguity + * of seed and maximum number of allowable mappings) are stored + * in the `map_cache` structure. The `PairingStatus` is passed in + * by the caller and designates if the read being mapped is the + * left end or the right end. + **/ +template +inline bool map_read(std::string* read_seq, + mapping_cache_info& map_cache, PairingStatus ps, + bool verbose = false) { + map_cache.clear(); + // rebind map_cache variables to + // local names + IndexT* qidx = map_cache.idx; + auto& qc = map_cache.qc; + auto& hit_searcher = map_cache.hit_searcher; + auto& hit_map = map_cache.hit_map; + auto& accepted_hits = map_cache.accepted_hits; + auto& map_type = map_cache.map_type; + const bool attempt_occ_recover = map_cache.attempt_occ_recover; + auto k = map_cache.k; + int32_t signed_k = static_cast(k); + + // collect the set of matching seeds + map_cache.has_matching_kmers = + hit_searcher.get_raw_hits_sketch(*read_seq, qc, true, false); + bool early_stop = false; + + // if there were hits + if (map_cache.has_matching_kmers) { + uint32_t num_valid_hits{0}; + uint64_t total_occs{0}; + uint64_t largest_occ{0}; + auto& raw_hits = hit_searcher.get_left_hits(); + + // SANITY + decltype(raw_hits[0].first) prev_read_pos = -1; + // the maximum span the supporting k-mers of a + // mapping position are allowed to have. + // NOTE this is still > read_length b/c the stretch is measured wrt the + // START of the terminal k-mer. + int32_t max_stretch = static_cast(read_seq->length() * 1.0); + + // a raw hit is a pair of read_pos and a projected hit + + // the least frequent hit for this fragment. + uint64_t min_occ = std::numeric_limits::max(); + + // this is false by default and will be set to true + // if *every* collected hit for this fragment occurs + // max_occ_default times or more. + bool had_alt_max_occ = false; + int32_t signed_rl = static_cast(read_seq->length()); + auto collect_mappings_from_hits = + [&max_stretch, &min_occ, &hit_map, &num_valid_hits, &total_occs, + &largest_occ, &early_stop, signed_rl, k, signed_k, qidx, + verbose](auto& raw_hits, auto& prev_read_pos, auto& max_allowed_occ, + auto& had_alt_max_occ) -> bool { + for (auto& raw_hit : raw_hits) { + auto& read_pos = raw_hit.first; + auto& proj_hits = raw_hit.second; + auto& refs = proj_hits.refRange; + + // Keep track of the *least* ambiguous hit we see. + // If all hits occur more than `max_allowed_occ` times, + // but the least ambiguous hit (which occurs `min_occ` + // times) has < `max_occ_recover` occurrences, then + // we will use all hits occurring `min_occ` number + // of times or less to try top recover the read's mappings. + uint64_t num_occ = static_cast(refs.size()); + min_occ = std::min(min_occ, num_occ); + had_alt_max_occ = true; + + // we visit every hit that occurs the allowable number of time + bool still_have_valid_target = false; + prev_read_pos = read_pos; + if (num_occ <= max_allowed_occ) { + total_occs += num_occ; + largest_occ = (num_occ > largest_occ) ? num_occ : largest_occ; + float score_inc = 1.0; + + // vist each mapped reference position (mrp) where this hit + // occurs. + for (auto& pos_it : refs) { + const auto& ref_pos_ori = proj_hits.decodeHit(pos_it); + uint32_t tid = + static_cast(qidx->getRefId(pos_it.transcript_id())); + int32_t pos = static_cast(ref_pos_ori.pos); + bool ori = ref_pos_ori.isFW; + auto& target = hit_map[tid]; + + // Why >= here instead of == ? + // Because hits can happen on the same target in both the forward + // and rc orientations, it is possible that we start the loop with + // the target having num_valid_hits hits in a given orientation (o) + // we see a new hit for this target in oriention o (now it has + // num_valid_hits + 1) then we see a hit for this target in + // orientation rc(o). We still want to add / consider this hit, but + // max_hits_for_target() > num_valid_hits. So, we must allow for + // that here. + if (target.max_hits_for_target() >= num_valid_hits) { + if (ori) { + target.add_fw(pos, static_cast(read_pos), signed_rl, + signed_k, max_stretch, score_inc); + } else { + target.add_rc(pos, static_cast(read_pos), signed_rl, + signed_k, max_stretch, score_inc); + } + + // if this target is still valid after evaluating this hit + // then we will have at least one globally valid target. If + // no targets are valid after evaluating all mrps for this + // hit then we can exit the search early. + still_have_valid_target |= + (target.max_hits_for_target() >= num_valid_hits + 1); + } + + } // DONE: for (auto &pos_it : refs) + + ++num_valid_hits; + + // if there are no targets reaching the valid hit threshold, then + // break early + if (!still_have_valid_target) { + return true; + } + + } // DONE : if (num_occ <= max_allowed_occ) + } // DONE : for (auto& raw_hit : raw_hits) + + return false; + }; + + bool _discard = false; + auto mao_first_pass = map_cache.max_occ_default - 1; + early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, + mao_first_pass, _discard); + + // If our default threshold was too stringent, then fallback to a more + // liberal threshold and look up the k-mers that occur the least frequently. + // Specifically, if the min occuring hits have frequency < max_occ_recover + // (2500 by default) times, then collect the min occuring hits to get the + // mapping. + if (attempt_occ_recover and (min_occ >= map_cache.max_occ_default) and + (min_occ < map_cache.max_occ_recover)) { + prev_read_pos = -1; + uint64_t max_allowed_occ = min_occ; + early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, + max_allowed_occ, had_alt_max_occ); + } + + uint32_t best_alt_hits = 0; + + for (auto& kv : hit_map) { + auto best_hit_dir = kv.second.best_hit_direction(); + // if the best direction is FW or BOTH, add the fw hit + // otherwise add the RC. + auto mapping = (best_hit_dir != HitDirection::RC) + ? kv.second.get_fw_hit(ps) + : kv.second.get_rc_hit(ps); + + if (mapping.num_hits >= num_valid_hits) { + mapping.tid = kv.first; + accepted_hits.emplace_back(mapping); + // if we had equally good hits in both directions + // add the rc hit here (since we added the fw) + // above if the best hit was either FW or BOTH + if (best_hit_dir == HitDirection::BOTH) { + auto second_hit = kv.second.get_rc_hit(ps); + second_hit.tid = kv.first; + accepted_hits.emplace_back(second_hit); + } + } else { + // best_alt_score = mapping.score > best_alt_score ? mapping.score : + // best_alt_score; + best_alt_hits = + mapping.num_hits > best_alt_hits ? mapping.num_hits : best_alt_hits; + } + } + + map_cache.alt_max_occ = + had_alt_max_occ ? accepted_hits.size() : map_cache.max_occ_default; + + /* + * This rule; if enabled, allows through mappings missing a single hit, if + there + * was no mapping with all hits. NOTE: this won't work with the current + early-exit + * optimization however. + if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= + num_valid_hits + - 1)) { for (auto& kv : hit_map) { auto mapping = kv.second.get_best_hit(); + if (mapping.num_hits >= best_alt_hits) { + //if (mapping.valid_pos(signed_read_len, transcripts[kv.first].RefLength, + 10)) { mapping.tid = kv.first; accepted_hits.emplace_back(mapping); + //} + } + } + } + */ + } // DONE : if (rh) + + // If the read mapped to > maxReadOccs places, discard it + if (accepted_hits.size() > map_cache.alt_max_occ) { + accepted_hits.clear(); + map_type = salmon::utils::MappingType::UNMAPPED; + } else if (!accepted_hits.empty()) { + map_type = salmon::utils::MappingType::SINGLE_MAPPED; + } + + return early_stop; } +} // namespace pasc - } // namespace mapping_utils +} // namespace mapping_utils } // namespace salmon #endif //__SALMON_MAPPING_UTILS__ diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index f17dd0699..21032cfad 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -414,15 +414,10 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re salmon::utils::ShortFragStats shortFragStats; double maxZeroFrac{0.0}; - // Write unmapped reads - fmt::MemoryWriter unmappedNames; - bool writeUnmapped = salmonOpts.writeUnmappedNames; - spdlog::logger* unmappedLogger = (writeUnmapped) ? salmonOpts.unmappedLog.get() : nullptr; - - // Write unmapped reads - fmt::MemoryWriter orphanLinks; - bool writeOrphanLinks = salmonOpts.writeOrphanLinks; - spdlog::logger* orphanLinkLogger = (writeOrphanLinks) ? salmonOpts.orphanLinkLog.get() : nullptr; + bool quiet = salmonOpts.quiet; + if (salmonOpts.writeQualities) { + salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file."); + } BasicBinWriter bw; uint32_t num_reads_in_chunk{0}; @@ -436,200 +431,49 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re uint64_t localNumAssignedFragments{0}; uint64_t localNumUniqueMappings{0}; - bool consistentHits = salmonOpts.consistentHits; - bool quiet = salmonOpts.quiet; - - size_t maxNumHits{salmonOpts.maxReadOccs}; size_t readLenLeft{0}; size_t readLenRight{0}; - constexpr const int32_t invalidScore = std::numeric_limits::min(); - MemCollector memCollector(qidx); - ksw2pp::KSW2Aligner aligner; - pufferfish::util::AlignmentConfig aconf; - pufferfish::util::MappingConstraintPolicy mpol; - bool initOK = salmon::mapping_utils::initMapperSettings(salmonOpts, memCollector, aligner, aconf, mpol); - PuffAligner puffaligner(qidx->refseq_, qidx->refAccumLengths_, qidx->k(), aconf, aligner); - - pufferfish::util::CachedVectorMap, std::hash> hits; - std::vector recoveredHits; - std::vector jointHits; - PairedAlignmentFormatter formatter(qidx); - if (salmonOpts.writeQualities) { - salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file."); - } - pufferfish::util::QueryCache qc; - - bool mimicStrictBT2 = salmonOpts.mimicStrictBT2; - bool mimicBT2 = salmonOpts.mimicBT2; - bool noDovetail = !salmonOpts.allowDovetail; - bool useChainingHeuristic = !salmonOpts.disableChainingHeuristic; - - pufferfish::util::HitCounters hctr; salmon::utils::MappingType mapType{salmon::utils::MappingType::UNMAPPED}; - bool hardFilter = salmonOpts.hardFilter; fmt::MemoryWriter sstream; auto* qmLog = salmonOpts.qmLog.get(); bool writeQuasimappings = (qmLog != nullptr); - - struct SimpleHit { - bool is_fw{false}; - int32_t pos{-1}; - float score{0.0}; - uint32_t num_hits{0}; - uint32_t tid{std::numeric_limits::max()}; - bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) { - int32_t signed_txp_len = static_cast(txp_len); - return (pos > -max_over) and ((pos + read_len) < (signed_txp_len + max_over)); - } - }; - - enum class HitDirection : uint8_t {FW, RC, BOTH}; - - struct SketchHitInfo { - // add a hit to the current target that occurs in the forward - // orientation with respect to the target. - bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) { - (void)rl; - (void)k; - bool added{false}; - - // since hits are collected by moving _forward_ in the - // read, if this is a fw hit, it should be moving - // forward in the reference. Only add it if this is - // the case. This ensure that we don't - // double-count a k-mer that might occur twice on - // this target. - if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) { - if (last_read_pos_fw == -1) { - approx_pos_fw = ref_pos - read_pos; - } else { - if ((ref_pos - approx_pos_fw) > max_stretch) { return false; } - } - //if (last_ref_pos_fw > -1 and (ref_pos > last_ref_pos_fw + 15)) { return false; } - last_ref_pos_fw = ref_pos; - last_read_pos_fw = read_pos; - fw_score += score_inc; - ++fw_hits; - added = true; - } - return added; - } - - // add a hit to the current target that occurs in the forward - // orientation with respect to the target. - bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) { - - bool added{false}; - // since hits are collected by moving _forward_ in the - // read, if this is an rc hit, it should be moving - // backwards in the reference. Only add it if this is - // the case. - // This ensures that we don't double-count a k-mer that - // might occur twice on this target. - if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) { - approx_pos_rc = (ref_pos - (rl - (read_pos + k))); - if (last_read_pos_rc == -1) { - approx_end_pos_rc = ref_pos + read_pos; - } else { - if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false;} - } - //if (last_ref_pos_rc > -1 and ref_pos < last_ref_pos_rc - 15) { return false; } - last_ref_pos_rc = ref_pos; - last_read_pos_rc = read_pos; - rc_score += score_inc; - ++rc_hits; - added = true; - - } - return added; - } - - inline uint32_t max_hits_for_target() { - return std::max(fw_hits, rc_hits); - } - - // true if forward, false if rc - // second element is score - inline HitDirection best_hit_direction() { - int32_t fw_minus_rc = static_cast(fw_hits) - static_cast(rc_hits); - return (fw_minus_rc > 0) ? HitDirection::FW : - ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH); - } - - inline SimpleHit get_fw_hit() { - return SimpleHit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits::max()}; - } - - inline SimpleHit get_rc_hit() { - return SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits::max()}; - } - - inline SimpleHit get_best_hit() { - auto best_direction = best_hit_direction(); - return (best_direction != HitDirection::RC) ? - SimpleHit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits::max()} : - SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits::max()}; - } - - inline std::string to_string() { - std::stringstream ss; - ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score << ", fw_pos : " << approx_pos_fw - << " || rc_hits: " << rc_hits << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc; - return ss.str(); - } - - int32_t last_read_pos_fw{-1}; - int32_t last_read_pos_rc{-1}; - - int32_t last_ref_pos_fw{-1}; - int32_t last_ref_pos_rc{std::numeric_limits::max()}; - - int32_t approx_pos_fw{-1}; - int32_t approx_pos_rc{-1}; - int32_t approx_end_pos_rc{-1}; - - uint32_t fw_hits{0}; - uint32_t rc_hits{0}; - float fw_score{0.0}; - float rc_score{0.0}; - }; - + PairedAlignmentFormatter formatter(qidx); + // map to recall the number of unmapped reads we see // for each barcode phmap::flat_hash_map unmapped_bc_map; - // map from transcript id to hit info - phmap::flat_hash_map hit_map; - std::vector accepted_hits; - - ////////////////////// - // NOTE: validation mapping based new parameters - std::string rc1; rc1.reserve(300); - - // check the frequency and decide here if we should // be attempting recovery of highly-multimapping reads const size_t max_occ_default = salmonOpts.maxReadOccs; const size_t max_occ_recover = salmonOpts.maxRecoverReadOccs; const bool attempt_occ_recover = (max_occ_recover > max_occ_default); - size_t numMappingsDropped{0}; - size_t numDecoyFrags{0}; - const double decoyThreshold = salmonOpts.decoyThreshold; + // think if this should be different since it's at the read level + const size_t max_read_occs = salmonOpts.maxReadOccs; + size_t alt_max_occ = max_read_occs; - salmon::mapping_utils::MappingScoreInfo msi(decoyThreshold); - // we only collect detailed decoy information if we will be - // writing output to SAM. - msi.collect_decoys(writeQuasimappings); + // we'll use this for mapping + salmon::mapping_utils::pasc::mapping_cache_info map_cache_first(qidx, max_occ_default, max_occ_recover, max_read_occs); + salmon::mapping_utils::pasc::mapping_cache_info map_cache_second(qidx, max_occ_default, max_occ_recover, max_read_occs); + + std::vector* accepted_hits_left_ptr; + std::vector* accepted_hits_right_ptr; + std::vector* accepted_hits_ptr; + + // buffer in which to accumulate accepted hits if we are merging for left + // and right reads + std::vector accepted_hits_buffer; std::string readBuffer; std::string umi(alevinOpts.protocol.umiLength, 'N'); std::string barcode(alevinOpts.protocol.barcodeLength, 'N'); ////////////////////// - bool tryAlign{salmonOpts.validateMappings}; + auto localProtocol = alevinOpts.protocol; + // start parsing reads auto rg = parser->getReadGroup(); while (parser->refill(rg)) { rangeSize = rg.size(); @@ -650,45 +494,35 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re extraBAMtags.reserve(reserveSize); } - auto localProtocol = alevinOpts.protocol; for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch auto& rp = rg[i]; readLenLeft = rp.first.seq.length(); readLenRight= rp.second.seq.length(); + // while the pasc code path will use the jointHitGroup to + // store UMI and BarCode info, it won't actually use the + // "alignment" fields bool tooShortRight = (readLenRight < (minK+alevinOpts.trimRight)); - //localUpperBoundHits = 0; auto& jointHitGroup = structureVec[i]; jointHitGroup.clearAlignments(); - auto& jointAlignments= jointHitGroup.alignments(); - hits.clear(); - jointHits.clear(); - memCollector.clear(); - hit_map.clear(); - accepted_hits.clear(); - //jointAlignments.clear(); - //readSubSeq.clear(); mapType = salmon::utils::MappingType::UNMAPPED; + // ensure that accepted_hits_ptr points to a valid + // vector in case it's not set below + accepted_hits_ptr = &accepted_hits_buffer; + alt_max_occ = max_read_occs; ////////////////////////////////////////////////////////////// // extracting barcodes size_t barcodeLength = alevinOpts.protocol.barcodeLength; size_t umiLength = alevinOpts.protocol.umiLength; - //umi.clear(); - //barcode.clear(); + nonstd::optional barcodeIdx; extraBAMtags.clear(); bool seqOk = false; - // keep track of the *least* freqeuntly - // occurring hit in this fragment to consider - // alternative processing of fragments where - // all observed hits have high frequency. - uint64_t alt_max_occ = 0; - if (alevinOpts.protocol.end == bcEnd::FIVE || - alevinOpts.protocol.end == bcEnd::THREE){ + alevinOpts.protocol.end == bcEnd::THREE) { // If the barcode sequence could be extracted, then this is set to true, // but the barcode sequence itself may still be invalid (e.g. contain `N` characters). // However, if extracted_barcode is false here, there is no hope to even recover the @@ -721,164 +555,15 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re if (isUmiIdxOk) { jointHitGroup.setUMI(umiIdx.word(0)); - bool rh = false; - std::string* readSubSeq = aut::getReadSequence(localProtocol, rp.first.seq, rp.second.seq, readBuffer); - rh = tooShortRight - ? false - : memCollector.get_raw_hits_sketch(*readSubSeq, - qc, - true, // isLeft - false // verbose - ); - // if there were hits - if (rh) { - uint32_t num_valid_hits{0}; - uint64_t total_occs{0}; - uint64_t largest_occ{0}; - auto& raw_hits = memCollector.get_left_hits(); - - // SANITY - decltype(raw_hits[0].first) prev_read_pos = -1; - // the maximum span the supporting k-mers of a - // mapping position are allowed to have. - // NOTE this is still > read_length b/c the stretch is measured wrt the - // START of the terminal k-mer. - int32_t signed_rl = static_cast(readSubSeq->length()); - int32_t max_stretch = static_cast(signed_rl * 1.0); - - // a raw hit is a pair of read_pos and a projected hit - - // the least frequent hit for this fragment. - uint64_t min_occ = std::numeric_limits::max(); - - // this is false by default and will be set to true - // if *every* collected hit for this fragment occurs - // salmonOpts.maxReadOccs times or more. - bool had_alt_max_occ = false; - - auto collect_mappings_from_hits = [&max_stretch, &min_occ, &hit_map, - &salmonOpts, &num_valid_hits, &total_occs, - &largest_occ, &qidx, signed_rl, signed_k]( - auto& raw_hits, auto& prev_read_pos, - auto& max_allowed_occ, auto& had_alt_max_occ - ) -> bool { - for (auto& raw_hit : raw_hits) { - auto& read_pos = raw_hit.first; - auto& proj_hits = raw_hit.second; - auto& refs = proj_hits.refRange; - uint64_t num_occ = static_cast(refs.size()); - min_occ = std::min(min_occ, num_occ); - had_alt_max_occ = true; - - bool still_have_valid_target = false; - prev_read_pos = read_pos; - if (num_occ <= max_allowed_occ) { - - total_occs += num_occ; - largest_occ = (num_occ > largest_occ) ? num_occ : largest_occ; - float score_inc = 1.0; - - for (auto &pos_it : refs) { - const auto& ref_pos_ori = proj_hits.decodeHit(pos_it); - uint32_t tid = static_cast(qidx->getRefId(pos_it.transcript_id())); - int32_t pos = static_cast(ref_pos_ori.pos); - bool ori = ref_pos_ori.isFW; - auto& target = hit_map[tid]; - - // Why >= here instead of == ? - // Because hits can happen on the same target in both the forward - // and rc orientations, it is possible that we start the loop with - // the target having num_valid_hits hits in a given orientation (o) - // we see a new hit for this target in oriention o (now it has num_valid_hits + 1) - // then we see a hit for this target in orientation rc(o). We still want to - // add / consider this hit, but max_hits_for_target() > num_valid_hits. - // So, we must allow for that here. - if (target.max_hits_for_target() >= num_valid_hits) { - if (ori) { - target.add_fw(pos, static_cast(read_pos), - signed_rl, signed_k, max_stretch, score_inc); - } else { - target.add_rc(pos, static_cast(read_pos), - signed_rl, signed_k, max_stretch, score_inc); - } - - still_have_valid_target |= (target.max_hits_for_target() >= num_valid_hits + 1); - } - - } // DONE: for (auto &pos_it : refs) - - ++num_valid_hits; - - // if there are no targets reaching the valid hit threshold, then break early - if (!still_have_valid_target) { return true; } - - } // DONE : if (num_occ <= max_allowed_occ) - } // DONE : for (auto& raw_hit : raw_hits) - - return false; - }; - - bool _discard = false; - auto mao_first_pass = max_occ_default - 1; - bool early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, mao_first_pass, _discard); - - // If our default threshold was too stringent, then fallback to a more liberal - // threshold and look up the k-mers that occur the least frequently. - // Specifically, if the min occuring hits have frequency < max_occ_recover (2500 by default) - // times, then collect the min occuring hits to get the mapping. - if (attempt_occ_recover and (min_occ >= max_occ_default) and (min_occ < max_occ_recover)) { - prev_read_pos = -1; - uint64_t max_allowed_occ = min_occ; - early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, max_allowed_occ, had_alt_max_occ); - } - uint32_t best_alt_hits = 0; - int32_t signed_read_len = static_cast(readSubSeq->length()); - - for (auto& kv : hit_map) { - auto best_hit_dir = kv.second.best_hit_direction(); - // if the best direction is FW or BOTH, add the fw hit - // otherwise add the RC. - auto simple_hit = (best_hit_dir != HitDirection::RC) ? - kv.second.get_fw_hit() : kv.second.get_rc_hit(); - - if (simple_hit.num_hits >= num_valid_hits) { - simple_hit.tid = kv.first; - accepted_hits.emplace_back(simple_hit); - // if we had equally good hits in both directions - // add the rc hit here (since we added the fw) - // above if the best hit was either FW or BOTH - if (best_hit_dir == HitDirection::BOTH) { - auto second_hit = kv.second.get_rc_hit(); - second_hit.tid = kv.first; - accepted_hits.emplace_back(second_hit); - } - } else { - //best_alt_score = simple_hit.score > best_alt_score ? simple_hit.score : best_alt_score; - best_alt_hits = simple_hit.num_hits > best_alt_hits ? simple_hit.num_hits : best_alt_hits; - } - } - - alt_max_occ = had_alt_max_occ ? accepted_hits.size() : salmonOpts.maxReadOccs; - - /* - * This rule; if enabled, allows through mappings missing a single hit, if there - * was no mapping with all hits. NOTE: this won't work with the current early-exit - * optimization however. - if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= num_valid_hits - 1)) { - for (auto& kv : hit_map) { - auto simple_hit = kv.second.get_best_hit(); - if (simple_hit.num_hits >= best_alt_hits) { - //if (simple_hit.valid_pos(signed_read_len, transcripts[kv.first].RefLength, 10)) { - simple_hit.tid = kv.first; - accepted_hits.emplace_back(simple_hit); - //} - } - } - } - */ - } // DONE : if (rh) + // get the subsequence for the alignable portion of the read + std::string* readSubSeq = aut::getReadSequence(localProtocol, rp.first.seq, rp.second.seq, readBuffer); + // try and map it + bool early_exit = salmon::mapping_utils::pasc::map_read(readSubSeq, map_cache_second, PairingStatus::UNPAIRED_RIGHT); + // in this case accepted_hits_ptr gets set to what we found + accepted_hits_ptr = &map_cache_second.accepted_hits; + alt_max_occ = map_cache_second.alt_max_occ; } else { nSeqs += 1; } @@ -900,19 +585,20 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re } // If the read mapped to > maxReadOccs places, discard it - if (accepted_hits.size() > alt_max_occ ) { - accepted_hits.clear(); - } else if (!accepted_hits.empty()) { + if (accepted_hits_ptr->size() > alt_max_occ) { + accepted_hits_ptr->clear(); + } else if (!accepted_hits_ptr->empty()) { mapType = salmon::utils::MappingType::SINGLE_MAPPED; } + alevin::types::AlevinCellBarcodeKmer bck; bool barcode_ok = bck.fromChars(barcode); // NOTE: Think if we should put decoy mappings in the RAD file if (mapType == salmon::utils::MappingType::SINGLE_MAPPED) { // num aln - bw << static_cast(accepted_hits.size()); + bw << static_cast(accepted_hits_ptr->size()); // bc // if we can fit the barcode into an integer @@ -941,7 +627,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re bw << umi; } - for (auto& aln : accepted_hits) { + for (auto& aln : (*accepted_hits_ptr)) { uint32_t fw_mask = aln.is_fw ? 0x80000000 : 0x00000000; bw << (aln.tid | fw_mask); } @@ -969,9 +655,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re bw << num_reads_in_chunk; } - validHits += accepted_hits.size(); - localNumAssignedFragments += (accepted_hits.size() > 0); - localNumUniqueMappings += (accepted_hits.size() == 1) ? 1 : 0; + validHits += accepted_hits_ptr->size(); + localNumAssignedFragments += (accepted_hits_ptr->size() > 0); + localNumUniqueMappings += (accepted_hits_ptr->size() == 1) ? 1 : 0; locRead++; ++numObservedFragments; jointHitGroup.clearAlignments(); @@ -1028,7 +714,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re } numAssignedFragments += localNumAssignedFragments; numUniqueMappings += localNumUniqueMappings; - mstats.numDecoyFragments += numDecoyFrags; + mstats.numDecoyFragments += 0; // no decoys with pasc (yet?) readExp.updateShortFrags(shortFragStats); } From f66d7e8ce0f212513779861e942a7140f9f058e5 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 27 Oct 2022 14:55:10 -0400 Subject: [PATCH 04/10] convert several counters to uint64_t (#806) --- include/AlevinOpts.hpp | 34 +++++++++++------------ src/Alevin.cpp | 62 +++++++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/include/AlevinOpts.hpp b/include/AlevinOpts.hpp index 68400149b..8d3c3b21b 100644 --- a/include/AlevinOpts.hpp +++ b/include/AlevinOpts.hpp @@ -86,11 +86,11 @@ struct AlevinOpts { // initialize EM with uniform prior bool initUniform; //number of cells - uint32_t numCells; + uint64_t numCells; // minimum number of CB to use for low confidence region - uint32_t lowRegionMinNumBarcodes; + uint64_t lowRegionMinNumBarcodes; // maximum number of barcodes to use - uint32_t maxNumBarcodes; + uint64_t maxNumBarcodes; // number of bootstraps to perform uint32_t numBootstraps; // number of gibbs samples to perform @@ -119,27 +119,27 @@ struct AlevinOpts { boost::filesystem::path bfhFile; //meta-info related tags - uint32_t totalReads; - uint32_t totalUsedReads; - uint32_t readsThrown; + uint64_t totalReads; + uint64_t totalUsedReads; + uint64_t readsThrown; - uint32_t totalCBs; - uint32_t totalUsedCBs; + uint64_t totalCBs; + uint64_t totalUsedCBs; - uint32_t kneeCutoff; - uint32_t intelligentCutoff; - uint32_t totalLowConfidenceCBs; - uint32_t numFeatures; - uint32_t numNoMapCB; + uint64_t kneeCutoff; + uint64_t intelligentCutoff; + uint64_t totalLowConfidenceCBs; + uint64_t numFeatures; + uint64_t numNoMapCB; - uint32_t eqReads; - uint32_t noisyUmis; + uint64_t eqReads; + uint64_t noisyUmis; double mappingRate; double keepCBFraction; double vbemNorm; - uint32_t totalDedupUMIs; - uint32_t totalExpGenes; + uint64_t totalDedupUMIs; + uint64_t totalExpGenes; }; #endif // ALEVIN_OPTS_HPP diff --git a/src/Alevin.cpp b/src/Alevin.cpp index 8246b7263..536649145 100644 --- a/src/Alevin.cpp +++ b/src/Alevin.cpp @@ -164,10 +164,10 @@ std::vector sort_indexes(const std::vector &v) { // reference from https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py // and https://github.com/CGATOxford/UMI-tools/blob/master/umi_tools/umi_methods.py#L193 -bool gaussianKDE(const std::vector& freqCounter, +bool gaussianKDE(const std::vector& freqCounter, const std::vector& sortedIdx, double& invCovariance, double& normFactor, - uint32_t expect_cells, uint32_t& predicted_cells){ + uint64_t expect_cells, uint64_t& predicted_cells){ double covariance{0.0}, mean{0.0}, bw_method {0.01}, threshold = 0.001*freqCounter[sortedIdx[0]]; std::vector logDataset; size_t numElem, xSpace {10000}; @@ -218,7 +218,7 @@ bool gaussianKDE(const std::vector& freqCounter, } //calculating the argrelextrema - std::vector local_mins; + std::vector local_mins; for (size_t i=1; i density[i] and density[i] < density[i+1]){ local_mins.emplace_back(i); @@ -247,26 +247,26 @@ bool gaussianKDE(const std::vector& freqCounter, return false; } -uint32_t getLeftBoundary(std::vector& sortedIdx, - uint32_t topxBarcodes, - const std::vector& freqCounter){ +uint64_t getLeftBoundary(std::vector& sortedIdx, + uint64_t topxBarcodes, + const std::vector& freqCounter){ // iterate in reverse order since sortedIdx is sorted in decreasing // order double cumCount{0.0}; std::vector freqs(topxBarcodes); - for(uint32_t i = 0; i < topxBarcodes; i++){ + for(uint64_t i = 0; i < topxBarcodes; i++){ size_t ind = sortedIdx[topxBarcodes-i-1]; cumCount += freqCounter[ind]; freqs[i] = std::log(cumCount); } - std::vector X(topxBarcodes); + std::vector X(topxBarcodes); std::iota(X.begin(), X.end(), 0); bool isUp; - uint32_t x; + uint64_t x; double y, slope, leftExtreme{freqs[0]}; - for(uint32_t j=0; j& sortedIdx, slope = y/x; // fill in the values for fitted line std::transform(X.begin()+nextBcIdx, X.end(), Y.begin(), - [slope](uint32_t i) {return i*slope;} ); + [slope](uint64_t i) {return i*slope;} ); isUp = false; double curveY, lineY; @@ -294,9 +294,9 @@ uint32_t getLeftBoundary(std::vector& sortedIdx, if(isUp == false){ // ignoring all the frequencies having same frequency as cutoff - uint32_t cutoff = topxBarcodes-j; - uint32_t cutoffFrequency = freqCounter[sortedIdx[cutoff]]; - uint32_t nearestLeftFrequency = cutoffFrequency; + uint64_t cutoff = topxBarcodes-j; + uint64_t cutoffFrequency = freqCounter[sortedIdx[cutoff]]; + uint64_t nearestLeftFrequency = cutoffFrequency; while(nearestLeftFrequency == cutoffFrequency){ nearestLeftFrequency = freqCounter[sortedIdx[--cutoff]]; } @@ -310,13 +310,13 @@ uint32_t getLeftBoundary(std::vector& sortedIdx, void dumpFeatures(std::string& frequencyFileName, std::vector& sortedIdx, - const std::vector& freqCounter, + const std::vector& freqCounter, std::unordered_map>& colMap ){ std::ofstream freqFile; freqFile.open(frequencyFileName); for (auto i:sortedIdx){ - uint32_t count = freqCounter[i]; + uint64_t count = freqCounter[i]; if (count == 0){ break; } @@ -330,7 +330,7 @@ void dumpFeatures(std::string& frequencyFileName, Knee calculation and sample true set of barcodes */ template -void sampleTrueBarcodes(const std::vector& freqCounter, +void sampleTrueBarcodes(const std::vector& freqCounter, TrueBcsT& trueBarcodes, size_t& lowRegionNumBarcodes, std::unordered_map> colMap, AlevinOpts& aopt){ @@ -338,7 +338,7 @@ void sampleTrueBarcodes(const std::vector& freqCounter, size_t maxNumBarcodes { aopt.maxNumBarcodes }, lowRegionMaxNumBarcodes { 1000 }; size_t lowRegionMinNumBarcodes { aopt.lowRegionMinNumBarcodes }; double lowConfidenceFraction { 0.5 }; - uint32_t topxBarcodes = std::min(maxNumBarcodes, freqCounter.size()); + uint64_t topxBarcodes = std::min(maxNumBarcodes, freqCounter.size()); uint64_t history { 0 }; if (aopt.forceCells > 0) { @@ -358,13 +358,13 @@ void sampleTrueBarcodes(const std::vector& freqCounter, // Expect Cells algorithm is taken from // https://github.com/10XGenomics/cellranger/blob/e5396c6c444acec6af84caa7d3655dd33a162852/lib/python/cellranger/stats.py#L138 - constexpr uint32_t MAX_FILTERED_CELLS = 2; + constexpr uint64_t MAX_FILTERED_CELLS = 2; constexpr double UPPER_CELL_QUANTILE = 0.01; constexpr double FRACTION_MAX_FREQUENCY = 0.1; - uint32_t baselineBcs = std::max(1, static_cast( aopt.expectCells*UPPER_CELL_QUANTILE )); + uint64_t baselineBcs = std::max(static_cast(1), static_cast( aopt.expectCells*UPPER_CELL_QUANTILE )); double cutoffFrequency = std::max(1.0, freqCounter[sortedIdx[baselineBcs]] * FRACTION_MAX_FREQUENCY); - uint32_t maxNumCells = std::min(static_cast(freqCounter.size()), aopt.expectCells * MAX_FILTERED_CELLS); + uint64_t maxNumCells = std::min(static_cast(freqCounter.size()), aopt.expectCells * MAX_FILTERED_CELLS); topxBarcodes = maxNumCells; for(size_t i=baselineBcs; i& freqCounter, green, topxBarcodes, RESET_COLOR); double invCovariance {0.0}, normFactor{0.0}; - uint32_t gaussThreshold; + uint64_t gaussThreshold; bool isGaussOk = gaussianKDE(freqCounter, sortedIdx, invCovariance, normFactor, topxBarcodes, gaussThreshold); @@ -421,7 +421,7 @@ void sampleTrueBarcodes(const std::vector& freqCounter, } }//end-left-knee finding case - uint32_t fractionTrueBarcodes = static_cast(lowConfidenceFraction * topxBarcodes); + uint64_t fractionTrueBarcodes = static_cast(lowConfidenceFraction * topxBarcodes); if (fractionTrueBarcodes < lowRegionMinNumBarcodes){ lowRegionNumBarcodes = lowRegionMinNumBarcodes; @@ -442,8 +442,8 @@ void sampleTrueBarcodes(const std::vector& freqCounter, } topxBarcodes += lowRegionNumBarcodes; - uint32_t cutoffFrequency = freqCounter[sortedIdx[ topxBarcodes ]]; - uint32_t nearestLeftFrequency = cutoffFrequency; + uint64_t cutoffFrequency = freqCounter[sortedIdx[ topxBarcodes ]]; + uint64_t nearestLeftFrequency = cutoffFrequency; while(nearestLeftFrequency == cutoffFrequency && lowRegionNumBarcodes != 0){ nearestLeftFrequency = freqCounter[sortedIdx[--topxBarcodes]]; if (lowRegionNumBarcodes > lowRegionMinNumBarcodes) { lowRegionNumBarcodes--; } @@ -484,7 +484,7 @@ void indexBarcodes(AlevinOpts& aopt, SoftMapT& barcodeSoftMap){ std::unordered_set neighbors; std::unordered_map> ZMatrix; - uint32_t wrngWhiteListCount{0}; + uint64_t wrngWhiteListCount{0}; for (const auto trueBarcode: trueBarcodes){ neighbors.clear(); //find all neighbors to this true barcode @@ -493,7 +493,7 @@ void indexBarcodes(AlevinOpts& aopt, neighbors); for(const auto& neighbor : neighbors){ - uint32_t freq{0}; + uint64_t freq{0}; auto indexIt = freqCounter.find(neighbor); bool indexOk = indexIt != freqCounter.end(); //bool indexOk = freqCounter.find(neighbor, freq); @@ -542,7 +542,7 @@ bool writeFastq(AlevinOpts& aopt, SoftMapT& barcodeMap, TrueBcsT& trueBarcodes){ size_t rangeSize{0}; - uint32_t totNumBarcodes{0}; + uint64_t totNumBarcodes{0}; std::string barcode( aopt.protocol.barcodeLength, 'N'); std::string umi( aopt.protocol.umiLength, 'N'); @@ -717,7 +717,7 @@ void processBarcodes(std::vector& barcodeFiles, if (aopt.dumpfeatures) { aopt.jointLog->info("Sorting and dumping raw barcodes"); auto frequencyFileName = (aopt.outputDirectory / "raw_cb_frequency.txt").string(); - std::vector collapsedfrequency; + std::vector collapsedfrequency; std::unordered_map> collapMap; size_t ind{0}; for(auto fqIt = freqCounter.begin(); fqIt != freqCounter.end(); ++fqIt){ @@ -731,7 +731,7 @@ void processBarcodes(std::vector& barcodeFiles, } } else { - std::vector collapsedfrequency; + std::vector collapsedfrequency; std::unordered_map> collapMap; size_t ind{0}; for(auto fqIt = freqCounter.begin(); fqIt != freqCounter.end(); ++fqIt){ @@ -741,7 +741,7 @@ void processBarcodes(std::vector& barcodeFiles, } if (aopt.keepCBFraction != 0.0) { - aopt.forceCells = std::min(static_cast(aopt.keepCBFraction * freqCounter.size()), + aopt.forceCells = std::min(static_cast(aopt.keepCBFraction * freqCounter.size()), aopt.maxNumBarcodes); aopt.jointLog->info("Forcing to use {} cells", aopt.forceCells); aopt.jointLog->flush(); From 15ee150ec18cba24d8467df6bb5ab589c571dec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Sat, 5 Nov 2022 20:14:23 +0100 Subject: [PATCH 05/10] cmake/TestSalmonQuasi.cmake: more verbose. When trying to debug test failure, I ended up with an incomplete error messages which turned out to stem from a typo in the variable name, so I took the liberty to make the whole message a bit more verbose to help with the present and future debugging. --- cmake/TestSalmonQuasi.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/TestSalmonQuasi.cmake b/cmake/TestSalmonQuasi.cmake index 8cd8d29fb..92c6305d5 100644 --- a/cmake/TestSalmonQuasi.cmake +++ b/cmake/TestSalmonQuasi.cmake @@ -14,7 +14,10 @@ execute_process(COMMAND ${SALMON_QUASI_INDEX_CMD} ) if (SALMON_QUASI_INDEX_RESULT) - message(FATAL_ERROR "Error running ${SALMON_QUASI_INDEX_COMMAND}") + message(FATAL_ERROR + "While running: ${SALMON_QUASI_INDEX_CMD}\n" + "error: ${SALMON_QUASI_INDEX_RESULT}" + ) endif() set(SALMON_QUANT_COMMAND ${CMAKE_BINARY_DIR}/salmon quant -i sample_salmon_quasi_index -l IU -1 reads_1.fastq -2 reads_2.fastq -o sample_salmon_quasi_quant) @@ -23,7 +26,10 @@ execute_process(COMMAND ${SALMON_QUANT_COMMAND} RESULT_VARIABLE SALMON_QUASI_QUANT_RESULT ) if (SALMON_QUASI_QUANT_RESULT) - message(FATAL_ERROR "Error running ${SALMON_QUASI_QUANT_RESULT}") + message(FATAL_ERROR + "While running: ${SALMON_QUANT_COMMAND}\n" + "error: ${SALMON_QUASI_QUANT_RESULT}" + ) endif() if (EXISTS ${TOPLEVEL_DIR}/sample_data/sample_salmon_quasi_quant/quant.sf) From f2238f0303c6143848961c17582cbd7d73026ef7 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Fri, 9 Dec 2022 15:03:09 -0500 Subject: [PATCH 06/10] counter artificial increase in barcode length Subtract 2 to counter artificial increase in barcode length. Read1 length of 33 or 34 would result in error while being valid lengths. --- src/AlevinUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AlevinUtils.cpp b/src/AlevinUtils.cpp index a5bad4a9f..5bbb1613f 100644 --- a/src/AlevinUtils.cpp +++ b/src/AlevinUtils.cpp @@ -348,7 +348,7 @@ namespace alevin { (void)read2; pt.anchorPos = read.find(pt.anchorSeq); if (pt.anchorPos != std::string::npos && ( pt.anchorPos == pt.maxHairpinIndexLen || pt.anchorPos == pt.maxHairpinIndexLen -1) // only 2 possible values of pt.anchorPos - && read.length() >= pt.barcodeLength + pt.umiLength + pt.anchorSeqLen) { + && read.length() >= pt.barcodeLength + pt.umiLength + pt.anchorSeqLen -2) { // Subtract 2 to counter artificial increase in barcode length std::string bcAssign = read.substr(0,pt.anchorPos) + read.substr(pt.anchorPos + pt.anchorSeqLen + pt.umiLength, pt.rtIdxLen); if (pt.anchorPos < pt.maxHairpinIndexLen) { // hairpin index can be 9 or 10 bp bcAssign += "AC"; From 65280174023797c5a286a799867a659c3b852236 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 23 Feb 2023 14:57:12 -0500 Subject: [PATCH 07/10] update libstaden --- CMakeLists.txt | 14 +++++++------- cmake/Modules/Findlibstadenio.cmake | 7 +++++-- src/CMakeLists.txt | 7 +++++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da0e4180e..42b7b9cba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -662,7 +662,7 @@ tar -xzvf v2021.5.tar.gz SOURCE_DIR ${TBB_SOURCE_DIR} INSTALL_DIR ${TBB_INSTALL_DIR} PATCH_COMMAND "${TBB_PATCH_STEP}" -CMAKE_ARGS -DCMAKE_CXX_FLAGS=${TBB_CXXFLAGS} -DCMAKE_INSTALL_PREFIX= -DTBB_TEST=OFF -DTBB_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} +CMAKE_ARGS -DCMAKE_CXX_FLAGS=${TBB_CXXFLAGS} -DCMAKE_INSTALL_PREFIX= -DTBB_TEST=OFF -DTBB_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} BUILD_IN_SOURCE TRUE ) @@ -794,12 +794,12 @@ if (NOT LIBSTADENIO_FOUND) message("==================================================================") externalproject_add(libstadenio DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external - DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/staden-io_lib/archive/v1.14.8.1.tar.gz -o staden-io_lib-v1.14.8.tar.gz && - ${SHASUM} f6f30eefa478cfb708f3109a35fb6ffa0e24951d9d971985df2cef5919dd0bc3 staden-io_lib-v1.14.8.tar.gz && - mkdir -p staden-io_lib-1.14.8 && - tar -xzf staden-io_lib-v1.14.8.tar.gz --strip-components=1 -C staden-io_lib-1.14.8 && + DOWNLOAD_COMMAND curl -k -L https://github.com/jkbonfield/io_lib/releases/download/io_lib-1-14-15/io_lib-1.14.15.tar.gz -o staden-io_lib-v1.14.15.tar.gz && + ${SHASUM} 20814c4365e1e2fe6630fb11d0df370dec4c5688af3871de7f1cb0129671401e staden-io_lib-v1.14.15.tar.gz && + mkdir -p staden-io_lib-1.14.15 && + tar -xzf staden-io_lib-v1.14.15.tar.gz --strip-components=1 -C staden-io_lib-1.14.15 && rm -fr staden-io_lib && - mv -f staden-io_lib-1.14.8 staden-io_lib + mv -f staden-io_lib-1.14.15 staden-io_lib SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/staden-io_lib INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install CONFIGURE_COMMAND ./configure --enable-shared=no --without-libcurl --prefix= LDFLAGS=${LIBSTADEN_LDFLAGS} CFLAGS=${LIBSTADEN_CFLAGS} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} @@ -814,7 +814,7 @@ if (NOT LIBSTADENIO_FOUND) endif() set(FETCHED_STADEN TRUE) - set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a") + set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libhtscodecs.a;${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a") endif() if (ASAN_BUILD) diff --git a/cmake/Modules/Findlibstadenio.cmake b/cmake/Modules/Findlibstadenio.cmake index 9e8c2b932..3dea8b67a 100644 --- a/cmake/Modules/Findlibstadenio.cmake +++ b/cmake/Modules/Findlibstadenio.cmake @@ -10,7 +10,10 @@ find_path(STADEN_INCLUDE_DIR io_lib HINTS ${STADEN_ROOT} ENV STADEN_ROOT PATH_SUFFIXES include) -find_library(STADEN_LIBRARY NAMES staden-read libstaden-read +find_library(STADEN_LIBRARY NAMES staden-read libstaden-read + HINTS ${STADEN_ROOT} ENV STADEN_ROOT PATH_SUFFIXES lib lib64) + +find_library(HTSCODEC_LIBRARY NAMES htscodecs libhtscodecs HINTS ${STADEN_ROOT} ENV STADEN_ROOT PATH_SUFFIXES lib lib64) if(STADEN_INCLUDE_DIR) @@ -30,7 +33,7 @@ find_package_handle_standard_args(libstadenio DEFAULT_MSG if (LIBSTADENIO_FOUND) message(STATUS "Staden IOLib found (include: ${STADEN_INCLUDE_DIR})") - set(STADEN_LIBRARIES ${STADEN_LIBRARY}) + set(STADEN_LIBRARIES "${STADEN_LIBRARY};${HTSCODEC_LIBRARY}") endif() mark_as_advanced(STADEN_INCLUDE_DIR STADEN_LIBRARY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dafffedd8..1aeb7388d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -229,6 +229,7 @@ add_dependencies(salmon ksw2pp) add_dependencies(salmon salmon_core) add_dependencies(salmon alevin_core) if(TBB_RECONFIGURE OR TBB_TARGET_EXISTED) + # Link the executable target_link_libraries(salmon Threads::Threads @@ -240,7 +241,7 @@ target_link_libraries(salmon gff ${Boost_LIBRARIES} ${ICU_LIBS} - ${STADEN_LIBRARIES} ${CURL_LIBRARIES} + ${CURL_LIBRARIES} ${ZLIB_LIBRARY} m ${LIBLZMA_LIBRARIES} @@ -253,6 +254,7 @@ target_link_libraries(salmon ${FAST_MALLOC_LIB} TBB::tbb TBB::tbbmalloc + ${STADEN_LIBRARIES} ${LIBRT} ${CMAKE_DL_LIBS} ) @@ -273,12 +275,13 @@ if(TBB_RECONFIGURE OR TBB_TARGET_EXISTED) target_link_libraries(unitTests Threads::Threads ## PUFF_INTEGRATION + ${Boost_LIBRARIES} salmon_core alevin_core gff UnitTestsMain - ${STADEN_LIBRARIES} ${Boost_LIBRARIES} + ${STADEN_LIBRARIES} ${ICU_LIBS} ${CURL_LIBRARIES} ${ZLIB_LIBRARY} From a8f2920737b256b68c458193743b3d40c8a4a560 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 23 Feb 2023 17:07:56 -0500 Subject: [PATCH 08/10] prepare for version bump --- CMakeLists.txt | 2 +- current_version.txt | 2 +- doc/source/conf.py | 4 ++-- docker/Dockerfile | 2 +- docker/build_test.sh | 2 +- include/SalmonConfig.hpp | 4 ++-- scripts/fetchPufferfish.sh | 6 +++--- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 42b7b9cba..4ab4a91f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -786,7 +786,7 @@ find_package(CURL) if (FETCH_STADEN) set(LIBSTADEN_FOUND FALSE) else () - find_package(libstadenio) + find_package(libstadenio 1.14.15) endif() if (NOT LIBSTADENIO_FOUND) diff --git a/current_version.txt b/current_version.txt index 40f96c219..227e558bd 100644 --- a/current_version.txt +++ b/current_version.txt @@ -1,3 +1,3 @@ VERSION_MAJOR 1 -VERSION_MINOR 9 +VERSION_MINOR 10 VERSION_PATCH 0 diff --git a/doc/source/conf.py b/doc/source/conf.py index ebc201b0f..3cb812f4e 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = '1.9' +version = '1.10' # The full version, including alpha/beta/rc tags. -release = '1.9.0' +release = '1.10.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docker/Dockerfile b/docker/Dockerfile index 70009a298..578b25b4d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,7 +6,7 @@ MAINTAINER salmon.maintainer@gmail.com ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \ ca-certificates zlib1g-dev libcurl4-openssl-dev curl unzip autoconf apt-transport-https ca-certificates gnupg software-properties-common wget -ENV SALMON_VERSION 1.9.0 +ENV SALMON_VERSION 1.10.0 # salmon binary will be installed in /home/salmon/bin/salmon diff --git a/docker/build_test.sh b/docker/build_test.sh index 02ce42f6e..a52edf11f 100644 --- a/docker/build_test.sh +++ b/docker/build_test.sh @@ -1,3 +1,3 @@ #! /bin/bash -SALMON_VERSION=1.9.0 +SALMON_VERSION=1.10.0 docker build --no-cache -t combinelab/salmon:${SALMON_VERSION} -t combinelab/salmon:latest . diff --git a/include/SalmonConfig.hpp b/include/SalmonConfig.hpp index 8d126a197..24dcebd7c 100644 --- a/include/SalmonConfig.hpp +++ b/include/SalmonConfig.hpp @@ -26,9 +26,9 @@ namespace salmon { constexpr char majorVersion[] = "1"; -constexpr char minorVersion[] = "9"; +constexpr char minorVersion[] = "10"; constexpr char patchVersion[] = "0"; -constexpr char version[] = "1.9.0"; +constexpr char version[] = "1.10.0"; constexpr uint32_t indexVersion = 5; constexpr char requiredQuasiIndexVersion[] = "p7"; } // namespace salmon diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index 8c737f088..1fb1ae029 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -23,11 +23,11 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then rm -fr ${INSTALL_DIR}/src/pufferfish fi -#SVER=salmon-v1.9.0 -SVER=develop +SVER=salmon-v1.9.0 +#SVER=develop #SVER=sketch-mode -EXPECTED_SHA256=2a862daeff95a19c9b188bc26a5d02fc0ecc5b9c9ae5523a67c9d14e62551c5d +EXPECTED_SHA256=c961b9c252856b53c6538d22103b711c924ea1e649516de81efb85870aa8b143 mkdir -p ${EXTERNAL_DIR} curl -k -L https://github.com/COMBINE-lab/pufferfish/archive/${SVER}.zip -o ${EXTERNAL_DIR}/pufferfish.zip From 45a9e244ee37afb86969718de0c9662d2d6d1d5d Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 23 Feb 2023 18:25:24 -0500 Subject: [PATCH 09/10] update fetchPufferfish tag --- scripts/fetchPufferfish.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index 1fb1ae029..b067751dc 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -23,7 +23,7 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then rm -fr ${INSTALL_DIR}/src/pufferfish fi -SVER=salmon-v1.9.0 +SVER=salmon-v1.10.0 #SVER=develop #SVER=sketch-mode From 332e51fc74ebde1813f3ba7c683087044188a5c8 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 23 Feb 2023 21:32:58 -0500 Subject: [PATCH 10/10] order --- CMakeLists.txt | 2 +- src/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ab4a91f7..b5dd1cc53 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -814,7 +814,7 @@ if (NOT LIBSTADENIO_FOUND) endif() set(FETCHED_STADEN TRUE) - set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libhtscodecs.a;${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a") + set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a;${GAT_SOURCE_DIR}/external/install/lib/libhtscodecs.a") endif() if (ASAN_BUILD) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1aeb7388d..eccbb8bbc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -244,6 +244,7 @@ target_link_libraries(salmon ${CURL_LIBRARIES} ${ZLIB_LIBRARY} m + ${STADEN_LIBRARIES} ${LIBLZMA_LIBRARIES} ${BZIP2_LIBRARIES} ${LIBSALMON_LINKER_FLAGS} @@ -254,7 +255,6 @@ target_link_libraries(salmon ${FAST_MALLOC_LIB} TBB::tbb TBB::tbbmalloc - ${STADEN_LIBRARIES} ${LIBRT} ${CMAKE_DL_LIBS} )