Merge branch 'develop'
rob-p committed Nov 23, 2021
2 parents fe11990 + b7a2167 commit 447ae4e
Showing 27 changed files with 9,909 additions and 5,836 deletions.
4 changes: 2 additions & 2 deletions current_version.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
VERSION_MAJOR 1
VERSION_MINOR 5
VERSION_PATCH 2
VERSION_MINOR 6
VERSION_PATCH 0
18 changes: 17 additions & 1 deletion doc/source/alevin.rst
@@ -1,10 +1,14 @@
Alevin
================

Alevin is a tool --- integrated with the salmon software --- that introduces a family of algorithms for quantification and analysis of 3' tagged-end single-cell sequencing data. Currently alevin supports the following two major droplet based single-cell protocols:
Alevin is a tool --- integrated with the salmon software --- that introduces a family of algorithms for quantification and analysis of 3' tagged-end single-cell sequencing data. Currently alevin supports the following single-cell protocols:

1. Drop-seq
2. 10x-Chromium v1/2/3
3. inDropV2
4. CELSeq 1/2
5. Quartz-Seq2
6. sci-RNA-seq3

Alevin works under the same indexing scheme as salmon for the reference, and consumes the set of FASTA/Q file(s) containing the Cellular Barcode (CB) + Unique Molecular Identifier (UMI) in one read file and the read sequence in the other. Given just the transcriptome and the raw read files, alevin generates a cell-by-gene count matrix (in a fraction of the time compared to other tools).

@@ -177,6 +181,18 @@ map end-to-end. Instead, the score of the mapping will be that of the position along the read attaining the
highest score. This is the score which must reach the fraction threshold for the read to be considered
valid.

Single-cell protocol-specific notes
------------------------------------

In cases where a single-cell protocol supports variable-length cell barcodes, alevin adds nucleotide padding to make the lengths uniform.
Furthermore, the padding scheme ensures that no collisions are introduced in the process. The padding scheme is as follows:

1. sci-RNA-seq3: The barcode is composed of a 9-10 bp hairpin adaptor and a 10 bp reverse transcription index, making it 19-20 bp long.
   Alevin appends `A` if the barcode is 20 bp long and `AC` if it is 19 bp long. Thus, every barcode in the output is 21 bp long.
2. inDropV2: An 8-11 bp barcode1 together with an 8 bp barcode2 makes up the barcode. For barcode lengths of 16, 17, 18, and 19 bp, alevin appends
   `AAAC`, `AAG`, `AT`, and `A`, respectively. Thus, every barcode in the output is 20 bp long. Furthermore, the position of barcode1
   depends on finding an exact match of the sequence `w1`. If no exact match is found, a search for `w1` is performed allowing a maximum Hamming
   distance of 2 between `w1` and a read2 substring of `w1`'s length within the required bounds; the first match is returned.
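The two padding rules above can be sketched as simple lookup functions. This is a minimal illustration assuming only what the list states; `padSciSeq3` and `padInDropV2` are hypothetical names, not alevin's internals:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// Sketch of the sci-RNA-seq3 rule: 19-20 bp barcodes are padded to 21 bp.
// The final padded character (C for 19 bp, A for 20 bp) encodes the original
// length, so two distinct inputs can never collide after padding.
std::string padSciSeq3(const std::string& bc) {
  if (bc.size() == 20) return bc + "A";
  if (bc.size() == 19) return bc + "AC";
  return bc; // unexpected length: leave unchanged
}

// Sketch of the inDropV2 rule: 16-19 bp barcodes are padded to 20 bp.
// Again, the last padded character (C, G, T, A respectively) encodes the
// original length, ruling out collisions between inputs of different lengths.
std::string padInDropV2(const std::string& bc) {
  static const std::unordered_map<std::size_t, std::string> pad{
      {16, "AAAC"}, {17, "AAG"}, {18, "AT"}, {19, "A"}};
  auto it = pad.find(bc.size());
  return it == pad.end() ? bc : bc + it->second;
}
```

Because every output ends in a character determined by the input length, equal outputs imply equal input lengths and hence equal inputs, which is why the scheme is collision-free.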

Output
------
6 changes: 3 additions & 3 deletions doc/source/conf.py
@@ -48,16 +48,16 @@

# General information about the project.
project = u'Salmon'
copyright = u'2013-2017, Rob Patro, Geet Duggal, Mike Love, Rafael Irizarry and Carl Kingsford'
copyright = u'2013-2021, Rob Patro, Geet Duggal, Mike Love, Rafael Irizarry and Carl Kingsford'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.5'
version = '1.6'
# The full version, including alpha/beta/rc tags.
release = '1.5.2'
release = '1.6.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -6,7 +6,7 @@ MAINTAINER [email protected]

ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \
ca-certificates zlib1g-dev libcurl4-openssl-dev curl unzip autoconf apt-transport-https ca-certificates gnupg software-properties-common wget
ENV SALMON_VERSION 1.5.2
ENV SALMON_VERSION 1.6.0

# salmon binary will be installed in /home/salmon/bin/salmon

2 changes: 1 addition & 1 deletion docker/build_test.sh
@@ -1,3 +1,3 @@
#! /bin/bash
SALMON_VERSION=1.5.2
SALMON_VERSION=1.6.0
docker build --no-cache -t combinelab/salmon:${SALMON_VERSION} -t combinelab/salmon:latest .
5 changes: 4 additions & 1 deletion include/AlevinUtils.hpp
@@ -22,6 +22,7 @@
#include <algorithm>
#include <limits>
#include <string>
#include <numeric>

#include "spdlog/spdlog.h"

@@ -72,6 +73,8 @@ namespace alevin{
void readWhitelist(bfs::path& filePath,
TrueBcsT& trueBarcodes);

unsigned int hammingDistance(const std::string s1, const std::string s2);

template <typename ProtocolT>
bool processAlevinOpts(AlevinOpts<ProtocolT>& aopt,
SalmonOpts& sopt, bool noTgMap,
@@ -97,7 +100,7 @@
OrderedOptionsT& orderedOptions) {
std::ofstream os(cmdInfoPath.string());
cereal::JSONOutputArchive oa(os);
oa(cereal::make_nvp("salmon_version:", std::string(salmon::version)));
oa(cereal::make_nvp("salmon_version", std::string(salmon::version)));
for (auto& opt : orderedOptions.options) {
if (opt.value.size() == 1) {
oa(cereal::make_nvp(opt.string_key, opt.value.front()));
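The header diff above declares a new `hammingDistance` helper; its definition is not part of this hunk, but a minimal version consistent with the declared signature might look like the following. This is a sketch, under the assumption that a length difference counts as extra mismatches:

```cpp
#include <algorithm>
#include <cassert>
#include <string>

// Sketch matching the declared signature (strings taken by value, as in the
// header). Counts positions where the two strings differ; any length
// difference is assumed to contribute one mismatch per extra character.
unsigned int hammingDistance(const std::string s1, const std::string s2) {
  std::size_t n = std::min(s1.size(), s2.size());
  unsigned int d = 0;
  for (std::size_t i = 0; i < n; ++i) d += (s1[i] != s2[i]);
  d += static_cast<unsigned int>(std::max(s1.size(), s2.size()) - n);
  return d;
}
```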
31 changes: 8 additions & 23 deletions include/ReadExperiment.hpp
@@ -25,6 +25,7 @@

// Boost includes
#include <boost/filesystem.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/range/irange.hpp>

// Cereal includes
@@ -48,10 +49,12 @@ class ReadExperiment {
public:
ReadExperiment(std::vector<ReadLibrary>& readLibraries,
// const boost::filesystem::path& transcriptFile,
const boost::filesystem::path& indexDirectory,
SalmonIndex* salmonIndex,
// const boost::filesystem::path& indexDirectory,
SalmonOpts& sopt)
: readLibraries_(readLibraries),
// transcriptFile_(transcriptFile),
salmonIndex_(salmonIndex),
transcripts_(std::vector<Transcript>()), totalAssignedFragments_(0),
fragStartDists_(5), posBiasFW_(5), posBiasRC_(5), posBiasExpectFW_(5),
posBiasExpectRC_(5), /*seqBiasModel_(1.0),*/ eqBuilder_(sopt.jointLog, sopt.maxHashResizeThreads),
@@ -115,24 +118,6 @@ class ReadExperiment {
}
*/

// ==== Figure out the index type
boost::filesystem::path versionPath = indexDirectory / "versionInfo.json";
SalmonIndexVersionInfo versionInfo;
versionInfo.load(versionPath);
if (versionInfo.indexVersion() == 0) {
fmt::MemoryWriter infostr;
infostr << "Error: The index version file " << versionPath.string()
<< " doesn't seem to exist. Please try re-building the salmon "
"index.";
throw std::invalid_argument(infostr.str());
}
// Check index version compatibility here
auto indexType = versionInfo.indexType();
// ==== Figure out the index type

salmonIndex_.reset(new SalmonIndex(sopt.jointLog, indexType));
salmonIndex_->load(indexDirectory);

// Now we'll have either an FMD-based index or a QUASI index
// dispatch on the correct type.
fmt::MemoryWriter infostr;
@@ -159,7 +144,7 @@
// Create the cluster forest for this set of transcripts
clusters_.reset(new ClusterForest(transcripts_.size(), transcripts_));
}

EQBuilderT& equivalenceClassBuilder() { return eqBuilder_; }

std::string getIndexSeqHash256() const { return salmonIndex_->seqHash256(); }
@@ -262,7 +247,7 @@ class ReadExperiment {
}
}

SalmonIndex* getIndex() { return salmonIndex_.get(); }
SalmonIndex* getIndex() { return salmonIndex_; }

template <typename PuffIndexT>
void loadTranscriptsFromPuff(PuffIndexT* idx_, const SalmonOpts& sopt) {
@@ -416,7 +401,7 @@ class ReadExperiment {
std::atomic<bool> burnedIn{
totalAssignedFragments_ + numAssignedFragments_ >= sopt.numBurninFrags};
for (auto& rl : readLibraries_) {
processReadLibrary(rl, salmonIndex_.get(), transcripts_, clusterForest(),
processReadLibrary(rl, salmonIndex_, transcripts_, clusterForest(),
*(fragLengthDist_.get()), numAssignedFragments_,
numThreads, burnedIn);
}
@@ -806,7 +791,7 @@ class ReadExperiment {
/**
* The index we've built on the set of transcripts.
*/
std::unique_ptr<SalmonIndex> salmonIndex_{nullptr};
SalmonIndex* salmonIndex_{nullptr};
/**
* The cluster forest maintains the dynamic relationship
* defined by transcripts and reads --- if two transcripts
6 changes: 3 additions & 3 deletions include/SalmonConfig.hpp
@@ -26,9 +26,9 @@

namespace salmon {
constexpr char majorVersion[] = "1";
constexpr char minorVersion[] = "5";
constexpr char patchVersion[] = "2";
constexpr char version[] = "1.5.2";
constexpr char minorVersion[] = "6";
constexpr char patchVersion[] = "0";
constexpr char version[] = "1.6.0";
constexpr uint32_t indexVersion = 5;
constexpr char requiredQuasiIndexVersion[] = "p7";
} // namespace salmon
1 change: 1 addition & 0 deletions include/SalmonDefaults.hpp
@@ -140,6 +140,7 @@ namespace defaults {
constexpr const bool isCELSeq{false};
constexpr const bool isCELSeq2{false};
constexpr const bool isQuartzSeq2{false};
constexpr const bool isSciSeq3{false};
constexpr const bool noQuant{false};
constexpr const bool dumpFQ{false};
constexpr const bool dumpArborescences{false};
5 changes: 5 additions & 0 deletions include/SalmonIndex.hpp
@@ -246,4 +246,9 @@ class SalmonIndex {
std::string decoyNameHash256_;
};

// Convenience function to load an index
std::unique_ptr<SalmonIndex>
checkLoadIndex(const boost::filesystem::path& indexDirectory,
std::shared_ptr<spdlog::logger>& logger);

#endif //__SALMON_INDEX_HPP
23 changes: 16 additions & 7 deletions include/SingleCellProtocols.hpp
@@ -124,17 +124,19 @@ namespace alevin{
DropSeq(): Rule(12, 8, BarcodeEnd::FIVE, 16777216){}
};

struct InDrop : Rule{
//InDrop starts from the 5' end with variable
//length barcodes so provide the full
// length of the barcode including w1.
// UMI length is 6
InDrop(): Rule(42, 6, BarcodeEnd::FIVE, 22347776){}
struct InDropV2 : Rule{
//InDropV2 starts from the 5' end with variable
//length barcodes where barcode1 varies from 8 to 11 bp,
// followed by the w1 sequence, 8 bp barcode2 and 6 bp UMI
InDropV2(): Rule(20, 6, BarcodeEnd::FIVE, 22347776){}

std::string w1;
std::size_t w1Length, maxHammingDist = 2, bc2Len = 8;
void setW1(std::string& w1_){
w1 = w1_;
w1Length = w1.length();
}
std::size_t w1Pos = 0, bc2EndPos;
};

struct CITESeq : Rule{
@@ -179,7 +181,14 @@ namespace alevin{
struct Custom : Rule{
Custom() : Rule(0,0,BarcodeEnd::FIVE,0){}
};

struct SciSeq3 : Rule{
SciSeq3() : Rule(21, 8, BarcodeEnd::FIVE, 1073741824){}
std::string anchorSeq = "CAGAGC";
std::size_t anchorSeqLen = anchorSeq.length();
std::size_t anchorPos = 0;
u_int16_t const maxHairpinIndexLen = 10;
u_int16_t const rtIdxLen = 10; // rev transcription index length
};
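The `SciSeq3` fields added above (an anchor sequence and two index lengths) suggest how the variable-length hairpin index can be located: the anchor is expected right after a 9- or 10-bp hairpin index, so the anchor's offset reveals the index length. A self-contained sketch for illustration only — `hairpinIndexLen` is a hypothetical helper, not alevin's actual parsing code:

```cpp
#include <cassert>
#include <string>

// Sketch: try the two admissible hairpin-index lengths and report the one at
// which the anchor sequence ("CAGAGC" by default) immediately follows.
// Returns -1 if the anchor is not found at an expected offset.
int hairpinIndexLen(const std::string& read, const std::string& anchor = "CAGAGC") {
  for (int len : {9, 10}) {
    if (read.size() >= static_cast<std::size_t>(len) + anchor.size() &&
        read.compare(len, anchor.size(), anchor) == 0)
      return len;
  }
  return -1;
}
```

This ignores the possibility of the hairpin index itself containing the anchor; a real implementation would need a policy for such reads.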

// for the new type of specification
struct CustomGeometry {
8 changes: 4 additions & 4 deletions include/cuckoohash_map.hh
@@ -113,7 +113,7 @@ public:
maximum_hashpower_(NO_MAXIMUM_HASHPOWER),
max_num_worker_threads_(0) {
all_locks_.emplace_back(std::min(bucket_count(), size_type(kMaxNumLocks)),
spinlock(), get_allocator());
get_allocator());
}

/**
@@ -695,7 +695,7 @@ private:

void add_locks_from_other(const cuckoohash_map &other) {
locks_t &other_locks = other.get_current_locks();
all_locks_.emplace_back(other_locks.size(), spinlock(), get_allocator());
all_locks_.emplace_back(other_locks.size(), get_allocator());
std::copy(other_locks.begin(), other_locks.end(),
get_current_locks().begin());
}
@@ -794,7 +794,7 @@ private:
// under this lock. One can compute the size of the table by summing the
// elem_counter over all locks.
//
// - is_migrated: When resizing with cuckoo_fast_doulbe, we do not
// - is_migrated: When resizing with cuckoo_fast_double, we do not
// immediately rehash elements from the old buckets array to the new one.
// Instead, we'll mark all of the locks as not migrated. So anybody trying to
// acquire the lock must also migrate the corresponding buckets if
@@ -1823,7 +1823,7 @@ private:
}

locks_t new_locks(std::min(size_type(kMaxNumLocks), new_bucket_count),
spinlock(), get_allocator());
get_allocator());
assert(new_locks.size() > current_locks.size());
std::copy(current_locks.begin(), current_locks.end(), new_locks.begin());
for (spinlock &lock : new_locks) {