Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created a builder class for the Lift class. #21

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
83 changes: 43 additions & 40 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,55 +174,58 @@ target_link_libraries(leviosam ${SDSL_LIB})
target_link_libraries(leviosam pthread)
target_link_libraries(leviosam z)

option(BUILD_TESTS "Set OFF to not compile the tests" ON)

if(${BUILD_TESTS})
# Download and unpack googletest at configure time
configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
message(FATAL_ERROR "CMake step for googletest failed: ${result}")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
message(FATAL_ERROR "Build step for googletest failed: ${result}")
endif()

# Download and unpack googletest at configure time
configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
message(FATAL_ERROR "CMake step for googletest failed: ${result}")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
message(FATAL_ERROR "Build step for googletest failed: ${result}")
endif()

# Prevent overriding the parent project's compiler/linker
# settings on Windows
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Prevent overriding the parent project's compiler/linker
# settings on Windows
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

enable_testing()
enable_testing()

# Add googletest directly to our build. This defines
# the gtest and gtest_main targets.
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)
# Add googletest directly to our build. This defines
# the gtest and gtest_main targets.
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)


add_executable(leviosam_test src/leviosam_test.cpp)
target_link_libraries(leviosam_test lvsam)
target_link_libraries(leviosam_test ${HTS_LIB})
target_link_libraries(leviosam_test ${SDSL_LIB})
target_link_libraries(leviosam_test gtest gtest_main)
add_executable(leviosam_test src/leviosam_test.cpp)
target_link_libraries(leviosam_test lvsam)
target_link_libraries(leviosam_test ${HTS_LIB})
target_link_libraries(leviosam_test ${SDSL_LIB})
target_link_libraries(leviosam_test gtest gtest_main)

add_test(NAME leviosam_test COMMAND leviosam_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)
add_test(NAME leviosam_test COMMAND leviosam_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)


add_executable(chain_test src/chain_test.cpp)
target_link_libraries(chain_test lvsam)
target_link_libraries(chain_test ${HTS_LIB})
target_link_libraries(chain_test ${SDSL_LIB})
target_link_libraries(chain_test gtest gtest_main)
add_executable(chain_test src/chain_test.cpp)
target_link_libraries(chain_test lvsam)
target_link_libraries(chain_test ${HTS_LIB})
target_link_libraries(chain_test ${SDSL_LIB})
target_link_libraries(chain_test gtest gtest_main)

add_test(NAME chain_test COMMAND chain_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)
add_test(NAME chain_test COMMAND chain_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata
)
endif()

INSTALL(TARGETS leviosam DESTINATION bin)
INSTALL(FILES src/leviosam.hpp DESTINATION include)
Expand Down
159 changes: 106 additions & 53 deletions src/leviosam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ namespace lift {
// Serialization
class Name2IdxMap: public std::unordered_map<std::string,int> {
public:
size_t serialize(std::ofstream& out) {
size_t serialize(std::ostream& out) {
size_t size = 0;
size_t map_size = this->size();
out.write(reinterpret_cast<char*>(&map_size), sizeof(map_size));
Expand All @@ -117,7 +117,7 @@ class Name2IdxMap: public std::unordered_map<std::string,int> {
return size;
}

void load(std::ifstream& in) {
void load(std::istream& in) {
size_t map_size;
in.read(reinterpret_cast<char*>(&map_size), sizeof(map_size));
for (auto i = 0; i < map_size; ++i) {
Expand All @@ -137,7 +137,7 @@ class Name2IdxMap: public std::unordered_map<std::string,int> {

class Name2NameMap : public std::unordered_map<std::string,std::string> {
public:
size_t serialize(std::ofstream& out) {
size_t serialize(std::ostream& out) {
size_t size = 0;
size_t map_size = this->size();
out.write(reinterpret_cast<char*>(&map_size), sizeof(map_size));
Expand All @@ -154,7 +154,7 @@ class Name2NameMap : public std::unordered_map<std::string,std::string> {
return size;
}

void load(std::ifstream& in) {
void load(std::istream& in) {
size_t map_size;
in.read(reinterpret_cast<char*>(&map_size), sizeof(map_size));
for (auto i = 0; i < map_size; ++i) {
Expand All @@ -173,6 +173,85 @@ class Name2NameMap : public std::unordered_map<std::string,std::string> {
}
};

// Builder class for lift construction
class Lift_builder
{
friend class Lift;

protected:
sdsl::bit_vector m_ibv;
sdsl::bit_vector m_dbv;
sdsl::bit_vector m_sbv;
size_t m_size = 0; // Size of the contig
size_t m_x = 0;
int m_ppos = 0;
int m_prev_is_ins = 0; // 1 if last variant is an insertion
public:
Lift_builder()
{

}

Lift_builder(const size_t l)
{
init(l);
}

inline void init(const size_t l)
{
m_ibv = sdsl::bit_vector(l*2);
m_dbv = sdsl::bit_vector(l*2);
m_sbv = sdsl::bit_vector(l*2);
m_size = l;
m_x = 0;
m_ppos = 0;
m_prev_is_ins = 0;
}

inline int prev_is_ins() const {return m_prev_is_ins;}
inline int contig_size() const {return m_size;}

inline void set(
const size_t rpos, // Position in the reference
const int var_type, // Type of the variant
const int rlen, // Length of the reference allele variant
const int alen // Length of the alternate allele variant
)
{
m_x += (rpos - m_ppos); // x should now be pointing to *start* of current variant wrt alignment
m_ppos = rpos; // m_ppos now pointing to *start* of current variant
if (var_type == VCF_INDEL) {
++m_ppos;
++m_x;
m_prev_is_ins = 0;
if (rlen < alen) { // ins
for (size_t i = 0; i < alen - rlen; ++i) {
m_ibv[m_x++] = 1;
}
m_prev_is_ins = 1;
} else if (rlen > alen) { // del
for (size_t i = 0; i < rlen - alen; ++i) {
m_dbv[m_x++] = 1;
++m_ppos; // s1 advances, so m_ppos advances
}
}
--m_x;
--m_ppos;
} else if (var_type == VCF_SNP) {
m_sbv[m_x] = 1; // no need to increment x here?
}
}

inline void finalize()
{
if (m_ppos > m_size) {
die( "something went wrong, we went past bounds of s1 sequence. exiting...\n");
}
m_ibv.resize(m_x + (m_size - m_ppos));
m_dbv.resize(m_x + (m_size - m_ppos));
m_sbv.resize(m_x + (m_size - m_ppos));
}
};

// liftover data structure for a single sequence
class Lift {
Expand All @@ -183,7 +262,7 @@ class Lift {
init_rs_sls();
}

Lift(std::ifstream& in) {
Lift(std::istream& in) {
this->load(in);
}

Expand All @@ -196,6 +275,16 @@ class Lift {
init_rs_sls();
}

// construct liftover from builder
Lift(Lift_builder& builder) :
ins(builder.m_ibv),
del(builder.m_dbv),
snp(builder.m_sbv)
{
// create rank and select data-structures for everything
init_rs_sls();
}

Lift(const Lift& rhs) : ins(rhs.ins), del(rhs.del), snp(rhs.snp) {
init_rs_sls();
}
Expand Down Expand Up @@ -375,7 +464,7 @@ class Lift {
// }

// Save to stream
size_t serialize(std::ofstream& out) const {
size_t serialize(std::ostream& out) const {
size_t size = 0;
size += ins.serialize(out);
size += del.serialize(out);
Expand Down Expand Up @@ -420,7 +509,7 @@ class LiftMap {

~LiftMap() {}

LiftMap(std::ifstream& in) {
LiftMap(std::istream& in) {
this->load(in);
}

Expand Down Expand Up @@ -467,15 +556,13 @@ class LiftMap {
int lmap_idx = 0; // current index of lmap


sdsl::bit_vector ibv, dbv, sbv; // ins, del, snp
Lift_builder builder;
bool no_sample = (sample_name == "");
if (no_sample) fprintf(stderr, "no sample given, assuming GT=1 for all variants\n");
if (!no_sample && bcf_hdr_set_samples(hdr, sample_name.c_str(), 0))
die("error: sample does not exist!\n");
bcf1_t* rec = bcf_init();
size_t x = 0;
int rid = -1;
int ppos = 0;
int tppos = 0;
size_t l = 0;
int hap = stoi(haplotype);
Expand All @@ -484,13 +571,8 @@ class LiftMap {
if (rec->rid != rid) {
if (rid != -1) { // save the bitvector
// condense the bit vectors and add to map
if (ppos > l) {
die( "something went wrong, we went past bounds of s1 sequence. exiting...\n");
}
ibv.resize(x + (l - ppos));
dbv.resize(x + (l - ppos));
sbv.resize(x + (l - ppos));
lmap.push_back(Lift(ibv,dbv,sbv));
builder.finalize();
lmap.push_back(Lift(builder));
}
rid = rec->rid;
if (get_names) {
Expand All @@ -512,11 +594,7 @@ class LiftMap {
}
// initialize bit vectors for this contig
// TODO: maybe set vec size to something less than 2l
ibv = sdsl::bit_vector(l*2);
dbv = sdsl::bit_vector(l*2);
sbv = sdsl::bit_vector(l*2);
x = 0;
ppos = 0;
builder.init(l);
tppos = 0; // end of last variant processed
}
int32_t* gt_arr = NULL;
Expand Down Expand Up @@ -555,40 +633,15 @@ class LiftMap {
}

// at this point, ppos and x should point to *end* of last variant in s1 & s2 alignment, resp.
x += (rec->pos - ppos); // x should now be pointing to *start* of current variant wrt alignment
ppos = rec->pos; // ppos now pointing to *start* of current variant
if (var_type == VCF_INDEL) {
++ppos;
++x;
prev_is_ins = 0;
if (rlen < alen) { // ins
for (size_t i = 0; i < alen - rlen; ++i) {
ibv[x++] = 1;
}
prev_is_ins = 1;
} else if (rlen > alen) { // del
for (size_t i = 0; i < rlen - alen; ++i) {
dbv[x++] = 1;
++ppos; // s1 advances, so ppos advances
}
}
--x;
--ppos;
} else if (var_type == VCF_SNP) {
sbv[x] = 1; // no need to increment x here?
}
builder.set(rec->pos,var_type,rlen,alen);
prev_is_ins = builder.prev_is_ins();
tppos = rec->pos + rec->rlen - 1; // tppos points to end of this variant
} else {
continue;
}
}
if (ppos > l) {
die("something went wrong, we went past bounds of s1 sequence. exiting...\n");
}
ibv.resize(x + (l - ppos));
dbv.resize(x + (l - ppos));
sbv.resize(x + (l - ppos));
lmap.push_back(Lift(ibv,dbv,sbv));
builder.finalize();
lmap.push_back(Lift(builder));
length_map = ls;
}

Expand Down Expand Up @@ -695,7 +748,7 @@ class LiftMap {
}

// saves to stream
size_t serialize(std::ofstream& out) {
size_t serialize(std::ostream& out) {
size_t size = 0;
size_t nelems = lmap.size();
out.write(reinterpret_cast<char*>(&nelems), sizeof(nelems));
Expand All @@ -711,7 +764,7 @@ class LiftMap {
}

// loads from stream
void load(std::ifstream& in) {
void load(std::istream& in) {
size_t nelems;
in.read(reinterpret_cast<char*>(&nelems), sizeof(nelems));
for (auto i = 0; i < nelems; ++i) {
Expand Down
Loading