Skip to content

Commit

Permalink
New contig assembly method. Fixed an issue with CRAM reading. (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
kvg authored Oct 15, 2024
1 parent 321de9b commit 632fa90
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 37 deletions.
23 changes: 13 additions & 10 deletions src/hidive/src/coassemble.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,34 @@ pub fn start(
let m = MLdBG::from_ldbgs(vec![l1, s1])
.score_kmers(model_path)
.collapse()
.clean_color_specific_paths(1, 0.2)
.clean_tangles(1, 100, 0.2)
.clean_branches(0.01)
.clean_tips(3*kmer_size, 0.01)
.clean_contigs(100)
.clean(0.2, 0.01)
.build_links(&all_lr_seqs, true);

skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len());

let progress_bar = skydive::utils::default_bounded_progress_bar("Correcting reads", all_lr_seqs.len() as u64);

let contigs =
let corrected_seqs =
all_lr_seqs
.par_iter()
.progress_with(progress_bar)
.map(|seq| m.correct_seq(seq))
.flatten()
.collect::<Vec<Vec<u8>>>();

// let contigs = m.assemble_all();
// let contigs = m.assemble_at_bubbles();

// let mut fa_file = File::create(&output).unwrap();
// for (i, contig) in corrected_seqs.iter().chain(contigs.iter()).enumerate() {
// let _ = writeln!(fa_file, ">contig_{}\n{}", i, String::from_utf8(contig.clone()).unwrap());
// }

skydive::elog!("Assembling diplotypes...");

let mut la = spoa::AlignmentEngine::new(AlignmentType::kSW, 5, -10, -16, -12, -20, -8);

let mut sg = spoa::Graph::new();
for lr_seq in all_ref_seqs.iter().chain(contigs.iter()) {
for lr_seq in all_ref_seqs.iter().chain(corrected_seqs.iter()) {
let seq_cstr = std::ffi::CString::new(lr_seq.clone()).unwrap();
let seq_qual = std::ffi::CString::new(vec![b'I'; lr_seq.len()]).unwrap();
let a = la.align(seq_cstr.as_ref(), &sg);
Expand Down Expand Up @@ -96,7 +99,7 @@ pub fn start(
let matrix = create_read_allele_matrix(&msa_strings);
let wmec = create_wmec_matrix(&matrix);

let (h1, h2) = skydive::wmec::phase(&wmec);
let (h1, _) = skydive::wmec::phase(&wmec);
let (hap1, hap2) = create_fully_phased_haplotypes(&msa_strings, &h1);

let mut output = File::create(output).expect("Failed to create output file");
Expand All @@ -115,7 +118,7 @@ fn create_fully_phased_haplotypes(lr_msas: &Vec<String>, h1: &Vec<u8>) -> (Strin

while index1 < lr_msas[0].len() {
let combined_base_counts = allele_counts(lr_msas, index1, index1+1);
let bases = allele_indices(lr_msas, index1, index1+1);
// let bases = allele_indices(lr_msas, index1, index1+1);

if combined_base_counts.len() == 0 {
index1 += 1;
Expand Down
14 changes: 6 additions & 8 deletions src/hidive/src/correct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,12 @@ pub fn start(

skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len());

/*
let contigs = m.assemble_all();
// let contigs = m.assemble_at_bubbles();

let mut fa_file = File::create(&output).unwrap();
for (i, contig) in contigs.iter().enumerate() {
let _ = writeln!(fa_file, ">contig_{}\n{}", i, String::from_utf8(contig.clone()).unwrap());
}
*/
// let mut fa_file = File::create(&output).unwrap();
// for (i, contig) in contigs.iter().enumerate() {
// let _ = writeln!(fa_file, ">contig_{}\n{}", i, String::from_utf8(contig.clone()).unwrap());
// }

let progress_bar = skydive::utils::default_bounded_progress_bar("Correcting reads", all_lr_seqs.len() as u64);

Expand All @@ -57,7 +55,7 @@ pub fn start(
.collect::<Vec<Vec<u8>>>();

skydive::elog!("Writing reads to {}", output.display());

let mut fa_file = File::create(&output).unwrap();
for (i, corrected_seq) in corrected_seqs.iter().enumerate() {
let _ = writeln!(fa_file, ">corrected_{}\n{}", i, String::from_utf8(corrected_seq.clone()).unwrap());
Expand Down
15 changes: 10 additions & 5 deletions src/hidive/src/rescue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,14 @@ pub fn start(
.expect("Could not write to file");
}

skydive::elog!(
"Wrote {} reads to {}.",
all_records.len().to_formatted_string(&Locale::en),
output.to_str().unwrap()
);
if !all_records.is_empty() {
skydive::elog!(
"Wrote {} reads to {}.",
all_records.len().to_formatted_string(&Locale::en),
output.to_str().unwrap()
);
} else {
skydive::elog!("No reads were found in the CRAM files. Aborting.");
std::process::exit(1);
}
}
64 changes: 50 additions & 14 deletions src/skydive/src/ldbg.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
use anyhow::Result;
use petgraph::Direction::Incoming;

use std::collections::{HashMap, HashSet, VecDeque};
use std::fs::File;
use std::io::Write;
use std::collections::{HashMap, HashSet};

use parquet::data_type::AsBytes;

Expand All @@ -13,8 +10,6 @@ use std::path::PathBuf;

use petgraph::graph::{DiGraph, NodeIndex};
use petgraph::visit::EdgeRef;
use petgraph::dot::{Dot, Config};
use petgraph::visit::{Dfs, DfsEvent};

use indicatif::{ProgressIterator, ParallelProgressIterator};
use rayon::iter::IntoParallelRefIterator;
Expand All @@ -34,13 +29,6 @@ type KmerScores = HashMap<Vec<u8>, f32>;
type Links = HashMap<Vec<u8>, HashMap<Link, u16>>;
type Sources = HashMap<Vec<u8>, Vec<usize>>;

#[derive(Clone, Copy, PartialEq, Eq)]
enum NodeState {
NotVisited,
InProgress,
Done,
}

/// Represents a linked de Bruijn graph with a k-mer size specified at construction time.
#[derive(Debug, Clone)]
pub struct LdBG {
Expand Down Expand Up @@ -1481,6 +1469,54 @@ impl LdBG {
contigs
}

/// Assemble contigs seeded at superbubble branch points.
///
/// Traverses the full k-mer graph, locates all superbubbles, and for each
/// simple path through a bubble (in both orientations) picks one node that
/// appears in no other path — a k-mer unique to that allele. Each such
/// unique k-mer seeds a call to `assemble`. Canonical k-mers of every
/// assembled contig are recorded so later bubbles do not re-seed assembly
/// of an already-covered region.
///
/// # Returns
///
/// The assembled contigs, one byte sequence per unique bubble path.
pub fn assemble_at_bubbles(&self) -> Vec<Vec<u8>> {
    let g = self.traverse_all_kmers();
    let bubbles = find_all_superbubbles(&g);

    let mut all_contigs = Vec::new();
    let mut visited = HashSet::new();

    for ((in_node, out_node), interior) in bubbles {
        // Enumerate simple paths through the bubble in both directions,
        // capped at the interior size so traversal stays inside the bubble.
        let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(&g, in_node, out_node, 0, Some(interior.len()));
        let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(&g, out_node, in_node, 0, Some(interior.len()));

        let paths: Vec<Vec<NodeIndex>> = paths_fwd.chain(paths_rev).collect();

        let mut unique_nodes = Vec::new();

        for (i, path) in paths.iter().enumerate() {
            // Nodes belonging to every path except the current one.
            let other_paths: HashSet<_> = paths.iter().enumerate()
                .filter(|&(j, _)| j != i)
                .flat_map(|(_, p)| p)
                .collect();

            // A node found only on this path uniquely identifies the allele.
            if let Some(unique_node) = path.iter().find(|&node| !other_paths.contains(node)) {
                let kmer = g.node_weight(*unique_node).unwrap().as_bytes().to_vec();
                let cn_kmer = crate::utils::canonicalize_kmer(&kmer);

                // Skip seeds already covered by a previously assembled contig.
                if !visited.contains(&cn_kmer) {
                    unique_nodes.push(*unique_node);
                }
            }
        }

        let contigs = unique_nodes.iter().map(|unique_node| {
            let kmer = g.node_weight(*unique_node).unwrap().as_bytes().to_vec();
            self.assemble(&kmer)
        }).collect::<Vec<_>>();

        // Record every canonical k-mer of the new contigs as visited first,
        // then move the contigs into the result. Iterating by reference here
        // avoids the deep clone of each contig the original code performed
        // (`all_contigs.extend(contigs.clone())`).
        for contig in &contigs {
            for kmer in contig.windows(self.kmer_size) {
                visited.insert(crate::utils::canonicalize_kmer(kmer));
            }
        }

        all_contigs.extend(contigs);
    }

    all_contigs
}

/// Update links available to inform navigation during graph traversal.
///
/// # Arguments
Expand Down Expand Up @@ -1553,7 +1589,7 @@ impl LdBG {
.clean_tangles(1, 100, lenient_threshold)
.clean_branches(strict_threshold)
.clean_tips(3*kmer_size, strict_threshold)
.clean_bubbles(2.0*lenient_threshold)
// .clean_bubbles(2.0*lenient_threshold)
.clean_contigs(100)
}

Expand Down
8 changes: 8 additions & 0 deletions wdl/HidiveCorrect.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@ task Rescue {
command <<<
set -euxo pipefail

wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.ref_cache.tar.gz

tar xzf Homo_sapiens_assembly38.ref_cache.tar.gz
export REF_PATH="$(pwd)/ref/cache/%2s/%2s/%s:http://www.ebi.ac.uk/ena/cram/md5/%s"
export REF_CACHE="$(pwd)/ref/cache/%2s/%2s/%s"

hidive rescue -f ~{long_reads_fasta} ~{short_reads_cram} > ~{prefix}.fa
>>>

Expand Down

0 comments on commit 632fa90

Please sign in to comment.