Kvg stitch (#30)
* Added read error correction workflow
* Added .dockstore.yml
kvg authored Oct 11, 2024
1 parent e2e7193 commit 427f12f
Showing 6 changed files with 219 additions and 25 deletions.
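
For orientation, the new wdl/HidiveCorrect.wdl (shown in full below) chains four steps: fetch locus-spanning long reads, rescue matching short reads, error-correct the long reads, and align the corrected reads to the reference. A minimal shell sketch of the same pipeline, assembled from the commands in the WDL tasks below; the file names, locus, and model path are placeholders:

    # Placeholder inputs; each command mirrors a task in wdl/HidiveCorrect.wdl.
    hidive fetch -l "chr1:100000-200000" -p 500 long_reads.bam > fetched.fa
    hidive rescue -f fetched.fa short_reads.cram > rescued.fa
    hidive correct -m model.json -s rescued.fa fetched.fa > corrected.fa
    minimap2 -ayYL -x map-hifi reference.fa corrected.fa | samtools sort --write-index -O BAM -o corrected.bam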
4 changes: 4 additions & 0 deletions .bumpversion.cfg
@@ -19,3 +19,7 @@ replace = version = "{new_version}"
 [bumpversion:file:src/skydive/Cargo.toml]
 search = version = "{current_version}"
 replace = version = "{new_version}"
+
+[bumpversion:file:wdl/HidiveCorrect.wdl]
+search = version = "{current_version}"
+replace = version = "{new_version}"
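
With this stanza in place, a release bump rewrites the pinned version in wdl/HidiveCorrect.wdl alongside the Cargo manifests. A quick sketch, assuming the bumpversion (or bump2version) CLI is installed; the bump level is illustrative:

    bumpversion patch   # rewrites each version string listed in .bumpversion.cfg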
5 changes: 5 additions & 0 deletions .dockstore.yml
@@ -0,0 +1,5 @@
version: 1.2
workflows:
  - name: HidiveCorrect
    subclass: wdl
    primaryDescriptorPath: /wdl/HidiveCorrect.wdl
21 changes: 13 additions & 8 deletions src/hidive/src/coassemble.rs
@@ -7,7 +7,6 @@ use indicatif::ParallelProgressIterator;
 
 use skydive::ldbg::LdBG;
 use skydive::mldbg::MLdBG;
-use skydive::utils::*;
 
 use skydive::wmec::*;
 use spoa::AlignmentType;
@@ -16,21 +15,21 @@ pub fn start(
     output: &PathBuf,
     kmer_size: usize,
     model_path: &PathBuf,
-    long_read_fasta_paths: &Vec<PathBuf>,
-    short_read_fasta_paths: &Vec<PathBuf>,
     reference_fasta_paths: &Vec<PathBuf>,
+    long_read_fasta_path: PathBuf,
+    short_read_fasta_path: PathBuf,
 ) {
-    let long_read_seq_urls = skydive::parse::parse_file_names(long_read_fasta_paths);
-    let short_read_seq_urls = skydive::parse::parse_file_names(short_read_fasta_paths);
+    let long_read_seq_urls = skydive::parse::parse_file_names(&[long_read_fasta_path.clone()]);
+    let short_read_seq_urls = skydive::parse::parse_file_names(&[short_read_fasta_path.clone()]);
     let reference_seq_urls = skydive::parse::parse_file_names(reference_fasta_paths);
 
     // Read all long reads.
     skydive::elog!("Processing long-read samples {:?}...", long_read_seq_urls.iter().map(|url| url.as_str()).collect::<Vec<&str>>());
-    let all_lr_seqs = skydive::utils::read_fasta(long_read_fasta_paths);
+    let all_lr_seqs = skydive::utils::read_fasta(&vec![long_read_fasta_path.clone()]);
 
     // Read all short reads.
     skydive::elog!("Processing short-read samples {:?}...", short_read_seq_urls.iter().map(|url| url.as_str()).collect::<Vec<&str>>());
-    let all_sr_seqs = skydive::utils::read_fasta(short_read_fasta_paths);
+    let all_sr_seqs = skydive::utils::read_fasta(&vec![short_read_fasta_path]);
 
     // Read all reference subsequences.
     skydive::elog!("Processing reference {:?}...", reference_seq_urls.iter().map(|url| url.as_str()).collect::<Vec<&str>>());
@@ -47,7 +46,7 @@ pub fn start(
         .clean_branches(0.01)
         .clean_tips(3*kmer_size, 0.01)
         .clean_contigs(100)
-        .build_links(&all_lr_seqs, false);
+        .build_links(&all_lr_seqs, true);
 
     skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len());

@@ -61,6 +60,8 @@ pub fn start(
         .flatten()
         .collect::<Vec<Vec<u8>>>();
 
+    // let contigs = m.assemble_all();
+
     let mut la = spoa::AlignmentEngine::new(AlignmentType::kSW, 5, -10, -16, -12, -20, -8);
 
     let mut sg = spoa::Graph::new();
@@ -88,6 +89,10 @@ pub fn start(
         })
         .collect::<Vec<String>>();
 
+    // for msa_string in &msa_strings {
+    //     println!("{}", msa_string);
+    // }
+
     let matrix = create_read_allele_matrix(&msa_strings);
     let wmec = create_wmec_matrix(&matrix);
 
9 changes: 9 additions & 0 deletions src/hidive/src/correct.rs
@@ -41,6 +41,15 @@ pub fn start(
 
     skydive::elog!("Built MLdBG with {} k-mers.", m.kmers.len());
 
+    /*
+    let contigs = m.assemble_all();
+    let mut fa_file = File::create(&output).unwrap();
+    for (i, contig) in contigs.iter().enumerate() {
+        let _ = writeln!(fa_file, ">contig_{}\n{}", i, String::from_utf8(contig.clone()).unwrap());
+    }
+    */
+
     let progress_bar = skydive::utils::default_bounded_progress_bar("Correcting reads", all_lr_seqs.len() as u64);
 
     let corrected_seqs =
34 changes: 17 additions & 17 deletions src/hidive/src/main.rs
@@ -77,7 +77,7 @@ const DEFAULT_KMER_SIZE: usize = 17;
 
 #[derive(Debug, Subcommand)]
 enum Commands {
-    /// Train a graph-cleaning model using long- and (optionally) short-read data with ground truth assemblies.
+    /// Train an error-correction model with reads and ground-truth assemblies.
     #[clap(arg_required_else_help = true)]
     Train {
         /// Output path for trained model.
@@ -113,7 +113,7 @@ enum Commands {
         debug: bool,
     },
 
-    /// Stream selected loci from FASTA and long-read WGS BAM files stored locally or in Google Cloud Storage.
+    /// Stream loci from CRAM/BAM/FASTA files stored locally or in Google Cloud Storage.
     #[clap(arg_required_else_help = true)]
     Fetch {
         /// Output path for FASTA file with reads spanning locus of interest.
@@ -185,7 +185,7 @@ enum Commands {
         seq_paths: Vec<PathBuf>,
     },
 
-    /// Optionally further filter rescued reads to those most closely matching a long-read draft assembly.
+    /// Further filter rescued reads to those most closely matching a long-read draft assembly.
     #[clap(arg_required_else_help = true)]
     Filter {
         /// Output path for filtered short-read sequences.
@@ -269,7 +269,7 @@ enum Commands {
         graph: PathBuf,
     },
 
-    /// Correct reads.
+    /// Error-correct long reads using a linked de Bruijn graph.
     #[clap(arg_required_else_help = true)]
     Correct {
         /// Output path for corrected reads.
@@ -297,7 +297,7 @@ enum Commands {
         long_read_fasta_paths: Vec<PathBuf>,
     },
 
-    /// Co-assemble target locus from long-read and short-read data using a linked de Bruijn graph.
+    /// Co-assemble target locus from long- and short-read data using a linked de Bruijn graph.
     #[clap(arg_required_else_help = true)]
     Coassemble {
         /// Output path for assembled short-read sequences.
@@ -312,17 +312,17 @@ enum Commands {
         #[clap(short, long, required = true, value_parser)]
         model_path: PathBuf,
 
-        /// FASTA files with short-read sequences (may contain one or more samples).
-        #[clap(short, long, required = false, value_parser)]
-        short_read_fasta_paths: Vec<PathBuf>,
+        /// FASTA files with reference subsequences.
+        #[clap(short, long, required = true, value_parser)]
+        reference_fasta_paths: Vec<PathBuf>,
 
         /// FASTA files with long-read sequences (may contain one or more samples).
         #[clap(required = true, value_parser)]
-        long_read_fasta_paths: Vec<PathBuf>,
+        long_read_fasta_path: PathBuf,
 
-        /// FASTA files with reference subsequences.
-        #[clap(short, long, required = true, value_parser)]
-        reference_fasta_paths: Vec<PathBuf>,
+        /// FASTA files with short-read sequences (may contain one or more samples).
+        #[clap(required = false, value_parser)]
+        short_read_fasta_path: PathBuf,
     },
 }

@@ -428,19 +428,19 @@ fn main() {
         }
         Commands::Coassemble {
             output,
-            model_path,
             kmer_size,
-            long_read_fasta_paths,
-            short_read_fasta_paths,
+            model_path,
             reference_fasta_paths,
+            long_read_fasta_path,
+            short_read_fasta_path,
         } => {
             coassemble::start(
                 &output,
                 kmer_size,
                 &model_path,
-                &long_read_fasta_paths,
-                &short_read_fasta_paths,
                 &reference_fasta_paths,
+                long_read_fasta_path,
+                short_read_fasta_path
             );
         }
     }
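Under the new interface, the reference FASTAs are a flagged option while the long- and short-read inputs are single positional paths. A hypothetical invocation: the -m and -r short flags follow from the clap derives above, but the output flag is an assumption (the output field's attribute is not shown in this hunk), and the file names are placeholders:

    hidive coassemble -o assembly.fa -m model.json -r reference.fa long_reads.fa rescued_short_reads.fa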
171 changes: 171 additions & 0 deletions wdl/HidiveCorrect.wdl
@@ -0,0 +1,171 @@
version 1.0

workflow Hidive {
    input {
        File long_reads_bam
        File long_reads_bai
        File short_reads_cram
        File short_reads_crai

        File reference
        String locus
        File model

        Int padding = 500
    }

    call Fetch {
        input:
            bam = long_reads_bam,
            locus = locus,
            padding = padding,
    }

    call Rescue {
        input:
            long_reads_fasta = Fetch.fasta,
            short_reads_cram = short_reads_cram,
            short_reads_crai = short_reads_crai,
    }

    call Correct {
        input:
            model = model,
            long_read_fasta = Fetch.fasta,
            short_read_fasta = Rescue.fasta,
    }

    call Align {
        input:
            reference = reference,
            sequences = Correct.fasta,
    }

    output {
        File corrected_fa = Correct.fasta
        File aligned_bam = Align.aligned_bam
        File aligned_bai = Align.aligned_bai
    }
}

task Fetch {
    input {
        String bam
        String locus
        Int padding

        String prefix = "out"

        Int disk_size_gb = 2
        Int num_cpus = 4
    }

    command <<<
        set -euxo pipefail

        hidive fetch -l "~{locus}" -p ~{padding} ~{bam} > ~{prefix}.fa
    >>>

    output {
        File fasta = "~{prefix}.fa"
    }

    runtime {
        docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.69"
        memory: "2 GB"
        cpu: num_cpus
        disks: "local-disk ~{disk_size_gb} SSD"
    }
}

task Rescue {
    input {
        File long_reads_fasta
        File short_reads_cram
        File short_reads_crai

        String prefix = "out"

        Int num_cpus = 4
    }

    Int disk_size_gb = 1 + 2*ceil(size([long_reads_fasta, short_reads_cram, short_reads_crai], "GB"))

    command <<<
        set -euxo pipefail

        hidive rescue -f ~{long_reads_fasta} ~{short_reads_cram} > ~{prefix}.fa
    >>>

    output {
        File fasta = "~{prefix}.fa"
    }

    runtime {
        docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.69"
        memory: "2 GB"
        cpu: num_cpus
        disks: "local-disk ~{disk_size_gb} SSD"
    }
}

task Correct {
    input {
        File model
        File long_read_fasta
        File short_read_fasta

        String prefix = "out"

        Int num_cpus = 4
    }

    Int disk_size_gb = 1 + 2*ceil(size([model, long_read_fasta, short_read_fasta], "GB"))

    command <<<
        set -euxo pipefail

        hidive correct -m ~{model} -s ~{short_read_fasta} ~{long_read_fasta} > ~{prefix}.fa
    >>>

    output {
        File fasta = "~{prefix}.fa"
    }

    runtime {
        docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.69"
        memory: "4 GB"
        cpu: num_cpus
        disks: "local-disk ~{disk_size_gb} SSD"
    }
}

task Align {
    input {
        File reference
        File sequences

        String preset = "map-hifi"
        String prefix = "out"

        Int num_cpus = 8
    }

    command <<<
        set -euxo pipefail

        # Name the index explicitly: with a bare "-o out.bam", samtools sort --write-index
        # defaults to a .csi index, which would not match the .bai declared in outputs.
        minimap2 -t ~{num_cpus} -ayYL -x ~{preset} ~{reference} ~{sequences} \
            | samtools sort --write-index -O BAM -o ~{prefix}.bam##idx##~{prefix}.bam.bai
    >>>

    output {
        File aligned_bam = "~{prefix}.bam"
        File aligned_bai = "~{prefix}.bam.bai"
    }

    runtime {
        docker: "us.gcr.io/broad-dsp-lrma/lr-hidive:0.1.69"
        memory: "32 GB"
        cpu: num_cpus
        disks: "local-disk 100 SSD"
    }
}
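
The workflow can be exercised with any WDL 1.0 runner; a sketch using miniwdl (not part of this commit), with placeholder input files and locus:

    miniwdl run wdl/HidiveCorrect.wdl \
        long_reads_bam=sample.bam long_reads_bai=sample.bam.bai \
        short_reads_cram=sample.cram short_reads_crai=sample.cram.crai \
        reference=ref.fa locus="chr1:100000-200000" model=model.json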
