diff --git a/.gitignore b/.gitignore index f9fa6da..3e5e3db 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ results/ testing/ testing* *.pyc +test_data/kraken_viral_db/ diff --git a/README.md b/README.md index 166ac54..fea648c 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool ### Taxonomic classification - Sequence classification ([`Kaiju`](https://github.com/bioinformatics-centre/kaiju/)) -- (**WIP**) Sequence classification ([`Kraken2`](https://github.com/DerrickWood/kraken2)) +- Sequence classification ([`Kraken2`](https://github.com/DerrickWood/kraken2)) - Visualization ([`Krona`](https://github.com/marbl/Krona/wiki)) ### Functional annotation diff --git a/conf/modules.config b/conf/modules.config index 41faf9b..96a215b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -59,6 +59,14 @@ process { ] } + withName: KRAKEN2_KRAKEN2 { + publishDir = [ + path: { "${params.outdir}/taxonomy/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: KRONA_KTIMPORTTEXT { publishDir = [ path: { "${params.outdir}/taxonomy/${meta.id}" }, diff --git a/docs/params.md b/docs/params.md index c3b2976..fb2970b 100644 --- a/docs/params.md +++ b/docs/params.md @@ -30,7 +30,8 @@ Choose to skip pipeline steps | Parameter | Description | Type | Default | Required | Hidden | |-----------|-----------|-----------|-----------|-----------|-----------| -| `host_fasta` | | `string` | None | | | +| `host_fasta` | Host FASTA to use for decontamination | `string` | | | | +| `bowtie2_db` | Pre-built bowtie2 index. Directory where index is located. 
| `string` | | | | ## Alignment @@ -38,8 +39,8 @@ Choose to skip pipeline steps | Parameter | Description | Type | Default | Required | Hidden | |-----------|-----------|-----------|-----------|-----------|-----------| -| `reference_fasta` | Path to FASTA genome file. | `string` | None | | | -| `diamond_db` | Path to pre-built DIAMOND db. | `string` | None | | | +| `reference_fasta` | Path to FASTA genome file. | `string` | | | | +| `diamond_db` | Path to pre-built DIAMOND db. | `string` | | | | ## Taxonomy @@ -47,7 +48,10 @@ Choose to skip pipeline steps | Parameter | Description | Type | Default | Required | Hidden | |-----------|-----------|-----------|-----------|-----------|-----------| -| `kaiju_db` | | `string` | None | True | | +| `kaiju_db` | Kaiju database | `string` | | True | | +| `kraken2_db` | Kraken2 database | `string` | | | | +| `run_kaiju` | Run Kaiju classifier | `boolean` | True | | | +| `run_kraken2` | Run Kraken2 classifier | `boolean` | | | | ## Functional @@ -55,7 +59,7 @@ Choose to skip pipeline steps | Parameter | Description | Type | Default | Required | Hidden | |-----------|-----------|-----------|-----------|-----------|-----------| -| `id_mapping` | Path to ID mapping file to be used for the Functional annotation | `string` | None | True | | +| `id_mapping` | Path to ID mapping file to be used for the Functional annotation | `string` | | True | | | `minimum_bitscore` | Minimum bitscore of a match to be used for annotation | `integer` | 50 | | | | `minimum_pident` | Minimum identity of a match to be used for annotation | `integer` | 80 | | | | `minimum_alen` | Minimum alignment length of a match to be used for annotation | `integer` | 50 | | | @@ -78,7 +82,7 @@ Reference genome related files and options required for the workflow. | `genome` | Name of iGenomes reference.
<details><summary>Help</summary><small>If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.

See the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details.</small></details>
| `string` | | | | | `igenomes_base` | Directory / URL base for iGenomes references. | `string` | s3://ngi-igenomes/igenomes | | True | | `igenomes_ignore` | Do not load the iGenomes reference config.
<details><summary>Help</summary><small>Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.</small></details>
| `boolean` | | | True | -| `fasta` | | `string` | None | | | +| `fasta` | | `string` | | | | ## Max job request options @@ -110,9 +114,4 @@ Less common options for the pipeline, typically set in a config file. | `tracedir` | Directory to keep pipeline Nextflow logs and reports. | `string` | ${params.outdir}/pipeline_info | | True | | `validate_params` | Boolean whether to validate parameters against the schema at runtime | `boolean` | True | | True | | `show_hidden_params` | Show all params when using `--help`
<details><summary>Help</summary><small>By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters.</small></details>
| `boolean` | | | True | - -## Other parameters - -| Parameter | Description | Type | Default | Required | Hidden | -|-----------|-----------|-----------|-----------|-----------|-----------| -| `schema_ignore_params` | | `string` | genomes | | | +| `schema_ignore_params` | | `string` | genomes | | True | diff --git a/modules.json b/modules.json index 5506c10..4d15e1d 100644 --- a/modules.json +++ b/modules.json @@ -68,6 +68,18 @@ "installed_by": ["modules"], "patch": "modules/nf-core/kaiju/kaiju2table/kaiju-kaiju2table.diff" }, + "kraken2/kraken2": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"], + "patch": "modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff" + }, + "krakentools/kreport2krona": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"], + "patch": "modules/nf-core/krakentools/kreport2krona/krakentools-kreport2krona.diff" + }, "krona/ktimporttext": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/modules/nf-core/kraken2/kraken2/environment.yml b/modules/nf-core/kraken2/kraken2/environment.yml new file mode 100644 index 0000000..480d40f --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kraken2=2.1.2 + - conda-forge::pigz=2.6 diff --git a/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff new file mode 100644 index 0000000..369df06 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/kraken2-kraken2.diff @@ -0,0 +1,14 @@ +Changes in module 'nf-core/kraken2/kraken2' +--- modules/nf-core/kraken2/kraken2/main.nf ++++ modules/nf-core/kraken2/kraken2/main.nf +@@ -5,7 +5,7 @@ + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : +- 'biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" ++ 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + +************************************************************ diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf new file mode 100644 index 0000000..e73a819 --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -0,0 +1,58 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'quay.io/biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + val save_output_fastqs + val save_reads_assignment + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment + tuple val(meta), path('*report.txt') , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? 
"${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : "" + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 0000000..7909ffe --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,78 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. 
+ - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: string + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: string + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. 
+ pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/krakentools/kreport2krona/environment.yml b/modules/nf-core/krakentools/kreport2krona/environment.yml new file mode 100644 index 0000000..98a8c11 --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::krakentools=1.2 diff --git a/modules/nf-core/krakentools/kreport2krona/krakentools-kreport2krona.diff b/modules/nf-core/krakentools/kreport2krona/krakentools-kreport2krona.diff new file mode 100644 index 0000000..714cd8a --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/krakentools-kreport2krona.diff @@ -0,0 +1,14 @@ +Changes in module 'nf-core/krakentools/kreport2krona' +--- modules/nf-core/krakentools/kreport2krona/main.nf ++++ modules/nf-core/krakentools/kreport2krona/main.nf +@@ -6,7 +6,7 @@ + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0': +- 'biocontainers/krakentools:1.2--pyh5e36f6f_0' }" ++ 'quay.io/biocontainers/krakentools:1.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(kreport) + +************************************************************ diff --git a/modules/nf-core/krakentools/kreport2krona/main.nf b/modules/nf-core/krakentools/kreport2krona/main.nf new file mode 100644 index 0000000..03e79df --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/main.nf @@ -0,0 +1,36 @@ +process KRAKENTOOLS_KREPORT2KRONA { + tag "$meta.id" + label 'process_single' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. 
+ conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0': + 'quay.io/biocontainers/krakentools:1.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(kreport) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + kreport2krona.py \\ + -r ${kreport} \\ + -o ${prefix}.txt \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kreport2krona.py: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/krakentools/kreport2krona/meta.yml b/modules/nf-core/krakentools/kreport2krona/meta.yml new file mode 100644 index 0000000..7a5dda4 --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/meta.yml @@ -0,0 +1,40 @@ +name: krakentools_kreport2krona +description: Takes a Kraken report file and prints out a krona-compatible TEXT file +keywords: + - kraken + - krona + - metagenomics + - visualization +tools: + - krakentools: + description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs. + homepage: https://github.com/jenniferlu717/KrakenTools + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - kreport: + type: file + description: Kraken report + pattern: "*.{txt,kreport}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - krona: + type: file + description: Krona text-based input file converted from Kraken report + pattern: "*.{txt,krona}" +authors: + - "@MillironX" +maintainers: + - "@MillironX" diff --git a/nextflow.config b/nextflow.config index dd9a0bd..8ecfe8e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,7 @@ params { // References kaiju_db = null + kraken2_db = null diamond_db = null reference_fasta = null genome = null @@ -30,6 +31,10 @@ params { host_fasta = null bowtie2_db = null + // Taxonomy + run_kaiju = true + run_kraken2 = false + // Functional id_mapping = null minimum_bitscore = 50 diff --git a/nextflow_schema.json b/nextflow_schema.json index 1269a41..3346dde 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -116,7 +116,24 @@ "properties": { "kaiju_db": { "type": "string", - "fa_icon": "fas fa-database" + "fa_icon": "fas fa-database", + "description": "Kaiju database" + }, + "kraken2_db": { + "type": "string", + "fa_icon": "fas fa-database", + "description": "Kraken2 database" + }, + "run_kaiju": { + "type": "boolean", + "default": true, + "fa_icon": "fas fa-bacterium", + "description": "Run Kaiju classifier" + }, + "run_kraken2": { + "type": "boolean", + "fa_icon": "fas fa-bacterium", + "description": "Run Kraken2 classifier" } }, "required": ["kaiju_db"], diff --git a/subworkflows/local/taxonomy.nf b/subworkflows/local/taxonomy.nf index 2344d3f..c13d8d1 100644 --- a/subworkflows/local/taxonomy.nf +++ b/subworkflows/local/taxonomy.nf @@ -1,5 +1,7 @@ include { UNTAR } from '../../modules/nf-core/untar/main' include { KAIJU_KAIJU } from '../../modules/nf-core/kaiju/kaiju/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' +include { KRAKENTOOLS_KREPORT2KRONA } from '../../modules/nf-core/krakentools/kreport2krona/main' include { KAIJU_KAIJU2TABLE } from 
'../../modules/nf-core/kaiju/kaiju2table/main' include { KAIJU_KAIJU2KRONA } from '../../modules/nf-core/kaiju/kaiju2krona/main' include { KRONA_KTIMPORTTEXT } from '../../modules/nf-core/krona/ktimporttext/main' @@ -8,39 +10,61 @@ workflow TAXONOMY { take: reads kaiju_db + kraken2_db main: ch_versions = Channel.empty() - UNTAR ( kaiju_db ) - ch_versions = ch_versions.mix(UNTAR.out.versions) + if (params.run_kaiju) { + UNTAR ( kaiju_db ) + ch_versions = ch_versions.mix(UNTAR.out.versions) - UNTAR.out.untar.map{ meta, path -> path }.set { kaiju_db_files } + UNTAR.out.untar.map{ meta, path -> path }.set { kaiju_db_files } - KAIJU_KAIJU (reads, kaiju_db_files) - ch_versions = ch_versions.mix(KAIJU_KAIJU.out.versions) + KAIJU_KAIJU (reads, kaiju_db_files) + ch_versions = ch_versions.mix(KAIJU_KAIJU.out.versions) - KAIJU_KAIJU.out.results.set { kaiju_out } + KAIJU_KAIJU.out.results.set { kaiju_out } - KAIJU_KAIJU2TABLE ( - kaiju_out, - kaiju_db_files, - "species" - ) + KAIJU_KAIJU2TABLE ( + kaiju_out, + kaiju_db_files, + "species" + ) - KAIJU_KAIJU2TABLE.out.summary.set { kaiju_report } + KAIJU_KAIJU2TABLE.out.summary.set { tax_report } - KAIJU_KAIJU2KRONA (kaiju_out, kaiju_db_files) - ch_versions = ch_versions.mix(KAIJU_KAIJU2KRONA.out.versions) + KAIJU_KAIJU2KRONA (kaiju_out, kaiju_db_files) + KAIJU_KAIJU2KRONA.out.txt.set { krona_input } + ch_versions = ch_versions.mix(KAIJU_KAIJU2KRONA.out.versions) + } - KRONA_KTIMPORTTEXT (KAIJU_KAIJU2KRONA.out.txt) + if (params.run_kraken2) { + KRAKEN2_KRAKEN2 ( + reads, + kraken2_db, + false, + false + ) + KRAKEN2_KRAKEN2.out.report.set { tax_report } + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first()) + + KRAKENTOOLS_KREPORT2KRONA ( + tax_report + ) + KRAKENTOOLS_KREPORT2KRONA.out.txt.set { krona_input } + ch_versions = ch_versions.mix(KRAKENTOOLS_KREPORT2KRONA.out.versions.first()) + } + + + KRONA_KTIMPORTTEXT (krona_input) ch_versions = ch_versions.mix(KRONA_KTIMPORTTEXT.out.versions) 
KRONA_KTIMPORTTEXT.out.html.set { krona_report } emit: - kaiju_report = kaiju_report + tax_report = tax_report krona_report = krona_report versions = ch_versions } diff --git a/workflows/euryale.nf b/workflows/euryale.nf index af0fb97..0ea7834 100644 --- a/workflows/euryale.nf +++ b/workflows/euryale.nf @@ -74,9 +74,12 @@ def multiqc_report = [] workflow EURYALE { if (params.reference_fasta == null && params.diamond_db == null) { exit 1, 'A reference fasta (--reference_fasta) or a DIAMOND db (--diamond_db) must be specified' } if (params.host_fasta == null && params.bowtie2_db == null) {exit 1, 'Either a host reference FASTA (--host_fasta) or a pre-built bowtie2 index (--bowtie2_db) must be specified'} + if (params.run_kaiju == true && params.kaiju_db == null) {exit 1, 'A Kaiju tar.gz database must be specified with --kaiju_db'} + if (params.run_kraken2 == true && params.kraken2_db == null) {exit 1, 'A Kraken2 database must be specified with --kraken2_db'} ch_versions = Channel.empty() - ch_kaiju_db = Channel.value([ [id: "kaiju_db"], file(params.kaiju_db)]) + ch_kraken_db = params.run_kraken2 ? file(params.kraken2_db) : [] + ch_kaiju_db = params.run_kaiju ? Channel.value([ [id: "kaiju_db"], file(params.kaiju_db)]) : [] ch_reference_fasta = params.reference_fasta ? file(params.reference_fasta) : [] ch_diamond_db = params.diamond_db ? file(params.diamond_db) : [] ch_bowtie2_db = params.bowtie2_db ? Channel.value([ [id: "host_db"], file(params.bowtie2_db)]) : [] @@ -158,10 +161,11 @@ workflow EURYALE { if (!params.skip_classification) { TAXONOMY ( clean_reads, - ch_kaiju_db + ch_kaiju_db, + ch_kraken_db ) ch_versions = ch_versions.mix(TAXONOMY.out.versions) - ch_multiqc_files = ch_multiqc_files.mix(TAXONOMY.out.kaiju_report.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TAXONOMY.out.tax_report.collect{it[1]}.ifEmpty([])) }