DADA2-SE

A subworkflow for processing single-end sequences from metabarcoding projects in order to construct ASV tables using DADA2. The example is based on the data provided in the R package. For more details, see the official website. While the tutorial is tailored for paired-end sequences, useful information can be found regarding common functions to singled-end sequences processing.

Usage

Via copy-paste

This meta-wrapper does not yet use Snakemake’s pathvars syntax. You can best directly copy-paste and modify the full meta-wrapper code below into your workflow.

Execution

When running with

snakemake --sdm conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Used wrappers

The following individual wrappers are used in this meta-wrapper:

Please refer to each wrapper in above list for additional configuration parameters and information about the executed code.

Authors

Charlie Pauvert

Code

# Make sure that you set the `truncLen=` option in the rule `dada2_filter_and_trim_se` according
# to the results of the quality profile checks (after rule `dada2_quality_profile_se` has finished on all samples).
# If in doubt, check https://benjjneb.github.io/dada2/tutorial.html#inspect-read-quality-profiles

rule all:
    input:
        # In a first run of this meta-wrapper, comment out all other inputs and only keep this one.
        # Looking at the resulting plot, adjust the `truncLen` in rule `dada2_filter_trim_se` and then
        # rerun with all inputs uncommented.
        expand(
            "reports/dada2/quality-profile/{sample}.{orientation}-quality-profile.png",
            sample=["a","b"], orientation=1
        ),
        "results/dada2/taxa.RDS"

rule dada2_quality_profile_se:
    input:
        # FASTQ file without primer sequences
        "trimmed/{sample}.{orientation}.fastq.gz"
    output:
        "reports/dada2/quality-profile/{sample}.{orientation}-quality-profile.png"
    log:
        "logs/dada2/quality-profile/{sample}.{orientation}-quality-profile-se.log"
    wrapper:
        "v5.5.1/bio/dada2/quality-profile"

rule dada2_filter_trim_se:
    input:
        # Single-end files without primer sequences
        fwd="trimmed/{sample}.1.fastq.gz"
    output:
        filt="filtered-se/{sample}.1.fastq.gz",
        stats="reports/dada2/filter-trim-se/{sample}.tsv"
    params:
        # Set the maximum expected errors tolerated in filtered reads
        maxEE=1,
        # Set the number of kept bases
        truncLen=240
    log:
        "logs/dada2/filter-trim-se/{sample}.log"
    threads: 1 # set desired number of threads here
    wrapper:
        "v5.5.1/bio/dada2/filter-trim"

rule dada2_learn_errors:
    input:
    # Quality filtered and trimmed forward FASTQ files (potentially compressed)
        expand("filtered-se/{sample}.{{orientation}}.fastq.gz", sample=["a","b"])
    output:
        err="results/dada2/model_{orientation}.RDS",# save the error model
        plot="reports/dada2/errors_{orientation}.png",# plot observed and estimated rates
    params:
        randomize=True
    log:
        "logs/dada2/learn-errors/learn-errors_{orientation}.log"
    threads: 1 # set desired number of threads here
    wrapper:
        "v5.5.1/bio/dada2/learn-errors"

rule dada2_dereplicate_fastq:
    input:
    # Quality filtered FASTQ file
        "filtered-se/{fastq}.fastq.gz"
    output:
    # Dereplicated sequences stored as `derep-class` object in a RDS file
        "uniques/{fastq}.RDS"
    log:
        "logs/dada2/dereplicate-fastq/{fastq}.log"
    wrapper:
        "v5.5.1/bio/dada2/dereplicate-fastq"

rule dada2_sample_inference:
    input:
    # Dereplicated (aka unique) sequences of the sample
        derep="uniques/{sample}.{orientation}.RDS",
        err="results/dada2/model_{orientation}.RDS" # Error model
    output:
        "denoised/{sample}.{orientation}.RDS" # Inferred sample composition
    log:
        "logs/dada2/sample-inference/{sample}.{orientation}.log"
    threads: 1 # set desired number of threads here
    wrapper:
        "v5.5.1/bio/dada2/sample-inference"

rule dada2_make_table_se:
    input:
    # Inferred composition
        expand("denoised/{sample}.1.RDS", sample=['a','b'])
    output:
        "results/dada2/seqTab-se.RDS"
    params:
        names=['a','b'] # Sample names instead of paths
    log:
        "logs/dada2/make-table/make-table-se.log"
    threads: 1 # set desired number of threads here
    wrapper:
        "v5.5.1/bio/dada2/make-table"

rule dada2_remove_chimeras:
    input:
        "results/dada2/seqTab-se.RDS" # Sequence table
    output:
        "results/dada2/seqTab.nochimeras.RDS" # Chimera-free sequence table
    log:
        "logs/dada2/remove-chimeras/remove-chimeras.log"
    threads: 1 # set desired number of threads here
    wrapper:
        "v5.5.1/bio/dada2/remove-chimeras"

rule dada2_collapse_nomismatch:
    input:
        "results/dada2/seqTab.nochimeras.RDS" # Chimera-free sequence table
    output:
        "results/dada2/seqTab.collapsed.RDS"
    log:
        "logs/dada2/collapse-nomismatch/collapse-nomismatch.log"
    threads: 1 # set desired number of threads here
    wrapper:
        "v5.5.1/bio/dada2/collapse-nomismatch"

rule dada2_assign_taxonomy:
    input:
        seqs="results/dada2/seqTab.collapsed.RDS", # Chimera-free sequence table
        refFasta="resources/example_train_set.fa.gz" # Reference FASTA for taxonomy
    output:
        "results/dada2/taxa.RDS" # Taxonomic assignments
    log:
        "logs/dada2/assign-taxonomy/assign-taxonomy.log"
    threads: 1 # set desired number of threads here
    wrapper:
        "v5.5.1/bio/dada2/assign-taxonomy"