NANOSIM SIMULATOR

https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/nanosim/simulator?label=version%20update%20pull%20requests

NanoSim is a simulator of Oxford Nanopore reads that captures the technology-specific features of ONT data, and allows for adjustments upon improvement of Nanopore sequencing technology.

URL: https://github.com/bcgsc/NanoSim

Example

This wrapper can be used in the following way:

rule nanosim_genome:
    input:
        reference_genome="resources/{sample}.genome.fa",
        model=multiext(
            "resources/{model}",
            "_aligned_reads.pkl",
            "_aligned_region.pkl",
            "_chimeric_info",
            "_error_markov_model",
            "_error_rate.tsv",
            "_first_match.hist",
            "_gap_length.pkl",
            "_ht_length.pkl",
            "_ht_ratio.pkl",
            "_match_markov_model",
            "_model_profile",
            "_reads_alignment_rate",
            "_strandness_rate",
            "_unaligned_length.pkl",
        ),
    output:
        reads="results/nanosim/genome/{sample}.{model}.simulated_reads.fq",  # fastq output requires specification of a --basecaller
        errors="results/nanosim/genome/{sample}.{model}.simulated_errors.txt",
        unaligned_reads="results/nanosim/genome/{sample}.{model}.simulated_reads.unaligned.fq",  # asking for unaligned_reads implicitly turns off --perfect
    log:
        "logs/nanosim/genome/{sample}.{model}.log",
    params:
        extra="--number 5 --basecaller guppy -dna_type circular",
    threads: 4
    wrapper:
        "v5.3.0-16-g710597c/bio/nanosim/simulator"


rule nanosim_transcriptome:
    input:
        reference_transcriptome="resources/{sample}.transcriptome.fa",
        expression_profile="{sample}.expression_abundance.tsv",
        # reference_genome="resources/{sample}.genome.fa",  # optional, without it the wrapper will set --no_model_ir; NOTE: with this enabled, we sometimes get infinitely running simulations
        model=multiext(
            "resources/{model}",
            "_added_intron_final.gff3",
            "_aligned_reads.pkl",
            "_aligned_region_2d.pkl",
            "_aligned_region.pkl",
            "_error_markov_model",
            "_error_rate.tsv",
            "_first_match.hist",
            "_ht_length.pkl",
            "_ht_ratio.pkl",
            "_IR_markov_model",
            "_match_markov_model",
            "_model_profile",
            "_reads_alignment_rate",
            "_strandness_rate",
            "_unaligned_length.pkl",
        ),
    output:
        reads="results/nanosim/transcriptome/{sample}.{model}.simulated.fq",  # fastq output requires specification of a --basecaller and --read_type
        errors="results/nanosim/transcriptome/{sample}.{model}.simulated.errors.txt",
        unaligned_reads="results/nanosim/transcriptome/{sample}.{model}.simulated_reads.unaligned.fq",
    log:
        "logs/nanosim/transcriptome/{sample}.{model}.log",
    params:
        extra="--number 5 --basecaller albacore --read_type cDNA_1D",
    threads: 4
    wrapper:
        "v5.3.0-16-g710597c/bio/nanosim/simulator"


rule nanosim_metagenome:
    input:
        reference_genomes=multiext(
            "resources/{refs_prefix}.",
            "genome.fa",
            "genome2.fa",
        ),
        sample_abundances="config/{sample}.abundances.tsv",
        model=multiext(
            "resources/{model}",
            "_aligned_reads.pkl",
            "_aligned_region.pkl",
            "_chimeric_info",
            "_error_markov_model",
            "_error_rate.tsv",
            "_first_match.hist",
            "_gap_length.pkl",
            "_ht_length.pkl",
            "_ht_ratio.pkl",
            "_match_markov_model",
            "_model_profile",
            "_reads_alignment_rate",
            "_strandness_rate",
            "_unaligned_length.pkl",
        ),
    output:
        abundance_list_tsv="results/nanosim/metagenome/{refs_prefix}.{model}/config/{sample}.abundances.tsv",
        dna_type_list_tsv="results/nanosim/metagenome/{refs_prefix}.{model}/config/{sample}.dna_type_list.tsv",
        reference_genomes_list_tsv="results/nanosim/metagenome/{refs_prefix}.{model}/config/{sample}.reference_genomes_list.tsv",
        reads="results/nanosim/metagenome/{refs_prefix}.{model}/simulated/{sample}.simulated_reads.fa",
        errors="results/nanosim/metagenome/{refs_prefix}.{model}/simulated/{sample}.simulated_errors.txt",
        #unaligned_reads="results/nanosim/metagenome/{refs_prefix}.{model}/simulated/{sample}.simulated_reads_unaligned.fq",  # not asking for unaligned reads sets the flag --perfect
    log:
        "logs/nanosim/metagenome/{refs_prefix}.{model}/{sample}.log",
    params:
        extra="",
        species={
            "human BRCA2": {
                "chromosome": "NC_000013.11|:32306558-32408787 Homo sapiens chromosome 13",
                "dna_type": "circular",
                "ref_suffix": "genome.fa",
            },
            "mouse BRCA2": {
                "chromosome": "NC_000071.7|:150440974-150498397 Mus musculus strain C57BL/6J chromosome 5",
                "dna_type": "linear",
                "ref_suffix": "genome2.fa",
            },
        },
    threads: 4
    wrapper:
        "v5.3.0-16-g710597c/bio/nanosim/simulator"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Notes

  • Depending on the named inputs you specified, the wrapper automatically chooses the correct subcommand.

  • If the file extension of output: reads= is either fq or fastq, the wrapper will automatically set the --fastq flag. As fastq output generates basecalling quality values, you then also have to specify the --basecaller to use via params: extra=.

  • If you do not request an output: unaligned_reads= file, the wrapper will set the --perfect flag. Thus, if you want your simulated reads to contain simulated errors, make sure to specify a file to save the reads that contain too many errors to be aligned. Vice-versa, the wrapper will complain if you specify the --perfect flag, as this flag is implicitly set as described above.

  • All subcommands require a model as input, with slightly differing numbers of files for the different subcommands. You will have to either download a pre-trained model and extract it to match the named input: model= file path and names; or train a model based on Nanopore data yourself. For details, see the NanoSim documentation for model training.

  • If the input: for a transcriptome mode simulator run does not contain a reference_genome= entry, the wrapper will automatically set the --no_model_ir option. This skips intron retention events during simulation, which would need the reference genome. Please note that the if you do provide a reference_genome=, it has to match the genome and transcriptome used for training the model. Otherwise, the simulation process will run inifinitely.

  • For the metagenome mode simulator, we restrict the wrapper to simulating a single (multi-species / multi-reference) sample, to be able to sanely handle output files. If you need multiple samples, simply generate one sample_abundances= file for each sample and have your snakemake workflow execute the wrapper once for each sample you want. Please specify the species abundances in a simple tab-separated values (TSV) file, with one line per sample and the format “sample<tab>abundance”. Please specify the species information needed for the various nanosim-specific files via the params: species= dictionary.

Software dependencies

  • nanosim=3.1

  • numpy<1.20

Params

  • extra: Any further command line arguments you might want to add can be provided verbatim via this field, for example: --del-rate 0.8

Authors

  • David Lähnemann

Code

"""Snakemake wrapper for NanoSim."""

__author__ = "David Lähnemann"
__copyright__ = "Copyright 2024, David Lähnemann"
__email__ = "david.laehnemann@hhu.de"
__license__ = "MIT"

import tempfile
from os import path

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)


# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")


def message_input_missing_for_subcommand(input_provided, subcommand, input_required):
    return (
        f"Providing 'input: {input_provided}' implies subcommand "
        f"'{subcommand}', but you did not provide the necessary "
        f"'input: {input_required}' that this subcommand requires."
    )


sample_infix = ""

if "reference_genomes" in snakemake.input.keys():
    subcommand = "metagenome"
    # nanosim names samples `sample0`, `sample1`, ... and this wrapper forces
    # it to output only one single sample, to sanely handle output
    sample_infix = "sample0_"

    with open(snakemake.output.abundance_list_tsv, "w") as abun_out, open(
        snakemake.input.sample_abundances, "r"
    ) as abun_in:
        abundances_dict = dict()
        for line in abun_in:
            (k, v) = line.rstrip().split("\t")
            abundances_dict[k] = v
        total_abundance = 0.0
        for key in abundances_dict:
            total_abundance += float(abundances_dict[key])
        abun_out.write(f"Size\t{int(total_abundance)}\n")
        for key in abundances_dict:
            abun_out.write(f"{key}\t{abundances_dict[key]}\n")

    with open(snakemake.output.dna_type_list_tsv, "w") as dna_type:
        for spec in snakemake.params.species:
            dna_type.write(
                f"{spec}\t"
                f"{snakemake.params.species[spec]['chromosome']}\t"
                f"{snakemake.params.species[spec]['dna_type']}\n"
            )

    with open(snakemake.output.reference_genomes_list_tsv, "w") as genomes_list:
        for spec in snakemake.params.species:
            filename = [
                f
                for f in snakemake.input.reference_genomes
                if f.endswith(snakemake.params.species[spec]["ref_suffix"])
            ]
            if len(filename) == 1:
                genomes_list.write(f"{spec}\t" f"{filename[0]}\n")
            elif len(filename) == 0:
                ValueError(
                    f"The params: species='{spec}' 'ref_suffix' you specified is: {snakemake.params.species[spec]['ref_suffix']}\n"
                    "This suffix does not match any of the specified 'input: reference_genomes=':\n"
                    f"{snakemake.input.reference_genomes}\n"
                )
            else:
                ValueError(
                    f"The params: species='{spec}' 'ref_suffix' you specified is: {snakemake.params.species[spec]['ref_suffix']}\n"
                    "This suffix is ambiguous with regard to the specified 'input: reference_genomes=':\n"
                    f"{snakemake.input.reference_genomes}\n"
                    "It matches all of these filename:\n"
                    f"{filename}\n"
                )

    input = (
        f"--genome_list {snakemake.output.reference_genomes_list_tsv} "
        f"--abun {snakemake.output.abundance_list_tsv} "
        f"--dna_type_list {snakemake.output.dna_type_list_tsv} "
    )
elif "reference_transcriptome" in snakemake.input.keys():
    subcommand = "transcriptome"
    if "expression_profile" not in snakemake.input.keys():
        raise KeyError(
            message_input_missing_for_subcommand(
                "reference_transcriptome", subcommand, "expression_profile"
            )
        )
    if "reference_genome" in snakemake.input.keys():
        genome = f"--ref_g {snakemake.input.reference_genome}"
    else:
        genome = "--no_model_ir"
    input = f"--ref_t {snakemake.input.reference_transcriptome} --exp {snakemake.input.expression_profile} {genome}"
elif "reference_genome" in snakemake.input.keys():
    subcommand = "genome"
    input = f"--ref_g {snakemake.input.reference_genome}"
else:
    raise KeyError(
        print(
            "None of the provided inputs clearly signify which subcommand should "
            "be used. Specify at least one of the following three:\n"
            "* reference_genome\n"
            "* reference_genomes\n"
            "* reference_transcriptome\n"
            "You provided:\n"
            f"{snakemake.input.keys()}"
        )
    )

if "model" not in snakemake.input.keys():
    raise KeyError(
        "NanoSim requires a model as input, please provide it via the named"
        "'input: model='."
    )
else:
    model_prefix = path.commonprefix(snakemake.input.model).rstrip("_")
    model = f"--model_prefix {model_prefix}"

if "--perfect" in snakemake.params.extra:
    raise ValueError(
        "The command line flag --perfect is set implicitly by the wrapper, if\n"
        "you do not specify an output file for 'output: unaligned_reads='.\n"
        "Please do not specify it under 'params: extra'."
    )

if path.splitext(snakemake.output.reads)[1] in [".fastq", ".fq"]:
    fq = "--fastq"
    extension = ".fastq"
else:
    fq = ""
    extension = ".fasta"

with tempfile.TemporaryDirectory() as tempdir:
    prefix = f"{tempdir}/simulated"

    if "unaligned_reads" in snakemake.output.keys():
        perfect = ""
        mv_unaligned_reads = f" mv {prefix}_{sample_infix}unaligned_reads{extension} {snakemake.output.unaligned_reads}; "
    else:
        perfect = "--perfect"
        mv_unaligned_reads = ""

    # Executed shell command
    shell(
        "(simulator.py {subcommand} "
        " {input} "
        " {model} "
        " {extra} "
        " {fq} "
        " {perfect} "
        " --num_threads {snakemake.threads} "
        " --output {prefix}; "
        " mv {prefix}_{sample_infix}aligned_reads{extension} {snakemake.output.reads}; "
        " mv {prefix}_{sample_infix}aligned_error_profile {snakemake.output.errors}; "
        " {mv_unaligned_reads}"
        ") {log}; "
    )