.. _`bio/bismark/bismark`:

BISMARK
=======


.. image:: https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/bismark/bismark?label=version%20update%20pull%20requests
   :target: https://github.com/snakemake/snakemake-wrappers/pulls?q=is%3Apr+is%3Aopen+label%3Abio/bismark/bismark

Align BS-Seq reads using Bismark (see https://github.com/FelixKrueger/Bismark/blob/master/bismark).


Example
-------

This wrapper can be used in the following way:

.. code-block:: python

    # Example: Pair-ended reads
    rule bismark_pe:
        input:
            fq_1="reads/{sample}.1.fastq",
            fq_2="reads/{sample}.2.fastq",
            genome="indexes/{genome}/{genome}.fa",
            bismark_indexes_dir="indexes/{genome}/Bisulfite_Genome",
            genomic_freq="indexes/{genome}/genomic_nucleotide_frequencies.txt"
        output:
            bam="bams/{sample}_{genome}_pe.bam",
            report="bams/{sample}_{genome}_PE_report.txt",
            nucleotide_stats="bams/{sample}_{genome}_pe.nucleotide_stats.txt",
            bam_unmapped_1="bams/{sample}_{genome}_unmapped_reads_1.fq.gz",
            bam_unmapped_2="bams/{sample}_{genome}_unmapped_reads_2.fq.gz",
            ambiguous_1="bams/{sample}_{genome}_ambiguous_reads_1.fq.gz",
            ambiguous_2="bams/{sample}_{genome}_ambiguous_reads_2.fq.gz"
        log:
            "logs/bams/{sample}_{genome}.log"
        params:
            # optional params string, e.g: -L32 -N0 -X400 --gzip
            # Useful options to tune:
            # (for bowtie2)
            # -N: The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
            # of the read (deafault: 1)
            # -L: The "seed length" (deafault: 28)
            # -I: The minimum insert size for valid paired-end alignments. ~ min fragment size filter (for
            # PE reads)
            # -X: The maximum insert size for valid paired-end alignments. ~ max fragment size filter (for
            # PE reads)
            # --gzip: Gzip intermediate fastq files
            # --ambiguous --unmapped
            # -p: bowtie2 parallel execution
            # --multicore: bismark parallel execution
            # --temp_dir: tmp dir for intermediate files instead of output directory
            extra=' --ambiguous --unmapped --nucleotide_coverage',
            basename='{sample}_{genome}'
        wrapper:
            "v3.0.4/bio/bismark/bismark"

    # Example: Single-ended reads
    rule bismark_se:
        input:
            fq="reads/{sample}.fq.gz",
            genome="indexes/{genome}/{genome}.fa",
            bismark_indexes_dir="indexes/{genome}/Bisulfite_Genome",
            genomic_freq="indexes/{genome}/genomic_nucleotide_frequencies.txt"
        output:
            bam="bams/{sample}_{genome}.bam",
            report="bams/{sample}_{genome}_SE_report.txt",
            nucleotide_stats="bams/{sample}_{genome}.nucleotide_stats.txt",
            bam_unmapped="bams/{sample}_{genome}_unmapped_reads.fq.gz",
            ambiguous="bams/{sample}_{genome}_ambiguous_reads.fq.gz"
        log:
            "logs/bams/{sample}_{genome}.log",
        params:
            # optional params string
            extra=' --ambiguous --unmapped --nucleotide_coverage',
            basename='{sample}_{genome}'
        wrapper:
            "v3.0.4/bio/bismark/bismark"

Note that input, output and log file paths can be chosen freely.

When running with

.. code-block:: bash

    snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.


Software dependencies
---------------------

* ``bowtie2=2.5.2``
* ``bismark=0.24.2``
* ``samtools=1.18``

Input/Output
------------
**Input:**

* In SE mode one reads file with keay 'fq=...'
* In PE mode two reads files with keys 'fq_1=...', 'fq_2=...'
* ``bismark_indexes_dir``: The path to the folder `Bisulfite_Genome` created by the Bismark_Genome_Preparation script, e.g. 'indexes/hg19/Bisulfite_Genome'

**Output:**

* ``bam``: Bam file. Output file will be renamed if differs from default `NAME_pe.bam` or `NAME_se.bam`
* ``report``: Aligning report file. Output file will be renamed if differs from default `NAME_PE_report.txt` or `NAME_SE_report.txt`
* ``nucleotide_stats``: Optional nucleotides report file. Output file will be renamed if differs from default `NAME_pe.nucleotide_stats.txt` or `NAME_se.nucleotide_stats.txt`


Params
------

* ``basename``: File base name

* ``extra``: Any additional args


Authors
-------

* Roman Cherniatchik


Code
----

.. code-block:: python

    """Snakemake wrapper for aligning methylation BS-Seq data using Bismark."""
    # https://github.com/FelixKrueger/Bismark/blob/master/bismark

    __author__ = "Roman Chernyatchik"
    __copyright__ = "Copyright (c) 2019 JetBrains"
    __email__ = "roman.chernyatchik@jetbrains.com"
    __license__ = "MIT"

    import os

    from snakemake.shell import shell
    from tempfile import TemporaryDirectory


    def basename_without_ext(file_path):
        """Returns basename of file path, without the file extension."""

        base = os.path.basename(file_path)

        split_ind = 2 if base.endswith(".gz") else 1
        base = ".".join(base.split(".")[:-split_ind])

        return base


    extra = snakemake.params.get("extra", "")
    cmdline_args = ["bismark {extra} --bowtie2"]

    outdir = os.path.dirname(snakemake.output.bam)
    if outdir:
        cmdline_args.append("--output_dir {outdir}")

    genome_indexes_dir = os.path.dirname(snakemake.input.bismark_indexes_dir)
    cmdline_args.append("{genome_indexes_dir}")

    if not snakemake.output.get("bam", None):
        raise ValueError("bismark/bismark: Error 'bam' output file isn't specified.")
    if not snakemake.output.get("report", None):
        raise ValueError("bismark/bismark: Error 'report' output file isn't specified.")

    # basename
    if snakemake.params.get("basename", None):
        cmdline_args.append("--basename {snakemake.params.basename:q}")
        basename = snakemake.params.basename
    else:
        basename = None

    # reads input
    single_end_mode = snakemake.input.get("fq", None)
    if single_end_mode:
        # for SE data, you only have to specify read1 input by -i or --in1, and
        # specify read1 output by -o or --out1.
        cmdline_args.append("--se {snakemake.input.fq:q}")
        mode_prefix = "se"
        if basename is None:
            basename = basename_without_ext(snakemake.input.fq)
    else:
        # for PE data, you should also specify read2 input by -I or --in2, and
        # specify read2 output by -O or --out2.
        cmdline_args.append("-1 {snakemake.input.fq_1:q} -2 {snakemake.input.fq_2:q}")
        mode_prefix = "pe"

        if basename is None:
            # default basename
            basename = basename_without_ext(snakemake.input.fq_1) + "_bismark_bt2"

    # log
    log = snakemake.log_fmt_shell(stdout=True, stderr=True)
    cmdline_args.append("{log}")

    # run
    shell(" ".join(cmdline_args))

    # Move outputs into proper position.
    expected_2_actual_paths = [
        (
            snakemake.output.bam,
            os.path.join(
                outdir, "{}{}.bam".format(basename, "" if single_end_mode else "_pe")
            ),
        ),
        (
            snakemake.output.report,
            os.path.join(
                outdir,
                "{}_{}_report.txt".format(basename, "SE" if single_end_mode else "PE"),
            ),
        ),
        (
            snakemake.output.get("nucleotide_stats", None),
            os.path.join(
                outdir,
                "{}{}.nucleotide_stats.txt".format(
                    basename, "" if single_end_mode else "_pe"
                ),
            ),
        ),
    ]
    log_append = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True)
    for exp_path, actual_path in expected_2_actual_paths:
        if exp_path and (exp_path != actual_path):
            shell("mv {actual_path:q} {exp_path:q} {log_append}")


.. |nl| raw:: html

   <br>