BISMARK#
Align BS-Seq reads using Bismark (see https://github.com/FelixKrueger/Bismark/blob/master/bismark).
Example#
This wrapper can be used in the following way:
# Example: Pair-ended reads
rule bismark_pe:
input:
fq_1="reads/{sample}.1.fastq",
fq_2="reads/{sample}.2.fastq",
genome="indexes/{genome}/{genome}.fa",
bismark_indexes_dir="indexes/{genome}/Bisulfite_Genome",
genomic_freq="indexes/{genome}/genomic_nucleotide_frequencies.txt"
output:
bam="bams/{sample}_{genome}_pe.bam",
report="bams/{sample}_{genome}_PE_report.txt",
nucleotide_stats="bams/{sample}_{genome}_pe.nucleotide_stats.txt",
bam_unmapped_1="bams/{sample}_{genome}_unmapped_reads_1.fq.gz",
bam_unmapped_2="bams/{sample}_{genome}_unmapped_reads_2.fq.gz",
ambiguous_1="bams/{sample}_{genome}_ambiguous_reads_1.fq.gz",
ambiguous_2="bams/{sample}_{genome}_ambiguous_reads_2.fq.gz"
log:
"logs/bams/{sample}_{genome}.log"
params:
# optional params string, e.g: -L32 -N0 -X400 --gzip
# Useful options to tune:
# (for bowtie2)
# -N: The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
# of the read (deafault: 1)
# -L: The "seed length" (deafault: 28)
# -I: The minimum insert size for valid paired-end alignments. ~ min fragment size filter (for
# PE reads)
# -X: The maximum insert size for valid paired-end alignments. ~ max fragment size filter (for
# PE reads)
# --gzip: Gzip intermediate fastq files
# --ambiguous --unmapped
# -p: bowtie2 parallel execution
# --multicore: bismark parallel execution
# --temp_dir: tmp dir for intermediate files instead of output directory
extra=' --ambiguous --unmapped --nucleotide_coverage',
basename='{sample}_{genome}'
wrapper:
"v3.0.2/bio/bismark/bismark"
# Example: Single-ended reads
rule bismark_se:
input:
fq="reads/{sample}.fq.gz",
genome="indexes/{genome}/{genome}.fa",
bismark_indexes_dir="indexes/{genome}/Bisulfite_Genome",
genomic_freq="indexes/{genome}/genomic_nucleotide_frequencies.txt"
output:
bam="bams/{sample}_{genome}.bam",
report="bams/{sample}_{genome}_SE_report.txt",
nucleotide_stats="bams/{sample}_{genome}.nucleotide_stats.txt",
bam_unmapped="bams/{sample}_{genome}_unmapped_reads.fq.gz",
ambiguous="bams/{sample}_{genome}_ambiguous_reads.fq.gz"
log:
"logs/bams/{sample}_{genome}.log",
params:
# optional params string
extra=' --ambiguous --unmapped --nucleotide_coverage',
basename='{sample}_{genome}'
wrapper:
"v3.0.2/bio/bismark/bismark"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Software dependencies#
bowtie2=2.5.2
bismark=0.24.2
samtools=1.18
Input/Output#
Input:
In SE mode one reads file with keay ‘fq=…’
In PE mode two reads files with keys ‘fq_1=…’, ‘fq_2=…’
bismark_indexes_dir
: The path to the folder Bisulfite_Genome created by the Bismark_Genome_Preparation script, e.g. ‘indexes/hg19/Bisulfite_Genome’
Output:
bam
: Bam file. Output file will be renamed if differs from default NAME_pe.bam or NAME_se.bamreport
: Aligning report file. Output file will be renamed if differs from default NAME_PE_report.txt or NAME_SE_report.txtnucleotide_stats
: Optional nucleotides report file. Output file will be renamed if differs from default NAME_pe.nucleotide_stats.txt or NAME_se.nucleotide_stats.txt
Params#
basename
: File base nameextra
: Any additional args
Code#
"""Snakemake wrapper for aligning methylation BS-Seq data using Bismark."""
# https://github.com/FelixKrueger/Bismark/blob/master/bismark
__author__ = "Roman Chernyatchik"
__copyright__ = "Copyright (c) 2019 JetBrains"
__email__ = "roman.chernyatchik@jetbrains.com"
__license__ = "MIT"
import os
from snakemake.shell import shell
from tempfile import TemporaryDirectory
def basename_without_ext(file_path):
"""Returns basename of file path, without the file extension."""
base = os.path.basename(file_path)
split_ind = 2 if base.endswith(".gz") else 1
base = ".".join(base.split(".")[:-split_ind])
return base
extra = snakemake.params.get("extra", "")
cmdline_args = ["bismark {extra} --bowtie2"]
outdir = os.path.dirname(snakemake.output.bam)
if outdir:
cmdline_args.append("--output_dir {outdir}")
genome_indexes_dir = os.path.dirname(snakemake.input.bismark_indexes_dir)
cmdline_args.append("{genome_indexes_dir}")
if not snakemake.output.get("bam", None):
raise ValueError("bismark/bismark: Error 'bam' output file isn't specified.")
if not snakemake.output.get("report", None):
raise ValueError("bismark/bismark: Error 'report' output file isn't specified.")
# basename
if snakemake.params.get("basename", None):
cmdline_args.append("--basename {snakemake.params.basename:q}")
basename = snakemake.params.basename
else:
basename = None
# reads input
single_end_mode = snakemake.input.get("fq", None)
if single_end_mode:
# for SE data, you only have to specify read1 input by -i or --in1, and
# specify read1 output by -o or --out1.
cmdline_args.append("--se {snakemake.input.fq:q}")
mode_prefix = "se"
if basename is None:
basename = basename_without_ext(snakemake.input.fq)
else:
# for PE data, you should also specify read2 input by -I or --in2, and
# specify read2 output by -O or --out2.
cmdline_args.append("-1 {snakemake.input.fq_1:q} -2 {snakemake.input.fq_2:q}")
mode_prefix = "pe"
if basename is None:
# default basename
basename = basename_without_ext(snakemake.input.fq_1) + "_bismark_bt2"
# log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
cmdline_args.append("{log}")
# run
shell(" ".join(cmdline_args))
# Move outputs into proper position.
expected_2_actual_paths = [
(
snakemake.output.bam,
os.path.join(
outdir, "{}{}.bam".format(basename, "" if single_end_mode else "_pe")
),
),
(
snakemake.output.report,
os.path.join(
outdir,
"{}_{}_report.txt".format(basename, "SE" if single_end_mode else "PE"),
),
),
(
snakemake.output.get("nucleotide_stats", None),
os.path.join(
outdir,
"{}{}.nucleotide_stats.txt".format(
basename, "" if single_end_mode else "_pe"
),
),
),
]
log_append = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True)
for exp_path, actual_path in expected_2_actual_paths:
if exp_path and (exp_path != actual_path):
shell("mv {actual_path:q} {exp_path:q} {log_append}")