TRIM_GALORE-PE#
Trim paired-end reads using trim_galore.
Example#
This wrapper can be used in the following way:
rule trim_galore_pe:
input:
["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"],
output:
fasta_fwd="trimmed/{sample}_R1.fq.gz",
report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
fasta_rev="trimmed/{sample}_R2.fq.gz",
report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
threads: 1
params:
extra="--illumina -q 20",
log:
"logs/trim_galore/{sample}.log",
wrapper:
"v3.0.1-5-gc155ca9/bio/trim_galore/pe"
rule trim_galore_pe_uncompressed:
input:
["reads/{sample}_R1.fastq", "reads/{sample}_R2.fastq"],
output:
fasta_fwd="trimmed/{sample}_R1.fastq",
report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
fasta_rev="trimmed/{sample}_R2.fastq",
report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
threads: 1
params:
extra="--illumina -q 20",
log:
"logs/trim_galore/{sample}.log",
wrapper:
"v3.0.1-5-gc155ca9/bio/trim_galore/pe"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Notes#
It is expected that the fastqc Snakemake wrapper be used in place of the –fastqc option.
All output files must be placed in the same directory.
Software dependencies#
trim-galore=0.6.10
Input/Output#
Input:
two (paired-end) fastq files (can be gzip compressed)
Output:
two trimmed (paired-end) fastq files
two trimming reports
Params#
extra
: additional parameters
Code#
"""Snakemake wrapper for trimming paired-end reads using trim_galore."""
__author__ = "Kerrin Mendler"
__copyright__ = "Copyright 2018, Kerrin Mendler"
__email__ = "mendlerke@gmail.com"
__license__ = "MIT"
from snakemake.shell import shell
import tempfile
import re
import os
def report_filename(infile: str) -> str:
"""Infer report output file name from input
>>> report_filename('reads/sample.1.fastq.gz')
'sample.1.fastq.gz_trimming_report.txt
>>> report_filename('reads/sample_R2.fastq.gz')
'sample_R2.fastq.gz_trimming_report.txt
"""
return os.path.basename(infile) + "_trimming_report.txt"
def fasta_filename(infile: str, infix: str, out_gzip: bool) -> str:
"""Infer fasta output file name from input
>>> fasta_filename('reads/sample.1.fq.gz', infix = '_val_1', out_gzip = False)
'sample.1_val_1.fq.gz'
>>> fasta_filename('reads/sample_R2.fastq', infix = '_val_2', out_gzip = True)
'sample_R2_val_2.fq.gz'
"""
base_input = os.path.basename(infile)
suffix = ".gz" if out_gzip or infile.endswith(".gz") else ""
REGEX_RULES = [r"\.fastq$", "\.fastq\.gz$", r"\.fq$", r"\.fq\.gz$"]
for regex in REGEX_RULES:
if re.search(regex, base_input):
return re.sub(regex, f"{infix}.fq", base_input) + suffix
return base_input + infix + suffix
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")
# Check that two input files were supplied
n = len(snakemake.input)
assert n == 2, "Input must contain 2 files. Given: %r." % n
infile_fwd, infile_rev = snakemake.input[0:2]
# Don't run with `--fastqc` flag
if "--fastqc" in snakemake.params.get("extra", ""):
raise ValueError(
"The trim_galore Snakemake wrapper cannot "
"be run with the `--fastqc` flag. Please "
"remove the flag from extra params. "
"You can use the fastqc Snakemake wrapper on "
"the input and output files instead."
)
# Check that four output files were supplied
m = len(snakemake.output)
assert m == 4, "Output must contain 4 files. Given: %r." % m
fasta_fwd, fasta_rev, report_fwd, report_rev = (
snakemake.output.get(key)
for key in ["fasta_fwd", "fasta_rev", "report_fwd", "report_rev"]
)
out_gzip = any((fasta_fwd.endswith("gz"), fasta_rev.endswith("gz")))
if out_gzip:
extra += " --gzip"
with tempfile.TemporaryDirectory() as tmpdir:
shell(
"(trim_galore"
" {extra}"
" --cores {snakemake.threads}"
" --paired"
" -o {tmpdir}"
" {infile_fwd} {infile_rev})"
" {log}"
)
if report_fwd:
shell(f"mv {tmpdir}/{report_filename(infile_fwd)} {report_fwd}")
if report_rev:
shell(f"mv {tmpdir}/{report_filename(infile_rev)} {report_rev}")
if fasta_fwd:
shell(
f"mv {tmpdir}/{fasta_filename(infile_fwd, '_val_1', out_gzip)} {fasta_fwd}"
)
if fasta_rev:
shell(
f"mv {tmpdir}/{fasta_filename(infile_rev, '_val_2', out_gzip)} {fasta_rev}"
)