TRIM_GALORE-PE

https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/trim_galore/pe?label=version%20update%20pull%20requests

Trim paired-end reads using trim_galore.

Example

This wrapper can be used in the following way:

rule trim_galore_pe:
    input:
        ["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"],
    output:
        fasta_fwd="trimmed/{sample}_R1.fq.gz",
        report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
        fasta_rev="trimmed/{sample}_R2.fq.gz",
        report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
    threads: 1
    params:
        extra="--illumina -q 20",
    log:
        "logs/trim_galore/{sample}.log",
    wrapper:
        "v5.0.1/bio/trim_galore/pe"


rule trim_galore_pe_uncompressed:
    input:
        ["reads/{sample}_R1.fastq", "reads/{sample}_R2.fastq"],
    output:
        fasta_fwd="trimmed/{sample}_R1.fastq",
        report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
        fasta_rev="trimmed/{sample}_R2.fastq",
        report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
    threads: 1
    params:
        extra="--illumina -q 20",
    log:
        "logs/trim_galore/{sample}.log",
    wrapper:
        "v5.0.1/bio/trim_galore/pe"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Notes

  • It is expected that the fastqc Snakemake wrapper be used in place of the –fastqc option.

  • All output files must be placed in the same directory.

Software dependencies

  • trim-galore=0.6.10

Input/Output

Input:

  • two (paired-end) fastq files (can be gzip compressed)

Output:

  • two trimmed (paired-end) fastq files

  • two trimming reports

Params

  • extra: additional parameters

Authors

  • Kerrin Mendler

Code

"""Snakemake wrapper for trimming paired-end reads using trim_galore."""

__author__ = "Kerrin Mendler"
__copyright__ = "Copyright 2018, Kerrin Mendler"
__email__ = "mendlerke@gmail.com"
__license__ = "MIT"


from snakemake.shell import shell
import tempfile
import re
import os


def report_filename(infile: str) -> str:
    """Infer report output file name from input
    >>> report_filename('reads/sample.1.fastq.gz')
    'sample.1.fastq.gz_trimming_report.txt
    >>> report_filename('reads/sample_R2.fastq.gz')
    'sample_R2.fastq.gz_trimming_report.txt
    """
    return os.path.basename(infile) + "_trimming_report.txt"


def fasta_filename(infile: str, infix: str, out_gzip: bool) -> str:
    """Infer fasta output file name from input
    >>> fasta_filename('reads/sample.1.fq.gz', infix = '_val_1', out_gzip = False)
    'sample.1_val_1.fq.gz'
    >>> fasta_filename('reads/sample_R2.fastq', infix = '_val_2', out_gzip = True)
    'sample_R2_val_2.fq.gz'
    """
    base_input = os.path.basename(infile)
    suffix = ".gz" if out_gzip or infile.endswith(".gz") else ""
    REGEX_RULES = [r"\.fastq$", r"\.fastq\.gz$", r"\.fq$", r"\.fq\.gz$"]
    for regex in REGEX_RULES:
        if re.search(regex, base_input):
            return re.sub(regex, f"{infix}.fq", base_input) + suffix
    return base_input + infix + suffix


log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")

# Check that two input files were supplied
n = len(snakemake.input)
assert n == 2, "Input must contain 2 files. Given: %r." % n
infile_fwd, infile_rev = snakemake.input[0:2]

# Don't run with `--fastqc` flag
if "--fastqc" in snakemake.params.get("extra", ""):
    raise ValueError(
        "The trim_galore Snakemake wrapper cannot "
        "be run with the `--fastqc` flag. Please "
        "remove the flag from extra params. "
        "You can use the fastqc Snakemake wrapper on "
        "the input and output files instead."
    )

# Check that four output files were supplied
m = len(snakemake.output)
assert m == 4, "Output must contain 4 files. Given: %r." % m

fasta_fwd, fasta_rev, report_fwd, report_rev = (
    snakemake.output.get(key)
    for key in ["fasta_fwd", "fasta_rev", "report_fwd", "report_rev"]
)

out_gzip = any((fasta_fwd.endswith("gz"), fasta_rev.endswith("gz")))
if out_gzip:
    extra += " --gzip"

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(trim_galore"
        " {extra}"
        " --cores {snakemake.threads}"
        " --paired"
        " -o {tmpdir}"
        " {infile_fwd} {infile_rev})"
        " {log}"
    )

    if report_fwd:
        shell(f"mv {tmpdir}/{report_filename(infile_fwd)} {report_fwd}")
    if report_rev:
        shell(f"mv {tmpdir}/{report_filename(infile_rev)} {report_rev}")

    if fasta_fwd:
        shell(
            f"mv {tmpdir}/{fasta_filename(infile_fwd, '_val_1', out_gzip)} {fasta_fwd}"
        )
    if fasta_rev:
        shell(
            f"mv {tmpdir}/{fasta_filename(infile_rev, '_val_2', out_gzip)} {fasta_rev}"
        )