BBTOOLS

https://img.shields.io/badge/wrapper_version-v9.6.0-10785b https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/bbtools?label=version%20update%20pull%20requests&color=1cb481

Wrapper for all tools in the BBtools. One pattern to run them all.

URL: https://bbmap.org/#documentation

Example

This wrapper can be used in the following way:

# Function returning ["R1", "R2"] for paired reads or ["se"] for single reads
def get_fractions():
    if config.get("reads_are_paired", True):
        fractions = ["R1", "R2"]
    else:
        fractions = ["se"]

    return fractions


rule all:
    input:
        expand("logs/loglog/{sample}.log", sample=["SampleA"]),
        expand("covstats/{sample}.tsv", sample=["SampleA"]),


### Create reads
rule index_genome:
    input:
        ref="genome.fasta",
    output:
        # path to the index base folder
        # The index is actually stored in subfolders of the path {path}/ref/genome/{build} and {path}/ref/index/{build}
        # Where build is by default 1
        path=temp(directory("path/to/indexes/genome/")),
    params:
        command="bbmap.sh",
    log:
        "logs/index_genome.log",
    resources:
        mem_mb=1000,
    wrapper:
        "v9.6.0/bio/bbtools"


rule create_reads:
    input:
        path=rules.index_genome.output.path,
    output:
        out=expand(
            "reads/raw/{sample}_{fraction}.fastq.gz",
            fraction=get_fractions(),
            allow_missing=True,
        ),
    params:
        command="randomreads.sh",  # Define which bbmap command to use
        # All parameters are passed to the command
        length=100,
        reads=100,
        paired=True,
        overwrite=True,
        pigz=True,
    resources:
        mem_mb=1000,
    log:
        "logs/create_reads/{sample}.log",
    wrapper:
        "v9.6.0/bio/bbtools"


rule loglog:
    input:
        rules.create_reads.output.out,
    log:
        "logs/loglog/{sample}.log",  # Captures the error output of the command and the log of the wrapper
    params:
        command="loglog.sh",
        buckets=2048,
        seed=1234,
    threads: 2
    resources:
        mem_mb=1024,
    wrapper:
        "v9.6.0/bio/bbtools"


# ########## Quality control of reads ##########


# all rules use the same wrapper!!
rule bbduk:
    input:
        rules.create_reads.output.out,
        adapters="adapt.fas",
    output:
        out=expand(
            "reads/trimmed/{sample}_{fraction}.fastq.gz",
            fraction=get_fractions(),
            allow_missing=True,
        ),
        outmatch=expand(
            "reads/discarded/{sample}_{fraction}.fastq.gz",
            fraction=get_fractions(),
            allow_missing=True,
        ),
        #outsingle="reads/discarded/{sample}_singletons.fastq.gz", # only for paired reads
        stats="stats/trimming_stats/{sample}.txt",
    log:
        "logs/bbduk/{sample}.log",
    threads: 7
    params:
        command="bbduk.sh",
        extra="tpe tbo ",
        ktrim="r ",
        ref=lambda w, input: [input.adapters, "adapters", "artifacts"],
        k=23,
        mink=11,
        hdist=1,
        trimpolygright=10,
        minlen=25,
        maxns=30,
        entropy=0.5,
        entropywindow=50,
        entropyk=5,
    resources:
        mem_mb=4000,
    log:
        "logs/trim/{sample}.log",
    wrapper:
        "v9.6.0/bio/bbtools"


rule tadpole_correct:
    input:
        rules.bbduk.output.out,
    output:
        out=expand(
            "reads/corrected/{sample}_{fraction}.fastq.gz",
            fraction=get_fractions(),
            allow_missing=True,
        ),
        outd="reads/discarded/{sample}.fastq.gz",
    log:
        "logs/correct/{sample}.log",
    params:
        command="tadpole.sh",
        mode="correct",
    threads: 2
    resources:
        mem_mb=1024,
    wrapper:
        "v9.6.0/bio/bbtools"


##########  Assembly ##########
rule tadpole_assemble:
    input:
        rules.tadpole_correct.output.out,
    output:
        "assembly/{sample}.fasta.gz",
    log:
        "logs/assemble/{sample}.log",
    params:
        command="tadpole.sh",
        mode="contig",
    threads: 2
    resources:
        mem_mb=1024,
    wrapper:
        "v9.6.0/bio/bbtools"


########## Map ##########


rule bbmap:
    input:
        input=rules.bbduk.output.out,
        ref=rules.tadpole_assemble.output,
    output:
        out="mapped/{sample}.bam",
    log:
        "logs/bbmap/{sample}.log",
    params:
        command="bbmap.sh",
        machineout=True,
        overwrite=True,  # recommended
        unpigz=True,
        nodisk=True,  # Don't write a index
    threads: 12
    resources:
        mem_mb=1000,  # optional: bbmap normaly needs a lot of memory, e.g. 60GB
    wrapper:
        "v9.6.0/bio/bbtools"


rule pileup:
    input:
        input=rules.bbmap.output.out,
        ref=rules.tadpole_assemble.output,
    output:
        covstats="covstats/{sample}.tsv",
        basecov="basecov/{sample}.tsv",
    log:
        "logs/pileup/{sample}.log",
    params:
        command="pileup.sh",
        nzo=True,
        overwrite=True,
    threads: 4
    resources:
        mem_mb=1000,
    wrapper:
        "v9.6.0/bio/bbtools"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Notes

This wrapper allows to run any of the BBtools. The command is defined by the ‘command’ parameter, which is required. All keywords in input, output, params are passed as key value pairs to the command! Take care that they are valid for the bbtool. In theory all of the tools in the bbmap-suite v39.01 are supported by this wrapper, but we didn’t test them all. Scripts with different input/output might not be supported by this wrapper. If you find one that is not yet supported, please feel free to adjust this wrapper accordingly and include a test case.

Paired input/output files

As it is not possible to define ‘in’ as a keyword, the keyword ‘input’ is used instead. Allowed aliases are ‘sample’ and ‘reads’. If exactly two files are provides for ‘input’ the wrapper parses them as in1, in2. The same holds for the output keywords ‘out’, ‘outm’, ‘outu’. For all parameters, if more than two files are provided, the wrapper parses them as key=value1,value2,value3… The wrapper makes a detailed log of how he parsed the parameters in the log file.

Software dependencies

  • bbmap=39.81

  • snakemake-wrapper-utils=0.8.0

  • pigz=2.8

Input/Output

Input:

  • input: All keywords in input are passed to the bbtool as key=value pairs. A special case is ‘input’, which is translated to ‘in’. If exactly two files are provides for ‘input’ the wrapper parses them as in1, in2.

  • flag: The keyword ‘flag’ is ignored and allows to specify input files that are not processed by the bbtool that is used.

Output:

  • out: If exactly two files are provides for ‘out’ the wrapper parses them as out1, out2.

  • outm: Same as ‘out’

  • outu: Same as ‘out’

  • flag: The keyword ‘flag’ is not passed to the command.

Params

  • command: Required parameter defining the command to be used e.g. ‘bbmap.sh’

  • extra: additional program arguments. All other parameters are passed to the bbtool as key=value pairs

Authors

  • Silas Kieser

Code

__author__ = "Silas Kieser"
__copyright__ = "Copyright 2023, Filipe G. Vieira"
__license__ = "MIT"

### loging all exceptiuon to log file if available
import sys
import logging, traceback
import warnings

input_keys = ["input", "fastq", "sample", "reads"]
# keys that can be used with 1,2, as suffixes to indicate the paired end reads.
paired_keys = ["in", "out", "outm", "outu", "outmatch"]

single_threaded_scripts = [
    "pileup.sh",
    "summarizescafstats.sh",
    "filterbycoverage.sh.",
    "mergeOTUs.sh",
    "bbest.sh",
    "summarizeseal.sh",
    "bbcountunique.sh",
    "commonkmers.sh",
    "callpeaks.sh",
    "kcompress.sh",
    "stats.sh",
    "statswrapper.sh",
    "fungalrelease.sh",
    "filterbytaxa.sh",
    "gi2taxid.sh",
    "gitable.sh",
    "sortbytaxa.sh",
    "taxonomy.sh",
    "taxtree.sh",
    "reducesilva.sh",
    "synthmda.sh",
    "crosscontaminate.sh",
    "dedupebymapping.sh",
    "randomreads.sh",
    "bbfakereads.sh",
    "gradesam.sh",
    "samtoroc.sh",
    "addadapters.sh",
    "grademerge.sh",
    "printtime.sh",
    "msa.sh",
    "cutprimers.sh",
    "matrixtocolumns.sh",
    "countbarcodes.sh",
    "filterbarcodes.sh",
    "mergebarcodes.sh",
    "removebadbarcodes.sh",
    "demuxbyname.sh",
    "filterbyname.sh",
    "filtersubs.sh",
    "getreads.sh",
    "shred.sh",
    "fuse.sh",
    "shuffle.sh",
    "textfile.sh",
    "countsharedlines.sh",
    "filterlines.sh",
    "makechimeras.sh",
    "phylip2fasta.sh",
    "readlength.sh",
    "reformat.sh",
    "rename.sh",
    "repair.sh",
    "bbsplitpairs.sh",
    "splitnextera.sh",
    "splitsam.sh",
    "testformat.sh",
    "translate6frames.sh",
]

logging.basicConfig(
    level=logging.WARN,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger(__file__)

###################################### Beginning of wrapper ######################################
# global flags to check if input and output are parsed multiple times
parsed_input, parsed_output = False, False

import re
from snakemake_wrapper_utils import java
from snakemake.shell import shell


def get_java_opts(snakemake, java_mem_overhead_factor=0.15) -> str:
    """Obtain mem from params, and handle resource definitions in resources.
    As there was conflicts https://github.com/snakemake/snakemake-wrapper-utils/pull/37
    """

    all_java_opts = java.get_java_opts(snakemake, java_mem_overhead_factor)

    # regex to extract only the "-Xmx" part

    regex = re.compile(r"-Xmx\d+[gGmM]")
    memory_options = regex.findall(all_java_opts)[0]

    logger.debug(f"Memory specification: {memory_options}")

    return memory_options


## Get positional arguments
# TODO: replace with snakemake logging once it is implemented
## Will be implemented in Snakemake https://github.com/snakemake/snakemake/pull/2509
def _get_unnamed_arguments(parameter_list):
    """
    Get unnamed arguments of snakemake.input or snakemake.output.
    Find it as the first key in the parameter_list.

    Run as:
        _get_unnamed_arguments(snakemake.input)

    """
    keys_with_positions = parameter_list._names
    if len(keys_with_positions) == 0:
        # all arguments are unnamed
        return parameter_list

    first_key = next(iter(keys_with_positions.items()))
    n_unnamed_arguments = first_key[1][0]

    logger.debug(f"Found {n_unnamed_arguments} unnamed arguments")
    # as for input arguments either is is a string or a list of strings
    if n_unnamed_arguments == 1:
        return parameter_list[0]
    else:
        return [parameter_list[i] for i in range(n_unnamed_arguments)]


def _parse_in_out(input_or_output, values):
    """
    Parse input or output arguments.

    With global check that is not parsed multiple times.

    """

    if input_or_output == "input":
        key = "in"
        global parsed_input
        already_parsed = parsed_input
    elif input_or_output == "output":
        key = "out"
        global parsed_output
        already_parsed = parsed_output

    else:
        raise Exception("input_or_output should be 'input' or 'output'")

    if already_parsed:
        raise Exception(f"{input_or_output} should be defined only once!")

    parsed_arg = _parse_paired_keys(key, values)
    logger.debug(f"parsed {input_or_output} argument: {parsed_arg}")

    if input_or_output == "input":
        parsed_input = True
    else:
        parsed_output = True

    return parsed_arg


def _parse_paired_keys(key, values):
    """
    Some keys in bbtools can be used with 1,2, as suffixes to indicate the paired end reads.

    Single files are parsed as:

        in=values

    Two files are parsed as:

        in1=values[0] in2=values[1]

    More than two files are parsed as:

        in=values[0],values[1],values[2],...

    the same applies to output arguments (out=).

    """
    if len(values) == 1 or isinstance(values, str):
        # single input file
        parsed_arg = f" {key}={values} "

    elif len(values) == 2:
        parsed_arg = f" {key}1={values[0]} {key}2={values[1]} "

    else:
        # multiple files
        warnings.warn(
            "More than 2 files provided, this case cannot be parsed unambigously! I parse it as a comma seperated list of files."
        )

        parsed_arg = f" {key}=" + ",".join(values)

    logger.debug(f"parsed {key} argument: {parsed_arg}")
    return parsed_arg


def _parse_keywords_for_bbtool(parameter_list, section):
    """
    Parse rule input, output and params into a bbmap command.

    Run as.
        _parse_keywords_for_bbtool(
        snakemake.input,"input")


    """

    logger.debug(f"Parse {section} section of snamemake rule into bbmap command")

    if section == "params":
        ignore_keys = ["command", "extra"]
    else:
        ignore_keys = ["flag"]

    # command to be build
    command = ""

    logger.debug(f"parameter_list: {parameter_list}")

    ## unnamed arguments
    unnamed_arguments = _get_unnamed_arguments(parameter_list)

    if len(unnamed_arguments) > 0:
        assert section in ["input", "output"]
        logger.debug(f"Found unnamed arguments. parse them as {section}")

        command += _parse_in_out(section, unnamed_arguments)

    # parameters with keywords
    # transform to dict
    parameter_list = dict(parameter_list)

    for k in parameter_list.keys():
        logger.debug(f"Parse keyword: {k}")
        if k in ignore_keys:
            logger.debug(f"{k} argument detected, This is not passed to the bbtool.")
            pass

        # INPUT keys
        elif (k in input_keys) and (section == "input"):
            command += _parse_in_out(section, values=parameter_list[k])

        ## OUTPUT keys that can be paired
        elif k in paired_keys:
            if k == "out":
                command += _parse_in_out(section, values=parameter_list[k])
            # Other keys that can be paired, e.g. outm
            else:
                command += _parse_paired_keys(k, parameter_list[k])

        # other keys
        else:
            # bool conversions
            if type(parameter_list[k]) == bool:
                parameter_list[k] = "t" if parameter_list[k] else "f"

            # collapse list
            if isinstance(parameter_list[k], list):
                parameter_list[k] = ",".join(parameter_list[k])

            parsed = f" {k}={parameter_list[k]} "

            logger.debug(f"parsed argument: {parsed}")
            command += parsed

    return command


def __check_for_duplicated_keywords(snakemake):
    input_keys = list(snakemake.input.keys())
    output_keys = list(snakemake.output.keys())
    params_keys = list(snakemake.params.keys())

    all_keys = input_keys + output_keys + params_keys

    if len(all_keys) != len(set(all_keys)):
        raise Exception("Duplicated keywords in input, output or params")


def parse_bbtool(snakemake):
    """
    Assembles the bbtools wrapper shell command from Snakemake inputs, outputs, params, and runtime settings.

    Validates that `snakemake.params.command` is present and is a single-word string without any whitespace, ending with ".sh".
    Adds `snakemake.params.extra`, if present, asserting it is a string.
    Parses inputs, outputs, and params into bbtools-style arguments, appends thread and Java memory options, adds the "-eoom" flag, and includes shell log redirections.

    Returns:
        command (str): The fully assembled command string ready to be executed in the shell.

    Raises:
        Exception: If `snakemake.params.command` is not provided.
        AssertionError: If `command` or `extra` have invalid types or formats.
    """

    java_opts = get_java_opts(snakemake, java_mem_overhead_factor=0.15)
    extra = snakemake.params.get("extra", "")

    ## keywords
    __check_for_duplicated_keywords(snakemake)

    # Add threads if not in single threaded scripts
    threads = snakemake.threads
    if snakemake.params.command in single_threaded_scripts:
        if snakemake.threads > 3:
            logger.warning(
                f"{snakemake.params.command} will only use 1-3 threads, but you specified {snakemake.threads} threads. Threads will be capped at 3."
            )
            threads = 3

    input = _parse_keywords_for_bbtool(snakemake.input, "input")
    params = _parse_keywords_for_bbtool(snakemake.params, "params")
    output = _parse_keywords_for_bbtool(snakemake.output, "output")

    return f"{snakemake.params.command} {java_opts} -eoom threads={threads} {input} {params} {extra} {output}"


command = parse_bbtool(snakemake)
logger.debug(f"run command:\n\n\t{command}\n")

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("{command} {log}")