BBMAP-SUITE

https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/bbtools?label=version%20update%20pull%20requests

Wrapper for all tools in the BBmap suite. One pattern to run them all.

URL: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/

Example

This wrapper can be used in the following way:

# Include snakefile with get_fractons function returning ["R1", "R2"] for paired reads or ["se"] for single reads
include: "input_functions.smk"


rule all:
    input:
        expand("logs/loglog/{sample}.log", sample=["SampleA"]),
        expand("covstats/{sample}.tsv", sample=["SampleA"]),


rule loglog:
    input:
        expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
    log:
        "logs/loglog/{sample}.log",  # Captures the error output of the command and the log of the wrapper
    params:
        command="loglog.sh",
        buckets=2048,
        seed=1234,
    threads: 2
    resources:
        mem_mb=1024,
    wrapper:
        "v3.9.0/bio/bbtools"


# ########## Quality control of reads ##########


# all rules use the same wrapper!!
rule bbduk:
    input:
        expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
        adapters="adapt.fas",
    output:
        out=expand(
            "reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
        ),
        outmatch=expand(
            "reads/discarded/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
        ),
        #outsingle="reads/discarded/{sample}_singletons.fastq.gz", # only for paired reads
        stats="stats/trimming_stats/{sample}.txt",
    log:
        "logs/bbduk/{sample}.log",
    threads: 7
    params:
        command="bbduk.sh",
        extra="tpe tbo ",
        ktrim="r ",
        ref=lambda w, input: [input.adapters, "adapters", "artifacts"],
        k=23,
        mink=11,
        hdist=1,
        trimpolygright=10,
        minlen=25,
        maxns=30,
        entropy=0.5,
        entropywindow=50,
        entropyk=5,
    resources:
        mem_mb=4000,
    log:
        "logs/trim/{sample}.log",
    wrapper:
        "v3.9.0/bio/bbtools"


rule tadpole_correct:
    input:
        expand("reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
    output:
        out=expand(
            "reads/corrected/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
        ),
        outd="reads/discarded/{sample}.fastq.gz",
    log:
        "logs/correct/{sample}.log",
    params:
        command="tadpole.sh",
        mode="correct",
    threads: 2
    resources:
        mem_mb=1024,
    wrapper:
        "v3.9.0/bio/bbtools"


##########  Assembly ##########
rule tadpole_assemble:
    input:
        expand(
            "reads/corrected/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
        ),
    output:
        "assembly/{sample}.fasta.gz",
    log:
        "logs/assemble/{sample}.log",
    params:
        command="tadpole.sh",
        mode="contig",
    threads: 2
    resources:
        mem_mb=1024,
    wrapper:
        "v3.9.0/bio/bbtools"


########## Map ##########


rule map_reads_with_bbmap:
    input:
        input=expand(
            "reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
        ),
        ref="assembly/{sample}.fasta.gz",
    output:
        out="mapped/{sample}.bam",
    log:
        "logs/bbmap/{sample}.log",
    params:
        command="bbmap.sh",
        machineout=True,
        overwrite=True,  # recommended
        unpigz=True,
        nodisk=True,  # Don't write a index
    threads: 12
    resources:
        mem_mb=1000,  # optional: bbmap normaly needs a lot of memory, e.g. 60GB
    wrapper:
        "v3.9.0/bio/bbtools"


rule pileup:
    input:
        input="mapped/{sample}.bam",
        ref="genome.fasta",
    output:
        covstats="covstats/{sample}.tsv",
        basecov="basecov/{sample}.tsv",
    log:
        "logs/pileup/{sample}.log",
    params:
        command="pileup.sh",
        nzo=True,
        overwrite=True,
    threads: 4
    resources:
        mem_mb=1000,
    wrapper:
        "v3.9.0/bio/bbtools"


### Create reads


rule index_genome:
    input:
        ref="genome.fasta",
    output:
        # path to the index base folder
        # The index is actually stored in subfolders of the path {path}/ref/genome/{build} and {path}/ref/index/{build}
        # Where build is by default 1
        path=temp(directory("path/to/indexes/genome/")),
    params:
        command="bbmap.sh",
    log:
        "logs/index_genome.log",
    resources:
        mem_mb=1000,
    wrapper:
        "v3.9.0/bio/bbtools"


rule create_reads:
    input:
        path="path/to/indexes/genome/",
    output:
        out=expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
    params:
        command="randomreads.sh",  # Define which bbmap command to use
        # All parameters are passed to the command
        length=100,
        reads=100,
        paired=True,
        overwrite=True,
        pigz=True,
    resources:
        mem_mb=1000,
    log:
        "logs/create_reads/{sample}.log",
    wrapper:
        "v3.9.0/bio/bbtools"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Notes

This wrapper allows to run any of the bbtools. The command is defined by the ‘command’ parameter, which is required. All keywords in input, output, params are passed as key value pairs to the command! Take care that they are valid for the bbtool. In theory all of the tools in the bbmap-suite v39.01 are sported by this wrapper, but we didn’t test them all. Scripts with different input/output might not be supported by this wrapper. If you find one that is not yet supported, please feel free to adjust this wrapper accordingly and include a test case.

Paired input/output files

As it is not possible to define ‘in’ as a keyword, the keyword ‘input’ is used instead. Allowed aliases are ‘sample’ and ‘reads’. If exactly two files are provides for ‘input’ the wrapper parses them as in1, in2. The same holds for the output keywords ‘out’, ‘outm’, ‘outu’. For all parameters, if more than two files are provided, the wrapper parses them as key=value1,value2,value3… The wrapper makes a detailed log of how he parsed the parameters in the log file.

Software dependencies

bbmap=39.06
snakemake-wrapper-utils=0.6.2
pigz=2.8

Input/Output

Input:

input: All keywords in input are passed to the bbtool as key=value pairs. A special case is ‘input’, which is translated to ‘in’. If exactly two files are provides for ‘input’ the wrapper parses them as in1, in2.
flag: The keyword ‘flag’ is ignored and allows to specify input files that are not processed by the bbtool that is used.

Output:

out: If exactly two files are provides for ‘out’ the wrapper parses them as out1, out2.
outm: Same as ‘out’
outu: Same as ‘out’
flag: The keyword ‘flag’ is not passed to the command.

Params

command: Required parameter defining the command to be used e.g. ‘bbmap.sh’
extra: additional program arguments. All other parameters are passed to the bbtool as key=value pairs

Authors

Silas Kieser

Code

__author__ = "Silas Kieser"
__copyright__ = "Copyright 2023, Filipe G. Vieira"
__license__ = "MIT"

### loging all exceptiuon to log file if available
import sys
import logging, traceback
import warnings


input_keys = ["input", "fastq", "sample", "reads"]
# keys that can be used with 1,2, as suffixes to indicate the paired end reads.
paired_keys = ["in", "out", "outm", "outu", "outmatch"]

single_threaded_scripts = [
    "pileup.sh",
    "summarizescafstats.sh",
    "filterbycoverage.sh.",
    "mergeOTUs.sh",
    "bbest.sh",
    "summarizeseal.sh",
    "bbcountunique.sh",
    "commonkmers.sh",
    "callpeaks.sh",
    "kcompress.sh",
    "stats.sh",
    "statswrapper.sh",
    "fungalrelease.sh",
    "filterbytaxa.sh",
    "gi2taxid.sh",
    "gitable.sh",
    "sortbytaxa.sh",
    "taxonomy.sh",
    "taxtree.sh",
    "reducesilva.sh",
    "synthmda.sh",
    "crosscontaminate.sh",
    "dedupebymapping.sh",
    "randomreads.sh",
    "bbfakereads.sh",
    "gradesam.sh",
    "samtoroc.sh",
    "addadapters.sh",
    "grademerge.sh",
    "printtime.sh",
    "msa.sh",
    "cutprimers.sh",
    "matrixtocolumns.sh",
    "countbarcodes.sh",
    "filterbarcodes.sh",
    "mergebarcodes.sh",
    "removebadbarcodes.sh",
    "demuxbyname.sh",
    "filterbyname.sh",
    "filtersubs.sh",
    "getreads.sh",
    "shred.sh",
    "fuse.sh",
    "shuffle.sh",
    "textfile.sh",
    "countsharedlines.sh",
    "filterlines.sh",
    "makechimeras.sh",
    "phylip2fasta.sh",
    "readlength.sh",
    "reformat.sh",
    "rename.sh",
    "repair.sh",
    "bbsplitpairs.sh",
    "splitnextera.sh",
    "splitsam.sh",
    "testformat.sh",
    "translate6frames.sh",
]


### Logging ####
# TODO: replace with snakemake logging once it is implemented
# Will be implemented in Snakemake https://github.com/snakemake/snakemake/pull/2474
def infer_stdout_and_stderr(log) -> tuple:
    """
    If multiple log files are provided, try to infer which one is for stderr.

    If only one log file is provided, or inference fails, return None for stdout_file


    Returns
    -------
    tuple
        stdout_file, stderr_file


    """

    if len(log) == 0:
        return None, None

    elif len(log) == 1:
        return None, log[0]

    else:
        # infer stdout and stderr file
        for key in ["stderr", "err"]:
            if hasattr(log, key):
                stderr_file = log[key]

        for key in ["stdout", "out"]:
            if hasattr(log, key):
                stdout_file = log[key]

        if (stderr_file is None) or (stderr_file is None):
            warnings.warn(
                "Cannot infer which logfile is stderr and which is stdout, Logging stderr and stdout to the same file"
            )
            return None, log[0]

        else:
            return stdout_file, stderr_file


def multiple_log_fmt_shell(snakemake, append_stderr=False, append_stdout=False) -> str:
    """
    Format shell command for logging to stdout and stderr files.
    """

    from snakemake.script import _log_shell_redirect

    stdout_file, stderr_file = infer_stdout_and_stderr(snakemake.log)

    if stdout_file is None:
        # log both to the same file
        return snakemake.log_fmt_shell(
            append=append_stderr or append_stdout, stdout=True, stderr=True
        )
    else:
        # successfully inferred und stderr and stdout file

        shell_log_fmt = (
            _log_shell_redirect(
                stderr_file, stdout=False, stderr=True, append=append_stderr
            )
            + " "
            + _log_shell_redirect(
                stdout_file, stdout=True, stderr=False, append=append_stdout
            )
        )

        return shell_log_fmt


try:
    _, logfile = infer_stdout_and_stderr(snakemake.log)

    # clear stderr file
    open(logfile, "w").close()

except:
    print("No log file provided, logging to stdout")
    logfile = None

logging.basicConfig(
    filename=logfile,
    level=logging.DEBUG,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger(__file__)


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logger.error(
        "".join(
            [
                "Uncaught exception: ",
                *traceback.format_exception(exc_type, exc_value, exc_traceback),
            ]
        )
    )


# Install exception handler
sys.excepthook = handle_exception


###################################### Beginning of wrapper ######################################


# global flags to check if input and output are parsed multiple times
parsed_input, parsed_output = False, False

import re
from snakemake_wrapper_utils import java
from snakemake.shell import shell


def get_java_opts(snakemake, java_mem_overhead_factor=0.15) -> str:
    """Obtain mem from params, and handle resource definitions in resources.
    As there was conflicts https://github.com/snakemake/snakemake-wrapper-utils/pull/37
    """

    all_java_opts = java.get_java_opts(snakemake, java_mem_overhead_factor)

    # regex to extract only the "-Xmx" part

    regex = re.compile(r"-Xmx\d+[gGmM]")
    memory_options = regex.findall(all_java_opts)[0]

    logger.info("Memory specification: " + memory_options)

    return memory_options


## Get positional arguments
# TODO: replace with snakemake logging once it is implemented
## Will be implemented in Snakemake https://github.com/snakemake/snakemake/pull/2509
def _get_unnamed_arguments(parameter_list):
    """
    Get unnamed arguments of snakemake.input or snakemake.output.
    Find it as the first key in the parameter_list.

    Run as:
        _get_unnamed_arguments(snakemake.input)

    """
    keys_with_positions = parameter_list._names
    if len(keys_with_positions) == 0:
        # all arguments are unnamed
        return parameter_list

    first_key = next(iter(keys_with_positions.items()))
    n_unnamed_arguments = first_key[1][0]

    logger.debug(f"Found {n_unnamed_arguments} unnamed arguments")
    # as for input arguments either is is a string or a list of strings
    if n_unnamed_arguments == 1:
        return parameter_list[0]
    else:
        return [parameter_list[i] for i in range(n_unnamed_arguments)]


def _parse_in_out(input_or_output, values):
    """
    Parse input or output arguments.

    With global check that is not parsed multiple times.

    """

    if input_or_output == "input":
        key = "in"
        global parsed_input
        already_parsed = parsed_input
    elif input_or_output == "output":
        key = "out"
        global parsed_output
        already_parsed = parsed_output

    else:
        raise Exception("input_or_output should be 'input' or 'output'")

    if already_parsed:
        raise Exception(f"{input_or_output} should be defined only once!")

    parsed_arg = _parse_paired_keys(key, values)
    logger.debug(f"parsed {input_or_output} argument: {parsed_arg}")

    if input_or_output == "input":
        parsed_input = True
    else:
        parsed_output = True

    return parsed_arg


def _parse_paired_keys(key, values):
    """
    Some keys in bbtools can be used with 1,2, as suffixes to indicate the paired end reads.

    Single files are parsed as:

        in=values

    Two files are parsed as:

        in1=values[0] in2=values[1]

    More than two files are parsed as:

        in=values[0],values[1],values[2],...

    the same applies to output arguments (out=).

    """
    if len(values) == 1 or isinstance(values, str):
        # single input file
        parsed_arg = f" {key}={values} "

    elif len(values) == 2:
        parsed_arg = f" {key}1={values[0]} {key}2={values[1]} "

    else:
        # multiple files
        warnings.warn(
            "More than 2 files provided, this case cannot be parsed unambigously! I parse it as a comma seperated list of files."
        )

        parsed_arg = f" {key}=" + ",".join(values)

    logger.info(f"parsed {key} argument: {parsed_arg}")
    return parsed_arg


def _parse_keywords_for_bbtool(parameter_list, section):
    """
    Parse rule input, output and params into a bbmap command.

    Run as.
        _parse_keywords_for_bbtool(
        snakemake.input,"input")


    """

    logger.info(f"Parse {section} section of snamemake rule into bbmap command")

    if section == "params":
        ignore_keys = ["command", "extra"]
    else:
        ignore_keys = ["flag"]

    # command to be build
    command = ""

    logger.debug(f"parameter_list: {parameter_list}")

    ## unnamed arguments
    unnamed_arguments = _get_unnamed_arguments(parameter_list)

    if len(unnamed_arguments) > 0:
        assert section in ["input", "output"]
        logger.info(f"Found unnamed arguments. parse them as {section}")

        command += _parse_in_out(section, unnamed_arguments)

    # parameters with keywords
    # transform to dict
    parameter_list = dict(parameter_list)

    for k in parameter_list.keys():
        logger.info(f"Parse keyword: {k}")
        if k in ignore_keys:
            logger.info(f"{k} argument detected, This is not passed to the bbtool.")
            pass

        # INPUT keys
        elif (k in input_keys) and (section == "input"):
            command += _parse_in_out(section, values=parameter_list[k])

        ## OUTPUT keys that can be paired
        elif k in paired_keys:
            if k == "out":
                command += _parse_in_out(section, values=parameter_list[k])
            # Other keys that can be paired, e.g. outm
            else:
                command += _parse_paired_keys(k, parameter_list[k])

        # other keys
        else:
            # bool conversions
            if type(parameter_list[k]) == bool:
                parameter_list[k] = "t" if parameter_list[k] else "f"

            # collapse list
            if isinstance(parameter_list[k], list):
                parameter_list[k] = ",".join(parameter_list[k])

            parsed = f" {k}={parameter_list[k]} "

            logger.info(f"parsed argument: {parsed}")
            command += parsed

    return command


def __check_for_duplicated_keywords(snakemake):
    input_keys = list(snakemake.input.keys())
    output_keys = list(snakemake.output.keys())
    params_keys = list(snakemake.params.keys())

    all_keys = input_keys + output_keys + params_keys

    if len(all_keys) != len(set(all_keys)):
        raise Exception("Duplicated keywords in input, output or params")


def parse_bbtool(snakemake):
    ## keywords
    __check_for_duplicated_keywords(snakemake)

    if not hasattr(snakemake.params, "command"):
        raise Exception("params needs 'command' parameter")
    else:
        command = snakemake.params.command
        assert type(command) == str, "command should be a string"
        assert len(command.split()) == 1, "command should not contain spaces"
        assert command.endswith(".sh"), "command should end with .sh"

        command_with_parameters = command
        logger.info(f"command is: {command_with_parameters}")

        # add extra arguments  at the beginning
        if hasattr(snakemake.params, "extra"):
            extra = snakemake.params.extra
            assert type(extra) == str, "extra should be a string"
            logger.info(f"extra arguments: {extra} ")
            command_with_parameters += f" {extra} "

    command_with_parameters += _parse_keywords_for_bbtool(snakemake.input, "input")
    command_with_parameters += _parse_keywords_for_bbtool(snakemake.output, "output")
    command_with_parameters += _parse_keywords_for_bbtool(snakemake.params, "params")

    # Add threads if not in single threaded scripts
    if command in single_threaded_scripts:
        if snakemake.threads > 3:
            logger.warning(
                f"Shell script {command} will only use 1-3 threads, but you specify {snakemake.threads} threads. I ignore the threads argument."
            )
    else:
        command_with_parameters += f" threads={snakemake.threads} "

    # memory
    java_opts = get_java_opts(snakemake, java_mem_overhead_factor=0.15)

    # log
    log = multiple_log_fmt_shell(snakemake, append_stderr=True)

    command_with_parameters += f" {java_opts} {log}"

    return command_with_parameters


command = parse_bbtool(snakemake)
logger.info(f"run command:\n\n\t{command}\n")

shell(command)