BBMAP-SUITE
Wrapper for all tools in the BBmap suite. One pattern to run them all.
URL: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/
Example
This wrapper can be used in the following way:
# Include snakefile with get_fractons function returning ["R1", "R2"] for paired reads or ["se"] for single reads
include: "input_functions.smk"
rule all:
input:
expand("logs/loglog/{sample}.log", sample=["SampleA"]),
expand("covstats/{sample}.tsv", sample=["SampleA"]),
rule loglog:
input:
expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
log:
"logs/loglog/{sample}.log", # Captures the error output of the command and the log of the wrapper
params:
command="loglog.sh",
buckets=2048,
seed=1234,
threads: 2
resources:
mem_mb=1024,
wrapper:
"v3.9.0/bio/bbtools"
# ########## Quality control of reads ##########
# all rules use the same wrapper!!
rule bbduk:
input:
expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
adapters="adapt.fas",
output:
out=expand(
"reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
),
outmatch=expand(
"reads/discarded/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
),
#outsingle="reads/discarded/{sample}_singletons.fastq.gz", # only for paired reads
stats="stats/trimming_stats/{sample}.txt",
log:
"logs/bbduk/{sample}.log",
threads: 7
params:
command="bbduk.sh",
extra="tpe tbo ",
ktrim="r ",
ref=lambda w, input: [input.adapters, "adapters", "artifacts"],
k=23,
mink=11,
hdist=1,
trimpolygright=10,
minlen=25,
maxns=30,
entropy=0.5,
entropywindow=50,
entropyk=5,
resources:
mem_mb=4000,
log:
"logs/trim/{sample}.log",
wrapper:
"v3.9.0/bio/bbtools"
rule tadpole_correct:
input:
expand("reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
output:
out=expand(
"reads/corrected/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
),
outd="reads/discarded/{sample}.fastq.gz",
log:
"logs/correct/{sample}.log",
params:
command="tadpole.sh",
mode="correct",
threads: 2
resources:
mem_mb=1024,
wrapper:
"v3.9.0/bio/bbtools"
########## Assembly ##########
rule tadpole_assemble:
input:
expand(
"reads/corrected/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
),
output:
"assembly/{sample}.fasta.gz",
log:
"logs/assemble/{sample}.log",
params:
command="tadpole.sh",
mode="contig",
threads: 2
resources:
mem_mb=1024,
wrapper:
"v3.9.0/bio/bbtools"
########## Map ##########
rule map_reads_with_bbmap:
input:
input=expand(
"reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()
),
ref="assembly/{sample}.fasta.gz",
output:
out="mapped/{sample}.bam",
log:
"logs/bbmap/{sample}.log",
params:
command="bbmap.sh",
machineout=True,
overwrite=True, # recommended
unpigz=True,
nodisk=True, # Don't write a index
threads: 12
resources:
mem_mb=1000, # optional: bbmap normaly needs a lot of memory, e.g. 60GB
wrapper:
"v3.9.0/bio/bbtools"
rule pileup:
input:
input="mapped/{sample}.bam",
ref="genome.fasta",
output:
covstats="covstats/{sample}.tsv",
basecov="basecov/{sample}.tsv",
log:
"logs/pileup/{sample}.log",
params:
command="pileup.sh",
nzo=True,
overwrite=True,
threads: 4
resources:
mem_mb=1000,
wrapper:
"v3.9.0/bio/bbtools"
### Create reads
rule index_genome:
input:
ref="genome.fasta",
output:
# path to the index base folder
# The index is actually stored in subfolders of the path {path}/ref/genome/{build} and {path}/ref/index/{build}
# Where build is by default 1
path=temp(directory("path/to/indexes/genome/")),
params:
command="bbmap.sh",
log:
"logs/index_genome.log",
resources:
mem_mb=1000,
wrapper:
"v3.9.0/bio/bbtools"
rule create_reads:
input:
path="path/to/indexes/genome/",
output:
out=expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()),
params:
command="randomreads.sh", # Define which bbmap command to use
# All parameters are passed to the command
length=100,
reads=100,
paired=True,
overwrite=True,
pigz=True,
resources:
mem_mb=1000,
log:
"logs/create_reads/{sample}.log",
wrapper:
"v3.9.0/bio/bbtools"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Notes
This wrapper allows to run any of the bbtools. The command is defined by the ‘command’ parameter, which is required. All keywords in input, output, params are passed as key value pairs to the command! Take care that they are valid for the bbtool. In theory all of the tools in the bbmap-suite v39.01 are sported by this wrapper, but we didn’t test them all. Scripts with different input/output might not be supported by this wrapper. If you find one that is not yet supported, please feel free to adjust this wrapper accordingly and include a test case.
Paired input/output files
As it is not possible to define ‘in’ as a keyword, the keyword ‘input’ is used instead. Allowed aliases are ‘sample’ and ‘reads’. If exactly two files are provides for ‘input’ the wrapper parses them as in1, in2. The same holds for the output keywords ‘out’, ‘outm’, ‘outu’. For all parameters, if more than two files are provided, the wrapper parses them as key=value1,value2,value3… The wrapper makes a detailed log of how he parsed the parameters in the log file.
Software dependencies
bbmap=39.06
snakemake-wrapper-utils=0.6.2
pigz=2.8
Input/Output
Input:
input
: All keywords in input are passed to the bbtool as key=value pairs. A special case is ‘input’, which is translated to ‘in’. If exactly two files are provides for ‘input’ the wrapper parses them as in1, in2.flag
: The keyword ‘flag’ is ignored and allows to specify input files that are not processed by the bbtool that is used.
Output:
out
: If exactly two files are provides for ‘out’ the wrapper parses them as out1, out2.outm
: Same as ‘out’outu
: Same as ‘out’flag
: The keyword ‘flag’ is not passed to the command.
Params
command
: Required parameter defining the command to be used e.g. ‘bbmap.sh’extra
: additional program arguments. All other parameters are passed to the bbtool as key=value pairs
Code
__author__ = "Silas Kieser"
__copyright__ = "Copyright 2023, Filipe G. Vieira"
__license__ = "MIT"
### loging all exceptiuon to log file if available
import sys
import logging, traceback
import warnings
input_keys = ["input", "fastq", "sample", "reads"]
# keys that can be used with 1,2, as suffixes to indicate the paired end reads.
paired_keys = ["in", "out", "outm", "outu", "outmatch"]
single_threaded_scripts = [
"pileup.sh",
"summarizescafstats.sh",
"filterbycoverage.sh.",
"mergeOTUs.sh",
"bbest.sh",
"summarizeseal.sh",
"bbcountunique.sh",
"commonkmers.sh",
"callpeaks.sh",
"kcompress.sh",
"stats.sh",
"statswrapper.sh",
"fungalrelease.sh",
"filterbytaxa.sh",
"gi2taxid.sh",
"gitable.sh",
"sortbytaxa.sh",
"taxonomy.sh",
"taxtree.sh",
"reducesilva.sh",
"synthmda.sh",
"crosscontaminate.sh",
"dedupebymapping.sh",
"randomreads.sh",
"bbfakereads.sh",
"gradesam.sh",
"samtoroc.sh",
"addadapters.sh",
"grademerge.sh",
"printtime.sh",
"msa.sh",
"cutprimers.sh",
"matrixtocolumns.sh",
"countbarcodes.sh",
"filterbarcodes.sh",
"mergebarcodes.sh",
"removebadbarcodes.sh",
"demuxbyname.sh",
"filterbyname.sh",
"filtersubs.sh",
"getreads.sh",
"shred.sh",
"fuse.sh",
"shuffle.sh",
"textfile.sh",
"countsharedlines.sh",
"filterlines.sh",
"makechimeras.sh",
"phylip2fasta.sh",
"readlength.sh",
"reformat.sh",
"rename.sh",
"repair.sh",
"bbsplitpairs.sh",
"splitnextera.sh",
"splitsam.sh",
"testformat.sh",
"translate6frames.sh",
]
### Logging ####
# TODO: replace with snakemake logging once it is implemented
# Will be implemented in Snakemake https://github.com/snakemake/snakemake/pull/2474
def infer_stdout_and_stderr(log) -> tuple:
"""
If multiple log files are provided, try to infer which one is for stderr.
If only one log file is provided, or inference fails, return None for stdout_file
Returns
-------
tuple
stdout_file, stderr_file
"""
if len(log) == 0:
return None, None
elif len(log) == 1:
return None, log[0]
else:
# infer stdout and stderr file
for key in ["stderr", "err"]:
if hasattr(log, key):
stderr_file = log[key]
for key in ["stdout", "out"]:
if hasattr(log, key):
stdout_file = log[key]
if (stderr_file is None) or (stderr_file is None):
warnings.warn(
"Cannot infer which logfile is stderr and which is stdout, Logging stderr and stdout to the same file"
)
return None, log[0]
else:
return stdout_file, stderr_file
def multiple_log_fmt_shell(snakemake, append_stderr=False, append_stdout=False) -> str:
"""
Format shell command for logging to stdout and stderr files.
"""
from snakemake.script import _log_shell_redirect
stdout_file, stderr_file = infer_stdout_and_stderr(snakemake.log)
if stdout_file is None:
# log both to the same file
return snakemake.log_fmt_shell(
append=append_stderr or append_stdout, stdout=True, stderr=True
)
else:
# successfully inferred und stderr and stdout file
shell_log_fmt = (
_log_shell_redirect(
stderr_file, stdout=False, stderr=True, append=append_stderr
)
+ " "
+ _log_shell_redirect(
stdout_file, stdout=True, stderr=False, append=append_stdout
)
)
return shell_log_fmt
try:
_, logfile = infer_stdout_and_stderr(snakemake.log)
# clear stderr file
open(logfile, "w").close()
except:
print("No log file provided, logging to stdout")
logfile = None
logging.basicConfig(
filename=logfile,
level=logging.DEBUG,
format="%(asctime)s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__file__)
def handle_exception(exc_type, exc_value, exc_traceback):
if issubclass(exc_type, KeyboardInterrupt):
sys.__excepthook__(exc_type, exc_value, exc_traceback)
return
logger.error(
"".join(
[
"Uncaught exception: ",
*traceback.format_exception(exc_type, exc_value, exc_traceback),
]
)
)
# Install exception handler
sys.excepthook = handle_exception
###################################### Beginning of wrapper ######################################
# global flags to check if input and output are parsed multiple times
parsed_input, parsed_output = False, False
import re
from snakemake_wrapper_utils import java
from snakemake.shell import shell
def get_java_opts(snakemake, java_mem_overhead_factor=0.15) -> str:
"""Obtain mem from params, and handle resource definitions in resources.
As there was conflicts https://github.com/snakemake/snakemake-wrapper-utils/pull/37
"""
all_java_opts = java.get_java_opts(snakemake, java_mem_overhead_factor)
# regex to extract only the "-Xmx" part
regex = re.compile(r"-Xmx\d+[gGmM]")
memory_options = regex.findall(all_java_opts)[0]
logger.info("Memory specification: " + memory_options)
return memory_options
## Get positional arguments
# TODO: replace with snakemake logging once it is implemented
## Will be implemented in Snakemake https://github.com/snakemake/snakemake/pull/2509
def _get_unnamed_arguments(parameter_list):
"""
Get unnamed arguments of snakemake.input or snakemake.output.
Find it as the first key in the parameter_list.
Run as:
_get_unnamed_arguments(snakemake.input)
"""
keys_with_positions = parameter_list._names
if len(keys_with_positions) == 0:
# all arguments are unnamed
return parameter_list
first_key = next(iter(keys_with_positions.items()))
n_unnamed_arguments = first_key[1][0]
logger.debug(f"Found {n_unnamed_arguments} unnamed arguments")
# as for input arguments either is is a string or a list of strings
if n_unnamed_arguments == 1:
return parameter_list[0]
else:
return [parameter_list[i] for i in range(n_unnamed_arguments)]
def _parse_in_out(input_or_output, values):
"""
Parse input or output arguments.
With global check that is not parsed multiple times.
"""
if input_or_output == "input":
key = "in"
global parsed_input
already_parsed = parsed_input
elif input_or_output == "output":
key = "out"
global parsed_output
already_parsed = parsed_output
else:
raise Exception("input_or_output should be 'input' or 'output'")
if already_parsed:
raise Exception(f"{input_or_output} should be defined only once!")
parsed_arg = _parse_paired_keys(key, values)
logger.debug(f"parsed {input_or_output} argument: {parsed_arg}")
if input_or_output == "input":
parsed_input = True
else:
parsed_output = True
return parsed_arg
def _parse_paired_keys(key, values):
"""
Some keys in bbtools can be used with 1,2, as suffixes to indicate the paired end reads.
Single files are parsed as:
in=values
Two files are parsed as:
in1=values[0] in2=values[1]
More than two files are parsed as:
in=values[0],values[1],values[2],...
the same applies to output arguments (out=).
"""
if len(values) == 1 or isinstance(values, str):
# single input file
parsed_arg = f" {key}={values} "
elif len(values) == 2:
parsed_arg = f" {key}1={values[0]} {key}2={values[1]} "
else:
# multiple files
warnings.warn(
"More than 2 files provided, this case cannot be parsed unambigously! I parse it as a comma seperated list of files."
)
parsed_arg = f" {key}=" + ",".join(values)
logger.info(f"parsed {key} argument: {parsed_arg}")
return parsed_arg
def _parse_keywords_for_bbtool(parameter_list, section):
"""
Parse rule input, output and params into a bbmap command.
Run as.
_parse_keywords_for_bbtool(
snakemake.input,"input")
"""
logger.info(f"Parse {section} section of snamemake rule into bbmap command")
if section == "params":
ignore_keys = ["command", "extra"]
else:
ignore_keys = ["flag"]
# command to be build
command = ""
logger.debug(f"parameter_list: {parameter_list}")
## unnamed arguments
unnamed_arguments = _get_unnamed_arguments(parameter_list)
if len(unnamed_arguments) > 0:
assert section in ["input", "output"]
logger.info(f"Found unnamed arguments. parse them as {section}")
command += _parse_in_out(section, unnamed_arguments)
# parameters with keywords
# transform to dict
parameter_list = dict(parameter_list)
for k in parameter_list.keys():
logger.info(f"Parse keyword: {k}")
if k in ignore_keys:
logger.info(f"{k} argument detected, This is not passed to the bbtool.")
pass
# INPUT keys
elif (k in input_keys) and (section == "input"):
command += _parse_in_out(section, values=parameter_list[k])
## OUTPUT keys that can be paired
elif k in paired_keys:
if k == "out":
command += _parse_in_out(section, values=parameter_list[k])
# Other keys that can be paired, e.g. outm
else:
command += _parse_paired_keys(k, parameter_list[k])
# other keys
else:
# bool conversions
if type(parameter_list[k]) == bool:
parameter_list[k] = "t" if parameter_list[k] else "f"
# collapse list
if isinstance(parameter_list[k], list):
parameter_list[k] = ",".join(parameter_list[k])
parsed = f" {k}={parameter_list[k]} "
logger.info(f"parsed argument: {parsed}")
command += parsed
return command
def __check_for_duplicated_keywords(snakemake):
input_keys = list(snakemake.input.keys())
output_keys = list(snakemake.output.keys())
params_keys = list(snakemake.params.keys())
all_keys = input_keys + output_keys + params_keys
if len(all_keys) != len(set(all_keys)):
raise Exception("Duplicated keywords in input, output or params")
def parse_bbtool(snakemake):
## keywords
__check_for_duplicated_keywords(snakemake)
if not hasattr(snakemake.params, "command"):
raise Exception("params needs 'command' parameter")
else:
command = snakemake.params.command
assert type(command) == str, "command should be a string"
assert len(command.split()) == 1, "command should not contain spaces"
assert command.endswith(".sh"), "command should end with .sh"
command_with_parameters = command
logger.info(f"command is: {command_with_parameters}")
# add extra arguments at the beginning
if hasattr(snakemake.params, "extra"):
extra = snakemake.params.extra
assert type(extra) == str, "extra should be a string"
logger.info(f"extra arguments: {extra} ")
command_with_parameters += f" {extra} "
command_with_parameters += _parse_keywords_for_bbtool(snakemake.input, "input")
command_with_parameters += _parse_keywords_for_bbtool(snakemake.output, "output")
command_with_parameters += _parse_keywords_for_bbtool(snakemake.params, "params")
# Add threads if not in single threaded scripts
if command in single_threaded_scripts:
if snakemake.threads > 3:
logger.warning(
f"Shell script {command} will only use 1-3 threads, but you specify {snakemake.threads} threads. I ignore the threads argument."
)
else:
command_with_parameters += f" threads={snakemake.threads} "
# memory
java_opts = get_java_opts(snakemake, java_mem_overhead_factor=0.15)
# log
log = multiple_log_fmt_shell(snakemake, append_stderr=True)
command_with_parameters += f" {java_opts} {log}"
return command_with_parameters
command = parse_bbtool(snakemake)
logger.info(f"run command:\n\n\t{command}\n")
shell(command)