.. _`bio/bbtools`: GENERIC ======= .. image:: https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/bbtools?label=version%20update%20pull%20requests :target: https://github.com/snakemake/snakemake-wrappers/pulls?q=is%3Apr+is%3Aopen+label%3Abio/bbtools Generic wrapper for BBTools. One pattern to run them all. **URL**: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/ Example ------- This wrapper can be used in the following way: .. code-block:: python # Include snakefile with get_fractons function returning ["R1", "R2"] for paired reads or ["se"] for single reads include: "input_functions.smk" rule all: input: expand("logs/loglog/{sample}.log", sample=["SampleA"]), expand("covstats/{sample}.tsv", sample=["SampleA"]), rule loglog: input: expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()), log: stdout="logs/loglog/{sample}.log", stderr="logs/loglog/{sample}.err", # Captures the error output of the command and the log of the wrapper params: command="loglog.sh", extra="buckets=2048 seed=1234", threads: 2 resources: mem_mb=1024, wrapper: "v3.0.1/bio/bbtools" # ########## Quality control of reads ########## # all rules use the same wrapper!! rule bbduk: input: expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()), adapters="adapt.fas", output: out=expand( "reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions() ), outmatch=expand( "reads/discarded/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions() ), #outsingle="reads/discarded/{sample}_singletons.fastq.gz", # only for paired reads stats="stats/trimming_stats/{sample}.txt", log: "logs/bbduk/{sample}.log", threads: 7 params: command="bbduk.sh", extra="tpe tbo ", ktrim="r ", ref=lambda w, input: [input.adapters, "adapters", "artifacts"], k=23, mink=11, hdist=1, trimpolygright=10, minlen=25, maxns=30, entropy=0.5, entropywindow=50, entropyk=5, resources: mem_mb=4000, log: "logs/trim/{sample}.log", wrapper: "v3.0.1/bio/bbtools" rule tadpole_correct: input: expand("reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()), output: out=expand( "reads/corrected/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions() ), outd="reads/discarded/{sample}.fastq.gz", log: err="logs/correct/{sample}.log", params: command="tadpole.sh", mode="correct", threads: 2 resources: mem_mb=1024, wrapper: "v3.0.1/bio/bbtools" ########## Assembly ########## rule tadpole_assemble: input: expand( "reads/corrected/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions() ), output: "assembly/{sample}.fasta.gz", log: "logs/assemble/{sample}.log", params: command="tadpole.sh", mode="contig", threads: 2 resources: mem_mb=1024, wrapper: "v3.0.1/bio/bbtools" ########## Map ########## rule map_reads_with_bbmap: input: input=expand( "reads/trimmed/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions() ), ref="assembly/{sample}.fasta.gz", output: out="mapped/{sample}.bam", log: stderr="logs/bbmap/{sample}.log", # Split stdout and stderr into separate files to have a nice log file stdout="logs/bbmap/{sample}.out", params: command="bbmap.sh", machineout=True, overwrite=True, # reccomended unpigz=True, nodisk=True, # Don't write a index threads: 12 resources: mem_mb=1000, # optional: bbmap normaly needs a lot of memory, e.g. 60GB wrapper: "v3.0.1/bio/bbtools" rule pileup: input: input="mapped/{sample}.bam", ref="genome.fasta", output: covstats="covstats/{sample}.tsv", basecov="basecov/{sample}.tsv", log: stderr="logs/pileup/{sample}.log", stdout="logs/pileup/{sample}.out", params: command="pileup.sh", nzo=True, overwrite=True, threads: 4 resources: mem_mb=1000, wrapper: "v3.0.1/bio/bbtools" ### Create reads rule index_genome: input: ref="genome.fasta", output: # path to the index base folder # The index is actually stored in subfolders of the path {path}/ref/genome/{build} and {path}/ref/index/{build} # Where build is by default 1 path=temp(directory("path/to/indexes/genome/")), params: command="bbmap.sh", log: "logs/index_genome.log", resources: mem_mb=1000, wrapper: "v3.0.1/bio/bbtools" rule create_reads: input: path="path/to/indexes/genome/", output: out=expand("reads/raw/{{sample}}_{fraction}.fastq.gz", fraction=get_fractions()), params: command="randomreads.sh", # Define which bbmap command to use # All parameters are passed to the command length=100, reads=100, paired=True, overwrite=True, pigz=True, resources: mem_mb=1000, log: "logs/create_reads/{sample}.log", wrapper: "v3.0.1/bio/bbtools" Note that input, output and log file paths can be chosen freely. When running with .. code-block:: bash snakemake --use-conda the software dependencies will be automatically deployed into an isolated environment before execution. Notes ----- This wrapper allows to run any of the bbtools. The command is defined by the 'command' parameter, which is required. **All keywords in input, output, params are passed as key value pairs to the command!** Take care that they are valid for the bbtool. As it is not possible to define 'in' as a keyword, the keyword 'input' is used instead. Allowed aliases are 'sample' and 'reads'. ### Paired input/output files If exactly two files are provides for 'input' the wrapper parses them as in1, in2. The same holds for the output keywords 'out', 'outm', 'outu'. For all parameters, if more than two files are provided, the wrapper parses them as key=value1,value2,value3... ### Logging The wrapper makes a detailed log of how he parse the parameters. If you want to use the log of a bbtool, e.g. for parsing how many reads where processed, you have to specify a 'stdout', and 'stderr' log file. The wrapper will then write only to the stderr-log file. ### List of all the tools in the bbtools suite 39.01 In theory all of them are sported by this wrapper, but we didn't test them all. Scripts with different input/output might not be supported by this wrapper. If you find one that is not yet supported, please feel free to adjust this wrapper accordingly and include a test case. - bbmap.sh - removehuman.sh - removehuman2.sh - mapnt.sh - mapPacBio.sh - bbmapskimmer.sh - bbsplit.sh - bbwrap.sh - pileup.sh - summarizescafstats.sh - filterbycoverage.sh - mergeOTUs.sh - bbest.sh - postfilter.sh - bbduk.sh - bbduk2.sh - seal.sh - summarizeseal.sh - loglog.sh - kmercountexact.sh - bbnorm.sh - ecc.sh - khist.sh - bbcountunique.sh - commonkmers.sh - kmercoverage.sh - callpeaks.sh - tadpole.sh - tadwrapper.sh - kcompress.sh - stats.sh - statswrapper.sh - countgc.sh - fungalrelease.sh - filterbytaxa.sh - gi2taxid.sh - gitable.sh - sortbytaxa.sh - splitbytaxa.sh - taxonomy.sh - taxtree.sh - reducesilva.sh - synthmda.sh - crosscontaminate.sh - decontaminate.sh - crossblock.sh - dedupe.sh - dedupe2.sh - dedupebymapping.sh - clumpify.sh - bbmerge.sh - bbmerge-auto.sh - bbmergegapped.sh - randomreads.sh - bbfakereads.sh - gradesam.sh - samtoroc.sh - addadapters.sh - grademerge.sh - printtime.sh - msa.sh - cutprimers.sh - idmatrix.sh - matrixtocolumns.sh - countbarcodes.sh - filterbarcodes.sh - mergebarcodes.sh - removebadbarcodes.sh - demuxbyname.sh - filterbysequence.sh - filterbyname.sh - filtersubs.sh - getreads.sh - estherfilter.sh - bbqc.sh - rqcfilter.sh - shred.sh - fuse.sh - shuffle.sh - calcmem.sh - textfile.sh - countsharedlines.sh - filterlines.sh - a_sample_mt.sh - bbmask.sh - calctruequality.sh - makechimeras.sh - phylip2fasta.sh - readlength.sh - reformat.sh - removesmartbell.sh - rename.sh - repair.sh - bbsplitpairs.sh - splitnextera.sh - splitsam.sh - testformat.sh - translate6frames.sh Software dependencies --------------------- * ``bbmap=39.01`` * ``snakemake-wrapper-utils=0.6.2`` * ``pigz=2.8`` Input/Output ------------ **Input:** * ``input``: All keywords in input are passed to the bbtool as key=value pairs. A special case is 'input', which is translated to 'in'. If exactly two files are provides for 'input' the wrapper parses them as in1, in2. * ``flag``: The keyword 'flag' is ignored and allows to specify input files that are not processed by the bbtool that is used. **Output:** * ``out``: If exactly two files are provides for 'out' the wrapper parses them as out1, out2. * ``outm``: Same as 'out' * ``outu``: Same as 'out' * ``flag``: The keyword 'flag' is not passed to the command. Params ------ * ``command``: Required parameter defining the command to be used e.g. 'bbmap.sh' * ``extra``: additional program arguments. All other parameters are passed to the bbtool as key=value pairs Authors ------- * Silas Kieser Code ---- .. code-block:: python __author__ = "Silas Kieser" __copyright__ = "Copyright 2023, Filipe G. Vieira" __license__ = "MIT" ### loging all exceptiuon to log file if available import sys import logging, traceback import warnings input_keys = ["input", "fastq", "sample", "reads"] # keys that can be used with 1,2, as suffixes to indicate the paired end reads. paired_keys = ["in", "out", "outm", "outu", "outmatch"] single_threaded_scripts = [ "pileup.sh", "summarizescafstats.sh", "filterbycoverage.sh.", "mergeOTUs.sh", "bbest.sh", "summarizeseal.sh", "bbcountunique.sh", "commonkmers.sh", "callpeaks.sh", "kcompress.sh", "stats.sh", "statswrapper.sh", "fungalrelease.sh", "filterbytaxa.sh", "gi2taxid.sh", "gitable.sh", "sortbytaxa.sh", "taxonomy.sh", "taxtree.sh", "reducesilva.sh", "synthmda.sh", "crosscontaminate.sh", "dedupebymapping.sh", "randomreads.sh", "bbfakereads.sh", "gradesam.sh", "samtoroc.sh", "addadapters.sh", "grademerge.sh", "printtime.sh", "msa.sh", "cutprimers.sh", "matrixtocolumns.sh", "countbarcodes.sh", "filterbarcodes.sh", "mergebarcodes.sh", "removebadbarcodes.sh", "demuxbyname.sh", "filterbyname.sh", "filtersubs.sh", "getreads.sh", "shred.sh", "fuse.sh", "shuffle.sh", "textfile.sh", "countsharedlines.sh", "filterlines.sh", "makechimeras.sh", "phylip2fasta.sh", "readlength.sh", "reformat.sh", "rename.sh", "repair.sh", "bbsplitpairs.sh", "splitnextera.sh", "splitsam.sh", "testformat.sh", "translate6frames.sh", ] ### Logging #### # TODO: replace with snakemake logging once it is implemented # Will be implemented in Snakemake https://github.com/snakemake/snakemake/pull/2474 def infer_stdout_and_stderr(log) -> tuple: """ If multiple log files are provided, try to infer which one is for stderr. If only one log file is provided, or inference fails, return None for stdout_file Returns ------- tuple stdout_file, stderr_file """ if len(log) == 0: return None, None elif len(log) == 1: return None, log[0] else: # infer stdout and stderr file for key in ["stderr", "err"]: if hasattr(log, key): stderr_file = log[key] for key in ["stdout", "out"]: if hasattr(log, key): stdout_file = log[key] if (stderr_file is None) or (stderr_file is None): warnings.warn( "Cannot infer which logfile is stderr and which is stdout, Logging stderr and stdout to the same file" ) return None, log[0] else: return stdout_file, stderr_file def multiple_log_fmt_shell(snakemake, append_stderr=False, append_stdout=False) -> str: """ Format shell command for logging to stdout and stderr files. """ from snakemake.script import _log_shell_redirect stdout_file, stderr_file = infer_stdout_and_stderr(snakemake.log) if stdout_file is None: # log both to the same file return snakemake.log_fmt_shell( append=append_stderr or append_stdout, stdout=True, stderr=True ) else: # successfully inferred und stderr and stdout file shell_log_fmt = ( _log_shell_redirect( stderr_file, stdout=False, stderr=True, append=append_stderr ) + " " + _log_shell_redirect( stdout_file, stdout=True, stderr=False, append=append_stdout ) ) return shell_log_fmt try: _, logfile = infer_stdout_and_stderr(snakemake.log) # clear stderr file open(logfile, "w").close() except: print("No log file provided, logging to stdout") logfile = None logging.basicConfig( filename=logfile, level=logging.DEBUG, format="%(asctime)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) logger = logging.getLogger(__file__) def handle_exception(exc_type, exc_value, exc_traceback): if issubclass(exc_type, KeyboardInterrupt): sys.__excepthook__(exc_type, exc_value, exc_traceback) return logger.error( "".join( [ "Uncaught exception: ", *traceback.format_exception(exc_type, exc_value, exc_traceback), ] ) ) # Install exception handler sys.excepthook = handle_exception ###################################### Beginning of wrapper ###################################### # global flags to check if input and output are parsed multiple times parsed_input, parsed_output = False, False import re from snakemake_wrapper_utils import java from snakemake.shell import shell def get_java_opts(snakemake, java_mem_overhead_factor=0.15) -> str: """Obtain mem from params, and handle resource definitions in resources. As there was conflicts https://github.com/snakemake/snakemake-wrapper-utils/pull/37 """ all_java_opts = java.get_java_opts(snakemake, java_mem_overhead_factor) # regex to extract only the "-Xmx" part regex = re.compile(r"-Xmx\d+[gGmM]") memory_options = regex.findall(all_java_opts)[0] logger.info("Memory specification: " + memory_options) return memory_options ## Get positional arguments # TODO: replace with snakemake logging once it is implemented ## Will be implemented in Snakemake https://github.com/snakemake/snakemake/pull/2509 def _get_unnamed_arguments(parameter_list): """ Get unnamed arguments of snakemake.input or snakemake.output. Find it as the first key in the parameter_list. Run as: _get_unnamed_arguments(snakemake.input) """ keys_with_positions = parameter_list._names if len(keys_with_positions) == 0: # all arguments are unnamed return parameter_list first_key = next(iter(keys_with_positions.items())) n_unnamed_arguments = first_key[1][0] logger.debug(f"Found {n_unnamed_arguments} unnamed arguments") # as for input arguments either is is a string or a list of strings if n_unnamed_arguments == 1: return parameter_list[0] else: return [parameter_list[i] for i in range(n_unnamed_arguments)] def _parse_in_out(input_or_output, values): """ Parse input or output arguments. With global check that is not parsed multiple times. """ if input_or_output == "input": key = "in" global parsed_input already_parsed = parsed_input elif input_or_output == "output": key = "out" global parsed_output already_parsed = parsed_output else: raise Exception("input_or_output should be 'input' or 'output'") if already_parsed: raise Exception(f"{input_or_output} should be defined only once!") parsed_arg = _parse_paired_keys(key, values) logger.debug(f"parsed {input_or_output} argument: {parsed_arg}") if input_or_output == "input": parsed_input = True else: parsed_output = True return parsed_arg def _parse_paired_keys(key, values): """ Some keys in bbtools can be used with 1,2, as suffixes to indicate the paired end reads. Single files are parsed as: in=values Two files are parsed as: in1=values[0] in2=values[1] More than two files are parsed as: in=values[0],values[1],values[2],... the same applies to output arguments (out=). """ if len(values) == 1 or isinstance(values, str): # single input file parsed_arg = f" {key}={values} " elif len(values) == 2: parsed_arg = f" {key}1={values[0]} {key}2={values[1]} " else: # multiple files warnings.warn( "More than 2 files provided, this case cannot be parsed unambigously! I parse it as a comma seperated list of files." ) parsed_arg = f" {key}=" + ",".join(values) logger.info(f"parsed {key} argument: {parsed_arg}") return parsed_arg def _parse_keywords_for_bbtool(parameter_list, section): """ Parse rule input, output and params into a bbmap command. Run as. _parse_keywords_for_bbtool( snakemake.input,"input") """ logger.info(f"Parse {section} section of snamemake rule into bbmap command") if section == "params": ignore_keys = ["command", "extra"] else: ignore_keys = ["flag"] # command to be build command = "" logger.debug(f"parameter_list: {parameter_list}") ## unnamed arguments unnamed_arguments = _get_unnamed_arguments(parameter_list) if len(unnamed_arguments) > 0: assert section in ["input", "output"] logger.info(f"Found unnamed arguments. parse them as {section}") command += _parse_in_out(section, unnamed_arguments) # parameters with keywords # transform to dict parameter_list = dict(parameter_list) for k in parameter_list.keys(): logger.info(f"Parse keyword: {k}") if k in ignore_keys: logger.info(f"{k} argument detected, This is not passed to the bbtool.") pass # INPUT keys elif (k in input_keys) and (section == "input"): command += _parse_in_out(section, values=parameter_list[k]) ## OUTPUT keys that can be paired elif k in paired_keys: if k == "out": command += _parse_in_out(section, values=parameter_list[k]) # Other keys that can be paired, e.g. outm else: command += _parse_paired_keys(k, parameter_list[k]) # other keys else: # bool conversions if type(parameter_list[k]) == bool: parameter_list[k] = "t" if parameter_list[k] else "f" # collapse list if isinstance(parameter_list[k], list): parameter_list[k] = ",".join(parameter_list[k]) parsed = f" {k}={parameter_list[k]} " logger.info(f"parsed argument: {parsed}") command += parsed return command def __check_for_duplicated_keywords(snakemake): input_keys = list(snakemake.input.keys()) output_keys = list(snakemake.output.keys()) params_keys = list(snakemake.params.keys()) all_keys = input_keys + output_keys + params_keys if len(all_keys) != len(set(all_keys)): raise Exception("Duplicated keywords in input, output or params") def parse_bbtool(snakemake): ## keywords __check_for_duplicated_keywords(snakemake) if not hasattr(snakemake.params, "command"): raise Exception("params needs 'command' parameter") else: command = snakemake.params.command assert type(command) == str, "command should be a string" assert len(command.split()) == 1, "command should not contain spaces" assert command.endswith(".sh"), "command should end with .sh" command_with_parameters = command logger.info(f"command is: {command_with_parameters}") # add extra arguments at the beginning if hasattr(snakemake.params, "extra"): extra = snakemake.params.extra assert type(extra) == str, "extra should be a string" logger.info(f"extra arguments: {extra} ") command_with_parameters += f" {extra} " command_with_parameters += _parse_keywords_for_bbtool(snakemake.input, "input") command_with_parameters += _parse_keywords_for_bbtool(snakemake.output, "output") command_with_parameters += _parse_keywords_for_bbtool(snakemake.params, "params") # Add threads if not in single threaded scripts if command in single_threaded_scripts: if snakemake.threads > 3: logger.warning( f"Shell script {command} will only use 1-3 threads, but you specify {snakemake.threads} threads. I ignore the threads argument." ) else: command_with_parameters += f" threads={snakemake.threads} " # memory java_opts = get_java_opts(snakemake, java_mem_overhead_factor=0.15) # log log = multiple_log_fmt_shell(snakemake, append_stderr=True) command_with_parameters += f" {java_opts} {log}" return command_with_parameters command = parse_bbtool(snakemake) logger.info(f"run command:\n\n\t{command}\n") shell(command) .. |nl| raw:: html