MMSEQS2 WORKFLOWS

ultra fast and sensitive sequence search and clustering suite

URL: https://github.com/soedinglab/mmseqs2

Example

This wrapper can be used in the following way:

rule mmseqs2_workflow_search:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
        target="seqs/nucl.{sample}.fas.gz",
    output:
        aln="out/search/{sample}.tsv",
    log:
        "logs/search/{sample}.log",
    params:
        module="easy-search",
        extra="--search-type 3",
    threads: 2
    wrapper:
        "v9.3.0/bio/mmseqs2/workflows"


rule mmseqs2_workflow_cluster:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
    output:
        fas="out/cluster/a_b.seqs.fas",
        tsv="out/cluster/a_b.cluster.tsv",
        fas_rep="out/cluster/a_b.rep_seq.fasta",
    log:
        "logs/cluster/a_b.log",
    params:
        module="easy-cluster",
        extra="--max-seqs 5",
    threads: 2
    wrapper:
        "v9.3.0/bio/mmseqs2/workflows"


rule mmseqs2_workflow_linclust:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
    output:
        fas="out/linclust/a_b.seqs.fas",
        tsv="out/linclust/a_b.cluster.tsv",
        fas_rep="out/linclust/a_b.rep_seq.fasta",
    log:
        "logs/linclust/a_b.log",
    params:
        module="easy-linclust",
        extra="--max-seq-len 1000",
    threads: 2
    wrapper:
        "v9.3.0/bio/mmseqs2/workflows"


rule mmseqs2_workflow_taxonomy:
    input:
        query=["seqs/nucl.a.fas.gz", "seqs/nucl.b.fas.gz"],
        target=multiext(
            "db/{sample}",
            "",
            ".dbtype",
            ".index",
            ".lookup",
            ".source",
            "_h",
            "_h.dbtype",
            "_h.index",
        ),
    output:
        lca="out/taxonomy/{sample}.lca.tsv",
        report="out/taxonomy/{sample}.report",
        taln="out/taxonomy/{sample}.tophit_aln",
        treport="out/taxonomy/{sample}.tophit_report",
    log:
        "logs/taxonomy/{sample}.log",
    params:
        module="easy-taxonomy",
        extra="--max-seq-len 1000 --search-type 3",
    threads: 2
    wrapper:
        "v9.3.0/bio/mmseqs2/workflows"


rule mmseqs2_workflow_rbh:
    input:
        query="seqs/nucl.b.fas.gz",
        target="seqs/nucl.{sample}.fas.gz",
    output:
        aln="out/rbh/{sample}.tsv",
    log:
        "logs/rbh/{sample}.log",
    params:
        module="easy-rbh",
        extra="--search-type 3",
    threads: 2
    wrapper:
        "v9.3.0/bio/mmseqs2/workflows"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Software dependencies

mmseqs2=18.8cc5c
snakemake-wrapper-utils=0.8.0

Input/Output

Input:

query: input query FAS file(s)
target: input target FAS file(s) or DB

Output:

output: FAS, cluster or DB file(s)

Params

module: workflow to use; it can either be easy-search, easy-cluster, easy-linclust, easy-taxonomy, easy-rbh <https://github.com/soedinglab/mmseqs2/wiki#reciprocal-best-hit-using-mmseqs-rbh>_
extra: additional program arguments

Authors

Filipe G. Vieira

Code

__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2024, Filipe G. Vieira"
__license__ = "MIT"

import os
import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import get_format, move_files

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


# Input
target = snakemake.input.get("target", "")
if isinstance(target, list):
    target = os.path.commonprefix(snakemake.input.target)


with tempfile.TemporaryDirectory() as tmpout:
    out = snakemake.output
    move_cmds = ""
    # Output
    if snakemake.params.module in ["easy_search", "easy_rbh"]:
        format_mode = get_format(out)
        # (0) BLAST-TAB
        # (1) SAM
        # (2) BLAST-TAB + query/db length
        # (3) HTML
        # (4) BLAST-TAB + column headers
        if format_mode == "sam":
            extra += " --format-mode 1"
        elif format_mode == "html":
            extra += " --format-mode 3"
    elif snakemake.params.module in ["easy-cluster", "easy-linclust"]:
        out = Path(tmpout) / "clust"
        mapping = {
            "fas": str(out) + "_all_seqs.fasta",
            "tsv": str(out) + "_cluster.tsv",
            "fas_rep": str(out) + "_rep_seq.fasta",
        }
        move_cmds = "; ".join(move_files(snakemake, mapping))
    elif snakemake.params.module == "easy-taxonomy":
        out = Path(tmpout) / "taxonomy"
        mapping = {
            "lca": str(out) + "_lca.tsv",
            "report": str(out) + "_report",
            "taln": str(out) + "_tophit_aln",
            "treport": str(out) + "_tophit_report",
        }
        move_cmds = "; ".join(move_files(snakemake, mapping))

    # Run mmseqs2
    with tempfile.TemporaryDirectory() as tmpdir:
        shell(
            "(mmseqs {snakemake.params.module} {snakemake.input.query} {target} {out} {tmpdir} --threads {snakemake.threads} {extra}; {move_cmds}) {log}"
        )