SEQKIT GENERIC WRAPPER

https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/seqkit?label=version%20update%20pull%20requests&color=1cb481

Run SeqKit.

URL: https://bioinf.shenwei.me/seqkit/

Example

This wrapper can be used in the following way:

rule seqkit_seq:
    input:
        fasta="data/{sample}.fa",
    output:
        fasta="out/seq/{sample}.fa.gz",
    log:
        "logs/seq/{sample}.log",
    params:
        command="seq",
        extra="--min-len 10",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_subseq_bed:
    input:
        fasta="data/{sample}.fa",
        bed="data/{sample}.bed",
    output:
        fasta="out/subseq/bed/{sample}.fa.gz",
    log:
        "logs/subseq/bed/{sample}.log",
    params:
        command="subseq",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_subseq_gtf:
    input:
        fasta="data/{sample}.fa",
        gtf="data/{sample}.gtf",
    output:
        fasta="out/subseq/gtf/{sample}.fa.gz",
    log:
        "logs/subseq/gtf/{sample}.log",
    params:
        command="subseq",
        extra="--feature CDS",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_subseq_region:
    input:
        fasta="data/{sample}.fa",
    output:
        fasta="out/subseq/region/{sample}.fa.gz",
    log:
        "logs/subseq/region/{sample}.log",
    params:
        command="subseq",
        extra="--region 1:12",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_fx2tab:
    input:
        fastx="data/{sample}.fastq",
    output:
        tsv="out/fx2tab/{sample}.tsv",
    log:
        "logs/fx2tab/{sample}.log",
    params:
        command="fx2tab",
        extra="--name",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_grep_name:
    input:
        fastx="data/{sample}.fastq",
        pattern="data/name.txt",
    output:
        fastx="out/grep/name/{sample}.fastq.gz",
    log:
        "logs/grep/name/{sample}.log",
    params:
        command="grep",
        extra="--by-name",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_grep_seq:
    input:
        fastx="data/{sample}.fastq",
        pattern="data/seq.txt",
    output:
        fastx="out/grep/seq/{sample}.fastq.gz",
    log:
        "logs/grep/seq/{sample}.log",
    params:
        command="grep",
        extra="--by-seq",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_rmdup_name:
    input:
        fastx="data/{sample}.fastq",
    output:
        fastx="out/rmdup/name/{sample}.fastq.gz",
        dup_num="out/rmdup/name/{sample}.num.txt",
        dup_seqs="out/rmdup/name/{sample}.seq.txt",
    log:
        "logs/rmdup/name/{sample}.log",
    params:
        command="rmdup",
        extra="--by-name",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_rmdup_seq:
    input:
        fastx="data/{sample}.fastq",
    output:
        fastx="out/rmdup/seq/{sample}.fastq.gz",
        dup_num="out/rmdup/seq/{sample}.num.txt",
        dup_seqs="out/rmdup/seq/{sample}.seq.txt",
    log:
        "logs/rmdup/seq/{sample}.log",
    params:
        command="rmdup",
        extra="--by-seq",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_stats:
    input:
        fastx="data/{sample}.fastq",
    output:
        stats="out/stats/{sample}.tsv",
    log:
        "logs/stats/{sample}.log",
    params:
        command="stats",
        extra="--all --tabular",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_common:
    input:
        fastas=[
            "data/{sample1}.fa",
            "data/{sample2}.fa",
        ],
    output:
        fasta="out/common/{sample1}_{sample2}.fa.gz",
    log:
        "logs/common/{sample1}_{sample2}.log",
    params:
        command="common",
        extra="",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_concat:
    input:
        fastas=[
            "data/{sample1}.fa",
            "data/{sample2}.fa",
        ],
    output:
        fasta="out/concat/{sample1}_{sample2}.fa.gz",
    log:
        "logs/concat/{sample1}_{sample2}.log",
    params:
        command="concat",
        extra="",
        out_bgzip=True,
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_split2_part:
    input:
        fasta="data/{sample}.fa",
    output:
        fasta=[
            "out/split2/part/{sample}.1-of-2.fas",
            "out/split2/part/{sample}.2-of-2.fas",
        ],
    log:
        "logs/split/part/{sample}.log",
    params:
        command="split2",
        extra="--by-part 2",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Notes

First input and output file is considered to be the main one (except for commands concat, common, and stats that take all input files).
Keys for extra input and output files need to match seqkit arguments without the -file suffix (if present).

Software dependencies

seqkit=2.13.0
htslib=1.23.1
snakemake-wrapper-utils=0.8.0

Input/Output

Input:

input file(s)

Output:

output file(s)

Params

command: SeqKit command to use.
extra: Optional parameters.

Authors

Filipe G. Vieira

Code

__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2023, Filipe G. Vieira"
__license__ = "MIT"

import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import is_arg

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


# Subcommands with multiple input files
if snakemake.params.command in ["concat", "common", "stats"]:
    input = " ".join(snakemake.input)
# Subcommands with a single input file (if more than one provided, concat'em all)
elif snakemake.params.command in [
    "sum",
    "rmdup",
    "split",
    "split2",
    "sample",
    "sort",
]:
    input = "<(cat " + " ".join(snakemake.input) + ")"
else:
    input = snakemake.input[0]


# Extra input
extra_input = " ".join(
    [
        (
            f"--{key.replace('_','-')} {value}"
            if key in ["bed", "gtf"]
            else f"--{key.replace('_','-')}-file {value}"
        )
        for key, value in snakemake.input.items()
    ][1:]
)


# Extra output
extra_output = " ".join(
    [
        (
            f"--{key.replace('_','-')} {value}"
            if key in ["read1", "read2"]
            else f"--{key.replace('_','-')}-file {value}"
        )
        for key, value in snakemake.output.items()
    ][1:]
)


if snakemake.params.command in ["split", "split2"]:
    # Check type of splitting
    if is_arg("-i", extra) or is_arg("--by-id", extra):
        split_by = "id"
    elif is_arg("-p", extra) or is_arg("--by-part", extra):
        split_by = "part"
    elif is_arg("-r", extra) or is_arg("--by-region", extra):
        split_by = "region"
    elif is_arg("-s", extra) or is_arg("--by-size", extra):
        split_by = "size"
    elif is_arg("-l", extra) or is_arg("--by-length", extra):
        split_by = "length"

    out_dir = Path(snakemake.output[0]).parent
    output = f"--out-dir {out_dir} --by-{split_by}-prefix output_part. --extension .fas"
else:
    if snakemake.params.get("out_bgzip"):
        assert Path(snakemake.output[0]).suffix in [
            ".gz",
            ".bgz",
            ".bgzip",
        ], "invalid output file extension"
        output = f"| bgzip --threads {snakemake.threads} > {snakemake.output[0]}"
    else:
        output = f"--out-file {snakemake.output[0]}"


shell(
    "(seqkit {snakemake.params.command}"
    " --threads {snakemake.threads}"
    " {extra_input}"
    " {extra_output}"
    " {extra}"
    " {input}"
    " {output}"
    ") {log}"
)


# Rename output files
if snakemake.params.command in ["split", "split2"]:
    for idx, output_file in enumerate(snakemake.output, start=1):
        shell("mv {out_dir}/output_part.{idx:03d}.fas {output_file}")