SEQKIT GENERIC WRAPPER

https://img.shields.io/badge/wrapper_version-v9.4.2-10785b https://img.shields.io/github/issues-pr/snakemake/snakemake-wrappers/bio/seqkit?label=version%20update%20pull%20requests&color=1cb481

Run SeqKit.

URL: https://bioinf.shenwei.me/seqkit/

Example

This wrapper can be used in the following way:

rule seqkit_seq:
    input:
        fasta="data/{sample}.fa",
    output:
        fasta="out/seq/{sample}.fa.gz",
    log:
        "logs/seq/{sample}.log",
    params:
        command="seq",
        extra="--min-len 10",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_subseq_bed:
    input:
        fasta="data/{sample}.fa",
        bed="data/{sample}.bed",
    output:
        fasta="out/subseq/bed/{sample}.fa.gz",
    log:
        "logs/subseq/bed/{sample}.log",
    params:
        command="subseq",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_subseq_gtf:
    input:
        fasta="data/{sample}.fa",
        gtf="data/{sample}.gtf",
    output:
        fasta="out/subseq/gtf/{sample}.fa.gz",
    log:
        "logs/subseq/gtf/{sample}.log",
    params:
        command="subseq",
        extra="--feature CDS",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_subseq_region:
    input:
        fasta="data/{sample}.fa",
    output:
        fasta="out/subseq/region/{sample}.fa.gz",
    log:
        "logs/subseq/region/{sample}.log",
    params:
        command="subseq",
        extra="--region 1:12",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_fx2tab:
    input:
        fastx="data/{sample}.fastq",
    output:
        tsv="out/fx2tab/{sample}.tsv",
    log:
        "logs/fx2tab/{sample}.log",
    params:
        command="fx2tab",
        extra="--name",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_grep_name:
    input:
        fastx="data/{sample}.fastq",
        pattern="data/name.txt",
    output:
        fastx="out/grep/name/{sample}.fastq.gz",
    log:
        "logs/grep/name/{sample}.log",
    params:
        command="grep",
        extra="--by-name",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_grep_seq:
    input:
        fastx="data/{sample}.fastq",
        pattern="data/seq.txt",
    output:
        fastx="out/grep/seq/{sample}.fastq.gz",
    log:
        "logs/grep/seq/{sample}.log",
    params:
        command="grep",
        extra="--by-seq",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_rmdup_name:
    input:
        fastx="data/{sample}.fastq",
    output:
        fastx="out/rmdup/name/{sample}.fastq.gz",
        dup_num="out/rmdup/name/{sample}.num.txt",
        dup_seqs="out/rmdup/name/{sample}.seq.txt",
    log:
        "logs/rmdup/name/{sample}.log",
    params:
        command="rmdup",
        extra="--by-name",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_rmdup_seq:
    input:
        fastx="data/{sample}.fastq",
    output:
        fastx="out/rmdup/seq/{sample}.fastq.gz",
        dup_num="out/rmdup/seq/{sample}.num.txt",
        dup_seqs="out/rmdup/seq/{sample}.seq.txt",
    log:
        "logs/rmdup/seq/{sample}.log",
    params:
        command="rmdup",
        extra="--by-seq",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_stats:
    input:
        fastx="data/{sample}.fastq",
    output:
        stats="out/stats/{sample}.tsv",
    log:
        "logs/stats/{sample}.log",
    params:
        command="stats",
        extra="--all --tabular",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_common:
    input:
        fastas=[
            "data/{sample1}.fa",
            "data/{sample2}.fa",
        ],
    output:
        fasta="out/common/{sample1}_{sample2}.fa.gz",
    log:
        "logs/common/{sample1}_{sample2}.log",
    params:
        command="common",
        extra="",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_concat:
    input:
        fastas=[
            "data/{sample1}.fa",
            "data/{sample2}.fa",
        ],
    output:
        fasta="out/concat/{sample1}_{sample2}.fa.gz",
    log:
        "logs/concat/{sample1}_{sample2}.log",
    params:
        command="concat",
        extra="",
        out_bgzip=True,
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"


rule seqkit_split2_part:
    input:
        fasta="data/{sample}.fa",
    output:
        fasta=[
            "out/split2/part/{sample}.1-of-2.fas",
            "out/split2/part/{sample}.2-of-2.fas",
        ],
    log:
        "logs/split/part/{sample}.log",
    params:
        command="split2",
        extra="--by-part 2",
    threads: 2
    wrapper:
        "v9.4.2/bio/seqkit"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Notes

  • First input and output file is considered to be the main one (except for commands concat, common, and stats that take all input files).

  • Keys for extra input and output files need to match seqkit arguments without the -file suffix (if present).

Software dependencies

  • seqkit=2.13.0

  • htslib=1.23.1

  • snakemake-wrapper-utils=0.8.0

Input/Output

Input:

  • input file(s)

Output:

  • output file(s)

Params

  • command: SeqKit command to use.

  • extra: Optional parameters.

Authors

  • Filipe G. Vieira

Code

__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2023, Filipe G. Vieira"
__license__ = "MIT"

import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import is_arg

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


# Subcommands with multiple input files
if snakemake.params.command in ["concat", "common", "stats"]:
    input = " ".join(snakemake.input)
# Subcommands with a single input file (if more than one provided, concat'em all)
elif snakemake.params.command in [
    "sum",
    "rmdup",
    "split",
    "split2",
    "sample",
    "sort",
]:
    input = "<(cat " + " ".join(snakemake.input) + ")"
else:
    input = snakemake.input[0]


# Extra input
extra_input = " ".join(
    [
        (
            f"--{key.replace('_','-')} {value}"
            if key in ["bed", "gtf"]
            else f"--{key.replace('_','-')}-file {value}"
        )
        for key, value in snakemake.input.items()
    ][1:]
)


# Extra output
extra_output = " ".join(
    [
        (
            f"--{key.replace('_','-')} {value}"
            if key in ["read1", "read2"]
            else f"--{key.replace('_','-')}-file {value}"
        )
        for key, value in snakemake.output.items()
    ][1:]
)


if snakemake.params.command in ["split", "split2"]:
    # Check type of splitting
    if is_arg("-i", extra) or is_arg("--by-id", extra):
        split_by = "id"
    elif is_arg("-p", extra) or is_arg("--by-part", extra):
        split_by = "part"
    elif is_arg("-r", extra) or is_arg("--by-region", extra):
        split_by = "region"
    elif is_arg("-s", extra) or is_arg("--by-size", extra):
        split_by = "size"
    elif is_arg("-l", extra) or is_arg("--by-length", extra):
        split_by = "length"

    out_dir = Path(snakemake.output[0]).parent
    output = f"--out-dir {out_dir} --by-{split_by}-prefix output_part. --extension .fas"
else:
    if snakemake.params.get("out_bgzip"):
        assert Path(snakemake.output[0]).suffix in [
            ".gz",
            ".bgz",
            ".bgzip",
        ], "invalid output file extension"
        output = f"| bgzip --threads {snakemake.threads} > {snakemake.output[0]}"
    else:
        output = f"--out-file {snakemake.output[0]}"


shell(
    "(seqkit {snakemake.params.command}"
    " --threads {snakemake.threads}"
    " {extra_input}"
    " {extra_output}"
    " {extra}"
    " {input}"
    " {output}"
    ") {log}"
)


# Rename output files
if snakemake.params.command in ["split", "split2"]:
    for idx, output_file in enumerate(snakemake.output, start=1):
        shell("mv {out_dir}/output_part.{idx:03d}.fas {output_file}")