SEQKIT GENERIC WRAPPER
Run SeqKit.
URL: https://bioinf.shenwei.me/seqkit/
Example
This wrapper can be used in the following way:
rule seqkit_seq:
input:
fasta="data/{sample}.fa",
output:
fasta="out/seq/{sample}.fa.gz",
log:
"logs/seq/{sample}.log",
params:
command="seq",
extra="--min-len 10",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_subseq_bed:
input:
fasta="data/{sample}.fa",
bed="data/{sample}.bed",
output:
fasta="out/subseq/bed/{sample}.fa.gz",
log:
"logs/subseq/bed/{sample}.log",
params:
command="subseq",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_subseq_gtf:
input:
fasta="data/{sample}.fa",
gtf="data/{sample}.gtf",
output:
fasta="out/subseq/gtf/{sample}.fa.gz",
log:
"logs/subseq/gtf/{sample}.log",
params:
command="subseq",
extra="--feature CDS",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_subseq_region:
input:
fasta="data/{sample}.fa",
output:
fasta="out/subseq/region/{sample}.fa.gz",
log:
"logs/subseq/region/{sample}.log",
params:
command="subseq",
extra="--region 1:12",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_fx2tab:
input:
fastx="data/{sample}.fastq",
output:
tsv="out/fx2tab/{sample}.tsv",
log:
"logs/fx2tab/{sample}.log",
params:
command="fx2tab",
extra="--name",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_grep_name:
input:
fastx="data/{sample}.fastq",
pattern="data/name.txt",
output:
fastx="out/grep/name/{sample}.fastq.gz",
log:
"logs/grep/name/{sample}.log",
params:
command="grep",
extra="--by-name",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_grep_seq:
input:
fastx="data/{sample}.fastq",
pattern="data/seq.txt",
output:
fastx="out/grep/seq/{sample}.fastq.gz",
log:
"logs/grep/seq/{sample}.log",
params:
command="grep",
extra="--by-seq",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_rmdup_name:
input:
fastx="data/{sample}.fastq",
output:
fastx="out/rmdup/name/{sample}.fastq.gz",
dup_num="out/rmdup/name/{sample}.num.txt",
dup_seqs="out/rmdup/name/{sample}.seq.txt",
log:
"logs/rmdup/name/{sample}.log",
params:
command="rmdup",
extra="--by-name",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_rmdup_seq:
input:
fastx="data/{sample}.fastq",
output:
fastx="out/rmdup/seq/{sample}.fastq.gz",
dup_num="out/rmdup/seq/{sample}.num.txt",
dup_seqs="out/rmdup/seq/{sample}.seq.txt",
log:
"logs/rmdup/seq/{sample}.log",
params:
command="rmdup",
extra="--by-seq",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_stats:
input:
fastx="data/{sample}.fastq",
output:
stats="out/stats/{sample}.tsv",
log:
"logs/stats/{sample}.log",
params:
command="stats",
extra="--all --tabular",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_common:
input:
fastas=[
"data/{sample1}.fa",
"data/{sample2}.fa",
],
output:
fasta="out/common/{sample1}_{sample2}.fa.gz",
log:
"logs/common/{sample1}_{sample2}.log",
params:
command="common",
extra="",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_concat:
input:
fastas=[
"data/{sample1}.fa",
"data/{sample2}.fa",
],
output:
fasta="out/concat/{sample1}_{sample2}.fa.gz",
log:
"logs/concat/{sample1}_{sample2}.log",
params:
command="concat",
extra="",
out_bgzip=True,
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
rule seqkit_split2_part:
input:
fasta="data/{sample}.fa",
output:
fasta=[
"out/split2/part/{sample}.1-of-2.fas",
"out/split2/part/{sample}.2-of-2.fas",
],
log:
"logs/split/part/{sample}.log",
params:
command="split2",
extra="--by-part 2",
threads: 2
wrapper:
"v9.4.2/bio/seqkit"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Notes
First input and output file is considered to be the main one (except for commands concat, common, and stats that take all input files).
Keys for extra input and output files need to match seqkit arguments without the -file suffix (if present).
Software dependencies
seqkit=2.13.0htslib=1.23.1snakemake-wrapper-utils=0.8.0
Input/Output
Input:
input file(s)
Output:
output file(s)
Params
command: SeqKit command to use.extra: Optional parameters.
Code
__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2023, Filipe G. Vieira"
__license__ = "MIT"
import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import is_arg
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
# Subcommands with multiple input files
if snakemake.params.command in ["concat", "common", "stats"]:
input = " ".join(snakemake.input)
# Subcommands with a single input file (if more than one provided, concat'em all)
elif snakemake.params.command in [
"sum",
"rmdup",
"split",
"split2",
"sample",
"sort",
]:
input = "<(cat " + " ".join(snakemake.input) + ")"
else:
input = snakemake.input[0]
# Extra input
extra_input = " ".join(
[
(
f"--{key.replace('_','-')} {value}"
if key in ["bed", "gtf"]
else f"--{key.replace('_','-')}-file {value}"
)
for key, value in snakemake.input.items()
][1:]
)
# Extra output
extra_output = " ".join(
[
(
f"--{key.replace('_','-')} {value}"
if key in ["read1", "read2"]
else f"--{key.replace('_','-')}-file {value}"
)
for key, value in snakemake.output.items()
][1:]
)
if snakemake.params.command in ["split", "split2"]:
# Check type of splitting
if is_arg("-i", extra) or is_arg("--by-id", extra):
split_by = "id"
elif is_arg("-p", extra) or is_arg("--by-part", extra):
split_by = "part"
elif is_arg("-r", extra) or is_arg("--by-region", extra):
split_by = "region"
elif is_arg("-s", extra) or is_arg("--by-size", extra):
split_by = "size"
elif is_arg("-l", extra) or is_arg("--by-length", extra):
split_by = "length"
out_dir = Path(snakemake.output[0]).parent
output = f"--out-dir {out_dir} --by-{split_by}-prefix output_part. --extension .fas"
else:
if snakemake.params.get("out_bgzip"):
assert Path(snakemake.output[0]).suffix in [
".gz",
".bgz",
".bgzip",
], "invalid output file extension"
output = f"| bgzip --threads {snakemake.threads} > {snakemake.output[0]}"
else:
output = f"--out-file {snakemake.output[0]}"
shell(
"(seqkit {snakemake.params.command}"
" --threads {snakemake.threads}"
" {extra_input}"
" {extra_output}"
" {extra}"
" {input}"
" {output}"
") {log}"
)
# Rename output files
if snakemake.params.command in ["split", "split2"]:
for idx, output_file in enumerate(snakemake.output, start=1):
shell("mv {out_dir}/output_part.{idx:03d}.fas {output_file}")