VEP ANNOTATE
Annotate variant calls with VEP.
Example
This wrapper can be used in the following way:
rule annotate_variants:
input:
calls="variants.bcf", # .vcf, .vcf.gz or .bcf
cache="resources/vep/cache", # can be omitted if fasta and gff are specified
plugins="resources/vep/plugins",
# optionally add reference genome fasta
# fasta="genome.fasta",
# fai="genome.fasta.fai", # fasta index
# gff="annotation.gff",
# csi="annotation.gff.csi", # tabix index
# add mandatory aux-files required by some plugins if not present in the VEP plugin directory specified above.
# aux files must be defined as following: "<plugin> = /path/to/file" where plugin must be in lowercase
# revel = path/to/revel_scores.tsv.gz
output:
calls="variants.annotated.bcf", # .vcf, .vcf.gz or .bcf
stats="variants.html",
params:
# Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
# Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
plugins=["LoFtool"],
extra="--everything", # optional: extra arguments
log:
"logs/vep/annotate.log",
threads: 4
wrapper:
"v5.0.0/bio/vep/annotate"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Software dependencies
ensembl-vep=113.0
bcftools=1.21
perl-encode-locale=1.05
perl=5.32.1
Code
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "johannes.koester@uni-due.de"
__license__ = "MIT"
import os
from pathlib import Path
from snakemake.shell import shell
def get_only_child_dir(path):
children = [child for child in path.iterdir() if child.is_dir()]
assert (
len(children) == 1
), "Invalid VEP cache directory, only a single entry is allowed, make sure that cache was created with the snakemake VEP cache wrapper"
return children[0]
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
fork = "--fork {}".format(snakemake.threads) if snakemake.threads > 1 else ""
stats = snakemake.output.stats
cache = snakemake.input.get("cache", "")
plugins = snakemake.input.plugins
plugin_aux_files = {"LoFtool": "LoFtool_scores.txt", "ExACpLI": "ExACpLI_values.txt"}
load_plugins = []
for plugin in snakemake.params.plugins:
if plugin in plugin_aux_files.keys():
aux_path = os.path.join(plugins, plugin_aux_files[plugin])
load_plugins.append(",".join([plugin, aux_path]))
else:
load_plugins.append(",".join([plugin, snakemake.input.get(plugin.lower(), "")]))
load_plugins = " ".join(map("--plugin {}".format, load_plugins))
if snakemake.output.calls.endswith(".vcf.gz"):
fmt = "z"
elif snakemake.output.calls.endswith(".bcf"):
fmt = "b"
else:
fmt = "v"
fasta = snakemake.input.get("fasta", "")
if fasta:
fasta = "--fasta {}".format(fasta)
gff = snakemake.input.get("gff", "")
if gff:
gff = "--gff {}".format(gff)
if cache:
entrypath = get_only_child_dir(get_only_child_dir(Path(cache)))
species = (
entrypath.parent.name[:-7]
if entrypath.parent.name.endswith("_refseq")
else entrypath.parent.name
)
release, build = entrypath.name.split("_")
cache = (
"--offline --cache --dir_cache {cache} --cache_version {release} --species {species} --assembly {build}"
).format(cache=cache, release=release, build=build, species=species)
shell(
"(bcftools view '{snakemake.input.calls}' | "
"vep {extra} {fork} "
"--format vcf "
"--vcf "
"{cache} "
"{gff} "
"{fasta} "
"--dir_plugins {plugins} "
"{load_plugins} "
"--output_file STDOUT "
"--stats_file {stats} | "
"bcftools view -O{fmt} > {snakemake.output.calls}) {log}"
)