DESEQ2
Call differentially expressed genes with DESeq2
URL: https://bioconductor.org/packages/3.16/bioc/html/DESeq2.html
Example
This wrapper can be used in the following way:
rule test_deseq2_wald_apeglm:
input:
dds="dds.RDS",
output:
wald_rds="wald_apeglm.RDS",
wald_tsv="dge_apeglm.tsv",
deseq2_result_dir=directory("deseq_results_apeglm"),
normalized_counts_table="counts_apeglm.tsv",
normalized_counts_rds="counts_apeglm.RDS",
params:
deseq_extra="",
# Optional parameters for `lfcShrink` besides dds, parallel, contrast, and coef
shrink_extra="type='apeglm'",
results_extra="",
contrast=["condition", "A", "B"],
threads: 1
log:
"logs/deseq2_apeglm.log",
wrapper:
"v7.0.0/bio/deseq2/wald"
rule test_deseq2_wald_ashr:
input:
dds="dds.RDS",
output:
wald_rds="wald_ashr.RDS",
wald_tsv="dge_ashr.tsv",
deseq2_result_dir=directory("deseq_results_ashr"),
normalized_counts_table="counts_ashr.tsv",
normalized_counts_rds="counts_ashr.RDS",
params:
deseq_extra="",
# Optional parameters for `lfcShrink` besides dds, parallel, contrast, and coef
shrink_extra="type='ashr'",
results_extra="",
contrast=["condition", "A", "B"],
threads: 1
log:
"logs/deseq2_ashr.log",
wrapper:
"v7.0.0/bio/deseq2/wald"
rule test_deseq2_wald_normal:
input:
dds="dds.RDS",
output:
wald_rds="wald_normal.RDS",
wald_tsv="dge_normal.tsv",
deseq2_result_dir=directory("deseq_results_normal"),
normalized_counts_table="counts_normal.tsv",
normalized_counts_rds="counts_normal.RDS",
params:
deseq_extra="",
# Optional parameters for `lfcShrink` besides dds, parallel, contrast, and coef
shrink_extra="type='normal'",
results_extra="",
contrast=["condition", "A", "B"],
threads: 1
log:
"logs/deseq2_normal.log",
wrapper:
"v7.0.0/bio/deseq2/wald"
rule test_deseq2_wald_contrast_2_factors:
input:
dds="dds.RDS",
output:
wald_rds="wald_2f.RDS",
wald_tsv="dge_2f.tsv",
deseq2_result_dir=directory("deseq_results_2f"),
normalized_counts_table="counts_2f.tsv",
normalized_counts_rds="counts_2f.RDS",
params:
deseq_extra="",
# Optional parameters for `lfcShrink` besides dds, parallel, contrast, and coef
shrink_extra="type='apeglm'",
results_extra="",
contrast=["A", "B"],
threads: 1
log:
"logs/deseq2_2f.log",
wrapper:
"v7.0.0/bio/deseq2/wald"
rule test_deseq2_wald_contrast_1_string:
input:
dds="dds.RDS",
output:
wald_rds="wald_1s.RDS",
wald_tsv="dge_1s.tsv",
deseq2_result_dir=directory("deseq_results_1s"),
normalized_counts_table="counts_1s.tsv",
normalized_counts_rds="counts_1s.RDS",
params:
deseq_extra="",
# Optional parameters for `lfcShrink` besides dds, parallel, contrast, and coef
shrink_extra="type='apeglm'",
results_extra="",
contrast=["condition_B_vs_A"],
threads: 1
log:
"logs/deseq2_1s.log",
wrapper:
"v7.0.0/bio/deseq2/wald"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Software dependencies
bioconductor-deseq2=1.46.0bioconductor-biocparallel=1.40.0bioconductor-apeglm=1.28.0r-ashr=2.2_63
Input/Output
Input:
dds: Path to RDS-formatted DESeq2-object
Output:
wald_rds: Optional path to wald test results (RDS formatted)wald_tsv: Optional path to wald test results (TSV formatted). Required optional parameter contrast (see below)deseq2_result_dir: Optional path to a directory that shall contain all DESeq2 results for each comparison (each file is TSV formatted)normalized_counts_table: Optional path to normalized counts (TSV formatted)normalized_counts_rds: Optional path to normalized counts (RDS formatted)
Params
deseq_extra: Optional parameters provided to the function DESeq()shrink_extra: Optional parameters provided to the function lfcShrink()results_extra: Optional parameters provided to the function result()contrast: List of characters. See notes below.
Code
# This script takes a deseq2 dataset object, performs
# a DESeq2 wald test, and saves results as requested by user
# __author__ = "Thibault Dayris"
# __copyright__ = "Copyright 2023, Thibault Dayris"
# __email__ = "thibault.dayris@gustaveroussy.fr"
# __license__ = "MIT"
# Sink the stderr and stdout to the snakemake log file
# https://stackoverflow.com/a/48173272
log_file <- base::file(description = snakemake@log[[1]], open = "wt")
base::sink(file = log_file)
base::sink(file = log_file, type = "message")
# Loading libraries (order matters)
base::library(package = "BiocParallel", character.only = TRUE)
base::library(package = "SummarizedExperiment", character.only = TRUE)
base::library(package = "DESeq2", character.only = TRUE)
base::library(package = "ashr", character.only = TRUE)
base::library(package = "apeglm", character.only = TRUE)
# Function to handle optional user-defined parameters
# and still follow R syntax
add_extra <- function(wrapper_extra, snakemake_param_name) {
if (snakemake_param_name %in% base::names(snakemake@params)) {
# Case user provides snakemake_param_name in snakemake rule
user_param <- snakemake@params[[snakemake_param_name]]
base::message(user_param)
base::message(wrapper_extra)
if (! user_param == "") {
# Case user do not provide an empty value
# (R does not like trailing commas at the end
# of a function call)
wrapper_extra <- base::paste(
wrapper_extra,
base::as.character(x = user_param),
sep = ", "
)
} # Nothing to do if user provides an empty / NULL parameter value
} # Nothing to do if user did not provide snakemake_param_name
# In any case, required parameters must be returned
base::return(wrapper_extra)
}
# A result name is build using the following template
# {factor}_{tested}_vs_{ref}
get_result_name_from_params <- function(contrast_vector, dds) {
# Instantiate to null in order to let DESeq2 raise an error if user provided
# contrast_vector does not fit DESeq2 standards
name <- NULL
contrast_length <- base::length(contrast_vector)
if (contrast_length == 1) {
name <- contrast_vector[1]
} else if (contrast_length == 2) {
# There may be ambguity here with factors having identical level names
# The suffix of the expected result:
suffix <- base::paste(
contrast_vector[2], "vs", contrast_vector[1], sep = "_"
)
names <- DESeq2::resultsNames(dds)
# All possible results matchig this suffix
names <- names[base::endsWith(names, suffix)]
if (! base::length(names) == 1) {
base::stop(
"Could not guess correct result name from contrast ",
"due to the following ambiguity:",
base::paste(DESeq2::resultsNames(dds), sep = " "),
" based on the following suffix: ",
suffix
)
}
# Case there is no abiguity:
name <- names[1]
} else if (contrast_length == 3) {
name <- base::paste(
contrast_vector[1],
contrast_vector[3],
"vs",
contrast_vector[2],
sep = "_"
)
}
base::return(name)
}
# Function to address concerns about `coef` used in `apeglm` package
get_coef_from_dds_and_params <- function(contrast_vector, dds) {
# Set to null in order to raise an explicit error in DESeq2
coef <- NULL
results_names <- DESeq2::resultsNames(dds)
name <- get_result_name_from_params(contrast_vector, dds)
# Raising eror with message since the later command
# would fail before DESeq2 could raise any error.
if (! name %in% results_names) {
base::stop(
"Couldn't find resultName: ",
name,
" within ",
base::paste(results_names)
)
}
coef <- base::which(results_names == name)
base::return(coef)
}
# Fill parameters for lfcShrink
lfc_shrink_additional_parameters <- function(shrink_extra, coef, name) {
# Acquire user-defined optional parameters
shrink_extra <- add_extra(
wrapper_extra = "dds = wald, parallel = parallel",
snakemake_param_name = "shrink_extra"
)
# Add coef to access results easily
if (base::grepl(pattern = "apeglm", x = shrink_extra, fixed = TRUE)) {
shrink_extra <- base::paste0(shrink_extra, ", coef=", coef)
} else {
shrink_extra <- base::paste0(shrink_extra, ", coef=", name)
}
base::return(shrink_extra)
}
# Setting up multithreading if required
parallel <- FALSE
if (snakemake@threads > 1) {
BiocParallel::register(
BPPARAM = BiocParallel::MulticoreParam(snakemake@threads)
)
parallel <- TRUE
}
# Load DESeq2 dataset
dds_path <- base::as.character(x = snakemake@input[["dds"]])
dds <- base::readRDS(file = dds_path)
base::message("Libraries and dataset loaded")
# Ensure future "paste" command work
contrast_vector <- base::sapply(
snakemake@params[["contrast"]],
function(extra) base::as.character(x = extra)
)
# Build extra parameters for DESeq2
extra_deseq2 <- add_extra(
wrapper_extra = "object = dds, test = 'Wald', parallel = parallel",
snakemake_param_name = "deseq_extra"
)
deseq2_cmd <- base::paste0(
"DESeq2::DESeq(", extra_deseq2, ")"
)
base::message("DESeq2 command line:")
base::message(deseq2_cmd)
# Running DESeq2::DESeq for wald test result
wald <- base::eval(base::parse(text = deseq2_cmd))
# The rest of the script is here to save part or complete
# list of results in RDS or plain text (TSV) formats.
# Save main result on user request (RDS)
# This includes counts, wald tests for all levels
# assays, design, etc.
if ("wald_rds" %in% base::names(x = snakemake@output)) {
output_rds <- base::as.character(x = snakemake@output[["wald_rds"]])
base::saveRDS(obj = wald, file = output_rds)
base::message("Wald test saved as RDS file")
}
# Saving normalized counts on demand
table <- counts(wald)
# TSV-formatted count table
if ("normalized_counts_table" %in% base::names(snakemake@output)) {
output_table <- base::as.character(
x = snakemake@output[["normalized_counts_table"]]
)
utils::write.table(x = table, file = output_table, sep = "\t", quote = FALSE)
base::message("Normalized counts saved as TSV")
}
# RDS-formated count object with many information,
# including counts, assays, etc.
if ("normalized_counts_rds" %in% base::names(snakemake@output)) {
output_rds <- base::as.character(
x = snakemake@output[["normalized_counts_rds"]]
)
base::saveRDS(obj = table, file = output_rds)
base::message("Normalized counts saved as RDS")
}
# On user request: save all results as TSV in a directory.
# User can later access the directory content, e.g. with
# a snakemake checkpoint-rule.
if ("deseq2_result_dir" %in% base::names(snakemake@output)) {
# Acquire list of available results in DESeqDataSet
wald_results_names <- DESeq2::resultsNames(object = wald)
# DESeq2 result dir will contain all results available in the Wald object
output_prefix <- snakemake@output[["deseq2_result_dir"]]
if (! base::file.exists(output_prefix)) {
base::dir.create(path = output_prefix, recursive = TRUE)
}
# Building command lines for wald results
#
# The variable `result_name` is declared here, but evaluated
# later in the for loop.
results_extra <- add_extra(
wrapper_extra = "object = wald, name = result_name, parallel = parallel",
snakemake_param_name = "results_extra"
)
results_cmd <- base::paste0("DESeq2::results(", results_extra, ")")
base::message("Command line used for TSV results creation:")
base::message(results_cmd)
# For each available comparison in the wald-dds object
for (result_name in wald_results_names) {
# Setting all the lfcShrink values using resultsName
# in order to always shrink the correct result and not
# the one listed in 'snakemake@params[["contrast"]]'
#
# However, the argument names must be adjusted to
# the shrinkage method requested by user
coef <- get_coef_from_dds_and_params(c(result_name), wald)
name <- base::paste0("'", result_name, "'")
shrink_extra <- lfc_shrink_additional_parameters(
shrink_extra = snakemake@params[["shrink_extra"]],
coef = coef,
name = name
)
# Apeglm does not accept to correct intercept. It raises an error
# if coef == 1. This factor is skipped and all other results are
# iteratively saved on disk
if ((! (base::grepl("ashr", shrink_extra)) || ( base::grepl("normal", shrink_extra))) && (coef < 2)) {
base::message(
"Cannot run `apeglm` on `",
result_name,
"`, since it's coef is `",
coef,
"`. Skipping."
)
} else {
shrink_cmd <- base::paste0("DESeq2::lfcShrink(", shrink_extra, ")")
base::message("Command line used for log(FC) shrinkage:")
base::message(shrink_cmd)
# Building table
base::message(base::paste("Saving results for", result_name))
results_frame <- base::eval(base::parse(text = results_cmd))
shrink_frame <- base::eval(base::parse(text = shrink_cmd))
results_frame$log2FoldChange <- shrink_frame$log2FoldChange
results_path <- base::file.path(
output_prefix,
base::paste0(result_name, ".tsv")
)
# Saving table
utils::write.table(
x = results_frame,
file = results_path,
quote = FALSE,
sep = "\t",
row.names = TRUE
)
}
}
}
# If user provides contrasts, then a precise result
# can be extracted from DESeq2 object.
if ("wald_tsv" %in% base::names(x = snakemake@output)) {
if ("contrast" %in% base::names(x = snakemake@params)) {
# Define all `contrast`, `coef`, and `name` to prepare
# to any shrinkage parameter set by user and still focus
# on the provided contrast given in `snakemake@params[["contrast"]]`
name <- get_result_name_from_params(contrast_vector, wald)
coef <- get_coef_from_dds_and_params(contrast_vector, wald)
# Finally saving results as contrast has been
# built from user input.
# Extracting results:
results_extra <- add_extra(
wrapper_extra = "object = wald, name = name, parallel = parallel",
snakemake_param_name = "results_extra"
)
results_cmd <- base::paste0("DESeq2::results(", results_extra, ")")
base::message("Result extraction command: ", results_cmd)
results_frame <- base::eval(base::parse(text = results_cmd))
# Shrinking:
shrink_extra <- lfc_shrink_additional_parameters(
shrink_extra = snakemake@params[["shrink_extra"]],
coef = coef,
name = base::paste0("'", name, "'")
)
shrink_cmd <- base::paste0("DESeq2::lfcShrink(", shrink_extra, ")")
base::message("Command line used for log(FC) shrinkage:")
base::message(shrink_cmd)
shrink_frame <- base::eval(base::parse(text = shrink_cmd))
# Updating results:
results_frame$log2FoldChange <- shrink_frame$log2FoldChange
# Saving table
utils::write.table(
x = results_frame,
file = base::as.character(x = snakemake@output[["wald_tsv"]]),
quote = FALSE,
sep = "\t",
row.names = TRUE
)
} else {
base::stop(
"No contrast provided. ",
"In absence of contrast, it is not possible ",
"to guess the expected result name.",
)
}
}
# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
base::sink(type = "message")
base::sink()