GSEAPY GSEA
GSEApy, Gene Set Enrichment Analysis in Python
URL: https://github.com/zqfang/GSEApy
Example
This wrapper can be used in the following way:
rule test_gseapy_enrichr:
input:
gene_list="genes_list.txt",
# gmt="", Optional path to a GMT file
output:
enrichr_dir=directory("KEGG_2016"),
threads: 1
params:
extra={"cutoff": 1},
gene_sets=["KEGG_2016"], # Optional list of gene sets available on biomart
log:
wrapper="logs/gseapy/wrapper.enrichr.log",
gseapy="logs/gseapy/enrichr.log",
wrapper:
"v3.9.0/bio/gseapy/gsea"
rule test_gseapy_gsea:
input:
expr="genes_expr.tsv",
cls="sample_class.cls",
gmt="geneset.gmt",
output:
gmt="gene_sets.gmt",
csv="gsea.results.csv",
threads: 1
params:
extra={
"min_size": 1,
},
# gene_sets = [""] # Optional list of gene sets available on biomart
log:
wrapper="logs/gseapy/wrapper.gsea.log",
gseapy="logs/gseapy/gsea.log",
wrapper:
"v3.9.0/bio/gseapy/gsea"
rule test_gseapy_ssgsea:
input:
expr="genes_expr.tsv",
gmt="geneset.gmt",
output:
gmt="gene_sets.gmt",
csv="ssgsea.results.csv",
threads: 1
params:
extra={
"min_size": 1,
},
# gene_sets = [""] # Optional list of gene sets available on biomart
log:
wrapper="logs/gseapy/wrapper.ssgsea.log",
gseapy="logs/gseapy/ssgsea.log",
wrapper:
"v3.9.0/bio/gseapy/gsea"
rule test_gseapy_prerank:
input:
rank="genes.rnk",
gmt="geneset.gmt",
output:
gmt="gene_sets.gmt",
csv="prerank.results.csv",
prerank=directory("prerank_results_dir"),
threads: 1
params:
extra={
"min_size": 1,
},
# gene_sets = [""] # Optional list of gene sets available on biomart
log:
wrapper="logs/gseapy/wrapper.prerank.log",
gseapy="logs/gseapy/prerank.log",
wrapper:
"v3.9.0/bio/gseapy/gsea"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Notes
GSEApy’s subfunctions are selected directly from input files.
Software dependencies
gseapy=1.1.2
Input/Output
Input:
gmt
: Optional path to gene sets (GMT formatted). If no GMT file is provided, then a gene_sets value must be given in params.expr
: Path to expression table, required for gsea and ssgsea. (RNK formatted)cls
: Path to categorical class file, required for gsea. (CLS formatted)rank
: Path to pre-ranked genes, required for prerank. (TSV/CSV formatted)gene_list
: Path to gene list file, required for enrichr. (TXT/TSV/CSV formatted)background
: Optional path to a background file for enrichr. (TXT/TSV/CSV formatted)
Output:
outdir
: Path to output directorypkl
: Path to serialized results (Pickle)
Params
extra
: Dictionnary of arguments given to GSEApy’s subfunction, besides IO and threading options.gene_sets
: Non-file gene set names from Biomart.
Code
# coding: utf-8
__author__ = "Thibault Dayris"
__copyright__ = "Copyright 2023, Thibault Dayris"
__mail__ = "thibault.dayris@gustaveroussy.fr"
__license__ = "MIT"
import glob
import gseapy
import logging
import shutil
from tempfile import TemporaryDirectory
def mouve_output_files(method, tempdir):
"""
Move the predictable output files individually
and let the rest be in an additional directory
"""
log = snakemake.log.get("gseapy")
if log:
logging.info(f"Moving gseapy logs to {log}")
shutil.move(
# The logging file contains a timestamp
# there is only one logging file
glob.glob(f"{tempdir}/gseapy.{method}.*.log")[0],
log,
)
if method != "enrichr":
# `enrichr` method does not produce gmt files
gmt = snakemake.output.get("gmt")
if gmt:
logging.info(f"Moving gseapy GMT to {gmt}")
shutil.move(f"{tempdir}/gene_sets.gmt", gmt)
# `enrichr` method does not produce any csv
csv = snakemake.output.get("csv")
if csv:
logging.info(f"Moving gseapy CSV to {csv}")
if method == "gsea":
shutil.move(f"{tempdir}/gseapy.phenotype.{method}.report.csv", csv)
else:
shutil.move(f"{tempdir}/gseapy.gene_set.{method}.report.csv", csv)
# Only produced with pre-ranking method.
# File names depend on gseapy results and are not predictable
prerank_dir = snakemake.output.get("prerank")
if method == "prerank" and prerank_dir:
logging.info(f"Moving gseapy pre-rank results to {prerank_dir}")
shutil.move(f"{tempdir}/{method}/", prerank_dir)
# Only produced with enrichr method.
# File names depend on gseapy results and are not predictable
else:
# Since logging file has already been moved, only
# PDF/TXT results remains
enrichr_dir = snakemake.output.get("enrichr_dir")
if enrichr_dir:
logging.info(f"Moving gseapy-biomart enrichr to {enrichr_dir}")
shutil.move(f"{tempdir}/", enrichr_dir)
class TooManyThreadsRequested(Exception):
def __init__(self):
super().__init__("This subcommand uses only one threads.")
# Building Gene sets object
gmt_path = snakemake.input.get("gmt", [])
gene_sets = snakemake.params.get("gene_sets", [])
if isinstance(gene_sets, str):
gene_sets = [gene_sets]
if isinstance(gmt_path, list):
gene_sets += gmt_path
else:
gene_sets.append(gmt_path)
extra = snakemake.params.get("extra", {})
# Running gseapy
result = None
with TemporaryDirectory() as tempdir:
if snakemake.input.get("rank"):
print("Using Pre-rank method")
result = gseapy.prerank(
rnk=snakemake.input.rank,
gene_sets=gene_sets,
outdir=tempdir,
threads=snakemake.threads,
**extra,
)
mouve_output_files(method="prerank", tempdir=tempdir)
elif snakemake.input.get("expr"):
if snakemake.input.get("cls"):
print("Using GSEA method")
result = gseapy.gsea(
data=snakemake.input.expr,
gene_sets=gene_sets,
cls=snakemake.input.cls,
threads=snakemake.threads,
outdir=tempdir,
**extra,
)
mouve_output_files(method="gsea", tempdir=tempdir)
else:
if snakemake.threads > 1:
raise TooManyThreadsRequested()
print("Using Single-Sample GSEA method")
result = gseapy.ssgsea(
data=snakemake.input.expr, gene_sets=gene_sets, outdir=tempdir, **extra
)
mouve_output_files(method="ssgsea", tempdir=tempdir)
elif snakemake.input.get("gene_list"):
if snakemake.threads > 1:
raise TooManyThreadsRequested()
print("Using Biomart EnrichR method")
result = gseapy.enrichr(
gene_list=snakemake.input.gene_list,
gene_sets=gene_sets,
outdir=tempdir,
**extra,
)
mouve_output_files(method="enrichr", tempdir=tempdir)
else:
raise ValueError("Could not decide between GSEApy functions")