STAR
Map reads with STAR.
URL: https://github.com/alexdobin/STAR
Example
This wrapper can be used in the following way:
rule star_pe_multi:
input:
# use a list for multiple fastq files for one sample
# usually technical replicates across lanes/flowcells
fq1=["reads/{sample}_R1.1.fastq", "reads/{sample}_R1.2.fastq"],
# paired end reads needs to be ordered so each item in the two lists match
fq2=["reads/{sample}_R2.1.fastq", "reads/{sample}_R2.2.fastq"], #optional
# path to STAR reference genome index
idx="index",
output:
# see STAR manual for additional output files
aln="star/pe/{sample}/pe_aligned.sam",
log="logs/pe/{sample}/Log.out",
sj="star/pe/{sample}/SJ.out.tab",
unmapped=["star/pe/{sample}/unmapped.1.fastq.gz","star/pe/{sample}/unmapped.2.fastq.gz"],
log:
"logs/pe/{sample}.log",
params:
# optional parameters
extra="",
threads: 8
wrapper:
"v3.6.0-3-gc8272d7/bio/star/align"
rule star_se:
input:
fq1="reads/{sample}_R1.1.fastq",
# path to STAR reference genome index
idx="index",
output:
# see STAR manual for additional output files
aln="star/se/{sample}/se_aligned.bam",
log="logs/se/{sample}/Log.out",
log_final="logs/se/{sample}/Log.final.out",
unmapped="star/se/{sample}/unmapped.fastq",
log:
"logs/se/{sample}.log",
params:
# optional parameters
extra="--outSAMtype BAM Unsorted",
threads: 8
wrapper:
"v3.6.0-3-gc8272d7/bio/star/align"
Note that input, output and log file paths can be chosen freely.
When running with
snakemake --use-conda
the software dependencies will be automatically deployed into an isolated environment before execution.
Notes
The extra param allows for additional program arguments.
It is advisable to consider updating the limits setting before running STAR, such as executing ulimit -n 10000, to avoid an issue like this: https://github.com/alexdobin/STAR/issues/1344
Software dependencies
star=2.7.11b
Code
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"
import os
import tempfile
from snakemake.shell import shell
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
fq1 = snakemake.input.get("fq1")
assert fq1 is not None, "input-> fq1 is a required input parameter"
fq1 = (
[snakemake.input.fq1]
if isinstance(snakemake.input.fq1, str)
else snakemake.input.fq1
)
fq2 = snakemake.input.get("fq2")
if fq2:
fq2 = (
[snakemake.input.fq2]
if isinstance(snakemake.input.fq2, str)
else snakemake.input.fq2
)
assert len(fq1) == len(
fq2
), "input-> equal number of files required for fq1 and fq2"
input_str_fq1 = ",".join(fq1)
input_str_fq2 = ",".join(fq2) if fq2 is not None else ""
input_str = " ".join([input_str_fq1, input_str_fq2])
if fq1[0].endswith(".gz"):
readcmd = "--readFilesCommand gunzip -c"
elif fq1[0].endswith(".bz2"):
readcmd = "--readFilesCommand bunzip2 -c"
else:
readcmd = ""
out_unmapped = snakemake.output.get("unmapped", "")
if out_unmapped:
out_unmapped = "--outReadsUnmapped Fastx"
index = snakemake.input.get("idx")
if not index:
index = snakemake.params.get("idx", "")
if "--outSAMtype BAM SortedByCoordinate" in extra:
stdout = "BAM_SortedByCoordinate"
elif "BAM Unsorted" in extra:
stdout = "BAM_Unsorted"
else:
stdout = "SAM"
with tempfile.TemporaryDirectory() as tmpdir:
shell(
"STAR "
" --runThreadN {snakemake.threads}"
" --genomeDir {index}"
" --readFilesIn {input_str}"
" {readcmd}"
" {extra}"
" {out_unmapped}"
" --outTmpDir {tmpdir}/STARtmp"
" --outFileNamePrefix {tmpdir}/"
" --outStd {stdout}"
" > {snakemake.output.aln}"
" {log}"
)
if snakemake.output.get("reads_per_gene"):
shell("cat {tmpdir}/ReadsPerGene.out.tab > {snakemake.output.reads_per_gene:q}")
if snakemake.output.get("chim_junc"):
shell("cat {tmpdir}/Chimeric.out.junction > {snakemake.output.chim_junc:q}")
if snakemake.output.get("sj"):
shell("cat {tmpdir}/SJ.out.tab > {snakemake.output.sj:q}")
if snakemake.output.get("log"):
shell("cat {tmpdir}/Log.out > {snakemake.output.log:q}")
if snakemake.output.get("log_progress"):
shell("cat {tmpdir}/Log.progress.out > {snakemake.output.log_progress:q}")
if snakemake.output.get("log_final"):
shell("cat {tmpdir}/Log.final.out > {snakemake.output.log_final:q}")
unmapped = snakemake.output.get("unmapped")
if unmapped:
# SE
if not fq2:
unmapped = [unmapped]
for i, out_unmapped in enumerate(unmapped, 1):
cmd = "gzip -c" if out_unmapped.endswith("gz") else "cat"
shell("{cmd} {tmpdir}/Unmapped.out.mate{i} > {out_unmapped}")