MERQURY

Evaluate genome assemblies with k-mers and more.

URL: https://github.com/marbl/merqury

Example

This wrapper can be used in the following way:

rule run_merqury_haploid:
    input:
        fasta="hap1.fasta",
        meryldb="meryldb",
    output:
        # meryldb output
        filt="results/haploid/meryldb.filt",
        hist="results/haploid/meryldb.hist",
        hist_ploidy="results/haploid/meryldb.hist.ploidy",
        # general output
        completeness_stats="results/haploid/out.completeness.stats",
        dist_only_hist="results/haploid/out.dist.only.hist",
        qv="results/haploid/out.qv",
        spectra_asm_hist="results/haploid/out.spectra_asm.hist",
        spectra_asm_ln_png="results/haploid/out.spectra_asm.png",
        # haplotype-specific output
        fas1_only_bed="results/haploid/hap1.bed",
        fas1_only_wig="results/haploid/hap1.wig",
        fas1_only_hist="results/haploid/hap1.hist",
        fas1_qv="results/haploid/hap1.qv",
        fas1_spectra_cn_hist="results/haploid/hap1.spectra.hist",
        fas1_spectra_cn_ln_png="results/haploid/hap1.spectra.png",
    log:
        std="logs/haploid.log",
        spectra_cn="logs/haploid.spectra-cn.log",
    threads: 1
    wrapper:
        "v3.8.0-1-g149ef14/bio/merqury"


rule run_merqury_diploid:
    input:
        fasta=["hap1.fasta", "hap2.fasta"],
        meryldb="meryldb",
    output:
        # meryldb output
        filt="results/diploid/meryldb.filt",
        hist="results/diploid/meryldb.hist",
        hist_ploidy="results/diploid/meryldb.hist.ploidy",
        # general output
        completeness_stats="results/diploid/out.completeness.stats",
        dist_only_hist="results/diploid/out.dist.only.hist",
        only_hist="results/diploid/out.only.hist",
        qv="results/diploid/out.qv",
        spectra_asm_hist="results/diploid/out.spectra_asm.hist",
        spectra_asm_ln_png="results/diploid/out.spectra_asm.png",
        spectra_cn_hist="results/diploid/out.spectra_cn.hist",
        spectra_cn_ln_png="results/diploid/out.spectra_cn.png",
        # haplotype-specific output
        fas1_only_bed="results/diploid/hap1.bed",
        fas1_only_wig="results/diploid/hap1.wig",
        fas1_only_hist="results/diploid/hap1.hist",
        fas1_qv="results/diploid/hap1.qv",
        fas1_spectra_cn_hist="results/diploid/hap1.spectra.hist",
        fas1_spectra_cn_ln_png="results/diploid/hap1.spectra.png",
        fas2_only_bed="results/diploid/hap2.bed",
        fas2_only_wig="results/diploid/hap2.wig",
        fas2_only_hist="results/diploid/hap2.hist",
        fas2_qv="results/diploid/hap2.qv",
        fas2_spectra_cn_hist="results/diploid/hap2.spectra.hist",
        fas2_spectra_cn_ln_png="results/diploid/hap2.spectra.png",
    log:
        std="logs/diploid.log",
        spectra_cn="logs/diploid.spectra-cn.log",
    threads: 1
    wrapper:
        "v3.8.0-1-g149ef14/bio/merqury"

Note that input, output and log file paths can be chosen freely.

When running with

snakemake --use-conda

the software dependencies will be automatically deployed into an isolated environment before execution.

Notes

Merqury does not return non-zero exit code when it fails, so always include (at least) one PNG file in your rule’s output.
Merqury does not allow for extra params.

Software dependencies

merqury=1.3

Input/Output

Input:

one on two assembly fasta file(s)
meryl database

Output:

annotation quality files

Authors

Filipe G. Vieira

Code

__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2022, Filipe G. Vieira"
__license__ = "MIT"


import os
import tempfile
from pathlib import Path
from snakemake.shell import shell

meryldb_parents = snakemake.input.get("meryldb_parents", "")
out_prefix = "out"
log_tmp = "__LOG__.tmp"


def save_output(out_prefix, ext, cwd, file):
    if file is None:
        return 0
    src = f"{out_prefix}{ext}"
    dest = cwd / file
    shell("cat {src} > {dest}")


with tempfile.TemporaryDirectory() as tmpdir:
    cwd = Path.cwd()
    # Create symlinks for input files
    for input in snakemake.input:
        src = Path(input)
        dst = Path(tmpdir) / input
        src = Path(os.path.relpath(src.resolve(), dst.resolve().parent))
        dst.symlink_to(src)
    os.chdir(tmpdir)

    shell(
        "merqury.sh"
        " {snakemake.input.meryldb}"
        " {meryldb_parents}"
        " {snakemake.input.fasta}"
        " {out_prefix}"
        " > {log_tmp} 2>&1"
    )

    ### Saving LOG files
    save_output(log_tmp, "", cwd, snakemake.log.get("std"))
    for type in ["spectra_cn"]:
        save_output(
            f"logs/{out_prefix}",
            "." + type.replace("_", "-") + ".log",
            cwd,
            snakemake.log.get(type),
        )

    ### Saving OUTPUT files
    # EXT: replace all "_" with "."
    meryldb = Path(snakemake.input.meryldb.rstrip("/")).stem
    for type in ["filt", "hist", "hist_ploidy"]:
        save_output(
            meryldb, "." + type.replace("_", "."), cwd, snakemake.output.get(type)
        )

    # EXT: replace last "_" with "."
    for type in [
        "completeness_stats",
        "dist_only_hist",
        "only_hist",
        "qv",
        "hapmers_count",
        "hapmers_blob_png",
    ]:
        save_output(
            out_prefix,
            "." + type[::-1].replace("_", ".", 1)[::-1],
            cwd,
            snakemake.output.get(type),
        )

    # EXT: replace first "_" with "-", and remaining with "."
    for type in [
        "spectra_asm_hist",
        "spectra_asm_ln_png",
        "spectra_asm_fl_png",
        "spectra_asm_st_png",
        "spectra_cn_hist",
        "spectra_cn_ln_png",
        "spectra_cn_fl_png",
        "spectra_cn_st_png",
    ]:
        save_output(
            out_prefix,
            "." + type.replace("_", ".").replace(".", "-", 1),
            cwd,
            snakemake.output.get(type),
        )

    input_fas = snakemake.input.fasta
    if isinstance(input_fas, str):
        input_fas = [input_fas]

    for fas in range(1, len(input_fas) + 1):
        prefix = Path(input_fas[fas - 1]).name.removesuffix(".fasta")

        # EXT: remove everything until first "_" and replace last "_" with "."
        for type in [f"fas{fas}_only_bed", f"fas{fas}_only_wig"]:
            save_output(
                prefix,
                type[type.find("_") :][::-1].replace("_", ".", 1)[::-1],
                cwd,
                snakemake.output.get(type),
            )

        # EXT: remove everything until first "_" and replace all "_" with "."
        for type in [f"fas{fas}_only_hist", f"fas{fas}_qv"]:
            save_output(
                f"{out_prefix}.{prefix}",
                type[type.find("_") :].replace("_", "."),
                cwd,
                snakemake.output.get(type),
            )

        # EXT: remove everything until first "_", replace first "_" with "-", and remaining with "."
        for type in [
            f"fas{fas}_spectra_cn_hist",
            f"fas{fas}_spectra_cn_ln_png",
            f"fas{fas}_spectra_cn_fl_png",
            f"fas{fas}_spectra_cn_st_png",
        ]:
            save_output(
                f"{out_prefix}.{prefix}",
                "." + type[type.find("_") + 1 :].replace("_", ".").replace(".", "-", 1),
                cwd,
                snakemake.output.get(type),
            )