Commits on Source (7)
SINGULARITY_VER=2.5.1
SINGULARITY_VER=3.3.0
......@@ -27,13 +27,16 @@ jobs:
# TODO only install if singularity is not yet present
# if type singularity > /dev/null; then exit 0; fi
source .circleci/common.sh
sudo apt-get update; sudo apt-get install squashfs-tools libarchive-dev
wget https://github.com/singularityware/singularity/releases/download/$SINGULARITY_VER/singularity-$SINGULARITY_VER.tar.gz
tar xvf singularity-$SINGULARITY_VER.tar.gz
cd singularity-$SINGULARITY_VER
./configure --prefix=/usr/local --sysconfdir=/etc
make
sudo make install
sudo add-apt-repository ppa:gophers/archive
sudo apt-get update
sudo apt-get install build-essential libssl-dev uuid-dev libgpgme11-dev libseccomp-dev wget pkg-config squashfs-tools libarchive-dev golang-1.11
export PATH=/usr/lib/go-1.11/bin:$PATH
wget https://github.com/sylabs/singularity/releases/download/v${SINGULARITY_VER}/singularity-${SINGULARITY_VER}.tar.gz
tar -xvf singularity-$SINGULARITY_VER.tar.gz
cd singularity
./mconfig
make -C builddir
sudo make -C builddir install
- run:
name: Setup Snakemake
command: |
......
......@@ -17,3 +17,6 @@ dist/
.ipynb*
.ropeproject
.test*
tests/test*/*
playground/*
tutorial/*
\ No newline at end of file
[5.6.0] - 2019-09-06
====================
Changed
-------
- Fix compatibility with latest singularity versions.
- Various bug fixes (e.g. in cluster error handling, remote providers, kubernetes backend).
Added
-----
- Add --default-resources flag, that allows to define default resources for jobs (e.g. mem_mb, disk_mb), see `docs <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#resources>`_.
- Accept ``--dry-run`` as a synonym of ``--dryrun``. Other Snakemake options are similarly hyphenated, so the documentation now refers to ``--dry-run``, but both spellings (and also ``-n``) remain accepted.
[5.5.4] - 2019-07-21
====================
Changed
......
......@@ -21,7 +21,7 @@ Missing (optional) dependencies:
Missing (optional) Python dependencies:
- moto
- moto https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=777089
- google.cloud (google-cloud-sdk)
- ftputil
- pysftp
......
snakemake (5.6.0-1) unstable; urgency=medium
* Team upload.
* New upstream version
* 0013-remove-duplicate-keyword-argument.patch removed, applied upstream
* Add AutoPkgTests
-- Michael R. Crusoe <michael.crusoe@gmail.com> Fri, 13 Sep 2019 16:15:23 +0200
snakemake (5.5.4-2) unstable; urgency=medium
* set $HOME to fix build on sbuild
......
From 6d013348a3501b6c183438cfb44bf78704128925 Mon Sep 17 00:00:00 2001
From: Alistair Miles <alimanfoo@googlemail.com>
Date: Mon, 29 Jul 2019 14:53:04 +0000
Subject: [PATCH] Merged in
alimanfoo/snakemake/Alistair-Miles/remove-duplicate-keyword-argument-1563308166092
(pull request #397)
remove duplicate keyword argument
---
snakemake/remote/gfal.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- snakemake.orig/snakemake/remote/gfal.py
+++ snakemake/snakemake/remote/gfal.py
@@ -26,7 +26,7 @@
supports_default = True
allows_directories = True
- def __init__(self, *args, keep_local=False, stay_on_remote=False, is_default=False, stay_on_remote=False, retry=5, **kwargs):
+ def __init__(self, *args, keep_local=False, stay_on_remote=False, is_default=False, retry=5, **kwargs):
super(RemoteProvider, self).__init__(*args, keep_local=keep_local, stay_on_remote=stay_on_remote, is_default=is_default, **kwargs)
self.retry = retry
......@@ -10,5 +10,4 @@
# 0010-skip-test-without-rmarkdown.patch
0011-fix-privacy-breach.patch
0012-reproducible-build.patch
0013-remove-duplicate-keyword-argument.patch
boto3_is_just_boto
......@@ -7,7 +7,7 @@ export HOME=$(CURDIR)/fakehome
export PYBUILD_NAME=snakemake
export PYBUILD_DESTDIR_python3=debian/snakemake
export PYBUILD_BEFORE_TEST_python3=chmod +x {dir}/bin/snakemake; cp -r {dir}/bin {dir}/tests {build_dir}
export PYBUILD_TEST_ARGS=python{version} -m pytest tests/test*.py -v -k 'not report and not ancient and not test_script and not default_remote and not issue635 and not convert_to_cwl and not issue1083 and not issue1092 and not issue1093'
export PYBUILD_TEST_ARGS=python{version} -m pytest tests/test*.py -v -k 'not report and not ancient and not test_script and not default_remote and not issue635 and not convert_to_cwl and not issue1083 and not issue1092 and not issue1093 and not test_remote and not test_default_resources'
# test_report
# test_ancient
......@@ -17,6 +17,7 @@ export PYBUILD_TEST_ARGS=python{version} -m pytest tests/test*.py -v -k 'not rep
# test_convert_to_cwl tries to build a singularity format software image from docker://quay.io/snakemake/snakemake:v5.5.4
# test_issue1083 tries to build a singularity format software image from docker://bash
# test_issue1093 fails due to conda usage; commenting that out and installing bwa produces a different ordering than desired
# test_default_resources and test_remote need moto to be packaged https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=777089
export PYBUILD_AFTER_TEST_python3=rm -fr {build_dir}/bin {build_dir}/tests
......
Tests: run-unit-test
Depends: @
Restrictions: allow-stderr
#!/bin/bash
set -e
pkg=snakemake
ROOT=$(pwd)
if [ "${AUTOPKGTEST_TMP}" = "" ] ; then
AUTOPKGTEST_TMP=$(mktemp -d /tmp/${pkg}-test.XXXXXX)
# Double quoting below is intentional: it expands the temporary directory
# variable now, rather than when the trap fires.
# shellcheck disable=SC2064
trap "rm -rf ${AUTOPKGTEST_TMP}" 0 INT QUIT ABRT PIPE TERM
fi
cd "${AUTOPKGTEST_TMP}"
python3 -m pytest ${ROOT}/tests/test*.py -v -k 'not report and not ancient and not test_script and not default_remote and not issue635 and not convert_to_cwl and not issue1083 and not issue1092 and not issue1093 and not test_remote and not test_default_resources and not test_singularity and not test_singularity_conda and not test_cwl_singularity and not test_cwl'
......@@ -172,7 +172,7 @@ The cluster command can be decorated with job specific information, e.g.
$ snakemake --cluster "qsub {threads}"
Thereby, all keywords of a rule are allowed (e.g. params, input, output, threads, priority, ...).
Thereby, all keywords of a rule are allowed (e.g. rulename, params, input, output, threads, priority, ...).
For example, you could encode the expected running time into params:
.. code-block:: python
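    # Hypothetical illustration (the original example is truncated in this diff):
    # expose an expected runtime via params so the cluster command can use it,
    # e.g. snakemake --cluster "qsub -l walltime={params.runtime}".
    rule long_running_step:
        input:
            "data/{sample}.txt"
        output:
            "results/{sample}.txt"
        params:
            runtime="04:00:00"
        shell:
            "slow_tool {input} > {output}"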
......@@ -204,7 +204,7 @@ Else, the arguments will be interpreted as part of the normal Snakemake argument
Job Properties
~~~~~~~~~~~~~~
When executing a workflow on a cluster using the ``--cluster`` parameter (see below), Snakemake creates a job script for each job to execute. This script is then invoked using the provided cluster submission command (e.g. ``qsub``). Sometimes you want to provide a custom wrapper for the cluster submission command that decides about additional parameters. As this might be based on properties of the job, Snakemake stores the job properties (e.g. rule name, threads, input files, params etc.) as JSON inside the job script. For convenience, there exists a parser function `snakemake.utils.read_job_properties` that can be used to access the properties. The following shows an example job submission wrapper:
When executing a workflow on a cluster using the ``--cluster`` parameter (see below), Snakemake creates a job script for each job to execute. This script is then invoked using the provided cluster submission command (e.g. ``qsub``). Sometimes you want to provide a custom wrapper for the cluster submission command that decides about additional parameters. As this might be based on properties of the job, Snakemake stores the job properties (e.g. name, rulename, threads, input, output, params etc.) as JSON inside the job script (for group jobs, the rulename will be "GROUP", otherwise it will be the same as the job name). For convenience, there exists a parser function `snakemake.utils.read_job_properties` that can be used to access the properties. The following shows an example job submission wrapper:
.. code-block:: python
......@@ -228,6 +228,8 @@ When executing a workflow on a cluster using the ``--cluster`` parameter (see be
os.system("qsub -t {threads} {script}".format(threads=threads, script=jobscript))
.. _profiles:
--------
Profiles
--------
......
......@@ -493,7 +493,7 @@ To remove all files created by snakemake as output files to start from scratch,
Only files that are output of snakemake rules will be removed, not those that serve as primary inputs to the workflow.
Note that this will only affect the files involved in reaching the specified target(s).
It is strongly advised to first run together with ``--dryrun`` to list the files that would be removed without actually deleting anything.
It is strongly advised to first run together with ``--dry-run`` to list the files that would be removed without actually deleting anything.
The flag ``--delete-temp-output`` can be used in a similar manner to only delete files flagged as temporary.
......@@ -509,7 +509,7 @@ It is recommended to use the script directive instead (see :ref:`snakefiles-exte
My workflow is very large, how do I stop Snakemake from printing all this rule/job information in a dry-run?
------------------------------------------------------------------------------------------------------------
Indeed, the information for each individual job can slow down a dryrun if there are tens of thousands of jobs.
Indeed, the information for each individual job can slow down a dry-run if there are tens of thousands of jobs.
If you are just interested in the final summary, you can use the ``--quiet`` flag to suppress this.
.. code-block:: console
......
......@@ -235,12 +235,19 @@ If limits for the resources are given via the command line, e.g.
$ snakemake --resources mem_mb=100
the scheduler will ensure that the given resources are not exceeded by running jobs.
If no limits are given, the resources are ignored.
If no limits are given, the resources are ignored in local execution.
In cluster or cloud execution, resources are always passed to the backend, even if ``--resources`` is not specified.
Apart from making Snakemake aware of hybrid-computing architectures (e.g. with a limited number of additional devices like GPUs), this allows you to control scheduling in various ways, e.g. to limit IO-heavy jobs by assigning an artificial IO resource to them and limiting it via the ``--resources`` flag.
Resources must be ``int`` values.
Note that you are free to choose any names for the given resources.
When defining memory constraints, it is however advised to use ``mem_mb``, because there are
Snakemake execution modes that make use of this information, (e.g., when using :ref:`kubernetes`).
There are, however, two **standard resources** for memory and disk usage: ``mem_mb`` and ``disk_mb``.
When defining memory constraints, it is advised to use ``mem_mb``, because some execution modes
make direct use of this information (e.g., when using :ref:`Kubernetes <kubernetes>`).
Since it would be cumbersome to define them for every rule, you can set default values at the terminal or in a :ref:`profile <profiles>`.
This works via the command line flag ``--default-resources``, see ``snakemake --help`` for more information.
If those resource definitions are mandatory for a certain execution mode, Snakemake will fail with a hint if they are missing.
Any resource definitions inside a rule override what has been defined with ``--default-resources``.
Resources can also be callables that return ``int`` values.
The signature of the callable has to be ``callable(wildcards [, input] [, threads] [, attempt])`` (``input``, ``threads``, and ``attempt`` are optional parameters).
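For illustration, here is a minimal sketch (rule and file names are hypothetical) combining a plain integer resource with a callable one that scales with the retry attempt:

.. code-block:: python

    rule sort_reads:
        input:
            "mapped/{sample}.bam"
        output:
            "mapped/{sample}.sorted.bam"
        threads: 4
        resources:
            # plain integer; overrides any value set via --default-resources
            disk_mb=2000,
            # callable with the signature described above: request more memory on each retry
            mem_mb=lambda wildcards, attempt: 1000 * attempt
        shell:
            "samtools sort -@ {threads} -o {output} {input}"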
......@@ -298,7 +305,7 @@ Snakemake allows rules to specify numeric priorities:
By default, each rule has a priority of 0. Any rule that specifies a higher priority will be preferred by the scheduler over all rules that are ready to execute at the same time but do not have at least the same priority.
Furthermore, the ``--prioritize`` or ``-P`` command line flag allows you to specify files (or rules) that shall be created with highest priority during the workflow execution. This means that the scheduler will assign the specified target and all its dependencies the highest priority, such that the target is finished as soon as possible.
The ``--dryrun`` or ``-n`` option allows you to see the scheduling plan including the assigned priorities.
The ``--dry-run`` (equivalently ``--dryrun``) or ``-n`` option allows you to see the scheduling plan including the assigned priorities.
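For example, a hypothetical rule could be given a higher priority directly in the Snakefile:

.. code-block:: python

    rule summarize_calls:
        input:
            "calls/all.vcf"
        output:
            "reports/summary.txt"
        # preferred by the scheduler over ready jobs of lower priority (default is 0)
        priority: 50
        shell:
            "wc -l {input} > {output}"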
......
......@@ -47,7 +47,7 @@ In the following, we will introduce the Snakemake syntax by creating an example
The workflow comes from the domain of genome analysis.
It maps sequencing reads to a reference genome and calls variants on the mapped reads.
The tutorial does not require you to know what this is about.
Nevertheless, we provide some background in the following.
Nevertheless, we provide some background in the following paragraph.
.. _tutorial-background:
......@@ -122,7 +122,7 @@ By executing
$ snakemake -np mapped_reads/A.bam
in the working directory containing the Snakefile, we tell Snakemake to generate the target file ``mapped_reads/A.bam``.
Since we used the ``-n`` (or ``--dryrun``) flag, Snakemake will only show the execution plan instead of actually perform the steps.
Since we used the ``-n`` (or ``--dry-run``) flag, Snakemake will only show the execution plan instead of actually performing the steps.
The ``-p`` flag instructs Snakemake to also print the resulting shell command for illustration.
To generate the target files, **Snakemake applies the rules given in the Snakefile in a top-down way**.
The application of a rule to generate a set of output files is called **job**.
......@@ -412,7 +412,7 @@ Create the file ``scripts/plot-quals.py``, with the following content:
Although there are other strategies to invoke separate scripts from your workflow
(e.g., invoking them via shell commands), the benefit of this is obvious:
the script logic is separated from the workflow logic (and can be even shared between workflows),
but **boilerplate code like the parsing of command line arguments in unnecessary**.
but **boilerplate code like the parsing of command line arguments is unnecessary**.
Apart from Python scripts, it is also possible to use R scripts. In R scripts,
an S4 object named ``snakemake`` analog to the Python case above is available and
......
Short tutorial
==============
Here we provide a short tutorial that guides you through the main features of Snakemake.
Note that it is not suited for learning Snakemake from scratch, but rather gives a first impression.
To really learn Snakemake (starting from something simple, and extending towards advanced features), use the main :ref:`tutorial`.
This document shows all steps performed in the official `Snakemake live demo <https://youtu.be/hPrXcUUp70Y>`_,
such that it becomes possible to follow them at your own pace.
Solutions to each step can be found at the bottom of this document.
The examples presented in this tutorial come from bioinformatics.
However, Snakemake is a general-purpose workflow management system for any discipline.
For an explanation of the steps you will perform here, have a look at :ref:`tutorial-background`.
More thorough explanations are provided in the full :ref:`tutorial`.
Prerequisites
-------------
First, install Snakemake via Conda, as outlined in :ref:`conda-install`.
The minimal version of Snakemake is sufficient for this demo.
Second, download and unpack the test data needed for this example from
`here <https://bitbucket.org/snakemake/snakemake-tutorial/get/v5.2.3.tar.bz2>`_,
e.g., via
::
mkdir snakemake-demo
cd snakemake-demo
wget https://bitbucket.org/snakemake/snakemake-tutorial/get/v5.2.3.tar.bz2
tar --wildcards -xf v5.2.3.tar.bz2 --strip 1 "*/data"
Step 1
------
First, create an empty workflow in the current directory with:
::
touch Snakefile
Once a Snakefile is present, you can perform a dry run of Snakemake
with:
::
snakemake -n
Since the Snakefile is empty, it will report that nothing has to be
done. In the next steps, we will gradually fill the Snakefile with an
example analysis workflow.
Step 2
------
The data folder in your working directory looks as follows:
::
data
├── genome.fa
├── genome.fa.amb
├── genome.fa.ann
├── genome.fa.bwt
├── genome.fa.fai
├── genome.fa.pac
├── genome.fa.sa
└── samples
├── A.fastq
├── B.fastq
└── C.fastq
You will create a workflow that maps the sequencing samples in the
``data/samples`` folder to the reference genome ``data/genome.fa``.
Then, you will call genomic variants over the mapped samples, and create
an example plot.
First, create a rule called ``bwa``, with input files
- ``data/genome.fa``
- ``data/samples/A.fastq``
and output file
- ``mapped/A.bam``
To generate output from input, use the shell command
.. code:: python
"bwa mem {input} | samtools view -Sb - > {output}"
Providing a shell command is not enough to run your workflow on an
unprepared system. For reproducibility, you also have to provide the
required software stack and define the desired version. This can be done
with the `Conda package manager <https://conda.io>`__, which is directly
integrated with Snakemake: add a directive
``conda: "envs/mapping.yaml"`` that points to a `Conda environment
definition <https://conda.io/docs/user-guide/tasks/manage-environments.html?highlight=environment#creating-an-environment-file-manually>`__,
with the following content
.. code:: yaml
channels:
- bioconda
- conda-forge
dependencies:
- bwa =0.7.17
- samtools =1.9
Upon execution, Snakemake will automatically create that environment,
and execute the shell command within.
Now, test your workflow by simulating the creation of the file
``mapped/A.bam`` via
::
snakemake --use-conda -n mapped/A.bam
to perform a dry-run and
::
snakemake --use-conda mapped/A.bam
to perform the actual execution.
Step 3
------
Now, generalize the rule ``bwa`` by replacing the concrete sample name
``A`` with a wildcard ``{sample}`` in the input and output files of the rule
``bwa``. This way, Snakemake can apply the rule to map any of the three
available samples to the reference genome.
Test this by creating the file ``mapped/B.bam``.
Step 4
------
Next, create a rule ``sort`` that sorts the obtained ``.bam`` file by
genomic coordinate. The rule should have the input file
- ``mapped/{sample}.bam``
and the output file
- ``mapped/{sample}.sorted.bam``
and uses the shell command
::
samtools sort -o {output} {input}
to perform the sorting. Moreover, use the same ``conda:`` directive as
for the previous rule.
Test your workflow with
::
snakemake --use-conda -n mapped/A.sorted.bam
and
::
snakemake --use-conda mapped/A.sorted.bam
Step 5
------
Now, we aggregate over all samples to perform a joint calling of genomic
variants. First, we define a variable
.. code:: python
samples = ["A", "B", "C"]
at the top of the ``Snakefile``. This serves as a definition of the
samples over which we want to aggregate. In real life, you would
want to use an external sample sheet or a `config
file <http://snakemake.readthedocs.io/en/stable/tutorial/advanced.html#step-2-config-files>`__
for things like this.
For aggregation over many files, Snakemake provides the helper function
``expand`` (see `the
docs <http://snakemake.readthedocs.io/en/stable/tutorial/basics.html#step-5-calling-genomic-variants>`__).
Create a rule ``call`` with input files
- ``fa="data/genome.fa"``
- ``bam=expand("mapped/{sample}.sorted.bam", sample=samples)``
output file
- ``"calls/all.vcf"``
and shell command
::
samtools mpileup -g -f {input.fa} {input.bam} | bcftools call -mv - > {output}
Further, define a new conda environment file with the following content:
.. code:: yaml
channels:
- bioconda
- conda-forge
dependencies:
- bcftools =1.9
- samtools =1.9
Step 6
------
Finally, we strive to calculate some exemplary statistics. This time, we
don’t use a shell command, but rather employ Snakemake’s ability to
integrate with scripting languages like R and Python.
First, we create a rule ``stats`` with input file
- ``"calls/all.vcf"``
and output file
- ``"plots/quals.svg"``.
Instead of a shell command, we write
.. code:: python
script:
"scripts/plot-quals.py"
and create the corresponding script and its containing folder in our
working directory with
::
mkdir scripts
touch scripts/plot-quals.py
We open the script in the editor and add the following content
.. code:: python
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pysam import VariantFile
quals = [record.qual for record in VariantFile(snakemake.input[0])]
plt.hist(quals)
plt.savefig(snakemake.output[0])
As you can see, instead of writing a command line parser for passing
parameters like input and output files, you have direct access to the
properties of the rule via a magic ``snakemake`` object, which Snakemake
automatically inserts into the script before executing the rule.
Finally, we have to define a conda environment for the rule, say
``envs/stats.yaml``, that provides the required Python packages to
execute the script:
.. code:: yaml
channels:
- bioconda
- conda-forge
dependencies:
- pysam =0.15
- matplotlib =3.1
- python =3.7
Make sure to test your workflow with
::
snakemake --use-conda plots/quals.svg
Step 7
------
So far, we have always specified a target file at the command line when
invoking Snakemake. When no target file is specified, Snakemake tries to
execute the first rule in the ``Snakefile``. We can use this property to
define default target files.
At the top of your ``Snakefile`` define a rule ``all``, with input files
- ``"calls/all.vcf"``
- ``"plots/quals.svg"``
and neither a shell command nor output files. This rule simply serves as
an indicator of what shall be collected as results.
Step 8
------
As a last step, we strive to annotate our workflow with some additional
information.
Automatic reports
~~~~~~~~~~~~~~~~~
Snakemake can automatically create HTML reports with
::
snakemake --report report.html
Such a report contains runtime statistics, a visualization of the
workflow topology, used software and data provenance information.
In addition, you can mark any output file generated in your workflow for
inclusion into the report. It will be encoded directly into the report,
such that it can be, e.g., emailed as a self-contained document. The
reader (e.g., a collaborator of yours) can at any time download the
enclosed results from the report for further use, e.g., in a manuscript
you write together. In this example, please mark the output file
``"plots/quals.svg"`` for inclusion by replacing it with
``report("plots/quals.svg", caption="report/calling.rst")`` and adding a
file ``report/calling.rst``, containing some description of the output
file. This description will be presented as caption in the resulting
report.
Threads
~~~~~~~
The first rule ``bwa`` can in theory use multiple threads. You can make
Snakemake aware of this, such that the information can be used for
scheduling. Add a directive ``threads: 8`` to the rule and alter the
shell command to
::
bwa mem -t {threads} {input} | samtools view -Sb - > {output}
This passes the threads defined in the rule as a command line argument
to the ``bwa`` process.
Temporary files
~~~~~~~~~~~~~~~
The output of the ``bwa`` rule becomes superfluous once the sorted
version of the ``.bam`` file is generated by the rule ``sort``.
Snakemake can automatically delete the superfluous output once it is not
needed anymore. For this, mark the output as temporary by replacing
``"mapped/{sample}.bam"`` in the rule ``bwa`` with
``temp("mapped/{sample}.bam")``.
Solutions
---------
Only read this if you have a problem with one of the steps.
.. _step-2-1:
Step 2
~~~~~~
The rule should look like this:
.. code:: python
rule bwa:
input:
"data/genome.fa",
"data/samples/A.fastq"
output:
"mapped/A.bam"
conda:
"envs/mapping.yaml"
shell:
"bwa mem {input} | samtools view -Sb - > {output}"
.. _step-3-1:
Step 3
~~~~~~
The rule should look like this:
.. code:: python
rule bwa:
input:
"data/genome.fa",
"data/samples/{sample}.fastq"
output:
"mapped/{sample}.bam"
conda:
"envs/mapping.yaml"
shell:
"bwa mem {input} | samtools view -Sb - > {output}"
.. _step-4-1:
Step 4
~~~~~~
The rule should look like this:
.. code:: python
rule sort:
input:
"mapped/{sample}.bam"
output:
"mapped/{sample}.sorted.bam"
conda:
"envs/mapping.yaml"
shell:
"samtools sort -o {output} {input}"
.. _step-5-1:
Step 5
~~~~~~
The rule should look like this:
.. code:: python
samples = ["A", "B", "C"]
rule call:
input:
fa="data/genome.fa",
bam=expand("mapped/{sample}.sorted.bam", sample=samples)
output:
"calls/all.vcf"
conda:
"envs/calling.yaml"
shell:
"samtools mpileup -g -f {input.fa} {input.bam} | "
"bcftools call -mv - > {output}"
.. _step-6-1:
Step 6
~~~~~~
The rule should look like this:
.. code:: python
rule stats:
input:
"calls/all.vcf"
output:
"plots/quals.svg"
conda:
"envs/stats.yaml"
script:
"scripts/plot-quals.py"
.. _step-7-1:
Step 7
~~~~~~
The rule should look like this:
.. code:: python
rule all:
input:
"calls/all.vcf",
"plots/quals.svg"
It has to appear as first rule in the ``Snakefile``.
.. _step-8-1:
Step 8
~~~~~~
The complete workflow should look like this:
.. code:: python
samples = ["A", "B"]
rule all:
input:
"calls/all.vcf",
"plots/quals.svg"
rule bwa:
input:
"data/genome.fa",
"data/samples/{sample}.fastq"
output:
temp("mapped/{sample}.bam")
conda:
"envs/mapping.yaml"
threads: 8
shell:
"bwa mem -t {threads} {input} | samtools view -Sb - > {output}"
rule sort:
input:
"mapped/{sample}.bam"
output:
"mapped/{sample}.sorted.bam"
conda:
"envs/mapping.yaml"
shell:
"samtools sort -o {output} {input}"
rule call:
input:
fa="data/genome.fa",
bam=expand("mapped/{sample}.sorted.bam", sample=samples)
output:
"calls/all.vcf"
conda:
"envs/calling.yaml"
shell:
"samtools mpileup -g -f {input.fa} {input.bam} | "
"bcftools call -mv - > {output}"
rule stats:
input:
"calls/all.vcf"
output:
report("plots/quals.svg", caption="report/calling.rst")
conda:
"envs/stats.yaml"
script:
"scripts/plot-quals.py"
......@@ -32,3 +32,4 @@ dependencies:
- xorg-libxpm
- gitpython
- pygments
- imagemagick
" Vim syntax file
" Language: Snakemake (extended from python.vim)
" Maintainer: Jay Hesselberth (jay.hesselberth@gmail.com)
" Last Change: 2016 Jan 23
" Last Change: 2019 Jul 26
"
" Usage
"
......@@ -43,10 +43,11 @@ source $VIMRUNTIME/syntax/python.vim
" singularity = "singularity" ":" stringliteral
" conda = "conda" ":" stringliteral
" shadow = "shadow" ":" stringliteral
" group = "group" ":" stringliteral
syn keyword pythonStatement include workdir onsuccess onerror
syn keyword pythonStatement ruleorder localrules configfile
syn keyword pythonStatement ruleorder localrules configfile group
syn keyword pythonStatement touch protected temp wrapper conda shadow
syn keyword pythonStatement input output params message threads resources singularity
syn keyword pythonStatement version run shell benchmark snakefile log script
......
......@@ -106,7 +106,7 @@ code > span.er { color: #ff0000; font-weight: bold; }
<p>A Snakemake rule has a name (here <code>bwa_map</code>) and a number of directives, here <code>input</code>, <code>output</code> and <code>shell</code>. The <code>input</code> and <code>output</code> directives are followed by lists of files that are expected to be used or created by the rule. In the simplest case, these are just explicit Python strings. The <code>shell</code> directive is followed by a Python string containing the shell command to execute. In the shell command string, we can refer to elements of the rule via braces notation (similar to the Python format function). Here, we refer to the output file by specifying <code>{output}</code> and to the input files by specifying <code>{input}</code>. Since the rule has multiple input files, Snakemake will concatenate them separated by a whitespace. In other words, Snakemake will replace <code>{input}</code> with <code>data/genome.fa data/samples/A.fastq</code> before executing the command. The shell command invokes <code>bwa mem</code> with reference genome and reads, and pipes the output into <code>samtools</code> which creates a compressed BAM file containing the alignments. The output of <code>samtools</code> is piped into the output file defined by the rule.</p>
<p>When a workflow is executed, Snakemake tries to generate given <strong>target</strong> files. Target files can be specified via the command line. By executing</p>
<pre class="sourceCode bash"><code class="sourceCode bash"><span class="kw">snakemake</span> -np mapped_reads/A.bam</code></pre>
<p>in the working directory containing the Snakefile, we tell Snakemake to generate the target file <code>mapped_reads/A.bam</code>. Since we used the <code>-n</code> (or <code>--dryrun</code>) flag, Snakemake will only show the execution plan instead of actually perform the steps. The <code>-p</code> flag instructs Snakemake to also print the resulting shell command for illustation. To generate the target files, <strong>Snakemake applies the rules given in the Snakefile in a top-down way</strong>. The application of a rule to generate a set of output files is called <strong>job</strong>. For each input file of a job, Snakemake again (i.e. recursively) determines rules that can be applied to generate it. This yields a directed acyclic graph (DAG) of jobs where the edges represent dependencies. So far, we only have a single rule, and the DAG of jobs consists of a single node. Nevertheless, we can <strong>execute our workflow</strong> with</p>
<p>in the working directory containing the Snakefile, we tell Snakemake to generate the target file <code>mapped_reads/A.bam</code>. Since we used the <code>-n</code> (or <code>--dry-run</code>) flag, Snakemake will only show the execution plan instead of actually performing the steps. The <code>-p</code> flag instructs Snakemake to also print the resulting shell command for illustration. To generate the target files, <strong>Snakemake applies the rules given in the Snakefile in a top-down way</strong>. The application of a rule to generate a set of output files is called <strong>job</strong>. For each input file of a job, Snakemake again (i.e. recursively) determines rules that can be applied to generate it. This yields a directed acyclic graph (DAG) of jobs where the edges represent dependencies. So far, we only have a single rule, and the DAG of jobs consists of a single node. Nevertheless, we can <strong>execute our workflow</strong> with</p>
<pre class="sourceCode bash"><code class="sourceCode bash"><span class="kw">snakemake</span> mapped_reads/A.bam</code></pre>
<p>Note that, after completion of the above command, Snakemake will not try to create <code>mapped_reads/A.bam</code> again, because it is already present in the file system. Snakemake <strong>only re-runs jobs if one of the input files is newer than one of the output files or one of the input files will be updated by another job</strong>.</p>
<h2 id="step-2-generalizing-the-read-mapping-rule">Step 2: Generalizing the read mapping rule</h2>
......
......@@ -37,6 +37,7 @@ def snakemake(snakefile,
nodes=1,
local_cores=1,
resources=dict(),
default_resources=dict(),
config=dict(),
configfile=None,
config_args=None,
......@@ -141,6 +142,7 @@ def snakemake(snakefile,
nodes (int): the number of provided cluster nodes (ignored without cluster support) (default 1)
local_cores (int): the number of provided local cores if in cluster mode (ignored without cluster support) (default 1)
resources (dict): provided resources, a dictionary assigning integers to resource names, e.g. {gpu=1, io=5} (default {})
default_resources (dict): default values for resources not defined in rules (default {})
config (dict): override values for workflow config
workdir (str): path to working directory (default None)
targets (list): list of targets, e.g. rule or file names (default None)
......@@ -397,7 +399,8 @@ def snakemake(snakefile,
attempt=attempt,
default_remote_provider=_default_remote_provider,
default_remote_prefix=default_remote_prefix,
run_local=run_local)
run_local=run_local,
default_resources=default_resources)
success = True
workflow.include(snakefile,
overwrite_first_rule=True,
......@@ -419,6 +422,7 @@ def snakemake(snakefile,
nodes=nodes,
local_cores=local_cores,
resources=resources,
default_resources=default_resources,
dryrun=dryrun,
touch=touch,
printreason=printreason,
......@@ -566,12 +570,12 @@ def snakemake(snakefile,
return success
def parse_resources(args):
def parse_resources(resources_args, fallback=None):
"""Parse resources from args."""
resources = dict()
if args.resources is not None:
if resources_args is not None:
valid = re.compile("[a-zA-Z_]\w*$")
for res in args.resources:
for res in resources_args:
try:
res, val = res.split("=")
except ValueError:
......@@ -583,8 +587,11 @@ def parse_resources(args):
try:
val = int(val)
except ValueError:
raise ValueError(
"Resource definiton must contain an integer after the identifier.")
if fallback is not None:
val = fallback(val)
else:
raise ValueError(
"Resource definiton must contain an integer after the identifier.")
if res == "_cores":
raise ValueError(
"Resource _cores is already defined internally. Use a different name.")
......@@ -592,6 +599,17 @@ def parse_resources(args):
return resources
def parse_default_resources(resources_args):
"""Parse default resource definition args."""
def fallback(val):
def callable(wildcards, input, attempt, threads, rulename):
value = eval(val, {"input": input, "attempt": attempt, "threads": threads})
return value
return callable
return parse_resources(resources_args, fallback=fallback)
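# Illustrative note (not part of the patch): with this refactoring, plain
# integer definitions stay integers, while --default-resources values that are
# not integers fall back to callables that are evaluated per job, e.g.
#   parse_resources(["mem_mb=100", "gpu=1"])
#       -> {"mem_mb": 100, "gpu": 1}
#   parse_default_resources(["mem_mb=max(2*input.size, 1000)"])
#       -> {"mem_mb": <callable(wildcards, input, attempt, threads, rulename)>}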
def parse_config(args):
"""Parse config from args."""
parsers = [int, float, eval, str]
......@@ -700,10 +718,11 @@ def get_argument_parser(profile=None):
default=None,
help="Targets to build. May be rules or files.")
group_exec.add_argument("--dryrun", "-n",
group_exec.add_argument("--dry-run", "--dryrun", "-n",
dest="dryrun",
action="store_true",
help="Do not execute anything, and display what would be done. "
"If you have a very large workflow, use --dryrun --quiet to just "
"If you have a very large workflow, use --dry-run --quiet to just "
"print a summary of the DAG of jobs.")
group_exec.add_argument("--profile",
......@@ -762,6 +781,15 @@ def get_argument_parser(profile=None):
"use resources by defining the resource keyword, e.g. "
"resources: gpu=1. If now two rules require 1 of the resource "
"'gpu' they won't be run in parallel by the scheduler."))
group_exec.add_argument(
"--default-resources", "--default-res",
nargs="*",
metavar="NAME=INT",
help=("Define default values of resources for rules that do not define their own values. "
"In addition to plain integers, python expressions over inputsize are allowed (e.g. '2*input.size')."
"When specifying this without any arguments (--default-resources), it defines 'mem_mb=max(2*input.size, 1000)' "
"'disk_mb=max(2*input.size, 1000)', i.e., default disk and mem usage is twice the input file size but at least 1GB.")
)
group_exec.add_argument(
"--config", "-C",
nargs="*",
......@@ -968,7 +996,7 @@ def get_argument_parser(profile=None):
group_utils.add_argument(
"--delete-all-output",
action="store_true",
help="Remove all files generated by the workflow. Use together with --dryrun "
help="Remove all files generated by the workflow. Use together with --dry-run "
"to list files without actually deleting anything. Note that this will "
"not recurse into subworkflows. Write-protected files are not removed. "
"Nevertheless, use with care!"
......@@ -977,7 +1005,7 @@ def get_argument_parser(profile=None):
"--delete-temp-output",
action="store_true",
help="Remove all temporary files generated by the workflow. Use together "
"with --dryrun to list files without actually deleting anything. Note "
"with --dry-run to list files without actually deleting anything. Note "
"that this will not recurse into subworkflows."
)
group_utils.add_argument(
......@@ -1191,7 +1219,7 @@ def get_argument_parser(profile=None):
"submitted to the cluster with the given command, once all input "
"files for a particular job are present.\n"
"The submit command can be decorated to make it aware of certain "
"job properties (input, output, params, wildcards, log, threads "
"job properties (name, rulename, input, output, params, wildcards, log, threads "
"and dependencies (see the argument below)), e.g.:\n"
"$ snakemake --cluster 'qsub -pe threaded {threads}'.")),
cluster_mode_group.add_argument(
......@@ -1211,7 +1239,7 @@ def get_argument_parser(profile=None):
"submitted to the cluster with the given command, once all input "
"files for a particular job are present. ARGS can be used to "
"specify options of the underlying cluster system, "
"thereby using the job properties input, output, params, wildcards, log, "
"thereby using the job properties name, rulename, input, output, params, wildcards, log, "
"threads and dependencies, e.g.: "
"--drmaa ' -pe threaded {threads}'. Note that ARGS must be given in quotes and "
"with a leading whitespace.")
......@@ -1384,8 +1412,11 @@ def main(argv=None):
sys.exit(0)
try:
resources = parse_resources(args)
resources = parse_resources(args.resources)
config = parse_config(args)
if args.default_resources is not None and not args.default_resources:
args.default_resources = ["mem_mb=max(2*input.size, 1000)", "disk_mb=max(2*input.size, 1000)"]
default_resources = parse_default_resources(args.default_resources)
except ValueError as e:
print(e, file=sys.stderr)
print("", file=sys.stderr)
......@@ -1505,6 +1536,7 @@ def main(argv=None):
local_cores=args.local_cores,
nodes=args.cores,
resources=resources,
default_resources=default_resources,
config=config,
configfile=args.configfile,
config_args=args.config,
......