Commits on Source (5)
......@@ -4,6 +4,7 @@ jobs:
machine: true
environment:
- GCLOUD: /opt/google-cloud-sdk/bin/gcloud
- BOTO_CONFIG: /dev/null
steps:
- checkout
- restore_cache:
......
.git
.eggs
images
misc
......
......@@ -11,3 +11,9 @@ dist/
.snakemake*
.idea
.pytest*
.cache
.ipynb*
.ropeproject
.test*
# Change Log
# [5.1.5] - 2018-06-24
## Changed
- Fixed missing version info in the Docker image.
- Several minor fixes to EGA support.
# [5.1.4] - 2018-05-28
## Added
- Allow `category` to be set.
## Changed
- Various cosmetic changes to reports.
- Fixed encoding issues in reports.
# [5.1.3] - 2018-05-22
## Changed
- Fixed various bugs in job groups, shadow directive, singularity directive, and more.
# [5.1.2] - 2018-05-18
## Changed
- Fixed a bug in the report stylesheet.
# [5.1.0] - 2018-05-17
## Added
- A new framework for self-contained HTML reports, including results, statistics and topology information. In future releases this will be further extended.
- A new utility `snakemake.utils.validate()` which allows validating config dictionaries and Pandas data frames against JSON schemas.
- Two new flags `--cleanup-shadow` and `--cleanup-conda` to clean up old unused conda and shadow data.
## Changed
- Benchmark repeats are now specified inside the workflow via a new flag `repeat()`.
- Command line interface help has been refactored into groups for better readability.
# [5.0.0] - 2018-05-11
## Added
- Group jobs for reduced queuing and network overhead, in particular with short running jobs.
- Output files can be marked as pipes, such that producing and consuming jobs are executed simultaneously and information is transferred directly without using disk.
- Command line flags to clean output files.
- Command line flag to list files in working directory that are not tracked by Snakemake.
## Changed
- Fixed `--default-remote-prefix` in case of input functions returning lists or dicts.
- Scheduler no longer prefers jobs with many downstream jobs.
# [4.8.1] - 2018-04-25
## Added
- Allow URLs for the conda directive.
......
......@@ -2,15 +2,16 @@ FROM bitnami/minideb:stretch
MAINTAINER Johannes Köster <johannes.koester@tu-dortmund.de>
ENV SINGULARITY_VERSION=2.4.5
ADD . /tmp/repo
RUN install_packages wget bzip2 ca-certificates gnupg2 squashfs-tools
RUN wget -O- http://neuro.debian.net/lists/xenial.us-ca.full > /etc/apt/sources.list.d/neurodebian.sources.list
RUN wget -O- http://neuro.debian.net/_static/neuro.debian.net.asc | apt-key add -
RUN install_packages singularity-container
RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
rm Miniconda3-latest-Linux-x86_64.sh
WORKDIR /tmp/repo
ENV PATH /opt/conda/bin:${PATH}
ENV LANG C.UTF-8
ENV SHELL /bin/bash
RUN conda update -n base conda && conda env update --name root --file /tmp/repo/environment.yml && conda clean --all -y
RUN pip install /tmp/repo
RUN install_packages wget bzip2 ca-certificates gnupg2 squashfs-tools git && \
wget -O- http://neuro.debian.net/lists/xenial.us-ca.full > /etc/apt/sources.list.d/neurodebian.sources.list && \
wget -O- http://neuro.debian.net/_static/neuro.debian.net.asc | apt-key add - && \
install_packages singularity-container && \
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
rm Miniconda3-latest-Linux-x86_64.sh && \
conda update -n base conda && conda env update --name root --file /tmp/repo/environment.yml && conda clean --all -y && \
pip install .
snakemake (4.8.1-1) UNRELEASED; urgency=medium
snakemake (5.1.5-1) UNRELEASED; urgency=medium
* Team upload.
* New upstream version
......@@ -8,7 +8,7 @@ snakemake (4.8.1-1) UNRELEASED; urgency=medium
* Build-Depends: r-cran-rmarkdown and disable patch that skips test using
RMarkdown
* Recommends: r-cran-rmarkdown
* Build-Depends: python-datrie
* Build-Depends: python3-datrie, python3-recommonmark
* Point Vcs-fields to Salsa
* Drop X-Python-Version
......
......@@ -13,6 +13,7 @@ Build-Depends: debhelper (>= 11~),
python3-psutil,
python3-pytools,
python3-ratelimiter,
python3-recommonmark,
python3-rpy2,
python3-setuptools,
python3-sphinx,
......@@ -34,8 +35,11 @@ Depends: ${misc:Depends},
${python3:Depends},
${sphinxdoc:Depends},
python3-docutils,
python3-datrie,
python3-psutil,
python3-pytools,
python3-ratelimiter,
python3-recommonmark,
python3-rpy2,
python3-setuptools,
python3-wrapt,
......
......@@ -10,7 +10,7 @@ sphinx proper, as sphinx.ext.napoleon.
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,7 +33,7 @@
@@ -37,7 +37,7 @@ extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.mathjax',
'sphinx.ext.viewcode',
......
......@@ -6,10 +6,8 @@ Subject: Compat fix
tests/test_symlink_time_handling/Snakefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
Index: snakemake/tests/test_symlink_time_handling/Snakefile
===================================================================
--- snakemake.orig/tests/test_symlink_time_handling/Snakefile
+++ snakemake/tests/test_symlink_time_handling/Snakefile
--- a/tests/test_symlink_time_handling/Snakefile
+++ b/tests/test_symlink_time_handling/Snakefile
@@ -42,7 +42,7 @@ if not os.path.exists("input_file"):
shell("ln -s input_link output_link")
shell("touch -h -t {} output_link".format(timestr(2)))
......
......@@ -10,7 +10,7 @@ Use debian's mathjax package
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -37,6 +37,9 @@ extensions = [
@@ -41,6 +41,9 @@ extensions = [
'sphinxarg.ext'
]
......
......@@ -5,7 +5,7 @@ Description: Ignore failures of test that might happen in pbuilder
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -353,7 +353,11 @@
@@ -371,7 +371,11 @@ def test_input_generator():
def test_symlink_time_handling():
#See Snakefile for notes on why this fails on some systems
if os.utime in os.supports_follow_symlinks:
......
......@@ -4,7 +4,7 @@ Last-Update: Wed, 06 Dec 2017 22:10:17 +0100
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,8 +33,7 @@ extensions = [
@@ -37,8 +37,7 @@ extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.mathjax',
'sphinx.ext.viewcode',
......
......@@ -15,6 +15,10 @@
import sys
import os
from recommonmark.parser import CommonMarkParser
source_parsers = {'.md': CommonMarkParser}
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
......@@ -41,7 +45,7 @@ extensions = [
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
source_suffix = ['.rst', '.md']
# The encoding of source files.
#source_encoding = 'utf-8-sig'
......
......@@ -114,6 +114,11 @@ Of course, if any input or output already defines a different remote location, t
Importantly, this means that Snakemake does **not** require a shared network
filesystem to work in the cloud.
.. sidebar:: Note
Consider :ref:`grouping jobs <snakefiles-grouping>` to minimize overhead, in particular for short-running jobs.
Currently, this mode requires that the Snakemake workflow is stored in a git repository.
Snakemake uses git to query necessary source files (the Snakefile, scripts, config, ...)
for workflow execution and encodes them into the kubernetes job.
......@@ -144,6 +149,11 @@ In this case, Snakemake simply needs to be given a submit command that accepts a
Here, ``-j`` denotes the number of jobs being submitted to the cluster at the same time (here 32).
The cluster command can be decorated with job specific information, e.g.
.. sidebar:: Note
Consider :ref:`grouping jobs <snakefiles-grouping>` to minimize overhead, in particular for short-running jobs.
.. code-block:: console
$ snakemake --cluster "qsub {threads}"
......@@ -176,8 +186,9 @@ With DRMAA, no ``qsub`` command needs to be provided, but system specific argume
Note that the string has to contain a leading whitespace.
Otherwise, the arguments will be interpreted as part of the normal Snakemake arguments, and execution will fail.
Job Properties
..............
~~~~~~~~~~~~~~
When executing a workflow on a cluster using the ``--cluster`` parameter (see below), Snakemake creates a job script for each job to execute. This script is then invoked using the provided cluster submission command (e.g. ``qsub``).

Sometimes you want to provide a custom wrapper for the cluster submission command that decides about additional parameters. As this might be based on properties of the job, Snakemake stores the job properties (e.g. rule name, threads, input files, params, etc.) as JSON inside the job script. For convenience, there exists a parser function ``snakemake.utils.read_job_properties`` that can be used to access the properties. The following shows an example job submission wrapper:
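The original example wrapper is elided in this excerpt. As a minimal self-contained sketch, the following re-implements the parsing that ``snakemake.utils.read_job_properties`` performs, under the assumption that the job script embeds its properties as a ``# properties = {...}`` JSON comment line; in a real wrapper you would import the function from ``snakemake.utils`` instead:

```python
#!/usr/bin/env python3
"""Sketch of a cluster submission wrapper script (illustrative only)."""
import json
import re
import sys


def read_job_properties(jobscript):
    """Extract the JSON job properties embedded in a Snakemake job script.

    Assumes a comment line of the form '# properties = {...}', mimicking
    what snakemake.utils.read_job_properties parses.
    """
    with open(jobscript) as f:
        for line in f:
            m = re.match(r"#\s*properties\s*=\s*(\{.*\})", line)
            if m:
                return json.loads(m.group(1))
    return {}


if __name__ == "__main__" and len(sys.argv) > 1:
    jobscript = sys.argv[1]
    props = read_job_properties(jobscript)
    # Derive submission parameters from job properties, e.g. thread count.
    threads = props.get("threads", 1)
    # Print the submission command; a real wrapper would execute it.
    print("qsub -pe smp {} {}".format(threads, jobscript))
```

The wrapper would then be passed to Snakemake via ``--cluster ./wrapper.py``.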
......
......@@ -80,7 +80,8 @@ To get started, consider the :ref:`tutorial`, the `introductory slides <http://s
Support
-------
* First, check the :ref:`FAQ <project_info-faq>`.
* For releases, see :ref:`Changelog <changelog>`.
* Check :ref:`frequently asked questions (FAQ) <project_info-faq>`.
* In case of questions, please post on `stack overflow <http://stackoverflow.com/questions/tagged/snakemake>`_.
* To discuss with other Snakemake users, you can use the `mailing list <https://groups.google.com/forum/#!forum/snakemake>`_. **Please do not post questions there. Use stack overflow for questions.**
* For bugs and feature requests, please use the `issue tracker <https://bitbucket.org/snakemake/snakemake/issues>`_.
......@@ -191,6 +192,7 @@ Please consider to add your own.
snakefiles/remote_files
snakefiles/utils
snakefiles/deployment
snakefiles/reporting
.. toctree::
......
......@@ -14,7 +14,7 @@ The key idea is very similar to GNU Make. The workflow is determined automatical
.. image:: img/idea.png
:alt: Snakemake idea
When you start using Snakemake, please make sure to walk through the :ref:`official tutorial <tutorial-welcome>`.
When you start using Snakemake, please make sure to walk through the :ref:`official tutorial <tutorial>`.
It is crucial to understand how to properly use the system.
What is the recommended way to distribute a Snakemake workflow?
......@@ -199,7 +199,7 @@ becomes:
Here the double braces are escapes, i.e. there will remain single braces in the final command. In contrast, ``{input}`` is replaced with an input filename.
In addition, if your shell command has literal slashes, `\\ `, you must escape them with a slash, `\\\\ `. For example:
In addition, if your shell command has literal backslashes, ``\\``, you must escape them with a backslash, ``\\\\``. For example:
This:
......@@ -437,6 +437,7 @@ and
Again, the list commands in backticks return the list of output files with changes, which are fed into ``-R`` to trigger a re-run.
How do I remove all files created by snakemake, i.e. like ``make clean``
------------------------------------------------------------------------
......@@ -444,7 +445,12 @@ To remove all files created by snakemake as output files to start from scratch,
.. code-block:: console
rm $(snakemake --summary | tail -n+2 | cut -f1)
$ snakemake some_target --delete-all-output
Only files that are output of Snakemake rules will be removed, not those that serve as primary inputs to the workflow.
Note that this will only affect the files involved in reaching the specified target(s).
It is strongly advised to first run with ``--dryrun`` to list the files that would be removed without actually deleting anything.
The flag ``--delete-temp-output`` can be used in a similar manner to only delete files flagged as temporary.
Why can't I use the conda directive with a run block?
......
.. project_info-history:
.. _changelog:
==========
Change Log
==========
......
......@@ -3,3 +3,4 @@ sphinxcontrib-napoleon
sphinx-argparse
sphinx_rtd_theme
docutils==0.12
recommonmark
......@@ -49,6 +49,76 @@ For adding config placeholders into a shell command, Python string formatting sy
shell:
"mycommand {config[foo]} ..."
---------------------
Tabular configuration
---------------------
It is usually advisable to complement YAML-based configuration (see above) with a sheet-based approach for metadata of tabular form. For example, such
a sheet can contain per-sample information.
With the `Pandas library <https://pandas.pydata.org/>`_ such data can be read and used with minimal overhead, e.g.,
.. code-block:: python
import pandas as pd
samples = pd.read_table("samples.tsv").set_index("samples", drop=False)
reads in a table ``samples.tsv`` in TSV format and makes every record accessible by the sample name.
For details, see the `Pandas documentation <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_table.html?highlight=read_table#pandas-read-table>`_.
A fully working real-world example containing both types of configuration can be found `here <https://github.com/snakemake-workflows/rna-seq-star-deseq2>`_.
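As a small self-contained illustration of the lookup described above, the following reads a sample sheet from an in-memory TSV instead of a ``samples.tsv`` file (the column names ``sample`` and ``condition`` are illustrative assumptions):

```python
import io

import pandas as pd

# An in-memory stand-in for samples.tsv (tab-separated).
tsv = "sample\tcondition\nA\ttreated\nB\tuntreated\n"

# Index by sample name; drop=False keeps the column available as data too.
samples = pd.read_table(io.StringIO(tsv)).set_index("sample", drop=False)

# Every record is now accessible by sample name:
print(samples.loc["A", "condition"])  # treated
```

The resulting index (``samples.index``) can then be used directly in ``expand()`` calls, as shown in the validation example below.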
----------
Validation
----------
With Snakemake 5.1, it is possible to validate both types of configuration via `JSON schemas <http://json-schema.org>`_.
The function ``snakemake.utils.validate`` takes a loaded configuration (a config dictionary or a Pandas data frame) and validates it with a given JSON schema.
The schema can be provided in JSON or YAML format.
For data frames, the schema should model the record that is expected in each row.
In the following example,
.. code-block:: python
import pandas as pd
from snakemake.utils import validate
configfile: "config.yaml"
validate(config, "config.schema.yaml")
samples = pd.read_table(config["samples"]).set_index("sample", drop=False)
validate(samples, "samples.schema.yaml")
rule all:
input:
expand("test.{sample}.txt", sample=samples.index)
rule a:
output:
"test.{sample}.txt"
shell:
"touch {output}"
the schema for validating the samples data frame looks like this:
.. code-block:: yaml
$schema: "http://json-schema.org/draft-06/schema#"
description: an entry in the sample sheet
properties:
sample:
type: string
description: sample name/identifier
condition:
type: string
description: sample condition that will be compared during differential expression analysis (e.g. a treatment, a tissue type, a disease)
required:
- sample
- condition
.. _snakefiles-cluster_configuration:
......
......@@ -24,7 +24,7 @@ Snakemake includes the following remote providers, supported by the correspondin
* GFAL: ``snakemake.remote.gfal``
* GridFTP: ``snakemake.remote.gridftp``
* iRODS: ``snakemake.remote.iRODS``
* EGA: ``snakemake.remote.EGA``
Amazon Simple Storage Service (S3)
==================================
......@@ -725,3 +725,35 @@ download attempt is issued (uploading is not a problem, though).
In the Snakemake source directory in ``snakemake/tests/test_remote_irods`` you
can find a working example.
EGA
===
The European Genome-phenome Archive (EGA) is a service for permanent archiving
and sharing of all types of personally identifiable genetic and phenotypic data
resulting from biomedical research projects.
From version 5.2 on, Snakemake provides experimental support for using EGA as a remote provider, such that
EGA-hosted files can be transparently used as input.
For this to work, you need to define your username and password as environment
variables ``EGA_USERNAME`` and ``EGA_PASSWORD``.
Files in a dataset are addressed via the pattern ``ega/<dataset_id>/<filename>``.
Note that the filename should not include the ``.cip`` suffix that is sometimes displayed in EGA listings:
.. code-block:: python
import snakemake.remote.EGA as EGA
ega = EGA.RemoteProvider()
rule a:
input:
ega.remote("ega/EGAD00001002142/COLO_829_EPleasance_TGENPipe.bam.bai")
output:
"data/COLO_829BL_BCGSC_IlluminaPipe.bam.bai"
shell:
"cp {input} {output}"
Upon download, Snakemake will automatically decrypt the file and check the MD5 hash.
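The decryption and the EGA API interaction are handled internally by Snakemake; purely as an illustration of the integrity check mentioned above, an MD5 verification of a downloaded file can be sketched as follows (the helper name ``md5sum`` is an assumption, not part of the Snakemake API):

```python
import hashlib


def md5sum(path, chunk_size=1 << 20):
    """Compute the MD5 hex digest of a file, streaming in chunks so that
    large files (e.g. BAM files) need not fit in memory."""
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()
```

A caller would compare ``md5sum(downloaded_file)`` against the expected digest reported by the archive and raise an error on mismatch.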