Skip to content
Commits on Source (8)
# Changelog
## [1.2.14] - 2019-12-01
### Changed
- Improved handling of K-groups in zonkey database files
- Change BAM pipeline version requirement for GATK to < v4.0, as the
  Indel Realigner has been removed in GATK v4.0
### Fixed
- Fixed version detection of GATK for v4.0 (issue #23)
## [1.2.13.8] - 2019-10-27
### Changed
- Zonkey now identifies nuclear chromosomes by size instead of name; this
is done to better handle FASTAs downloaded from different sources
## [1.2.13.7] - 2019-10-15
### Fixed
- Fixed handling of digit only chromosome names in Zonkey
- Remove dashes from Zonkey MT genomes when running 'mito' command
## [1.2.13.6] - 2019-10-13
### Fixed
- Handle .*miss files created by some versions of plink in Zonkey
## [1.2.13.5] - 2019-09-29
### Fixed
- Ignore ValidateSamFile REF_SEQ_TOO_LONG_FOR_BAI warning when
  processing genomes with contigs too large for BAI index files.
## [1.2.13.4] - 2019-03-25
### Fixed
- Improved detection of Picard versions in cases where 'java'
......@@ -587,7 +620,12 @@ the (partially) updated documentation now hosted on ReadTheDocs.
- Switching to more traditional version-number tracking.
[Unreleased]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.14...HEAD
[1.2.14]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.8...v1.2.14
[1.2.13.8]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.7...v1.2.13.8
[1.2.13.7]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.6...v1.2.13.7
[1.2.13.6]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.5...v1.2.13.6
[1.2.13.5]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.4...v1.2.13.5
[1.2.13.4]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.3...v1.2.13.4
[1.2.13.3]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.2...v1.2.13.3
[1.2.13.2]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.1...v1.2.13.2
......
paleomix (1.2.13.4-1) UNRELEASED; urgency=medium
paleomix (1.2.14-1) UNRELEASED; urgency=medium
* New upstream version
* debhelper-compat 12
* Standards-Version: 4.4.0
* Standards-Version: 4.4.1
* Use secure URI in Homepage field.
TODO: see https://github.com/MikkelSchubert/paleomix/issues/15
* autopkgtest: s/ADTTMP/AUTOPKGTEST_TMP/g
* Set upstream metadata fields: Repository, Repository-Browse.
TODO: Python3 port see
https://github.com/MikkelSchubert/paleomix/issues/15
-- Andreas Tille <tille@debian.org> Wed, 11 Sep 2019 11:21:12 +0200
-- Andreas Tille <tille@debian.org> Tue, 17 Dec 2019 08:45:11 +0100
paleomix (1.2.13.3-1) unstable; urgency=medium
......
......@@ -15,7 +15,7 @@ Build-Depends: debhelper-compat (= 12),
default-jre-headless,
bowtie2,
rsync
Standards-Version: 4.4.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/paleomix
Vcs-Git: https://salsa.debian.org/med-team/paleomix.git
Homepage: https://geogenetics.ku.dk/publications/paleomix
......
......@@ -3,14 +3,14 @@ set -e
pkg="paleomix"
if [ "$ADTTMP" = "" ] ; then
ADTTMP=`mktemp -d /tmp/${pkg}-test.XXXXXX`
# trap "rm -rf $ADTTMP" 0 INT QUIT ABRT PIPE TERM
if [ "$AUTOPKGTEST_TMP" = "" ] ; then
AUTOPKGTEST_TMP=`mktemp -d /tmp/${pkg}-test.XXXXXX`
# trap "rm -rf $AUTOPKGTEST_TMP" 0 INT QUIT ABRT PIPE TERM
fi
cp -a /usr/share/doc/${pkg}/tests $ADTTMP
cp -a /usr/share/doc/${pkg}/tests $AUTOPKGTEST_TMP
cd $ADTTMP
cd $AUTOPKGTEST_TMP
for gz in `find . -name "*.gz"` ; do
# fasta_file.fasta.gz needs to stay compressed for testing
......
......@@ -23,3 +23,5 @@ Registry:
Entry: PALEOMIX
- Name: conda:bioconda
Entry: NA
Repository: https://github.com/MikkelSchubert/paleomix.git
Repository-Browse: https://github.com/MikkelSchubert/paleomix
......@@ -57,7 +57,7 @@ author = u'Mikkel Schubert'
# The short X.Y version.
version = u'1.2'
# The full version, including alpha/beta/rc tags.
release = u'1.2.13'
release = u'1.2.14'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......
......@@ -21,8 +21,8 @@
# SOFTWARE.
#
__version_info__ = (1, 2, 13, 3)
__version__ = '%i.%i.%i.%i' % __version_info__
__version_info__ = (1, 2, 14)
__version__ = "%i.%i.%i" % __version_info__
def run(command=None):
......
......@@ -45,6 +45,9 @@ from paleomix.nodes.bwa import \
import paleomix.common.versions as versions
_RE_GATK_VERSION = r"^(?:The Genome Analysis Toolkit \(GATK\) v)?(\d+)\.(\d+)"
def _get_gatk_version_check(config):
"""Returns a version-check object for the "GenomeAnalysisTK.jar" located at
config.jar_root; for now, this check only serves to verify that the JAR can
......@@ -60,8 +63,8 @@ def _get_gatk_version_check(config):
# Any version is fine; for now just catch old JREs
requirement = versions.Requirement(call=params.finalized_call,
name="GenomeAnalysisTK",
search=r"^(\d+)\.(\d+)",
checks=versions.Any())
search=_RE_GATK_VERSION,
checks=versions.LT(4, 0))
_GATK_VERSION[jar_file] = requirement
return _GATK_VERSION[jar_file]
_GATK_VERSION = {}
......
......@@ -60,10 +60,13 @@ class PicardNode(CommandNode):
class ValidateBAMNode(PicardNode):
def __init__(self, config, input_bam, input_index=None, output_log=None,
ignored_checks=(), dependencies=()):
ignored_checks=(), big_genome_mode=False, dependencies=()):
builder = picard_command(config, "ValidateSamFile")
_set_max_open_files(builder, "MAX_OPEN_TEMP_FILES")
if True or big_genome_mode:
self._configure_for_big_genome(config, builder)
builder.set_option("I", "%(IN_BAM)s", sep="=")
for check in ignored_checks:
builder.add_option("IGNORE", check, sep="=")
......@@ -79,6 +82,22 @@ class ValidateBAMNode(PicardNode):
description=description,
dependencies=dependencies)
@staticmethod
def _configure_for_big_genome(config, builder):
"""Add ValidateSamFile IGNORE options required for genomes whose
contigs are too large for BAI index files (i.e. CSI-indexed BAMs).
Mutates `builder` in place; returns nothing.
"""
# CSI uses a different method for assigning BINs to records, which
# Picard currently does not support.
builder.add_option("IGNORE", "INVALID_INDEXING_BIN", sep="=")
jar_path = os.path.join(config.jar_root, _PICARD_JAR)
# NOTE(review): assumes this JAR was already registered in
# _PICARD_VERSION_CACHE (presumably by picard_command) -- confirm
# call ordering, otherwise this raises KeyError.
version_check = _PICARD_VERSION_CACHE[jar_path]
try:
# REF_SEQ_TOO_LONG_FOR_BAI was introduced in Picard 2.19.0; older
# versions would reject the unknown IGNORE value.
if version_check.version >= (2, 19, 0):
# Useless warning, as we do not build BAI indexes for large genomes
builder.add_option("IGNORE", "REF_SEQ_TOO_LONG_FOR_BAI", sep="=")
except versions.VersionRequirementError:
pass  # Ignored here, handled elsewhere
class BuildSequenceDictNode(PicardNode):
def __init__(self, config, reference, dependencies=()):
......@@ -247,7 +266,8 @@ def picard_command(config, command):
params = AtomicJavaCmdBuilder(jar_path,
temp_root=config.temp_root,
jre_options=config.jre_options,
CHECK_JAR=version)
CHECK_JAR=version,
set_cwd=True)
params.set_option(command)
return params
......
......@@ -43,15 +43,11 @@ def index_and_validate_bam(config, prefix, node, log_file=None,
# where high-quality reads may cause mis-identification of qualities
"INVALID_QUALITY_FORMAT"]
if prefix['IndexFormat'] == '.csi':
# CSI uses a different method for assigning BINs to records, which
# Picard currently does not support.
ignored_checks.append("INVALID_INDEXING_BIN")
return ValidateBAMNode(config=config,
input_bam=input_file,
input_index=index_file,
ignored_checks=ignored_checks,
big_genome_mode=prefix["IndexFormat"] == ".csi",
output_log=log_file,
dependencies=node)
......
......@@ -258,9 +258,10 @@ def write_summary(args, filename, statistics):
handle.write("%s: %s\n" % (key, statistics.get(key, "MISSING")))
def process_bam(args, data, bam_handle):
def process_bam(args, data, bam_handle, mapping):
reverse_mapping = dict(zip(mapping.values(), mapping))
raw_references = bam_handle.references
references = map(common.contig_name_to_plink_name, raw_references)
references = [reverse_mapping.get(name, name) for name in raw_references]
if args.downsample:
sys.stderr.write("Downsampling to at most %i BAM records ...\n"
......@@ -343,7 +344,7 @@ def main(argv):
"identifiable nuclear alignments.\n")
return 1
process_bam(args, data, bam_handle)
process_bam(args, data, bam_handle, bam_info.nuclear_contigs)
return 0
......
......@@ -56,7 +56,7 @@ def contig_name_to_plink_name(chrom):
identified.
"""
if chrom.isdigit():
return chrom.upper
return chrom
elif chrom.upper() in "XY":
return chrom.upper()
elif chrom.lower().startswith("chr") and chrom[3:].isdigit():
......
......@@ -218,14 +218,14 @@ def _process_samples(config):
"mitochondrial BAM specified; the mitochondrial "
"genome in the first BAM will not be used!")
files["Nuc"] = filename
files["Nuc"] = {"Path": filename, "Info": filetype}
files.setdefault("Mito", filename)
elif filetype.is_nuclear:
if "Nuc" in files:
print_err("ERROR: Two nuclear BAMs specified!")
return False
files["Nuc"] = filename
files["Nuc"] = {"Path": filename, "Info": filetype}
elif filetype.is_mitochondrial:
if "Mito" in files:
print_err("ERROR: Two nuclear BAMs specified!")
......
......@@ -19,7 +19,9 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import os
import re
import tarfile
import pysam
......@@ -46,14 +48,14 @@ _SETTINGS_KEYS = ('Format', 'Revision', 'Plink', 'NChroms', 'MitoPadding',
class BAMInfo(object):
def __init__(self):
self.nuclear = False
self.nuclear_contigs = {}
self.mt_contig = None
self.mt_length = None
self.mt_padding = None
@property
def is_nuclear(self):
return self.nuclear
return bool(self.nuclear_contigs)
@property
def is_mitochondrial(self):
......@@ -62,7 +64,7 @@ class BAMInfo(object):
def __repr__(self):
tmpl = "BAMInfo(nuclear=%r, mt_contig=%r, mt_length=%r, mt_padding=%r)"
return tmpl % (self.nuclear, self.mt_contig,
return tmpl % (self.nuclear_contigs, self.mt_contig,
self.mt_length, self.mt_padding)
......@@ -74,9 +76,10 @@ _SUPPORTED_DB_FORMAT_MINOR = 20160112
# Required columns in the 'contigs.txt' table; additional columns are ignored
_CONTIGS_TABLE_COLUMNS = frozenset(('ID', 'Size', 'Checksum'))
# Required columns in the 'samples.txt' table; additional columns are ignored
_SAMPELS_TABLE_COLUMNS = frozenset(('ID', 'Group(2)', 'Group(3)', 'Species',
'Sex', 'SampleID', 'Publication'))
# Required columns in the 'samples.txt' table; additional non-group columns are ignored
_SAMPLES_TABLE_COLUMNS = frozenset(('ID', 'Species', 'Sex', 'SampleID', 'Publication'))
# Regular expression for parsing Group(K) columns in samples.txt
_SAMPLES_TABLE_GROUP = re.compile(r'^Group\((?P<K>.+)\)$')
class ZonkeyDBError(RuntimeError):
......@@ -103,7 +106,7 @@ class ZonkeyDB(object):
print_info(' - Reading list of contigs ...')
self.contigs = self._read_contigs_table(tar_handle, "contigs.txt")
print_info(' - Reading list of samples ...')
self.samples = self._read_samples_table(tar_handle, "samples.txt")
self.samples, self.groups = self._read_samples_table(tar_handle, "samples.txt")
print_info(' - Reading mitochondrial sequences ...')
self.mitochondria = self._read_mitochondria(tar_handle,
"mitochondria.fasta")
......@@ -196,7 +199,7 @@ class ZonkeyDB(object):
def _read_samples_table(cls, tar_handle, filename):
cls._check_required_file(tar_handle, filename)
samples = cls._read_table(tar_handle, "samples.txt")
samples = cls._read_table(tar_handle, "samples.txt", _SAMPLES_TABLE_COLUMNS)
if not samples:
raise ZonkeyDBError("ERROR: No samples found in genotypes table!")
......@@ -206,16 +209,42 @@ class ZonkeyDB(object):
"expected 'MALE', 'FEMALE', or 'NA'"
% (row["Sex"],))
for k_groups in (2, 3):
key = "Group(%i)" % (k_groups,)
groups = frozenset(row[key] for row in samples.itervalues())
if len(groups - set('-')) not in (0, k_groups):
raise ZonkeyDBError("The %r column in the samples table must "
"either contain %i ancestral groups, or "
"none" % (key, k_groups))
return samples
group_keys = []
for key in samples.values()[0]:
match = _SAMPLES_TABLE_GROUP.match(key)
if match is not None:
k_value = match.groupdict()["K"]
if not k_value.isdigit():
raise ZonkeyDBError("Malformed Group column name; K is "
"not a number: %r" % (key,))
elif not (2 <= int(k_value) <= 7):
raise ZonkeyDBError("K must be between 2 and 7, but found %r"
% (key,))
group_keys.append((key, int(k_value)))
groups = {}
for key, k_value in group_keys:
group = {}
for sample_key, sample in samples.items():
group[sample_key] = sample.pop(key)
group_labels = frozenset(group.values())
if group_labels == frozenset('-'):
continue # Allowed for backwards compatibility
elif '-' in group_labels:
raise ZonkeyDBError("Not all samples column %r assignd a group"
% (key,))
elif len(group_labels) != k_value:
raise ZonkeyDBError("Expected %i groups in column %r, found %i"
% (k_value, key, len(group_labels)))
groups[k_value] = group
if not groups:
raise ZonkeyDBError("No valid groups in samples.txt")
return samples, groups
@classmethod
def _read_sample_order(cls, tar_handle, filename):
......@@ -390,9 +419,7 @@ class ZonkeyDB(object):
% (key, linenum, filename, row[key]))
for key in ('Sample1', 'Sample2'):
group_key = 'Group(%i)' % (row['K'],)
groups = frozenset(row[group_key]
for row in self.samples.itervalues())
groups = frozenset(self.groups[int(row["K"])].values())
if row[key] not in groups and row[key] != '-':
raise ZonkeyDBError('Invalid group in column %r in '
......@@ -508,37 +535,43 @@ def _validate_mito_bam(data, handle, info):
def _validate_nuclear_bam(data, handle, info):
# Check that chromosomes are of expected size; unused chroms are ignored.
bam_contigs = dict(zip(map(contig_name_to_plink_name, handle.references),
handle.lengths))
ref_contigs = data.contigs
contigs_found = {}
for name, stats in sorted(ref_contigs.iteritems()):
if name not in bam_contigs:
contigs_found[name] = False
elif bam_contigs[name] != stats["Size"]:
print_err("\nERROR: Chrom %r in the BAM does not match the "
"length specified in data file:\n"
" - Expected: %i\n"
" - Found: %i"
% (name, bam_contigs[name], stats["Size"]))
return False
# Match reference panel contigs with BAM contigs; identification is done
# by size since different repositories use different naming schemes.
bam_contigs = collections.defaultdict(list)
for name, length in zip(handle.references, handle.lengths):
bam_contigs[length].append(name)
panel_names_to_bam = {}
for name, stats in sorted(data.contigs.iteritems()):
bam_contig_names = bam_contigs.get(stats["Size"], ())
if len(bam_contig_names) == 1:
panel_names_to_bam[name] = bam_contig_names[0]
elif len(bam_contig_names) > 1:
candidates = []
for bam_name in bam_contig_names:
if contig_name_to_plink_name(bam_name) == name:
candidates.append(bam_name)
if len(candidates) == 1:
panel_names_to_bam[name] = candidates[0]
else:
contigs_found[name] = True
print_warn("\WARNING: Multiple candidates for chr%s with size %i:"
% (name, stats["Size"]))
if any(contigs_found.itervalues()):
if not all(contigs_found.itervalues()):
for name in bam_contig_names:
print_warn(" - %r" % (name,))
if len(panel_names_to_bam) == len(data.contigs):
info.nuclear_contigs = panel_names_to_bam
return True
elif panel_names_to_bam:
print_err("\nERROR: Not all nuclear chromosomes found in BAM:")
for (name, stats) in sorted(ref_contigs.iteritems()):
is_found = "Found" if contigs_found[name] else "Not found!"
for (name, stats) in sorted(data.contigs.iteritems()):
is_found = "OK" if name in panel_names_to_bam else "Not found!"
print_err(" - %s: %s" % (name, is_found))
return False
else:
info.nuclear = True
return True
......
......@@ -36,7 +36,6 @@ class AdmixtureError(RuntimeError):
def read_admixture_results(filename, data, k_groups, cutoff=CUTOFF):
key = "Group(%i)" % (k_groups,)
names = tuple(data.sample_order) + ("-",)
table = _admixture_read_results(filename, names)
_admixture_validate_ancestral_groups(data, table, k_groups, cutoff)
......@@ -46,7 +45,7 @@ def read_admixture_results(filename, data, k_groups, cutoff=CUTOFF):
if sample == '-':
continue
group = data.samples[sample][key]
group = data.groups[k_groups][sample]
for index, value in enumerate(row):
if value >= cutoff:
ancestral_groups[index][0].add(group)
......@@ -146,13 +145,10 @@ def _admixture_read_results(filename, samples):
def _admixture_validate_ancestral_groups(data, table, k_groups, cutoff):
key = "Group(%i)" % (k_groups,)
groups = collections.defaultdict(dict)
for sample, row in table.iteritems():
if sample not in data.samples:
continue
group = data.samples[sample][key]
group = data.groups[k_groups].get(sample)
if group is not None:
for index, value in enumerate(row):
if value >= cutoff:
groups[group][index] = True
......
......@@ -34,6 +34,7 @@ _DEFAULT_COLORS = ("#E69F00", "#56B4E9",
class WriteSampleList(Node):
def __init__(self, config, output_file, dependencies=()):
self._samples = config.database.samples
self._groups = config.database.groups
Node.__init__(self,
description="<WriteSampleList -> %r>" % (output_file,),
......@@ -44,17 +45,18 @@ class WriteSampleList(Node):
def _run(self, config, temp):
output_file, = self.output_files
samples = self._samples
groups = set(sample["Group(3)"] for sample in samples.itervalues())
colors = dict(zip(groups, _DEFAULT_COLORS))
group = self._groups[max(self._groups)]
group_colors = dict(zip(sorted(set(group.values())), _DEFAULT_COLORS))
with open(fileutils.reroot_path(temp, output_file), "w") as handle:
handle.write("Name\tGroup\tColor\n")
for name, sample in sorted(samples.iteritems()):
group = sample["Group(3)"]
color = colors[group]
for sample_name in sorted(samples):
group_name = group[sample_name]
group_color = group_colors[group_name]
handle.write("%s\t%s\t%s\n" % (name, group, color))
handle.write("%s\t%s\t%s\n" % (sample_name, group_name, group_color))
handle.write("Sample\t-\t#000000\n")
......
......@@ -192,17 +192,10 @@ class BuildFilteredBEDFilesNode(CommandNode):
class AdmixtureNode(CommandNode):
def __init__(self, input_file, k_groups, output_root,
samples=None, dependencies=()):
self._samples = samples
def __init__(self, input_file, k_groups, output_root, groups, dependencies=()):
self._groups = groups
self._input_file = input_file
self._k_groups = k_groups
group_key = "Group(%i)" % (self._k_groups,)
self._supervised = samples and any((row[group_key] != '-')
for row in samples.itervalues())
assert k_groups in (2, 3), k_groups
prefix = os.path.splitext(os.path.basename(input_file))[0]
output_prefix = os.path.join(output_root,
"%s.%i" % (prefix, k_groups))
......@@ -227,8 +220,6 @@ class AdmixtureNode(CommandNode):
set_cwd=True)
cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))
if self._supervised:
cmd.set_option("--supervised")
cmd.add_value("%(TEMP_OUT_FILE_BED)s")
......@@ -253,20 +244,16 @@ class AdmixtureNode(CommandNode):
basename = os.path.basename(filename)
os.symlink(os.path.abspath(filename), os.path.join(temp, basename))
if self._supervised:
fam_filename = fileutils.swap_ext(self._input_file, ".fam")
pop_filename = fileutils.swap_ext(fam_filename, ".pop")
pop_filename = fileutils.reroot_path(temp, pop_filename)
key = "Group(%i)" % (self._k_groups,)
with open(fam_filename) as fam_handle:
with open(pop_filename, "w") as pop_handle:
for line in fam_handle:
sample, _ = line.split(None, 1)
group = self._samples.get(sample, {}).get(key, "-")
pop_handle.write("%s\n" % (group,))
pop_handle.write("%s\n" % (self._groups.get(sample, "-"),))
class SelectBestAdmixtureNode(Node):
......@@ -399,6 +386,8 @@ class BuildFreqFilesNode(CommandNode):
IN_BIM=input_prefix + ".bim",
IN_FAM=input_prefix + ".fam",
TEMP_OUT_CLUST="samples.clust",
TEMP_OUT_IMISS=basename + ".imiss",
TEMP_OUT_LMISS=basename + ".lmiss",
OUT_NOSEX=output_prefix + ".frq.strat.nosex",
OUT_LOG=output_prefix + ".frq.strat.log",
TEMP_OUT_PREFIX=basename,
......@@ -692,8 +681,9 @@ class PlotPCANode(CommandNode):
class PlotCoverageNode(CommandNode):
def __init__(self, contigs, input_file, output_prefix, dependencies=()):
def __init__(self, contigs, mapping, input_file, output_prefix, dependencies=()):
self._contigs = contigs
self._mapping = dict(zip(mapping.values(), mapping))
self._input_file = input_file
script = rtools.rscript("zonkey", "coverage.r")
......@@ -726,19 +716,11 @@ class PlotCoverageNode(CommandNode):
continue
name, size, hits, _ = line.split('\t')
name = contig_name_to_plink_name(name)
if name is None or not (name.isdigit() or name == 'X'):
continue
elif name not in self._contigs:
name = self._mapping.get(name, name)
if name not in self._contigs:
# Excluding contigs is allowed
continue
if int(size) != self._contigs[name]['Size']:
raise NodeError("Size mismatch between database and BAM; "
"expected size %i, found %i for contig %r"
% (int(size), self._contigs[name]['Size'],
name))
row = {
'ID': name,
'Size': self._contigs[name]['Size'],
......
......@@ -139,26 +139,19 @@ class ReportNode(Node):
output_handle.write(_SECTION_HEADER.format(name="samples",
title="Reference Panel"))
output_handle.write(_SAMPLE_LIST_HEADER)
group_headers = []
for k_value in sorted(self._data.groups):
group_headers.append(" <th>Group(%i)</th>" % k_value)
output_handle.write(_SAMPLE_LIST_HEADER % ('\n'.join(group_headers),))
last_group_2 = None
last_group_3 = None
for row in sorted(self._data.samples.itervalues(),
key=lambda row: (row["Group(2)"],
row["Group(3)"],
row["ID"])):
for name, row in sorted(self._data.samples.iteritems()):
row = dict(row)
if last_group_2 != row["Group(2)"]:
last_group_2 = row["Group(2)"]
last_group_3 = row["Group(3)"]
else:
row["Group(2)"] = ""
if last_group_3 != row["Group(3)"]:
last_group_3 = row["Group(3)"]
else:
row["Group(3)"] = ""
groups = []
for key in self._data.groups:
groups.append(" <td>%s</td>" % self._data.groups[key][name])
row["Groups"] = "\n".join(groups)
pub = row["Publication"]
if pub.startswith("http"):
......@@ -183,7 +176,7 @@ class ReportNode(Node):
overview = _ADMIXTURE_OVERVIEW.format(ADMIXTURE=admixture_v)
output_handle.write(overview)
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
summary_incl = self._build_admixture_cell(k_groups, True)
summary_excl = self._build_admixture_cell(k_groups, False)
......@@ -372,7 +365,7 @@ class AnalysisReport(object):
# Include files showing proportion of ancestral populations,
# which are required to build admixture figures in the reports.
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
input_files.append(os.path.join(admix_root,
"%s.%i.Q" % (postfix,
k_groups)))
......@@ -625,9 +618,8 @@ _OVERVIEW_FOOTER = """
_SAMPLE_LIST_HEADER = """
<table summary="List of samples in the reference panel.">
<tr>
<th>Group(2)</th>
<th>Group(3)</th>
<th>ID</th>
%s
<th>Species</th>
<th>Sex</th>
<th>Sample Name</th>
......@@ -637,9 +629,8 @@ _SAMPLE_LIST_HEADER = """
_SAMPLE_LIST_ROW = """
<tr>
<td>{Group(2)}
<td>{Group(3)}
<td>{ID}</td>
{Groups}
<td><em>{Species}</em></td>
<td>{Sex}</th>
<td>{SampleID}</td>
......
......@@ -146,7 +146,7 @@ class SummaryNode(Node):
admix_root = os.path.join(self._root, sample,
"results", "admixture")
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
filename = os.path.join(admix_root, "%s.%i.Q" % (postfix,
k_groups))
......@@ -189,7 +189,7 @@ class SummaryNode(Node):
admix_root = os.path.join(self._root, sample,
"results", "admixture")
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
filename = os.path.join(admix_root, "%s.%i.Q" % (postfix, k_groups))
lines.append(" <td>")
......
......@@ -37,6 +37,8 @@ from paleomix.common.console import \
print_info, \
print_warn
from paleomix.common.formats.fasta import FASTA
from paleomix.pipeline import \
Pypeline
......@@ -122,7 +124,7 @@ def build_admixture_nodes(config, data, root, plink):
admix_root = os.path.join(root, "results", "admixture")
report_root = os.path.join(root, "figures", "admixture")
for k_groups in (2, 3):
for k_groups in sorted(data.groups):
replicates = []
input_file = os.path.join(plink["root"], postfix + ".bed")
......@@ -132,7 +134,7 @@ def build_admixture_nodes(config, data, root, plink):
node = nuclear.AdmixtureNode(input_file=input_file,
output_root=output_root,
k_groups=k_groups,
samples=data.samples,
groups=data.groups[k_groups],
dependencies=(bed_node,))
replicates.append(node)
......@@ -228,10 +230,11 @@ def build_pca_nodes(config, data, root, plink):
return nodes
def build_coverage_nodes(data, root, nuc_bam, dependencies=()):
def build_coverage_nodes(contigs, mapping, root, nuc_bam, dependencies=()):
output_prefix = os.path.join(root, 'figures', 'coverage', 'coverage')
return nuclear.PlotCoverageNode(contigs=data.contigs,
return nuclear.PlotCoverageNode(contigs=contigs,
mapping=mapping,
input_file=nuc_bam,
output_prefix=output_prefix,
dependencies=dependencies),
......@@ -278,6 +281,8 @@ def build_pipeline(config, root, nuc_bam, mito_bam, cache):
output_file=sample_tbl)
if nuc_bam is not None:
nuc_bam, nuc_bam_info = nuc_bam["Path"], nuc_bam["Info"]
# When not sampling, BuildTPED relies on indexed access to ease
# processing of one chromosome at a time. The index is further required
# for idxstats used by the PlotCoverageNode.
......@@ -292,8 +297,11 @@ def build_pipeline(config, root, nuc_bam, mito_bam, cache):
plink))
if not config.admixture_only:
nodes.extend(build_coverage_nodes(config.database,
root, nuc_bam, (index,)))
nodes.extend(build_coverage_nodes(contigs=config.database.contigs,
mapping=nuc_bam_info.nuclear_contigs,
root=root,
nuc_bam=nuc_bam,
dependencies=(index,)))
nodes.extend(build_pca_nodes(config, config.database,
root, plink))
nodes.extend(build_treemix_nodes(config, config.database,
......@@ -384,8 +392,6 @@ def setup_mito_mapping(config):
info = config.database.samples.get(record.name)
if info is not None:
mkfile.write(" # Group: %s\n"
% (info.get('Group(3)', 'NA'),))
mkfile.write(" # Species: %s\n"
% (info.get('Species', 'NA'),))
mkfile.write(" # Sex: %s\n"
......@@ -401,6 +407,11 @@ def setup_mito_mapping(config):
"%s.fasta" % (record.name,))
with open(fasta_fpath, "w") as fasta_handle:
record = FASTA(
name=record.name,
meta=None,
sequence=record.sequence.replace('-', ''))
fasta_handle.write(str(record))
fasta_handle.write("\n")
......