Skip to content
Commits on Source (8)
# Changelog
## [1.2.14] - 2019-12-01
### Changed
- Improved handling of K-groups in zonkey database files
- Change BAM pipeline version requirement for GATK to < v4.0, as the
  Indel Realigner has been removed in GATK v4.0
### Fixed
- Fixed version detection of GATK for v4.0 (issue #23)
## [1.2.13.8] - 2019-10-27
### Changed
- Zonkey now identifies nuclear chromosomes by size instead of name; this
is done to better handle FASTAs downloaded from different sources
## [1.2.13.7] - 2019-10-15
### Fixed
- Fixed handling of digit only chromosome names in Zonkey
- Remove dashes from Zonkey MT genomes when running 'mito' command
## [1.2.13.6] - 2019-10-13
### Fixed
- Handle .*miss files created by some versions of plink in Zonkey
## [1.2.13.5] - 2019-09-29
### Fixed
- Ignore ValidateSamFile REF_SEQ_TOO_LONG_FOR_BAI warning when
  processing genomes with contigs too large for BAI index files.
## [1.2.13.4] - 2019-03-25
### Fixed
- Improved detection of Picard versions in cases where 'java'
......@@ -587,7 +620,12 @@ the (partially) updated documentation now hosted on ReadTheDocs.
- Switching to more traditional version-number tracking.
[Unreleased]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.14...HEAD
[1.2.14]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.8...v1.2.14
[1.2.13.8]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.7...v1.2.13.8
[1.2.13.7]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.6...v1.2.13.7
[1.2.13.6]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.5...v1.2.13.6
[1.2.13.5]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.4...v1.2.13.5
[1.2.13.4]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.3...v1.2.13.4
[1.2.13.3]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.2...v1.2.13.3
[1.2.13.2]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.13.1...v1.2.13.2
......
paleomix (1.2.13.4-1) UNRELEASED; urgency=medium
paleomix (1.2.14-1) UNRELEASED; urgency=medium
* New upstream version
* debhelper-compat 12
* Standards-Version: 4.4.0
* Standards-Version: 4.4.1
* Use secure URI in Homepage field.
TODO: see https://github.com/MikkelSchubert/paleomix/issues/15
* autopkgtest: s/ADTTMP/AUTOPKGTEST_TMP/g
* Set upstream metadata fields: Repository, Repository-Browse.
TODO: Python3 port see
https://github.com/MikkelSchubert/paleomix/issues/15
-- Andreas Tille <tille@debian.org> Wed, 11 Sep 2019 11:21:12 +0200
-- Andreas Tille <tille@debian.org> Tue, 17 Dec 2019 08:45:11 +0100
paleomix (1.2.13.3-1) unstable; urgency=medium
......
......@@ -15,7 +15,7 @@ Build-Depends: debhelper-compat (= 12),
default-jre-headless,
bowtie2,
rsync
Standards-Version: 4.4.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/paleomix
Vcs-Git: https://salsa.debian.org/med-team/paleomix.git
Homepage: https://geogenetics.ku.dk/publications/paleomix
......
......@@ -3,14 +3,14 @@ set -e
pkg="paleomix"
if [ "$ADTTMP" = "" ] ; then
ADTTMP=`mktemp -d /tmp/${pkg}-test.XXXXXX`
# trap "rm -rf $ADTTMP" 0 INT QUIT ABRT PIPE TERM
if [ "$AUTOPKGTEST_TMP" = "" ] ; then
AUTOPKGTEST_TMP=`mktemp -d /tmp/${pkg}-test.XXXXXX`
# trap "rm -rf $AUTOPKGTEST_TMP" 0 INT QUIT ABRT PIPE TERM
fi
cp -a /usr/share/doc/${pkg}/tests $ADTTMP
cp -a /usr/share/doc/${pkg}/tests $AUTOPKGTEST_TMP
cd $ADTTMP
cd $AUTOPKGTEST_TMP
for gz in `find . -name "*.gz"` ; do
# fasta_file.fasta.gz needs to stay compressed for testing
......
......@@ -23,3 +23,5 @@ Registry:
Entry: PALEOMIX
- Name: conda:bioconda
Entry: NA
Repository: https://github.com/MikkelSchubert/paleomix.git
Repository-Browse: https://github.com/MikkelSchubert/paleomix
......@@ -57,7 +57,7 @@ author = u'Mikkel Schubert'
# The short X.Y version.
version = u'1.2'
# The full version, including alpha/beta/rc tags.
release = u'1.2.13'
release = u'1.2.14'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......
......@@ -21,8 +21,8 @@
# SOFTWARE.
#
__version_info__ = (1, 2, 13, 3)
__version__ = '%i.%i.%i.%i' % __version_info__
__version_info__ = (1, 2, 14)
__version__ = "%i.%i.%i" % __version_info__
def run(command=None):
......
......@@ -45,6 +45,9 @@ from paleomix.nodes.bwa import \
import paleomix.common.versions as versions
_RE_GATK_VERSION = r"^(?:The Genome Analysis Toolkit \(GATK\) v)?(\d+)\.(\d+)"
def _get_gatk_version_check(config):
"""Returns a version-check object for the "GenomeAnalysisTK.jar" located at
config.jar_root; for now, this check only serves to verify that the JAR can
......@@ -60,8 +63,8 @@ def _get_gatk_version_check(config):
# Any version is fine; for now just catch old JREs
requirement = versions.Requirement(call=params.finalized_call,
name="GenomeAnalysisTK",
search=r"^(\d+)\.(\d+)",
checks=versions.Any())
search=_RE_GATK_VERSION,
checks=versions.LT(4, 0))
_GATK_VERSION[jar_file] = requirement
return _GATK_VERSION[jar_file]
_GATK_VERSION = {}
......
......@@ -60,10 +60,13 @@ class PicardNode(CommandNode):
class ValidateBAMNode(PicardNode):
def __init__(self, config, input_bam, input_index=None, output_log=None,
ignored_checks=(), dependencies=()):
ignored_checks=(), big_genome_mode=False, dependencies=()):
builder = picard_command(config, "ValidateSamFile")
_set_max_open_files(builder, "MAX_OPEN_TEMP_FILES")
if True or big_genome_mode:
self._configure_for_big_genome(config, builder)
builder.set_option("I", "%(IN_BAM)s", sep="=")
for check in ignored_checks:
builder.add_option("IGNORE", check, sep="=")
......@@ -79,6 +82,22 @@ class ValidateBAMNode(PicardNode):
description=description,
dependencies=dependencies)
@staticmethod
def _configure_for_big_genome(config, builder):
"""Add ValidateSamFile IGNORE options required for genomes whose
contigs are too large for BAI index files (i.e. CSI-indexed BAMs).
Mutates `builder` in place; returns nothing.
"""
# CSI uses a different method for assigning BINs to records, which
# Picard currently does not support.
builder.add_option("IGNORE", "INVALID_INDEXING_BIN", sep="=")
jar_path = os.path.join(config.jar_root, _PICARD_JAR)
# NOTE(review): assumes this JAR was already registered in
# _PICARD_VERSION_CACHE (presumably by picard_command) -- confirm
# call ordering, otherwise this raises KeyError.
version_check = _PICARD_VERSION_CACHE[jar_path]
try:
# REF_SEQ_TOO_LONG_FOR_BAI was introduced in Picard 2.19.0; older
# versions would reject the unknown IGNORE value.
if version_check.version >= (2, 19, 0):
# Useless warning, as we do not build BAI indexes for large genomes
builder.add_option("IGNORE", "REF_SEQ_TOO_LONG_FOR_BAI", sep="=")
except versions.VersionRequirementError:
pass  # Ignored here, handled elsewhere
class BuildSequenceDictNode(PicardNode):
def __init__(self, config, reference, dependencies=()):
......@@ -247,7 +266,8 @@ def picard_command(config, command):
params = AtomicJavaCmdBuilder(jar_path,
temp_root=config.temp_root,
jre_options=config.jre_options,
CHECK_JAR=version)
CHECK_JAR=version,
set_cwd=True)
params.set_option(command)
return params
......
......@@ -43,15 +43,11 @@ def index_and_validate_bam(config, prefix, node, log_file=None,
# where high-quality reads may cause mis-identification of qualities
"INVALID_QUALITY_FORMAT"]
if prefix['IndexFormat'] == '.csi':
# CSI uses a different method for assigning BINs to records, which
# Picard currently does not support.
ignored_checks.append("INVALID_INDEXING_BIN")
return ValidateBAMNode(config=config,
input_bam=input_file,
input_index=index_file,
ignored_checks=ignored_checks,
big_genome_mode=prefix["IndexFormat"] == ".csi",
output_log=log_file,
dependencies=node)
......
......@@ -258,9 +258,10 @@ def write_summary(args, filename, statistics):
handle.write("%s: %s\n" % (key, statistics.get(key, "MISSING")))
def process_bam(args, data, bam_handle):
def process_bam(args, data, bam_handle, mapping):
reverse_mapping = dict(zip(mapping.values(), mapping))
raw_references = bam_handle.references
references = map(common.contig_name_to_plink_name, raw_references)
references = [reverse_mapping.get(name, name) for name in raw_references]
if args.downsample:
sys.stderr.write("Downsampling to at most %i BAM records ...\n"
......@@ -343,7 +344,7 @@ def main(argv):
"identifiable nuclear alignments.\n")
return 1
process_bam(args, data, bam_handle)
process_bam(args, data, bam_handle, bam_info.nuclear_contigs)
return 0
......
......@@ -56,7 +56,7 @@ def contig_name_to_plink_name(chrom):
identified.
"""
if chrom.isdigit():
return chrom.upper
return chrom
elif chrom.upper() in "XY":
return chrom.upper()
elif chrom.lower().startswith("chr") and chrom[3:].isdigit():
......
......@@ -218,14 +218,14 @@ def _process_samples(config):
"mitochondrial BAM specified; the mitochondrial "
"genome in the first BAM will not be used!")
files["Nuc"] = filename
files["Nuc"] = {"Path": filename, "Info": filetype}
files.setdefault("Mito", filename)
elif filetype.is_nuclear:
if "Nuc" in files:
print_err("ERROR: Two nuclear BAMs specified!")
return False
files["Nuc"] = filename
files["Nuc"] = {"Path": filename, "Info": filetype}
elif filetype.is_mitochondrial:
if "Mito" in files:
print_err("ERROR: Two nuclear BAMs specified!")
......
......@@ -19,7 +19,9 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import os
import re
import tarfile
import pysam
......@@ -46,14 +48,14 @@ _SETTINGS_KEYS = ('Format', 'Revision', 'Plink', 'NChroms', 'MitoPadding',
class BAMInfo(object):
def __init__(self):
self.nuclear = False
self.nuclear_contigs = {}
self.mt_contig = None
self.mt_length = None
self.mt_padding = None
@property
def is_nuclear(self):
return self.nuclear
return bool(self.nuclear_contigs)
@property
def is_mitochondrial(self):
......@@ -62,7 +64,7 @@ class BAMInfo(object):
def __repr__(self):
tmpl = "BAMInfo(nuclear=%r, mt_contig=%r, mt_length=%r, mt_padding=%r)"
return tmpl % (self.nuclear, self.mt_contig,
return tmpl % (self.nuclear_contigs, self.mt_contig,
self.mt_length, self.mt_padding)
......@@ -74,9 +76,10 @@ _SUPPORTED_DB_FORMAT_MINOR = 20160112
# Required columns in the 'contigs.txt' table; additional columns are ignored
_CONTIGS_TABLE_COLUMNS = frozenset(('ID', 'Size', 'Checksum'))
# Required columns in the 'samples.txt' table; additional columns are ignored
_SAMPELS_TABLE_COLUMNS = frozenset(('ID', 'Group(2)', 'Group(3)', 'Species',
'Sex', 'SampleID', 'Publication'))
# Required columns in the 'samples.txt' table; additional non-group columns are ignored
_SAMPLES_TABLE_COLUMNS = frozenset(('ID', 'Species', 'Sex', 'SampleID', 'Publication'))
# Regular expression for parsing Group(K) columns in samples.txt
_SAMPLES_TABLE_GROUP = re.compile(r'^Group\((?P<K>.+)\)$')
class ZonkeyDBError(RuntimeError):
......@@ -103,7 +106,7 @@ class ZonkeyDB(object):
print_info(' - Reading list of contigs ...')
self.contigs = self._read_contigs_table(tar_handle, "contigs.txt")
print_info(' - Reading list of samples ...')
self.samples = self._read_samples_table(tar_handle, "samples.txt")
self.samples, self.groups = self._read_samples_table(tar_handle, "samples.txt")
print_info(' - Reading mitochondrial sequences ...')
self.mitochondria = self._read_mitochondria(tar_handle,
"mitochondria.fasta")
......@@ -196,7 +199,7 @@ class ZonkeyDB(object):
def _read_samples_table(cls, tar_handle, filename):
cls._check_required_file(tar_handle, filename)
samples = cls._read_table(tar_handle, "samples.txt")
samples = cls._read_table(tar_handle, "samples.txt", _SAMPLES_TABLE_COLUMNS)
if not samples:
raise ZonkeyDBError("ERROR: No samples found in genotypes table!")
......@@ -206,16 +209,42 @@ class ZonkeyDB(object):
"expected 'MALE', 'FEMALE', or 'NA'"
% (row["Sex"],))
for k_groups in (2, 3):
key = "Group(%i)" % (k_groups,)
groups = frozenset(row[key] for row in samples.itervalues())
if len(groups - set('-')) not in (0, k_groups):
raise ZonkeyDBError("The %r column in the samples table must "
"either contain %i ancestral groups, or "
"none" % (key, k_groups))
return samples
group_keys = []
for key in samples.values()[0]:
match = _SAMPLES_TABLE_GROUP.match(key)
if match is not None:
k_value = match.groupdict()["K"]
if not k_value.isdigit():
raise ZonkeyDBError("Malformed Group column name; K is "
"not a number: %r" % (key,))
elif not (2 <= int(k_value) <= 7):
raise ZonkeyDBError("K must be between 2 and 7, but found %r"
% (key,))
group_keys.append((key, int(k_value)))
groups = {}
for key, k_value in group_keys:
group = {}
for sample_key, sample in samples.items():
group[sample_key] = sample.pop(key)
group_labels = frozenset(group.values())
if group_labels == frozenset('-'):
continue # Allowed for backwards compatibility
elif '-' in group_labels:
raise ZonkeyDBError("Not all samples column %r assignd a group"
% (key,))
elif len(group_labels) != k_value:
raise ZonkeyDBError("Expected %i groups in column %r, found %i"
% (k_value, key, len(group_labels)))
groups[k_value] = group
if not groups:
raise ZonkeyDBError("No valid groups in samples.txt")
return samples, groups
@classmethod
def _read_sample_order(cls, tar_handle, filename):
......@@ -390,9 +419,7 @@ class ZonkeyDB(object):
% (key, linenum, filename, row[key]))
for key in ('Sample1', 'Sample2'):
group_key = 'Group(%i)' % (row['K'],)
groups = frozenset(row[group_key]
for row in self.samples.itervalues())
groups = frozenset(self.groups[int(row["K"])].values())
if row[key] not in groups and row[key] != '-':
raise ZonkeyDBError('Invalid group in column %r in '
......@@ -508,37 +535,43 @@ def _validate_mito_bam(data, handle, info):
def _validate_nuclear_bam(data, handle, info):
# Check that chromosomes are of expected size; unused chroms are ignored.
bam_contigs = dict(zip(map(contig_name_to_plink_name, handle.references),
handle.lengths))
ref_contigs = data.contigs
contigs_found = {}
for name, stats in sorted(ref_contigs.iteritems()):
if name not in bam_contigs:
contigs_found[name] = False
elif bam_contigs[name] != stats["Size"]:
print_err("\nERROR: Chrom %r in the BAM does not match the "
"length specified in data file:\n"
" - Expected: %i\n"
" - Found: %i"
% (name, bam_contigs[name], stats["Size"]))
return False
# Match reference panel contigs with BAM contigs; identification is done
# by size since different repositories use different naming schemes.
bam_contigs = collections.defaultdict(list)
for name, length in zip(handle.references, handle.lengths):
bam_contigs[length].append(name)
panel_names_to_bam = {}
for name, stats in sorted(data.contigs.iteritems()):
bam_contig_names = bam_contigs.get(stats["Size"], ())
if len(bam_contig_names) == 1:
panel_names_to_bam[name] = bam_contig_names[0]
elif len(bam_contig_names) > 1:
candidates = []
for bam_name in bam_contig_names:
if contig_name_to_plink_name(bam_name) == name:
candidates.append(bam_name)
if len(candidates) == 1:
panel_names_to_bam[name] = candidates[0]
else:
contigs_found[name] = True
print_warn("\WARNING: Multiple candidates for chr%s with size %i:"
% (name, stats["Size"]))
if any(contigs_found.itervalues()):
if not all(contigs_found.itervalues()):
for name in bam_contig_names:
print_warn(" - %r" % (name,))
if len(panel_names_to_bam) == len(data.contigs):
info.nuclear_contigs = panel_names_to_bam
return True
elif panel_names_to_bam:
print_err("\nERROR: Not all nuclear chromosomes found in BAM:")
for (name, stats) in sorted(ref_contigs.iteritems()):
is_found = "Found" if contigs_found[name] else "Not found!"
for (name, stats) in sorted(data.contigs.iteritems()):
is_found = "OK" if name in panel_names_to_bam else "Not found!"
print_err(" - %s: %s" % (name, is_found))
return False
else:
info.nuclear = True
return True
......
......@@ -36,7 +36,6 @@ class AdmixtureError(RuntimeError):
def read_admixture_results(filename, data, k_groups, cutoff=CUTOFF):
key = "Group(%i)" % (k_groups,)
names = tuple(data.sample_order) + ("-",)
table = _admixture_read_results(filename, names)
_admixture_validate_ancestral_groups(data, table, k_groups, cutoff)
......@@ -46,7 +45,7 @@ def read_admixture_results(filename, data, k_groups, cutoff=CUTOFF):
if sample == '-':
continue
group = data.samples[sample][key]
group = data.groups[k_groups][sample]
for index, value in enumerate(row):
if value >= cutoff:
ancestral_groups[index][0].add(group)
......@@ -146,13 +145,10 @@ def _admixture_read_results(filename, samples):
def _admixture_validate_ancestral_groups(data, table, k_groups, cutoff):
key = "Group(%i)" % (k_groups,)
groups = collections.defaultdict(dict)
for sample, row in table.iteritems():
if sample not in data.samples:
continue
group = data.samples[sample][key]
group = data.groups[k_groups].get(sample)
if group is not None:
for index, value in enumerate(row):
if value >= cutoff:
groups[group][index] = True
......
......@@ -34,6 +34,7 @@ _DEFAULT_COLORS = ("#E69F00", "#56B4E9",
class WriteSampleList(Node):
def __init__(self, config, output_file, dependencies=()):
self._samples = config.database.samples
self._groups = config.database.groups
Node.__init__(self,
description="<WriteSampleList -> %r>" % (output_file,),
......@@ -44,17 +45,18 @@ class WriteSampleList(Node):
def _run(self, config, temp):
output_file, = self.output_files
samples = self._samples
groups = set(sample["Group(3)"] for sample in samples.itervalues())
colors = dict(zip(groups, _DEFAULT_COLORS))
group = self._groups[max(self._groups)]
group_colors = dict(zip(sorted(set(group.values())), _DEFAULT_COLORS))
with open(fileutils.reroot_path(temp, output_file), "w") as handle:
handle.write("Name\tGroup\tColor\n")
for name, sample in sorted(samples.iteritems()):
group = sample["Group(3)"]
color = colors[group]
for sample_name in sorted(samples):
group_name = group[sample_name]
group_color = group_colors[group_name]
handle.write("%s\t%s\t%s\n" % (name, group, color))
handle.write("%s\t%s\t%s\n" % (sample_name, group_name, group_color))
handle.write("Sample\t-\t#000000\n")
......
......@@ -192,17 +192,10 @@ class BuildFilteredBEDFilesNode(CommandNode):
class AdmixtureNode(CommandNode):
def __init__(self, input_file, k_groups, output_root,
samples=None, dependencies=()):
self._samples = samples
def __init__(self, input_file, k_groups, output_root, groups, dependencies=()):
self._groups = groups
self._input_file = input_file
self._k_groups = k_groups
group_key = "Group(%i)" % (self._k_groups,)
self._supervised = samples and any((row[group_key] != '-')
for row in samples.itervalues())
assert k_groups in (2, 3), k_groups
prefix = os.path.splitext(os.path.basename(input_file))[0]
output_prefix = os.path.join(output_root,
"%s.%i" % (prefix, k_groups))
......@@ -227,8 +220,6 @@ class AdmixtureNode(CommandNode):
set_cwd=True)
cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))
if self._supervised:
cmd.set_option("--supervised")
cmd.add_value("%(TEMP_OUT_FILE_BED)s")
......@@ -253,20 +244,16 @@ class AdmixtureNode(CommandNode):
basename = os.path.basename(filename)
os.symlink(os.path.abspath(filename), os.path.join(temp, basename))
if self._supervised:
fam_filename = fileutils.swap_ext(self._input_file, ".fam")
pop_filename = fileutils.swap_ext(fam_filename, ".pop")
pop_filename = fileutils.reroot_path(temp, pop_filename)
key = "Group(%i)" % (self._k_groups,)
with open(fam_filename) as fam_handle:
with open(pop_filename, "w") as pop_handle:
for line in fam_handle:
sample, _ = line.split(None, 1)
group = self._samples.get(sample, {}).get(key, "-")
pop_handle.write("%s\n" % (group,))
pop_handle.write("%s\n" % (self._groups.get(sample, "-"),))
class SelectBestAdmixtureNode(Node):
......@@ -399,6 +386,8 @@ class BuildFreqFilesNode(CommandNode):
IN_BIM=input_prefix + ".bim",
IN_FAM=input_prefix + ".fam",
TEMP_OUT_CLUST="samples.clust",
TEMP_OUT_IMISS=basename + ".imiss",
TEMP_OUT_LMISS=basename + ".lmiss",
OUT_NOSEX=output_prefix + ".frq.strat.nosex",
OUT_LOG=output_prefix + ".frq.strat.log",
TEMP_OUT_PREFIX=basename,
......@@ -692,8 +681,9 @@ class PlotPCANode(CommandNode):
class PlotCoverageNode(CommandNode):
def __init__(self, contigs, input_file, output_prefix, dependencies=()):
def __init__(self, contigs, mapping, input_file, output_prefix, dependencies=()):
self._contigs = contigs
self._mapping = dict(zip(mapping.values(), mapping))
self._input_file = input_file
script = rtools.rscript("zonkey", "coverage.r")
......@@ -726,19 +716,11 @@ class PlotCoverageNode(CommandNode):
continue
name, size, hits, _ = line.split('\t')
name = contig_name_to_plink_name(name)
if name is None or not (name.isdigit() or name == 'X'):
continue
elif name not in self._contigs:
name = self._mapping.get(name, name)
if name not in self._contigs:
# Excluding contigs is allowed
continue
if int(size) != self._contigs[name]['Size']:
raise NodeError("Size mismatch between database and BAM; "
"expected size %i, found %i for contig %r"
% (int(size), self._contigs[name]['Size'],
name))
row = {
'ID': name,
'Size': self._contigs[name]['Size'],
......
......@@ -139,26 +139,19 @@ class ReportNode(Node):
output_handle.write(_SECTION_HEADER.format(name="samples",
title="Reference Panel"))
output_handle.write(_SAMPLE_LIST_HEADER)
group_headers = []
for k_value in sorted(self._data.groups):
group_headers.append(" <th>Group(%i)</th>" % k_value)
output_handle.write(_SAMPLE_LIST_HEADER % ('\n'.join(group_headers),))
last_group_2 = None
last_group_3 = None
for row in sorted(self._data.samples.itervalues(),
key=lambda row: (row["Group(2)"],
row["Group(3)"],
row["ID"])):
for name, row in sorted(self._data.samples.iteritems()):
row = dict(row)
if last_group_2 != row["Group(2)"]:
last_group_2 = row["Group(2)"]
last_group_3 = row["Group(3)"]
else:
row["Group(2)"] = ""
if last_group_3 != row["Group(3)"]:
last_group_3 = row["Group(3)"]
else:
row["Group(3)"] = ""
groups = []
for key in self._data.groups:
groups.append(" <td>%s</td>" % self._data.groups[key][name])
row["Groups"] = "\n".join(groups)
pub = row["Publication"]
if pub.startswith("http"):
......@@ -183,7 +176,7 @@ class ReportNode(Node):
overview = _ADMIXTURE_OVERVIEW.format(ADMIXTURE=admixture_v)
output_handle.write(overview)
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
summary_incl = self._build_admixture_cell(k_groups, True)
summary_excl = self._build_admixture_cell(k_groups, False)
......@@ -372,7 +365,7 @@ class AnalysisReport(object):
# Include files showing proportion of ancestral populations,
# which are required to build admixture figures in the reports.
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
input_files.append(os.path.join(admix_root,
"%s.%i.Q" % (postfix,
k_groups)))
......@@ -625,9 +618,8 @@ _OVERVIEW_FOOTER = """
_SAMPLE_LIST_HEADER = """
<table summary="List of samples in the reference panel.">
<tr>
<th>Group(2)</th>
<th>Group(3)</th>
<th>ID</th>
%s
<th>Species</th>
<th>Sex</th>
<th>Sample Name</th>
......@@ -637,9 +629,8 @@ _SAMPLE_LIST_HEADER = """
_SAMPLE_LIST_ROW = """
<tr>
<td>{Group(2)}
<td>{Group(3)}
<td>{ID}</td>
{Groups}
<td><em>{Species}</em></td>
<td>{Sex}</th>
<td>{SampleID}</td>
......
......@@ -146,7 +146,7 @@ class SummaryNode(Node):
admix_root = os.path.join(self._root, sample,
"results", "admixture")
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
filename = os.path.join(admix_root, "%s.%i.Q" % (postfix,
k_groups))
......@@ -189,7 +189,7 @@ class SummaryNode(Node):
admix_root = os.path.join(self._root, sample,
"results", "admixture")
for k_groups in (2, 3):
for k_groups in sorted(self._data.groups):
filename = os.path.join(admix_root, "%s.%i.Q" % (postfix, k_groups))
lines.append(" <td>")
......
......@@ -37,6 +37,8 @@ from paleomix.common.console import \
print_info, \
print_warn
from paleomix.common.formats.fasta import FASTA
from paleomix.pipeline import \
Pypeline
......@@ -122,7 +124,7 @@ def build_admixture_nodes(config, data, root, plink):
admix_root = os.path.join(root, "results", "admixture")
report_root = os.path.join(root, "figures", "admixture")
for k_groups in (2, 3):
for k_groups in sorted(data.groups):
replicates = []
input_file = os.path.join(plink["root"], postfix + ".bed")
......@@ -132,7 +134,7 @@ def build_admixture_nodes(config, data, root, plink):
node = nuclear.AdmixtureNode(input_file=input_file,
output_root=output_root,
k_groups=k_groups,
samples=data.samples,
groups=data.groups[k_groups],
dependencies=(bed_node,))
replicates.append(node)
......@@ -228,10 +230,11 @@ def build_pca_nodes(config, data, root, plink):
return nodes
def build_coverage_nodes(data, root, nuc_bam, dependencies=()):
def build_coverage_nodes(contigs, mapping, root, nuc_bam, dependencies=()):
output_prefix = os.path.join(root, 'figures', 'coverage', 'coverage')
return nuclear.PlotCoverageNode(contigs=data.contigs,
return nuclear.PlotCoverageNode(contigs=contigs,
mapping=mapping,
input_file=nuc_bam,
output_prefix=output_prefix,
dependencies=dependencies),
......@@ -278,6 +281,8 @@ def build_pipeline(config, root, nuc_bam, mito_bam, cache):
output_file=sample_tbl)
if nuc_bam is not None:
nuc_bam, nuc_bam_info = nuc_bam["Path"], nuc_bam["Info"]
# When not sampling, BuildTPED relies on indexed access to ease
# processing of one chromosome at a time. The index is further required
# for idxstats used by the PlotCoverageNode.
......@@ -292,8 +297,11 @@ def build_pipeline(config, root, nuc_bam, mito_bam, cache):
plink))
if not config.admixture_only:
nodes.extend(build_coverage_nodes(config.database,
root, nuc_bam, (index,)))
nodes.extend(build_coverage_nodes(contigs=config.database.contigs,
mapping=nuc_bam_info.nuclear_contigs,
root=root,
nuc_bam=nuc_bam,
dependencies=(index,)))
nodes.extend(build_pca_nodes(config, config.database,
root, plink))
nodes.extend(build_treemix_nodes(config, config.database,
......@@ -384,8 +392,6 @@ def setup_mito_mapping(config):
info = config.database.samples.get(record.name)
if info is not None:
mkfile.write(" # Group: %s\n"
% (info.get('Group(3)', 'NA'),))
mkfile.write(" # Species: %s\n"
% (info.get('Species', 'NA'),))
mkfile.write(" # Sex: %s\n"
......@@ -401,6 +407,11 @@ def setup_mito_mapping(config):
"%s.fasta" % (record.name,))
with open(fasta_fpath, "w") as fasta_handle:
record = FASTA(
name=record.name,
meta=None,
sequence=record.sequence.replace('-', ''))
fasta_handle.write(str(record))
fasta_handle.write("\n")
......