Skip to content
Commits on Source (5)
......@@ -11,3 +11,4 @@ dist/
TAGS
evidence_dump/
nosetests.xml
_deps/
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander, Jim Drake
import cProfile, logging, os.path, sys
......@@ -37,6 +7,7 @@ from collections import OrderedDict, defaultdict
from .options import options
from GenomicConsensus import reference, consensus, utils, windows
from .io.VariantsGffWriter import VariantsGffWriter
from .io.VariantsVcfWriter import VariantsVcfWriter
from pbcore.io import FastaWriter, FastqWriter
class ResultCollector(object):
......@@ -84,7 +55,10 @@ class ResultCollector(object):
self.consensusChunksByRefId = defaultdict(list)
# open file writers
self.fastaWriter = self.fastqWriter = self.gffWriter = None
self.fastaWriter = None
self.fastqWriter = None
self.gffWriter = None
self.vcfWriter = None
if options.fastaOutputFilename:
self.fastaWriter = FastaWriter(options.fastaOutputFilename)
if options.fastqOutputFilename:
......@@ -93,6 +67,10 @@ class ResultCollector(object):
self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
vars(options),
reference.byName.values())
if options.vcfOutputFilename:
self.vcfWriter = VariantsVcfWriter(options.vcfOutputFilename,
vars(options),
reference.byName.values())
def onResult(self, result):
window, cssAndVariants = result
......@@ -105,6 +83,7 @@ class ResultCollector(object):
if self.fastaWriter: self.fastaWriter.close()
if self.fastqWriter: self.fastqWriter.close()
if self.gffWriter: self.gffWriter.close()
if self.vcfWriter: self.vcfWriter.close()
logging.info("Output files completed.")
def _recordNewResults(self, window, css, variants):
......@@ -122,8 +101,12 @@ class ResultCollector(object):
if basesProcessed == requiredBases:
# This contig is done, so we can dump to file and delete
# the data structures.
if self.gffWriter:
self.gffWriter.writeVariants(sorted(self.variantsByRefId[refId]))
if self.gffWriter or self.vcfWriter:
variants = sorted(self.variantsByRefId[refId])
if self.gffWriter:
self.gffWriter.writeVariants(variants)
if self.vcfWriter:
self.vcfWriter.writeVariants(variants)
del self.variantsByRefId[refId]
#
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander, Jim Drake
import cProfile, logging, os.path
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander, David Seifert
# Author: David Alexander
__VERSION__ = "2.1.0"
__VERSION__ = '2.2.2' # don't forget to update setup.py and doc/conf.py too
#!/usr/bin/env python
#################################################################################
# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from .utils import die
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import utils
import model
from __future__ import absolute_import
from . import utils
from . import model
# import evidence
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import logging, os.path
......@@ -52,7 +22,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
depthLimit, arrowConfig):
"""
High-level routine for calling the consensus for a
window of the genome given a cmp.h5.
window of the genome given a BAM file.
Identifies the coverage contours of the window in order to
identify subintervals where a good consensus can be called.
......@@ -112,17 +82,22 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
(reference.windowToString(subWin),
" ".join([str(hit.readName) for hit in alns])))
alnsUsed = [] if options.reportEffectiveCoverage else None
css = U.consensusForAlignments(subWin,
intRefSeq,
clippedAlns,
arrowConfig)
arrowConfig,
alnsUsed=alnsUsed)
# Tabulate the coverage implied by these alignments, as
# well as the post-filtering ("effective") coverage
siteCoverage = U.coverageInWindow(subWin, alns)
effectiveSiteCoverage = U.coverageInWindow(subWin, alnsUsed) if options.reportEffectiveCoverage else None
variants_ = U.variantsFromConsensus(subWin, windowRefSeq,
css.sequence, css.confidence, siteCoverage,
options.aligner,
ai=None)
variants_, newPureCss = U.variantsFromConsensus(subWin, windowRefSeq, css.sequence, css.confidence,
siteCoverage, effectiveSiteCoverage,
options.aligner, ai=None,
diploid=arrowConfig.polishDiploid)
filteredVars = filterVariants(options.minCoverage,
options.minConfidence,
......@@ -133,11 +108,18 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
variants += filteredVars
# The nascent consensus sequence might contain ambiguous bases, these
# need to be removed as software in the wild cannot deal with such
# characters and we only use IUPAC for *internal* bookkeeping.
if arrowConfig.polishDiploid:
css.sequence = newPureCss
# Dump?
maybeDumpEvidence = \
((options.dumpEvidence == "all") or
(options.dumpEvidence == "outliers") or
(options.dumpEvidence == "variants") and (len(variants) > 0))
(options.dumpEvidence == "variants") and
(len(variants) > 0) and css.hasEvidence)
if maybeDumpEvidence:
refId, refStart, refEnd = subWin
refName = reference.idToName(refId)
......@@ -148,7 +130,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
ev = ArrowEvidence.fromConsensus(css)
if options.dumpEvidence != "outliers":
ev.save(windowDirectory)
elif (np.max(np.abs(ev.delta)) > 20):
elif (np.max(ev.delta) > 20):
# Mathematically I don't think we should be seeing
# deltas > 6 in magnitude, but let's just restrict
# attention to truly bonkers outliers.
......@@ -249,19 +231,39 @@ def configure(options, alnFile):
"The Arrow algorithm requires a BAM file containing standard (non-CCS) reads." )
if options.diploid:
logging.warn("Diploid analysis not yet supported under Arrow model.")
logging.info(
"Diploid polishing in the Arrow model is in *BETA* mode.\n"
"Any multi-base string that appears in annotation files\n"
"is not phased!")
# load parameters from file
if options.parametersFile:
logging.info("Loading model parameters from: ({0})".format(options.parametersFile))
if not cc.LoadModels(options.parametersFile):
die("Arrow: unable to load parameters from: ({0})".format(options.parametersFile))
# test available chemistries
supp = set(cc.SupportedChemistries())
logging.info("Found consensus models for: ({0})".format(", ".join(sorted(supp))))
used = set(alnFile.sequencingChemistry)
used = set([])
if options.parametersSpec != "auto":
used = set([options.parametersSpec])
unsupp = used - supp
if unsupp:
die("Arrow: unsupported chemistries found: ({0})".format(", ".join(sorted(unsupp))))
logging.info("Overriding model selection with: ({0})".format(options.parametersSpec))
if not cc.OverrideModel(options.parametersSpec):
die("Arrow: unable to override model with: ({0})".format(options.parametersSpec))
used.add(options.parametersSpec)
else:
used.update(alnFile.sequencingChemistry)
unsupp = used - supp
if used - supp:
die("Arrow: unsupported chemistries found: ({0})".format(", ".join(sorted(unsupp))))
# All arrow models require PW except P6 and the first S/P1-C1
for readGroup in alnFile.readGroupTable:
if set([readGroup["SequencingChemistry"]]) - set(["P6-C4", "S/P1-C1/beta"]):
if ("Ipd" not in readGroup["BaseFeatures"] or
"PulseWidth" not in readGroup["BaseFeatures"]):
die("Arrow model requires missing base feature: IPD or PulseWidth")
logging.info("Using consensus models for: ({0})".format(", ".join(sorted(used))))
......@@ -272,8 +274,9 @@ def configure(options, alnFile):
minHqRegionSnr=options.minHqRegionSnr,
minZScore=options.minZScore,
minAccuracy=options.minAccuracy,
chemistryOverride=(None if options.parametersSpec == "auto"
else options.parametersSpec))
maskRadius=options.maskRadius,
maskErrorRate=options.maskErrorRate,
polishDiploid=options.diploid)
def slaveFactories(threaded):
# By default we use slave processes. The tuple ordering is important.
......
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
from GenomicConsensus.arrow.utils import allSingleBaseMutations
from GenomicConsensus.variants import Variant
from GenomicConsensus.quiver.diploid import variantsFromAlignment
import numpy as np
import ConsensusCore2 as cc
......@@ -77,9 +49,9 @@ def packMuts(cssBase, mut1, mut2):
#
nonNullMut = mut1 or mut2
start = nonNullMut.Start()
mutType = nonNullMut.Type
newBase1 = mut1.Base if mut1 else cssBase
newBase2 = mut2.Base if mut2 else cssBase
mutType = nonNullMut.Type()
newBase1 = mut1.Bases() if mut1 else cssBase
newBase2 = mut2.Bases() if mut2 else cssBase
newBasePacked = packIUPAC((newBase1, newBase2))
return cc.Mutation(mutType, start, newBasePacked)
......@@ -148,76 +120,3 @@ def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
refSequenceInWindow, cssSequenceInWindow,
cssQvInWindow, siteCoverage)
return vars
def variantsFromAlignment(refWindow, refSeq, cssSeq,
cssQV=None, refCoverage=None):
"""
Extract the variants implied by a pairwise alignment of cssSeq to
refSeq reference. If cssQV, refCoverage are provided, they will
be used to decorate the variants with those attributes.
Arguments:
- cssQV: QV array, same length as css
- refCoverage: coverage array, sample length as reference window
This is trickier than in the haploid case. We have to break out
diploid variants as single bases, in order to avoid implying
phase.
"""
variants = []
refId, refStart, refEnd = refWindow
aln = cc.AlignAffineIupac(refSeq, cssSeq);
alnTarget = aln.Target()
alnQuery = aln.Query()
assert (cssQV is None) == (refCoverage is None) # Both or none
assert len(refSeq) == refEnd - refStart
assert cssQV is None or len(cssSeq) == len(cssQV)
assert refCoverage is None or len(refSeq) == len(refCoverage)
transcript = [ X if (Q != "N" and T != "N") else "N"
for (X, T, Q) in zip(aln.Transcript(),
alnTarget,
alnQuery) ]
variants = []
runStart = -1
runStartRefPos = None
runX = None
refPos = refStart
for pos, (X, T, Q) in enumerate(zip(transcript,
alnTarget,
alnQuery)):
if X != runX or isHeterozygote(Q):
if runStart >= 0 and runX not in "MN":
# Package up the run and dump a variant
ref = alnTarget[runStart:pos].replace("-", "")
read = alnQuery [runStart:pos].replace("-", "")
if isHeterozygote(read):
allele1, allele2 = unpackIUPAC(read)
var = Variant(refId, runStartRefPos, refPos, ref, allele1, allele2)
else:
var = Variant(refId, runStartRefPos, refPos, ref, read)
variants.append(var)
runStart = pos
runStartRefPos = refPos
runX = X
if T != "-": refPos += 1
# This might be better handled within the loop above, just keeping
# track of Qpos, Tpos
if cssQV is not None:
cssPosition = cc.TargetToQueryPositions(aln)
for v in variants:
# HACK ALERT: we are not really handling the confidence or
# coverage for variants at last position of the window
# correctly here.
refPos_ = min(v.refStart-refStart, len(refCoverage)-1)
cssPos_ = min(cssPosition[v.refStart-refStart], len(cssQV)-1)
if refCoverage is not None: v.coverage = refCoverage[refPos_]
if cssQV is not None: v.confidence = cssQV[cssPos_]
return variants
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
__all__ = [ "ArrowEvidence" ]
import h5py, logging, os.path, numpy as np
import logging, os.path, numpy as np
from collections import namedtuple
from itertools import groupby
from bisect import bisect_left, bisect_right
......@@ -41,7 +11,9 @@ from .utils import scoreMatrix
from .. import reference
class ArrowEvidence(object):
"""If used at runtime, this will require h5py
(for storing/loading 'arrow-scores.h5')
"""
Mutation = namedtuple("Mutation", ("Position", "Type", "FromBase", "ToBase"))
@staticmethod
......@@ -105,6 +77,7 @@ class ArrowEvidence(object):
refWindow = (refName, refStart, refEnd)
with FastaReader(dir + "/consensus.fa") as fr:
consensus = next(iter(fr)).sequence
import h5py
with h5py.File(dir + "/arrow-scores.h5", "r") as f:
scores = f["Scores"].value
baselineScores = f["BaselineScores"].value
......@@ -131,7 +104,7 @@ class ArrowEvidence(object):
logging.info("Dumping evidence to %s" % (dir,))
join = os.path.join
if os.path.exists(dir):
raise Exception, "Evidence dump does not expect directory %s to exist." % dir
raise Exception("Evidence dump does not expect directory %s to exist." % dir)
os.makedirs(dir)
#refFasta = FastaWriter(join(dir, "reference.fa"))
#readsFasta = FastaWriter(join(dir, "reads.fa"))
......@@ -143,6 +116,7 @@ class ArrowEvidence(object):
consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
consensusFasta.close()
import h5py
arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"))
arrowScoreFile.create_dataset("Scores", data=self.scores)
vlen_str = h5py.special_dtype(vlen=str)
......@@ -150,6 +124,7 @@ class ArrowEvidence(object):
arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str)
arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores)
arrowScoreFile.close()
# for aln in alns:
# readsFasta.writeRecord(str(aln.rowNumber),
# aln.read(orientation="genomic", aligned=False))
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import numpy as np, ConfigParser, collections, logging
......@@ -66,7 +36,9 @@ class ArrowConfig(object):
minHqRegionSnr=3.75,
minZScore=-3.5,
minAccuracy=0.82,
chemistryOverride=None):
maskRadius=0,
maskErrorRate=0.5,
polishDiploid=False):
self.minMapQV = minMapQV
self.minPoaCoverage = minPoaCoverage
......@@ -81,7 +53,9 @@ class ArrowConfig(object):
self.minHqRegionSnr = minHqRegionSnr
self.minZScore = minZScore
self.minAccuracy = minAccuracy
self.chemistryOverride = chemistryOverride
self.maskRadius = maskRadius
self.maskErrorRate = maskErrorRate
self.polishDiploid = polishDiploid
def extractMappedRead(self, aln, windowStart):
"""
......@@ -99,7 +73,7 @@ class ArrowConfig(object):
rawFeature = aln.baseFeature(featureName, aligned=False, orientation="native")
return rawFeature.clip(0,255).astype(np.uint8)
else:
return np.zeros((aln.qLen,), dtype=np.uint8)
return np.zeros((aln.readLength,), dtype=np.uint8)
name = aln.readName
chemistry = aln.sequencingChemistry
......@@ -109,7 +83,7 @@ class ArrowConfig(object):
cc.Uint8Vector(baseFeature("Ipd").tolist()),
cc.Uint8Vector(baseFeature("PulseWidth").tolist()),
cc.SNR(aln.hqRegionSnr),
chemistry if self.chemistryOverride is None else self.chemistryOverride)
chemistry)
return cc.MappedRead(read,
strand,
int(aln.referenceStart - windowStart),
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import numpy as np, itertools, logging, re, sys
from collections import Counter
from traceback import format_exception
from GenomicConsensus.variants import *
from GenomicConsensus.utils import *
......@@ -53,19 +22,15 @@ def uniqueSingleBaseMutations(templateSequence, positions=None):
# snvs
for subsBase in allBases:
if subsBase != tplBase:
yield cc.Mutation(cc.MutationType_SUBSTITUTION,
tplStart,
subsBase)
yield cc.Mutation_Substitution(tplStart, subsBase)
# Insertions---only allowing insertions that are not cognate
# with the previous base.
for insBase in allBases:
if insBase != prevTplBase:
yield cc.Mutation(cc.MutationType_INSERTION,
tplStart,
insBase)
yield cc.Mutation_Insertion(tplStart, insBase)
# Deletion--only allowed if refBase does not match previous tpl base
if tplBase != prevTplBase:
yield cc.Mutation(cc.MutationType_DELETION, tplStart)
yield cc.Mutation_Deletion(tplStart, 1)
def allSingleBaseMutations(templateSequence, positions=None):
"""
......@@ -79,16 +44,12 @@ def allSingleBaseMutations(templateSequence, positions=None):
# snvs
for subsBase in allBases:
if subsBase != tplBase:
yield cc.Mutation(cc.MutationType_SUBSTITUTION,
tplStart,
subsBase)
yield cc.Mutation_Substitution(tplStart, subsBase)
# Insertions
for insBase in allBases:
yield cc.Mutation(cc.MutationType_INSERTION,
tplStart,
insBase)
yield cc.Mutation_Insertion(tplStart, insBase)
# Deletion
yield cc.Mutation(cc.MutationType_DELETION, tplStart)
yield cc.Mutation_Deletion(tplStart, 1)
def nearbyMutations(mutations, tpl, neighborhoodSize):
"""
......@@ -124,14 +85,18 @@ def bestSubset(mutationsAndScores, separation):
return output
def refineConsensus(ai, arrowConfig):
def refineConsensus(ai, arrowConfig, polishDiploid=False):
"""
Given a MultiReadMutationScorer, identify and apply favorable
template mutations. Return (consensus, didConverge) :: (str, bool)
"""
cfg = cc.PolishConfig(arrowConfig.maxIterations,
arrowConfig.mutationSeparation,
arrowConfig.mutationNeighborhood)
arrowConfig.mutationNeighborhood,
polishDiploid)
if arrowConfig.maskRadius:
_ = cc.Polish(ai, cfg)
ai.MaskIntervals(arrowConfig.maskRadius, arrowConfig.maskErrorRate)
polishResult = cc.Polish(ai, cfg)
return str(ai), polishResult.hasConverged
......@@ -144,7 +109,50 @@ def consensusConfidence(ai, positions=None):
"""
return np.array(np.clip(cc.ConsensusQualities(ai), 0, 93), dtype=np.uint8)
def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
IUPACdict = {
'N' : ('N'),
'n' : ('n'),
'A' : ('A'),
'a' : ('a'),
'C' : ('C'),
'c' : ('c'),
'G' : ('G'),
'g' : ('g'),
'T' : ('T'),
't' : ('t'),
'M' : ('A', 'C'),
'm' : ('a', 'c'),
'R' : ('A', 'G'),
'r' : ('a', 'g'),
'W' : ('A', 'T'),
'w' : ('a', 't'),
'S' : ('C', 'G'),
's' : ('c', 'g'),
'Y' : ('C', 'T'),
'y' : ('c', 't'),
'K' : ('G', 'T'),
'k' : ('g', 't')}
def splitupIUPAC(css):
listSeq1 = [IUPACdict[x][0] for x in css]
listSeq2 = [IUPACdict[x][-1] for x in css]
if listSeq1 == listSeq2:
# haploid
readSeq1 = ''.join(listSeq1)
readSeq2 = None
freq1 = None
freq2 = None
else:
# diploid
readSeq1 = ''.join(listSeq1)
readSeq2 = ''.join(listSeq2)
freq1 = 0.5
freq2 = 0.5
return readSeq1, readSeq2, freq1, freq2
def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None, effectiveSiteCoverage=None):
"""
Extract the variants implied by a pairwise alignment to the
reference.
......@@ -161,6 +169,10 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
grouper = lambda row: "N" if (row[1]=="N" or row[2]=="N") else row[0]
runs = itertools.groupby(tbl, grouper)
# track predecessor "anchor" base for vcf output of indel variants
refPrev = "N"
cssPrev = "N"
for code, run in runs:
assert code in "RIDMN"
run = list(run)
......@@ -174,11 +186,20 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
pass
elif code == "R":
assert len(css)==len(ref)
variant = Variant(refId, refPos, refPos+len(css), ref, css)
css, readSeq2, freq1, freq2 = splitupIUPAC(css)
variant = Variant(refId=refId, refStart=refPos, refEnd=refPos+len(css),
refSeq=ref, readSeq1=css, readSeq2=readSeq2,
frequency1=freq1, frequency2=freq2,
refPrev=refPrev, readPrev=cssPrev)
elif code == "I":
variant = Variant(refId, refPos, refPos, "", css)
css, readSeq2, freq1, freq2 = splitupIUPAC(css)
variant = Variant(refId=refId, refStart=refPos, refEnd=refPos,
refSeq="", readSeq1=css, readSeq2=readSeq2,
frequency1=freq1, frequency2=freq2,
refPrev=refPrev, readPrev=cssPrev)
elif code == "D":
variant = Variant(refId, refPos, refPos + len(ref), ref, "")
variant = Variant(refId, refPos, refPos + len(ref), ref, "",
refPrev=refPrev, readPrev=cssPrev)
if variant is not None:
# HACK ALERT: variants at the first and last position
......@@ -186,6 +207,11 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
if siteCoverage is not None and np.size(siteCoverage) > 0:
refPos_ = min(refPos-refStart, len(siteCoverage)-1)
variant.coverage = siteCoverage[refPos_]
if effectiveSiteCoverage is not None and np.size(effectiveSiteCoverage) > 0:
refPos_ = min(refPos-refStart, len(siteCoverage)-1)
variant.annotate("effectiveCoverage", effectiveSiteCoverage[refPos_])
#import ipdb
#ipdb.set_trace()
if cssQvInWindow is not None and np.size(cssQvInWindow) > 0:
cssPos_ = min(cssPos, len(cssQvInWindow)-1)
variant.confidence = cssQvInWindow[cssPos_]
......@@ -193,6 +219,10 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
refPos += refLen
cssPos += cssLen
ref = ref.replace("-", "")
css = css.replace("-", "")
refPrev = ref[-1] if ref else refPrev
cssPrev = css[-1] if css else cssPrev
return variants
......@@ -231,12 +261,10 @@ def _shortMutationDescription(mut, tpl):
201 Sub C > T
201 Del C > .
"""
_type = _typeMap[mut.Type]
_type = _typeMap[mut.Type()]
_pos = mut.Start()
_oldBase = "." if mut.Type == cc.MutationType_INSERTION \
else tpl[_pos]
_newBase = "." if mut.Type == cc.MutationType_DELETION \
else mut.Base
_oldBase = "." if mut.IsInsertion() else tpl[_pos]
_newBase = "." if mut.IsDeletion() else mut.Bases()
return "%d %s %s > %s" % (_pos, _type, _oldBase, _newBase)
def scoreMatrix(ai):
......@@ -266,24 +294,68 @@ def scoreMatrix(ai):
rowNames = readNames
return (rowNames, columnNames, baselineScores, scoreMatrix)
def constructIUPACfreeConsensus(a):
targetAln = a.Target()
queryAln = a.Query()
assert len(targetAln) == len(queryAln)
pureCss = []
for i in xrange(len(queryAln)):
curBase = queryAln[i]
if curBase != '-':
if curBase == 'N' or curBase == 'n':
newBase = curBase
elif targetAln[i] in IUPACdict[curBase]:
# construct new sequence with a
# minimum of divergence, i.e.
# target: A
# query: R (=A+G)
# -> new base = A
newBase = targetAln[i]
else:
newBase = IUPACdict[curBase][0]
pureCss.append(newBase)
newCss = ''.join(pureCss)
# Be absolutely sure that *really* all
# ambiguous bases have been removed.
for i in ('M', 'm', 'R', 'r', 'W', 'w', 'S', 's', 'Y', 'y', 'K', 'k'):
assert newCss.find(i) == -1
return newCss
def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
cssQvInWindow=None, siteCoverage=None, aligner="affine",
ai=None):
cssQvInWindow=None, siteCoverage=None, effectiveSiteCoverage=None,
aligner="affine", ai=None, diploid=False):
"""
Compare the consensus and the reference in this window, returning
a list of variants.
"""
refId, refStart, refEnd = refWindow
if aligner == "affine":
align = cc.AlignAffine
if diploid:
align = cc.AlignAffineIupac
else:
align = cc.Align
newCss = ""
if aligner == "affine":
align = cc.AlignAffine
else:
align = cc.Align
ga = align(refSequenceInWindow, cssSequenceInWindow)
return variantsFromAlignment(ga, refWindow, cssQvInWindow, siteCoverage)
if diploid:
newCss = constructIUPACfreeConsensus(ga)
# new de-IUPACed sequence still
# needs to be of same length
assert(len(newCss) == len(cssSequenceInWindow))
return variantsFromAlignment(ga, refWindow, cssQvInWindow, siteCoverage, effectiveSiteCoverage), newCss
def filterAlns(refWindow, alns, arrowConfig):
......@@ -328,33 +400,61 @@ def sufficientlyAccurate(mappedRead, poaCss, minAccuracy):
return acc >= minAccuracy
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
def poaConsensus(fwdSequences, arrowConfig):
seqLens = [len(seq) for seq in fwdSequences]
median = np.median(seqLens)
ordSeqs = sorted(fwdSequences, key=lambda seq: abs(len(seq) - median))
ordSeqs = ordSeqs[:arrowConfig.maxPoaCoverage]
cov = len(ordSeqs)
minCov = 1 if cov < 5 else ((cov + 1) / 2 - 1)
poaConfig = cc.DefaultPoaConfig(cc.AlignMode_GLOBAL)
return cc.PoaConsensus.FindConsensus(ordSeqs, poaConfig, minCov)
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig, draft=None, polish=True,
alnsUsed=None):
"""
Call consensus on this interval---without subdividing the interval
further.
Testable!
Returns an ArrowConsensus object.
Requires that clipping has already been done.
If `draft` is provided, it will serve as the starting
point for polishing. If not, the POA will be used to generate a
draft starting point.
Clipping has already been done!
If `polish` is False, the arrow polishing procedure will not be
used, and the draft consensus will be returned.
`alnsUsed` is an output parameter; if not None, it should be an
empty list on entry; on return from this function, the list will
contain the alns objects that were actually used to compute the
consensus (those not filtered out).
"""
_, refStart, refEnd = refWindow
# Compute the POA consensus, which is our initial guess, and
# should typically be > 99.5% accurate
fwdSequences = [ a.read(orientation="genomic", aligned=False)
for a in alns
if a.spansReferenceRange(refStart, refEnd) ]
assert len(fwdSequences) >= arrowConfig.minPoaCoverage
try:
p = cc.PoaConsensus.FindConsensus(fwdSequences[:arrowConfig.maxPoaCoverage])
except:
logging.info("%s: POA could not be generated" % (refWindow,))
return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
refWindow, refSequence)
ga = cc.Align(refSequence, p.Sequence)
numPoaVariants = ga.Errors()
poaCss = p.Sequence
if alnsUsed is not None:
assert alnsUsed == []
if draft is None:
# Compute the POA consensus, which is our initial guess, and
# should typically be > 99.5% accurate
fwdSequences = [ a.read(orientation="genomic", aligned=False)
for a in alns
if a.spansReferenceRange(refStart, refEnd) ]
assert len(fwdSequences) >= arrowConfig.minPoaCoverage
try:
p = poaConsensus(fwdSequences, arrowConfig)
except Exception:
logging.info("%s: POA could not be generated" % (refWindow,))
return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
refWindow, refSequence)
draft = p.Sequence
ga = cc.Align(refSequence, draft)
# Extract reads into ConsensusCore2-compatible objects, and map them into the
# coordinates relative to the POA consensus
......@@ -364,15 +464,15 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
# Load the mapped reads into the mutation scorer, and iterate
# until convergence.
ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
ai = cc.Integrator(draft, cc.IntegratorConfig(arrowConfig.minZScore))
coverage = 0
for mr in mappedReads:
for i, mr in enumerate(mappedReads):
if (mr.TemplateEnd <= mr.TemplateStart or
mr.TemplateEnd - mr.TemplateStart < 2 or
mr.Length() < 2):
continue
if not sufficientlyAccurate(mr, poaCss, arrowConfig.minAccuracy):
tpl = poaCss[mr.TemplateStart:mr.TemplateEnd]
if not sufficientlyAccurate(mr, draft, arrowConfig.minAccuracy):
tpl = draft[mr.TemplateStart:mr.TemplateEnd]
if mr.Strand == cc.StrandType_FORWARD:
pass
elif mr.Strand == cc.StrandType_REVERSE:
......@@ -383,36 +483,58 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
continue
if ai.AddRead(mr) == cc.State_VALID:
coverage += 1
if alnsUsed is not None:
alnsUsed.append(alns[i])
# Iterate until covergence
if coverage < arrowConfig.minPoaCoverage:
logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
refWindow, refSequence)
_, converged = refineConsensus(ai, arrowConfig)
if not polish:
confidence = np.zeros(len(draft), dtype=int)
return ArrowConsensus(refWindow, draft, confidence, ai)
# Iterate until covergence
_, converged = refineConsensus(ai, arrowConfig, polishDiploid=False)
if converged:
arrowCss = str(ai)
if arrowConfig.computeConfidence:
confidence = consensusConfidence(ai)
else:
confidence = np.zeros(shape=len(arrowCss), dtype=int)
return ArrowConsensus(refWindow,
arrowCss,
confidence,
ai)
else:
logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
refWindow, refSequence)
if arrowConfig.polishDiploid:
# additional rounds of diploid polishing
_, converged = refineConsensus(ai, arrowConfig, polishDiploid=True)
if converged:
arrowCss = str(ai)
if arrowConfig.computeConfidence:
confidence = consensusConfidence(ai)
else:
confidence = np.zeros(shape=len(arrowCss), dtype=int)
else:
logging.info("%s: Arrow (diploid) did not converge to optimal solution" % (refWindow,))
return ArrowConsensus(refWindow,
arrowCss,
confidence,
ai)
def coverageInWindow(refWin, hits):
winId, winStart, winEnd = refWin
a = np.array([(hit.referenceStart, hit.referenceEnd)
for hit in hits
if hit.referenceName == winId])
tStart = a[:,0]
tEnd = a[:,1]
cov = projectIntoRange(tStart, tEnd, winStart, winEnd)
return cov
if len(a) == 0:
return np.zeros(winEnd - winStart, dtype=np.uint)
else:
tStart = a[:,0]
tEnd = a[:,1]
cov = projectIntoRange(tStart, tEnd, winStart, winEnd)
return cov
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
import numpy as np
......@@ -85,6 +55,14 @@ class Consensus(object):
factory = d[noCallStyle]
return factory(refWin, refSequence)
@property
def hasEvidence(self):
if isinstance(self, ArrowConsensus) and self.ai is None:
return False
if isinstance(self, QuiverConsensus) and self.mms is None:
return False
return True
class QuiverConsensus(Consensus):
"""
......@@ -142,7 +120,7 @@ def join(consensi):
assert len(consensi) >= 1
sortedConsensi = sorted(consensi)
if not areContiguous([cssChunk.refWindow for cssChunk in sortedConsensi]):
raise ValueError, "Consensus chunks must be contiguous"
raise ValueError("Consensus chunks must be contiguous")
joinedRefWindow = (sortedConsensi[0].refWindow[0],
sortedConsensi[0].refWindow[1],
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
import time
......@@ -46,9 +16,9 @@ def gffVariantFrequency(var):
if var.frequency1==None:
return None
elif var.isHeterozygous:
return "%d/%d" % (var.frequency1, var.frequency2)
return "{0:.3g}/{1:.3g}".format(var.frequency1, var.frequency2)
else:
return str(var.frequency1)
return "{0:.3g}".format(var.frequency1)
def toGffRecord(var):
varType = var.variantType
......
# Author: Lance Hepler
from __future__ import division, print_function
import time
from textwrap import dedent
from GenomicConsensus import __VERSION__, reference
def vcfVariantFrequency(var, labels):
if var.frequency1 is None:
return None
elif var.isHeterozygous:
denom = var.frequency1 + var.frequency2
names = ['frequency{}'.format(label) for label in labels]
freqs = [getattr(var, name) for name in names if getattr(var, name) is not None]
return 'AF={}'.format(','.join('{:.3g}'.format(f / denom) for f in freqs))
else:
# the frequency is 100%, so no need
return None
class VariantsVcfWriter(object):
def __init__(self, f, optionsDict, referenceEntries):
self._vcfFile = open(f, "w")
print(dedent('''\
##fileformat=VCFv4.3
##fileDate={date}
##source=GenomicConsensusV{version}
##reference={reference}''').format(
date=time.strftime("%Y%m%d"),
version=__VERSION__,
reference="file://" + optionsDict["referenceFilename"],
), file=self._vcfFile)
# reference contigs
for entry in referenceEntries:
print("##contig=<ID={name},length={length}>".format(
name=entry.name,
length=entry.length
# TODO(lhepler): evaluate adding md5 hexdigest here on large genomes
), file=self._vcfFile)
print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", file=self._vcfFile)
def writeVariants(self, variants):
for var in variants:
pos = var.refStart
ref = ""
alt = ""
labels = (1, 2)
# insertion or deletion
if var.refSeq == "" or var.readSeq1 == "" or \
(var.isHeterozygous and var.readSeq2 == ""):
# we're anchored on the previous base so no 0- to 1-indexing
# correction required
ref = var.refPrev + var.refSeq
if var.isHeterozygous:
alt = ",".join(var.readPrev + seq for seq in (var.readSeq1, var.readSeq2))
else:
alt = var.readPrev + var.readSeq1
# substitution
else:
# due to 1-indexing, pos needs to be incremented
pos += 1
ref = var.refSeq
if var.isHeterozygous:
alt = ",".join(seq for seq in (var.readSeq1, var.readSeq2))
if var.refSeq == var.readSeq1:
# first variant is same as wildtype
alt = var.readSeq2
labels = (2,)
elif var.refSeq == var.readSeq2:
# second variant is same as wildtype
alt = var.readSeq1
labels = (1,)
else:
# both variants differ from wildtype
alt = ",".join(seq for seq in (var.readSeq1, var.readSeq2))
else:
alt = var.readSeq1
freq = vcfVariantFrequency(var=var, labels=labels)
info = "DP={0}".format(var.coverage)
if freq:
info = info + ";" + freq
print("{chrom}\t{pos}\t{id}\t{ref}\t{alt}\t{qual}\t{filter}\t{info}".format(
chrom=reference.idToFullName(var.refId),
pos=pos,
id=".",
ref=ref,
alt=alt,
qual=var.confidence,
filter="PASS",
info=info), file=self._vcfFile)
def close(self):
self._vcfFile.close()
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from __future__ import absolute_import
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
__all__ = ["loadCmpH5", "loadBam"]
import h5py, os.path
import os.path
from pbcore.io import AlignmentSet
......
#!/usr/bin/env python
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from __future__ import absolute_import
from __future__ import print_function
import argparse, atexit, cProfile, gc, glob, h5py, logging, multiprocessing
import os, pstats, random, shutil, tempfile, time, threading, Queue, traceback
import argparse, atexit, cProfile, gc, glob, logging, multiprocessing
import os, pstats, random, shutil, tempfile, time, threading, Queue, traceback, pprint
import functools
import re
import sys
......@@ -88,11 +58,6 @@ class ToolRunner(object):
elif name == "arrow":
from GenomicConsensus.arrow import arrow
algo = arrow
# All arrow models require PW except P6 and the first S/P1-C1
if set(peekFile.sequencingChemistry) - set(["P6-C4", "S/P1-C1/beta"]):
if (not peekFile.hasBaseFeature("Ipd") or
not peekFile.hasBaseFeature("PulseWidth")):
die("Model requires missing base feature: IPD or PulseWidth")
elif name == "poa":
from GenomicConsensus.poa import poa
algo = poa
......@@ -159,9 +124,7 @@ class ToolRunner(object):
def _loadReference(self, alnFile):
logging.info("Loading reference")
err = reference.loadFromFile(options.referenceFilename, alnFile)
if err:
die("Error loading reference")
reference.loadFromFile(options.referenceFilename, alnFile)
# Grok the referenceWindow spec, if any.
if options.referenceWindowsAsString is None:
options.referenceWindows = ()
......@@ -172,7 +135,9 @@ class ToolRunner(object):
try:
win = reference.stringToWindow(s)
options.referenceWindows.append(win)
except:
except Exception:
msg = traceback.format_exc()
logging.debug(msg)
pass
else:
options.referenceWindows = map(reference.stringToWindow,
......@@ -183,7 +148,7 @@ class ToolRunner(object):
def _checkFileCompatibility(self, alnFile):
if not alnFile.isSorted:
die("Input Alignment file must be sorted.")
if alnFile.isEmpty:
if alnFile.isCmpH5 and alnFile.isEmpty:
die("Input Alignment file must be nonempty.")
def _shouldDisableChunkCache(self, alnFile):
......@@ -275,7 +240,7 @@ class ToolRunner(object):
random.seed(42)
if options.pdb or options.pdbAtStartup:
print >>sys.stderr, "Process ID: %d" % os.getpid()
print("Process ID: %d" % os.getpid(), file=sys.stderr)
try:
import ipdb
except ImportError:
......@@ -287,8 +252,8 @@ class ToolRunner(object):
if options.pdbAtStartup:
ipdb.set_trace()
logging.info("h5py version: %s" % h5py.version.version)
logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
#logging.info("h5py version: %s" % h5py.version.version)
#logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
logging.info("ConsensusCore version: %s" %
(consensusCoreVersion() or "ConsensusCore unavailable"))
logging.info("ConsensusCore2 version: %s" %
......@@ -341,9 +306,10 @@ class ToolRunner(object):
else:
self._mainLoop()
except:
why = traceback.format_exc()
self.abortWork(why)
except BaseException as exc:
msg = 'options={}'.format(pprint.pformat(vars(options)))
logging.exception(msg)
self.abortWork(repr(exc))
monitoringThread.join()
......@@ -383,16 +349,22 @@ def args_runner(args):
options.__dict__.update(args.__dict__)
processOptions()
tr = ToolRunner()
return tr.main()
try:
return tr.main()
except Exception:
if options.notrace:
return -1
raise
def resolved_tool_contract_runner(resolved_contract):
rc = resolved_contract
alignment_path = rc.task.input_files[0]
reference_path = rc.task.input_files[1]
gff_path = rc.task.output_files[0]
dataset_path = rc.task.output_files[1]
vcf_path = rc.task.output_files[1]
dataset_path = rc.task.output_files[2]
fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
fastq_path = rc.task.output_files[2]
fastq_path = rc.task.output_files[3]
args = [
alignment_path,
"--verbose",
......@@ -400,14 +372,15 @@ def resolved_tool_contract_runner(resolved_contract):
"--outputFilename", gff_path,
"--outputFilename", fasta_path,
"--outputFilename", fastq_path,
"--outputFilename", vcf_path,
"--numWorkers", str(rc.task.nproc),
"--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
"--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
"--maskRadius", str(Constants.DEFAULT_MASK_RADIUS) if \
bool(rc.task.options[Constants.MASKING_ID]) else "0",
"--algorithm", rc.task.options[Constants.ALGORITHM_ID],
"--alignmentSetRefWindows",
]
if rc.task.options[Constants.DIPLOID_MODE_ID]:
args.append("--diploid")
args_ = get_parser().arg_parser.parser.parse_args(args)
rc = args_runner(args_)
if rc == 0:
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
#
......@@ -48,7 +18,7 @@
# and get the loaded options dictionary.
#
from __future__ import absolute_import
import argparse, h5py, os, os.path, sys, json
import argparse, os, os.path, sys, json
from pbcommand.models import FileTypes, SymbolTypes, get_pbparser
from pbcommand.common_options import (add_resolved_tool_contract_option,
......@@ -64,14 +34,14 @@ def consensusCoreVersion():
try:
import ConsensusCore
return ConsensusCore.Version.VersionString()
except:
except Exception:
return None
def consensusCore2Version():
try:
import ConsensusCore2
return ConsensusCore2.__version__
except:
except Exception:
return None
class Constants(object):
......@@ -80,8 +50,9 @@ class Constants(object):
ALGORITHM_ID = "genomic_consensus.task_options.algorithm"
MIN_CONFIDENCE_ID = "genomic_consensus.task_options.min_confidence"
MIN_COVERAGE_ID = "genomic_consensus.task_options.min_coverage"
DIPLOID_MODE_ID = "genomic_consensus.task_options.diploid"
MASKING_ID = "genomic_consensus.task_options.masking"
ALGORITHM_CHOICES = ("quiver", "arrow", "plurality", "poa", "best")
DEFAULT_ALGORITHM = "best"
DEFAULT_MIN_CONFIDENCE = 40
DEFAULT_MIN_COVERAGE = 5
......@@ -91,6 +62,8 @@ class Constants(object):
DEFAULT_MIN_HQREGIONSNR = 3.75
DEFAULT_MIN_ZSCORE = -3.5
DEFAULT_MIN_ACCURACY = 0.82
DEFAULT_MASK_RADIUS = 3
DEFAULT_MASK_ERROR_RATE = 0.7
def get_parser():
"""
......@@ -112,30 +85,35 @@ def get_parser():
tcp.add_input_file_type(FileTypes.DS_REF, "reference",
"Reference DataSet", "Fasta or Reference DataSet")
tcp.add_output_file_type(FileTypes.GFF, "variants",
name="Consensus GFF",
description="Consensus GFF",
name="Variant Calls",
description="List of variants from the reference",
default_name="variants")
tcp.add_output_file_type(FileTypes.VCF, "variants_vcf",
name="Variant Calls",
description="List of variants from the reference in VCF format",
default_name="variants")
tcp.add_output_file_type(FileTypes.DS_CONTIG, "consensus",
name="Consensus ContigSet",
description="Consensus sequence in Fasta format",
name="Consensus Contigs",
description="Consensus contigs dataset",
default_name="consensus")
tcp.add_output_file_type(FileTypes.FASTQ, "consensus_fastq",
name="Consensus fastq",
description="Consensus fastq",
name="Consensus Contigs",
description="Consensus contigs in FASTQ format",
default_name="consensus")
tcp.add_str(
tcp.add_choice_str(
option_id=Constants.ALGORITHM_ID,
option_str="algorithm",
default=Constants.DEFAULT_ALGORITHM,
name="Algorithm",
description="Variant calling algorithm")
description="Variant calling algorithm",
choices=Constants.ALGORITHM_CHOICES)
tcp.add_int(
option_id=Constants.MIN_CONFIDENCE_ID,
option_str="minConfidence",
default=Constants.DEFAULT_MIN_CONFIDENCE,
name="Minimum confidence",
description="The minimum confidence for a variant call to be output "+\
"to variants.gff")
"to variants.{gff,vcf}")
tcp.add_int(
option_id=Constants.MIN_COVERAGE_ID,
option_str="minCoverage",
......@@ -143,13 +121,14 @@ def get_parser():
name="Minimum coverage",
description="The minimum site coverage that must be achieved for " +\
"variant calls and consensus to be calculated for a site.")
tcp.add_boolean(
option_id=Constants.DIPLOID_MODE_ID,
option_str="diploid",
default=False,
name="Diploid mode (experimental)",
description="Enable detection of heterozygous variants (experimental)")
option_id=Constants.MASKING_ID,
option_str="masking",
default=True,
name="Masking",
description="During the polishing step omit regions of reads " +\
"that have low concordance with the template.")
add_options_to_argument_parser(p.arg_parser.parser)
return p
......@@ -162,7 +141,7 @@ def add_options_to_argument_parser(parser):
basics.add_argument(
"inputFilename",
type=canonicalizedFilePath,
help="The input cmp.h5 file")
help="The input cmp.h5 or BAM alignment file")
basics.add_argument(
"--referenceFilename", "--reference", "-r",
action="store",
......@@ -178,7 +157,7 @@ def add_options_to_argument_parser(parser):
action="append",
default=[],
help="The output filename(s), as a comma-separated list." + \
"Valid output formats are .fa/.fasta, .fq/.fastq, .gff")
"Valid output formats are .fa/.fasta, .fq/.fastq, .gff, .vcf")
parallelism = parser.add_argument_group("Parallelism")
parallelism.add_argument(
......@@ -195,7 +174,7 @@ def add_options_to_argument_parser(parser):
dest="minConfidence",
type=int,
default=Constants.DEFAULT_MIN_CONFIDENCE,
help="The minimum confidence for a variant call to be output to variants.gff")
help="The minimum confidence for a variant call to be output to variants.{gff,vcf}")
filtering.add_argument(
"--minCoverage", "-x",
action="store",
......@@ -315,15 +294,16 @@ def add_options_to_argument_parser(parser):
action="store",
dest="algorithm",
type=str,
choices=["quiver", "arrow", "plurality", "poa", "best"],
default="best")
choices=Constants.ALGORITHM_CHOICES,
default=Constants.DEFAULT_ALGORITHM)
algorithm.add_argument(
"--parametersFile", "-P",
dest="parametersFile",
type=str,
default=None,
help="Parameter set filename (QuiverParameters.ini), or directory D " + \
"such that either D/*/GenomicConsensus/QuiverParameters.ini, " + \
help="Parameter set filename (such as ArrowParameters.json or " + \
"QuiverParameters.ini), or directory D such that either " + \
"D/*/GenomicConsensus/QuiverParameters.ini, " + \
"or D/GenomicConsensus/QuiverParameters.ini, is found. In the " + \
"former case, the lexically largest path is chosen.")
algorithm.add_argument(
......@@ -335,10 +315,30 @@ def add_options_to_argument_parser(parser):
help="Name of parameter set (chemistry.model) to select from the " + \
"parameters file, or just the name of the chemistry, in which " + \
"case the best available model is chosen. Default is 'auto', " + \
"which selects the best parameter set from the cmp.h5")
"which selects the best parameter set from the alignment data")
algorithm.add_argument(
"--maskRadius",
dest="maskRadius",
type=int,
default=Constants.DEFAULT_MASK_RADIUS,
help="Radius of window to use when excluding local regions for " + \
"exceeding maskMinErrorRate, where 0 disables any filtering (arrow-only).")
algorithm.add_argument(
"--maskErrorRate",
dest="maskErrorRate",
type=float,
default=Constants.DEFAULT_MASK_ERROR_RATE,
help="Maximum local error rate before the local region defined by " + \
"maskRadius is excluded from polishing (arrow-only).")
debugging = parser.add_argument_group("Verbosity and debugging/profiling")
add_debug_option(debugging)
debugging.add_argument(
"--notrace",
action="store_true",
dest="notrace",
default=False,
help="Suppress stacktrace for exceptions (to simplify testing)")
debugging.add_argument(
"--pdbAtStartup",
action="store_true",
......@@ -365,6 +365,10 @@ def add_options_to_argument_parser(parser):
"--annotateGFF",
action="store_true",
help="Augment GFF variant records with additional information")
debugging.add_argument(
"--reportEffectiveCoverage",
action="store_true",
help="Additionally record the *post-filtering* coverage at variant sites")
advanced = parser.add_argument_group("Advanced configuration options")
advanced.add_argument(
......@@ -454,16 +458,17 @@ def processOptions():
parser = get_parser().arg_parser.parser
def checkInputFile(path):
if not os.path.isfile(path):
parser.error("Input file %s not found." % (path,))
parser.error("Input file {!r} not found.".format(path))
def checkOutputFile(path):
try:
f = open(path, "a")
f.close()
except:
parser.error("Output file %s cannot be written." % (path,))
except Exception:
parser.error("Output file {!r} cannot be written.".format(path))
options.gffOutputFilename = None
options.vcfOutputFilename = None
options.fastaOutputFilename = None
options.fastqOutputFilename = None
options.csvOutputFilename = None
......@@ -472,6 +477,7 @@ def processOptions():
for outputFilename in options.outputFilenames:
fmt = fileFormat(outputFilename)
if fmt == "GFF": options.gffOutputFilename = outputFilename
elif fmt == "VCF": options.vcfOutputFilename = outputFilename
elif fmt == "FASTA": options.fastaOutputFilename = outputFilename
elif fmt == "FASTQ": options.fastqOutputFilename = outputFilename
elif fmt == "CSV": options.csvOutputFilename = outputFilename
......
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from __future__ import absolute_import
......@@ -240,6 +210,10 @@ def _computeVariants(config,
assert len(alternateAlleleArray) == windowSize
assert len(alternateAlleleFrequency) == windowSize
# these are predecessor anchors, to support VCF output
refPrev = "N"
cssPrev = "N"
vars = []
for j in xrange(windowSize):
refPos = j + refStart
......@@ -266,7 +240,8 @@ def _computeVariants(config,
vs = varsFromRefAndReads(refId, refPos, refBase,
cssBases, altBases,
confidence=hetConf, coverage=cov,
frequency1=cssFreq, frequency2=altFreq)
frequency1=cssFreq, frequency2=altFreq,
refPrev=refPrev, readPrev=cssPrev)
vars = vars + vs
else:
......@@ -281,9 +256,14 @@ def _computeVariants(config,
vs = varsFromRefAndRead(refId, refPos, refBase, cssBases,
confidence=conf, coverage=cov,
frequency1=cssFreq)
frequency1=cssFreq,
refPrev=refPrev, readPrev=cssPrev)
vars = vars + vs
# if we have ref or css bases, update the anchors
refPrev = refBase if refBase else refPrev
cssPrev = cssBases[-1] if cssBases else cssPrev
if config.diploid:
vars = filter(_isSameLengthVariant, vars)
return sorted(vars)
......