Skip to content
GitLab
Explore
Sign in
Register
Commits on Source (5)
New upstream version 2.2.2
· 1211bfa6
Afif Elghraoui
authored
May 11, 2018
1211bfa6
Merge tag 'upstream/2.2.2'
· f95dec6e
Afif Elghraoui
authored
May 11, 2018
Upstream version 2.2.2
f95dec6e
Add README.source explaining how to select upstream snapshots for packaging
· 4d4c8a01
Afif Elghraoui
authored
May 11, 2018
4d4c8a01
Update changelog
· 4a506e15
Afif Elghraoui
authored
May 11, 2018
4a506e15
Merge branch 'master' of
ssh://salsa.debian.org/med-team/pbgenomicconsensus
· c380f08c
Afif Elghraoui
authored
May 11, 2018
c380f08c
Show whitespace changes
Inline
Side-by-side
.gitignore
View file @
c380f08c
...
...
@@ -11,3 +11,4 @@ dist/
TAGS
evidence_dump/
nosetests.xml
_deps/
GenomicConsensus/ResultCollector.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander, Jim Drake
import
cProfile
,
logging
,
os
.
path
,
sys
...
...
@@ -37,6 +7,7 @@ from collections import OrderedDict, defaultdict
from
.options
import
options
from
GenomicConsensus
import
reference
,
consensus
,
utils
,
windows
from
.io.VariantsGffWriter
import
VariantsGffWriter
from
.io.VariantsVcfWriter
import
VariantsVcfWriter
from
pbcore.io
import
FastaWriter
,
FastqWriter
class
ResultCollector
(
object
):
...
...
@@ -84,7 +55,10 @@ class ResultCollector(object):
self
.
consensusChunksByRefId
=
defaultdict
(
list
)
# open file writers
self
.
fastaWriter
=
self
.
fastqWriter
=
self
.
gffWriter
=
None
self
.
fastaWriter
=
None
self
.
fastqWriter
=
None
self
.
gffWriter
=
None
self
.
vcfWriter
=
None
if
options
.
fastaOutputFilename
:
self
.
fastaWriter
=
FastaWriter
(
options
.
fastaOutputFilename
)
if
options
.
fastqOutputFilename
:
...
...
@@ -93,6 +67,10 @@ class ResultCollector(object):
self
.
gffWriter
=
VariantsGffWriter
(
options
.
gffOutputFilename
,
vars
(
options
),
reference
.
byName
.
values
())
if
options
.
vcfOutputFilename
:
self
.
vcfWriter
=
VariantsVcfWriter
(
options
.
vcfOutputFilename
,
vars
(
options
),
reference
.
byName
.
values
())
def
onResult
(
self
,
result
):
window
,
cssAndVariants
=
result
...
...
@@ -105,6 +83,7 @@ class ResultCollector(object):
if
self
.
fastaWriter
:
self
.
fastaWriter
.
close
()
if
self
.
fastqWriter
:
self
.
fastqWriter
.
close
()
if
self
.
gffWriter
:
self
.
gffWriter
.
close
()
if
self
.
vcfWriter
:
self
.
vcfWriter
.
close
()
logging
.
info
(
"
Output files completed.
"
)
def
_recordNewResults
(
self
,
window
,
css
,
variants
):
...
...
@@ -122,8 +101,12 @@ class ResultCollector(object):
if
basesProcessed
==
requiredBases
:
# This contig is done, so we can dump to file and delete
# the data structures.
if
self
.
gffWriter
or
self
.
vcfWriter
:
variants
=
sorted
(
self
.
variantsByRefId
[
refId
])
if
self
.
gffWriter
:
self
.
gffWriter
.
writeVariants
(
sorted
(
self
.
variantsByRefId
[
refId
]))
self
.
gffWriter
.
writeVariants
(
variants
)
if
self
.
vcfWriter
:
self
.
vcfWriter
.
writeVariants
(
variants
)
del
self
.
variantsByRefId
[
refId
]
#
...
...
GenomicConsensus/Worker.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander, Jim Drake
import
cProfile
,
logging
,
os
.
path
...
...
GenomicConsensus/__init__.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander, David Seifert
# Author: David Alexander
__VERSION__
=
"
2.1.0
"
__VERSION__
=
'
2.2.2
'
# don't forget to update setup.py and doc/conf.py too
GenomicConsensus/algorithmSelection.py
View file @
c380f08c
#!/usr/bin/env python
#################################################################################
# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from
.utils
import
die
...
...
GenomicConsensus/arrow/__init__.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import
utils
import
model
from
__future__
import
absolute_import
from
.
import
utils
from
.
import
model
# import evidence
GenomicConsensus/arrow/arrow.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import
logging
,
os
.
path
...
...
@@ -52,7 +22,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
depthLimit
,
arrowConfig
):
"""
High-level routine for calling the consensus for a
window of the genome given a
cmp.h5
.
window of the genome given a
BAM file
.
Identifies the coverage contours of the window in order to
identify subintervals where a good consensus can be called.
...
...
@@ -112,17 +82,22 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
(
reference
.
windowToString
(
subWin
),
"
"
.
join
([
str
(
hit
.
readName
)
for
hit
in
alns
])))
alnsUsed
=
[]
if
options
.
reportEffectiveCoverage
else
None
css
=
U
.
consensusForAlignments
(
subWin
,
intRefSeq
,
clippedAlns
,
arrowConfig
)
arrowConfig
,
alnsUsed
=
alnsUsed
)
# Tabulate the coverage implied by these alignments, as
# well as the post-filtering ("effective") coverage
siteCoverage
=
U
.
coverageInWindow
(
subWin
,
alns
)
effectiveSiteCoverage
=
U
.
coverageInWindow
(
subWin
,
alnsUsed
)
if
options
.
reportEffectiveCoverage
else
None
variants_
=
U
.
variantsFromConsensus
(
subWin
,
windowRefSeq
,
css
.
sequence
,
css
.
confidence
,
s
iteCoverage
,
options
.
aligner
,
ai
=
None
)
variants_
,
newPureCss
=
U
.
variantsFromConsensus
(
subWin
,
windowRefSeq
,
css
.
sequence
,
css
.
confidence
,
siteCoverage
,
effectiveS
iteCoverage
,
options
.
aligner
,
ai
=
None
,
diploid
=
arrowConfig
.
polishDiploid
)
filteredVars
=
filterVariants
(
options
.
minCoverage
,
options
.
minConfidence
,
...
...
@@ -133,11 +108,18 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
variants
+=
filteredVars
# The nascent consensus sequence might contain ambiguous bases, these
# need to be removed as software in the wild cannot deal with such
# characters and we only use IUPAC for *internal* bookkeeping.
if
arrowConfig
.
polishDiploid
:
css
.
sequence
=
newPureCss
# Dump?
maybeDumpEvidence
=
\
((
options
.
dumpEvidence
==
"
all
"
)
or
(
options
.
dumpEvidence
==
"
outliers
"
)
or
(
options
.
dumpEvidence
==
"
variants
"
)
and
(
len
(
variants
)
>
0
))
(
options
.
dumpEvidence
==
"
variants
"
)
and
(
len
(
variants
)
>
0
)
and
css
.
hasEvidence
)
if
maybeDumpEvidence
:
refId
,
refStart
,
refEnd
=
subWin
refName
=
reference
.
idToName
(
refId
)
...
...
@@ -148,7 +130,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
ev
=
ArrowEvidence
.
fromConsensus
(
css
)
if
options
.
dumpEvidence
!=
"
outliers
"
:
ev
.
save
(
windowDirectory
)
elif
(
np
.
max
(
np
.
abs
(
ev
.
delta
)
)
>
20
):
elif
(
np
.
max
(
ev
.
delta
)
>
20
):
# Mathematically I don't think we should be seeing
# deltas > 6 in magnitude, but let's just restrict
# attention to truly bonkers outliers.
...
...
@@ -249,20 +231,40 @@ def configure(options, alnFile):
"
The Arrow algorithm requires a BAM file containing standard (non-CCS) reads.
"
)
if
options
.
diploid
:
logging
.
warn
(
"
Diploid analysis not yet supported under Arrow model.
"
)
logging
.
info
(
"
Diploid polishing in the Arrow model is in *BETA* mode.
\n
"
"
Any multi-base string that appears in annotation files
\n
"
"
is not phased!
"
)
# load parameters from file
if
options
.
parametersFile
:
logging
.
info
(
"
Loading model parameters from: ({0})
"
.
format
(
options
.
parametersFile
))
if
not
cc
.
LoadModels
(
options
.
parametersFile
):
die
(
"
Arrow: unable to load parameters from: ({0})
"
.
format
(
options
.
parametersFile
))
# test available chemistries
supp
=
set
(
cc
.
SupportedChemistries
())
logging
.
info
(
"
Found consensus models for: ({0})
"
.
format
(
"
,
"
.
join
(
sorted
(
supp
))))
used
=
set
(
alnFile
.
sequencingChemistry
)
used
=
set
(
[]
)
if
options
.
parametersSpec
!=
"
auto
"
:
used
=
set
([
options
.
parametersSpec
])
logging
.
info
(
"
Overriding model selection with: ({0})
"
.
format
(
options
.
parametersSpec
))
if
not
cc
.
OverrideModel
(
options
.
parametersSpec
):
die
(
"
Arrow: unable to override model with: ({0})
"
.
format
(
options
.
parametersSpec
))
used
.
add
(
options
.
parametersSpec
)
else
:
used
.
update
(
alnFile
.
sequencingChemistry
)
unsupp
=
used
-
supp
if
u
n
supp
:
if
u
sed
-
supp
:
die
(
"
Arrow: unsupported chemistries found: ({0})
"
.
format
(
"
,
"
.
join
(
sorted
(
unsupp
))))
# All arrow models require PW except P6 and the first S/P1-C1
for
readGroup
in
alnFile
.
readGroupTable
:
if
set
([
readGroup
[
"
SequencingChemistry
"
]])
-
set
([
"
P6-C4
"
,
"
S/P1-C1/beta
"
]):
if
(
"
Ipd
"
not
in
readGroup
[
"
BaseFeatures
"
]
or
"
PulseWidth
"
not
in
readGroup
[
"
BaseFeatures
"
]):
die
(
"
Arrow model requires missing base feature: IPD or PulseWidth
"
)
logging
.
info
(
"
Using consensus models for: ({0})
"
.
format
(
"
,
"
.
join
(
sorted
(
used
))))
return
M
.
ArrowConfig
(
minMapQV
=
options
.
minMapQV
,
...
...
@@ -272,8 +274,9 @@ def configure(options, alnFile):
minHqRegionSnr
=
options
.
minHqRegionSnr
,
minZScore
=
options
.
minZScore
,
minAccuracy
=
options
.
minAccuracy
,
chemistryOverride
=
(
None
if
options
.
parametersSpec
==
"
auto
"
else
options
.
parametersSpec
))
maskRadius
=
options
.
maskRadius
,
maskErrorRate
=
options
.
maskErrorRate
,
polishDiploid
=
options
.
diploid
)
def
slaveFactories
(
threaded
):
# By default we use slave processes. The tuple ordering is important.
...
...
GenomicConsensus/arrow/diploid.py
View file @
c380f08c
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
from
GenomicConsensus.arrow.utils
import
allSingleBaseMutations
from
GenomicConsensus.variants
import
Variant
from
GenomicConsensus.quiver.diploid
import
variantsFromAlignment
import
numpy
as
np
import
ConsensusCore2
as
cc
...
...
@@ -77,9 +49,9 @@ def packMuts(cssBase, mut1, mut2):
#
nonNullMut
=
mut1
or
mut2
start
=
nonNullMut
.
Start
()
mutType
=
nonNullMut
.
Type
newBase1
=
mut1
.
Base
if
mut1
else
cssBase
newBase2
=
mut2
.
Base
if
mut2
else
cssBase
mutType
=
nonNullMut
.
Type
()
newBase1
=
mut1
.
Base
s
()
if
mut1
else
cssBase
newBase2
=
mut2
.
Base
s
()
if
mut2
else
cssBase
newBasePacked
=
packIUPAC
((
newBase1
,
newBase2
))
return
cc
.
Mutation
(
mutType
,
start
,
newBasePacked
)
...
...
@@ -148,76 +120,3 @@ def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
refSequenceInWindow
,
cssSequenceInWindow
,
cssQvInWindow
,
siteCoverage
)
return
vars
def
variantsFromAlignment
(
refWindow
,
refSeq
,
cssSeq
,
cssQV
=
None
,
refCoverage
=
None
):
"""
Extract the variants implied by a pairwise alignment of cssSeq to
refSeq reference. If cssQV, refCoverage are provided, they will
be used to decorate the variants with those attributes.
Arguments:
- cssQV: QV array, same length as css
- refCoverage: coverage array, sample length as reference window
This is trickier than in the haploid case. We have to break out
diploid variants as single bases, in order to avoid implying
phase.
"""
variants
=
[]
refId
,
refStart
,
refEnd
=
refWindow
aln
=
cc
.
AlignAffineIupac
(
refSeq
,
cssSeq
);
alnTarget
=
aln
.
Target
()
alnQuery
=
aln
.
Query
()
assert
(
cssQV
is
None
)
==
(
refCoverage
is
None
)
# Both or none
assert
len
(
refSeq
)
==
refEnd
-
refStart
assert
cssQV
is
None
or
len
(
cssSeq
)
==
len
(
cssQV
)
assert
refCoverage
is
None
or
len
(
refSeq
)
==
len
(
refCoverage
)
transcript
=
[
X
if
(
Q
!=
"
N
"
and
T
!=
"
N
"
)
else
"
N
"
for
(
X
,
T
,
Q
)
in
zip
(
aln
.
Transcript
(),
alnTarget
,
alnQuery
)
]
variants
=
[]
runStart
=
-
1
runStartRefPos
=
None
runX
=
None
refPos
=
refStart
for
pos
,
(
X
,
T
,
Q
)
in
enumerate
(
zip
(
transcript
,
alnTarget
,
alnQuery
)):
if
X
!=
runX
or
isHeterozygote
(
Q
):
if
runStart
>=
0
and
runX
not
in
"
MN
"
:
# Package up the run and dump a variant
ref
=
alnTarget
[
runStart
:
pos
].
replace
(
"
-
"
,
""
)
read
=
alnQuery
[
runStart
:
pos
].
replace
(
"
-
"
,
""
)
if
isHeterozygote
(
read
):
allele1
,
allele2
=
unpackIUPAC
(
read
)
var
=
Variant
(
refId
,
runStartRefPos
,
refPos
,
ref
,
allele1
,
allele2
)
else
:
var
=
Variant
(
refId
,
runStartRefPos
,
refPos
,
ref
,
read
)
variants
.
append
(
var
)
runStart
=
pos
runStartRefPos
=
refPos
runX
=
X
if
T
!=
"
-
"
:
refPos
+=
1
# This might be better handled within the loop above, just keeping
# track of Qpos, Tpos
if
cssQV
is
not
None
:
cssPosition
=
cc
.
TargetToQueryPositions
(
aln
)
for
v
in
variants
:
# HACK ALERT: we are not really handling the confidence or
# coverage for variants at last position of the window
# correctly here.
refPos_
=
min
(
v
.
refStart
-
refStart
,
len
(
refCoverage
)
-
1
)
cssPos_
=
min
(
cssPosition
[
v
.
refStart
-
refStart
],
len
(
cssQV
)
-
1
)
if
refCoverage
is
not
None
:
v
.
coverage
=
refCoverage
[
refPos_
]
if
cssQV
is
not
None
:
v
.
confidence
=
cssQV
[
cssPos_
]
return
variants
GenomicConsensus/arrow/evidence.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
__all__
=
[
"
ArrowEvidence
"
]
import
h5py
,
logging
,
os
.
path
,
numpy
as
np
import
logging
,
os
.
path
,
numpy
as
np
from
collections
import
namedtuple
from
itertools
import
groupby
from
bisect
import
bisect_left
,
bisect_right
...
...
@@ -41,7 +11,9 @@ from .utils import scoreMatrix
from
..
import
reference
class
ArrowEvidence
(
object
):
"""
If used at runtime, this will require h5py
(for storing/loading
'
arrow-scores.h5
'
)
"""
Mutation
=
namedtuple
(
"
Mutation
"
,
(
"
Position
"
,
"
Type
"
,
"
FromBase
"
,
"
ToBase
"
))
@staticmethod
...
...
@@ -105,6 +77,7 @@ class ArrowEvidence(object):
refWindow
=
(
refName
,
refStart
,
refEnd
)
with
FastaReader
(
dir
+
"
/consensus.fa
"
)
as
fr
:
consensus
=
next
(
iter
(
fr
)).
sequence
import
h5py
with
h5py
.
File
(
dir
+
"
/arrow-scores.h5
"
,
"
r
"
)
as
f
:
scores
=
f
[
"
Scores
"
].
value
baselineScores
=
f
[
"
BaselineScores
"
].
value
...
...
@@ -131,7 +104,7 @@ class ArrowEvidence(object):
logging
.
info
(
"
Dumping evidence to %s
"
%
(
dir
,))
join
=
os
.
path
.
join
if
os
.
path
.
exists
(
dir
):
raise
Exception
,
"
Evidence dump does not expect directory %s to exist.
"
%
dir
raise
Exception
(
"
Evidence dump does not expect directory %s to exist.
"
%
dir
)
os
.
makedirs
(
dir
)
#refFasta = FastaWriter(join(dir, "reference.fa"))
#readsFasta = FastaWriter(join(dir, "reads.fa"))
...
...
@@ -143,6 +116,7 @@ class ArrowEvidence(object):
consensusFasta
.
writeRecord
(
windowName
+
"
|arrow
"
,
self
.
consensus
)
consensusFasta
.
close
()
import
h5py
arrowScoreFile
=
h5py
.
File
(
join
(
dir
,
"
arrow-scores.h5
"
))
arrowScoreFile
.
create_dataset
(
"
Scores
"
,
data
=
self
.
scores
)
vlen_str
=
h5py
.
special_dtype
(
vlen
=
str
)
...
...
@@ -150,6 +124,7 @@ class ArrowEvidence(object):
arrowScoreFile
.
create_dataset
(
"
ColumnNames
"
,
data
=
self
.
colNames
,
dtype
=
vlen_str
)
arrowScoreFile
.
create_dataset
(
"
BaselineScores
"
,
data
=
self
.
baselineScores
)
arrowScoreFile
.
close
()
# for aln in alns:
# readsFasta.writeRecord(str(aln.rowNumber),
# aln.read(orientation="genomic", aligned=False))
...
...
GenomicConsensus/arrow/model.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import
numpy
as
np
,
ConfigParser
,
collections
,
logging
...
...
@@ -66,7 +36,9 @@ class ArrowConfig(object):
minHqRegionSnr
=
3.75
,
minZScore
=-
3.5
,
minAccuracy
=
0.82
,
chemistryOverride
=
None
):
maskRadius
=
0
,
maskErrorRate
=
0.5
,
polishDiploid
=
False
):
self
.
minMapQV
=
minMapQV
self
.
minPoaCoverage
=
minPoaCoverage
...
...
@@ -81,7 +53,9 @@ class ArrowConfig(object):
self
.
minHqRegionSnr
=
minHqRegionSnr
self
.
minZScore
=
minZScore
self
.
minAccuracy
=
minAccuracy
self
.
chemistryOverride
=
chemistryOverride
self
.
maskRadius
=
maskRadius
self
.
maskErrorRate
=
maskErrorRate
self
.
polishDiploid
=
polishDiploid
def
extractMappedRead
(
self
,
aln
,
windowStart
):
"""
...
...
@@ -99,7 +73,7 @@ class ArrowConfig(object):
rawFeature
=
aln
.
baseFeature
(
featureName
,
aligned
=
False
,
orientation
=
"
native
"
)
return
rawFeature
.
clip
(
0
,
255
).
astype
(
np
.
uint8
)
else
:
return
np
.
zeros
((
aln
.
qLen
,),
dtype
=
np
.
uint8
)
return
np
.
zeros
((
aln
.
readLength
,),
dtype
=
np
.
uint8
)
name
=
aln
.
readName
chemistry
=
aln
.
sequencingChemistry
...
...
@@ -109,7 +83,7 @@ class ArrowConfig(object):
cc
.
Uint8Vector
(
baseFeature
(
"
Ipd
"
).
tolist
()),
cc
.
Uint8Vector
(
baseFeature
(
"
PulseWidth
"
).
tolist
()),
cc
.
SNR
(
aln
.
hqRegionSnr
),
chemistry
if
self
.
chemistryOverride
is
None
else
self
.
chemistryOverride
)
chemistry
)
return
cc
.
MappedRead
(
read
,
strand
,
int
(
aln
.
referenceStart
-
windowStart
),
...
...
GenomicConsensus/arrow/utils.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Authors: David Alexander, Lance Hepler
import
numpy
as
np
,
itertools
,
logging
,
re
,
sys
from
collections
import
Counter
from
traceback
import
format_exception
from
GenomicConsensus.variants
import
*
from
GenomicConsensus.utils
import
*
...
...
@@ -53,19 +22,15 @@ def uniqueSingleBaseMutations(templateSequence, positions=None):
# snvs
for
subsBase
in
allBases
:
if
subsBase
!=
tplBase
:
yield
cc
.
Mutation
(
cc
.
MutationType_SUBSTITUTION
,
tplStart
,
subsBase
)
yield
cc
.
Mutation_Substitution
(
tplStart
,
subsBase
)
# Insertions---only allowing insertions that are not cognate
# with the previous base.
for
insBase
in
allBases
:
if
insBase
!=
prevTplBase
:
yield
cc
.
Mutation
(
cc
.
MutationType_INSERTION
,
tplStart
,
insBase
)
yield
cc
.
Mutation_Insertion
(
tplStart
,
insBase
)
# Deletion--only allowed if refBase does not match previous tpl base
if
tplBase
!=
prevTplBase
:
yield
cc
.
Mutation
(
cc
.
MutationType_DELETION
,
tplStart
)
yield
cc
.
Mutation
_Deletion
(
tplStart
,
1
)
def
allSingleBaseMutations
(
templateSequence
,
positions
=
None
):
"""
...
...
@@ -79,16 +44,12 @@ def allSingleBaseMutations(templateSequence, positions=None):
# snvs
for
subsBase
in
allBases
:
if
subsBase
!=
tplBase
:
yield
cc
.
Mutation
(
cc
.
MutationType_SUBSTITUTION
,
tplStart
,
subsBase
)
yield
cc
.
Mutation_Substitution
(
tplStart
,
subsBase
)
# Insertions
for
insBase
in
allBases
:
yield
cc
.
Mutation
(
cc
.
MutationType_INSERTION
,
tplStart
,
insBase
)
yield
cc
.
Mutation_Insertion
(
tplStart
,
insBase
)
# Deletion
yield
cc
.
Mutation
(
cc
.
MutationType_DELETION
,
tplStart
)
yield
cc
.
Mutation
_Deletion
(
tplStart
,
1
)
def
nearbyMutations
(
mutations
,
tpl
,
neighborhoodSize
):
"""
...
...
@@ -124,14 +85,18 @@ def bestSubset(mutationsAndScores, separation):
return
output
def
refineConsensus
(
ai
,
arrowConfig
):
def
refineConsensus
(
ai
,
arrowConfig
,
polishDiploid
=
False
):
"""
Given a MultiReadMutationScorer, identify and apply favorable
template mutations. Return (consensus, didConverge) :: (str, bool)
"""
cfg
=
cc
.
PolishConfig
(
arrowConfig
.
maxIterations
,
arrowConfig
.
mutationSeparation
,
arrowConfig
.
mutationNeighborhood
)
arrowConfig
.
mutationNeighborhood
,
polishDiploid
)
if
arrowConfig
.
maskRadius
:
_
=
cc
.
Polish
(
ai
,
cfg
)
ai
.
MaskIntervals
(
arrowConfig
.
maskRadius
,
arrowConfig
.
maskErrorRate
)
polishResult
=
cc
.
Polish
(
ai
,
cfg
)
return
str
(
ai
),
polishResult
.
hasConverged
...
...
@@ -144,7 +109,50 @@ def consensusConfidence(ai, positions=None):
"""
return
np
.
array
(
np
.
clip
(
cc
.
ConsensusQualities
(
ai
),
0
,
93
),
dtype
=
np
.
uint8
)
def
variantsFromAlignment
(
a
,
refWindow
,
cssQvInWindow
=
None
,
siteCoverage
=
None
):
IUPACdict
=
{
'
N
'
:
(
'
N
'
),
'
n
'
:
(
'
n
'
),
'
A
'
:
(
'
A
'
),
'
a
'
:
(
'
a
'
),
'
C
'
:
(
'
C
'
),
'
c
'
:
(
'
c
'
),
'
G
'
:
(
'
G
'
),
'
g
'
:
(
'
g
'
),
'
T
'
:
(
'
T
'
),
'
t
'
:
(
'
t
'
),
'
M
'
:
(
'
A
'
,
'
C
'
),
'
m
'
:
(
'
a
'
,
'
c
'
),
'
R
'
:
(
'
A
'
,
'
G
'
),
'
r
'
:
(
'
a
'
,
'
g
'
),
'
W
'
:
(
'
A
'
,
'
T
'
),
'
w
'
:
(
'
a
'
,
'
t
'
),
'
S
'
:
(
'
C
'
,
'
G
'
),
'
s
'
:
(
'
c
'
,
'
g
'
),
'
Y
'
:
(
'
C
'
,
'
T
'
),
'
y
'
:
(
'
c
'
,
'
t
'
),
'
K
'
:
(
'
G
'
,
'
T
'
),
'
k
'
:
(
'
g
'
,
'
t
'
)}
def
splitupIUPAC
(
css
):
listSeq1
=
[
IUPACdict
[
x
][
0
]
for
x
in
css
]
listSeq2
=
[
IUPACdict
[
x
][
-
1
]
for
x
in
css
]
if
listSeq1
==
listSeq2
:
# haploid
readSeq1
=
''
.
join
(
listSeq1
)
readSeq2
=
None
freq1
=
None
freq2
=
None
else
:
# diploid
readSeq1
=
''
.
join
(
listSeq1
)
readSeq2
=
''
.
join
(
listSeq2
)
freq1
=
0.5
freq2
=
0.5
return
readSeq1
,
readSeq2
,
freq1
,
freq2
def
variantsFromAlignment
(
a
,
refWindow
,
cssQvInWindow
=
None
,
siteCoverage
=
None
,
effectiveSiteCoverage
=
None
):
"""
Extract the variants implied by a pairwise alignment to the
reference.
...
...
@@ -161,6 +169,10 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
grouper
=
lambda
row
:
"
N
"
if
(
row
[
1
]
==
"
N
"
or
row
[
2
]
==
"
N
"
)
else
row
[
0
]
runs
=
itertools
.
groupby
(
tbl
,
grouper
)
# track predecessor "anchor" base for vcf output of indel variants
refPrev
=
"
N
"
cssPrev
=
"
N
"
for
code
,
run
in
runs
:
assert
code
in
"
RIDMN
"
run
=
list
(
run
)
...
...
@@ -174,11 +186,20 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
pass
elif
code
==
"
R
"
:
assert
len
(
css
)
==
len
(
ref
)
variant
=
Variant
(
refId
,
refPos
,
refPos
+
len
(
css
),
ref
,
css
)
css
,
readSeq2
,
freq1
,
freq2
=
splitupIUPAC
(
css
)
variant
=
Variant
(
refId
=
refId
,
refStart
=
refPos
,
refEnd
=
refPos
+
len
(
css
),
refSeq
=
ref
,
readSeq1
=
css
,
readSeq2
=
readSeq2
,
frequency1
=
freq1
,
frequency2
=
freq2
,
refPrev
=
refPrev
,
readPrev
=
cssPrev
)
elif
code
==
"
I
"
:
variant
=
Variant
(
refId
,
refPos
,
refPos
,
""
,
css
)
css
,
readSeq2
,
freq1
,
freq2
=
splitupIUPAC
(
css
)
variant
=
Variant
(
refId
=
refId
,
refStart
=
refPos
,
refEnd
=
refPos
,
refSeq
=
""
,
readSeq1
=
css
,
readSeq2
=
readSeq2
,
frequency1
=
freq1
,
frequency2
=
freq2
,
refPrev
=
refPrev
,
readPrev
=
cssPrev
)
elif
code
==
"
D
"
:
variant
=
Variant
(
refId
,
refPos
,
refPos
+
len
(
ref
),
ref
,
""
)
variant
=
Variant
(
refId
,
refPos
,
refPos
+
len
(
ref
),
ref
,
""
,
refPrev
=
refPrev
,
readPrev
=
cssPrev
)
if
variant
is
not
None
:
# HACK ALERT: variants at the first and last position
...
...
@@ -186,6 +207,11 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
if
siteCoverage
is
not
None
and
np
.
size
(
siteCoverage
)
>
0
:
refPos_
=
min
(
refPos
-
refStart
,
len
(
siteCoverage
)
-
1
)
variant
.
coverage
=
siteCoverage
[
refPos_
]
if
effectiveSiteCoverage
is
not
None
and
np
.
size
(
effectiveSiteCoverage
)
>
0
:
refPos_
=
min
(
refPos
-
refStart
,
len
(
siteCoverage
)
-
1
)
variant
.
annotate
(
"
effectiveCoverage
"
,
effectiveSiteCoverage
[
refPos_
])
#import ipdb
#ipdb.set_trace()
if
cssQvInWindow
is
not
None
and
np
.
size
(
cssQvInWindow
)
>
0
:
cssPos_
=
min
(
cssPos
,
len
(
cssQvInWindow
)
-
1
)
variant
.
confidence
=
cssQvInWindow
[
cssPos_
]
...
...
@@ -193,6 +219,10 @@ def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
refPos
+=
refLen
cssPos
+=
cssLen
ref
=
ref
.
replace
(
"
-
"
,
""
)
css
=
css
.
replace
(
"
-
"
,
""
)
refPrev
=
ref
[
-
1
]
if
ref
else
refPrev
cssPrev
=
css
[
-
1
]
if
css
else
cssPrev
return
variants
...
...
@@ -231,12 +261,10 @@ def _shortMutationDescription(mut, tpl):
201 Sub C > T
201 Del C > .
"""
_type
=
_typeMap
[
mut
.
Type
]
_type
=
_typeMap
[
mut
.
Type
()
]
_pos
=
mut
.
Start
()
_oldBase
=
"
.
"
if
mut
.
Type
==
cc
.
MutationType_INSERTION
\
else
tpl
[
_pos
]
_newBase
=
"
.
"
if
mut
.
Type
==
cc
.
MutationType_DELETION
\
else
mut
.
Base
_oldBase
=
"
.
"
if
mut
.
IsInsertion
()
else
tpl
[
_pos
]
_newBase
=
"
.
"
if
mut
.
IsDeletion
()
else
mut
.
Bases
()
return
"
%d %s %s > %s
"
%
(
_pos
,
_type
,
_oldBase
,
_newBase
)
def
scoreMatrix
(
ai
):
...
...
@@ -266,16 +294,54 @@ def scoreMatrix(ai):
rowNames
=
readNames
return
(
rowNames
,
columnNames
,
baselineScores
,
scoreMatrix
)
def
constructIUPACfreeConsensus
(
a
):
targetAln
=
a
.
Target
()
queryAln
=
a
.
Query
()
assert
len
(
targetAln
)
==
len
(
queryAln
)
pureCss
=
[]
for
i
in
xrange
(
len
(
queryAln
)):
curBase
=
queryAln
[
i
]
if
curBase
!=
'
-
'
:
if
curBase
==
'
N
'
or
curBase
==
'
n
'
:
newBase
=
curBase
elif
targetAln
[
i
]
in
IUPACdict
[
curBase
]:
# construct new sequence with a
# minimum of divergence, i.e.
# target: A
# query: R (=A+G)
# -> new base = A
newBase
=
targetAln
[
i
]
else
:
newBase
=
IUPACdict
[
curBase
][
0
]
pureCss
.
append
(
newBase
)
newCss
=
''
.
join
(
pureCss
)
# Be absolutely sure that *really* all
# ambiguous bases have been removed.
for
i
in
(
'
M
'
,
'
m
'
,
'
R
'
,
'
r
'
,
'
W
'
,
'
w
'
,
'
S
'
,
'
s
'
,
'
Y
'
,
'
y
'
,
'
K
'
,
'
k
'
):
assert
newCss
.
find
(
i
)
==
-
1
return
newCss
def
variantsFromConsensus
(
refWindow
,
refSequenceInWindow
,
cssSequenceInWindow
,
cssQvInWindow
=
None
,
siteCoverage
=
None
,
aligner
=
"
affi
ne
"
,
a
i
=
Non
e
):
cssQvInWindow
=
None
,
siteCoverage
=
None
,
effectiveSiteCoverage
=
No
ne
,
a
ligner
=
"
affine
"
,
ai
=
None
,
diploid
=
Fals
e
):
"""
Compare the consensus and the reference in this window, returning
a list of variants.
"""
refId
,
refStart
,
refEnd
=
refWindow
if
diploid
:
align
=
cc
.
AlignAffineIupac
else
:
newCss
=
""
if
aligner
==
"
affine
"
:
align
=
cc
.
AlignAffine
else
:
...
...
@@ -283,7 +349,13 @@ def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
ga
=
align
(
refSequenceInWindow
,
cssSequenceInWindow
)
return
variantsFromAlignment
(
ga
,
refWindow
,
cssQvInWindow
,
siteCoverage
)
if
diploid
:
newCss
=
constructIUPACfreeConsensus
(
ga
)
# new de-IUPACed sequence still
# needs to be of same length
assert
(
len
(
newCss
)
==
len
(
cssSequenceInWindow
))
return
variantsFromAlignment
(
ga
,
refWindow
,
cssQvInWindow
,
siteCoverage
,
effectiveSiteCoverage
),
newCss
def
filterAlns
(
refWindow
,
alns
,
arrowConfig
):
...
...
@@ -328,17 +400,45 @@ def sufficientlyAccurate(mappedRead, poaCss, minAccuracy):
return
acc
>=
minAccuracy
def
consensusForAlignments
(
refWindow
,
refSequence
,
alns
,
arrowConfig
):
def
poaConsensus
(
fwdSequences
,
arrowConfig
):
seqLens
=
[
len
(
seq
)
for
seq
in
fwdSequences
]
median
=
np
.
median
(
seqLens
)
ordSeqs
=
sorted
(
fwdSequences
,
key
=
lambda
seq
:
abs
(
len
(
seq
)
-
median
))
ordSeqs
=
ordSeqs
[:
arrowConfig
.
maxPoaCoverage
]
cov
=
len
(
ordSeqs
)
minCov
=
1
if
cov
<
5
else
((
cov
+
1
)
/
2
-
1
)
poaConfig
=
cc
.
DefaultPoaConfig
(
cc
.
AlignMode_GLOBAL
)
return
cc
.
PoaConsensus
.
FindConsensus
(
ordSeqs
,
poaConfig
,
minCov
)
def
consensusForAlignments
(
refWindow
,
refSequence
,
alns
,
arrowConfig
,
draft
=
None
,
polish
=
True
,
alnsUsed
=
None
):
"""
Call consensus on this interval---without subdividing the interval
further.
Testable!
Returns an ArrowConsensus object.
Requires that clipping has already been done.
If `draft` is provided, it will serve as the starting
point for polishing. If not, the POA will be used to generate a
draft starting point.
Clipping has already been done!
If `polish` is False, the arrow polishing procedure will not be
used, and the draft consensus will be returned.
`alnsUsed` is an output parameter; if not None, it should be an
empty list on entry; on return from this function, the list will
contain the alns objects that were actually used to compute the
consensus (those not filtered out).
"""
_
,
refStart
,
refEnd
=
refWindow
if
alnsUsed
is
not
None
:
assert
alnsUsed
==
[]
if
draft
is
None
:
# Compute the POA consensus, which is our initial guess, and
# should typically be > 99.5% accurate
fwdSequences
=
[
a
.
read
(
orientation
=
"
genomic
"
,
aligned
=
False
)
...
...
@@ -347,14 +447,14 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
assert
len
(
fwdSequences
)
>=
arrowConfig
.
minPoaCoverage
try
:
p
=
cc
.
P
oaConsensus
.
FindConsensus
(
fwdSequences
[:
arrowConfig
.
maxPoaCoverage
]
)
except
:
p
=
p
oaConsensus
(
fwdSequences
,
arrowConfig
)
except
Exception
:
logging
.
info
(
"
%s: POA could not be generated
"
%
(
refWindow
,))
return
ArrowConsensus
.
noCallConsensus
(
arrowConfig
.
noEvidenceConsensus
,
refWindow
,
refSequence
)
ga
=
cc
.
Align
(
refSequence
,
p
.
Sequence
)
numPoaVariants
=
ga
.
Errors
()
poaCss
=
p
.
Sequence
draft
=
p
.
Sequence
ga
=
cc
.
Align
(
refSequence
,
draft
)
# Extract reads into ConsensusCore2-compatible objects, and map them into the
# coordinates relative to the POA consensus
...
...
@@ -364,15 +464,15 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
# Load the mapped reads into the mutation scorer, and iterate
# until convergence.
ai
=
cc
.
MultiMolecular
Integrator
(
poaCss
,
cc
.
IntegratorConfig
(
arrowConfig
.
minZScore
))
ai
=
cc
.
Integrator
(
draft
,
cc
.
IntegratorConfig
(
arrowConfig
.
minZScore
))
coverage
=
0
for
mr
in
mappedReads
:
for
i
,
mr
in
enumerate
(
mappedReads
)
:
if
(
mr
.
TemplateEnd
<=
mr
.
TemplateStart
or
mr
.
TemplateEnd
-
mr
.
TemplateStart
<
2
or
mr
.
Length
()
<
2
):
continue
if
not
sufficientlyAccurate
(
mr
,
poaCss
,
arrowConfig
.
minAccuracy
):
tpl
=
poaCss
[
mr
.
TemplateStart
:
mr
.
TemplateEnd
]
if
not
sufficientlyAccurate
(
mr
,
draft
,
arrowConfig
.
minAccuracy
):
tpl
=
draft
[
mr
.
TemplateStart
:
mr
.
TemplateEnd
]
if
mr
.
Strand
==
cc
.
StrandType_FORWARD
:
pass
elif
mr
.
Strand
==
cc
.
StrandType_REVERSE
:
...
...
@@ -383,35 +483,57 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
continue
if
ai
.
AddRead
(
mr
)
==
cc
.
State_VALID
:
coverage
+=
1
if
alnsUsed
is
not
None
:
alnsUsed
.
append
(
alns
[
i
])
# Iterate until covergence
if
coverage
<
arrowConfig
.
minPoaCoverage
:
logging
.
info
(
"
%s: Inadequate coverage to call consensus
"
%
(
refWindow
,))
return
ArrowConsensus
.
noCallConsensus
(
arrowConfig
.
noEvidenceConsensus
,
refWindow
,
refSequence
)
_
,
converged
=
refineConsensus
(
ai
,
arrowConfig
)
if
not
polish
:
confidence
=
np
.
zeros
(
len
(
draft
),
dtype
=
int
)
return
ArrowConsensus
(
refWindow
,
draft
,
confidence
,
ai
)
# Iterate until covergence
_
,
converged
=
refineConsensus
(
ai
,
arrowConfig
,
polishDiploid
=
False
)
if
converged
:
arrowCss
=
str
(
ai
)
if
arrowConfig
.
computeConfidence
:
confidence
=
consensusConfidence
(
ai
)
else
:
confidence
=
np
.
zeros
(
shape
=
len
(
arrowCss
),
dtype
=
int
)
return
ArrowConsensus
(
refWindow
,
arrowCss
,
confidence
,
ai
)
else
:
logging
.
info
(
"
%s: Arrow did not converge to MLE
"
%
(
refWindow
,))
return
ArrowConsensus
.
noCallConsensus
(
arrowConfig
.
noEvidenceConsensus
,
refWindow
,
refSequence
)
if
arrowConfig
.
polishDiploid
:
# additional rounds of diploid polishing
_
,
converged
=
refineConsensus
(
ai
,
arrowConfig
,
polishDiploid
=
True
)
if
converged
:
arrowCss
=
str
(
ai
)
if
arrowConfig
.
computeConfidence
:
confidence
=
consensusConfidence
(
ai
)
else
:
confidence
=
np
.
zeros
(
shape
=
len
(
arrowCss
),
dtype
=
int
)
else
:
logging
.
info
(
"
%s: Arrow (diploid) did not converge to optimal solution
"
%
(
refWindow
,))
return
ArrowConsensus
(
refWindow
,
arrowCss
,
confidence
,
ai
)
def
coverageInWindow
(
refWin
,
hits
):
winId
,
winStart
,
winEnd
=
refWin
a
=
np
.
array
([(
hit
.
referenceStart
,
hit
.
referenceEnd
)
for
hit
in
hits
if
hit
.
referenceName
==
winId
])
if
len
(
a
)
==
0
:
return
np
.
zeros
(
winEnd
-
winStart
,
dtype
=
np
.
uint
)
else
:
tStart
=
a
[:,
0
]
tEnd
=
a
[:,
1
]
cov
=
projectIntoRange
(
tStart
,
tEnd
,
winStart
,
winEnd
)
...
...
GenomicConsensus/consensus.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
import
numpy
as
np
...
...
@@ -85,6 +55,14 @@ class Consensus(object):
factory
=
d
[
noCallStyle
]
return
factory
(
refWin
,
refSequence
)
@property
def
hasEvidence
(
self
):
if
isinstance
(
self
,
ArrowConsensus
)
and
self
.
ai
is
None
:
return
False
if
isinstance
(
self
,
QuiverConsensus
)
and
self
.
mms
is
None
:
return
False
return
True
class
QuiverConsensus
(
Consensus
):
"""
...
...
@@ -142,7 +120,7 @@ def join(consensi):
assert
len
(
consensi
)
>=
1
sortedConsensi
=
sorted
(
consensi
)
if
not
areContiguous
([
cssChunk
.
refWindow
for
cssChunk
in
sortedConsensi
]):
raise
ValueError
,
"
Consensus chunks must be contiguous
"
raise
ValueError
(
"
Consensus chunks must be contiguous
"
)
joinedRefWindow
=
(
sortedConsensi
[
0
].
refWindow
[
0
],
sortedConsensi
[
0
].
refWindow
[
1
],
...
...
GenomicConsensus/io/VariantsGffWriter.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
import
time
...
...
@@ -46,9 +16,9 @@ def gffVariantFrequency(var):
if
var
.
frequency1
==
None
:
return
None
elif
var
.
isHeterozygous
:
return
"
%d/%d
"
%
(
var
.
frequency1
,
var
.
frequency2
)
return
"
{0:.3g}/{1:.3g}
"
.
format
(
var
.
frequency1
,
var
.
frequency2
)
else
:
return
str
(
var
.
frequency1
)
return
"
{0:.3g}
"
.
format
(
var
.
frequency1
)
def
toGffRecord
(
var
):
varType
=
var
.
variantType
...
...
GenomicConsensus/io/VariantsVcfWriter.py
0 → 100644
View file @
c380f08c
# Author: Lance Hepler
from
__future__
import
division
,
print_function
import
time
from
textwrap
import
dedent
from
GenomicConsensus
import
__VERSION__
,
reference
def
vcfVariantFrequency
(
var
,
labels
):
if
var
.
frequency1
is
None
:
return
None
elif
var
.
isHeterozygous
:
denom
=
var
.
frequency1
+
var
.
frequency2
names
=
[
'
frequency{}
'
.
format
(
label
)
for
label
in
labels
]
freqs
=
[
getattr
(
var
,
name
)
for
name
in
names
if
getattr
(
var
,
name
)
is
not
None
]
return
'
AF={}
'
.
format
(
'
,
'
.
join
(
'
{:.3g}
'
.
format
(
f
/
denom
)
for
f
in
freqs
))
else
:
# the frequency is 100%, so no need
return
None
class
VariantsVcfWriter
(
object
):
def
__init__
(
self
,
f
,
optionsDict
,
referenceEntries
):
self
.
_vcfFile
=
open
(
f
,
"
w
"
)
print
(
dedent
(
'''
\
##fileformat=VCFv4.3
##fileDate={date}
##source=GenomicConsensusV{version}
##reference={reference}
'''
).
format
(
date
=
time
.
strftime
(
"
%Y%m%d
"
),
version
=
__VERSION__
,
reference
=
"
file://
"
+
optionsDict
[
"
referenceFilename
"
],
),
file
=
self
.
_vcfFile
)
# reference contigs
for
entry
in
referenceEntries
:
print
(
"
##contig=<ID={name},length={length}>
"
.
format
(
name
=
entry
.
name
,
length
=
entry
.
length
# TODO(lhepler): evaluate adding md5 hexdigest here on large genomes
),
file
=
self
.
_vcfFile
)
print
(
"
#CHROM
\t
POS
\t
ID
\t
REF
\t
ALT
\t
QUAL
\t
FILTER
\t
INFO
"
,
file
=
self
.
_vcfFile
)
def
writeVariants
(
self
,
variants
):
for
var
in
variants
:
pos
=
var
.
refStart
ref
=
""
alt
=
""
labels
=
(
1
,
2
)
# insertion or deletion
if
var
.
refSeq
==
""
or
var
.
readSeq1
==
""
or
\
(
var
.
isHeterozygous
and
var
.
readSeq2
==
""
):
# we're anchored on the previous base so no 0- to 1-indexing
# correction required
ref
=
var
.
refPrev
+
var
.
refSeq
if
var
.
isHeterozygous
:
alt
=
"
,
"
.
join
(
var
.
readPrev
+
seq
for
seq
in
(
var
.
readSeq1
,
var
.
readSeq2
))
else
:
alt
=
var
.
readPrev
+
var
.
readSeq1
# substitution
else
:
# due to 1-indexing, pos needs to be incremented
pos
+=
1
ref
=
var
.
refSeq
if
var
.
isHeterozygous
:
alt
=
"
,
"
.
join
(
seq
for
seq
in
(
var
.
readSeq1
,
var
.
readSeq2
))
if
var
.
refSeq
==
var
.
readSeq1
:
# first variant is same as wildtype
alt
=
var
.
readSeq2
labels
=
(
2
,)
elif
var
.
refSeq
==
var
.
readSeq2
:
# second variant is same as wildtype
alt
=
var
.
readSeq1
labels
=
(
1
,)
else
:
# both variants differ from wildtype
alt
=
"
,
"
.
join
(
seq
for
seq
in
(
var
.
readSeq1
,
var
.
readSeq2
))
else
:
alt
=
var
.
readSeq1
freq
=
vcfVariantFrequency
(
var
=
var
,
labels
=
labels
)
info
=
"
DP={0}
"
.
format
(
var
.
coverage
)
if
freq
:
info
=
info
+
"
;
"
+
freq
print
(
"
{chrom}
\t
{pos}
\t
{id}
\t
{ref}
\t
{alt}
\t
{qual}
\t
{filter}
\t
{info}
"
.
format
(
chrom
=
reference
.
idToFullName
(
var
.
refId
),
pos
=
pos
,
id
=
"
.
"
,
ref
=
ref
,
alt
=
alt
,
qual
=
var
.
confidence
,
filter
=
"
PASS
"
,
info
=
info
),
file
=
self
.
_vcfFile
)
def
close
(
self
):
self
.
_vcfFile
.
close
()
GenomicConsensus/io/__init__.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from
__future__
import
absolute_import
...
...
GenomicConsensus/io/utils.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
__all__
=
[
"
loadCmpH5
"
,
"
loadBam
"
]
import
h5py
,
os
.
path
import
os.path
from
pbcore.io
import
AlignmentSet
...
...
GenomicConsensus/main.py
View file @
c380f08c
#!/usr/bin/env python
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from
__future__
import
absolute_import
from
__future__
import
print_function
import
argparse
,
atexit
,
cProfile
,
gc
,
glob
,
h5py
,
logging
,
multiprocessing
import
os
,
pstats
,
random
,
shutil
,
tempfile
,
time
,
threading
,
Queue
,
traceback
import
argparse
,
atexit
,
cProfile
,
gc
,
glob
,
logging
,
multiprocessing
import
os
,
pstats
,
random
,
shutil
,
tempfile
,
time
,
threading
,
Queue
,
traceback
,
pprint
import
functools
import
re
import
sys
...
...
@@ -88,11 +58,6 @@ class ToolRunner(object):
elif
name
==
"
arrow
"
:
from
GenomicConsensus.arrow
import
arrow
algo
=
arrow
# All arrow models require PW except P6 and the first S/P1-C1
if
set
(
peekFile
.
sequencingChemistry
)
-
set
([
"
P6-C4
"
,
"
S/P1-C1/beta
"
]):
if
(
not
peekFile
.
hasBaseFeature
(
"
Ipd
"
)
or
not
peekFile
.
hasBaseFeature
(
"
PulseWidth
"
)):
die
(
"
Model requires missing base feature: IPD or PulseWidth
"
)
elif
name
==
"
poa
"
:
from
GenomicConsensus.poa
import
poa
algo
=
poa
...
...
@@ -159,9 +124,7 @@ class ToolRunner(object):
def
_loadReference
(
self
,
alnFile
):
logging
.
info
(
"
Loading reference
"
)
err
=
reference
.
loadFromFile
(
options
.
referenceFilename
,
alnFile
)
if
err
:
die
(
"
Error loading reference
"
)
reference
.
loadFromFile
(
options
.
referenceFilename
,
alnFile
)
# Grok the referenceWindow spec, if any.
if
options
.
referenceWindowsAsString
is
None
:
options
.
referenceWindows
=
()
...
...
@@ -172,7 +135,9 @@ class ToolRunner(object):
try
:
win
=
reference
.
stringToWindow
(
s
)
options
.
referenceWindows
.
append
(
win
)
except
:
except
Exception
:
msg
=
traceback
.
format_exc
()
logging
.
debug
(
msg
)
pass
else
:
options
.
referenceWindows
=
map
(
reference
.
stringToWindow
,
...
...
@@ -183,7 +148,7 @@ class ToolRunner(object):
def
_checkFileCompatibility
(
self
,
alnFile
):
if
not
alnFile
.
isSorted
:
die
(
"
Input Alignment file must be sorted.
"
)
if
alnFile
.
isEmpty
:
if
alnFile
.
isCmpH5
and
alnFile
.
isEmpty
:
die
(
"
Input Alignment file must be nonempty.
"
)
def
_shouldDisableChunkCache
(
self
,
alnFile
):
...
...
@@ -275,7 +240,7 @@ class ToolRunner(object):
random
.
seed
(
42
)
if
options
.
pdb
or
options
.
pdbAtStartup
:
print
>>
sys
.
stderr
,
"
Process ID: %d
"
%
os
.
getpid
()
print
(
"
Process ID: %d
"
%
os
.
getpid
()
,
file
=
sys
.
stderr
)
try
:
import
ipdb
except
ImportError
:
...
...
@@ -287,8 +252,8 @@ class ToolRunner(object):
if
options
.
pdbAtStartup
:
ipdb
.
set_trace
()
logging
.
info
(
"
h5py version: %s
"
%
h5py
.
version
.
version
)
logging
.
info
(
"
hdf5 version: %s
"
%
h5py
.
version
.
hdf5_version
)
#
logging.info("h5py version: %s" % h5py.version.version)
#
logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
logging
.
info
(
"
ConsensusCore version: %s
"
%
(
consensusCoreVersion
()
or
"
ConsensusCore unavailable
"
))
logging
.
info
(
"
ConsensusCore2 version: %s
"
%
...
...
@@ -341,9 +306,10 @@ class ToolRunner(object):
else
:
self
.
_mainLoop
()
except
:
why
=
traceback
.
format_exc
()
self
.
abortWork
(
why
)
except
BaseException
as
exc
:
msg
=
'
options={}
'
.
format
(
pprint
.
pformat
(
vars
(
options
)))
logging
.
exception
(
msg
)
self
.
abortWork
(
repr
(
exc
))
monitoringThread
.
join
()
...
...
@@ -383,16 +349,22 @@ def args_runner(args):
options
.
__dict__
.
update
(
args
.
__dict__
)
processOptions
()
tr
=
ToolRunner
()
try
:
return
tr
.
main
()
except
Exception
:
if
options
.
notrace
:
return
-
1
raise
def
resolved_tool_contract_runner
(
resolved_contract
):
rc
=
resolved_contract
alignment_path
=
rc
.
task
.
input_files
[
0
]
reference_path
=
rc
.
task
.
input_files
[
1
]
gff_path
=
rc
.
task
.
output_files
[
0
]
dataset_path
=
rc
.
task
.
output_files
[
1
]
vcf_path
=
rc
.
task
.
output_files
[
1
]
dataset_path
=
rc
.
task
.
output_files
[
2
]
fasta_path
=
re
.
sub
(
"
.contigset.xml
"
,
"
.fasta
"
,
dataset_path
)
fastq_path
=
rc
.
task
.
output_files
[
2
]
fastq_path
=
rc
.
task
.
output_files
[
3
]
args
=
[
alignment_path
,
"
--verbose
"
,
...
...
@@ -400,14 +372,15 @@ def resolved_tool_contract_runner(resolved_contract):
"
--outputFilename
"
,
gff_path
,
"
--outputFilename
"
,
fasta_path
,
"
--outputFilename
"
,
fastq_path
,
"
--outputFilename
"
,
vcf_path
,
"
--numWorkers
"
,
str
(
rc
.
task
.
nproc
),
"
--minCoverage
"
,
str
(
rc
.
task
.
options
[
Constants
.
MIN_COVERAGE_ID
]),
"
--minConfidence
"
,
str
(
rc
.
task
.
options
[
Constants
.
MIN_CONFIDENCE_ID
]),
"
--maskRadius
"
,
str
(
Constants
.
DEFAULT_MASK_RADIUS
)
if
\
bool
(
rc
.
task
.
options
[
Constants
.
MASKING_ID
])
else
"
0
"
,
"
--algorithm
"
,
rc
.
task
.
options
[
Constants
.
ALGORITHM_ID
],
"
--alignmentSetRefWindows
"
,
]
if
rc
.
task
.
options
[
Constants
.
DIPLOID_MODE_ID
]:
args
.
append
(
"
--diploid
"
)
args_
=
get_parser
().
arg_parser
.
parser
.
parse_args
(
args
)
rc
=
args_runner
(
args_
)
if
rc
==
0
:
...
...
GenomicConsensus/options.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
#
...
...
@@ -48,7 +18,7 @@
# and get the loaded options dictionary.
#
from
__future__
import
absolute_import
import
argparse
,
h5py
,
os
,
os
.
path
,
sys
,
json
import
argparse
,
os
,
os
.
path
,
sys
,
json
from
pbcommand.models
import
FileTypes
,
SymbolTypes
,
get_pbparser
from
pbcommand.common_options
import
(
add_resolved_tool_contract_option
,
...
...
@@ -64,14 +34,14 @@ def consensusCoreVersion():
try
:
import
ConsensusCore
return
ConsensusCore
.
Version
.
VersionString
()
except
:
except
Exception
:
return
None
def
consensusCore2Version
():
try
:
import
ConsensusCore2
return
ConsensusCore2
.
__version__
except
:
except
Exception
:
return
None
class
Constants
(
object
):
...
...
@@ -80,8 +50,9 @@ class Constants(object):
ALGORITHM_ID
=
"
genomic_consensus.task_options.algorithm
"
MIN_CONFIDENCE_ID
=
"
genomic_consensus.task_options.min_confidence
"
MIN_COVERAGE_ID
=
"
genomic_consensus.task_options.min_coverage
"
DIPLOID_MODE
_ID
=
"
genomic_consensus.task_options.
diploid
"
MASKING
_ID
=
"
genomic_consensus.task_options.
masking
"
ALGORITHM_CHOICES
=
(
"
quiver
"
,
"
arrow
"
,
"
plurality
"
,
"
poa
"
,
"
best
"
)
DEFAULT_ALGORITHM
=
"
best
"
DEFAULT_MIN_CONFIDENCE
=
40
DEFAULT_MIN_COVERAGE
=
5
...
...
@@ -91,6 +62,8 @@ class Constants(object):
DEFAULT_MIN_HQREGIONSNR
=
3.75
DEFAULT_MIN_ZSCORE
=
-
3.5
DEFAULT_MIN_ACCURACY
=
0.82
DEFAULT_MASK_RADIUS
=
3
DEFAULT_MASK_ERROR_RATE
=
0.7
def
get_parser
():
"""
...
...
@@ -112,30 +85,35 @@ def get_parser():
tcp
.
add_input_file_type
(
FileTypes
.
DS_REF
,
"
reference
"
,
"
Reference DataSet
"
,
"
Fasta or Reference DataSet
"
)
tcp
.
add_output_file_type
(
FileTypes
.
GFF
,
"
variants
"
,
name
=
"
Consensus GFF
"
,
description
=
"
Consensus GFF
"
,
name
=
"
Variant Calls
"
,
description
=
"
List of variants from the reference
"
,
default_name
=
"
variants
"
)
tcp
.
add_output_file_type
(
FileTypes
.
VCF
,
"
variants_vcf
"
,
name
=
"
Variant Calls
"
,
description
=
"
List of variants from the reference in VCF format
"
,
default_name
=
"
variants
"
)
tcp
.
add_output_file_type
(
FileTypes
.
DS_CONTIG
,
"
consensus
"
,
name
=
"
Consensus Contig
Set
"
,
description
=
"
Consensus
sequence in Fasta forma
t
"
,
name
=
"
Consensus Contig
s
"
,
description
=
"
Consensus
contigs datase
t
"
,
default_name
=
"
consensus
"
)
tcp
.
add_output_file_type
(
FileTypes
.
FASTQ
,
"
consensus_fastq
"
,
name
=
"
Consensus
fastq
"
,
description
=
"
Consensus
fastq
"
,
name
=
"
Consensus
Contigs
"
,
description
=
"
Consensus
contigs in FASTQ format
"
,
default_name
=
"
consensus
"
)
tcp
.
add_str
(
tcp
.
add_
choice_
str
(
option_id
=
Constants
.
ALGORITHM_ID
,
option_str
=
"
algorithm
"
,
default
=
Constants
.
DEFAULT_ALGORITHM
,
name
=
"
Algorithm
"
,
description
=
"
Variant calling algorithm
"
)
description
=
"
Variant calling algorithm
"
,
choices
=
Constants
.
ALGORITHM_CHOICES
)
tcp
.
add_int
(
option_id
=
Constants
.
MIN_CONFIDENCE_ID
,
option_str
=
"
minConfidence
"
,
default
=
Constants
.
DEFAULT_MIN_CONFIDENCE
,
name
=
"
Minimum confidence
"
,
description
=
"
The minimum confidence for a variant call to be output
"
+
\
"
to variants.gff
"
)
"
to variants.
{
gff
,vcf}
"
)
tcp
.
add_int
(
option_id
=
Constants
.
MIN_COVERAGE_ID
,
option_str
=
"
minCoverage
"
,
...
...
@@ -143,13 +121,14 @@ def get_parser():
name
=
"
Minimum coverage
"
,
description
=
"
The minimum site coverage that must be achieved for
"
+
\
"
variant calls and consensus to be calculated for a site.
"
)
tcp
.
add_boolean
(
option_id
=
Constants
.
DIPLOID_MODE_ID
,
option_str
=
"
diploid
"
,
default
=
False
,
name
=
"
Diploid mode (experimental)
"
,
description
=
"
Enable detection of heterozygous variants (experimental)
"
)
option_id
=
Constants
.
MASKING_ID
,
option_str
=
"
masking
"
,
default
=
True
,
name
=
"
Masking
"
,
description
=
"
During the polishing step omit regions of reads
"
+
\
"
that have low concordance with the template.
"
)
add_options_to_argument_parser
(
p
.
arg_parser
.
parser
)
return
p
...
...
@@ -162,7 +141,7 @@ def add_options_to_argument_parser(parser):
basics
.
add_argument
(
"
inputFilename
"
,
type
=
canonicalizedFilePath
,
help
=
"
The input cmp.h5 file
"
)
help
=
"
The input cmp.h5
or BAM alignment
file
"
)
basics
.
add_argument
(
"
--referenceFilename
"
,
"
--reference
"
,
"
-r
"
,
action
=
"
store
"
,
...
...
@@ -178,7 +157,7 @@ def add_options_to_argument_parser(parser):
action
=
"
append
"
,
default
=
[],
help
=
"
The output filename(s), as a comma-separated list.
"
+
\
"
Valid output formats are .fa/.fasta, .fq/.fastq, .gff
"
)
"
Valid output formats are .fa/.fasta, .fq/.fastq, .gff
, .vcf
"
)
parallelism
=
parser
.
add_argument_group
(
"
Parallelism
"
)
parallelism
.
add_argument
(
...
...
@@ -195,7 +174,7 @@ def add_options_to_argument_parser(parser):
dest
=
"
minConfidence
"
,
type
=
int
,
default
=
Constants
.
DEFAULT_MIN_CONFIDENCE
,
help
=
"
The minimum confidence for a variant call to be output to variants.gff
"
)
help
=
"
The minimum confidence for a variant call to be output to variants.
{
gff
,vcf}
"
)
filtering
.
add_argument
(
"
--minCoverage
"
,
"
-x
"
,
action
=
"
store
"
,
...
...
@@ -315,15 +294,16 @@ def add_options_to_argument_parser(parser):
action
=
"
store
"
,
dest
=
"
algorithm
"
,
type
=
str
,
choices
=
[
"
quiver
"
,
"
arrow
"
,
"
plurality
"
,
"
poa
"
,
"
best
"
]
,
default
=
"
best
"
)
choices
=
Constants
.
ALGORITHM_CHOICES
,
default
=
Constants
.
DEFAULT_ALGORITHM
)
algorithm
.
add_argument
(
"
--parametersFile
"
,
"
-P
"
,
dest
=
"
parametersFile
"
,
type
=
str
,
default
=
None
,
help
=
"
Parameter set filename (QuiverParameters.ini), or directory D
"
+
\
"
such that either D/*/GenomicConsensus/QuiverParameters.ini,
"
+
\
help
=
"
Parameter set filename (such as ArrowParameters.json or
"
+
\
"
QuiverParameters.ini), or directory D such that either
"
+
\
"
D/*/GenomicConsensus/QuiverParameters.ini,
"
+
\
"
or D/GenomicConsensus/QuiverParameters.ini, is found. In the
"
+
\
"
former case, the lexically largest path is chosen.
"
)
algorithm
.
add_argument
(
...
...
@@ -335,10 +315,30 @@ def add_options_to_argument_parser(parser):
help
=
"
Name of parameter set (chemistry.model) to select from the
"
+
\
"
parameters file, or just the name of the chemistry, in which
"
+
\
"
case the best available model is chosen. Default is
'
auto
'
,
"
+
\
"
which selects the best parameter set from the cmp.h5
"
)
"
which selects the best parameter set from the alignment data
"
)
algorithm
.
add_argument
(
"
--maskRadius
"
,
dest
=
"
maskRadius
"
,
type
=
int
,
default
=
Constants
.
DEFAULT_MASK_RADIUS
,
help
=
"
Radius of window to use when excluding local regions for
"
+
\
"
exceeding maskMinErrorRate, where 0 disables any filtering (arrow-only).
"
)
algorithm
.
add_argument
(
"
--maskErrorRate
"
,
dest
=
"
maskErrorRate
"
,
type
=
float
,
default
=
Constants
.
DEFAULT_MASK_ERROR_RATE
,
help
=
"
Maximum local error rate before the local region defined by
"
+
\
"
maskRadius is excluded from polishing (arrow-only).
"
)
debugging
=
parser
.
add_argument_group
(
"
Verbosity and debugging/profiling
"
)
add_debug_option
(
debugging
)
debugging
.
add_argument
(
"
--notrace
"
,
action
=
"
store_true
"
,
dest
=
"
notrace
"
,
default
=
False
,
help
=
"
Suppress stacktrace for exceptions (to simplify testing)
"
)
debugging
.
add_argument
(
"
--pdbAtStartup
"
,
action
=
"
store_true
"
,
...
...
@@ -365,6 +365,10 @@ def add_options_to_argument_parser(parser):
"
--annotateGFF
"
,
action
=
"
store_true
"
,
help
=
"
Augment GFF variant records with additional information
"
)
debugging
.
add_argument
(
"
--reportEffectiveCoverage
"
,
action
=
"
store_true
"
,
help
=
"
Additionally record the *post-filtering* coverage at variant sites
"
)
advanced
=
parser
.
add_argument_group
(
"
Advanced configuration options
"
)
advanced
.
add_argument
(
...
...
@@ -454,16 +458,17 @@ def processOptions():
parser
=
get_parser
().
arg_parser
.
parser
def
checkInputFile
(
path
):
if
not
os
.
path
.
isfile
(
path
):
parser
.
error
(
"
Input file
%s
not found.
"
%
(
path
,
))
parser
.
error
(
"
Input file
{!r}
not found.
"
.
format
(
path
))
def
checkOutputFile
(
path
):
try
:
f
=
open
(
path
,
"
a
"
)
f
.
close
()
except
:
parser
.
error
(
"
Output file
%s
cannot be written.
"
%
(
path
,
))
except
Exception
:
parser
.
error
(
"
Output file
{!r}
cannot be written.
"
.
format
(
path
))
options
.
gffOutputFilename
=
None
options
.
vcfOutputFilename
=
None
options
.
fastaOutputFilename
=
None
options
.
fastqOutputFilename
=
None
options
.
csvOutputFilename
=
None
...
...
@@ -472,6 +477,7 @@ def processOptions():
for
outputFilename
in
options
.
outputFilenames
:
fmt
=
fileFormat
(
outputFilename
)
if
fmt
==
"
GFF
"
:
options
.
gffOutputFilename
=
outputFilename
elif
fmt
==
"
VCF
"
:
options
.
vcfOutputFilename
=
outputFilename
elif
fmt
==
"
FASTA
"
:
options
.
fastaOutputFilename
=
outputFilename
elif
fmt
==
"
FASTQ
"
:
options
.
fastqOutputFilename
=
outputFilename
elif
fmt
==
"
CSV
"
:
options
.
csvOutputFilename
=
outputFilename
...
...
GenomicConsensus/plurality/__init__.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
GenomicConsensus/plurality/plurality.py
View file @
c380f08c
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from
__future__
import
absolute_import
...
...
@@ -240,6 +210,10 @@ def _computeVariants(config,
assert
len
(
alternateAlleleArray
)
==
windowSize
assert
len
(
alternateAlleleFrequency
)
==
windowSize
# these are predecessor anchors, to support VCF output
refPrev
=
"
N
"
cssPrev
=
"
N
"
vars
=
[]
for
j
in
xrange
(
windowSize
):
refPos
=
j
+
refStart
...
...
@@ -266,7 +240,8 @@ def _computeVariants(config,
vs
=
varsFromRefAndReads
(
refId
,
refPos
,
refBase
,
cssBases
,
altBases
,
confidence
=
hetConf
,
coverage
=
cov
,
frequency1
=
cssFreq
,
frequency2
=
altFreq
)
frequency1
=
cssFreq
,
frequency2
=
altFreq
,
refPrev
=
refPrev
,
readPrev
=
cssPrev
)
vars
=
vars
+
vs
else
:
...
...
@@ -281,9 +256,14 @@ def _computeVariants(config,
vs
=
varsFromRefAndRead
(
refId
,
refPos
,
refBase
,
cssBases
,
confidence
=
conf
,
coverage
=
cov
,
frequency1
=
cssFreq
)
frequency1
=
cssFreq
,
refPrev
=
refPrev
,
readPrev
=
cssPrev
)
vars
=
vars
+
vs
# if we have ref or css bases, update the anchors
refPrev
=
refBase
if
refBase
else
refPrev
cssPrev
=
cssBases
[
-
1
]
if
cssBases
else
cssPrev
if
config
.
diploid
:
vars
=
filter
(
_isSameLengthVariant
,
vars
)
return
sorted
(
vars
)
...
...
Prev
1
2
3
4
5
6
Next