Skip to content
Commits on Source (18)
# Author: David Alexander, Jim Drake
from __future__ import absolute_import, division, print_function
import cProfile, logging, os.path, sys
from multiprocessing import Process
......
# Author: David Alexander, Jim Drake
from __future__ import absolute_import, division, print_function
import cProfile, logging, os.path
from multiprocessing import Process
......
# Author: David Alexander, David Seifert
from __future__ import absolute_import, division, print_function
__VERSION__ = '2.2.2' # don't forget to update setup.py and doc/conf.py too
__VERSION__ = '2.3.2' # don't forget to update setup.py and doc/conf.py too
# Author: David Alexander
from __future__ import absolute_import, division, print_function
from .utils import die
......
# Authors: David Alexander, Lance Hepler
from __future__ import absolute_import
from __future__ import absolute_import, division, print_function
from . import utils
from . import model
# import evidence
# Authors: David Alexander, Lance Hepler
from __future__ import absolute_import, division, print_function
import logging, os.path
import ConsensusCore2 as cc, numpy as np
......@@ -11,7 +12,6 @@ from ..ResultCollector import ResultCollectorProcess, ResultCollectorThread
from GenomicConsensus.consensus import Consensus, ArrowConsensus, join
from GenomicConsensus.windows import kSpannedIntervals, holes, subWindow
from GenomicConsensus.variants import filterVariants, annotateVariants
from GenomicConsensus.arrow.evidence import ArrowEvidence
from GenomicConsensus.arrow import diploid
from GenomicConsensus.utils import die
......@@ -113,29 +113,6 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
# characters and we only use IUPAC for *internal* bookkeeping.
if arrowConfig.polishDiploid:
css.sequence = newPureCss
# Dump?
maybeDumpEvidence = \
((options.dumpEvidence == "all") or
(options.dumpEvidence == "outliers") or
(options.dumpEvidence == "variants") and
(len(variants) > 0) and css.hasEvidence)
if maybeDumpEvidence:
refId, refStart, refEnd = subWin
refName = reference.idToName(refId)
windowDirectory = os.path.join(
options.evidenceDirectory,
refName,
"%d-%d" % (refStart, refEnd))
ev = ArrowEvidence.fromConsensus(css)
if options.dumpEvidence != "outliers":
ev.save(windowDirectory)
elif (np.max(ev.delta) > 20):
# Mathematically I don't think we should be seeing
# deltas > 6 in magnitude, but let's just restrict
# attention to truly bonkers outliers.
ev.save(windowDirectory)
else:
css = ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
subWin, intRefSeq)
......
# Authors: David Alexander, Lance Hepler
from __future__ import absolute_import, division, print_function
from GenomicConsensus.arrow.utils import allSingleBaseMutations
from GenomicConsensus.variants import Variant
......
# Authors: David Alexander, Lance Hepler
__all__ = [ "ArrowEvidence" ]
import logging, os.path, numpy as np
from collections import namedtuple
from itertools import groupby
from bisect import bisect_left, bisect_right
from pbcore.io import FastaReader, FastaWriter
from .utils import scoreMatrix
from .. import reference
class ArrowEvidence(object):
    """
    The matrix of per-read mutation scores ("evidence") underlying an
    arrow consensus window, with support for saving to / loading from a
    directory dump.

    If used at runtime, this will require h5py
    (for storing/loading 'arrow-scores.h5')
    """

    # (Position, Type, FromBase, ToBase) parsed from a mutation name
    Mutation = namedtuple("Mutation", ("Position", "Type", "FromBase", "ToBase"))

    @staticmethod
    def _parseMutName(mutName):
        """
        Parse a mutation name of the form '<pos> <type> <from> -> <to>'
        (e.g. '42 Sub A -> C') into a Mutation namedtuple.
        """
        fields = mutName.split(" ")
        pos = int(fields[0])
        # fields[1:] == [type, fromBase, "->", toBase]; renamed to avoid
        # shadowing the `type` builtin.
        mutType, fromBase, _, toBase = fields[1:]
        return ArrowEvidence.Mutation(pos, mutType, fromBase, toBase)

    def __init__(self, refWindow, consensus, rowNames, colNames, baselineScores, scores):
        """
        refWindow      : tuple(refName, refStart, refEnd)
        consensus      : consensus sequence for the window (str)
        rowNames       : per-read row labels of `scores`
        colNames       : mutation-name column labels of `scores`
        baselineScores : per-read score of the unmutated template (1-D)
        scores         : per-read, per-mutation scores (2-D)
        """
        assert isinstance(consensus, str)
        self.refWindow = refWindow  # tuple(str, int, int)
        self.consensus = consensus
        self.rowNames = rowNames
        self.colNames = colNames
        self.baselineScores = baselineScores
        self.scores = scores
        # BUGFIX: materialize as a list.  Under Python 3 a bare `map`
        # object is exhausted after one traversal, and `positions`
        # (hence `forPosition`) reads this repeatedly.
        self.muts = [ArrowEvidence._parseMutName(name) for name in self.colNames]

    @staticmethod
    def fromConsensus(css):
        """Build an ArrowEvidence from an arrow consensus object `css`."""
        rowNames, colNames, baselineScores, scores = scoreMatrix(css.ai)
        return ArrowEvidence(css.refWindow,
                             css.sequence,
                             rowNames,
                             colNames,
                             baselineScores,
                             scores)

    @property
    def refName(self):
        return self.refWindow[0]

    @property
    def refStart(self):
        return self.refWindow[1]

    @property
    def refEnd(self):
        return self.refWindow[2]

    @property
    def positions(self):
        # Reference position of each score column (sorted, may repeat).
        return [ mut.Position for mut in self.muts ]

    @property
    def uniquePositions(self):
        return sorted(list(set(self.positions)))

    @property
    def delta(self):
        # Score change of each mutation relative to the unmutated template.
        return self.scores - self.baselineScores[:, np.newaxis]

    @staticmethod
    def load(dir):
        """
        Load an ArrowEvidence from a directory; the expected layout is
        the one produced by `save` (".../<refName>/<start>-<end>").
        """
        if dir.endswith("/"): dir = dir[:-1]
        refStart, refEnd = map(int, dir.split("/")[-1].split("-"))
        refName = dir.split("/")[-2]
        refWindow = (refName, refStart, refEnd)
        with FastaReader(dir + "/consensus.fa") as fr:
            consensus = next(iter(fr)).sequence
        import h5py
        with h5py.File(dir + "/arrow-scores.h5", "r") as f:
            # BUGFIX: `Dataset.value` was deprecated and removed in
            # h5py 3; `[()]` is the supported whole-dataset read.
            scores = f["Scores"][()]
            baselineScores = f["BaselineScores"][()]
            colNames = f["ColumnNames"][()]
            rowNames = f["RowNames"][()]
        return ArrowEvidence(refWindow, consensus,
                             rowNames, colNames,
                             baselineScores, scores)

    def save(self, dir):
        """
        Save this ArrowEvidence to a directory. The directory will be
        *created* by this method.
        Format of evidence dump:
        evidence_dump/
          ref000001/
            0-1005/
              consensus.fa
              arrow-scores.h5
            995-2005/
            ...
        """
        logging.info("Dumping evidence to %s" % (dir,))
        join = os.path.join
        if os.path.exists(dir):
            raise Exception("Evidence dump does not expect directory %s to exist." % dir)
        os.makedirs(dir)
        #refFasta = FastaWriter(join(dir, "reference.fa"))
        #readsFasta = FastaWriter(join(dir, "reads.fa"))
        consensusFasta = FastaWriter(join(dir, "consensus.fa"))
        windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
        #refFasta.writeRecord(windowName, self.refSequence)
        #refFasta.close()
        consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
        consensusFasta.close()
        import h5py
        # BUGFIX: pass an explicit mode; h5py.File without one is
        # deprecated, and "w" is safe since `dir` was just created.
        arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"), "w")
        arrowScoreFile.create_dataset("Scores", data=self.scores)
        vlen_str = h5py.special_dtype(vlen=str)
        arrowScoreFile.create_dataset("RowNames", data=self.rowNames, dtype=vlen_str)
        arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str)
        arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores)
        arrowScoreFile.close()
        # for aln in alns:
        #     readsFasta.writeRecord(str(aln.rowNumber),
        #                            aln.read(orientation="genomic", aligned=False))
        # readsFasta.close()

    def forPosition(self, pos):
        """Restrict the evidence to the score columns at position `pos`."""
        # `positions` is sorted, so the columns for `pos` are contiguous.
        posStart = bisect_left(self.positions, pos)
        posEnd = bisect_right(self.positions, pos)
        # BUGFIX: the first constructor argument is the refWindow
        # *tuple*; the original erroneously passed self.refStart.
        return ArrowEvidence(self.refWindow,
                             self.consensus,
                             self.rowNames,
                             self.colNames[posStart:posEnd],
                             self.baselineScores,
                             self.scores[:, posStart:posEnd])

    def justSubstitutions(self):
        """Restrict the evidence to substitution columns only."""
        # BUGFIX: build the mask with a comprehension; np.array over a
        # `map` object yields a useless 0-d object array under Python 3.
        colMask = np.array(["Sub" in s for s in self.colNames])
        # BUGFIX: pass the refWindow tuple (see forPosition).
        return ArrowEvidence(self.refWindow,
                             self.consensus,
                             self.rowNames,
                             self.colNames[colMask],
                             self.baselineScores,
                             self.scores[:, colMask])

    def rowNumbers(self):
        # with FastaReader(self.dir + "/reads.fa") as fr:
        #     return [ int(ctg.name) for ctg in fr ]
        raise NotImplementedError
# Authors: David Alexander, Lance Hepler
from __future__ import absolute_import, division, print_function
import numpy as np, ConfigParser, collections, logging
from glob import glob
......
# Authors: David Alexander, Lance Hepler
from __future__ import absolute_import, division, print_function
import numpy as np, itertools, logging, re, sys
from collections import Counter
......@@ -406,7 +407,7 @@ def poaConsensus(fwdSequences, arrowConfig):
ordSeqs = sorted(fwdSequences, key=lambda seq: abs(len(seq) - median))
ordSeqs = ordSeqs[:arrowConfig.maxPoaCoverage]
cov = len(ordSeqs)
minCov = 1 if cov < 5 else ((cov + 1) / 2 - 1)
minCov = 1 if cov < 5 else ((cov + 1) // 2 - 1)
poaConfig = cc.DefaultPoaConfig(cc.AlignMode_GLOBAL)
return cc.PoaConsensus.FindConsensus(ordSeqs, poaConfig, minCov)
......
# Author: David Alexander
from __future__ import absolute_import, division, print_function
import numpy as np
......
# Author: David Alexander
from __future__ import absolute_import, division, print_function
import time
from pbcore.io import GffWriter, Gff3Record
......
# Author: Lance Hepler
from __future__ import division, print_function
from __future__ import absolute_import, division, print_function
import time
from textwrap import dedent
......
# Author: David Alexander
from __future__ import absolute_import
from __future__ import absolute_import, division, print_function
from .VariantsGffWriter import VariantsGffWriter
from .utils import *
# Author: David Alexander
from __future__ import absolute_import, division, print_function
__all__ = ["loadCmpH5", "loadBam"]
......
# Author: David Alexander
from __future__ import absolute_import
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import argparse, atexit, cProfile, gc, glob, logging, multiprocessing
import os, pstats, random, shutil, tempfile, time, threading, Queue, traceback, pprint
......@@ -201,11 +199,6 @@ class ToolRunner(object):
logging.info("Removing %s" % options.temporaryDirectory)
shutil.rmtree(options.temporaryDirectory, ignore_errors=True)
def _setupEvidenceDumpDirectory(self, directoryName):
if os.path.exists(directoryName):
shutil.rmtree(directoryName)
os.makedirs(directoryName)
@property
def aborting(self):
    """
    Read-only flag: whether this ToolRunner is aborting.

    NOTE(review): the setter of `_aborting` is not visible in this
    chunk — presumably set on failure/interrupt; confirm before relying
    on exact semantics.
    """
    return self._aborting
......@@ -252,8 +245,6 @@ class ToolRunner(object):
if options.pdbAtStartup:
ipdb.set_trace()
#logging.info("h5py version: %s" % h5py.version.version)
#logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
logging.info("ConsensusCore version: %s" %
(consensusCoreVersion() or "ConsensusCore unavailable"))
logging.info("ConsensusCore2 version: %s" %
......@@ -281,10 +272,6 @@ class ToolRunner(object):
#options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
#if options.disableHdf5ChunkCache:
# logging.info("Will disable HDF5 chunk cache (large number of datasets)")
#logging.debug("After peek, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())
if options.dumpEvidence:
self._setupEvidenceDumpDirectory(options.evidenceDirectory)
self._launchSlaves()
self._readAlignmentInput()
......
......@@ -17,7 +17,7 @@
# > from options import options
# and get the loaded options dictionary.
#
from __future__ import absolute_import
from __future__ import absolute_import, division, print_function
import argparse, os, os.path, sys, json
from pbcommand.models import FileTypes, SymbolTypes, get_pbparser
......@@ -351,16 +351,6 @@ def add_options_to_argument_parser(parser):
dest="doProfiling",
default=False,
help="Enable Python-level profiling (using cProfile).")
debugging.add_argument(
"--dumpEvidence", "-d",
dest="dumpEvidence",
nargs="?",
default=None,
const="variants",
choices=["variants", "all", "outliers"])
debugging.add_argument(
"--evidenceDirectory",
default="evidence_dump")
debugging.add_argument(
"--annotateGFF",
action="store_true",
......
# Author: David Alexander
from __future__ import absolute_import, division, print_function
# Author: David Alexander
from __future__ import absolute_import
from __future__ import absolute_import, division, print_function
import math, logging, numpy as np, random
from itertools import izip
......
# Author: David Alexander
from __future__ import absolute_import, division, print_function