Commits on Source (7)
canu (1.7.1+dfsg-1) unstable; urgency=medium
* Team upload.
* New upstream version
* debhelper 11
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.5
-- Andreas Tille <tille@debian.org> Tue, 17 Jul 2018 12:34:50 +0200
canu (1.7+dfsg-1) unstable; urgency=medium
* New upstream version
......
Source: canu
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Afif Elghraoui <afif@debian.org>
Build-Depends:
debhelper (>= 10),
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
libboost-dev,
libmeryl-dev,
# For File::Path
libfilesys-df-perl,
mhap (>= 2.1.3)
Standards-Version: 4.1.3
Standards-Version: 4.1.5
Vcs-Browser: https://salsa.debian.org/med-team/canu
Vcs-Git: https://salsa.debian.org/med-team/canu.git
Homepage: http://canu.readthedocs.org/en/latest/
Vcs-Git: https://anonscm.debian.org/git/debian-med/canu.git
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/canu.git
Package: canu
Architecture: any
Depends:
${shlibs:Depends},
Depends: ${shlibs:Depends},
${misc:Depends},
${perl:Depends},
libfilesys-df-perl,
mhap (>= 2.1.3),
gnuplot,
Suggests:
pbgenomicconsensus,
nanopolish,
gnuplot
Suggests: pbgenomicconsensus,
nanopolish
Description: single molecule sequence assembler for genomes
Canu is a fork of the Celera Assembler, designed for high-noise
single-molecule sequencing (such as the PacBio RS II or Oxford
......
......@@ -286,7 +286,7 @@ computeExponentialMovingAverage(TT alpha, TT ema, TT value) {
#if 0
template<typename TT>
class genericStatistics {
......@@ -338,12 +338,12 @@ public:
vector<uint64> &histogram(void) { // Returns pointer to private histogram data
finalizeData();
return(&_histogram);
return(_histogram);
};
vector<uint64> &Nstatistics(void) { // Returns pointer to private N data
finalizeData();
return(&_Nstatistics);
return(_Nstatistics);
};
void finalizeData(void) {
......@@ -375,7 +375,7 @@ private:
vector<uint64> _Nstatistics;
};
#endif
......@@ -422,12 +422,12 @@ public:
#if 0
vector<uint64> &histogram(void) { // Returns pointer to private histogram data
finalizeData();
return(&_histogram);
return(_histogram);
};
vector<uint64> &Nstatistics(void) { // Returns pointer to private N data
finalizeData();
return(&_Nstatistics);
return(_Nstatistics);
};
#endif
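The two hunks above are the same fix: these accessors are declared to return a reference, so returning the address of the member would not compile if the #if 0 blocks were ever re-enabled (and the '// Returns pointer' comments are now slightly stale). A minimal stand-alone sketch of the corrected pattern, for illustration only and not upstream code:
#include <cstdint>
#include <vector>

class sketchStats {
public:
  std::vector<uint64_t> &histogram(void) {
    //return(&_histogram);    // error: cannot convert 'std::vector<uint64_t>*' to 'std::vector<uint64_t>&'
    return(_histogram);       // correct: hand back a reference to the private data
  };

private:
  std::vector<uint64_t> _histogram;
};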
......
......@@ -130,6 +130,27 @@ BestOverlapGraph::removeHighErrorBestEdges(void) {
if (b5->readId() != 0) edgeStats.insert(erates[eratesLen++] = b5->erate());
if (b3->readId() != 0) edgeStats.insert(erates[eratesLen++] = b3->erate());
// If there are NO best edges, find the overlap with the most matches and use that.
if ((b5->readId() == 0) &&
(b3->readId() == 0)) {
uint32 no = 0;
BAToverlap *ovl = OC->getOverlaps(fi, no);
uint32 bestM = 0;
double bestE = 0.0;
for (uint32 oo=0; oo<no; oo++) {
double matches = (1 - ovl[oo].erate()) * RI->overlapLength(ovl[oo].a_iid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
if (bestM < matches) {
bestM = matches;
bestE = ovl[oo].erate();
}
}
if (no > 0)
edgeStats.insert(erates[eratesLen++] = bestE);
}
}
_mean = edgeStats.mean();
......@@ -357,6 +378,53 @@ BestOverlapGraph::removeSpurs(const char *prefix) {
// Mark zombie masters. A read whose overlaps are all containments in which it is the container,
// and whose ID is smaller than that of every read it shares a zero-hang (identical) overlap with,
// is a zombie master (e.g., of two identical reads 5 and 9, read 5 is the master). These get
// promoted to unitigs.
//
void
BestOverlapGraph::findZombies(const char *prefix) {
uint32 fiLimit = RI->numReads();
uint32 numThreads = omp_get_max_threads();
uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;
#pragma omp parallel for schedule(dynamic, blockSize)
for (uint32 fi=1; fi <= fiLimit; fi++) {
uint32 no = 0;
BAToverlap *ovl = OC->getOverlaps(fi, no);
uint32 nc = 0;
if (no == 0)
continue;
for (uint32 ii=0; ii<no; ii++, nc++) // If any overlap makes A not
if (ovl[ii].AisContainer() == false) // a container, it's not a zombie
break;
if (nc < no)
continue;
nc = UINT32_MAX;
for (uint32 ii=0; ii<no; ii++) // Find the smallest ID
if ((ovl[ii].a_hang == 0) && // with no hangs.
(ovl[ii].b_hang == 0) &&
(ovl[ii].b_iid < nc))
nc = ovl[ii].b_iid;
if (fi < nc) { // If we're smaller, we're a
#pragma omp critical (suspInsert) // Zombie Master!
{
writeLog("read %u is a zombie.\n", fi);
_zombie.insert(fi);
}
}
}
writeStatus("BestOverlapGraph()-- detected " F_SIZE_T " zombie reads.\n", _zombie.size());
}
void
BestOverlapGraph::findEdges(void) {
uint32 fiLimit = RI->numReads();
......@@ -513,6 +581,8 @@ BestOverlapGraph::BestOverlapGraph(double erateGraph,
findEdges();
}
findZombies(prefix);
reportBestEdges(prefix, logFileFlagSet(LOG_ALL_BEST_EDGES) ? "best.3.final" : "best");
// One more pass, to find any ambiguous best edges.
......
......@@ -183,6 +183,7 @@ private:
void removeSuspicious(const char *prefix);
void removeSpurs(const char *prefix);
void removeLopsidedEdges(const char *prefix);
void findZombies(const char *prefix);
void findEdges(void);
......@@ -240,6 +241,10 @@ public:
return(_suspicious.count(readid) > 0);
};
bool isZombie(const uint32 readid) {
return(_zombie.count(readid) > 0);
};
void reportEdgeStatistics(const char *prefix, const char *label);
void reportBestEdges(const char *prefix, const char *label);
......@@ -283,6 +288,7 @@ private:
set<uint32> _suspicious;
set<uint32> _singleton;
set<uint32> _spur;
set<uint32> _zombie;
map<uint32, BestOverlaps> _bestM;
map<uint32, BestScores> _scorM;
......
......@@ -133,17 +133,16 @@ findPrevRead(Unitig *tig,
uint32
dropDeadFirstRead(AssemblyGraph *AG,
Unitig *tig) {
Unitig *tig,
bool isForward) {
ufNode *fn = tig->firstRead();
ufNode *sn = findNextRead(tig, fn);
// No next read, keep fn in the tig.
if (sn == NULL) {
writeLog("dropDead()- read %8u no sn\n", fn->ident);
if (sn == NULL)
return(0);
}
// Over all edges from the first read, look for any edge to something else.
//
......@@ -153,23 +152,35 @@ dropDeadFirstRead(AssemblyGraph *AG,
// the tig. We assume that this is always the first read, which is OK, because the function name
// says so. Any edge to anywhere means the read is good and should be kept.
if (AG->getForward(fn->ident).size() == 0)
writeLog("dropDead()-- (%s) 1st read %8u has no edges\n", (isForward) ? "fwd" : "rev", fn->ident);
for (uint32 pp=0; pp<AG->getForward(fn->ident).size(); pp++) {
BestPlacement &pf = AG->getForward(fn->ident)[pp];
writeLog("dropDead()-- 1st read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
if (pf.bestC.b_iid > 0) {
writeLog("dropDead()-- (%s) 1st read %8u %s edge %4u/%-4u - contained in %u - confirmed\n",
(isForward) ? "fwd" : "rev",
fn->ident,
fn->position.isForward() ? "->" : "<-",
pp, AG->getForward(fn->ident).size(),
pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
if (pf.bestC.b_iid > 0) {
pf.bestC.b_iid);
return(0);
}
if (((fn->position.isForward() == true) && (pf.best5.b_iid != 0)) ||
((fn->position.isForward() == false) && (pf.best3.b_iid != 0)))
((fn->position.isForward() == false) && (pf.best3.b_iid != 0))) {
writeLog("dropDead()-- (%s) 1st read %8u %s edge %4u/%-4u - 5:%u 3:%u - confirmed\n",
(isForward) ? "fwd" : "rev",
fn->ident,
fn->position.isForward() ? "->" : "<-",
pp, AG->getForward(fn->ident).size(),
pf.best5.b_iid, pf.best3.b_iid);
return(0);
}
}
// But no edge means we need to check the second read. If it has an edge, then we infer the
// first read is bogus and should be removed. If it also has no edge (except to the first read,
......@@ -180,26 +191,41 @@ dropDeadFirstRead(AssemblyGraph *AG,
// first read. Well, and that if the second read has an edge we declare the first read to be
// junk. That's also a bit of a difference from the previous loop.
if (AG->getForward(sn->ident).size() == 0) {
writeLog("dropDead()-- (%s) 2nd read %8u has no edges - keep first\n", (isForward) ? "fwd" : "rev", sn->ident);
return(0);
}
for (uint32 pp=0; pp<AG->getForward(sn->ident).size(); pp++) {
BestPlacement &pf = AG->getForward(sn->ident)[pp];
writeLog("dropDead()-- 2nd read %8u %s pf %3u/%3u best5 %8u best3 %8u bestC %8u\n",
if ((pf.bestC.b_iid > 0) && (pf.bestC.b_iid != fn->ident)) {
writeLog("dropDead()-- (%s) 2nd read %8u %s edge %4u/%-4u - contained in %u - drop first\n",
(isForward) ? "fwd" : "rev",
sn->ident,
sn->position.isForward() ? "->" : "<-",
pp, AG->getForward(sn->ident).size(),
pf.best5.b_iid, pf.best3.b_iid, pf.bestC.b_iid);
if ((pf.bestC.b_iid > 0) && (pf.bestC.b_iid != fn->ident))
pf.bestC.b_iid);
return(fn->ident);
}
if (((sn->position.isForward() == true) && (pf.best5.b_iid != 0) && (pf.best5.b_iid != fn->ident)) ||
((sn->position.isForward() == false) && (pf.best3.b_iid != 0) && (pf.best3.b_iid != fn->ident)))
((sn->position.isForward() == false) && (pf.best3.b_iid != 0) && (pf.best3.b_iid != fn->ident))) {
writeLog("dropDead()-- (%s) 2nd read %8u %s edge %4u/%-4u - 5:%u 3:8u - drop first\n",
(isForward) ? "fwd" : "rev",
sn->ident,
sn->position.isForward() ? "->" : "<-",
pp, AG->getForward(sn->ident).size(),
pf.best5.b_iid, pf.best3.b_iid);
return(fn->ident);
}
}
// Otherwise, the second read had only edges to the first read, and we should keep the first
// read.
writeLog("dropDead()-- (%s) 2nd read %8u has no useful external edges - keep first\n", (isForward) ? "fwd" : "rev", sn->ident);
return(0);
}
......@@ -222,20 +248,32 @@ dropDeadEnds(AssemblyGraph *AG,
(tig->_isUnassembled == true))
continue;
uint32 fn = dropDeadFirstRead(AG, tig); // Decide if the first read is junk.
writeLog("\n");
writeLog("dropDead()-- testing tig %u length %u\n", ti, tig->getLength());
uint32 fn = dropDeadFirstRead(AG, tig, true); // Decide if the first read is junk.
//fprintf(stderr, "drop dead for tig %u (flipped)\n", ti);
tig->reverseComplement(); // Flip.
uint32 ln = dropDeadFirstRead(AG, tig); // Decide if the last (now first) read is junk.
uint32 ln = dropDeadFirstRead(AG, tig, false); // Decide if the last (now first) read is junk.
tig->reverseComplement(); // Flip back.
if ((fn == 0) && (ln == 0)) // Nothing to remove, just get out of here.
if ((fn == 0) && (ln == 0)) { // Nothing to remove, nothing to do.
writeLog("dropDead()-- both ends confirmed\n");
continue;
}
if (fn == ln) { // The same thing to remove, ignore it.
writeLog("dropDead()-- retaining spanning read %u\n", fn);
continue;
}
// At least one read needs to be kicked out. Make new tigs for everything.
char fnMsg[80] = {0}; Unitig *fnTig = NULL;
char nnMsg[80] = {0}; Unitig *nnTig = NULL; int32 nnOff = INT32_MAX;
char lnMsg[80] = {0}; Unitig *lnTig = NULL;
Unitig *fnTig = NULL;
Unitig *nnTig = NULL; int32 nnOff = INT32_MAX;
Unitig *lnTig = NULL;
if (fn > 0)
fnTig = tigs.newUnitig(false);
......@@ -256,32 +294,26 @@ dropDeadEnds(AssemblyGraph *AG,
// Move reads to their new unitig.
strcpy(fnMsg, " ");
strcpy(nnMsg, " ");
strcpy(lnMsg, "");
for (uint32 cc=0, tt=0; tt<tig->ufpath.size(); tt++) {
ufNode &read = tig->ufpath[tt];
if (read.ident == fn) {
sprintf(fnMsg, "first read %9u to tig %7u --", read.ident, fnTig->id());
writeLog("dropDead()-- tig %u gets first read %u\n", fnTig->id(), read.ident);
fnTig->addRead(read, -read.position.min(), false);
} else if (read.ident == ln) {
sprintf(lnMsg, "-- last read %9u to tig %7u", read.ident, lnTig->id());
writeLog("dropDead()-- tig %u gets last read %u\n", lnTig->id(), read.ident);
lnTig->addRead(read, -read.position.min(), false);
} else {
if (nnOff == INT32_MAX) {
sprintf(nnMsg, "other reads to tig %7u", nnTig->id());
writeLog("dropDead()-- tig %u gets all other reads\n", nnTig->id());
nnOff = read.position.min();
}
nnTig->addRead(read, -nnOff, false);
}
}
writeLog("dropDeadEnds()-- tig %7u --> %s %s %s\n", tig->id(), fnMsg, nnMsg, lnMsg);
if (fnTig) fnTig->cleanUp(); // Probably not needed, but cheap.
if (lnTig) lnTig->cleanUp(); // Probably not needed, but cheap.
if (nnTig) nnTig->cleanUp(); // Most likely needed.
......@@ -292,6 +324,6 @@ dropDeadEnds(AssemblyGraph *AG,
tigs[ti] = NULL;
}
writeStatus("dropDeadEnds()-- Modified %u tigs. Dropped %u first and %u last reads, %u tig%s had both reads dropped.\n",
writeStatus("dropDead()-- Modified %u tigs. Dropped %u first and %u last reads, %u tig%s had both reads dropped.\n",
numT, numF, numL, numB, (numB == 1) ? "" : "s");
}
......@@ -708,6 +708,9 @@ findConfusedEdges(TigVector &tigs,
continue;
// Skip if this overlap is the best we're trying to match.
//
// NOTE. This doesn't care about potential duplicate overlaps between a pair of reads,
// as we're looking for thicker overlaps off only one end of the read.
if ((rdBid == b5->readId()) ||
(rdBid == b3->readId()))
continue;
......
......@@ -73,8 +73,15 @@ typedef map<uint32, vector<uint32> > BubTargetList;
// Decide which tigs can be orphans. Any unitig where (nearly) every dovetail read has an overlap
// to some other unitig is a candidate for orphan popping.
// Decide which tigs can be orphans. Any unitig where (nearly) every dovetail
// read has an overlap to some other unitig is a candidate for orphan popping.
//
// Counts the number of reads that have an overlap to some other tig
// (tigOlapsTo). If more than half the reads in the tig have an overlap to
// some other tig, it is a potential place to pop the bubble.
//
// Returns BubTargetList, a map of uint32 to vector<uint32>, of the potential
// places that some tig could be popped into.
void
findPotentialOrphans(TigVector &tigs,
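(The rest of the declaration and the body of findPotentialOrphans() sit outside this hunk.) As a rough sketch of the counting described in the comment above, built only from names used elsewhere in this diff (OC->getOverlaps(), TigVector::inUnitig(), Unitig::ufpath) plus a presumed tigs.size() accessor, and not the upstream implementation:
BubTargetList
sketch_findPotentialOrphans(TigVector &tigs) {
  BubTargetList  targets;

  for (uint32 ti=0; ti<tigs.size(); ti++) {
    Unitig  *tig = tigs[ti];

    if (tig == NULL)
      continue;

    map<uint32, uint32>  tigOlapsTo;                    //  other tig id -> reads with an overlap to it

    for (uint32 rr=0; rr<tig->ufpath.size(); rr++) {
      uint32       no  = 0;
      BAToverlap  *ovl = OC->getOverlaps(tig->ufpath[rr].ident, no);
      set<uint32>  seen;                                //  count each other tig at most once per read

      for (uint32 oo=0; oo<no; oo++) {
        uint32  ot = tigs.inUnitig(ovl[oo].b_iid);

        if ((ot != 0) && (ot != ti))
          seen.insert(ot);
      }

      for (set<uint32>::iterator it=seen.begin(); it != seen.end(); ++it)
        tigOlapsTo[*it]++;
    }

    for (map<uint32, uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
      if (2 * it->second > tig->ufpath.size())          //  more than half the reads reach that tig
        targets[ti].push_back(it->first);
  }

  return(targets);
}
How containment overlaps and the "(nearly) every dovetail read" qualifier are handled is not visible here, so the real function almost certainly filters more carefully than this sketch does.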
......@@ -617,6 +624,9 @@ mergeOrphans(TigVector &tigs,
vector<candidatePop *> targets;
for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it)
if (tigs[it->first] == NULL)
writeLog("mergeOrphans()-- orphan %u wants to go into nonexistent tig %u!\n", ti, it->first);
else
saveCorrectlySizedInitialIntervals(orphan,
tigs[it->first], // The targetID in targetIntervals
it->second, // The interval list in targetIntervals
......@@ -657,7 +667,7 @@ mergeOrphans(TigVector &tigs,
targets[tt]->placed[op].frgID,
targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);
writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_SIZE_TP " reads, had %3" F_SIZE_TP " reads.\n",
writeLog("mergeOrphans()-- tig %8u length %9u -> target %8u piece %2u position %9u-%-9u length %8u - expected %3" F_U32 " reads, had %3" F_U32 " reads.\n",
orphan->id(), orphan->getLength(),
targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
orphanSize, targetSize);
......
......@@ -59,6 +59,77 @@ public:
// Decide if two reads in a tig are compatible with an overlap.
//
// Returns true if the reads are overlapping in the tig, and are oriented consistent with the overlap.
//
bool
Unitig::optimize_isCompatible(uint32 ii,
uint32 jj,
BAToverlap &olap,
bool inInit,
bool secondPass,
bool beVerbose) {
SeqInterval &ip = ufpath[ii].position;
SeqInterval &jp = ufpath[jj].position;
bool isOvl = isOverlapping(ufpath[ii].position, ufpath[jj].position);
// Decide if the overlap preserves the ordering in the tig.
//
// This is of questionable value, and might be incorrect. We can have two overlaps between
// a pair of reads iff the orientation differs - the flipped flag. That gets tested elsewhere.
bool isOrdered = true;
#if 1
bool isOvlLo = ((jp.min() <= ip.min()) && (ip.min() <= jp.max()) && (jp.max() <= ip.max()));
bool isOvlHi = ((ip.min() <= jp.min()) && (jp.min() <= ip.max()) && (ip.max() <= jp.max()));
//bool isOvlLo = jp.min() < ip.min(); // Is the above too rigorous?
//bool isOvlHi = ip.min() < jp.min();
// isOvlLo/isOvlHi tell which end of the read should have the overlap (according to the tig
// layout), so check if the overlap actually is on that end. If not, flag this as 'mis-ordered'.
if (((isOvlLo == true) && (ip.isForward()) && (olap.AEndIs5prime() == false)) ||
((isOvlLo == true) && (ip.isReverse()) && (olap.AEndIs3prime() == false)) ||
((isOvlHi == true) && (ip.isForward()) && (olap.AEndIs3prime() == false)) ||
((isOvlHi == true) && (ip.isReverse()) && (olap.AEndIs5prime() == false)))
isOrdered = false;
// But if in a containment relationship, it cannot be mis ordered.
if ((olap.AisContained() == true) ||
(olap.AisContainer() == true))
isOrdered = true;
#endif
bool isOriented = true;
if (((ip.isForward() == jp.isForward()) && (olap.flipped == true)) ||
((ip.isForward() != jp.isForward()) && (olap.flipped == false)))
isOriented = false;
if ((beVerbose) || (secondPass))
writeLog("optimize_%s()-- tig %7u read %9u (at %9d %9d) olap to read %9u (at %9d %9d) - hangs %7d %7d - %s %s ovlLo %d ovlHi %d contained %d container %d\n",
(inInit) ? "initPlace" : "recompute",
id(),
ufpath[ii].ident, ip.bgn, ip.end,
ufpath[jj].ident, jp.bgn, jp.end,
olap.a_hang, olap.b_hang,
(isOvl == true) ? "overlapping" : "not-overlapping",
(isOrdered == true) ? "ordered" : "mis-ordered",
isOvlLo, isOvlHi, olap.AisContained(), olap.AisContainer());
return((isOvl == true) &&
(isOrdered == true) &&
(isOriented == true));
}
void
Unitig::optimize_initPlace(uint32 ii,
optPos *op,
......@@ -87,37 +158,18 @@ Unitig::optimize_initPlace(uint32 ii,
uint32 uu = inUnitig (jid);
uint32 jj = ufpathIdx(jid);
// Probably overkill, but report ALL overlaps for the troubling reads.
if ((beVerbose) || (firstPass == false))
writeLog("optimize_initPlace()-- olap %u a %u b %u hangs %d %d\n", oo, iid, jid, ovl[oo].a_hang, ovl[oo].b_hang);
if (uu != id()) // Skip if the overlap is to a different tig.
continue; // (the ufpathIdx() call is valid, but using it isn't)
// Reads are in the same tig. Decide if they overlap in position.
bool isOvl = isOverlapping(ufpath[ii].position, ufpath[jj].position);
// Log! beVerbose should be true for the second pass, but just in case it isn't.
if (uu != id()) // Skip if to a different tig.
continue; // (otherwise, ufpath[jj] is invalid below)
if ((beVerbose) || (firstPass == false))
writeLog("optimize_initPlace()-- olap %4u tig %7u read %8u (at %9d %9d) olap to read %8u (at %9d %9d) - hangs %7d %7d - %s %s\n",
oo, id(),
iid, ufpath[ii].position.bgn, ufpath[ii].position.end,
jid, ufpath[jj].position.bgn, ufpath[jj].position.end,
ovl[oo].a_hang, ovl[oo].b_hang,
(isOvl == true) ? "overlapping" : "not-overlapping",
(jj > ii) ? "after" : "before");
if (isOvl == false) // Skip if the reads
continue; // don't overlap
// Skip if:
// this is the first pass, but the overlap is to a read after us.
// the overlap isn't compatible with the layout.
if ((firstPass) && (jj > ii)) // We're setting initial positions, so overlaps to reads after
continue; // us aren't correct, unless we're in the 2nd pass
if (((firstPass) && (ii < jj)) ||
(optimize_isCompatible(ii, jj, ovl[oo], true, !firstPass, beVerbose) == false))
continue;
// Reads overlap. Compute the position of the read using
// the overlap and the other read.
// Compute the position of the read using the overlap and the other read.
nmin += (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
cnt += 1;
......@@ -176,8 +228,9 @@ Unitig::optimize_recompute(uint32 iid,
uint32 cnt = 0;
if (beVerbose) {
writeLog("optimize()-- tig %8u read %8u previous - %9.2f-%-9.2f\n", id(), iid, op[iid].min, op[iid].max);
writeLog("optimize()-- tig %8u read %8u length - %9.2f-%-9.2f\n", id(), iid, op[iid].max - readLen, op[iid].min + readLen);
writeLog("optimize()--\n");
writeLog("optimize()-- tig %8u read %9u previous - %9.2f-%-9.2f\n", id(), iid, op[iid].min, op[iid].max);
writeLog("optimize()-- tig %8u read %9u length - %9.2f-%-9.2f\n", id(), iid, op[iid].max - readLen, op[iid].min + readLen);
}
// Process all overlaps.
......@@ -187,26 +240,34 @@ Unitig::optimize_recompute(uint32 iid,
uint32 uu = inUnitig (jid);
uint32 jj = ufpathIdx(jid);
if (uu != id()) // Skip if the overlap is to a different tig.
continue; // (the ufpathIdx() call is valid, but using it isn't)
if (uu != id()) // Skip if to a different tig.
continue; // (otherwise, ufpath[jj] is invalid below)
if (isOverlapping(ufpath[ii].position, ufpath[jj].position) == false) // Skip if the reads
continue; // don't overlap
// Reads overlap. Compute the position of the read using
// the overlap and the other read.
// Compute the position of the read using the overlap and the other read.
double tmin = (op[iid].fwd) ? (op[jid].min - ovl[oo].a_hang) : (op[jid].min + ovl[oo].b_hang);
double tmax = (op[iid].fwd) ? (op[jid].max - ovl[oo].b_hang) : (op[jid].max + ovl[oo].a_hang);
if (beVerbose)
writeLog("optimize()-- tig %8u read %8u olap %4u - %9.2f-%-9.2f\n", id(), iid, oo, tmin, tmax);
writeLog("optimize()-- tig %8u read %9u olap %4u - %9.2f-%-9.2f\n", id(), iid, oo, tmin, tmax);
// Skip if the overlap isn't compatible with the layout.
if (optimize_isCompatible(ii, jj, ovl[oo], false, false, beVerbose) == false)
continue;
// Update estimate.
nmin += tmin;
nmax += tmax;
cnt += 1;
} // over all overlaps
}
if (cnt == 0) {
writeLog("Failed to optimize read %u in tig %u\n", iid, id());
fprintf(stderr, "Failed to optimize read %u in tig %u\n", iid, id());
flushLog();
}
assert(cnt > 0);
// Add in some evidence for the bases in the read. We want higher weight than the overlaps,
......@@ -226,7 +287,7 @@ Unitig::optimize_recompute(uint32 iid,
double dmax = 2 * (op[iid].max - np[iid].max) / (op[iid].max + np[iid].max);
double npll = np[iid].max - np[iid].min;
writeLog("optimize()-- tig %8u read %8u - %9.2f-%-9.2f length %9.2f/%-6d %7.2f%% posChange %+6.4f %+6.4f\n",
writeLog("optimize()-- tig %8u read %9u - %9.2f-%-9.2f length %9.2f/%-6d %7.2f%% posChange %+6.4f %+6.4f\n",
id(), iid,
np[iid].min, np[iid].max,
npll, readLen,
......@@ -237,8 +298,6 @@ Unitig::optimize_recompute(uint32 iid,
void
Unitig::optimize_expand(optPos *op) {
......@@ -300,7 +359,7 @@ Unitig::optimize_setPositions(optPos *op,
if (op[iid].fwd) {
if (beVerbose)
writeLog("optimize()-- read %8u -> from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
writeLog("optimize()-- read %9u -> from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
iid,
ufpath[ii].position.bgn,
ufpath[ii].position.end,
......@@ -315,7 +374,7 @@ Unitig::optimize_setPositions(optPos *op,
ufpath[ii].position.end = (int32)op[iid].max;
} else {
if (beVerbose)
writeLog("optimize()-- read %8u <- from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
writeLog("optimize()-- read %9u <- from %9d,%-9d %7d to %9d,%-9d %7d readLen %7d diff %7.4f%%\n",
iid,
ufpath[ii].position.bgn,
ufpath[ii].position.end,
......@@ -360,12 +419,13 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
for (uint32 fi=0; fi<fiLimit; fi++) {
uint32 ti = inUnitig(fi);
uint32 pp = ufpathIdx(fi);
Unitig *tig = operator[](ti);
if (ti == 0)
if (tig == NULL)
continue;
op[fi].set(operator[](ti)->ufpath[pp]);
np[fi].set(operator[](ti)->ufpath[pp]);
op[fi].set(tig->ufpath[pp]);
np[fi].set(tig->ufpath[pp]);
}
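The optPos records filled in by this loop are not defined anywhere in this diff. Judging only from the fields referenced in this file (min, max, fwd, and a set() taking a ufNode), a minimal placeholder, offered purely as an assumption rather than canu's actual definition, would be:
struct optPos {
  void    set(ufNode &n) {
    min = n.position.min();           //  current begin coordinate in the tig
    max = n.position.max();           //  current end coordinate in the tig
    fwd = n.position.isForward();     //  orientation of the read in the tig
  };

  double  min;    //  positions are kept as doubles so the averaged estimates
  double  max;    //  in optimize_recompute() do not truncate between passes
  bool    fwd;
};
One record per read ID (indexed as op[fi] and np[fi]) matches the fiLimit-sized loops above and below.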
// Compute initial positions using previously placed reads and the read length.
......@@ -383,7 +443,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
Unitig *tig = operator[](ti);
set<uint32> failed;
if (tig == NULL)
if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
for (uint32 ii=0; ii<tig->ufpath.size(); ii++)
......@@ -407,11 +467,12 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
#pragma omp parallel for schedule(dynamic, fiBlockSize)
for (uint32 fi=0; fi<fiLimit; fi++) {
uint32 ti = inUnitig(fi);
Unitig *tig = operator[](ti);
if (ti == 0)
if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
operator[](ti)->optimize_recompute(fi, op, np, beVerbose);
tig->optimize_recompute(fi, op, np, beVerbose);
}
// Reset zero
......@@ -421,7 +482,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = operator[](ti);
if (tig == NULL)
if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
int32 z = np[ tig->ufpath[0].ident ].min;
......@@ -479,7 +540,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = operator[](ti);
if (tig == NULL)
if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
tig->optimize_expand(op);
......@@ -494,7 +555,7 @@ TigVector::optimizePositions(const char *prefix, const char *label) {
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = operator[](ti);
if (tig == NULL)
if ((tig == NULL) || (tig->ufpath.size() == 1))
continue;
tig->optimize_setPositions(op, beVerbose);
......
......@@ -344,7 +344,7 @@ uint32
OverlapCache::filterDuplicates(uint32 &no) {
uint32 nFiltered = 0;
for (uint32 ii=0, jj=1; jj<no; ii++, jj++) {
for (uint32 ii=0, jj=1, dd=0; jj<no; ii++, jj++) {
if (_ovs[ii].b_iid != _ovs[jj].b_iid)
continue;
......@@ -352,22 +352,35 @@ OverlapCache::filterDuplicates(uint32 &no) {
nFiltered++;
// Drop the shorter overlap, or the one with the higher erate.
// Drop the weaker overlap. If a tie, drop the flipped one.
uint32 iilen = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang());
uint32 jjlen = RI->overlapLength(_ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang());
double iiSco = RI->overlapLength(_ovs[ii].a_iid, _ovs[ii].b_iid, _ovs[ii].a_hang(), _ovs[ii].b_hang()) * _ovs[ii].erate();
double jjSco = RI->overlapLength(_ovs[jj].a_iid, _ovs[jj].b_iid, _ovs[jj].a_hang(), _ovs[jj].b_hang()) * _ovs[jj].erate();
if (iilen == jjlen) {
if (_ovs[ii].evalue() < _ovs[jj].evalue())
jjlen = 0;
else
iilen = 0;
if (iiSco == jjSco) { // Hey gcc! See how nice I was by putting brackets
if (_ovs[ii].flipped()) // around this so you don't get confused by the
iiSco = 0; // non-ambiguous ambiguous else clause?
else //
jjSco = 0; // You're welcome.
}
if (iilen < jjlen)
_ovs[ii].a_iid = _ovs[ii].b_iid = 0;
if (iiSco < jjSco)
dd = ii;
else
_ovs[jj].a_iid = _ovs[jj].b_iid = 0;
dd = jj;
#if 0
writeLog("OverlapCache::filterDuplicates()-- Dropping overlap A: %9" F_U64P " B: %9" F_U64P " - %6.4f%% - %6" F_S32P " %6" F_S32P " - %s\n",
_ovs[dd].a_iid,
_ovs[dd].b_iid,
_ovs[dd].a_hang(),
_ovs[dd].b_hang(),
_ovs[dd].erate(),
_ovs[dd].flipped() ? "flipped" : "");
#endif
_ovs[dd].a_iid = 0;
_ovs[dd].b_iid = 0;
}
// If nothing was filtered, return.
......@@ -616,30 +629,29 @@ OverlapCache::loadOverlaps(ovStore *ovlStore, bool doSave) {
// Binary search a list of overlaps for one matching bID and flipped.
bool
searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID, bool flipped) {
int32 F = 0;
int32 L = ovlLen - 1;
int32 M = 0;
#ifdef TEST_LINEAR_SEARCH
bool linearSearchFound = false;
for (uint32 ss=0; ss<ovlLen; ss++)
if (ovl[ss].b_iid == bID) {
if ((ovl[ss].b_iid == bID) &&
(ovl[ss].flipped == flipped)) {
linearSearchFound = true;
break;
}
#endif
// If not, these are repeats and we should binary search everything.
// There will be no short lists where we could exhaustively search.
int32 F = 0;
int32 L = ovlLen - 1;
int32 M = 0;
while (F <= L) {
M = (F + L) / 2;
if (ovl[M].b_iid == bID) {
if ((ovl[M].b_iid == bID) &&
(ovl[M].flipped == flipped)) {
ovl[M].symmetric = true;
#ifdef TEST_LINEAR_SEARCH
assert(linearSearchFound == true);
......@@ -647,7 +659,8 @@ searchForOverlap(BAToverlap *ovl, uint32 ovlLen, uint32 bID) {
return(true);
}
if (ovl[M].b_iid < bID)
if (((ovl[M].b_iid < bID)) ||
((ovl[M].b_iid == bID) && (ovl[M].flipped < flipped)))
F = M+1;
else
L = M-1;
......@@ -694,7 +707,7 @@ OverlapCache::symmetrizeOverlaps(void) {
// Search for the twin overlap, and if found, we're done. The twin is marked as symmetric in the function.
if (searchForOverlap(_overlaps[rb], _overlapLen[rb], rr)) {
if (searchForOverlap(_overlaps[rb], _overlapLen[rb], rr, _overlaps[rr][oo].flipped)) {
_overlaps[rr][oo].symmetric = true;
continue;
}
......
......@@ -65,6 +65,11 @@ breakSingletonTigs(TigVector &tigs) {
if (utg->ufpath.size() > 1)
continue;
if (OG->isZombie(utg->ufpath[0].ident) == true) {
writeLog("Not breaking sinleton zombie tig %u with read %u\n", ti, utg->ufpath[0].ident);
continue;
}
tigs[ti] = NULL; // Remove the tig from the list
tigs.registerRead(utg->ufpath[0].ident); // Eject the read
delete utg; // Reclaim space
......@@ -125,6 +130,16 @@ placeUnplacedUsingAllOverlaps(TigVector &tigs,
placeReadUsingOverlaps(tigs, NULL, fid, placements, placeRead_fullMatch);
// If all placements are in singletons, allow them. If any placement is to a 'real' tig,
// ignore singleton placements.
bool ignoreSingleton = false;
for (uint32 i=0; i<placements.size(); i++)
if ((placements[i].fCoverage >= 0.99) &&
(tigs[placements[i].tigID]->ufpath.size() > 1))
ignoreSingleton = true;
// Search the placements for the highest expected identity placement using all overlaps in the unitig.
uint32 b = UINT32_MAX;
......@@ -135,7 +150,8 @@ placeUnplacedUsingAllOverlaps(TigVector &tigs,
if (placements[i].fCoverage < 0.99) // Ignore partially placed reads.
continue;
if (tig->ufpath.size() == 1) // Ignore placements in singletons.
if ((ignoreSingleton == true) &&
(tig->ufpath.size() == 1)) // Ignore placements in singletons.
continue;
uint32 bgn = placements[i].position.min();
......
......@@ -119,8 +119,11 @@ populateUnitig(TigVector &tigs,
int32 fi) {
if ((RI->readLength(fi) == 0) || // Skip deleted
(tigs.inUnitig(fi) != 0) || // Skip placed
(OG->isContained(fi) == true)) // Skip contained
(tigs.inUnitig(fi) != 0)) // Skip placed
return;
if ((OG->isContained(fi) == true) && // Skip contained...
(OG->isZombie(fi) == false)) // that aren't zombies.
return;
Unitig *utg = tigs.newUnitig(logFileFlagSet(LOG_BUILD_UNITIG));
......@@ -140,6 +143,22 @@ populateUnitig(TigVector &tigs,
utg->addRead(read, 0, logFileFlagSet(LOG_BUILD_UNITIG));
// If suspicious or a zombie, don't bother trying to extend. In the former
// case, we don't want to extend, and in the latter case, there isn't anything
// to extend.
if (OG->isSuspicious(fi)) {
writeLog("Stopping unitig construction of suspicious read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
return;
}
if (OG->isZombie(fi)) {
writeLog("Stopping unitig construction of zombie read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
return;
}
// Add reads as long as there is a path to follow...from the 3' end of the first read.
BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(fi, false);
......@@ -150,34 +169,6 @@ populateUnitig(TigVector &tigs,
assert(bestedge3->ahang() >= 0);
assert(bestedge3->bhang() >= 0);
// If this read is not covered by the two best overlaps we are finished. We will not follow
// the paths out. This indicates either low coverage, or a chimeric read. If it is low
// coverage, then the best overlaps will be mutual and we'll recover the same path. If it is a
// chimeric read the overlaps will not be mutual and we will skip this read.
//
// The amount of our read that is covered by the two best overlaps is
//
// (readLen + bestedge5->bhang()) + (readLen - bestedge3->ahang())
//
// If that is not significantly longer than the read length, then we will not use this
// read as a seed for unitig construction.
//
if (OG->isSuspicious(fi))
return;
#if 0
uint32 covered = RI->readLength(fi) + bestedge5->bhang() + RI->readLength(fi) - bestedge3->ahang();
// This breaks tigs at 0x best-coverage regions. There might be a contain that spans (joins)
// the two best overlaps to verify the read, but we can't easily tell right now.
if (covered < RI->readLength(fi) + AS_OVERLAP_MIN_LEN / 2) {
writeLog("Stopping unitig construction of suspicious read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
return;
}
#endif
if (logFileFlagSet(LOG_BUILD_UNITIG))
writeLog("Adding 5' edges off of read %d in unitig %d\n",
utg->ufpath.back().ident, utg->id());
......
......@@ -40,6 +40,7 @@
#include "AS_global.H"
#include "AS_BAT_TigVector.H"
#include "AS_BAT_OverlapCache.H"
#include "stddev.H"
......@@ -204,6 +205,13 @@ public:
void cleanUp(void);
// Recompute bgn/end positions using all overlaps.
bool optimize_isCompatible(uint32 ii,
uint32 jj,
BAToverlap &olap,
bool inInit,
bool secondPass,
bool beVerbose);
void optimize_initPlace(uint32 pp,
optPos *op,
optPos *np,
......
......@@ -32,7 +32,7 @@ my $cwd = getcwd();
my $label = "snapshot"; # Automagically set to 'release' for releases.
my $major = "1"; # Bump before release.
my $minor = "7"; # Bump before release.
my $minor = "7.1"; # Bump before release.
my $commits = "0";
my $hash1 = undef; # This from 'git describe'
......
......@@ -60,9 +60,6 @@ Read_Frags(feParameters *G,
uint64 votesLength = 0;
uint64 readsLoaded = 0;
fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 "\n",
G->bgnID, G->endID);
for (uint32 curID=G->bgnID; curID<=G->endID; curID++) {
gkRead *read = gkpStore->gkStore_getRead(curID);
......@@ -74,11 +71,7 @@ Read_Frags(feParameters *G,
sizeof(Vote_Tally_t) * votesLength +
sizeof(Frag_Info_t) * G->readsLen);
fprintf(stderr, "Read_Frags()-- allocate " F_U64 " MB for bases, votes and info, for %u reads of total length " F_U64 " (%.2f MB)\n",
totAlloc >> 20,
G->endID - G->bgnID + 1,
basesLength,
totAlloc / 1024.0 / 1024.0);
fprintf(stderr, "Read_Frags()-- Loading target reads " F_U32 " through " F_U32 " with " F_U64 " bases.\n", G->bgnID, G->endID, basesLength);
G->readBases = new char [basesLength];
G->readVotes = new Vote_Tally_t [votesLength]; // NO constructor, MUST INIT
......@@ -122,6 +115,6 @@ Read_Frags(feParameters *G,
delete readData;
fprintf(stderr, "Read_Frags()-- from " F_U32 " through " F_U32 " -- loaded " F_U64 " bases in " F_U64 " reads.\n",
G->bgnID, G->endID-1, basesLength, readsLoaded);
fprintf(stderr, "Read_Frags()-- %.3f GB for bases, votes and info.\n", totAlloc / 1024.0 / 1024.0 / 1024.0);
fprintf(stderr, "\n");
}
......@@ -41,8 +41,7 @@ Read_Olaps(feParameters *G, gkStore *gkpStore) {
uint64 numolaps = ovs->numOverlapsInRange();
fprintf(stderr, "Read_Olaps()-- loading " F_U64 " overlaps (%.2f MB).\n",
numolaps, sizeof(Olap_Info_t) * numolaps / 1024.0 / 1024.0);
fprintf(stderr, "Read_Olaps()-- Loading " F_U64 " overlaps.\n", numolaps);
G->olaps = new Olap_Info_t [numolaps];
G->olapsLen = 0;
......@@ -68,6 +67,9 @@ Read_Olaps(feParameters *G, gkStore *gkpStore) {
}
delete ovs;
fprintf(stderr, "Read_Olaps()-- %.3f GB for overlaps..\n", sizeof(Olap_Info_t) * numolaps / 1024.0 / 1024.0 / 1024.0);
fprintf(stderr, "\n");
}
......@@ -51,24 +51,21 @@ Output_Corrections(feParameters *G);
// From overlapInCore.C
int
Binomial_Bound(int e, double p, int Start, double Limit);
// Read fragments lo_frag..hi_frag (INCLUSIVE) from store and save the ids and sequences of those
// with overlaps to fragments in global Frag .
static
void
Extract_Needed_Frags(feParameters *G,
extractReads(feParameters *G,
gkStore *gkpStore,
uint32 loID,
uint32 hiID,
Frag_List_t *fl,
uint64 &nextOlap) {
// Clear the buffer.
fl->readsLen = 0;
fl->basesLen = 0;
// The original converted to lowercase, and made non-acgt be 'a'.
char filter[256];
......@@ -81,35 +78,44 @@ Extract_Needed_Frags(feParameters *G,
filter['G'] = filter['g'] = 'g';
filter['T'] = filter['t'] = 't';
// Count the amount of stuff we're loading.
// Return if we've exhausted the overlaps.
fl->readsLen = 0;
fl->basesLen = 0;
if (nextOlap >= G->olapsLen)
return;
// Count the amount of stuff we're loading.
uint64 lastOlap = nextOlap;
uint32 ii = 0; // Index into reads arrays
uint32 fi = G->olaps[lastOlap].b_iid; // Actual ID we're extracting
uint32 loID = G->olaps[lastOlap].b_iid; // Actual ID we're extracting
uint32 hiID = loID;
uint64 maxBases = 512 * 1024 * 1024;
assert(loID <= fi);
// Find the highest read ID that we can load without exceeding maxBases.
fprintf(stderr, "\n");
fprintf(stderr, "Extract_Needed_Frags()-- Loading used reads between " F_U32 " and " F_U32 ", at overlap " F_U64 ".\n", fi, hiID, lastOlap);
while ((fl->basesLen < maxBases) &&
(lastOlap < G->olapsLen)) {
hiID = G->olaps[lastOlap].b_iid; // Grab the ID of the overlap we're at.
while (fi <= hiID) {
gkRead *read = gkpStore->gkStore_getRead(fi);
gkRead *read = gkpStore->gkStore_getRead(hiID); // Grab that read.
fl->readsLen += 1;
fl->readsLen += 1; // Add the read to our set.
fl->basesLen += read->gkRead_sequenceLength() + 1;
// Advance to the next overlap
lastOlap++;
while ((lastOlap < G->olapsLen) && (G->olaps[lastOlap].b_iid == fi))
lastOlap++;
fi = (lastOlap < G->olapsLen) ? G->olaps[lastOlap].b_iid : hiID + 1;
lastOlap++; // Advance to the next overlap
while ((lastOlap < G->olapsLen) && //
(G->olaps[lastOlap].b_iid == hiID)) // If we've exceeded the max size or hit the last overlap,
lastOlap++; // the loop will stop on the next iteration.
}
fprintf(stderr, "Extract_Needed_Frags()-- Loading reads for overlaps " F_U64 " to " F_U64 " (reads " F_U32 " bases " F_U64 ")\n", nextOlap, lastOlap, fl->readsLen, fl->basesLen);
// If nothing to load, just return.
if (fl->readsLen == 0)
return;
// Report what we're going to do.
fprintf(stderr, "extractReads()-- Loading reads " F_U32 " to " F_U32 " (" F_U32 " reads with " F_U64 " bases) overlaps " F_U64 " through " F_U64 ".\n",
loID, hiID, fl->readsLen, fl->basesLen, nextOlap, lastOlap);
// Ensure there is space.
......@@ -117,42 +123,31 @@ Extract_Needed_Frags(feParameters *G,
delete [] fl->readIDs;
delete [] fl->readBases;
//fprintf(stderr, "Extract_Needed_Frags()-- realloc reads from " F_U32 " to " F_U32 "\n", fl->readsMax, 12 * fl->readsLen / 10);
fl->readIDs = new uint32 [12 * fl->readsLen / 10];
fl->readBases = new char * [12 * fl->readsLen / 10];
fl->readsMax = 12 * fl->readsLen / 10;
fl->readIDs = new uint32 [fl->readsMax];
fl->readBases = new char * [fl->readsMax];
}
if (fl->basesMax < fl->basesLen) {
delete [] fl->bases;
//fprintf(stderr, "Extract_Needed_Frags()-- realloc bases from " F_U64 " to " F_U64 "\n", fl->basesMax, 12 * fl->basesLen / 10);
fl->bases = new char [12 * fl->basesLen / 10];
fl->basesMax = 12 * fl->basesLen / 10;
fl->bases = new char [fl->basesMax];
}
// Load. This is complicated by loading only the reads that have overlaps we care about.
fl->readsLen = 0;
fl->basesLen = 0;
// Load the sequence data for reads loID to hiID, as long as the read has an overlap.
gkReadData *readData = new gkReadData;
ii = 0;
fi = G->olaps[nextOlap].b_iid;
assert(loID <= fi);
fl->readsLen = 0;
fl->basesLen = 0;
while (fi <= hiID) {
gkRead *read = gkpStore->gkStore_getRead(fi);
while ((loID <= hiID) &&
(nextOlap < G->olapsLen)) {
gkRead *read = gkpStore->gkStore_getRead(loID);
fl->readIDs[ii] = fi;
fl->readBases[ii] = fl->bases + fl->basesLen;
fl->basesLen += read->gkRead_sequenceLength() + 1;
fl->readIDs[fl->readsLen] = loID; // Save the ID of _this_ read.
fl->readBases[fl->readsLen] = fl->bases + fl->basesLen; // Set the data pointer to where this read should start.
gkpStore->gkStore_loadReadData(read, readData);
......@@ -160,31 +155,25 @@ Extract_Needed_Frags(feParameters *G,
char *readBases = readData->gkReadData_getSequence();
for (uint32 bb=0; bb<readLen; bb++)
fl->readBases[ii][bb] = filter[readBases[bb]];
fl->readBases[fl->readsLen][bb] = filter[readBases[bb]];
fl->readBases[ii][readLen] = 0; // All good reads end.
fl->readBases[fl->readsLen][readLen] = 0; // All good reads end.
ii++;
fl->basesLen += read->gkRead_sequenceLength() + 1; // Update basesLen to account for this read.
fl->readsLen += 1; // And note that we loaded a read.
// Advance to the next overlap.
nextOlap++;
while ((nextOlap < G->olapsLen) && (G->olaps[nextOlap].b_iid == fi))
nextOlap++; // Advance past all the overlaps for this read.
while ((nextOlap < G->olapsLen) &&
(G->olaps[nextOlap].b_iid == loID))
nextOlap++;
fi = (nextOlap < G->olapsLen) ? G->olaps[nextOlap].b_iid : hiID + 1;
if (nextOlap < G->olapsLen) // If we have valid overlap, grab the read ID.
loID = G->olaps[nextOlap].b_iid; // If we don't have a valid overlap, the loop will stop.
}
delete readData;
fl->readsLen = ii;
if (fl->readsLen > 0)
fprintf(stderr, "Extract_Needed_Frags()-- Loaded " F_U32 " reads (%.4f%%). Loaded IDs " F_U32 " through " F_U32 ".\n",
fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID),
fl->readIDs[0], fl->readIDs[fl->readsLen-1]);
else
fprintf(stderr, "Extract_Needed_Frags()-- Loaded " F_U32 " reads (%.4f%%).\n",
fl->readsLen, 100.0 * fl->readsLen / (hiID - 1 - loID));
fprintf(stderr, "extractReads()-- Loaded.\n");
}
......@@ -262,8 +251,6 @@ Threaded_Stream_Old_Frags(feParameters *G,
for (uint32 i=0; i<G->numThreads; i++) {
thread_wa[i].thread_id = i;
thread_wa[i].loID = 0;
thread_wa[i].hiID = 0;
thread_wa[i].nextOlap = 0;
thread_wa[i].G = G;
thread_wa[i].frag_list = NULL;
......@@ -278,14 +265,6 @@ Threaded_Stream_Old_Frags(feParameters *G,
thread_wa[i].ped.initialize(G, G->errorRate);
}
uint32 loID = G->olaps[0].b_iid;
uint32 hiID = loID + FRAGS_PER_BATCH - 1;
uint32 endID = G->olaps[G->olapsLen - 1].b_iid;
if (hiID > endID)
hiID = endID;
uint64 frstOlap = 0;
uint64 nextOlap = 0;
......@@ -295,15 +274,13 @@ Threaded_Stream_Old_Frags(feParameters *G,
Frag_List_t *curr_frag_list = &frag_list_1;
Frag_List_t *next_frag_list = &frag_list_2;
Extract_Needed_Frags(G, gkpStore, loID, hiID, curr_frag_list, nextOlap);
extractReads(G, gkpStore, curr_frag_list, nextOlap);
while (loID <= endID) {
while (curr_frag_list->readsLen > 0) {
// Process fragments in curr_frag_list in background
for (uint32 i=0; i<G->numThreads; i++) {
thread_wa[i].loID = loID;
thread_wa[i].hiID = hiID;
thread_wa[i].nextOlap = frstOlap;
thread_wa[i].frag_list = curr_frag_list;
......@@ -315,21 +292,14 @@ Threaded_Stream_Old_Frags(feParameters *G,
// Read next batch of fragments
loID = hiID + 1;
if (loID <= endID) {
hiID = loID + FRAGS_PER_BATCH - 1;
if (hiID > endID)
hiID = endID;
frstOlap = nextOlap;
Extract_Needed_Frags(G, gkpStore, loID, hiID, next_frag_list, nextOlap);
}
extractReads(G, gkpStore, next_frag_list, nextOlap);
// Wait for background processing to finish
fprintf(stderr, "processReads()-- Waiting for compute.\n");
for (uint32 i=0; i<G->numThreads; i++) {
void *ptr;
......
......@@ -72,10 +72,6 @@ using namespace std;
// Factor by which to grow memory in olap array when reading it
#define EXPANSION_FACTOR 1.4
// Number of old fragments to read into memory-based fragment
// store at a time for processing
#define FRAGS_PER_BATCH 100000
// Longest name allowed for a file in the overlap store
#define MAX_FILENAME_LEN 1000
......@@ -273,8 +269,6 @@ public:
struct Thread_Work_Area_t {
int32 thread_id;
uint32 loID;
uint32 hiID;
uint64 nextOlap;
feParameters *G;
......
......@@ -682,7 +682,7 @@ sub setOverlapDefaults ($$$) {
setOverlapDefault($tag, "OvlHashBlockLength", undef, "Amount of sequence (bp) to load into the overlap hash table");
setOverlapDefault($tag, "OvlRefBlockLength", undef, "Amount of sequence (bp) to search against the hash table per batch");
setOverlapDefault($tag, "OvlHashBits", ($tag eq "cor") ? 18 : 23, "Width of the kmer hash. Width 22=1gb, 23=2gb, 24=4gb, 25=8gb. Plus 10b per ${tag}OvlHashBlockLength");
setOverlapDefault($tag, "OvlHashBits", undef, "Width of the kmer hash. Width 22=1gb, 23=2gb, 24=4gb, 25=8gb. Plus 10b per ${tag}OvlHashBlockLength");
setOverlapDefault($tag, "OvlHashLoad", 0.75, "Maximum hash table load. If set too high, table lookups are inefficent; if too low, search overhead dominates run time; default 0.75");
setOverlapDefault($tag, "OvlMerSize", ($tag eq "cor") ? 19 : 22, "K-mer size for seeds in overlaps");
setOverlapDefault($tag, "OvlMerThreshold", "auto", "K-mer frequency threshold; mers more frequent than this count are ignored; default 'auto'");
......