Skip to content
Commits on Source (7)
......@@ -25,7 +25,6 @@ Files-Excluded:
src/index
src/intronerator
src/isPcr
src/j*
src/k*
src/m*
src/o*
......
src/lib/tests/bin/*/* usr/bin
......@@ -13,11 +13,12 @@ include /usr/share/dpkg/default.mk
# DEB_DISTRIBUTION: the distribution(s) listed in the current entry of debian/changelog
# for hardening you might like to uncomment this:
# export DEB_BUILD_MAINT_OPTIONS=hardening=+all
export DEB_BUILD_MAINT_OPTIONS=hardening=+all
%:
dh $@ --sourcedirectory=src/lib --no-parallel
override_dh_auto_build:
make --directory=src/htslib
make --directory=src/jkOwnLib
dh_auto_build
The modules in this library generally are supporting
the BLAT or WABA programs. These modules are all
copyrighted Jim Kent. License hereby is granted for
personal, academic and non-profit purposes. Commercial
use is permitted only by explicit agreement with Jim Kent.
/* bandExt - banded Smith-Waterman extension of alignments.
* An aligner might first find perfectly matching hits of
* a small size, then extend these hits as far as possible
* while the sequences perfectly match, then call on routines
* in this module to do further extensions allowing small
* gaps and mismatches.
*
* This version of the algorithm uses affine gap scorea and
* has the neat feature that the band can wander around.
* When a score exceeds any previous score, the band will be
* recentered around the highest scoring position. */
/* Copyright 2003-5 Jim Kent. All rights reserved. */
#include "common.h"
#include "dnaseq.h"
#include "axt.h"
#include "fuzzyFind.h"
#include "localmem.h"
#include "bandExt.h"
/* Definitions for traceback byte. This is encoded as so:
* xxxxLUMM
* where the x's are not used. The L bit is set if parent of the left insert
* is also a left insert (otherwise it is a match). Likewise the U bit is set
* if the parent of an up insert is also an up insert. The MM bits contain the
* parent of the match state. */
#define mpMatch 1
#define mpUp 2
#define mpLeft 3
#define mpMask 3
#define upExt (1<<2)
#define lpExt (1<<3)
struct score
/* Info on score in our three states. */
{
int match;
int up;
int left;
};
boolean bandExt(boolean global, struct axtScoreScheme *ss, int maxInsert,
char *aStart, int aSize, char *bStart, int bSize, int dir,
int symAlloc, int *retSymCount, char *retSymA, char *retSymB,
int *retStartA, int *retStartB)
/* Try to extend an alignment from aStart/bStart onwards.
* Set maxInsert to the maximum gap size allowed. 3 is often
* a good choice. aStart/aSize bStart/bSize describe the
* sequence to extend through (not including any of the existing
* alignment. Set dir = 1 for forward extension, dir = -1 for backwards.
* retSymA and retSymB should point to arrays of characters of
* symAlloc size. symAlloc needs to be aSize*2 or so. The
* alignment is returned in the various ret values. The function
* overall returns TRUE if an extension occurred, FALSE if not. */
{
int i; /* A humble index or two. */
int *bOffsets = NULL; /* Offset of top of band. */
UBYTE **parents = NULL; /* Array of parent positions. */
struct score *curScores = NULL; /* Scores for current column. */
struct score *prevScores = NULL; /* Scores for previous column. */
struct score *swapScores; /* Helps to swap cur & prev column. */
int bandSize = 2*maxInsert + 1; /* Size of band including middle */
int maxIns1 = maxInsert + 1; /* Max insert plus one. */
int bandPlus = bandSize + 2*maxIns1; /* Band plus sentinels on either end. */
int bestScore = 0; /* Best score so far. */
int aPos, aBestPos = -1; /* Current and best position in a. */
int bPos, bBestPos = -1; /* Current and best position in b. */
int bandCenter = 0; /* b Coordinate of current band center. */
int colShift = 1; /* Vertical shift between this column and previous. */
int prevScoreOffset; /* Offset into previous score column. */
int curScoreOffset; /* Offset into current score column. */
int symCount = 0; /* Size of alignment and size allocated for it. */
int gapOpen = ss->gapOpen; /* First base in gap costs this. */
int gapExtend = ss->gapExtend; /* Extra bases in gap cost this. */
int badScore = -gapOpen*100; /* A score bad enough no one will want to link with us. */
int maxDrop = gapOpen + gapExtend*maxInsert; /* Max score drop allowed before giving up. */
int midScoreOff; /* Offset to middle of scoring array. */
struct lm *lm; /* Local memory pool. */
boolean didExt = FALSE;
int initGapScore = -gapOpen;
/* For reverse direction just reverse bytes here and there. It's
* a lot easier than the alternative and doesn't cost much time in
* the global scheme of things. */
if (dir < 0)
{
reverseBytes(aStart, aSize);
reverseBytes(bStart, bSize);
}
#ifdef DEBUG
uglyf("bandExt: dir %d, aStart %d, aSize %d, symAlloc %d\n", dir,
aSize, bSize, symAlloc);
mustWrite(uglyOut, aStart, aSize);
uglyf("\n");
mustWrite(uglyOut, bStart, bSize);
uglyf("\n");
#endif /* DEBUG */
/* Make up a local mem structure big enough for everything.
* This is just a minor, perhaps misguided speed tweak to
* avoid multiple mallocs. */
lm = lmInit(
sizeof(bOffsets[0])*aSize +
sizeof(parents[0])*bandSize +
bandSize*(sizeof(parents[0][0])*aSize) +
sizeof(curScores[0]) * bandPlus +
sizeof(prevScores[0]) * bandPlus );
/* Allocate data structures out of local memory pool. */
lmAllocArray(lm, bOffsets, aSize);
lmAllocArray(lm, parents, bandSize);
for (i=0; i<bandSize; ++i)
lmAllocArray(lm, parents[i], aSize);
lmAllocArray(lm, curScores, bandPlus);
lmAllocArray(lm, prevScores, bandPlus);
/* Set up scoring arrays so that stuff outside of the band boundary
* looks bad. There will be maxIns+1 of these sentinel values at
* the start and at the beginning. */
for (i=0; i<bandPlus; ++i)
{
struct score *cs = &curScores[i];
cs->match = cs->up = cs->left = badScore;
cs = &prevScores[i];
cs->match = cs->up = cs->left = badScore;
}
/* Set up scoring array so that extending without an initial insert
* looks relatively good. */
midScoreOff = 1 + 2 * maxInsert;
prevScores[midScoreOff].match = 0;
/* Set up scoring array so that initial inserts of up to maxInsert
* are allowed but penalized appropriately. */
{
int score = -gapOpen;
for (i=0; i<maxInsert; ++i)
{
prevScores[midScoreOff+i].up = score;
score -= gapExtend;
}
}
for (aPos=0; aPos < aSize; ++aPos)
{
char aBase = aStart[aPos];
int *matRow = ss->matrix[(int)aBase];
int bestColScore = badScore;
int bestColPos = -1;
int colTop = bandCenter - maxInsert;
int colBottom = bandCenter + maxIns1;
if (colTop < 0) colTop = 0;
if (colBottom > bSize) colBottom = bSize;
curScoreOffset = maxIns1 + colTop - (bandCenter - maxInsert);
prevScoreOffset = curScoreOffset + colShift;
#ifdef DEBUG
uglyf("curScoreOffset %d, maxIns %d, prevScoreOffset %d\n", curScoreOffset, maxInsert, prevScoreOffset);
#endif /* DEBUG */
/* At top we deal with possibility of initial insert that
* comes in at this point, unless the band has wandered off. */
if (aPos < maxInsert)
{
curScores[curScoreOffset-1].up = initGapScore;
initGapScore -= gapExtend;
}
else
curScores[curScoreOffset-1].up = badScore;
for (bPos = colTop; bPos < colBottom; ++bPos)
{
UBYTE parent;
struct score *curScore = &curScores[curScoreOffset];
#ifdef DEBUG
uglyf("aPos %d, bPos %d, %c vs %c\n", aPos, bPos, aBase, bStart[bPos]);
uglyf(" diag[%d %d %d], left[%d %d %d], up[%d %d %d]\n",
prevScores[prevScoreOffset-1].match,
prevScores[prevScoreOffset-1].left,
prevScores[prevScoreOffset-1].up,
prevScores[prevScoreOffset].match,
prevScores[prevScoreOffset].left,
prevScores[prevScoreOffset].up,
curScores[curScoreOffset-1].match,
curScores[curScoreOffset-1].left,
curScores[curScoreOffset-1].up);
#endif /* DEBUG */
/* Handle ways into the matching state and record if it's
* best score so far. */
{
int match = matRow[(int)bStart[bPos]];
struct score *a = &prevScores[prevScoreOffset-1];
int diagScore = a->match;
int leftScore = a->left;
int upScore = a->up;
int score;
if (diagScore >= leftScore && diagScore >= upScore)
{
score = diagScore + match;
parent = mpMatch;
}
else if (leftScore > upScore)
{
score = leftScore + match;
parent = mpLeft;
}
else
{
score = upScore + match;
parent = mpUp;
}
curScore->match = score;
if (score > bestColScore)
{
bestColScore = score;
bestColPos = bPos;
}
}
/* Handle ways into left gap state. */
{
struct score *a = &prevScores[prevScoreOffset];
int extScore = a->left - gapExtend;
int openScore = a->match - gapOpen;
if (extScore >= openScore)
{
parent |= lpExt;
curScore->left = extScore;
}
else
{
curScore->left = openScore;
}
}
/* Handle ways into up gap state. */
{
struct score *a = &curScore[-1];
int extScore = a->up - gapExtend;
int openScore = a->match - gapOpen;
if (extScore >= openScore)
{
parent |= upExt;
curScore->up = extScore;
}
else
{
curScore->up = openScore;
}
}
parents[curScoreOffset - maxIns1][aPos] = parent;
#ifdef DEBUG
uglyf(" cur [%d %d %d]\n",
curScore->match,
curScore->left,
curScore->up);
#endif /* DEBUG */
/* Advance to next row in column. */
curScoreOffset += 1;
prevScoreOffset += 1;
}
/* If this column's score is best so far make note of
* it and shift things so that the matching bases at the
* best score are centered in the band. This allows the
* band to wander where appropriate. Having the band shift
* to the best column position if it is not the best score
* so far doesn't work so well though. */
if (bestScore < bestColScore)
{
bestScore = bestColScore;
aBestPos = aPos;
bBestPos = bestColPos;
colShift = (bestColPos + 1) - bandCenter;
}
else if (bestColScore < bestScore - maxDrop)
{
if (!global)
break;
}
else
{
colShift = 1;
}
/* Keep track of vertical offset of this column, and
* set up offset of next column. */
bOffsets[aPos] = bandCenter;
bandCenter += colShift;
/* Swap scoring arrays so current becomes next iteration's previous. */
swapScores = curScores;
curScores = prevScores;
prevScores = swapScores;
}
/* Trace back. */
#ifdef DEBUG
uglyf("traceback: bestScore %d, aBestPos %d, bBestPos %d\n", bestScore, aBestPos, bBestPos);
#endif /* DEBUG */
if (global || bestScore > 0)
{
boolean upState = FALSE;
boolean leftState = FALSE;
didExt = TRUE;
if (global)
{
aPos = aSize-1;
bPos = bSize-1;
}
else
{
aPos = aBestPos;
bPos = bBestPos;
}
for (;;)
{
int pOffset;
UBYTE parent;
pOffset = bPos - bOffsets[aPos] + maxInsert;
if (pOffset < 0) pOffset = 0;
// FIXME: there may be some cases near beginning where
// pOffset is not right. Clamping it at 0 prevents a crash
// and produces correct behavior in many cases. However
// I'm thinking a better fix would be to have bOffsets follow
// colTop rather than bandCenter somehow. Exactly how is
// beyond me at the moment.
if (pOffset >= bandSize)
{
#ifdef DEBUG
uglyf("bPos %d, aPos %d, bOffsets[aPos] %d, maxInsert %d\n", bPos, aPos, bOffsets[aPos], maxInsert);
uglyf("pOffset = %d, bandSize = %d\n", pOffset, bandSize);
uglyf("aSize %d, bSize %d\n", aSize, bSize);
faWriteNext(uglyOut, "uglyA", aStart, aSize);
faWriteNext(uglyOut, "uglyB", bStart, bSize);
#endif /* DEBUG */
assert(global);
return FALSE;
}
parent = parents[pOffset][aPos];
#ifdef DEBUG
uglyf("aPos %d, bPos %d, parent %d, pMask %d upState %d, leftState %d\n", aPos, bPos, parent, parent & mpMask, upState, leftState);
#endif /* DEBUG */
if (upState)
{
retSymA[symCount] = '-';
retSymB[symCount] = bStart[bPos];
bPos -= 1;
upState = (parent&upExt);
}
else if (leftState)
{
retSymA[symCount] = aStart[aPos];
retSymB[symCount] = '-';
aPos -= 1;
leftState = (parent&lpExt);
}
else
{
retSymA[symCount] = aStart[aPos];
retSymB[symCount] = bStart[bPos];
aPos -= 1;
bPos -= 1;
parent &= mpMask;
if (parent == mpUp)
upState = TRUE;
else if (parent == mpLeft)
leftState = TRUE;
}
if (++symCount >= symAlloc)
errAbort("unsufficient symAlloc in bandExt");
if (aPos < 0 || bPos < 0)
{
while (aPos >= 0)
{
retSymA[symCount] = aStart[aPos];
retSymB[symCount] = '-';
if (++symCount >= symAlloc)
errAbort("unsufficient symAlloc in bandExt");
--aPos;
}
while (bPos >= 0)
{
retSymA[symCount] = '-';
retSymB[symCount] = bStart[bPos];
if (++symCount >= symAlloc)
errAbort("unsufficient symAlloc in bandExt");
--bPos;
}
break;
}
}
if (dir > 0)
{
reverseBytes(retSymA, symCount);
reverseBytes(retSymB, symCount);
}
retSymA[symCount] = 0;
retSymB[symCount] = 0;
}
else
{
retSymA[0] = retSymB[0] = 0;
}
if (dir < 0)
{
reverseBytes(aStart, aSize);
reverseBytes(bStart, bSize);
}
/* Clean up, set return values and go home */
lmCleanup(&lm);
if (retStartA != NULL) *retStartA = aBestPos;
if (retStartB != NULL) *retStartB = bBestPos;
*retSymCount = symCount;
return didExt;
}
struct ffAli *bandExtFf(
struct lm *lm, /* Local memory pool, NULL to use global allocation for ff */
struct axtScoreScheme *ss, /* Scoring scheme to use. */
int maxInsert, /* Maximum number of inserts to allow. */
struct ffAli *origFf, /* Alignment block to extend. */
char *nStart, char *nEnd, /* Bounds of region to extend through */
char *hStart, char *hEnd, /* Bounds of region to extend through */
int dir, /* +1 to extend end, -1 to extend start */
int maxExt) /* Maximum length of extension. */
/* Extend a gapless alignment in one direction. Returns extending
* ffAlis, not linked into origFf, or NULL if no extension possible. */
{
int symAlloc = 2*maxExt;
char *symBuf = needMem(4*maxExt);
char *nBuf = symBuf;
char *hBuf = symBuf + symAlloc;
char *ns, *hs;
int symCount, nExt, hExt;
int nSize, hSize;
struct ffAli *ffList = NULL;
boolean gotExt;
if (dir > 0)
{
nSize = nEnd - origFf->nEnd;
hSize = hEnd - origFf->hEnd;
if (nSize > maxExt) nSize = maxExt;
if (hSize > maxExt) hSize = maxExt;
ns = origFf->nEnd;
hs = origFf->hEnd;
}
else
{
nSize = origFf->nStart - nStart;
hSize = origFf->hStart - hStart;
if (nSize > maxExt) nSize = maxExt;
if (hSize > maxExt) hSize = maxExt;
ns = origFf->nStart - nSize;
hs = origFf->hStart - hSize;
}
gotExt = bandExt(FALSE, ss, maxInsert, ns, nSize, hs, hSize, dir,
symAlloc, &symCount, nBuf, hBuf, &nExt, &hExt);
if (gotExt)
{
char *nExtStart, *hExtStart;
if (dir > 0)
{
nExtStart = ns;
hExtStart = hs;
}
else
{
nExtStart = origFf->nStart - nExt - 1;
hExtStart = origFf->hStart - hExt - 1;
}
ffList = ffAliFromSym(symCount, nBuf, hBuf, lm, nExtStart, hExtStart);
}
freeMem(symBuf);
return ffList;
}
This diff is collapsed.
/* ffAliHelp - Helper routines for things that produce (rather than just
* consume) ffAli type alignments. */
/* Copyright 2000-2003 Jim Kent. All rights reserved. */
#include "common.h"
#include "fuzzyFind.h"
#include "dnaseq.h"
void ffCat(struct ffAli **pA, struct ffAli **pB)
/* Concatenate B to the end of A. Eat up second list
* in process. */
{
struct ffAli *a = *pA;
struct ffAli *b = *pB;
/* If list to add is empty our job is real easy. */
if (b == NULL)
return;
/* If list to add into is empty, then just switch in the
* second list. */
if (a == NULL)
{
*pA = *pB;
*pB = NULL;
return;
}
/* Neither list empty. Find rightmost element of first list
* and cross-link it with leftmost element of second list. */
while (a->right != NULL) a = a->right;
b->left = a;
a->right = b;
*pB = NULL;
}
void ffAliSort(struct ffAli **pList, int (*compare )(const void *elem1, const void *elem2))
/* Sort a doubly linked list of ffAlis. */
{
/* Get head of list and handle easy special empty case. */
struct ffAli *r = *pList;
if (r == NULL) return;
/* Since first pointer is "left", in order to reuse slSort, have
* to jump through some minor hoops. First go to right end of list,
* then sort it. */
while (r->right) r = r->right;
slSort(&r, compare);
/* We're sorted, but our right links are all broken. Fix this. */
slReverse(&r);
r = ffMakeRightLinks(r);
*pList = r;
}
int ffCmpHitsHayFirst(const void *va, const void *vb)
/* Compare function to sort hit array by ascending
* target offset followed by ascending query offset. */
{
const struct ffAli *a = *((struct ffAli **)va);
const struct ffAli *b = *((struct ffAli **)vb);
int diff;
if ((diff = a->hStart - b->hStart) != 0)
return diff;
return a->nStart - b->nStart;
}
int ffCmpHitsNeedleFirst(const void *va, const void *vb)
/* Compare function to sort hit array by ascending
* query offset followed by ascending target offset. */
{
const struct ffAli *a = *((struct ffAli **)va);
const struct ffAli *b = *((struct ffAli **)vb);
int diff;
if ((diff = a->nStart - b->nStart) != 0)
return diff;
return a->hStart - b->hStart;
}
void ffExpandExactRight(struct ffAli *ali, DNA *needleEnd, DNA *hayEnd)
/* Expand aligned segment to right as far as can exactly. */
{
DNA *nEnd = ali->nEnd;
DNA *hEnd = ali->hEnd;
while (nEnd < needleEnd && hEnd < hayEnd)
{
if (*nEnd != *hEnd)
break;
nEnd += 1;
hEnd += 1;
}
ali->nEnd = nEnd;
ali->hEnd = hEnd;
return;
}
void ffExpandExactLeft(struct ffAli *ali, DNA *needleStart, DNA *hayStart)
/* Expand aligned segment to left as far as can exactly. */
{
DNA *nStart = ali->nStart-1;
DNA *hStart = ali->hStart-1;
while (nStart >= needleStart && hStart >= hayStart)
{
if (*nStart != *hStart)
break;
nStart -= 1;
hStart -= 1;
}
ali->nStart = nStart + 1;
ali->hStart = hStart + 1;
return;
}
struct ffAli *ffMergeClose(struct ffAli *aliList,
DNA *needleStart, DNA *hayStart)
/* Remove overlapping areas needle in alignment. Assumes ali is sorted on
* ascending nStart field. Also merge perfectly abutting neighbors or
* ones that could be merged at the expense of just a few mismatches.*/
{
struct ffAli *mid, *ali;
int closeEnough = -3;
if (aliList == NULL)
return NULL;
for (mid = aliList->right; mid != NULL; mid = mid->right)
{
for (ali = aliList; ali != mid; ali = ali->right)
{
char *nStart, *nEnd;
int nOverlap;
nStart = max(ali->nStart, mid->nStart);
nEnd = min(ali->nEnd, mid->nStart);
nOverlap = nEnd - nStart;
/* Overlap or perfectly abut in needle, and needle/hay
* offset the same. */
if (nOverlap >= closeEnough)
{
int aliDiag = (ali->nStart - needleStart) - (ali->hStart - hayStart);
int midDiag = (mid->nStart - needleStart) - (mid->hStart - hayStart);
if (aliDiag == midDiag)
{
/* Make mid encompass both, and make ali empty. */
mid->nStart = min(ali->nStart, mid->nStart);
mid->nEnd = max(ali->nEnd, mid->nEnd);
mid->hStart = min(ali->hStart, mid->hStart);
mid->hEnd = max(ali->hEnd, mid->hEnd);
ali->hStart = ali->hEnd = mid->hStart;
ali->nEnd = ali->nStart = mid->nStart;;
}
}
}
}
aliList = ffRemoveEmptyAlis(aliList, TRUE);
return aliList;
}
int ffScoreIntron(DNA a, DNA b, DNA y, DNA z, int orientation)
/* Return a better score the closer an intron is to
* consensus. */
{
int score = 0;
int revScore = 0;
if (orientation >= 0)
{
if (a == 'g' || a == 'G') ++score;
if (b == 't' || b == 'T') ++score;
if (y == 'a' || y == 'A') ++score;
if (z == 'g' || z == 'G') ++score;
}
if (orientation <= 0)
{
if (a == 'c' || a == 'C') ++revScore;
if (b == 't' || b == 'T') ++revScore;
if (y == 'a' || y == 'A') ++revScore;
if (z == 'c' || z == 'C') ++revScore;
}
return score > revScore ? score : revScore;
}
static int slideIntron(struct ffAli *left, struct ffAli *right, int orientation)
/* Slides space between alignments if possible to match
* intron consensus better. Returns how much it slid intron. */
{
DNA *nLeft = left->nEnd;
DNA *hLeft = left->hEnd;
DNA *nRight = right->nStart;
DNA *hRight = right->hStart;
DNA nl, nr, hl, hr;
DNA *nLeftEnd = left->nStart;
DNA *nRightEnd = right->nEnd;
DNA *nBestLeft = NULL;
int bestScore = -0x7fffffff;
int curScore;
int offset;
if (hRight-hLeft < 4) /* Too short to be an intron. */
return 0;
if (nRight-nLeft > 2) /* Too big of a gap to be an intron. */
return 0;
/* Slide as far to the left as possible without inserting mismatches. */
while (nLeft > nLeftEnd)
{
nl = nLeft[-1];
hl = hLeft[-1];
nr = nRight[-1];
hr = hRight[-1];
if (!(nl == 'n' && nr == 'n')) /* N's in needle freely slide. */
{
if (nr != hr)
break;
}
nLeft -= 1;
hLeft -= 1;
nRight -= 1;
hRight -= 1;
}
/* Slide as far to the right as possible computing
intron score as you go. */
while (nRight < nRightEnd)
{
curScore = ffScoreIntron(hLeft[0], hLeft[1], hRight[-2], hRight[-1], orientation);
if (curScore > bestScore)
{
bestScore = curScore;
nBestLeft = nLeft;
}
nl = nLeft[0];
hl = hLeft[0];
if (nl != 'n' && nl != hl)
break;
nr = nRight[0];
hr = hRight[0];
nLeft += 1;
hLeft += 1;
nRight += 1;
hRight += 1;
}
if (nBestLeft == NULL)
return 0;
offset = nBestLeft - left->nEnd;
if (offset == 0)
return offset;
left->nEnd += offset;
left->hEnd += offset;
right->nStart += offset;
right->hStart += offset;
return offset;
}
boolean ffSlideOrientedIntrons(struct ffAli *ali, int orient)
/* Slide introns (or spaces between aligned blocks)
* to match consensus on given strand. */
{
struct ffAli *left = ali, *right;
boolean slid = FALSE;
if (left == NULL)
return FALSE;
while((right = left->right) != NULL)
{
if (slideIntron(left, right, orient))
slid = TRUE;
left = right;
}
return slid;
}
boolean ffSlideIntrons(struct ffAli *ali)
/* Slide introns (or spaces between aligned blocks)
* to match consensus. Return TRUE if any slid. */
{
int orient = ffIntronOrientation(ali);
return ffSlideOrientedIntrons(ali, orient);
}
struct ffAli *ffRemoveEmptyAlis(struct ffAli *ali, boolean doFree)
/* Remove empty blocks from list. Optionally free empties too. */
{
struct ffAli *leftAli;
struct ffAli *startAli;
struct ffAli *rightAli;
if (ali == NULL)
return NULL;
/* Figure out left most non-empty ali. */
while (ali->left)
ali = ali->left;
while (ali)
{
/* If current ali is empty, chuck it out. */
if (ali->nEnd <= ali->nStart || ali->hEnd <= ali->hStart)
{
struct ffAli *empty = ali;
ali = ali->right;
if (doFree) freeMem(empty);
}
else
break;
}
if (ali == NULL)
return NULL;
ali->left = NULL;
/* Get rid of empty middle alis. */
startAli = leftAli = ali;
ali = ali->right;
while (ali)
{
rightAli = ali->right;
if (ali->nEnd <= ali->nStart || ali->hEnd <= ali->hStart)
{
leftAli->right = rightAli;
if (rightAli != NULL)
rightAli->left = leftAli;
if (doFree) freeMem(ali);
}
else
{
leftAli = ali;
}
ali = rightAli;
}
return startAli;
}
struct ffAli *ffMergeHayOverlaps(struct ffAli *ali)
/* Remove overlaps in haystack that perfectly abut in needle.
* These are transformed into perfectly abutting haystacks
* that have a gap in the needle. */
{
struct ffAli *a = NULL;
struct ffAli *leftA = NULL;
if (ali == NULL)
return NULL;
a = ali;
for (;;)
{
int nOverlap;
int hOverlap;
int aSize;
/* Advance to next ali */
leftA = a;
a = a->right;
if (a == NULL)
break;
nOverlap = leftA->nEnd - a->nStart;
hOverlap = leftA->hEnd - a->hStart;
aSize = a->nEnd - a->nStart;
if (hOverlap > 0 && hOverlap < aSize && nOverlap <= 0)
{
a->hStart += hOverlap;
a->nStart += hOverlap;
}
}
return ali;
}
struct ffAli *ffMergeNeedleAlis(struct ffAli *ali, boolean doFree)
/* Remove overlapping areas needle in alignment. Assumes ali is sorted on
* ascending nStart field. Also merge perfectly abutting neighbors.*/
{
struct ffAli *a = NULL;
struct ffAli *leftA = NULL;
struct ffAli *rightA;
if (ali == NULL)
return NULL;
rightA = ali;
for (;;)
{
/* Advance to next ali */
leftA = a;
a = rightA;
if (a == NULL)
break;
rightA = a->right;
/* See if can merge current alignment into left one. */
if (leftA != NULL)
{
int overlap = leftA->nEnd - a->nStart;
/* Deal with overlaps in needle */
if (overlap > 0)
{
/* See if left encompasses current segment. */
if (leftA->nStart <= a->nStart && leftA->nEnd >= a->nEnd)
{
/* Eliminate current segment. */
leftA->right = rightA;
if (rightA != NULL)
rightA->left = leftA;
if (doFree) freeMem(a);
a = leftA;
}
else
{
/* Remove overlapping area from current segment, leave
* it in left segment. */
a->hStart += overlap;
a->nStart += overlap;
}
}
else if (overlap == 0 && leftA->hEnd == a->hStart)
{
/* Remove current segment from list. */
leftA->right = rightA;
if (rightA != NULL)
rightA->left = leftA;
/* Fold data from current segment into left segment */
leftA->nEnd = a->nEnd;
leftA->hEnd = a->hEnd;
if (doFree) freeMem(a);
a = leftA;
}
}
}
return ali;
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/* gfClientLib - stuff that both blat and pcr clients of
* genoFind use. */
/* Copyright 2001-2005 Jim Kent. All rights reserved. */
#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "obscure.h"
#include "dnaseq.h"
#include "fa.h"
#include "nib.h"
#include "twoBit.h"
#include "repMask.h"
#include "gfClientLib.h"
void gfClientFileArray(char *fileName, char ***retFiles, int *retFileCount)
/* Check if file if .2bit or .nib or .fa. If so return just that
* file in a list of one. Otherwise read all file and treat file
* as a list of filenames. */
{
boolean gotSingle = FALSE;
char *buf; /* This will leak memory but won't matter. */
if (nibIsFile(fileName) || twoBitIsSpec(fileName)
|| sameString(fileName, "stdin")
|| endsWith(fileName, ".Z")
|| endsWith(fileName, ".gz")
|| endsWith(fileName, ".bz2"))
gotSingle = TRUE;
/* Detect .fa files (where suffix is not standardized)
* by first character being a '>'. */
else
{
FILE *f = mustOpen(fileName, "r");
char c = fgetc(f);
fclose(f);
if (c == '>')
gotSingle = TRUE;
}
if (gotSingle)
{
char **files;
*retFiles = AllocArray(files, 1);
files[0] = cloneString(fileName);
*retFileCount = 1;
return;
}
else
{
readAllWords(fileName, retFiles, retFileCount, &buf);
}
}
void gfClientUnmask(struct dnaSeq *seqList)
/* Unmask all sequences. */
{
struct dnaSeq *seq;
for (seq = seqList; seq != NULL; seq = seq->next)
faToDna(seq->dna, seq->size);
}
static void maskFromOut(struct dnaSeq *seqList, char *outFile, float minRepDivergence)
/* Mask DNA sequence by putting bits more than 85% identical to
* repeats as defined in RepeatMasker .out file to upper case. */
{
struct lineFile *lf = lineFileOpen(outFile, TRUE);
struct hash *hash = newHash(0);
struct dnaSeq *seq;
char *line;
for (seq = seqList; seq != NULL; seq = seq->next)
hashAdd(hash, seq->name, seq);
if (!lineFileNext(lf, &line, NULL))
errAbort("Empty mask file %s\n", lf->fileName);
if (!startsWith("There were no", line)) /* No repeats is ok. Not much work. */
{
if (!startsWith(" SW", line))
errAbort("%s isn't a RepeatMasker .out file.", lf->fileName);
if (!lineFileNext(lf, &line, NULL) || !startsWith("score", line))
errAbort("%s isn't a RepeatMasker .out file.", lf->fileName);
lineFileNext(lf, &line, NULL); /* Blank line. */
while (lineFileNext(lf, &line, NULL))
{
char *words[32];
struct repeatMaskOut rmo;
int wordCount;
int seqSize;
int repSize;
wordCount = chopLine(line, words);
if (wordCount < 14)
errAbort("%s line %d - error in repeat mask .out file\n", lf->fileName, lf->lineIx);
repeatMaskOutStaticLoad(words, &rmo);
/* If repeat is more than 15% divergent don't worry about it. */
if (rmo.percDiv + rmo.percDel + rmo.percInc <= minRepDivergence)
{
if((seq = hashFindVal(hash, rmo.qName)) == NULL)
errAbort("%s is in %s but not corresponding sequence file, files out of sync?\n",
rmo.qName, lf->fileName);
seqSize = seq->size;
if (rmo.qStart <= 0 || rmo.qStart > seqSize || rmo.qEnd <= 0
|| rmo.qEnd > seqSize || rmo.qStart > rmo.qEnd)
{
warn("Repeat mask sequence out of range (%d-%d of %d in %s)\n",
rmo.qStart, rmo.qEnd, seqSize, rmo.qName);
if (rmo.qStart <= 0)
rmo.qStart = 1;
if (rmo.qEnd > seqSize)
rmo.qEnd = seqSize;
}
repSize = rmo.qEnd - rmo.qStart + 1;
if (repSize > 0)
toUpperN(seq->dna + rmo.qStart - 1, repSize);
}
}
}
freeHash(&hash);
lineFileClose(&lf);
}
static void maskNucSeqList(struct dnaSeq *seqList, char *seqFileName, char *maskType,
boolean hardMask, float minRepDivergence)
/* Apply masking to simple nucleotide sequence by making masked nucleotides
* upper case (since normal DNA sequence is lower case for us). */
{
struct dnaSeq *seq;
char *outFile = NULL, outNameBuf[512];
if (sameWord(maskType, "upper"))
{
/* Already has dna to be masked in upper case. */
}
else if (sameWord(maskType, "lower"))
{
for (seq = seqList; seq != NULL; seq = seq->next)
toggleCase(seq->dna, seq->size);
}
else
{
/* Masking from a RepeatMasker .out file. */
if (sameWord(maskType, "out"))
{
sprintf(outNameBuf, "%s.out", seqFileName);
outFile = outNameBuf;
}
else
{
outFile = maskType;
}
gfClientUnmask(seqList);
maskFromOut(seqList, outFile, minRepDivergence);
}
if (hardMask)
{
for (seq = seqList; seq != NULL; seq = seq->next)
upperToN(seq->dna, seq->size);
}
}
bioSeq *gfClientSeqList(int fileCount, char *files[],
boolean isProt, boolean isTransDna, char *maskType,
float minRepDivergence, boolean showStatus)
/* From an array of .fa and .nib file names, create a
* list of dnaSeqs, which are set up so that upper case is masked and lower case
* is unmasked sequence. (This is the opposite of the input, but set so that
* we can use lower case as our primary DNA sequence, which historically predates
* our use of lower case masking.) Protein sequence on the other hand is
* all upper case. */
{
int i;
char *fileName;
bioSeq *seqList = NULL, *seq;
boolean doMask = (maskType != NULL);
for (i=0; i<fileCount; ++i)
{
struct dnaSeq *list = NULL, sseq;
ZeroVar(&sseq);
fileName = files[i];
if (nibIsFile(fileName))
list = nibLoadAllMasked(NIB_MASK_MIXED|NIB_BASE_NAME, fileName);
else if (twoBitIsSpec(fileName))
list = twoBitLoadAll(fileName);
else if (isProt)
list = faReadAllPep(fileName);
else
list = faReadAllMixed(fileName);
/* If necessary mask sequence from file. */
if (doMask)
{
maskNucSeqList(list, fileName, maskType, isTransDna, minRepDivergence);
}
else
{
/* If not masking send everything to proper case here. */
for (seq = list; seq != NULL; seq = seq->next)
{
if (isProt)
faToProtein(seq->dna, seq->size);
else
faToDna(seq->dna, seq->size);
}
}
/* Move local list to end of bigger list. */
seqList = slCat(seqList, list);
}
if (showStatus)
{
/* Total up size and sequence count and report. */
int count = 0;
unsigned long totalSize = 0;
for (seq = seqList; seq != NULL; seq = seq->next)
{
totalSize += seq->size;
count += 1;
}
printf("Loaded %lu letters in %d sequences\n", totalSize, count);
}
return seqList;
}
/* Some stuff shared by gfClient and gfPcr.
* Copyright 2001-2004 Jim Kent. All rights reserved. */
#include "common.h"
#include "linefile.h"
#include "dnaseq.h"
#include "genoFind.h"
#include "gfInternal.h"
#include "errAbort.h"
#include "nib.h"
#include "twoBit.h"
static int extendRespect(int oldX, int newX)
/* Return newX modified slightly so as to be in same frame as oldX. */
{
int frame = oldX % 3;
newX = newX - (newX % 3) + frame;
return newX;
}
void gfiExpandRange(struct gfRange *range, int qSize, int tSize,
boolean respectFrame, boolean isRc, int expansion)
/* Expand range to cover an additional 500 bases on either side. */
{
int x;
x = range->qStart - expansion;
if (x < 0) x = 0;
range->qStart = x;
x = range->qEnd + expansion;
if (x > qSize) x = qSize;
range->qEnd = x;
x = range->tStart - expansion;
if (x < 0) x = 0;
if (respectFrame && !isRc)
{
x = extendRespect(range->tStart, x);
}
range->tStart = x;
x = range->tEnd + expansion;
if (x > tSize) x = tSize;
if (respectFrame && isRc)
{
x = extendRespect(range->tEnd, x);
if (x > tSize)
x -= 3;
}
range->tEnd = x;
}
struct dnaSeq *gfiExpandAndLoadCached(struct gfRange *range,
struct hash *tFileCache, char *tSeqDir, int querySize,
int *retTotalSeqSize, boolean respectFrame, boolean isRc, int expansion)
/* Expand range to cover an additional expansion bases on either side.
* Load up target sequence and return. (Done together because don't
* know target size before loading.) */
{
struct dnaSeq *target = NULL;
char fileName[PATH_LEN+256];
safef(fileName, sizeof(fileName), "%s/%s", tSeqDir, range->tName);
if (nibIsFile(fileName))
{
struct nibInfo *nib = hashFindVal(tFileCache, fileName);
if (nib == NULL)
{
nib = nibInfoNew(fileName);
hashAdd(tFileCache, fileName, nib);
}
if (isRc)
reverseIntRange(&range->tStart, &range->tEnd, nib->size);
gfiExpandRange(range, querySize, nib->size, respectFrame, isRc, expansion);
target = nibLdPart(fileName, nib->f, nib->size,
range->tStart, range->tEnd - range->tStart);
if (isRc)
{
reverseComplement(target->dna, target->size);
reverseIntRange(&range->tStart, &range->tEnd, nib->size);
}
*retTotalSeqSize = nib->size;
}
else
{
struct twoBitFile *tbf = NULL;
// split a filename+chrom like "/gbdb/hg19/hg19.2bit:chr1"
// (can also be URL like "http://hgdownload.soe.ucsc.edu/gbdb/hg19/hg19.2bit:chr1")
// into filename and tSeqName
char *tSeqName = strrchr(fileName, ':');
int tSeqSize = 0;
if (tSeqName == NULL)
errAbort("No colon in .2bit response from gfServer");
*tSeqName++ = 0;
tbf = hashFindVal(tFileCache, fileName);
if (tbf == NULL)
{
tbf = twoBitOpen(fileName);
hashAdd(tFileCache, fileName, tbf);
}
tSeqSize = twoBitSeqSize(tbf, tSeqName);
if (isRc)
reverseIntRange(&range->tStart, &range->tEnd, tSeqSize);
gfiExpandRange(range, querySize, tSeqSize, respectFrame, isRc, expansion);
target = twoBitReadSeqFragLower(tbf, tSeqName, range->tStart, range->tEnd);
if (isRc)
{
reverseComplement(target->dna, target->size);
reverseIntRange(&range->tStart, &range->tEnd, tSeqSize);
}
*retTotalSeqSize = tSeqSize;
}
return target;
}
void gfiGetSeqName(char *spec, char *name, char *file)
/* Extract sequence name and optionally file name from spec,
* which includes nib and 2bit files. (The file may be NULL
* if you don't care.) */
{
if (nibIsFile(spec))
{
splitPath(spec, NULL, name, NULL);
if (file != NULL)
strcpy(file, spec);
}
else
{
char *s = strchr(spec, ':');
if (s == NULL)
errAbort("Expecting colon in %s", spec);
strcpy(name, s+1);
if (file != NULL)
{
int fileNameSize = s - spec;
memcpy(file, spec, fileNameSize);
file[fileNameSize] = 0;
}
}
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
kentSrc = ..
include ../inc/common.mk
O = bandExt.o crudeali.o ffAliHelp.o ffSeedExtend.o fuzzyFind.o \
genoFind.o gfBlatLib.o gfClientLib.o gfInternal.o gfOut.o gfPcrLib.o gfWebLib.o ooc.o \
patSpace.o splix.o supStitch.o trans3.o xenbig.o xensmall.o
T = ../lib/$(MACHTYPE)/jkOwnLib.a
$(T): $(O) ../lib/$(MACHTYPE)
ar rcus $(T) $(O)
../lib/$(MACHTYPE):
mkdir ../lib/$(MACHTYPE)
clean:
rm -f ${O} ../lib/$(MACHTYPE)/jkOwnLib.a
tags:
etags ../inc/*.h ../lib/*.h ../lib/*.c ../hg/inc/*.h ../hg/lib/*.h ../hg/lib/*.c ../hg/hgTracks/hgTracks.c ../hg/hgc/hgc.c ../hg/hgTrackUi/hgTrackUi.c
This diff is collapsed.
This diff is collapsed.