Commit 3b7bc214 authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 3.8.31

parent 277aabcf
muscle:
chmod +x ./mk
./mk
MUSCLE v3.0 source code README
------------------------------
http://www.drive5.com/muscle
This version of MUSCLE was built and tested on two platforms:
Windows XP and Red Hat Linux 8.0.
On Windows, I used Microsoft Visual C++ .Net, which I find
to be the best C++ compile / edit / test environment I've
tried on any platform. The Microsoft project file is
muscle.vcproj.
The Linux make file is Makefile. This is a very simple-minded
make file (because I am a Linux development novice), so should
be easy to understand. By default, it uses shared libraries,
but I found this to give problems when copying between
different Linux versions. The fix was to use the linker
flag -lm static (commented out), which gives a much bigger
but more portable binary. The posted binary was linked with
static libraries.
The source code was not written to be maintained by anyone
but me, so the usual apologies and caveats apply.
Bob Edgar,
January 2004
This diff is collapsed.
#include "muscle.h"
#include "msa.h"
#include "pwpath.h"
#include "profile.h"
#define TRACE 0
static void AppendDelete(const MSA &msaA, unsigned &uColIndexA,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendDelete ColIxA=%u ColIxCmb=%u\n",
uColIndexA, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-');
++uColIndexCombined;
++uColIndexA;
}
static void AppendInsert(const MSA &msaB, unsigned &uColIndexB,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendInsert ColIxB=%u ColIxCmb=%u\n",
uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-');
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexCombined;
++uColIndexB;
}
static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA,
const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA,
unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
const unsigned uLengthA = msaA.GetColCount();
const unsigned uLengthB = msaB.GetColCount();
unsigned uNewColCount = uColCountA;
if (uColCountB > uNewColCount)
uNewColCount = uColCountB;
for (unsigned n = 0; n < uColCountA; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountA; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.');
}
for (unsigned n = 0; n < uColCountB; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountB; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.');
}
uColIndexCombined += uNewColCount;
uColIndexA += uColCountA;
uColIndexB += uColCountB;
}
static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB,
unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB,
MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexA;
++uColIndexB;
++uColIndexCombined;
}
void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB,
MSA &msaCombined)
{
msaCombined.Clear();
#if TRACE
Log("AlignTwoMSAsGivenPathSW\n");
Log("Template A:\n");
msaA.LogMe();
Log("Template B:\n");
msaB.LogMe();
#endif
const unsigned uColCountA = msaA.GetColCount();
const unsigned uColCountB = msaB.GetColCount();
const unsigned uSeqCountA = msaA.GetSeqCount();
const unsigned uSeqCountB = msaB.GetSeqCount();
msaCombined.SetSeqCount(uSeqCountA + uSeqCountB);
// Copy sequence names into combined MSA
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA));
msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA));
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB));
msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB));
}
unsigned uColIndexA = 0;
unsigned uColIndexB = 0;
unsigned uColIndexCombined = 0;
const unsigned uEdgeCount = Path.GetEdgeCount();
for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
#if TRACE
Log("\nEdge %u %c%u.%u\n",
uEdgeIndex,
Edge.cType,
Edge.uPrefixLengthA,
Edge.uPrefixLengthB);
#endif
const char cType = Edge.cType;
const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
unsigned uColCountA = 0;
if (uPrefixLengthA > 0)
{
const unsigned uNodeIndexA = uPrefixLengthA - 1;
const unsigned uTplColIndexA = uNodeIndexA;
if (uTplColIndexA > uColIndexA)
uColCountA = uTplColIndexA - uColIndexA;
}
const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
unsigned uColCountB = 0;
if (uPrefixLengthB > 0)
{
const unsigned uNodeIndexB = uPrefixLengthB - 1;
const unsigned uTplColIndexB = uNodeIndexB;
if (uTplColIndexB > uColIndexB)
uColCountB = uTplColIndexB - uColIndexB;
}
AppendUnalignedTerminals(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB,
uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
switch (cType)
{
case 'M':
{
assert(uPrefixLengthA > 0);
assert(uPrefixLengthB > 0);
const unsigned uColA = uPrefixLengthA - 1;
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexA == uColA);
assert(uColIndexB == uColB);
AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB,
msaCombined, uColIndexCombined);
break;
}
case 'D':
{
assert(uPrefixLengthA > 0);
const unsigned uColA = uPrefixLengthA - 1;
assert(uColIndexA == uColA);
AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
case 'I':
{
assert(uPrefixLengthB > 0);
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexB == uColB);
AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
default:
assert(false);
}
}
unsigned uInsertColCountA = uColCountA - uColIndexA;
unsigned uInsertColCountB = uColCountB - uColIndexB;
AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB,
uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
}
#include "muscle.h"
#include "msa.h"
#include "profile.h"
#include "pwpath.h"
#include "textfile.h"
#include "timing.h"
SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path,
bool bLockLeft, bool bLockRight)
{
const unsigned uLengthA = msa1.GetColCount();
const unsigned uLengthB = msa2.GetColCount();
ProfPos *PA = ProfileFromMSA(msa1);
ProfPos *PB = ProfileFromMSA(msa2);
if (bLockLeft)
{
PA[0].m_scoreGapOpen = MINUS_INFINITY;
PB[0].m_scoreGapOpen = MINUS_INFINITY;
}
if (bLockRight)
{
PA[uLengthA-1].m_scoreGapClose = MINUS_INFINITY;
PB[uLengthB-1].m_scoreGapClose = MINUS_INFINITY;
}
float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0
if (r < 1)
r = 1/r;
SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut);
delete[] PA;
delete[] PB;
return Score;
}
#include "muscle.h"
#include "msa.h"
#include "profile.h"
#include "pwpath.h"
SCORE GlobalAlign4(ProfPos *PA, unsigned uLengthA, ProfPos *PB,
unsigned uLengthB, PWPath &Path);
SCORE AlignTwoProfs(
const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut)
{
assert(uLengthA < 100000);
assert(uLengthB < 100000);
float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0
if (r < 1)
r = 1/r;
SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
AlignTwoProfsGivenPath(Path, PA, uLengthB, wA/(wA + wB), PB, uLengthB, wB/(wA + wB),
ptrPout, ptruLengthOut);
#if HYDRO
if (ALPHA_Amino == g_Alpha)
Hydro(*ptrPout, *ptruLengthOut);
#endif
return Score;
}
#include "muscle.h"
#include <stdio.h>
#include <ctype.h>
#include "msa.h"
#include "textfile.h"
const unsigned uCharsPerLine = 60;
const int MIN_NAME = 10;
const int MAX_NAME = 32;
static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex);
void MSA::ToAlnFile(TextFile &File) const
{
if (g_bClwStrict)
File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n");
else
{
File.PutString("MUSCLE ("
SHORT_VERSION ")"
" multiple sequence alignment\n");
File.PutString("\n");
}
int iLongestNameLength = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
{
const char *ptrName = GetSeqName(uSeqIndex);
const char *ptrBlank = strchr(ptrName, ' ');
int iLength;
if (0 != ptrBlank)
iLength = (int) (ptrBlank - ptrName);
else
iLength = (int) strlen(ptrName);
if (iLength > iLongestNameLength)
iLongestNameLength = iLength;
}
if (iLongestNameLength > MAX_NAME)
iLongestNameLength = MAX_NAME;
if (iLongestNameLength < MIN_NAME)
iLongestNameLength = MIN_NAME;
unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1;
for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
{
File.PutString("\n");
unsigned uStartColIndex = uLineIndex*uCharsPerLine;
unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
if (uEndColIndex >= GetColCount())
uEndColIndex = GetColCount() - 1;
char Name[MAX_NAME+1];
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
{
const char *ptrName = GetSeqName(uSeqIndex);
const char *ptrBlank = strchr(ptrName, ' ');
int iLength;
if (0 != ptrBlank)
iLength = (int) (ptrBlank - ptrName);
else
iLength = (int) strlen(ptrName);
if (iLength > MAX_NAME)
iLength = MAX_NAME;
memset(Name, ' ', MAX_NAME);
memcpy(Name, ptrName, iLength);
Name[iLongestNameLength] = 0;
File.PutFormat("%s ", Name);
for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
++uColIndex)
{
const char c = GetChar(uSeqIndex, uColIndex);
File.PutFormat("%c", toupper(c));
}
File.PutString("\n");
}
memset(Name, ' ', MAX_NAME);
Name[iLongestNameLength] = 0;
File.PutFormat("%s ", Name);
for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
++uColIndex)
{
const char c = GetAlnConsensusChar(*this, uColIndex);
File.PutChar(c);
}
File.PutString("\n");
}
}
static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex)
{
const unsigned uSeqCount = a.GetSeqCount();
unsigned BitMap = 0;
unsigned Count = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex);
assert(uLetter < 32);
unsigned Bit = (1 << uLetter);
if (!(BitMap & Bit))
++Count;
BitMap |= Bit;
}
// '*' indicates positions which have a single, fully conserved residue
if (1 == Count)
return '*';
if (ALPHA_Amino != g_Alpha)
return ' ';
#define B(a) (1 << AX_##a)
#define S2(a, b) S(B(a) | B(b))
#define S3(a, b, c) S(B(a) | B(b) | B(c))
#define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d))
#define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':';
#define W3(a, b, c) W(B(a) | B(b) | B(c))
#define W4(a, b, c, d) W(B(a) | B(b) | B(c) | B(d))
#define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e))
#define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f))
#define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.';
// ':' indicates that one of the following 'strong'
// groups is fully conserved
// STA
// NEQK
// NHQK
// NDEQ
// QHRK
// MILV
// MILF
// HY
// FYW
//
S3(S, T, A)
S4(N, E, Q, K)
S4(N, H, Q, K)
S4(N, D, E, Q)
S4(M, I, L, V)
S4(M, I, L, F)
S2(H, Y)
S3(F, Y, W)
// '.' indicates that one of the following 'weaker'
// groups is fully conserved
// CSA
// ATV
// SAG
// STNK
// STPA
// SGND
// SNDEQK
// NDEQHK
// NEQHRK
// FVLIM
// HFY
W3(C, S, A)
W3(A, T, V)
W3(S, A, G)
W4(S, T, N, K)
W4(S, T, P, A)
W4(S, G, N, D)
W6(S, N, D, E, Q, K)
W6(N, W, Q, H, R, K)
W5(F, V, L, I, M)
W3(H, F, Y)
return ' ';
}
#include "muscle.h"
#include <ctype.h>
/***
From Bioperl docs:
Extended DNA / RNA alphabet
------------------------------------------
Symbol Meaning Nucleic Acid
------------------------------------------
A A Adenine
C C Cytosine
G G Guanine
T T Thymine
U U Uracil
M A or C
R A or G
W A or T
S C or G
Y C or T
K G or T
V A or C or G
H A or C or T
D A or G or T
B C or G or T
X G or A or T or C
N G or A or T or C
IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
***/
unsigned g_CharToLetter[MAX_CHAR];
unsigned g_CharToLetterEx[MAX_CHAR];
char g_LetterToChar[MAX_ALPHA];
char g_LetterExToChar[MAX_ALPHA_EX];
char g_UnalignChar[MAX_CHAR];
char g_AlignChar[MAX_CHAR];
bool g_IsWildcardChar[MAX_CHAR];
bool g_IsResidueChar[MAX_CHAR];
ALPHA g_Alpha = ALPHA_Undefined;
unsigned g_AlphaSize = 0;
#define Res(c, Letter) \
{ \
const unsigned char Upper = (unsigned char) toupper(c); \
const unsigned char Lower = (unsigned char) tolower(c); \
g_CharToLetter[Upper] = Letter; \
g_CharToLetter[Lower] = Letter; \
g_CharToLetterEx[Upper] = Letter; \
g_CharToLetterEx[Lower] = Letter; \
g_LetterToChar[Letter] = Upper; \
g_LetterExToChar[Letter] = Upper; \
g_IsResidueChar[Upper] = true; \
g_IsResidueChar[Lower] = true; \
g_AlignChar[Upper] = Upper; \
g_AlignChar[Lower] = Upper; \
g_UnalignChar[Upper] = Lower; \
g_UnalignChar[Lower] = Lower; \
}
#define Wild(c, Letter) \
{ \
const unsigned char Upper = (unsigned char) toupper(c); \
const unsigned char Lower = (unsigned char) tolower(c); \
g_CharToLetterEx[Upper] = Letter; \
g_CharToLetterEx[Lower] = Letter; \
g_LetterExToChar[Letter] = Upper; \
g_IsResidueChar[Upper] = true; \
g_IsResidueChar[Lower] = true; \
g_AlignChar[Upper] = Upper; \
g_AlignChar[Lower] = Upper; \
g_UnalignChar[Upper] = Lower; \
g_UnalignChar[Lower] = Lower; \
g_IsWildcardChar[Lower] = true; \
g_IsWildcardChar[Upper] = true; \
}
static unsigned GetAlphaSize(ALPHA Alpha)
{
switch (Alpha)
{
case ALPHA_Amino:
return 20;
case ALPHA_RNA:
case ALPHA_DNA:
return 4;
}
Quit("Invalid Alpha=%d", Alpha);
return 0;
}
static void InitArrays()
{
memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter));
memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx));
memset(g_LetterToChar, '?', sizeof(g_LetterToChar));
memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar));
memset(g_AlignChar, '?', sizeof(g_UnalignChar));
memset(g_UnalignChar, '?', sizeof(g_UnalignChar));
memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar));
}
static void SetGapChar(char c)
{
unsigned char u = (unsigned char) c;
g_CharToLetterEx[u] = AX_GAP;
g_LetterExToChar[AX_GAP] = u;
g_AlignChar[u] = u;
g_UnalignChar[u] = u;
}
static void SetAlphaDNA()
{
Res('A', NX_A)
Res('C', NX_C)
Res('G', NX_G)
Res('T', NX_T)
Wild('M', NX_M)
Wild('R', NX_R)
Wild('W', NX_W)
Wild('S', NX_S)
Wild('Y', NX_Y)
Wild('K', NX_K)
Wild('V', NX_V)
Wild('H', NX_H)
Wild('D', NX_D)
Wild('B', NX_B)