Commit 32fcd128 authored by Charles Plessy's avatar Charles Plessy

[svn-inject] Installing original source of muscle

parents
# Porting notes:
# For Solaris and other platforms where the logf function
# is missing from the math library, add the following line
# to the end of muscle.h:
# #define logf(x) ((float) log(x))
# Using -static increases the executable size and thus gives a very
# small increase in start time, but is more portable (the binding
# to dynamic libraries often breaks when a new library is released).
# On OSX, using -static gives the error "ld: can't locate file for: -lcrt0.o",
# this is fixed by deleting "-static" from the LDLIBS line.
CFLAGS = -O3 -funroll-loops -Winline -DNDEBUG=1
LDLIBS = -lm -static
# LDLIBS = -lm
OBJ = .o
EXE =
RM = rm -f
CP = cp
GPP = g++
LD = $(GPP) $(CFLAGS)
CPP = $(GPP) -c $(CFLAGS)
all: muscle
CPPSRC = $(sort $(wildcard *.cpp))
CPPOBJ = $(subst .cpp,.o,$(CPPSRC))
$(CPPOBJ): %.o: %.cpp
$(CPP) $< -o $@
muscle: $(CPPOBJ)
$(LD) -o muscle $(CPPOBJ) $(LDLIBS)
strip muscle
This diff is collapsed.
#include "muscle.h"
#include "msa.h"
#include "pwpath.h"
#include "profile.h"
#define TRACE 0
static void AppendDelete(const MSA &msaA, unsigned &uColIndexA,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendDelete ColIxA=%u ColIxCmb=%u\n",
uColIndexA, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-');
++uColIndexCombined;
++uColIndexA;
}
static void AppendInsert(const MSA &msaB, unsigned &uColIndexB,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendInsert ColIxB=%u ColIxCmb=%u\n",
uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-');
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexCombined;
++uColIndexB;
}
static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA,
const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA,
unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
const unsigned uLengthA = msaA.GetColCount();
const unsigned uLengthB = msaB.GetColCount();
unsigned uNewColCount = uColCountA;
if (uColCountB > uNewColCount)
uNewColCount = uColCountB;
for (unsigned n = 0; n < uColCountA; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountA; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.');
}
for (unsigned n = 0; n < uColCountB; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountB; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.');
}
uColIndexCombined += uNewColCount;
uColIndexA += uColCountA;
uColIndexB += uColCountB;
}
static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB,
unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB,
MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexA;
++uColIndexB;
++uColIndexCombined;
}
void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB,
MSA &msaCombined)
{
msaCombined.Clear();
#if TRACE
Log("AlignTwoMSAsGivenPathSW\n");
Log("Template A:\n");
msaA.LogMe();
Log("Template B:\n");
msaB.LogMe();
#endif
const unsigned uColCountA = msaA.GetColCount();
const unsigned uColCountB = msaB.GetColCount();
const unsigned uSeqCountA = msaA.GetSeqCount();
const unsigned uSeqCountB = msaB.GetSeqCount();
msaCombined.SetSeqCount(uSeqCountA + uSeqCountB);
// Copy sequence names into combined MSA
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA));
msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA));
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB));
msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB));
}
unsigned uColIndexA = 0;
unsigned uColIndexB = 0;
unsigned uColIndexCombined = 0;
const unsigned uEdgeCount = Path.GetEdgeCount();
for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
#if TRACE
Log("\nEdge %u %c%u.%u\n",
uEdgeIndex,
Edge.cType,
Edge.uPrefixLengthA,
Edge.uPrefixLengthB);
#endif
const char cType = Edge.cType;
const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
unsigned uColCountA = 0;
if (uPrefixLengthA > 0)
{
const unsigned uNodeIndexA = uPrefixLengthA - 1;
const unsigned uTplColIndexA = uNodeIndexA;
if (uTplColIndexA > uColIndexA)
uColCountA = uTplColIndexA - uColIndexA;
}
const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
unsigned uColCountB = 0;
if (uPrefixLengthB > 0)
{
const unsigned uNodeIndexB = uPrefixLengthB - 1;
const unsigned uTplColIndexB = uNodeIndexB;
if (uTplColIndexB > uColIndexB)
uColCountB = uTplColIndexB - uColIndexB;
}
AppendUnalignedTerminals(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB,
uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
switch (cType)
{
case 'M':
{
assert(uPrefixLengthA > 0);
assert(uPrefixLengthB > 0);
const unsigned uColA = uPrefixLengthA - 1;
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexA == uColA);
assert(uColIndexB == uColB);
AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB,
msaCombined, uColIndexCombined);
break;
}
case 'D':
{
assert(uPrefixLengthA > 0);
const unsigned uColA = uPrefixLengthA - 1;
assert(uColIndexA == uColA);
AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
case 'I':
{
assert(uPrefixLengthB > 0);
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexB == uColB);
AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
default:
assert(false);
}
}
unsigned uInsertColCountA = uColCountA - uColIndexA;
unsigned uInsertColCountB = uColCountB - uColIndexB;
AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB,
uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
}
#include "muscle.h"
#include "msa.h"
#include "profile.h"
#include "pwpath.h"
#include "textfile.h"
#include "timing.h"
SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path,
bool bLockLeft, bool bLockRight)
{
const unsigned uLengthA = msa1.GetColCount();
const unsigned uLengthB = msa2.GetColCount();
ProfPos *PA = ProfileFromMSA(msa1);
ProfPos *PB = ProfileFromMSA(msa2);
if (bLockLeft)
{
PA[0].m_scoreGapOpen = MINUS_INFINITY;
PB[0].m_scoreGapOpen = MINUS_INFINITY;
}
if (bLockRight)
{
PA[uLengthA-1].m_scoreGapClose = MINUS_INFINITY;
PB[uLengthB-1].m_scoreGapClose = MINUS_INFINITY;
}
float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0
if (r < 1)
r = 1/r;
SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut);
delete[] PA;
delete[] PB;
return Score;
}
#include "muscle.h"
#include "msa.h"
#include "profile.h"
#include "pwpath.h"
SCORE GlobalAlign4(ProfPos *PA, unsigned uLengthA, ProfPos *PB,
unsigned uLengthB, PWPath &Path);
SCORE AlignTwoProfs(
const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut)
{
assert(uLengthA < 100000);
assert(uLengthB < 100000);
float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0
if (r < 1)
r = 1/r;
SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
AlignTwoProfsGivenPath(Path, PA, uLengthB, wA/(wA + wB), PB, uLengthB, wB/(wA + wB),
ptrPout, ptruLengthOut);
#if HYDRO
if (ALPHA_Amino == g_Alpha)
Hydro(*ptrPout, *ptruLengthOut);
#endif
return Score;
}
#include "muscle.h"
#include <stdio.h>
#include <ctype.h>
#include "msa.h"
#include "textfile.h"
const unsigned uCharsPerLine = 60;
const int MIN_NAME = 10;
const int MAX_NAME = 32;
static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex);
void MSA::ToAlnFile(TextFile &File) const
{
if (g_bClwStrict)
File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n");
else
{
File.PutString("MUSCLE ("
MUSCLE_MAJOR_VERSION "." MUSCLE_MINOR_VERSION ")"
" multiple sequence alignment\n");
File.PutString("\n");
}
int iLongestNameLength = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
{
const char *ptrName = GetSeqName(uSeqIndex);
const char *ptrBlank = strchr(ptrName, ' ');
int iLength;
if (0 != ptrBlank)
iLength = (int) (ptrBlank - ptrName);
else
iLength = (int) strlen(ptrName);
if (iLength > iLongestNameLength)
iLongestNameLength = iLength;
}
if (iLongestNameLength > MAX_NAME)
iLongestNameLength = MAX_NAME;
if (iLongestNameLength < MIN_NAME)
iLongestNameLength = MIN_NAME;
unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1;
for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
{
File.PutString("\n");
unsigned uStartColIndex = uLineIndex*uCharsPerLine;
unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
if (uEndColIndex >= GetColCount())
uEndColIndex = GetColCount() - 1;
char Name[MAX_NAME+1];
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
{
const char *ptrName = GetSeqName(uSeqIndex);
const char *ptrBlank = strchr(ptrName, ' ');
int iLength;
if (0 != ptrBlank)
iLength = (int) (ptrBlank - ptrName);
else
iLength = (int) strlen(ptrName);
if (iLength > MAX_NAME)
iLength = MAX_NAME;
memset(Name, ' ', MAX_NAME);
memcpy(Name, ptrName, iLength);
Name[iLongestNameLength] = 0;
File.PutFormat("%s ", Name);
for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
++uColIndex)
{
const char c = GetChar(uSeqIndex, uColIndex);
File.PutFormat("%c", toupper(c));
}
File.PutString("\n");
}
memset(Name, ' ', MAX_NAME);
Name[iLongestNameLength] = 0;
File.PutFormat("%s ", Name);
for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
++uColIndex)
{
const char c = GetAlnConsensusChar(*this, uColIndex);
File.PutChar(c);
}
File.PutString("\n");
}
}
static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex)
{
const unsigned uSeqCount = a.GetSeqCount();
unsigned BitMap = 0;
unsigned Count = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex);
assert(uLetter < 32);
unsigned Bit = (1 << uLetter);
if (!(BitMap & Bit))
++Count;
BitMap |= Bit;
}
// '*' indicates positions which have a single, fully conserved residue
if (1 == Count)
return '*';
if (ALPHA_Amino != g_Alpha)
return ' ';
#define B(a) (1 << AX_##a)
#define S2(a, b) S(B(a) | B(b))
#define S3(a, b, c) S(B(a) | B(b) | B(c))
#define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d))
#define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':';
#define W3(a, b, c) W(B(a) | B(b) | B(c))
#define W4(a, b, c, d) W(B(a) | B(b) | B(c) | B(d))
#define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e))
#define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f))
#define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.';
// ':' indicates that one of the following 'strong'
// groups is fully conserved
// STA
// NEQK
// NHQK
// NDEQ
// QHRK
// MILV
// MILF
// HY
// FYW
//
S3(S, T, A)
S4(N, E, Q, K)
S4(N, H, Q, K)
S4(N, D, E, Q)
S4(M, I, L, V)
S4(M, I, L, F)
S2(H, Y)
S3(F, Y, W)
// '.' indicates that one of the following 'weaker'
// groups is fully conserved
// CSA
// ATV
// SAG
// STNK
// STPA
// SGND
// SNDEQK
// NDEQHK
// NEQHRK
// FVLIM
// HFY
W3(C, S, A)
W3(A, T, V)
W3(S, A, G)
W4(S, T, N, K)
W4(S, T, P, A)
W4(S, G, N, D)
W6(S, N, D, E, Q, K)
W6(N, W, Q, H, R, K)
W5(F, V, L, I, M)
W3(H, F, Y)
return ' ';
}
#include "muscle.h"
#include <ctype.h>
/***
From Bioperl docs:
Extended DNA / RNA alphabet
------------------------------------------
Symbol Meaning Nucleic Acid
------------------------------------------
A A Adenine
C C Cytosine
G G Guanine
T T Thymine
U U Uracil
M A or C
R A or G
W A or T
S C or G
Y C or T
K G or T
V A or C or G
H A or C or T
D A or G or T
B C or G or T
X G or A or T or C
N G or A or T or C
IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
***/
unsigned g_CharToLetter[MAX_CHAR];
unsigned g_CharToLetterEx[MAX_CHAR];
char g_LetterToChar[MAX_ALPHA];
char g_LetterExToChar[MAX_ALPHA_EX];
char g_UnalignChar[MAX_CHAR];
char g_AlignChar[MAX_CHAR];
bool g_IsWildcardChar[MAX_CHAR];
bool g_IsResidueChar[MAX_CHAR];
ALPHA g_Alpha = ALPHA_Undefined;
unsigned g_AlphaSize = 0;
#define Res(c, Letter) \
{ \
const unsigned char Upper = (unsigned char) toupper(c); \
const unsigned char Lower = (unsigned char) tolower(c); \
g_CharToLetter[Upper] = Letter; \
g_CharToLetter[Lower] = Letter; \
g_CharToLetterEx[Upper] = Letter; \
g_CharToLetterEx[Lower] = Letter; \
g_LetterToChar[Letter] = Upper; \
g_LetterExToChar[Letter] = Upper; \
g_IsResidueChar[Upper] = true; \
g_IsResidueChar[Lower] = true; \
g_AlignChar[Upper] = Upper; \
g_AlignChar[Lower] = Upper; \
g_UnalignChar[Upper] = Lower; \
g_UnalignChar[Lower] = Lower; \
}
#define Wild(c, Letter) \
{ \
const unsigned char Upper = (unsigned char) toupper(c); \
const unsigned char Lower = (unsigned char) tolower(c); \
g_CharToLetterEx[Upper] = Letter; \
g_CharToLetterEx[Lower] = Letter; \
g_LetterExToChar[Letter] = Upper; \
g_IsResidueChar[Upper] = true; \
g_IsResidueChar[Lower] = true; \
g_AlignChar[Upper] = Upper; \
g_AlignChar[Lower] = Upper; \
g_UnalignChar[Upper] = Lower; \
g_UnalignChar[Lower] = Lower; \
g_IsWildcardChar[Lower] = true; \
g_IsWildcardChar[Upper] = true; \
}
static unsigned GetAlphaSize(ALPHA Alpha)
{
switch (Alpha)
{
case ALPHA_Amino:
return 20;
case ALPHA_RNA:
case ALPHA_DNA:
return 4;
}
Quit("Invalid Alpha=%d", Alpha);
return 0;
}
static void InitArrays()
{
memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter));
memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx));
memset(g_LetterToChar, '?', sizeof(g_LetterToChar));
memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar));
memset(g_AlignChar, '?', sizeof(g_UnalignChar));
memset(g_UnalignChar, '?', sizeof(g_UnalignChar));
memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar));
}
static void SetGapChar(char c)
{
unsigned char u = (unsigned char) c;
g_CharToLetterEx[u] = AX_GAP;
g_LetterExToChar[AX_GAP] = u;
g_AlignChar[u] = u;
g_UnalignChar[u] = u;
}
static void SetAlphaDNA()
{
Res('A', NX_A)
Res('C', NX_C)
Res('G', NX_G)
Res('T', NX_T)
Wild('M', NX_M)
Wild('R', NX_R)
Wild('W', NX_W)
Wild('S', NX_S)
Wild('Y', NX_Y)
Wild('K', NX_K)
Wild('V', NX_V)
Wild('H', NX_H)
Wild('D', NX_D)
Wild('B', NX_B)
Wild('X', NX_X)
Wild('N', NX_N)
}
static void SetAlphaRNA()
{
Res('A', NX_A)
Res('C', NX_C)
Res('G', NX_G)
Res('U', NX_U)
Res('T', NX_T)