Commit a88fbdf7 authored by Fabian Klötzl's avatar Fabian Klötzl

Imported Upstream version 0.9.6

parent abc0e9a9
BasedOnStyle: LLVM
IndentWidth: 4
TabWidth: 4
UseTab: Always
AllowShortIfStatementsOnASingleLine: true
AllowShortFunctionsOnASingleLine: false
IndentCaseLabels: true
AllowShortCaseLabelsOnASingleLine: true
BreakBeforeBraces: Attach
language: cpp
compiler:
- gcc
sudo: false
addons:
apt:
sources:
- deadsnakes
- ubuntu-toolchain-r-test
packages:
- cmake
- g++-4.8
- libglib2.0-dev
- libgsl0-dev
install:
- export LIBDIVDIR="$HOME/libdivsufsort"
- pip install --user cpp-coveralls
- wget https://github.com/y-256/libdivsufsort/archive/master.tar.gz
- tar -xzvf master.tar.gz
- cd libdivsufsort-master && mkdir build && cd build
- cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
- make && make install
before_install:
- sudo pip install cpp-coveralls
- sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
- sudo apt-get update -qq
- if [ "$CXX" = "g++" ]; then sudo apt-get install -qq g++-4.8; fi
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
- sudo apt-get install libglib2.0-dev libgsl0-dev
- wget https://github.com/y-256/libdivsufsort/archive/master.tar.gz
- tar -xzvf master.tar.gz
- cd libdivsufsort-master
- mkdir build
- cd build
- cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="/usr/local" ..
- make
- sudo make install
- cd $TRAVIS_BUILD_DIR
- export LD_LIBRARY_PATH=/usr/local:$LD_LIBRARY_PATH
# - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
script:
- export LD_LIBRARY_PATH=/usr/local/lib:/usr/local:$LD_LIBRARY_PATH
- export OMP_NUM_THREADS=4
- export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
- export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
- cd $TRAVIS_BUILD_DIR
- autoreconf -fvi -Im4
- ./configure --enable-unit-tests CFLAGS='-fprofile-arcs -ftest-coverage' CXXFLAGS='-fprofile-arcs -ftest-coverage'
- export MYFLAGS="-fprofile-arcs -ftest-coverage -I$LIBDIVDIR/include"
- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make
- make check
- make distcheck
- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\""
- tar xzvf andi-*.tar.gz
- cd andi-*
- ./configure --enable-unit-tests --without-libdivsufsort
......
......@@ -10,15 +10,17 @@ endif
SUBDIRS = . $(OPT_PSUFSORT) libs opt src docs
DIST_SUBDIRS = . libs opt src docs test opt/psufsort
# Conditionaly build the tests
# Conditionally build the tests
if BUILD_TESTS
SUBDIRS+= test
AM_TESTS_ENVIRONMENT= \
RANDOM_SEED='@SEED@' ; export RANDOM_SEED ;
TESTS = test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh
$(TESTS): src/andi
# $(MAKE) -C test
endif # BUILD_TESTS
......
......@@ -14,7 +14,7 @@ For the latest [stable release](https://github.com/EvolBioInf/andi/releases) of
## Prerequisites
This program has the following external dependencies: [libdivsufsort](https://code.google.com/p/libdivsufsort/) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
## Compiling
......@@ -42,11 +42,11 @@ From this distance matrix the phylogeny can be inferred via neighbor-joining. Ch
# Links and Additional Resources
The release of this software is accompanied by a paper from [Haubold et al.](http://bioinformatics.oxfordjournals.org/content/31/8/1169). It explains the used *anchor distance* strategy in great detail. The `maf2phy.awk` script used in the validation process is located under `scripts`. Simulations were done using our own [simK](http://guanine.evolbio.mpg.de/bioBox/) tool.
The release of this software is accompanied by a paper from [Haubold et al.](http://bioinformatics.oxfordjournals.org/content/31/8/1169). It explains the used *anchor distance* strategy in great detail. The `maf2phy.awk` script used in the validation process is located under `scripts`. Simulations were done using our own [simK](http://guanine.evolbio.mpg.de/bioBox/) tool. For a demo visualising the internals of andi visit our [GitHub pages](http://evolbioinf.github.io/andi/).
## Data Sets
1. 29. E. coli strains: [data](http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz)
1. 29 E. coli and Shigella strains: [data](http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz)
2. 109 E. coli ST131 strains ([paper](http://www.pnas.org/content/early/2014/03/28/1322678111.abstract)):
* [99 newly sequenced strains](https://github.com/BeatsonLab-MicrobialGenomics/ST131_99)
* [10 previously published strains](http://guanine.evolbio.mpg.de/andi/st131_extra.tgz)
......
No preview for this file type
AC_INIT([andi], [0.9.5])
AC_INIT([andi], [0.9.6])
AM_INIT_AUTOMAKE([-Wall foreign ])
AC_CONFIG_MACRO_DIR([m4])
......@@ -52,6 +52,8 @@ The latter may result in longer runtimes.])
AM_CONDITIONAL([BUILD_WITH_LIBDIVSUFSORT],[test "x${with_libdivsufsort}" != "xno"])
# The unit tests require GLIB2. So by default do not build the test.
# If enabled, check for glib.
......@@ -62,6 +64,16 @@ AC_ARG_ENABLE([unit-tests],
AM_CONDITIONAL([BUILD_TESTS],[test "x${try_unit_tests}" = xyes])
# The user may set a seed for the unit tests, so that builds are reproducible.
# A value of 0 makes the tests random.
AC_ARG_WITH([seed],
[AS_HELP_STRING([--with-seed=INT],
[random seed for reproducible builds. @<:@default: 0@:>@])],
[SEED=$withval],
[SEED=0])
AC_SUBST([SEED])
if test "x${try_unit_tests}" = xyes; then
have_glib=yes
......@@ -74,6 +86,7 @@ if test "x${try_unit_tests}" = xyes; then
AX_CXX_COMPILE_STDCXX_11([],[mandatory])
fi
# Check for various headers including those used by libdivsufsort.
AC_CHECK_HEADERS([limits.h stdlib.h string.h unistd.h stdint.h inttypes.h err.h errno.h fcntl.h])
......@@ -92,7 +105,7 @@ AC_HEADER_STDBOOL
# it should be avoided in the first place. Thus I really don't need these checks.
AC_CHECK_FUNCS([floor pow sqrt strdup strerror])
AC_CHECK_FUNCS([strndup])
AC_CHECK_FUNCS([strndup strcasecmp])
AC_CHECK_FUNCS([strchr strrchr strchrnul])
AC_CHECK_FUNCS([strtoul strtod])
AC_CHECK_FUNCS([reallocarray])
......
......@@ -3,7 +3,7 @@
andi \- estimates evolutionary distance
.SH SYNOPSIS
.B andi
[\fI-jrv\fR] [\fI-p FLOAT\fR] [\fI-t INT\fR] \fIFILES\fR...
[\fI-jlv\fR] [\fI-b INT\fR] [\fI-p FLOAT\fR] [\fI-m MODEL\fR] [\fI-t INT\fR] \fIFILES\fR...
.SH DESCRIPTION
.TP
\fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (see below).
......@@ -11,18 +11,21 @@ andi \- estimates evolutionary distance
The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly.
.SH OPTIONS
.TP
\fB\-b, \-\-bootstrap\fR <INT>
Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016, in review) for a detailed explanation.
.TP
\fB\-j\fR, \fB\-\-join\fR
Use this mode if each of your \fIFASTA\fR files represents one assembly with numerous contigs. \fBandi\fR will then treat all of the contained sequences per file as a single genome. In this mode at least one filename must be provided via command line arguments. For the output the filename is used to identify each sequence.
.TP
\fB\-m\fR, \fB\-\-low-memory\fR
\fB\-l\fR, \fB\-\-low-memory\fR
In multithreaded mode, \fBandi\fR requires memory linear to the amount of threads. The low memory mode changes this to a constant demand independent from the used number of threads. Unfortunately, this comes at a significant runtime cost.
.TP
\fB\-m\fR, \fB\-\-model\fR <Raw|JC|Kimura>
Different models of nucleotide evolution are supported. By default the Jukes-Cantor correction is used.
.TP
\fB\-p\fR <FLOAT>
Significance of an anchor pair; default: 0.05.
.TP
\fB\-r\fR, \fB\-\-raw\fR
Calculates raw distances; default: Jukes\-Cantor corrected.
.TP
\fB\-t\fR <INT>
The number of threads to be used; by default, all available processors are used.
.br
......
......@@ -153,16 +153,19 @@ Once you have downloaded the package, unzip it and change into the newly created
Now \andi should be ready for use. Try invoking the help.
\begin{lstlisting}
~/andi-0.9 % andi -h
Usage: andi [-jrv] [-p FLOAT] FILES...
FILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.
Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...
FILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.
Options:
-b, --bootstrap <INT>
Print additional bootstrap matrices
-j, --join Treat all sequences from one file as a single genome
-m, --low-memory Use less memory at the cost of speed
-l, --low-memory Use less memory at the cost of speed
-m, --model <Raw|JC|Kimura>
Pick an evolutionary model; default: JC
-p <FLOAT> Significance of an anchor pair; default: 0.05
-r, --raw Calculates raw distances; default: Jukes-Cantor corrected
-v, --verbose Prints additional information
-t <INT> The number of threads to be used; default: 1
-t, --threads <INT>
The number of threads to be used; by default, all available processors are used
-h, --help Display this help and exit
--version Output version information and acknowledgments
\end{lstlisting}
......@@ -264,7 +267,7 @@ S2 0.0995 0.0000
In the above examples the runtime dropped from \SI{0.613}{\second}, to \SI{0.362}{\second} using two threads. Giving \andi more threads than input genomes leads to no further speed improvement. \, The other important option is \lstinline$--join$ (see Section~\ref{sec:join}).
By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. This means, the output is substitution rates, which is greater than the mismatch rate. To obtain the pure mismatch rate, use the \lstinline$--raw$ switch.
By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura, raw). The \lstinline$--model$ parameter can be used to switch between them.
Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}.
......
noinst_LIBRARIES = libpsufsort.a
libpsufsort_a_SOURCES = psufsort.cxx c_interface.cxx interface.h $(top_srcdir)/src/global.h
libpsufsort_a_CXXFLAGS = $(OPENMP_CXXFLAGS) -W -Wall
libpsufsort_a_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
libpsufsort_a_CPPFLAGS = $(OPENMP_CXXFLAGS) -I$(top_srcdir)/src
......@@ -6,10 +6,11 @@ PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
DUMMY=dummy.cxx
endif
andi_SOURCES = andi.c esa.c process.c sequence.c io.c global.h esa.h process.h sequence.h io.h dist_hack.h
andi_SOURCES = andi.c esa.c process.c sequence.c io.c global.h esa.h process.h sequence.h io.h dist_hack.h \
model.h model.c
andi_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -std=gnu99
andi_CFLAGS = $(OPENMP_CFLAGS) -W -Wall -Wno-missing-field-initializers
andi_CXXFLAGS = $(OPENMP_CXXFLAGS) -W -Wall
andi_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra -Wno-missing-field-initializers
andi_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
andi_LDADD = $(PSUFSORT) $(top_builddir)/libs/libpfasta.a $(top_builddir)/opt/libcompat.a
nodist_EXTRA_andi_SOURCES = $(DUMMY)
......
This diff is collapsed.
/** @file
* @brief This file is a preprocessor hack for the two functions `distMatrix` and `distMatrixLM`.
* @brief This file is a preprocessor hack for the two functions `distMatrix`
* and `distMatrixLM`.
*/
#ifdef FAST
#define NAME distMatrix
......@@ -10,63 +11,56 @@
#undef P_OUTER
#undef P_INNER
#define NAME distMatrixLM
#define P_OUTER
#define P_OUTER
#define P_INNER _Pragma("omp parallel for num_threads( THREADS)")
#endif
/** @brief This function calls dist_andi for pairs of subjects and queries, and thereby fills the distance matrix.
/** @brief This function calls dist_andi for pairs of subjects and queries, and
*thereby fills the distance matrix.
*
* This function is actually two functions. It is one template that gets compiled into two functions via
* This function is actually two functions. It is one template that gets
*compiled into two functions via
* preprocessor hacks. The reason is DRY (Do not Repeat Yourselves).
* The two functions only differ by their name and pragmas; i.e. They run in different parallel modes.
* The two functions only differ by their name and pragmas; i.e. They run in
*different parallel modes.
* `distMatrix` is faster than `distMatrixLM` but needs more memory.
*
* @param sequences - The sequences to compare
* @param n - The number of sequences
* @param M - A matrix for additional output data
*/
void NAME( data_t *M, seq_t* sequences, size_t n){
void NAME(struct model *M, seq_t *sequences, size_t n) {
size_t i;
//#pragma
P_OUTER
for(i=0;i<n;i++){
for (i = 0; i < n; i++) {
seq_t *subject = &sequences[i];
esa_s E;
seq_subject_init( subject);
if( esa_init( &E, subject)){
warnx("Failed to create index for %s.", subject->name);
for( size_t j=0; j< n; j++){
M(i,j).distance = (i==j) ? 0.0 : NAN;
M(i,j).coverage = 0.0;
}
continue;
if (seq_subject_init(subject) || esa_init(&E, subject)) {
errx(1, "Failed to create index for %s.", subject->name);
}
// now compare every other sequence to i
size_t j;
P_INNER
for(j=0; j<n; j++){
if( j == i) {
M(i,j) = (data_t){0.0,0.0};
for (j = 0; j < n; j++) {
if (j == i) {
M(i, j) = (struct model){.seq_len = 9, .counts = {9}};
continue;
}
// TODO: Provide a nicer progress indicator.
if( FLAGS & F_EXTRA_VERBOSE ){
#pragma omp critical
{
fprintf( stderr, "comparing %zu and %zu\n", i, j);
}
if (FLAGS & F_EXTRA_VERBOSE) {
#pragma omp critical
{ fprintf(stderr, "comparing %zu and %zu\n", i, j); }
}
size_t ql = sequences[j].len;
M(i,j) = dist_anchor( &E, sequences[j].S, ql, subject->gc);
M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject->gc);
}
esa_free(&E);
......
This diff is collapsed.
......@@ -10,7 +10,7 @@
#include "config.h"
#ifdef HAVE_LIBDIVSUFSORT
# include <divsufsort.h>
#include <divsufsort.h>
#else
#include "../opt/psufsort/interface.h"
......@@ -41,9 +41,9 @@ typedef struct {
saidx_t m;
} lcp_inter_t;
/**
/**
* @brief The ESA type.
*
*
* This structure holds arrays and objects associated with an enhanced
* suffix array (ESA).
*/
......@@ -66,16 +66,15 @@ typedef struct esa_s {
saidx_t *CLD;
} esa_s;
lcp_inter_t get_match_cached( const esa_s *, const char *query, size_t qlen);
lcp_inter_t get_match( const esa_s *, const char *query, size_t qlen);
int esa_init( esa_s *, const seq_t *S);
void esa_free( esa_s *);
lcp_inter_t get_match_cached(const esa_s *, const char *query, size_t qlen);
lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen);
int esa_init(esa_s *, const seq_t *S);
void esa_free(esa_s *);
#ifdef DEBUG
char code2char( ssize_t code);
char code2char(ssize_t code);
#endif //DEBUG
#endif // DEBUG
#endif
......@@ -7,6 +7,8 @@
*/
#ifndef _GLOBAL_H_
#define _GLOBAL_H_
#include <gsl/gsl_rng.h>
#include "config.h"
/**
......@@ -29,15 +31,29 @@ extern int THREADS;
*/
extern double RANDOM_ANCHOR_PROP;
/**
* The number of matrices that should be bootstrapped.
*/
extern long unsigned int BOOTSTRAP;
/**
/**
* A globel random number generator. Has to be seedable.
*/
extern gsl_rng *RNG;
/**
* The evolutionary model.
*/
extern int MODEL;
enum { M_RAW, M_JC, M_KIMURA };
/**
* This enum contains the available flags. Please note that all
* available options are a power of 2.
*/
enum {
F_NONE = 0,
F_RAW = 1,
F_VERBOSE = 2,
F_EXTRA_VERBOSE = 4,
F_NON_ACGT = 8,
......@@ -47,4 +63,3 @@ enum {
};
#endif
......@@ -28,45 +28,52 @@
* @param dsa - An array that holds found sequences.
* @param name - The name of the file to be used for the name of the sequence.
*/
void read_fasta_join( const char* file_name, dsa_t *dsa){
if( !file_name || !dsa ) return;
void read_fasta_join(const char *file_name, dsa_t *dsa) {
if (!file_name || !dsa) return;
dsa_t single;
dsa_init(&single);
read_fasta( file_name, &single);
if( dsa_size( &single) == 0 ){
read_fasta(file_name, &single);
if (dsa_size(&single) == 0) {
return;
}
seq_t joined = dsa_join( &single);
seq_t joined = dsa_join(&single);
/* In join mode we try to be clever about the sequence name. Given the file
* path we extract just the file name. ie. path/file.ext -> file
* This obviously fails on Windows.
*/
const char *left = strrchr( file_name, '/'); // find the last path separator
left = (left == NULL) ? file_name : left + 1; // left is the position one of to the right of the path separator
const char *dot = strchrnul( left, '.'); // find the extension
joined.name = strndup( left, dot-left ); // copy only the file name, not its path or extension
dsa_push( dsa, joined);
dsa_free( &single);
}
const char *left = strrchr(file_name, '/'); // find the last path separator
left = (left == NULL) ? file_name : left + 1; // left is the position one of
// to the right of the path
// separator
const char *dot = strchrnul(left, '.'); // find the extension
joined.name = strndup(
left, dot - left); // copy only the file name, not its path or extension
dsa_push(dsa, joined);
dsa_free(&single);
}
/**
* @brief This function reads sequences from a file.
* @param in - The file pointer to read from.
* @param dsa - An array that holds found sequences.
*/
void read_fasta( const char* file_name, dsa_t *dsa){
if( !file_name || !dsa) return;
void read_fasta(const char *file_name, dsa_t *dsa) {
if (!file_name || !dsa) return;
int file_descriptor =
strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO;
int file_descriptor = strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO;
if (file_descriptor < 0) warn("%s", file_name);
if (file_descriptor < 0) {
warn("%s", file_name);
return;
}
int l;
int check;
......@@ -81,12 +88,12 @@ void read_fasta( const char* file_name, dsa_t *dsa){
pfasta_seq ps;
while ((l = pfasta_read(&pf, &ps)) == 0) {
check = seq_init( &top, ps.seq, ps.name);
check = seq_init(&top, ps.seq, ps.name);
// skip broken sequences
if( check != 0) continue;
dsa_push( dsa, top);
if (check != 0) continue;
dsa_push(dsa, top);
pfasta_seq_free(&ps);
}
......@@ -100,83 +107,79 @@ fail:
close(file_descriptor);
}
/**
* @brief Prints the distance matrix.
*
* This function pretty prints the distance matrix. For small distances
* scientific notation is used.
*
* @param D - The distance matrix
* @param sequences - An array of pointers to the sequences.
* @param n - The number of sequences.
* @param warnings - Print warnings? Set to 0 for bootstrapped matrices.
*/
void print_distances( const data_t *D, const seq_t *sequences, size_t n){
void print_distances(const struct model *D, const seq_t *sequences, size_t n,
int warnings) {
size_t i, j;
int use_scientific = 0;
size_t i,j;
for( i=0; i<n; i++){
for( j=0; j<n; j++){
if( isnan(D(i,j).distance)){
double *DD = malloc(n * n * sizeof(*DD));
if (!DD) err(errno, "meh.");
#define DD(X, Y) (DD[(X)*n + (Y)])
typedef double(estimate_fn)(const model *);
estimate_fn *estimate;
switch (MODEL) {
case M_RAW: estimate = &estimate_RAW; break;
case M_JC: estimate = &estimate_JC; break;
case M_KIMURA: estimate = &estimate_KIMURA; break;
}
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
model datum = D(i, j);
if (!(FLAGS & F_EXTRA_VERBOSE)) {
datum = model_average(&D(i, j), &D(j, i));
}
double dist = DD(i, j) = estimate(&datum);
double coverage = model_coverage(&datum);
if (isnan(dist) && warnings) {
const char str[] = {
"For the two sequences '%s' and '%s' the distance computation "
"failed and is reported as nan. "
"Please refer to the documentation for further details."
};
"For the two sequences '%s' and '%s' the distance "
"computation failed and is reported as nan. "
"Please refer to the documentation for further details."};
warnx(str, sequences[i].name, sequences[j].name);
} else if( D(i,j).distance > 0 && D(i,j).distance < 0.001 ){
} else if (dist > 0 && dist < 0.001) {
use_scientific = 1;
} else if( i < j && D(i,j).coverage < 0.05 && D(j,i).coverage < 0.05){
} else if (i < j && coverage < 0.05 && warnings) {
const char str[] = {
"For the two sequences '%s' and '%s' less than 5%% "
"homology were found (%f and %f, respectively)."
};
warnx(str, sequences[i].name, sequences[j].name, D(i,j).coverage, D(j,i).coverage);
"homology were found (%f and %f, respectively)."};
warnx(str, sequences[i].name, sequences[j].name,
model_coverage(&D(i, j)), model_coverage(&D(j, i)));
}
}
}
printf("%zu\n", n);
for( i=0;i<n;i++){
// Print exactly nine characters of the name. Pad with spaces if necessary.
for (i = 0; i < n; i++) {
// Print exactly nine characters of the name. Pad with spaces if
// necessary.
printf("%-9.9s", sequences[i].name);
for( j=0;j<n;j++){
// print average, weighted by covered nucleotides
double val = 0;
if( i != j){
double ijnucl = D(i,j).coverage * (double)sequences[j].len;
double jinucl = D(j,i).coverage * (double)sequences[i].len;
val = (D(i,j).distance * ijnucl + D(j,i).distance * jinucl)
/ (ijnucl + jinucl);
}
if( FLAGS & F_EXTRA_VERBOSE ){
val = D(i,j).distance;
}
if( !(FLAGS & F_RAW)){
val = -0.75 * log(1.0- (4.0 / 3.0) * val ); // jukes cantor
}
// fix negative zero
if( val <= 0.0 ){
val = 0.0;
}
if( !(FLAGS & F_RAW)){
val = -0.75 * log(1.0- (4.0 / 3.0) * val ); // jukes cantor
}
// fix negative zero
if( val <= 0.0 ){
val = 0.0;
}
for (j = 0; j < n; j++) {
// use scientific notation for small numbers
printf(use_scientific ? " %1.4e" : " %1.4f", val);
printf(use_scientific ? " %1.4e" : " %1.4f", DD(i, j));
}
printf("\n");
}
free(DD);
}
/**
......@@ -184,12 +187,12 @@ void print_distances( const data_t *D, const seq_t *sequences, size_t n){
* @param D - The distance matrix
* @param n - The number of sequences.
*/
void print_coverages( const data_t *D, size_t n){
size_t i,j;
void print_coverages(const struct model *D, size_t n) {
size_t i, j;
printf("\nCoverage:\n");
for(i=0; i<n; i++){
for(j=0; j<n; j++){
printf("%1.4e ", D(i,j).coverage);
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
printf("%1.4e ", model_coverage(&D(i, j)));
}
printf("\n");
}
......
......@@ -9,28 +9,18 @@
#include <errno.h>
#include <stdio.h>
#include "sequence.h"
/** @brief The data structure can be used to store output data resulting
* from the computation of distance.
*/
typedef struct data_s {
/** The distance */
double distance;
/** The coverage */
double coverage;
} data_t;
#include "model.h"
/**
* This is a neat hack for dealing with matrices.
*/
#define D( X, Y) (D[ (X)*n + (Y) ])
#define M( X, Y) (M[ (X)*n + (Y) ])
#define D(X, Y) (D[(X)*n + (Y)])
#define M(X, Y) (M[(X)*n + (Y)])
void read_fasta( const char *, dsa_t *dsa);
void read_fasta_join( const char *, dsa_t *dsa);
void read_fasta(const char *, dsa_t *dsa);
void read_fasta_join(const char *, dsa_t *dsa);
void print_distances( const data_t *D, const seq_t *sequences, size_t n);
void print_coverages( const data_t *D, size_t n);
void print_distances(const struct model *, const seq_t *, size_t, int);
void print_coverages(const struct model *, size_t);
#endif // _IO_H_
/** @file
* @brief This file contains all functions for the mutation matrix and the
* estimation of evolutionary distances thereof.
*/
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <gsl/gsl_randist.h>
#include "global.h"
#include "model.h"
/**
* @brief Sum some mutation count specified by `summands`. Intended to be used
* through the `model_sum` macro.
*
* @param MM - The mutation matrix.
* @param summands - The mutations to add.
* @returns The sum of mutations.
*/
static size_t model_sum_types(const model *MM, const int summands[]) {
size_t total = 0;
for (int i = 0; summands[i] != MUTCOUNTS; ++i) {
total += MM->counts[summands[i]];
}
return total;
}
#define model_sum(MM, ...) \
model_sum_types((MM), (int[]){__VA_ARGS__, MUTCOUNTS})