...
 
Commits (3)
......@@ -58,6 +58,7 @@ test-driver
test_esa
test_seq
test_fasta
test_process
*.trs
# Coverage
......
language: cpp
os:
- linux
# - osx
compiler:
- gcc
- clang
......@@ -13,9 +10,9 @@ addons:
- ubuntu-toolchain-r-test
packages:
- cmake
- g++-4.8
- libglib2.0-dev
- libgsl0-dev
install:
- export LIBDIVDIR="$HOME/libdivsufsort"
- pip install --user cpp-coveralls
......@@ -24,30 +21,26 @@ install:
- cd libdivsufsort-master && mkdir build && cd build
- cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
- make && make install
- if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install gsl; fi
- if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install glib; fi
before_install:
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
# - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
script:
- export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$LIBDIVDIR/lib"
- CONFIGURE_FLAGS=""
- export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
- export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
- cd $TRAVIS_BUILD_DIR
- autoreconf -fvi -Im4
- export MYFLAGS="-fprofile-arcs -ftest-coverage -I$LIBDIVDIR/include"
- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- if [ "${CC}" = "clang" ]; then export CONFIGURE_FLAGS="--disable-openmp"; fi
- ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make
- make check
- make check || cat ./test-suite.log || exit 1
- export MYFLAGS="-I$LIBDIVDIR/include"
- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\""
- ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\" $CONFIGURE_FLAGS"
- tar xzvf andi-*.tar.gz
- cd andi-*
- ./configure --enable-unit-tests --without-libdivsufsort
- ./configure $CONFIGURE_FLAGS --enable-unit-tests --without-libdivsufsort
- make
- make check
- make check || cat ./test-suite.log || exit 1
- cd ..
after_success:
- if [ "${TRAVIS_OS_NAME}" = "linux" -a "$CXX" = "g++-4.8" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi
- if [ "$CXX" = "g++" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi
......@@ -18,7 +18,8 @@ SUBDIRS+= test
AM_TESTS_ENVIRONMENT= \
RANDOM_SEED='@SEED@' ; export RANDOM_SEED ;
TESTS = test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh
XFAIL_TESTS=
TESTS = $(XFAIL_TESTS) test/nan.sh test/low_homo.sh test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh test/test_process
$(TESTS): src/andi
......
No preview for this file type
AC_INIT([andi], [0.11])
AM_INIT_AUTOMAKE([-Wall foreign ])
AC_INIT([andi], [0.12])
AM_INIT_AUTOMAKE([-Wall foreign])
AC_CONFIG_MACRO_DIR([m4])
......@@ -78,8 +78,7 @@ AC_ARG_WITH([seed],
AC_SUBST([SEED])
if test "x${try_unit_tests}" = xyes; then
AS_IF([test "x${try_unit_tests}" = xyes], [
have_glib=yes
PKG_CHECK_MODULES([GLIB], [glib-2.0], [], [have_glib=no])
......@@ -88,7 +87,7 @@ if test "x${try_unit_tests}" = xyes; then
fi
AX_CXX_COMPILE_STDCXX_11([],[mandatory])
fi
])
# Check for various headers including those used by libdivsufsort.
......
andi (0.11-2) UNRELEASED; urgency=medium
andi (0.12-1) UNRELEASED; urgency=medium
* Team upload.
[ Fabian Klötzl ]
* Remove useless dh-autoreconf build dependency
* Import new upstream release
[ Steffen Moeller ]
* debian/upstream/metadata: added refs to registries
......
.TH ANDI "1" "2017-06-30" "@VERSION@" "andi manual"
.TH ANDI "1" "2017-09-17" "@VERSION@" "andi manual"
.SH NAME
andi \- estimates evolutionary distance
.SH SYNOPSIS
.B andi
[\fI-jlv\fR] [\fI-b INT\fR] [\fI-p FLOAT\fR] [\fI-m MODEL\fR] [\fI-t INT\fR] \fIFILES\fR...
[\fIOPTIONS...\fR] \fIFILES\fR...
.SH DESCRIPTION
.TP
\fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (2015).
......@@ -11,10 +11,10 @@ andi \- estimates evolutionary distance
The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly.
.SH OPTIONS
.TP
\fB\-b\fR, \fB\-\-bootstrap\fR <INT>
\fB\-b\fR \fIINT\fR, \fB\-\-bootstrap\fR=\fIINT\fR
Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016) for a detailed explanation.
.TP
\fB--file-of-filenames\fR <FILE>
\fB--file-of-filenames\fR=\fIFILE\fR
Usually, \fBandi\fR is called with the filenames as commandline arguments. With this option the filenames may also be read from a file itself, with one name per line. Use a single dash (\fB'-'\fR) to read from stdin.
.TP
\fB\-j\fR, \fB\-\-join\fR
......@@ -23,13 +23,16 @@ Use this mode if each of your \fIFASTA\fR files represents one assembly with num
\fB\-l\fR, \fB\-\-low-memory\fR
In multithreaded mode, \fBandi\fR requires memory linear to the amount of threads. The low memory mode changes this to a constant demand independent from the used number of threads. Unfortunately, this comes at a significant runtime cost.
.TP
\fB\-m\fR, \fB\-\-model\fR <Raw|JC|Kimura>
Different models of nucleotide evolution are supported. By default the Jukes-Cantor correction is used.
\fB\-m\fR \fIMODEL\fR, \fB\-\-model\fR=\fIMODEL\fr
Set the nucleotide evolution model to one of 'Raw', 'JC', or 'Kimura'. By default the Jukes-Cantor correction is used.
.TP
\fB\-p\fR <FLOAT>
\fB\-p\fR \fIFLOAT\fR
Significance of an anchor; default: 0.025.
.TP
\fB\-t\fR, \fB\-\-threads\fR <INT>
\fB--progress\fR[=\fIWHEN\fR]
Print a progress bar. \fIWHEN\fR can be 'auto' (default if omitted), 'always', or 'never'.
.TP
\fB\-t\fR \fIINT\fR, \fB\-\-threads\fR=\fIINT\fR
The number of threads to be used; by default, all available processors are used.
.br
Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
......@@ -38,7 +41,7 @@ Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
By default \fBandi\fR outputs the full names of sequences, optionally padded with spaces, if they are shorter than ten characters. Names longer than ten characters may lead to problems with downstream tools. With this switch names will be truncated.
.TP
\fB\-v\fR, \fB\-\-verbose\fR
Prints additional information. Apply multiple times for extra verboseness.
Prints additional information, including the amount of found homology. Apply multiple times for extra verboseness.
.TP
\fB\-h\fR, \fB\-\-help\fR
Prints the synopsis and an explanation of available options.
......@@ -46,7 +49,7 @@ Prints the synopsis and an explanation of available options.
\fB\-\-version\fR
Outputs version information and acknowledgments.
.SH COPYRIGHT
Copyright \(co 2014 - 2016 Fabian Klötzl
Copyright \(co 2014 - 2017 Fabian Klötzl
License GPLv3+: GNU GPL version 3 or later.
.br
This is free software: you are free to change and redistribute it.
......
This diff is collapsed.
......@@ -27,6 +27,7 @@ args+=(
Kimura\:Kimura\-two\-parameter
))'
"($info)-p+[Significance of an anchor; default\: 0.025]:float:"
"($info)--progress=[Show progress bar]:when:(always auto never)"
"($info -t --threads)"{-t+,--threads=}'[The number of threads to be used; by default, all available processors are used]:num_threads:'
"($info)--truncate-names[Print only the first ten characters of each name]"
"($info)*"{-v,--verbose}'[Prints additional information]'
......
......@@ -45,10 +45,9 @@
int FLAGS = 0;
int THREADS = 1;
long unsigned int BOOTSTRAP = 0;
double RANDOM_ANCHOR_PROP = 0.025;
double ANCHOR_P_VALUE = 0.025;
gsl_rng *RNG = NULL;
int MODEL = M_JC;
int EXIT_CODE = EXIT_SUCCESS;
void usage(int);
void version(void);
......@@ -66,6 +65,7 @@ int main(int argc, char *argv[]) {
{"version", no_argument, NULL, 0},
{"truncate-names", no_argument, NULL, 0},
{"file-of-filenames", required_argument, NULL, 0},
{"progress", optional_argument, NULL, 0},
{"help", no_argument, NULL, 'h'},
{"verbose", no_argument, NULL, 'v'},
{"join", no_argument, NULL, 'j'},
......@@ -79,6 +79,9 @@ int main(int argc, char *argv[]) {
// Use all available processors by default.
THREADS = omp_get_num_procs();
#endif
enum { P_AUTO, P_NEVER, P_ALWAYS } progress = P_AUTO;
struct string_vector file_names;
string_vector_init(&file_names);
......@@ -105,6 +108,19 @@ int main(int argc, char *argv[]) {
if (strcasecmp(option_str, "file-of-filenames") == 0) {
read_into_string_vector(optarg, &file_names);
}
if (strcasecmp(option_str, "progress") == 0) {
if (!optarg || strcasecmp(optarg, "always") == 0) {
progress = P_ALWAYS;
} else if (strcasecmp(optarg, "auto") == 0) {
progress = P_AUTO;
} else if (strcasecmp(optarg, "never") == 0) {
progress = P_NEVER;
} else {
warnx("invalid argument to --progress '%s'. Expected "
"one of 'auto', 'always', or 'never'.",
optarg);
}
}
break;
}
case 'h': usage(EXIT_SUCCESS); break;
......@@ -117,21 +133,21 @@ int main(int argc, char *argv[]) {
double prop = strtod(optarg, &end);
if (errno || end == optarg || *end != '\0') {
warnx(
soft_errx(
"Expected a floating point number for -p argument, but "
"'%s' was given. Skipping argument.",
optarg);
break;
}
if (prop < 0.0 || prop > 1.0) {
warnx("A probability should be a value between 0 and 1; "
"Ignoring -p %f argument.",
prop);
if (prop <= 0.0 || prop >= 1.0) {
soft_errx("A probability should be a value between 0 and "
"1, exlusive; Ignoring -p %f argument.",
prop);
break;
}
RANDOM_ANCHOR_PROP = prop;
ANCHOR_P_VALUE = prop;
break;
}
case 'l': FLAGS |= F_LOW_MEMORY; break;
......@@ -172,7 +188,7 @@ int main(int argc, char *argv[]) {
long unsigned int bootstrap = strtoul(optarg, &end, 10);
if (errno || end == optarg || *end != '\0' || bootstrap == 0) {
warnx(
soft_errx(
"Expected a positive number for -b argument, but '%s' "
"was given. Ignoring -b argument.",
optarg);
......@@ -191,8 +207,9 @@ int main(int argc, char *argv[]) {
} else if (strcasecmp(optarg, "KIMURA") == 0) {
MODEL = M_KIMURA;
} else {
warnx("Ignoring argument for --model. Expected Raw, JC or "
"Kimura");
soft_errx(
"Ignoring argument for --model. Expected Raw, JC or "
"Kimura");
}
break;
}
......@@ -217,11 +234,13 @@ int main(int argc, char *argv[]) {
size_t minfiles = FLAGS & F_JOIN ? 2 : 1;
if (string_vector_size(&file_names) < minfiles) {
// not enough files passed via arguments; read from stdin.
string_vector_push_back(&file_names, "-");
// explain to the user, why nothing is happening.
if (isatty(STDIN_FILENO)) {
warnx("Not enough file names given; expecting input via stdin.");
// not enough files passed via arguments
if (!isatty(STDIN_FILENO)) {
// read from stdin in pipe
string_vector_push_back(&file_names, "-");
} else {
// print a helpful message on './andi' without args
usage(EXIT_FAILURE);
}
}
......@@ -248,11 +267,6 @@ int main(int argc, char *argv[]) {
n);
}
if (FLAGS & F_VERBOSE) {
fprintf(stderr, "Comparing %zu sequences\n", n);
fflush(stderr);
}
RNG = gsl_rng_alloc(gsl_rng_default);
if (!RNG) {
err(1, "RNG allocation failed.");
......@@ -293,20 +307,27 @@ int main(int argc, char *argv[]) {
}
if (FLAGS & F_SHORT) {
warnx("One of the given input sequences is shorter than a thousand "
"nucleotides. This may result in inaccurate distances. Try an "
"alignment instead.");
soft_errx(
"One of the given input sequences is shorter than a thousand "
"nucleotides. This may result in inaccurate distances. Try an "
"alignment instead.");
}
// side channel
EXIT_CODE = EXIT_SUCCESS;
// determine whether to print a progress bar
if (progress == P_AUTO) {
progress = isatty(STDERR_FILENO) ? P_ALWAYS : P_NEVER;
}
if (progress == P_ALWAYS) {
FLAGS |= F_PRINT_PROGRESS;
}
// compute distance matrix
calculate_distances(dsa_data(&dsa), n);
dsa_free(&dsa);
gsl_rng_free(RNG);
return EXIT_CODE;
return FLAGS & F_SOFT_ERROR ? EXIT_FAILURE : EXIT_SUCCESS;
}
/**
......@@ -314,22 +335,25 @@ int main(int argc, char *argv[]) {
*/
void usage(int status) {
const char str[] = {
"Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...\n"
"\tFILES... can be any sequence of FASTA files. If no files are "
"supplied, stdin is used instead.\n"
"Usage: andi [OPTIONS...] FILES...\n"
"\tFILES... can be any sequence of FASTA files.\n"
"\tUse '-' as file name to read from stdin.\n"
"Options:\n"
" -b, --bootstrap <INT> Print additional bootstrap matrices\n"
" --file-of-filenames <FILE> Read additional filenames from "
" -b, --bootstrap=INT Print additional bootstrap matrices\n"
" --file-of-filenames=FILE Read additional filenames from "
"FILE; one per line\n"
" -j, --join Treat all sequences from one file as a single "
"genome\n"
" -l, --low-memory Use less memory at the cost of speed\n"
" -m, --model <Raw|JC|Kimura> Pick an evolutionary model; default: "
" -m, --model=MODEL Pick an evolutionary model of 'Raw', 'JC', "
"'Kimura'; default: "
"JC\n"
" -p <FLOAT> Significance of an anchor; default: 0.025\n"
" -p FLOAT Significance of an anchor; default: 0.025\n"
" --progress=WHEN Print a progress bar 'always', 'never', or "
"'auto'; default: auto\n"
#ifdef _OPENMP
" -t, --threads <INT> Set the number of threads; by default, all "
"available processors are used\n"
" -t, --threads=INT Set the number of threads; by default, all "
"processors are used\n"
#endif
" --truncate-names Truncate names to ten characters\n"
" -v, --verbose Prints additional information\n"
......
......@@ -2,9 +2,10 @@
* @brief This file is a preprocessor hack for the two functions `distMatrix`
* and `distMatrixLM`.
*/
// clang-format off
#ifdef FAST
#define NAME distMatrix
#define P_OUTER _Pragma("omp parallel for num_threads( THREADS)")
#define P_OUTER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress)")
#define P_INNER
#else
#undef NAME
......@@ -12,8 +13,9 @@
#undef P_INNER
#define NAME distMatrixLM
#define P_OUTER
#define P_INNER _Pragma("omp parallel for num_threads( THREADS)")
#define P_INNER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress, i, E, subject)")
#endif
// clang-format on
/** @brief This function calls dist_andi for pairs of subjects and queries, and
* thereby fills the distance matrix.
......@@ -32,6 +34,14 @@
void NAME(struct model *M, const seq_t *sequences, size_t n) {
size_t i;
size_t progress_counter = 0;
int print_progress = FLAGS & F_PRINT_PROGRESS;
if (print_progress) {
fprintf(stderr, "Comparing %zu sequences: %5.1f%% (%zu/%zu)", n, 0.0,
(size_t)0, n * n - n);
}
//#pragma
P_OUTER
for (i = 0; i < n; i++) {
......@@ -56,15 +66,31 @@ void NAME(struct model *M, const seq_t *sequences, size_t n) {
size_t ql = sequences[j].len;
M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc);
#pragma omp atomic update
progress_counter++;
}
// TODO: Provide a nicer progress indicator.
if (FLAGS & F_EXTRA_VERBOSE) {
if (print_progress) {
size_t local_progress_counter;
size_t num_comparisons = n * n - n;
#pragma omp atomic read
local_progress_counter = progress_counter;
double progress =
100.0 * (double)local_progress_counter / num_comparisons;
#pragma omp critical
fprintf(stderr, "Subject %s done.\n", sequences[i].name);
fprintf(stderr, "\rComparing %zu sequences: %5.1f%% (%zu/%zu)", n,
progress, local_progress_counter, num_comparisons);
}
esa_free(&E);
seq_subject_free(&subject);
}
if (print_progress) {
fprintf(stderr, ", done.\n");
}
}
......@@ -313,7 +313,9 @@ int esa_init_CLD(esa_s *C) {
const saidx_t *LCP = C->LCP;
typedef struct pair_s { saidx_t idx, lcp; } pair_t;
typedef struct pair_s {
saidx_t idx, lcp;
} pair_t;
pair_t *stack = malloc((C->len + 1) * sizeof(*stack));
CHECK_MALLOC(stack);
......
......@@ -26,11 +26,11 @@ extern int FLAGS;
extern int THREADS;
/**
* The ::RANDOM_ANCHOR_PROP represents the probability with which a found
* anchor is a random match and not homologous. Its value can be set using
* the `-p` switch.
* The ::ANCHOR_P_VALUE represents the probability that an anchor will be found,
* if the two sequences are unrelated. I.e. it is the p-value for H_0: random
* sequences. Its value can be set using the `-p` switch.
*/
extern double RANDOM_ANCHOR_PROP;
extern double ANCHOR_P_VALUE;
/**
* The number of matrices that should be bootstrapped.
......@@ -49,11 +49,6 @@ extern int MODEL;
enum { M_RAW, M_JC, M_KIMURA };
/**
* Global exit code. Should be non-zero on error.
*/
extern int EXIT_CODE;
/**
* This enum contains the available flags. Please note that all
* available options are a power of 2.
......@@ -66,7 +61,9 @@ enum {
F_NON_ACGT = 8,
F_JOIN = 16,
F_LOW_MEMORY = 32,
F_SHORT = 64
F_SHORT = 64,
F_PRINT_PROGRESS = 128,
F_SOFT_ERROR = 256
};
/**
......@@ -79,6 +76,26 @@ enum {
if (PTR == NULL) { \
err(errno, "Out of memory"); \
} \
} while (0);
} while (0)
/**
* @brief This macro is used to print a warning and make the program exit with
* an failure exit code, later.
*/
#define soft_err(...) \
do { \
FLAGS |= F_SOFT_ERROR; \
warn(__VA_ARGS__); \
} while (0)
/**
* @brief This macro is used to print a warning and make the program exit with
* an failure exit code, later.
*/
#define soft_errx(...) \
do { \
FLAGS |= F_SOFT_ERROR; \
warnx(__VA_ARGS__); \
} while (0)
#endif
......@@ -103,7 +103,8 @@ size_t string_vector_size(const struct string_vector *sv) {
void read_into_string_vector(const char *file_name, struct string_vector *sv) {
FILE *file = strcmp(file_name, "-") ? fopen(file_name, "r") : stdin;
if (!file) {
err(errno, "%s", file_name);
soft_err("%s", file_name);
return;
}
while (1) {
......@@ -118,7 +119,8 @@ void read_into_string_vector(const char *file_name, struct string_vector *sv) {
}
if (check == -1) {
err(errno, "%s", file_name);
soft_err("%s", file_name);
break;
}
char *nl = strchr(str, '\n');
......@@ -137,7 +139,7 @@ void read_into_string_vector(const char *file_name, struct string_vector *sv) {
int check = fclose(file);
if (check != 0) {
err(errno, "%s", file_name);
soft_err("%s", file_name);
}
}
......@@ -198,7 +200,7 @@ void read_fasta(const char *file_name, dsa_t *dsa) {
strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO;
if (file_descriptor < 0) {
warn("%s", file_name);
soft_err("%s", file_name);
return;
}
......@@ -209,7 +211,7 @@ void read_fasta(const char *file_name, dsa_t *dsa) {
pfasta_file pf;
if ((l = pfasta_parse(&pf, file_descriptor)) != 0) {
warnx("%s: %s", file_name, pfasta_strerror(&pf));
soft_errx("%s: %s", file_name, pfasta_strerror(&pf));
goto fail;
}
......@@ -225,7 +227,7 @@ void read_fasta(const char *file_name, dsa_t *dsa) {
}
if (l < 0) {
warnx("%s: %s", file_name, pfasta_strerror(&pf));
soft_errx("%s: %s", file_name, pfasta_strerror(&pf));
pfasta_seq_free(&ps);
}
......@@ -276,23 +278,30 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
}
double dist = DD(i, j) = i == j ? 0.0 : estimate(&datum);
double coverage = model_coverage(&datum);
if (dist > 0 && dist < 0.001) {
use_scientific = 1;
}
if (isnan(dist) && warnings) {
EXIT_CODE = EXIT_FAILURE;
const char str[] = {
"For the two sequences '%s' and '%s' the distance "
"computation failed and is reported as nan. "
"Please refer to the documentation for further details."};
warnx(str, sequences[i].name, sequences[j].name);
} else if (dist > 0 && dist < 0.001) {
use_scientific = 1;
} else if (i < j && coverage < 0.05 && warnings) {
const char str[] = {
"For the two sequences '%s' and '%s' less than 5%% "
"homology were found (%f and %f, respectively)."};
warnx(str, sequences[i].name, sequences[j].name,
model_coverage(&D(i, j)), model_coverage(&D(j, i)));
soft_errx(str, sequences[i].name, sequences[j].name);
}
if (!isnan(dist) && i < j && warnings) {
double coverage1 = model_coverage(&D(i, j));
double coverage2 = model_coverage(&D(j, i));
if (coverage1 < 0.05 || coverage2 < 0.05) {
const char str[] = {
"For the two sequences '%s' and '%s' less than 5%% "
"homology were found (%f and %f, respectively)."};
soft_errx(str, sequences[i].name, sequences[j].name,
coverage1, coverage2);
}
}
}
}
......
......@@ -202,6 +202,26 @@ void model_count_equal(model *MM, const char *S, size_t len) {
MM->counts[TtoT] += local_counts[2];
}
/** @brief Convert a nucleotide to a 2bit representation.
*
* We want to map characters:
* A → 0
* C → 1
* G → 2
* T → 3
* The trick used below is that the three lower bits of the
* characters are unique. Thus, they can be used to compute the mapping
* above. The mapping itself is done via tricky bitwise operations.
*
* @param c - input nucleotide
* @returns 2bit representation.
*/
char nucl2bit(char c) {
c &= 6;
c ^= c >> 1;
return c >> 1;
}
/**
* @brief Count the substitutions and add them to the mutation matrix.
*
......@@ -213,7 +233,7 @@ void model_count_equal(model *MM, const char *S, size_t len) {
void model_count(model *MM, const char *S, const char *Q, size_t len) {
size_t local_counts[MUTCOUNTS] = {0};
for (; len--; S++, Q++) {
for (size_t i = 0; i < len; S++, Q++, i++) {
char s = *S;
char q = *Q;
......@@ -222,24 +242,9 @@ void model_count(model *MM, const char *S, const char *Q, size_t len) {
continue;
}
/* We want to map characters:
* A → 0
* C → 1
* G → 2
* T → 3
* The trick used below is that the three lower bits of the
* characters are unique. Thus, they can be used to compute the mapping
* above. The mapping itself is done via tricky bitwise operations.
*/
unsigned char nibble_s = s & 7;
unsigned char nibble_q = q & 7;
static const unsigned int mm1 = 0x20031000;
// Pick the correct two bits representing s and q.
unsigned char foo = (mm1 >> (4 * nibble_s)) & 0x3;
unsigned char baz = (mm1 >> (4 * nibble_q)) & 0x3;
unsigned char foo = nucl2bit(s);
unsigned char baz = nucl2bit(q);
/*
* The mutation matrix is symmetric. For convenience we define the
......
......@@ -21,7 +21,7 @@
#include <omp.h>
#endif
double shuprop(size_t x, double g, size_t l);
double shustring_cum_prob(size_t x, double g, size_t l);
int calculate_bootstrap(const struct model *M, const seq_t *sequences,
size_t n);
......@@ -31,18 +31,17 @@ int calculate_bootstrap(const struct model *M, const seq_t *sequences,
* Given some parameters calculate the minimum length for anchors according
* to the distribution from Haubold et al. (2009).
*
* @param p - The probability with which an anchor is allowed to be random.
* @param p - The probability with which an anchor will be created under a
* random model.
* @param g - The the relative amount of GC in the subject.
* @param l - The length of the subject.
* @param l - The length of the subject (includes revcomp).
* @returns The minimum length of an anchor.
*/
size_t min_anchor_length(double p, double g, size_t l) {
size_t x = 1;
double prop = 0.0;
// Find smallest x with P(X > x) < p
for (; prop < 1 - p; x++) {
prop = shuprop(x, g / 2, l);
while (shustring_cum_prob(x, g / 2, l) < 1 - p) {
x++;
}
return x;
......@@ -82,18 +81,20 @@ size_t binomial_coefficient(size_t n, size_t k) {
/**
* @brief Given `x` this function calculates the probability of a shustring
* with a length less than `x`.
* with a length less or equal to `x` under a random model. This means, it is
* the cumulative probability.
*
* Let X be the longest shortest unique substring (shustring) at any position.
* Then this function computes P{X <= x} with respect to the given parameter
* set. See Haubold et al. (2009).
* set. See Haubold et al. (2009). Note that `x` includes the final mismatch.
* Thus, `x` is `match length + 1`.
*
* @param x - The maximum length of a shustring.
* @param p - The half of the relative amount of GC in the DNA.
* @param l - The length of the subject.
* @returns The probability of a certain shustring length.
*/
double shuprop(size_t x, double p, size_t l) {
double shustring_cum_prob(size_t x, double p, size_t l) {
double xx = (double)x;
double ll = (double)l;
size_t k;
......@@ -115,6 +116,106 @@ double shuprop(size_t x, double p, size_t l) {
return s;
}
typedef _Bool bool;
#define false 0
#define true !false
/**
* @brief This structure captures properties of an anchor.
*/
struct anchor {
/** The position on the subject. */
size_t pos_S;
/** The position on the query. */
size_t pos_Q;
/** The length of the exact match. */
size_t length;
};
/**
* @brief This is a structure of assorted variables needed for anchor finding.
*/
struct context {
const esa_s *C;
const char *query;
size_t query_length;
size_t threshold;
};
/**
* @brief Compute the length of the longest common prefix of two strings.
*
* @param S - One string.
* @param Q - Another string.
* @param remaining - The length of one of the strings.
* @returns the length of the lcp.
*/
static inline size_t lcp(const char *S, const char *Q, size_t remaining) {
size_t length = 0;
while (length < remaining && S[length] == Q[length]) {
length++;
}
return length;
}
/**
* @brief Check whether the last anchor can be extended by a lucky anchor.
*
* Anchors are defined to be unique and of a minimum length. The uniqueness
* requires us to search throw the suffix array for a second appearance of the
* anchor. However, if a left anchor is already unique, we could be sloppy and
* drop the uniqueness criterion for the second anchor. This way we can skip the
* lookup and just compare characters directly. However, for a lucky anchor the
* match still has to be longer than the threshold.
*
* @param ctx - Matching context of various variables.
* @param last_match - The last anchor.
* @param this_match - Input/Output variable for the current match.
* @returns true iff the current match is a lucky anchor.
*/
static inline bool lucky_anchor(const struct context *ctx,
const struct anchor *last_match,
struct anchor *this_match) {
size_t advance = this_match->pos_Q - last_match->pos_Q;
size_t gap = this_match->pos_Q - last_match->pos_Q - last_match->length;
size_t try_pos_S = last_match->pos_S + advance;
if (try_pos_S >= (size_t)ctx->C->len || gap > ctx->threshold) {
return false;
}
this_match->pos_S = try_pos_S;
this_match->length =
lcp(ctx->query + this_match->pos_Q, ctx->C->S + try_pos_S,
ctx->query_length - this_match->pos_Q);
return this_match->length >= ctx->threshold;
}
/**
* @brief Check for a new anchor.
*
* Given the current context and starting position check if the new match is an
* anchor. The latter requires uniqueness and a certain minimum length.
*
* @param ctx - Matching context of various variables.
* @param last_match - (unused)
* @param this_match - Input/Output variable for the current match.
* @returns true iff an anchor was found.
*/
static inline bool anchor(const struct context *ctx,
const struct anchor *last_match,
struct anchor *this_match) {
lcp_inter_t inter = get_match_cached(ctx->C, ctx->query + this_match->pos_Q,
ctx->query_length - this_match->pos_Q);
this_match->pos_S = ctx->C->SA[inter.i];
this_match->length = inter.l <= 0 ? 0 : inter.l;
return inter.i == inter.j && this_match->length >= ctx->threshold;
}
/**
* @brief Divergence estimation using the anchor technique.
*
......@@ -135,83 +236,72 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
double gc) {
struct model ret = {.seq_len = query_length, .counts = {0}};
lcp_inter_t inter;
size_t last_pos_Q = 0;
size_t last_pos_S = 0;
size_t last_length = 0;
// This variable indicates that the last anchor was the right anchor of a
// pair.
size_t last_was_right_anchor = 0;
struct anchor this_match = {0};
struct anchor last_match = {0};
bool last_was_right_anchor = false;
size_t this_pos_Q = 0;
size_t this_pos_S;
size_t this_length;
size_t threshold = min_anchor_length(ANCHOR_P_VALUE, gc, C->len);
size_t num_right_anchors = 0;
size_t threshold = min_anchor_length(RANDOM_ANCHOR_PROP, gc, C->len);
struct context ctx = {C, query, query_length, threshold};
// Iterate over the complete query.
while (this_pos_Q < query_length) {
inter =
get_match_cached(C, query + this_pos_Q, query_length - this_pos_Q);
this_length = inter.l <= 0 ? 0 : inter.l;
while (this_match.pos_Q < query_length) {
if (inter.i == inter.j && this_length >= threshold) {
// Check for lucky anchors and fall back to normal strategy.
if (lucky_anchor(&ctx, &last_match, &this_match) ||
anchor(&ctx, &last_match, &this_match)) {
// We have reached a new anchor.
this_pos_S = C->SA[inter.i];
size_t end_S = last_match.pos_S + last_match.length;
size_t end_Q = last_match.pos_Q + last_match.length;
// Check if this can be a right anchor to the last one.
if (this_pos_S > last_pos_S &&
this_pos_Q - last_pos_Q == this_pos_S - last_pos_S) {
num_right_anchors++;
if (this_match.pos_S > end_S &&
this_match.pos_Q - end_Q == this_match.pos_S - end_S) {
// classify nucleotides in the qanchor
model_count_equal(&ret, query + last_pos_Q, last_length);
// classify nucleotides in the left qanchor
model_count_equal(&ret, query + last_match.pos_Q,
last_match.length);
// Count the SNPs in between.
model_count(&ret, C->S + last_pos_S + last_length,
query + last_pos_Q + last_length,
this_pos_Q - last_pos_Q - last_length);
last_was_right_anchor = 1;
model_count(&ret, C->S + end_S, query + end_Q,
this_match.pos_Q - end_Q);
last_was_right_anchor = true;
} else {
if (last_was_right_anchor) {
// If the last was a right anchor, but with the current one,
// we cannot extend, then add its length.
model_count_equal(&ret, query + last_pos_Q, last_length);
} else if (last_length >= threshold * 2) {
model_count_equal(&ret, query + last_match.pos_Q,
last_match.length);
} else if (last_match.length >= threshold * 2) {
// The last anchor wasn't neither a left or right anchor.
// But, it was as long as an anchor pair. So still count it.
model_count_equal(&ret, query + last_pos_Q, last_length);
model_count_equal(&ret, query + last_match.pos_Q,
last_match.length);
}
last_was_right_anchor = 0;
last_was_right_anchor = false;
}
// Cache values for later
last_pos_Q = this_pos_Q;
last_pos_S = this_pos_S;
last_length = this_length;
last_match = this_match;
}
// Advance
this_pos_Q += this_length + 1;
this_match.pos_Q += this_match.length + 1;
}
// Very special case: The sequences are identical
if (last_length >= query_length) {
model_count(&ret, C->S + last_pos_S, query, query_length);
if (last_match.length >= query_length) {
model_count_equal(&ret, query, query_length);
return ret;
}
// We might miss a few nucleotides if the last anchor was also a right
// anchor. The logic is the same as a few lines above.
if (last_was_right_anchor) {
model_count(&ret, C->S + last_pos_S, query + last_pos_Q, last_length);
} else if (last_length >= threshold * 2) {
model_count_equal(&ret, query + last_pos_Q, last_length);
model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
} else if (last_match.length >= threshold * 2) {
model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
}
return ret;
......@@ -266,7 +356,7 @@ void calculate_distances(seq_t *sequences, size_t n) {
if (BOOTSTRAP) {
int res = calculate_bootstrap(M, sequences, n);
if (res) {
warnx("Bootstrapping failed.");
soft_errx("Bootstrapping failed.");
}
}
......
check_PROGRAMS = test_esa test_seq test_fasta
dist_noinst_DATA = test_extra.sh test_random.sh test_join.sh
check_PROGRAMS = test_esa test_seq test_fasta test_process
dist_noinst_DATA = test_extra.sh test_random.sh test_join.sh nan.sh low_homo.sh
if !BUILD_WITH_LIBDIVSUFSORT
PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
......@@ -7,12 +7,19 @@ PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
DUMMY=dummy.cxx
endif
test_seq_SOURCES = test_seq.c ../src/sequence.c
test_seq_SOURCES = test_seq.c $(top_srcdir)/src/sequence.c
test_seq_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/opt -DDEBUG -std=gnu99
test_seq_CFLAGS = -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
test_seq_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a
test_esa_SOURCES = test_esa.c ../src/esa.c ../src/sequence.c $(top_srcdir)/src/esa.h
test_process_SOURCES = test_process.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/io.c $(top_srcdir)/src/model.c $(top_srcdir)/src/process.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/global.h
test_process_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/src -I$(top_srcdir)/opt -I$(top_srcdir)/libs -DDEBUG -std=gnu99
test_process_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
test_process_LDADD = $(GLIB_LIBS) $(PSUFSORT) $(top_builddir)/opt/libcompat.a $(top_builddir)/libs/libpfasta.a
test_process_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
nodist_EXTRA_test_process_SOURCES = $(DUMMY)
test_esa_SOURCES = test_esa.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/esa.h
test_esa_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -I$(top_srcdir)/src -DDEBUG -std=gnu99
test_esa_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
test_esa_LDADD = $(GLIB_LIBS) $(PSUFSORT) $(top_builddir)/opt/libcompat.a
......
#!/bin/bash -f
./test/test_fasta -l 100000 > a.fa
./test/test_fasta -l 100000 > b.fa
./test/test_fasta -l 100 > both.fa
cat both.fa a.fa | awk -vRS='>' '{if($1 == "S0")print ">"$0 > "S0.fa"}'
cat both.fa b.fa | awk -vRS='>' '{if($1 == "S1")print ">"$0 > "S1.fa"}'
# this is expected to trigger the low homology warning
./src/andi -j S0.fa S1.fa 2>&1 | grep 'homology'
EXIT_VAL=$?
if [[ EXIT_VAL -ge 1 ]]; then
echo "Triggering low homology failed" >&2
grep '^>' a.fa b.fa both.fa
fi
rm -f a.fa b.fa both.fa S0.fa S1.fa
exit $EXIT_VAL
#!/bin/bash -f
./test/test_fasta -l 10000 > a.fa
./test/test_fasta -l 10000 > b.fa
# this is expected to trigger the nan warning
./src/andi -j a.fa b.fa 2>&1 | grep 'nan'
EXIT_VAL=$?
if [[ EXIT_VAL -ge 1 ]]; then
echo "Triggering nan failed" >&2
grep '^>' a.fa b.fa both.fa
fi
rm -f a.fa b.fa
exit $EXIT_VAL
......@@ -16,11 +16,11 @@ rm p1.fasta p2.fasta p3.fasta;
RES=$(./src/andi -m RAW -t 1 -j S0.fasta S1.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
)
if test $RES -ne 1; then
echo "The last test computed a distance deviating more than one percent from its intended value."
echo "The last test computed a distance deviating more than three percent from its intended value."
echo "See S0.fasta and S1.fasta for the used sequences."
exit 1;
fi
......@@ -38,11 +38,11 @@ rm p2.fasta p3.fasta;
RES=$(./src/andi -m RAW -t1 -j S0.fasta S1.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
)
if test $RES -ne 1; then
echo "The last test computed a distance deviating more than one percent from its intended value."
echo "The last test computed a distance deviating more than three percent from its intended value."
echo "See S0.fasta and S1.fasta for the used sequences."
exit 1;
fi
......@@ -62,11 +62,11 @@ rm p1.fasta p2.fasta p3.fasta;
RES=$(./src/andi -mRAW -t 1 -j S0.fasta S1.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
)
if test $RES -ne 1; then
echo "The last test computed a distance deviating more than one percent from its intended value."
echo "The last test computed a distance deviating more than three percent from its intended value."
echo "See S0.fasta and S1.fasta for the used sequences."
exit 1;
fi
......
#include "global.h"
#include "process.h"
#include <glib.h>
#include <math.h>
int FLAGS = 0;
int THREADS = 1;
long unsigned int BOOTSTRAP = 0;
double ANCHOR_P_VALUE = 0.025;
gsl_rng *RNG = NULL;
int MODEL = M_JC;
double shustring_cum_prob(size_t x, double g, size_t l);
size_t min_anchor_length(double p, double g, size_t l);
void test_shustring_cum_prob() {
int len = 100000;
double gc = 0.5;
double p_value = 0.025;
size_t threshold = min_anchor_length(p_value, gc, len);
g_assert_cmpfloat(1 - p_value, <, shustring_cum_prob(threshold + 1, gc / 2, len));
g_assert_cmpfloat(1 - p_value, <=, shustring_cum_prob(threshold, gc / 2, len));
g_assert_cmpfloat(1 - p_value, >, shustring_cum_prob(threshold - 1, gc / 2, len));
}
int main(int argc, char *argv[]) {
g_test_init(&argc, &argv, NULL);
g_test_add_func("/process/shustring_cum_prob", test_shustring_cum_prob);
return g_test_run();
}
......@@ -35,9 +35,9 @@ do
./src/andi -t 1 |
tail -n 1 |
awk -v dist=$dist '{print $2, dist}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.02 && abs($1-$2) <= 0.02 * $2}')
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.055 && abs($1-$2) <= 0.055 * $2}')
if test $res -ne 1; then
echo "The last test computed a distance deviating more than two percent from its intended value."
echo "The last test computed a distance deviating more than five percent from its intended value."
echo "See test_random.fasta for the used sequences."
echo "./test/test_fasta -s $SEED -l $LENGTH -d $dist"
head -n 1 ./test/test_random.fasta
......@@ -57,9 +57,9 @@ do
./src/andi -m RAW -t 1 |
tail -n 1 |
awk -v dist=$dist '{print $2, dist}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.02 && abs($1-$2) <= 0.02 * $2}')
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.055 && abs($1-$2) <= 0.055 * $2}')
if test $res -ne 1; then
echo "The last test computed a distance deviating more than two percent from its intended value."
echo "The last test computed a distance deviating more than five percent from its intended value."
echo "See test_random.fasta for the used sequences."
echo "./test/test_fasta -r -s $SEED -l $LENGTH -d $dist"
head -n 1 ./test/test_random.fasta
......