Skip to content
Commits on Source (5)
[bumpversion]
current_version = 1.5.0
current_version = 1.6.0
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<stage>(dev|rc)))?(\.(?P<again>\d+))?
serialize =
{major}.{minor}.{patch}-{stage}.{again}
......@@ -18,5 +18,5 @@ values =
[bumpversion:file:configure.ac]
[bumpversion:file:doc/sina.tex]
[bumpversion:file:doc/source/install.rst]
......@@ -79,6 +79,7 @@ jobs:
make code-coverage-capture
bash <(curl -s https://codecov.io/bash) -X gcov -F unittest \
-R ~/project -n Unit-Linux-CircleCI
no_output_timeout: 20m # kmer_search_test is slow
- run:
<<: *configure
environment:
......@@ -95,6 +96,11 @@ jobs:
make code-coverage-capture
bash <(curl -s https://codecov.io/bash) -X gcov -F integration \
-R ~/project -n Integration-Linux-CircleCI
- run:
name: Collecting binaries
when: on_fail
command: |
tar -C ~/build -czvf $ARTIFACT_PATH/build_src.tgz src
- run:
name: Copying logs
when: always
......
......@@ -29,27 +29,30 @@
*.app
# Automake/Autoconf stuff
build-aux/*
m4/libtool.m4
m4/ltoptions.m4
m4/ltsugar.m4
m4/ltversion.m4
m4/lt~obsolete.m4
Makefile.in
aclocal.m4
config.h
config.h.in
config.log
config.status
configure
autom4te.cache
libtool
/build-aux/*
/m4/libtool.m4
/m4/ltoptions.m4
/m4/ltsugar.m4
/m4/ltversion.m4
/m4/lt~obsolete.m4
/Makefile.in
/aclocal.m4
/config.h
/config.h.in
/config.log
/config.status
/configure
/autom4te.cache
/libtool
.deps
.dirstamp
Makefile
/Makefile
stamp-h1
# temp files
*~
\#*\#
.\#*
\ No newline at end of file
.\#*
# build dirs
/build*/
\ No newline at end of file
......@@ -14,7 +14,6 @@ env:
global:
MINICONDA: $HOME/miniconda
BASH_ENV: $HOME/.bashrc
MAKEFLAGS: -j2
install:
- ./ci_scripts/install_conda.sh
......
......@@ -80,6 +80,8 @@ src_libsina_la_SOURCES = \
src/aligned_base.h \
src/alignment_stats.cpp \
src/alignment_stats.h \
src/buffer.h \
src/cache.h \
src/cseq.cpp \
src/cseq.h \
src/cseq_impl.h \
......@@ -94,6 +96,7 @@ src_libsina_la_SOURCES = \
src/mesh_debug.h \
src/mseq.cpp \
src/mseq.h \
src/progress.h \
src/pseq.cpp \
src/pseq.h \
src/query_arb.cpp \
......@@ -108,6 +111,7 @@ src_libsina_la_SOURCES = \
src/log.h \
src/scoring_schemes.h \
src/search.h \
src/search.cpp \
src/search_filter.cpp \
src/search_filter.h \
src/timer.h \
......@@ -180,12 +184,12 @@ man1_MANS = doc/man/sina.1
$(doc_DATA): doc/build.stamp
doc/build.stamp: $(DOC_SOURCE)
sphinx-build -M text $$(dirname $<) doc
$(SPHINX_BUILD) -M text $$(dirname $<) doc
touch $@
CLEANFILES += doc/build.stamp
$(man1_MANS): $(DOC_SOURCE)
sphinx-build -M man $$(dirname $<) doc
$(SPHINX_BUILD) -M man $$(dirname $<) doc
%: doc/text/%.txt
cp $< $@
......@@ -225,7 +229,7 @@ bindistdir: all
cp -fpR $(top_builddir)/$$f $(bindistdir)/$$f; done; \
find $(bindistdir) -depth -type d -empty -exec rmdir {} \;
binaries=$$(find $(bindistdir) -type f -exec file --mime {} \; | \
grep -E "application/x-.*(mach|executable|sharedlib)" | cut -d: -f1); \
$(GREP) -E "application/x-.*(mach|executable|sharedlib)" | cut -d: -f1); \
export LDPATHS="$(subst -L,,$(filter -L%, $(BOOST_LDFLAGS)))"; \
for binary in $$binaries; do $(top_srcdir)/tools/fix_libpaths.sh "$$binary"; done; \
find $(bindistdir) \( -name \*.la -or -name \*.a \) -delete
......@@ -252,7 +256,7 @@ test_libs = \
LOG_COMPILER = $(top_srcdir)/tools/test_driver.sh
TEST_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \
$(top_srcdir)/build-aux/tap-driver.sh
$(top_srcdir)/build-aux/tap-driver.sh --comments
check_PROGRAMS = \
src/unit_tests/cseq_test \
......@@ -260,7 +264,8 @@ check_PROGRAMS = \
src/unit_tests/idset_test \
src/unit_tests/kmer_test \
src/unit_tests/kmer_search_test \
src/unit_tests/famfinder_test
src/unit_tests/famfinder_test \
src/unit_tests/progress_test
src_unit_tests_cseq_test_LDADD = $(test_libs)
src_unit_tests_cseq_test_LDFLAGS = $(ARB_LDFLAGS)
......@@ -274,12 +279,14 @@ src_unit_tests_kmer_search_test_LDADD = $(test_libs)
src_unit_tests_kmer_search_test_LDFLAGS = $(ARB_LDFLAGS)
src_unit_tests_famfinder_test_LDADD = $(test_libs)
src_unit_tests_famfinder_test_LDFLAGS = $(ARB_LDFLAGS)
src_unit_tests_progress_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LIB)
test_scripts = \
tests/readwrite.test \
tests/align.test \
tests/search.test \
tests/infocommands.test \
tests/accuracy_kmer.test \
tests/accuracy.test
......@@ -305,7 +312,7 @@ check-filtered:
## pass ARGS to tests
TEST_ARGS := $(check_DATA) $(TEST_EXTRA_ARGS)
export TEST_ARGS
$(test_scripts): $(bin_PROGRAMS) $(check_DATA)
$(test_scripts): $(bin_PROGRAMS) $(check_DATA) tests/test_helper.sh
## unzip source test data into build folder
test_data/%: test_data/%.xz
......
SINA - reference based multiple sequence alignment
==================================================
|latest| |release| |Bioconda| |TravisCI| |CircleCI| |Read the Docs| |Codecov|
|latest| |Bioconda| |downloads| |TravisCI| |CircleCI| |Read the Docs| |Codecov|
.. |latest| image:: https://img.shields.io/github/release/epruesse/SINA/all.svg?label=latest
.. |release| image:: https://img.shields.io/github/release/epruesse/SINA.svg
......@@ -15,37 +15,65 @@ SINA - reference based multiple sequence alignment
:target: https://codecov.io/gh/epruesse/SINA
.. |Read the Docs| image:: https://img.shields.io/readthedocs/sina/latest.svg
:target: https://readthedocs.org/projects/sina/builds
.. |downloads| image:: https://img.shields.io/conda/dn/bioconda/sina.svg?style=flat
SINA is a tool to add sequences to an existing multiple sequence
alignment. It needs about 1 second on a single core to add one 16S
full length sequence (about 100k/h on a 32-core workstation). It was
developed to create the multi-million sequence alignment that is the
core of the SILVA SSU and LSU rRNA databases.
Installation
------------
SINA aligns nucleotide sequences to match a pre-existing MSA using
a graph based alignment algorithm similar to PoA. The graph approach
allows SINA to incorporate information from many reference sequences
without blurring highly variable regions. While
pure NAST implementations depend highly on finding a good match in
the reference database, SINA is able to align sequences relatively
distant to references with good quality and will yield a robust result
for query sequences with many close references.
- Use the `online <https://www.arb-silva.de/aligner>`_ version hosted
by the SILVA project to align small batches of LSU and SSU
sequences to their databases.
- The preferred way to install SINA locally is via Bioconda
(`full instructions <https://sina.readthedocs.io/en/latest/install.html>`_)::
Features
--------
- Speed. Aligning 100,000 full length rRNA against the SILVA NR takes 40 minutes on a mid-sized 2018 desktop computer. Aligning 1,000,000 V4 amplicons takes about 60 minutes.
- Accuracy. SINA is used to build the SILVA_ SSU and LSU rRNA databases.
- Classification. SINA includes an LCA based classification module.
- ARB. SINA is able to directly read and write ARB_ format files such as distributed by the SILVA_ project.
.. _SILVA: https://www.arb-silva.de
.. _ARB: https://www.arb-home.de
Online Version
--------------
An online version for submitting small batches of sequences is made
available by the SILVA_ project as part of their
`ACT: Alignment, Classification and Tree Service <https://www.arb-silva.de/aligner>`_.
In addition to SINA's alignment and classification stages, ACT allows directly building
phylogenetic trees with RAxML or FastTree from your sequences and (optionally)
additional sequences chosen using SINA's add-neighbors feature.
Installing SINA
---------------
The preferred way to install SINA locally is via `Bioconda <https://bioconda.github.io>`_.
If you have a working Bioconda installation, just run::
conda create -n sina sina
conda activate sina
Alternatively, self-contained images are available at
https://github.com/epruesse/SINA/releases. Choose the most recent ``tar.gz``
appropriate for your operating system and unpack::
tar xf sina-1.6.0-linux.tar.gz
cd sina-1.6.0
./sina
Documentation
-------------
- The `SINA manual <https://sina.readthedocs.io>`_ is hosted at readthedocs.
- Please refer to the `publication in
Bioinformatics <https://doi.org/10.1093/bioinformatics/bts252>`_ for
a description of the algorithm.
The full documentation is available at https://sina.readthedocs.io.
If you use SINA in your research, please don't forget to cite us:
The algorithm is explained in the paper:
Elmar Pruesse, Jörg Peplies, Frank Oliver Glöckner; *SINA: Accurate
high-throughput multiple sequence alignment of ribosomal RNA
genes.* Bioinformatics 2012; 28 (14): 1823-1829.
doi:10.1093/bioinformatics/bts252
`doi:10.1093/bioinformatics/bts252 <https://doi.org/10.1093/bioinformatics/bts252>`_
# Init
AC_INIT([SINA],[1.5.0],[elmar@pruesse.net],,[http://github.com/epruesse/SINA])
AC_INIT([SINA],[1.6.0],[elmar@pruesse.net],,[http://github.com/epruesse/SINA])
AC_COPYRIGHT([Copyright (c) 2005-2018 Elmar Pruesse])
AC_CONFIG_AUX_DIR([build-aux])
AC_CONFIG_MACRO_DIR([m4])
......@@ -28,6 +28,12 @@ AC_PROG_SED
AC_PROG_LN_S
AC_PROG_INSTALL
AC_PROG_MKDIR_P
AC_CHECK_TOOL([BC], [bc], [:])
AC_CHECK_TOOL([TEE], [tee], [:])
AC_CHECK_TOOL([MKTEMP], [mktemp], [:])
AC_CHECK_TOOL([SPHINX_BUILD], [sphinx-build], [:])
AC_PROG_GREP
LT_INIT([shared static])
AC_REQUIRE_AUX_FILE([tap-driver.sh])
......@@ -125,7 +131,7 @@ AX_ARG_ENABLE([debug], [disable optimizations and add debug symbols],
AC_DEFINE([DEBUG])
])
CXXFLAGS="$(echo $CXXFLAGS | sed 's/-O@<:@@<:@:alnum:@:>@@:>@*//g')"
CXXFLAGS="$CXXFLAGS -O0 -ggdb3"
CXXFLAGS="$CXXFLAGS -Og -ggdb3"
AX_CHECK_COMPILE_FLAG([-fno-omit-frame-pointer],[
CXXFLAGS="$CXXFLAGS -fno-omit-frame-pointer"
])
......@@ -191,6 +197,7 @@ if test x"$ARB_HELIX_LIBS" = x""; then
fi
AX_ARB_CHECK_FUNC([GBT_FIND_SEQUENCE], [GBT_find_sequence(NULL, NULL)])
AX_ARB_STATUS_RETURN_TYPE
case "$host_os" in
darwin*)
......@@ -268,6 +275,7 @@ CXXFLAGS="$CXXFLAGS -W"
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([
Makefile
tests/test_helper.sh
])
......@@ -304,7 +312,8 @@ AC_MSG_NOTICE([ CXXFLAGS: $CODE_COVERAGE_CXXFLAGS])
AC_MSG_NOTICE([ LIBS: $CODE_COVERAGE_LIBS])
AC_MSG_NOTICE([])
])
AC_MSG_NOTICE([ ARB: ${ARBHOME}])
AC_MSG_NOTICE([ ARB:])
AC_MSG_NOTICE([ ARBHOME: ${ARBHOME}])
AC_MSG_NOTICE([ CPPFLAGS: ${ARB_CPPFLAGS}])
AC_MSG_NOTICE([ LDFLAGS: ${ARB_LDFLAGS}])
AC_MSG_NOTICE([ LIBS: ${ARB_LIBS}])
......
sina (1.5.0+dfsg-1) UNRELEASED; urgency=medium
sina (1.6.0+dfsg-1) UNRELEASED; urgency=medium
* Initial release (Closes: #<bug>)
TODO: Needs libarb which is non-free see
https://github.com/epruesse/SINA/issues/68
TODO: Needs libarb
https://salsa.debian.org/med-team/libarb
-- Andreas Tille <tille@debian.org> Fri, 29 Mar 2019 22:30:54 +0100
-- Andreas Tille <tille@debian.org> Thu, 13 Jun 2019 08:25:26 +0200
Source: sina
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Andreas Tille <tille@debian.org>
Uploaders: Andreas Tille <tille@debian.org>,
Liubov Chuprikova <chuprikovalv@gmail.com>
Section: science
Priority: optional
Build-Depends: debhelper (>= 12~),
......
......@@ -4,15 +4,6 @@
export LC_ALL=C.UTF-8
include /usr/share/dpkg/default.mk
# this provides:
# DEB_SOURCE: the source package name
# DEB_VERSION: the full version of the package (epoch + upstream vers. + revision)
# DEB_VERSION_EPOCH_UPSTREAM: the package's version without the Debian revision
# DEB_VERSION_UPSTREAM_REVISION: the package's version without the Debian epoch
# DEB_VERSION_UPSTREAM: the package's upstream version
# DEB_DISTRIBUTION: the distribution(s) listed in the current entry of debian/changelog
# SOURCE_DATE_EPOCH: the source release date as seconds since the epoch, as
# specified by <https://reproducible-builds.org/specs/source-date-epoch/>
# for hardening you might like to uncomment this:
# export DEB_BUILD_MAINT_OPTIONS=hardening=+all
......@@ -20,12 +11,11 @@ include /usr/share/dpkg/default.mk
%:
dh $@
override_dh_auto_configure:
dh_auto_configure -- --with-arbhome=/usr/lib/x86_64-linux-gnu/arb
### When overriding auto_test make sure DEB_BUILD_OPTIONS will be respected
#override_dh_auto_test:
#ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
# do_stuff_for_testing
#endif
### If you **really** can not use uscan (even not with mode=git) use a debian/get-orig-script
#get-orig-source:
# . debian/get-orig-source
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
doxygen
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
......@@ -3,6 +3,31 @@
Changelog
=========
Version 1.6.0:
--------------
- make internal kmer engine the default (:issue:`23`)
- add pretty progress monitor
- run search stage in parallel (:issue:`32`)
- :option:`--num-pts` defaults to number of cores available
(previous: 1)
- add :option:`--search-engine` setting search engine for search
module
- always run internal engine without thread limit
- split num pt servers evenly between search and align
- use fixed point format for logging (instead of scientific format)
- rewrote family selection (use :option:`--fs-oldmatch` for old
implementation)
- replace boost::mutex with std::mutex (c++11)
- fix :option:`--show-dist` if alignment widths don't match
- fix race starting pt servers (library code not threadsafe)
- fix engine type not shown in :option:`--show-conf`
- fix writing to ARB sequence cache not threadsafe
- use lock free map for ARB sequence cache (speedup)
- add pod buffer to replace std::vector (speedup)
- add FIFO cache for kmer search results (speedup for
:option:`--search` and :option:`--turn`)
Version 1.5.0:
--------------
- update documentation (:issue:`20`)
......@@ -26,7 +51,8 @@ Version 1.5.0:
- fix out-of-bounds access on iterator in NAST implementation
- remove dependency on boost serialization library
- build release binaries with GCC 7 and C++11 ABI
- add integration tests watching for accuracy regressions (:issue:`25`)
- add integration tests watching for accuracy regressions
(:issue:`25`)
Version 1.4.0:
......
......@@ -65,7 +65,7 @@ General Options
``sina -i reference.fasta --prealigned -o reference.arb``
.. option:: -t [all], --turn[=all]
.. option:: -t [all], --turn [=all]
Enables turn check stage: Sequences not oriented in accordance with
the reference database will be reverse complemented as needed.
......@@ -267,6 +267,11 @@ at least the fraction :option:`--lca-quorum` of the search result.
alignment reference, but wish to search a larger set of sequences
for classification purposes.
.. option:: --search-engine=[internal|pt-server]
Override the value of :option:`--fs-engine` for use within the
search module.
.. option:: --search-min-sim=id (0.7)
The minimum fractional identity each result sequence must have with
......@@ -721,6 +726,11 @@ Advanced Reference Selection Options
selection. See :option:`--lca-quorum` for information on how the
value is interpreted.
.. option:: --fs-oldmatch
Use the pre-1.6.0 implementation for composing the alignment
family. Requires :option:`--fs-engine` = ``pt-server``.
Search & Classify Options
-------------------------
......
Reference based multiple sequence alignment using SINA
======================================================
Reference based multiple sequence alignment
===========================================
SINA allows incorporating additional sequences into an existing
multiple sequence alignment (MSA) without modifying the original
......@@ -24,8 +24,7 @@ format reference databases released by the project `here
An `online version of SINA <https://www.arb-silva.de/aligner/>`_ is provided
by the SILVA_ project.
Publication
~~~~~~~~~~~
.. rubric:: Publication
If you use SINA in your work, please cite:
......
......@@ -5,9 +5,9 @@ Installing SINA
You can install SINA
- `using Bioconda`_ (simple, recommended)
- from `pre-compiled tarballs`_ (alternative if conda is misbehaving for you)
- or build SINA `from source`_ (for developers)
1. `using Bioconda`_ (recommended)
2. from `pre-compiled tarballs`_ (alternate)
3. or build SINA `from source`_ (for developers)
.. _`using Bioconda`:
......@@ -25,11 +25,12 @@ To install, follow these steps:
Download the Miniconda_ installer (links for MacOS_ and Linux_),
execute it and follow the instructions it shows in the shell::
if test $(uname) == Linux; then
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
else
wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
fi
# if you are on MacOS
wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
# if you are on Linux
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
# then
sh Miniconda3-latest-*-x86_64.sh
2. Add the Conda-Forge_ and Bioconda_ channels::
......@@ -70,16 +71,16 @@ are available on the `SINA releases`_ page at Github. Head on over
there and download the Linux or Macos one. Inside the folder created
by unpacking the archive, you should find a `sina` executable::
tar xf ~/Downloads/sina-1.4.0-linux.tar.gz
~/Downloads/sina-1.4.0-linux/sina --help
tar xf ~/Downloads/sina-1.6.0-linux.tar.gz
~/Downloads/sina-1.6.0-linux/sina --help
To install SINA system wide, place the contents of the archive in
`/opt` and create symlinks into `/usr/local/bin`::
wget https://github.com/epruesse/SINA/releases/download/v1.4.0/sina-1.4.0-linux.tar.gz
sudo tar xf sina-1.4.0-linux.tar.gz -C /opt
rm sina-1.4.0-linux.tar.gz
sudo ln -s /opt/sina-1.4.0-linux /opt/sina
wget https://github.com/epruesse/SINA/releases/download/v1.6.0/sina-1.6.0-linux.tar.gz
sudo tar xf sina-1.6.0-linux.tar.gz -C /opt
rm sina-1.6.0-linux.tar.gz
sudo ln -s /opt/sina-1.6.0-linux /opt/sina
sudo ln -s /opt/sina/bin/sina /usr/local/bin/sina
.. _`SINA releases`: https://github.com/epruesse/SINA/releases
......
AC_DEFUN([AX_LIB_ARBDB],
[
AH_TEMPLATE([HAVE_ARB], [], [Defined to 1 if ARB libraries are present])
AH_TEMPLATE([HAVE_ARB], [Defined to 1 if ARB libraries are present])
# Check for dynamic libraries
AC_ARG_WITH([arbhome],
......@@ -47,7 +47,7 @@ AC_DEFUN([AX_LIB_ARBDB],
success="no"
fi
fi
if test x"ax_arb_path" != x""; then
if test x"$ax_arb_path" != x""; then
saved_CPPFLAGS="$CPPFLAGS"
CPPFLAGS="$CPPFLAGS -I$ax_arb_path/INCLUDE $GLIB_CFLAGS"
AC_CHECK_HEADER([arbdb.h], [
......@@ -247,7 +247,7 @@ AC_DEFUN([AX_LIB_ARB_HELIX],
[
AC_REQUIRE([AX_LIB_ARBDB])
AH_TEMPLATE([HAVE_ARB_HELIX], [], [Defined to 1 if ARB SL HELIX is present])
AH_TEMPLATE([HAVE_ARB_HELIX], [Defined to 1 if ARB SL HELIX is present])
AC_MSG_CHECKING([for ARB HELIX static lib])
saved_CPPFLAGS="$CPPFLAGS"
......@@ -281,3 +281,56 @@ AC_DEFUN([AX_LIB_ARB_HELIX],
LDFLAGS="$saved_LDFLAGS"
])
dnl AX_ARB_STATUS_RETURN_TYPE
dnl -------------------------
dnl Probes the return type of the set_title member of ARB's
dnl arb_status_implementation by trying to link an assignment of that member
dnl to a function pointer returning bool, then one returning void.  The first
dnl candidate that links is used.
dnl
dnl Defines (config.h) and substitutes:
dnl   ARB_STATUS_RETURN_TYPE   bool or void; empty if neither candidate linked
dnl   ARB_STATUS_RETURN_VALUE  true when the type is bool, empty otherwise
dnl
dnl CPPFLAGS, LDFLAGS and LIBS are extended with the ARB settings for the
dnl duration of the probe and restored afterwards.
AC_DEFUN([AX_ARB_STATUS_RETURN_TYPE],
[
  AH_TEMPLATE([ARB_STATUS_RETURN_TYPE], [void or bool depending on version...])
  AH_TEMPLATE([ARB_STATUS_RETURN_VALUE], ['' or true depending on RETURN_TYPE])
  AC_REQUIRE([AX_LIB_ARBDB])

  saved_CPPFLAGS="$CPPFLAGS"
  saved_LDFLAGS="$LDFLAGS"
  saved_LIBS="$LIBS"
  CPPFLAGS="$CPPFLAGS $ARB_CPPFLAGS"
  LDFLAGS="$LDFLAGS $ARB_LDFLAGS"
  LIBS="$LIBS $ARBHOME/SL/HELIX/HELIX.a $ARB_LIBS"
  AC_LANG_PUSH([C++])

  AC_CHECK_HEADER([arb_handlers.h])

  AC_MSG_CHECKING([ARB status return type])
  # Sentinel value: stays in place only if no candidate signature links.
  ARB_STATUS_RETURN_TYPE=error
  for sig in bool void; do
    AC_LINK_IFELSE([
      AC_LANG_PROGRAM([[
#include <arb_handlers.h>
      ]], [[
        arb_status_implementation status;
        $sig (*x)(const char*) = status.set_title;
      ]])
    ],[
      ARB_STATUS_RETURN_TYPE=$sig
      break
    ])
  done
  AC_MSG_RESULT([$ARB_STATUS_RETURN_TYPE])

  # Use = rather than == below: only = is portable across /bin/sh variants.
  if test x"$ARB_STATUS_RETURN_TYPE" = x"error"; then
    ARB_STATUS_RETURN_TYPE=
  fi
  AC_DEFINE_UNQUOTED([ARB_STATUS_RETURN_TYPE], [$ARB_STATUS_RETURN_TYPE])
  AC_SUBST([ARB_STATUS_RETURN_TYPE])

  if test x"$ARB_STATUS_RETURN_TYPE" = x"bool"; then
    ARB_STATUS_RETURN_VALUE=true
  else
    # Explicitly clear the value so nothing inherited from the environment
    # ends up in config.h for the non-bool case.
    ARB_STATUS_RETURN_VALUE=
  fi
  AC_DEFINE_UNQUOTED([ARB_STATUS_RETURN_VALUE], [$ARB_STATUS_RETURN_VALUE])
  AC_SUBST([ARB_STATUS_RETURN_VALUE])

  AC_LANG_POP([C++])
  LIBS="$saved_LIBS"
  CPPFLAGS="$saved_CPPFLAGS"
  LDFLAGS="$saved_LDFLAGS"
])
......@@ -94,10 +94,10 @@ auto logger = sina::Log::create_logger("align");
namespace sina {
template<typename SCORING_SCHEME, typename MASTER>
void choose_transition(cseq& c, cseq& orig, MASTER& m, SCORING_SCHEME& s, ostream& log);
void choose_transition(cseq& c, const cseq& orig, MASTER& m, SCORING_SCHEME& s, ostream& log);
template<typename transition, typename MASTER>
void do_align(cseq& c, cseq& orig, MASTER& m, transition& tr, ostream& log);
void do_align(cseq& c, const cseq& orig, MASTER& m, transition& tr, ostream& log);
struct aligner::options {
bool realign;
......@@ -297,23 +297,6 @@ make_datetime() {
return string(buf);
}
struct not_icontains {
using result_type = bool;
const string bases;
explicit not_icontains(string _bases) : bases(std::move(_bases)) {}
bool operator()(const cseq& c) {
return !boost::algorithm::icontains(c.getBases(), bases);
}
};
struct iequals_cmp {
using result_type = bool;
const string bases;
explicit iequals_cmp(string _bases) : bases(std::move(_bases)) {}
bool operator()(const cseq& c) {
return iequals(bases, c.getBases());
}
};
aligner::aligner() = default;
aligner::~aligner() = default;
......@@ -324,65 +307,71 @@ aligner& aligner::operator=(const aligner& /*a*/) = default;
tray
aligner::operator()(tray t) {
// skip if requirements missing
// FIXME: add logging here
if ((t.input_sequence == nullptr) ||
(t.alignment_reference == nullptr) ||
(t.astats == nullptr) ) {
logger->error("Internal error - incomplete data for alignment");
return t;
}
// prepare variables
cseq &c = *(new cseq(*t.input_sequence)); // working copy
vector<cseq> &vc = *t.alignment_reference;
string bases = c.getBases(); // unaligned sequence
search::result_vector &vc = *t.alignment_reference;
const string bases = c.getBases(); // unaligned sequence
if (opts->lowercase != LOWERCASE_ORIGINAL) {
c.upperCaseAll();
}
// sort reference sequences containing candidate to end of family
auto it = partition(vc.begin(), vc.end(), not_icontains(bases));
auto not_contains_query = [&](search::result_item& item) {
// FIXME: we can do this w/o converting to string
return !boost::algorithm::icontains(item.sequence->getBases(), bases);
};
auto begin_containing = partition(vc.begin(), vc.end(), not_contains_query);
// if there are such sequences...
if (it != vc.end()) {
if (opts->realign) { // ...either realign (throw them out)
if (begin_containing != vc.end()) {
if (opts->realign) { // realign means ignore those sequences
// FIXME: this should be done in famfinder to re-fill family
t.log << "sequences ";
for (auto it2 = it; it2 != vc.end(); ++it2) {
t.log << it->get_attr<string>(query_arb::fn_acc) << " ";
for (auto it = begin_containing; it != vc.end(); ++it) {
t.log << it->sequence->get_attr<string>(query_arb::fn_acc) << " ";
}
t.log << "containing exact candidate removed from family;";
vc.erase(it, vc.end());
if (it == vc.begin()) {
vc.erase(begin_containing, vc.end());
if (vc.empty()) {
t.log << "that's ALL of them. skipping sequence;";
return t;
}
} else { // ...or steal their alignment
auto exact_match = find_if(it, vc.end(), iequals_cmp(bases));
} else { // otherwise, we steal the alignment
auto same_as_query = [&](search::result_item& item) {
return iequals(bases, item.sequence->getBases());
};
auto exact_match = find_if(begin_containing, vc.end(), same_as_query);
if (exact_match != vc.end()) {
c.setAlignedBases(exact_match->getAlignedBases());
c.setAlignedBases(exact_match->sequence->getAlignedBases());
t.log << "copied alignment from identical template sequence "
<< exact_match->get_attr<string>(query_arb::fn_acc) << ":"
<< exact_match->get_attr<string>(query_arb::fn_start)
<< exact_match->sequence->get_attr<string>(query_arb::fn_acc) << ":"
<< exact_match->sequence->get_attr<string>(query_arb::fn_start, "0")
<< "; ";
} else {
vector<aligned_base> subalignment, refalignment;
string refsequence = it->getBases();
boost::iterator_range<string::iterator> substr;
refalignment = it->getAlignedBases();
substr = boost::ifind_first(refsequence,bases);
vector<aligned_base> subalignment;
const vector<aligned_base>& refalignment = begin_containing->sequence->getAlignedBases();
string refsequence = begin_containing->sequence->getBases();
boost::iterator_range<string::iterator> substr = boost::ifind_first(refsequence, bases);
subalignment.reserve(substr.size());
std::copy( refalignment.begin() + std::distance(refsequence.begin(), substr.begin()),
refalignment.begin() + std::distance(refsequence.begin(), substr.end()),
std::back_inserter(subalignment) );
std::copy(refalignment.begin() + std::distance(refsequence.begin(), substr.begin()),
refalignment.begin() + std::distance(refsequence.begin(), substr.end()),
std::back_inserter(subalignment));
c.setAlignedBases(subalignment);
t.log << "copied alignment from (longer) template sequence "
<< it->get_attr<string>(query_arb::fn_acc) << ":"
<< it->get_attr<string>(query_arb::fn_start)
<< begin_containing->sequence->get_attr<string>(query_arb::fn_acc) << ":"
<< begin_containing->sequence->get_attr<string>(query_arb::fn_start, "0")
<< "; ";
BOOST_ASSERT(bases == c.getBases());
}
c.setWidth(it->getWidth());
}
c.setWidth(begin_containing->sequence->getWidth());
c.set_attr(query_arb::fn_date, make_datetime());
c.set_attr(query_arb::fn_qual, 100);
if (opts->calc_idty) {
......@@ -390,15 +379,21 @@ aligner::operator()(tray t) {
}
c.set_attr(query_arb::fn_head, 0);
c.set_attr(query_arb::fn_tail, 0);
c.set_attr("align_filter_slv", "");
c.set_attr(query_arb::fn_filter, "");
t.aligned_sequence = &c;
return t;
}
}
std::vector<const cseq*> vcp;
vcp.reserve(vc.size());
for (auto& r : vc) {
vcp.push_back(r.sequence);
}
if (!opts->fs_no_graph) {
// prepare reference
mseq m(vc.begin(), vc.end(), opts->fs_weight);
mseq m(vcp.begin(), vcp.end(), opts->fs_weight);
// (remove duplicate edges:)
m.sort();
m.reduce_edges();
......@@ -416,11 +411,11 @@ aligner::operator()(tray t) {
choose_transition(c, *t.input_sequence, m, s, t.log);
}
} else {
vector<float> weights(vc.begin()->getWidth(), 1.f);
vector<float> weights(vc.begin()->sequence->getWidth(), 1.f);
if (t.astats->getWidth() == 0) { // FIXME: this looks broken
weights = t.astats->getWeights();
}
float dist = vc.begin()->getScore();
float dist = vc.begin()->score;
t.log << "using dist: " << dist << endl;
scoring_scheme_matrix<aligned_base::matrix_type>
s(opts->gap_penalty, opts->gap_ext_penalty, weights,
......@@ -428,7 +423,7 @@ aligner::operator()(tray t) {
choose_transition(c, *t.input_sequence, m, s, t.log);
}
} else {
pseq p(vc.begin(), vc.end());
pseq p(vcp.begin(), vcp.end());
scoring_scheme_profile s(-opts->match_score, -opts->mismatch_score,
opts->gap_penalty, opts->gap_ext_penalty);
choose_transition(c, *t.input_sequence, p, s, t.log);
......@@ -436,8 +431,8 @@ aligner::operator()(tray t) {
if (opts->write_used_rels) {
stringstream tmp;
for (const cseq &s: vc) {
tmp << s.getName() << " ";
for (auto &s: vc) {
tmp << s.sequence->getName() << " ";
}
c.set_attr(query_arb::fn_used_rels, tmp.str());
}
......@@ -448,14 +443,14 @@ aligner::operator()(tray t) {
CMP_COVER_OVERLAP,
false);
float idty = 0;
for (const cseq &s: vc) {
idty = std::max(idty, calc_id(c, s));
for (auto &s: vc) {
idty = std::max(idty, calc_id(c, *s.sequence));
}
c.set_attr(query_arb::fn_idty, 100.f*idty);
}
c.set_attr(query_arb::fn_date, make_datetime());
c.set_attr("align_filter_slv", t.astats->getName());
c.set_attr(query_arb::fn_filter, t.astats->getName());
t.aligned_sequence = &c;
return t;
......@@ -463,7 +458,7 @@ aligner::operator()(tray t) {
template<typename SCORING_SCHEME, typename MASTER>
void
sina::choose_transition(cseq& c, cseq& orig, MASTER& m,
sina::choose_transition(cseq& c, const cseq& orig, MASTER& m,
SCORING_SCHEME& s, ostream& log) {
if (aligner::opts->insertion == INSERTION_FORBID) {
transition_aspace_aware<SCORING_SCHEME, MASTER, cseq> tr(s);
......@@ -476,9 +471,8 @@ sina::choose_transition(cseq& c, cseq& orig, MASTER& m,
template<typename transition, typename MASTER>
void
sina::do_align(cseq& c, cseq& orig, MASTER &m,
sina::do_align(cseq& c, const cseq& orig, MASTER &m,
transition &tr, ostream& log) {
using cnsts_type = compute_node_simple<transition>;
using data_type = typename cnsts_type::data_type;
cnsts_type cns(tr);
......@@ -489,9 +483,6 @@ sina::do_align(cseq& c, cseq& orig, MASTER &m,
mesh_t A(m, c);
int oh_head, oh_tail;
#ifdef DEBUG
log << "refsize: " << m.size() << "; ";
#endif
// compute values of mesh nodes
compute(A, cns);
......@@ -500,15 +491,15 @@ sina::do_align(cseq& c, cseq& orig, MASTER &m,
c.clearSequence();
// run backtracking on the mesh
backtrack(A, c, tr,
aligner::opts->overhang,
aligner::opts->lowercase,
aligner::opts->insertion,
oh_head, oh_tail, log);
float score = backtrack(A, c, tr,
aligner::opts->overhang,
aligner::opts->lowercase,
aligner::opts->insertion,
oh_head, oh_tail, log);
// alignment done :-)
c.set_attr(query_arb::fn_head, oh_head);
c.set_attr(query_arb::fn_tail, oh_tail);
c.set_attr(query_arb::fn_qual, (int)std::min(100.f, std::max(0.f, 100.f * c.getScore())));
c.set_attr(query_arb::fn_qual, (int)std::min(100.f, std::max(0.f, 100.f * score)));
if (aligner::opts->debug_graph) {
ofstream out(fmt::format("mseq_{}.dot", c.getName()));
......
......@@ -62,7 +62,7 @@ public:
class bad_character_exception : public std::exception {
public:
bad_character_exception(value_type c) noexcept
explicit bad_character_exception(value_type c) noexcept
: character(c)
{
}
......@@ -102,9 +102,7 @@ public:
}
/* construct from base_type */
base_iupac(base_types b) {
_data = 1 << b;
}
base_iupac(base_types b) : _data(1<<b) {}
/* explicit cast to base_type */
base_types getBaseType() const {
......@@ -217,7 +215,7 @@ public:
}
protected:
int count_bits(unsigned char c) const {
static int count_bits(unsigned char c) {
#define HAVE_BUILTIN_POPCOUNT
#ifdef HAVE_BUILTIN_POPCOUNT
return __builtin_popcount(c);
......