Skip to content
Commits on Source (5)
......@@ -52,6 +52,6 @@ build/
.idea
Makefile
diamond
src/extra/benchmark.cpp
src/extra/test.cpp
src/extra/seed_stat.cpp
\ No newline at end of file
src/extra/
.vs/
CMakeSettings.json
matrix:
include:
- language: c++
compiler: g++
script: mkdir build; cd build; cmake ..; make
- language: c++
compiler: clang
script: mkdir build; cd build; cmake ..; make
......@@ -10,6 +10,8 @@ if(BUILD_STATIC)
set(CMAKE_EXE_LINKER_FLAGS "-static")
endif()
set(CMAKE_CXX_STANDARD 11)
if(CMAKE_BUILD_MARCH)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${CMAKE_BUILD_MARCH}")
else()
......@@ -24,7 +26,11 @@ find_package(ZLIB REQUIRED)
find_package(Threads REQUIRED)
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-uninitialized -Wno-deprecated-declarations -Wno-ignored-attributes -Wno-unused-variable")
if(WIN32)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-uninitialized -Wno-ignored-attributes -Wno-unused-variable")
endif()
include_directories(
"${CMAKE_SOURCE_DIR}/src"
......@@ -113,6 +119,10 @@ add_executable(diamond src/run/main.cpp
src/util/memory/memory_pool.cpp
src/data/seed_array.cpp
src/output/paf_format.cpp
src/util/system/system.cpp
src/run/cluster.cpp
src/util/algo/greedy_vortex_cover.cpp
src/util/algo/greedy_vortex_cover_weighted.cpp
)
if(EXTRA)
......
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
......@@ -24,34 +26,44 @@ them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.
A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
......@@ -60,7 +72,7 @@ modification follow.
0. Definitions.
"This License" refers to version 3 of the GNU Affero General Public License.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
......@@ -537,45 +549,35 @@ to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Remote Network Interaction; Use with the GNU General Public License.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
......@@ -617,3 +619,56 @@ Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.
Introduction
============
DIAMOND is a sequence aligner for protein and translated DNA searches,
designed for high performance analysis of big sequence data. The key
features are:
- Pairwise alignment of proteins and translated DNA at 500x-20,000x
speed of BLAST.
- Frameshift alignments for long read analysis.
- Low resource requirements and suitable for running on standard
desktops or laptops.
- Various output formats, including BLAST pairwise, tabular and XML,
as well as taxonomic classification.
Keep posted about new developments by following me on
[Twitter](https://twitter.com/bbuchfink).
[![Build Status](https://travis-ci.org/bbuchfink/diamond.svg?branch=master)](https://travis-ci.org/bbuchfink/diamond)
[![image](https://anaconda.org/bioconda/diamond/badges/version.svg)](https://anaconda.org/bioconda/diamond)
![image](https://anaconda.org/bioconda/diamond/badges/platforms.svg)
[![image](https://anaconda.org/bioconda/diamond/badges/downloads.svg)](https://anaconda.org/bioconda/diamond)
Quick start guide
=================
Please read the
[manual](https://github.com/bbuchfink/diamond/raw/master/diamond_manual.pdf)
for detailed installation and usage instructions. This demonstrates a
quick example for setting up and using the program on Linux.
Installing the software on your system may be done by downloading it in
binary format for immediate use:
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.23/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted `diamond` binary file should be moved to a directory
contained in your executable search path (PATH environment variable).
To now run an alignment task, we assume to have a protein database file
in FASTA format named `nr.faa` and a file of DNA reads that we want to
align named `reads.fna`.
In order to set up a reference database for DIAMOND, the `makedb`
command needs to be executed with the following command line:
$ diamond makedb --in nr.faa -d nr
This will create a binary DIAMOND database file with the specified name
(`nr.dmnd`). The alignment task may then be initiated using the `blastx`
command like this:
$ diamond blastx -d nr -q reads.fna -o matches.m8
The output file here is specified with the `–o` option and named
`matches.m8`. By default, it is generated in BLAST tabular format.
**Note**:
- The program may use quite a lot of memory and also temporary
disk space. Should the program fail due to running out of either
one, you need to set a lower value for the block size parameter
`-b` (see the [manual](https://github.com/bbuchfink/diamond/raw/master/diamond_manual.pdf)).
- The default (fast) mode was mainly designed for short reads. For
longer sequences, the sensitive modes (options `--sensitive` or
`--more-sensitive`) are recommended.
- The runtime of the program is not linear in the size of the
query file and it is much more efficient for large query files
(\> 1 million sequences) than for smaller ones.
- Low complexity masking is applied to the query and reference
sequences by default. Masked residues appear in the output as X.
- The default e-value cutoff of DIAMOND is 0.001 while that of
BLAST is 10, so by default the program will search a lot more
stringently than BLAST and not report weak hits.
About
=====
DIAMOND is developed by Benjamin Buchfink at the Detlef Weigel lab, Max
Planck Institute for Developmental Biology, Tübingen, Germany.
\[[Email](mailto:buchfink@gmail.com)\]
\[[Twitter](https://twitter.com/bbuchfink)\] \[[Google
Scholar](https://scholar.google.de/citations?user=kjPIF1cAAAAJ)\]
\[[MPI-EBIO](http://eb.tuebingen.mpg.de/)\]
Publication:
- Buchfink B, Xie C, Huson DH, \"Fast and sensitive protein alignment
using DIAMOND\", *Nature Methods* **12**, 59-60 (2015).
[doi:10.1038/nmeth.3176](https://doi.org/10.1038/nmeth.3176)
Introduction
============
DIAMOND is a sequence aligner for protein and translated DNA searches, designed for high performance analysis of big sequence data. The key features are:
- Pairwise alignment of proteins and translated DNA at 500x-20,000x speed of BLAST.
- Frameshift alignments for long read analysis.
- Low resource requirements and suitable for running on standard desktops or laptops.
- Various output formats, including BLAST pairwise, tabular and XML, as well as taxonomic classification.
Keep posted about new developments by following me on Twitter.
.. image:: https://image.ibb.co/gAmVKR/twitter1.png
:target: https://twitter.com/bbuchfink
.. image:: https://badges.gitter.im/diamond-aligner/Lobby.svg
:alt: Join the chat at https://gitter.im/diamond-aligner/Lobby
:target: https://gitter.im/diamond-aligner/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. image:: https://anaconda.org/bioconda/diamond/badges/downloads.svg
:target: https://anaconda.org/bioconda/diamond
.. image:: https://img.shields.io/badge/Google%20Scholar-546-blue.svg
:target: https://scholar.google.de/citations?user=kjPIF1cAAAAJ
Quick start guide
=================
Please read the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond_manual.pdf>`_ for detailed installation and usage instructions. This demonstrates a quick example for setting up and using the program on Linux.
Installing the software on your system may be done by downloading it in binary format for immediate use::
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.22/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted ``diamond`` binary file should be moved to a directory contained in your executable search path (PATH environment variable).
To now run an alignment task, we assume to have a protein database file in FASTA format named ``nr.faa`` and a file of DNA reads that we want to align named ``reads.fna``.
In order to set up a reference database for DIAMOND, the ``makedb`` command needs to be executed with the following command line::
$ diamond makedb --in nr.faa -d nr
This will create a binary DIAMOND database file with the specified name (``nr.dmnd``). The alignment task may then be initiated using the ``blastx`` command like this::
$ diamond blastx -d nr -q reads.fna -o matches.m8
The output file here is specified with the ``–o`` option and named ``matches.m8``. By default, it is generated in BLAST tabular format.
*Note*:
- The program may use quite a lot of memory and also temporary disk space. Should the program fail due to running out of either one, you need to set a lower value for the block size parameter ``-b`` (see the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond_manual.pdf>`_).
- The default (fast) mode was mainly designed for short reads. For longer sequences, the sensitive modes (options ``--sensitive`` or ``--more-sensitive``) are recommended.
- The runtime of the program is not linear in the size of the query file and it is much more efficient for large query files (> 1 million sequences) than for smaller ones.
- Low complexity masking is applied to the query and reference sequences by default. Masked residues appear in the output as X.
- The default e-value cutoff of DIAMOND is 0.001 while that of BLAST is 10, so by default the program will search a lot more stringently than BLAST and not report weak hits.
About
=====
DIAMOND is developed by Benjamin Buchfink. Feel free to contact me for support (`Email <mailto:buchfink@gmail.com>`_ `Twitter <http://twitter.com/bbuchfink>`_).
If you use DIAMOND in published research, please cite B. Buchfink, Xie C., D. Huson, "Fast and sensitive protein alignment using DIAMOND", Nature Methods 12, 59-60 (2015).
......@@ -76,4 +76,8 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
src/util/memory/memory_pool.cpp \
src/data/seed_array.cpp \
src/output/paf_format.cpp \
src/util/system/system.cpp \
src/run/cluster.cpp \
src/util/algo/greedy_vortex_cover.cpp \
src/util/algo/greedy_vortex_cover_weighted.cpp \
-lz -lpthread -o diamond
diamond-aligner (0.9.23+dfsg-1) unstable; urgency=medium
* New upstream version
* Standards-Version: 4.2.1
-- Andreas Tille <tille@debian.org> Thu, 13 Dec 2018 13:48:51 +0100
diamond-aligner (0.9.22+dfsg-2) unstable; urgency=medium
* Do not use march=native flag
......
......@@ -6,7 +6,7 @@ Priority: optional
Build-Depends: debhelper (>= 11~),
cmake,
zlib1g-dev
Standards-Version: 4.2.0
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/diamond-aligner
Vcs-Git: https://salsa.debian.org/med-team/diamond-aligner.git
Homepage: https://github.com/bbuchfink/diamond
......
[0.9.23]
- Fixed an issue that could cause too high memory usage.
- Added output field 'qqual' to print query FASTQ quality values to the tabular format.
- Changed license to GPL.
- Raised compiler requirement to GCC 4.6.
- Added option to use the DAA output format for diamond view.
- Added support for using a FASTA file as the --db parameter in alignment workflows.
- Added CL (command line) and VN (version) fields to the @PG SAM format header line.
- Fixed a performance issue for very long query sequences.
- Added shortcut --long-reads for --range-culling --top 10 -F 15.
[0.9.22]
- Added output field full_sseq to tabular output format.
- Database sequences that exceed the maximum accession length will no longer cause an error.
......
......@@ -3,19 +3,20 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <memory>
#include "../basic/value.h"
#include "align.h"
#include "../data/reference.h"
......@@ -25,7 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "query_mapper.h"
#include "../util/merge_sort.h"
using std::map;
using namespace std;
DpStat dp_stat;
......@@ -35,9 +36,9 @@ struct Align_fetcher
{
it_ = begin;
end_ = end;
queue_ = auto_ptr<Queue>(new Queue(qbegin, qend));
queue_ = unique_ptr<Queue>(new Queue(qbegin, qend));
}
void operator()(size_t query)
bool operator()(size_t query)
{
const unsigned q = (unsigned)query,
c = align_mode.query_contexts;
......@@ -46,19 +47,26 @@ struct Align_fetcher
++it_;
end = it_;
this->query = query;
target_parallel = (end - begin > config.query_parallel_limit) && config.frame_shift != 0 && align_mode.mode == Align_mode::blastx && config.toppercent < 100 && config.query_range_culling;
return target_parallel;
}
bool get()
{
return queue_->get(*this) != Queue::end;
}
void release() {
if (target_parallel)
queue_->release();
}
size_t query;
vector<hit>::iterator begin, end;
bool target_parallel;
private:
static vector<hit>::iterator it_, end_;
static auto_ptr<Queue> queue_;
static unique_ptr<Queue> queue_;
};
auto_ptr<Queue> Align_fetcher::queue_;
unique_ptr<Queue> Align_fetcher::queue_;
vector<hit>::iterator Align_fetcher::it_;
vector<hit>::iterator Align_fetcher::end_;
......@@ -84,27 +92,31 @@ void align_worker(size_t thread_id, const Parameters *params, const Metadata *me
if (config.ext == Config::swipe)
mapper = new ExtensionPipeline::Swipe::Pipeline(*params, hits.query, hits.begin, hits.end);
else if (config.frame_shift != 0)
mapper = new ExtensionPipeline::BandedSwipe::Pipeline(*params, hits.query, hits.begin, hits.end, dp_stat);
mapper = new ExtensionPipeline::BandedSwipe::Pipeline(*params, hits.query, hits.begin, hits.end, dp_stat, hits.target_parallel);
else
mapper = new ExtensionPipeline::Greedy::Pipeline(*params, hits.query, hits.begin, hits.end);
task_timer timer("Initializing mapper", hits.target_parallel ? 3 : UINT_MAX);
mapper->init();
timer.finish();
mapper->run(stat);
timer.go("Generating output");
TextBuffer *buf = 0;
if (*output_format != Output_format::null) {
buf = new TextBuffer;
const bool aligned = mapper->generate_output(*buf, stat, *metadata);
if (aligned && !config.unaligned.empty())
if (aligned && (!config.unaligned.empty() || !config.aligned_file.empty()))
query_aligned[hits.query] = true;
}
delete mapper;
OutputSink::get().push(hits.query, buf);
hits.release();
}
statistics += stat;
::dp_stat += dp_stat;
}
void align_queries(Trace_pt_buffer &trace_pts, OutputFile* output_file, const Parameters &params, const Metadata &metadata)
void align_queries(Trace_pt_buffer &trace_pts, Consumer* output_file, const Parameters &params, const Metadata &metadata)
{
const size_t max_size = (size_t)std::min(config.chunk_size*1e9 * 9 * 2 / config.lowmem, 2e9);
pair<size_t, size_t> query_range;
......@@ -121,11 +133,11 @@ void align_queries(Trace_pt_buffer &trace_pts, OutputFile* output_file, const Pa
v->init();
timer.go("Computing alignments");
Align_fetcher::init(query_range.first, query_range.second, v->begin(), v->end());
OutputSink::instance = auto_ptr<OutputSink>(new OutputSink(query_range.first, output_file));
OutputSink::instance = unique_ptr<OutputSink>(new OutputSink(query_range.first, output_file));
Thread_pool threads;
if (config.verbosity >= 3)
if (config.verbosity >= 3 && config.load_balancing == Config::query_parallel)
threads.push_back(launch_thread(heartbeat_worker, query_range.second));
size_t n_threads = (config.threads_align == 0 ? config.threads_ : config.threads_align);
size_t n_threads = config.load_balancing == Config::query_parallel ? (config.threads_align == 0 ? config.threads_ : config.threads_align) : 1;
for (size_t i = 0; i < n_threads; ++i)
//threads.push_back(launch_thread(static_cast<void(*)(size_t)>(&align_worker), i));
threads.push_back(launch_thread(align_worker, i, &params, &metadata));
......
......@@ -3,16 +3,16 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
......@@ -29,7 +29,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../data/metadata.h"
using std::vector;
using std::auto_ptr;
struct Output_writer
{
......@@ -65,7 +64,7 @@ private:
Task_queue<_buffer, Output_writer> queue;
};
void align_queries(Trace_pt_buffer &trace_pts, OutputFile* output_file, const Parameters &params, const Metadata &metadata);
void align_queries(Trace_pt_buffer &trace_pts, Consumer* output_file, const Parameters &params, const Metadata &metadata);
namespace ExtensionPipeline {
namespace Greedy {
......@@ -94,8 +93,8 @@ namespace ExtensionPipeline {
struct Target;
struct Pipeline : public QueryMapper
{
Pipeline(const Parameters &params, size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end, DpStat &dp_stat) :
QueryMapper(params, query_id, begin, end),
Pipeline(const Parameters &params, size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end, DpStat &dp_stat, bool target_parallel) :
QueryMapper(params, query_id, begin, end, target_parallel),
dp_stat(dp_stat)
{}
Target& target(size_t i);
......
......@@ -3,23 +3,26 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <algorithm>
#include "align.h"
#include "../dp/dp.h"
#include "../util/interval_partition.h"
using namespace std;
namespace ExtensionPipeline { namespace BandedSwipe {
struct Target : public ::Target
......@@ -55,28 +58,29 @@ struct Target : public ::Target
d1 = std::min(i->diagonal() + band, d_max);
}
else {
v.push_back(DpTarget(subject, d0, d1, &hsps, subject_id));
v.emplace_back(subject, d0, d1, &hsps, subject_id);
d0 = std::max(i->diagonal() - band, d_min);
d1 = std::min(i->diagonal() + band, d_max);
}
}
v.push_back(DpTarget(subject, d0, d1, &hsps, subject_id));
v.emplace_back(subject, d0, d1, &hsps, subject_id);
}
void add(QueryMapper &mapper, vector<DpTarget> &vf, vector<DpTarget> &vr)
{
vector<Seed_hit>::iterator hits = mapper.seed_hits.begin() + begin, hits_end = mapper.seed_hits.begin() + end;
Strand strand = top_hit.strand();
if(mapper.target_parallel)
std::stable_sort(hits, hits_end, Seed_hit::compare_diag_strand);
for (vector<Seed_hit>::iterator i = hits; i < hits_end; ++i)
if (i->strand() == REVERSE) {
if (strand == FORWARD)
add_strand(mapper, vf, hits, i);
else
add_strand(mapper, vr, i, hits_end);
return;
}
if (strand == FORWARD) add_strand(mapper, vf, hits, hits_end);
std::stable_sort(hits, hits_end, Seed_hit::compare_diag_strand2);
const auto it = find_if(hits, hits_end, [](const Seed_hit &x) { return x.strand() == REVERSE; });
if(strand == FORWARD || mapper.target_parallel)
add_strand(mapper, vf, hits, it);
if (strand == REVERSE || mapper.target_parallel)
add_strand(mapper, vr, it, hits_end);
//const int d = hits[0].diagonal();
//const DpTarget t(subject, std::max(d - 32, -int(subject.length() - 1)), std::min(d + 32, int(mapper.query_seq(0).length() - 1)), &hsps, subject_id);
}
......@@ -145,36 +149,97 @@ void Pipeline::run_swipe(bool score_only)
vector<DpTarget> vf, vr;
for (size_t i = 0; i < n_targets(); ++i)
target(i).add(*this, vf, vr);
banded_3frame_swipe(translated_query, FORWARD, vf.begin(), vf.end(), this->dp_stat, score_only);
banded_3frame_swipe(translated_query, REVERSE, vr.begin(), vr.end(), this->dp_stat, score_only);
banded_3frame_swipe(translated_query, FORWARD, vf.begin(), vf.end(), this->dp_stat, score_only, target_parallel);
banded_3frame_swipe(translated_query, REVERSE, vr.begin(), vr.end(), this->dp_stat, score_only, target_parallel);
}
void build_ranking_worker(PtrVector<::Target>::iterator begin, PtrVector<::Target>::iterator end, Atomic<size_t> *next, vector<unsigned> *intervals) {
PtrVector<::Target>::iterator it;
while ((it = begin + next->post_add(64)) < end) {
auto e = min(it + 64, end);
for (; it < e; ++it) {
(*it)->add_ranges(*intervals);
}
}
}
void Pipeline::run(Statistics &stat)
{
//cout << "Query=" << query_ids::get()[this->query_id].c_str() << endl;
task_timer timer("Init banded swipe pipeline", target_parallel ? 3 : UINT_MAX);
Config::set_option(config.padding, 32);
if (n_targets() == 0)
return;
stat.inc(Statistics::TARGET_HITS0, n_targets());
if (!target_parallel) {
timer.go("Ungapped stage");
for (size_t i = 0; i < n_targets(); ++i)
target(i).ungapped_stage(*this);
timer.go("Ranking");
if (!config.query_range_culling)
rank_targets(config.rank_ratio == -1 ? 0.4 : config.rank_ratio, config.rank_factor == -1.0 ? 1e3 : config.rank_factor);
else
range_ranking();
}
else {
timer.finish();
log_stream << "Query: " << query_id << "; Seed hits: " << seed_hits.size() << "; Targets: " << n_targets() << endl;
}
if (n_targets() > config.max_alignments) {
if (n_targets() > config.max_alignments || config.toppercent < 100.0) {
stat.inc(Statistics::TARGET_HITS1, n_targets());
timer.go("Swipe (score only)");
run_swipe(true);
if (target_parallel) {
timer.go("Building score ranking intervals");
vector<vector<unsigned>> intervals(config.threads_);
const size_t interval_count = (source_query_len + ::Target::INTERVAL - 1) / ::Target::INTERVAL;
for (vector<unsigned> &v : intervals)
v.resize(interval_count);
Thread_pool threads;
Atomic<size_t> next(0);
for (unsigned i = 0; i < config.threads_; ++i)
threads.push_back(launch_thread(build_ranking_worker, targets.begin(), targets.end(), &next, &intervals[i]));
threads.join_all();
timer.go("Merging score ranking intervals");
for (auto it = intervals.begin() + 1; it < intervals.end(); ++it) {
vector<unsigned>::iterator i = intervals[0].begin(), i1 = intervals[0].end(), j = it->begin();
for (; i < i1; ++i, ++j)
*i = max(*i, *j);
}
timer.go("Finding outranked targets");
for (auto i = targets.begin(); i < targets.end(); ++i)
if ((*i)->is_outranked(intervals[0], 1.0 - config.toppercent / 100)) {
delete *i;
*i = nullptr;
}
timer.go("Removing outranked targets");
vector<::Target*> &v(*static_cast<vector<::Target*>*>(&targets));
auto it = remove_if(v.begin(), v.end(), [](::Target *t) { return t == nullptr; });
v.erase(it, v.end());
timer.finish();
log_stream << "Targets after score-only ranking: " << targets.size() << endl;
}
else {
timer.go("Score only culling");
for (size_t i = 0; i < n_targets(); ++i)
target(i).set_filter_score();
score_only_culling();
}
}
timer.go("Swipe (traceback)");
stat.inc(Statistics::TARGET_HITS2, n_targets());
for (size_t i = 0; i < n_targets(); ++i)
target(i).reset();
run_swipe(false);
timer.go("Inner culling");
for (size_t i = 0; i < n_targets(); ++i)
target(i).finish(*this);
}
......
......@@ -3,16 +3,16 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
......
......@@ -3,16 +3,16 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
......@@ -82,7 +82,7 @@ struct Target : public ::Target
if (filter_score == 0)
return;
if (config.ext == Config::more_greedy)
hsps.push_back(Hsp(filter_score));
hsps.emplace_back(filter_score);
else {
const int qlen = (int)mapper.query_seq(0).length(),
band_plus = qlen <= 50 ? 0 : 16;
......@@ -91,7 +91,7 @@ struct Target : public ::Target
if (log_ga) {
cout << "i_begin=" << i->query_range.begin_ << " j_begin=" << i->subject_range.begin_ << " d_min=" << i->d_min << " d_max=" << i->d_max << endl;
}
hsps.push_back(Hsp());
hsps.emplace_back();
hsps.back().frame = i->frame;
banded_sw(mapper.query_seq(i->frame), subject, i->d_min - band_plus, i->d_max + band_plus + 1, 0, (int)subject.length(), hsps.back());
if (config.comp_based_stats) {
......@@ -126,7 +126,7 @@ struct Target : public ::Target
ts.clear();
for (list<Hsp>::iterator i = hsps.begin(); i != hsps.end(); ++i)
ts.push_back(Hsp_traits(i->query_source_range));
ts.emplace_back(i->query_source_range);
if (config.use_smith_waterman && !hsps.empty()) {
int score;
......
......@@ -3,19 +3,21 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <memory>
#include <algorithm>
#include "query_mapper.h"
#include "../data/reference.h"
#include "extend_ungapped.h"
......@@ -24,6 +26,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../output/daa_write.h"
#include "../output/target_culling.h"
using namespace std;
bool Target::envelopes(const Hsp_traits &t, double p) const
{
for (list<Hsp_traits>::const_iterator i = ts.begin(); i != ts.end(); ++i)
......@@ -48,19 +52,40 @@ bool Target::is_enveloped(PtrVector<Target>::const_iterator begin, PtrVector<Tar
return false;
}
void Target::add_ranges(vector<unsigned> &v) {
for (const Hsp &hsp : hsps) {
const int i0 = hsp.query_source_range.begin_ / INTERVAL,
i1 = min(hsp.query_source_range.end_ / INTERVAL, int(v.size() - 1));
for (int i = i0; i <= i1; ++i)
v[i] = max(v[i], hsp.score);
}
}
bool Target::is_outranked(const vector<unsigned> &v, double treshold) {
for (const Hsp &hsp : hsps) {
const int i0 = hsp.query_source_range.begin_ / INTERVAL,
i1 = min(hsp.query_source_range.end_ / INTERVAL, int(v.size() - 1));
for (int i = i0; i <= i1; ++i)
if (hsp.score >= unsigned(v[i] * treshold))
return false;
}
return true;
}
int QueryMapper::raw_score_cutoff() const
{
return score_matrix.rawscore(config.min_bit_score == 0 ? score_matrix.bitscore(config.max_evalue, (unsigned)query_seq(0).length()) : config.min_bit_score);
}
QueryMapper::QueryMapper(const Parameters &params, size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end) :
QueryMapper::QueryMapper(const Parameters &params, size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end, bool target_parallel) :
parameters(params),
source_hits(std::make_pair(begin, end)),
query_id((unsigned)query_id),
targets_finished(0),
next_target(0),
source_query_len(get_source_query_len((unsigned)query_id)),
translated_query(get_translated_query(query_id))
translated_query(get_translated_query(query_id)),
target_parallel(target_parallel)
{
seed_hits.reserve(source_hits.second - source_hits.first);
}
......@@ -71,10 +96,10 @@ void QueryMapper::init()
cout << "Query = " << query_ids::get()[query_id].c_str() << endl;
if (config.comp_based_stats == 1)
for (unsigned i = 0; i < align_mode.query_contexts; ++i)
query_cb.push_back(Bias_correction(query_seq(i)));
query_cb.emplace_back(query_seq(i));
if (config.ext == Config::greedy || config.ext == Config::more_greedy)
for (unsigned i = 0; i < align_mode.query_contexts; ++i)
profile.push_back(Long_score_profile(query_seq(i)));
profile.emplace_back(query_seq(i));
//profile.push_back(Long_score_profile());
targets.resize(count_targets());
if (targets.empty())
......@@ -94,17 +119,22 @@ unsigned QueryMapper::count_targets()
const unsigned frame = hits[i].query_ % align_mode.query_contexts;
/*const Diagonal_segment d = config.comp_based_stats ? xdrop_ungapped(query_seq(frame), query_cb[frame], ref_seqs::get()[l.first], hits[i].seed_offset_, (int)l.second)
: xdrop_ungapped(query_seq(frame), ref_seqs::get()[l.first], hits[i].seed_offset_, (int)l.second);*/
if (target_parallel) {
seed_hits.emplace_back(frame, (unsigned)l.first, (unsigned)l.second, (unsigned)hits[i].seed_offset_, Diagonal_segment());
if (l.first != subject_id) {
subject_id = l.first;
++n_subject;
}
}
else {
const Diagonal_segment d = xdrop_ungapped(query_seq(frame), ref_seqs::get()[l.first], hits[i].seed_offset_, (int)l.second);
if (d.score >= config.min_ungapped_raw_score) {
if (l.first != subject_id) {
subject_id = l.first;
++n_subject;
}
seed_hits.push_back(Seed_hit(frame,
(unsigned)l.first,
(unsigned)l.second,
hits[i].seed_offset_,
d));
seed_hits.emplace_back(frame, (unsigned)l.first, (unsigned)l.second, (unsigned)hits[i].seed_offset_, d);
}
}
}
return n_subject;
......@@ -155,7 +185,7 @@ void QueryMapper::rank_targets(double ratio, double factor)
void QueryMapper::score_only_culling()
{
std::stable_sort(targets.begin(), targets.end(), Target::compare);
auto_ptr<TargetCulling> target_culling(TargetCulling::get());
unique_ptr<TargetCulling> target_culling(TargetCulling::get());
const unsigned query_len = (unsigned)query_seq(0).length();
PtrVector<Target>::iterator i;
for (i = targets.begin(); i<targets.end();) {
......@@ -184,13 +214,14 @@ bool QueryMapper::generate_output(TextBuffer &buffer, Statistics &stat, const Me
std::stable_sort(targets.begin(), targets.end(), Target::compare);
unsigned n_hsp = 0, n_target_seq = 0, hit_hsps = 0;
auto_ptr<TargetCulling> target_culling(TargetCulling::get());
unique_ptr<TargetCulling> target_culling(TargetCulling::get());
const unsigned query_len = (unsigned)query_seq(0).length();
size_t seek_pos = 0;
const char *query_title = query_ids::get()[query_id].c_str();
auto_ptr<Output_format> f(output_format->clone());
unique_ptr<Output_format> f(output_format->clone());
for (size_t i = 0; i < targets.size(); ++i) {
if ((config.min_bit_score == 0 && score_matrix.evalue(targets[i].filter_score, query_len) > config.max_evalue)
|| score_matrix.bitscore(targets[i].filter_score) < config.min_bit_score)
break;
......@@ -230,7 +261,7 @@ bool QueryMapper::generate_output(TextBuffer &buffer, Statistics &stat, const Me
f->print_query_intro(query_id, query_title, source_query_len, buffer, false);
}
if (*f == Output_format::daa)
write_daa_record(buffer, *j, query_id, subject_id);
write_daa_record(buffer, *j, subject_id);
else
f->print_match(Hsp_context(*j,
query_id,
......@@ -301,7 +332,8 @@ void Target::apply_filters(int dna_len, int subject_len, const char *query_title
&& i->identities == i->length
&& i->query_source_range.length() == (int)dna_len
&& i->subject_range.length() == (int)subject_len
&& strcmp(query_title, ref_title) == 0))
&& strcmp(query_title, ref_title) == 0)
|| (config.filter_locus && !i->subject_range.includes(config.filter_locus)))
i = hsps.erase(i);
else
++i;
......
......@@ -3,16 +3,16 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
......@@ -56,6 +56,8 @@ struct Target
for (list<Hsp_traits>::iterator i = ts.begin(); i != ts.end(); ++i)
i->query_source_range = TranslatedPosition::absolute_interval(TranslatedPosition(i->query_range.begin_, Frame(i->frame)), TranslatedPosition(i->query_range.end_, Frame(i->frame)), (int)query_len);
}
void add_ranges(vector<unsigned> &v);
bool is_outranked(const vector<unsigned> &v, double treshold);
bool envelopes(const Hsp_traits &t, double p) const;
bool is_enveloped(const Target &t, double p) const;
bool is_enveloped(PtrVector<Target>::const_iterator begin, PtrVector<Target>::const_iterator end, double p, int min_score) const;
......@@ -70,11 +72,13 @@ struct Target
list<Hsp> hsps;
list<Hsp_traits> ts;
Seed_hit top_hit;
enum { INTERVAL = 64 };
};
struct QueryMapper
{
QueryMapper(const Parameters &params, size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end);
QueryMapper(const Parameters &params, size_t query_id, Trace_pt_list::iterator begin, Trace_pt_list::iterator end, bool target_parallel = false);
void init();
bool generate_output(TextBuffer &buffer, Statistics &stat, const Metadata &metadata);
void rank_targets(double ratio, double factor);
......@@ -109,6 +113,7 @@ struct QueryMapper
vector<Bias_correction> query_cb;
vector<Long_score_profile> profile;
TranslatedSequence translated_query;
bool target_parallel;
private:
......
......@@ -3,16 +3,16 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
......
......@@ -3,16 +3,16 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
......@@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "sequence.h"
#include "masking.h"
const char* Const::version_string = "0.9.22";
const char* Const::version_string = "0.9.23";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
......
......@@ -3,19 +3,20 @@ DIAMOND protein aligner
Copyright (C) 2013-2017 Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <memory>
#include "../util/command_line_parser.h"
#include "config.h"
#include "../util/util.h"
......@@ -32,6 +33,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../basic/translate.h"
#include "../dp/dp.h"
#include "masking.h"
#include "../util/system/system.h"
using namespace std;
Config config;
......@@ -63,7 +67,10 @@ Config::Config(int argc, const char **argv)
.add_command("db-annot-stats", "")
.add_command("read-sim", "")
.add_command("info", "")
.add_command("seed-stat", "");
.add_command("seed-stat", "")
.add_command("smith-waterman", "")
.add_command("protein-snps", "")
.add_command("cluster", "");
Options_group general("General options");
general.add()
......@@ -106,8 +113,9 @@ Config::Config(int argc, const char **argv)
\tstitle means Subject Title\n\
\tsalltitles means All Subject Title(s), separated by a '<>'\n\
\tqcovhsp means Query Coverage Per HSP\n\
\tqtitle means Query title\n\n\
\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format)
\tqtitle means Query title\n\
\tqqual means Query quality values\n\
\n\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format)
("verbose", 'v', "verbose console output", verbose)
("log", 0, "enable debug log", debug_log)
("quiet", 0, "disable console output", quiet);
......@@ -121,6 +129,7 @@ Config::Config(int argc, const char **argv)
("query", 'q', "input query file", query_file)
("strand", 0, "query strands to search (both/minus/plus)", query_strands, string("both"))
("un", 0, "file for unaligned queries", unaligned)
("al", 0, "file or aligned queries", aligned_file)
("unal", 0, "report unaligned queries (0=no, 1=yes)", report_unaligned, -1)
("max-target-seqs", 'k', "maximum number of target sequences to report alignments for", max_alignments, uint64_t(25))
("top", 0, "report alignments within this percentage range of top alignment score (overrides --max-target-seqs)", toppercent, 100.0)
......@@ -139,6 +148,7 @@ Config::Config(int argc, const char **argv)
("gapopen", 0, "gap open penalty", gap_open, -1)
("gapextend", 0, "gap extension penalty", gap_extend, -1)
("frameshift", 'F', "frame shift penalty (default=disabled)", frame_shift)
("long-reads", 0, "short for --range-culling --top 10 -F 15", long_reads)
("matrix", 0, "score matrix for protein alignment (default=BLOSUM62)", matrix, string("blosum62"))
("custom-matrix", 0, "file containing custom scoring matrix", matrix_file)
("lambda", 0, "lambda parameter for custom matrix", lambda)
......@@ -223,7 +233,7 @@ Config::Config(int argc, const char **argv)
("sw", 0, "", use_smith_waterman)
("superblock", 0, "", superblock, 128)
("max-cells", 0, "", max_cells, 10000000u)
("lb", 0, "", load_balancing, (unsigned)Config::query_parallel)
("load-balancing", 0, "", load_balancing, (unsigned)Config::query_parallel)
("ext", 0, "", ext, (int)Config::greedy)
("br", 0, "", benchmark_ranking)
("log-query", 0, "", log_query)
......@@ -246,11 +256,24 @@ Config::Config(int argc, const char **argv)
("hash-join", 0, "", hash_join)
("sort-join", 0, "", sort_join)
("simple-freq", 0, "", simple_freq)
("freq-treshold", 0, "", freq_treshold);
("freq-treshold", 0, "", freq_treshold)
("filter-locus", 0, "", filter_locus)
("use-dataset-field", 0, "", use_dataset_field)
("store-query-quality", 0, "", store_query_quality)
("swipe-chunk-size", 0, "", swipe_chunk_size, 256u)
("query-parallel-limit", 0, "", query_parallel_limit, 1000000u);
parser.add(general).add(makedb).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options);
parser.store(argc, argv, command);
if (long_reads) {
query_range_culling = true;
if (toppercent == 100.0)
toppercent = 10.0;
if (frame_shift == 0)
frame_shift = 15;
}
switch (command) {
case Config::makedb:
if (database == "")
......@@ -319,12 +342,14 @@ Config::Config(int argc, const char **argv)
;
}
for (int i = 0; i < argc; ++i)
log_stream << argv[i] << ' ';
log_stream << endl;
invocation = join(' ', vector<string>(&argv[0], &argv[argc]));
log_stream << invocation << endl;
if (!no_auto_append) {
if (command == Config::makedb)
auto_append_extension(database, ".dmnd");
else
auto_append_extension_if_exists(database, ".dmnd");
if (command == Config::view)
auto_append_extension(daa_file, ".daa");
if (compression == 1)
......@@ -332,7 +357,7 @@ Config::Config(int argc, const char **argv)
}
message_stream << Const::program_name << " v" << Const::version_string << "." << (unsigned)Const::build_version << " | by Benjamin Buchfink <buchfink@gmail.com>" << endl;
message_stream << "Licensed under the GNU AGPL <https://www.gnu.org/licenses/agpl.txt>" << endl;
message_stream << "Licensed under the GNU GPL <https://www.gnu.org/licenses/gpl.txt>" << endl;
message_stream << "Check http://github.com/bbuchfink/diamond for updates." << endl << endl;
#ifndef NDEBUG
verbose_stream << "Assertions enabled." << endl;
......@@ -344,6 +369,7 @@ Config::Config(int argc, const char **argv)
case Config::blastp:
case Config::blastx:
case Config::view:
case Config::cluster:
message_stream << "#CPU threads: " << threads_ << endl;
default:
;
......@@ -357,6 +383,7 @@ Config::Config(int argc, const char **argv)
case Config::opt:
case Config::mask:
case Config::makedb:
case Config::cluster:
if (frame_shift != 0 && command == Config::blastp)
throw std::runtime_error("Frameshift alignments are only supported for translated searches.");
if (query_range_culling && frame_shift == 0)
......@@ -372,11 +399,11 @@ Config::Config(int argc, const char **argv)
}
message_stream << "Scoring parameters: " << score_matrix << endl;
if (masking == 1)
Masking::instance = auto_ptr<Masking>(new Masking(score_matrix));
Masking::instance = unique_ptr<Masking>(new Masking(score_matrix));
}
if (command == Config::blastp || command == Config::blastx || command == Config::benchmark || command == Config::model_sim || command == Config::opt
|| command == Config::mask) {
|| command == Config::mask || command == Config::cluster) {
if (tmpdir == "")
tmpdir = extract_dir(output_file);
......@@ -398,4 +425,6 @@ Config::Config(int argc, const char **argv)
/*log_stream << "sizeof(hit)=" << sizeof(hit) << " sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t)
<< " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) << endl;*/
use_lazy_dict = false;
}