Skip to content
Commits on Source (31)
cmake_minimum_required(VERSION 3.2)
project(spoa LANGUAGES CXX VERSION 1.1.3)
project(spoa LANGUAGES CXX VERSION 3.0.0)
include(GNUInstallDirs)
......@@ -7,13 +7,18 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic -march=native")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
option(spoa_build_executable "Build spoa standalone tool" OFF)
option(spoa_build_tests "Build spoa unit tests" OFF)
option(spoa_optimize_for_native "Buiold spoa with march=native" ON)
if (spoa_optimize_for_native)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif()
# build SPOA as a static library by default
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared")
......@@ -53,7 +58,7 @@ if (spoa_build_executable)
set_target_properties(spoa_bin PROPERTIES OUTPUT_NAME spoa)
install(TARGETS spoa_bin DESTINATION ${CMAKE_INSTALL_BINDIR})
endif(spoa_build_executable)
endif()
if (spoa_build_tests)
set(spoa_test_data_path ${PROJECT_SOURCE_DIR}/test/data/)
......@@ -69,7 +74,9 @@ if (spoa_build_tests)
if (NOT TARGET bioparser)
add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
endif()
if (NOT TARGET gtest_main)
add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
endif()
target_link_libraries(spoa_test spoa bioparser gtest_main)
endif(spoa_build_tests)
endif()
......@@ -4,7 +4,7 @@
[![Build status for c++/clang++](https://travis-ci.org/rvaser/spoa.svg?branch=master)](https://travis-ci.org/rvaser/spoa)
[![Published in Genome Research](https://img.shields.io/badge/published%20in-Genome%20Research-blue.svg)](https://doi.org/10.1101/gr.214270.116)
Spoa (SIMD POA) is a c++ implementation of the partial order alignment (POA) algorithm (as described in 10.1093/bioinformatics/18.3.452) which is used to generate consensus sequences (as described in 10.1093/bioinformatics/btg109). It supports three alignment modes: local (Smith-Waterman), global (Needleman-Wunsch) and semi-global alignment (overlap). It supports Intel SSE4.1+ and AVX2 (marginally faster due to high latency shifts) vectorization.
Spoa (SIMD POA) is a c++ implementation of the partial order alignment (POA) algorithm (as described in 10.1093/bioinformatics/18.3.452) which is used to generate consensus sequences (as described in 10.1093/bioinformatics/btg109). It supports three alignment modes: local (Smith-Waterman), global (Needleman-Wunsch) and semi-global alignment (overlap), and three gap modes: linear, affine and convex (piecewise affine). It supports Intel SSE4.1+ and AVX2 vectorization (marginally faster due to high latency shifts).
## Dependencies
......@@ -39,25 +39,40 @@ Optionally, you can run `sudo make install` to install spoa library (and executa
***Note***: if you omitted `--recursive` from `git clone`, run `git submodule init` and `git submodule update` before proceeding with compilation.
To build unit tests add `-Dspoa_build_tests=ON` while running `cmake`. After installation, an executable named `spoa_test` will be created in `build/bin`.
## Usage
Usage of spoa is as following:
```bash
spoa [options ...] <sequences>
<sequences>
input file in FASTA/FASTQ format containing sequences
input file in FASTA/FASTQ format (can be compressed with gzip)
containing sequences
options:
-m, --match <int>
-m <int>
default: 5
score for matching bases
-x, --mismatch <int>
-n <int>
default: -4
score for mismatching bases
-g, --gap <int>
-g <int>
default: -8
gap penalty (must be negative)
gap opening penalty (must be non-positive)
-e <int>
default: -6
gap extension penalty (must be non-positive)
-q <int>
default: -10
gap opening penalty of the second affine function
(must be non-positive)
-c <int>
default: -4
gap extension penalty of the second affine function
(must be non-positive)
-l, --algorithm <int>
default: 0
alignment mode:
......@@ -70,11 +85,19 @@ Usage of spoa is as following:
0 - consensus
1 - multiple sequence alignment
2 - 0 & 1
-d, --dot <file>
output file for the final POA graph in DOT format
--version
prints the version number
-h, --help
prints the usage
gap mode:
linear if g >= e
affine if g <= q or e >= c
convex otherwise (default)
```
### Library
Simple library usage can be seen in the following `example.cpp` file. This code shows how to get consensus and multiple sequence alignment for a set of sequences without quality values.
......@@ -94,12 +117,12 @@ int main(int argc, char** argv) {
};
auto alignment_engine = spoa::createAlignmentEngine(static_cast<spoa::AlignmentType>(atoi(argv[1])),
atoi(argv[2]), atoi(argv[3]), atoi(argv[4]));
atoi(argv[2]), atoi(argv[3]), atoi(argv[4]), atoi(argv[5]));
auto graph = spoa::createGraph();
for (const auto& it: sequences) {
auto alignment = alignment_engine->align_sequence_with_graph(it, graph);
auto alignment = alignment_engine->align(it, graph);
graph->add_alignment(alignment, it);
}
......@@ -131,7 +154,7 @@ g++ example.cpp -std=c++11 -lspoa -o example
The executable can be run with:
```bash
./example 0 5 -4 -8
./example 0 5 -4 -8 -6
```
On the other hand, if you are using `cmake` you can add spoa to your project by adding commands `add_subdirectory(vendor/spoa EXCLUDE_FROM_ALL)` and `target_link_libraries(your_exe spoa)` to your main CMakeLists file.
......
spoa (3.0.1-1~bpo9+1) stretch-backports-sloppy; urgency=medium
* Rebuild for stretch-backports.
-- Andreas Tille <tille@debian.org> Tue, 12 Nov 2019 16:10:51 +0100
spoa (3.0.1-1) unstable; urgency=medium
* Fix symbols file
Closes: #925835, #940280
* New upstream version
* debhelper-compat 12
-- Andreas Tille <tille@debian.org> Mon, 16 Sep 2019 13:24:48 +0200
spoa (3.0.0-1) unstable; urgency=medium
* New upstream version
* Standards-Version: 4.4.0
* Versioned Build-Depends: libbioparser-dev (>= 2.0)
* Bump soversion to 3
-- Andreas Tille <tille@debian.org> Mon, 15 Jul 2019 16:12:17 +0200
spoa (1.1.5-1) unstable; urgency=medium
* New upstream version
* debhelper 12
* Standards-Version: 4.3.0
* Upstream changed ABI and we stick here to the broken way to set
soversion = version but upstream was informed to invent better soversions
https://github.com/rvaser/spoa/issues/14
-- Andreas Tille <tille@debian.org> Mon, 28 Jan 2019 21:13:39 +0100
spoa (1.1.3-4) unstable; urgency=medium
* Set architecture to amd64 since the code is not really portable
Closes: #913574
-- Andreas Tille <tille@debian.org> Thu, 06 Dec 2018 11:35:45 +0100
spoa (1.1.3-3) unstable; urgency=medium
* Do not build with -march
Closes: #911970
* Remove trailing whitespace in debian/copyright
* Mark two symbols as optional
-- Andreas Tille <tille@debian.org> Sat, 03 Nov 2018 07:38:44 +0100
spoa (1.1.3-2~bpo9+1) stretch-backports; urgency=medium
* Rebuild for stretch-backports.
......
......@@ -3,20 +3,20 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
Uploaders: Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
Build-Depends: debhelper-compat (= 12),
cmake,
d-shlibs,
rename,
libbioparser-dev,
libbioparser-dev (>= 2.0),
libgtest-dev,
zlib1g-dev
Standards-Version: 4.2.1
Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/spoa
Vcs-Git: https://salsa.debian.org/med-team/spoa.git
Homepage: https://github.com/rvaser/spoa
Package: spoa
Architecture: any
Architecture: amd64
Depends: ${shlibs:Depends},
${misc:Depends}
Description: SIMD partial order alignment tool
......@@ -27,8 +27,8 @@ Description: SIMD partial order alignment tool
(Smith-Waterman), global (Needleman-Wunsch) and semi-global alignment
(overlap).
Package: libspoa1.1.3
Architecture: any
Package: libspoa3
Architecture: amd64
Section: libs
Depends: ${shlibs:Depends},
${misc:Depends}
......@@ -43,11 +43,11 @@ Description: SIMD partial order alignment library
This package contains the shared library.
Package: libspoa-dev
Architecture: any
Architecture: amd64
Section: libdevel
Depends: ${shlibs:Depends},
${misc:Depends},
libspoa1.1.3 (= ${binary:Version})
libspoa3 (= ${binary:Version})
Description: SIMD partial order alignment library (development files)
Spoa (SIMD POA) is a c++ implementation of the partial order alignment
(POA) algorithm (as described in 10.1093/bioinformatics/18.3.452) which
......
Note: Patch is not activated to not derive to much from upstream
Description: Invent SOVERSION different from upstream version
Author: Andreas Tille <tille@debian.org>
Last-Update: Mon, 28 Jan 2019 20:04:58 +0100
Bug-Upstream: https://github.com/rvaser/spoa/issues/14
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.2)
-project(spoa LANGUAGES CXX VERSION 3.0.0)
+project(spoa LANGUAGES CXX VERSION 3)
include(GNUInstallDirs)
use_debian_packaged_libs.patch
shared_and_static.patch
fix_soversion.patch
......@@ -4,8 +4,8 @@ Description: Build shared and static lib
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,9 +16,15 @@ option(spoa_build_executable "Build spoa
option(spoa_build_tests "Build spoa unit tests" OFF)
@@ -21,9 +21,15 @@ if (spoa_optimize_for_native)
endif()
# build SPOA as a static library by default
-set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared")
......@@ -22,7 +22,7 @@ Description: Build shared and static lib
src/alignment_engine.cpp
src/graph.cpp
src/simd_alignment_engine.cpp
@@ -28,12 +34,17 @@ target_include_directories(spoa PUBLIC
@@ -33,12 +39,17 @@ target_include_directories(spoa PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
......
......@@ -4,7 +4,7 @@ Description: Use Debian packaged libraries
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,11 +45,7 @@ if (spoa_build_executable)
@@ -50,11 +50,7 @@ if (spoa_build_executable)
src/sequence.cpp
src/main.cpp)
......@@ -17,15 +17,17 @@ Description: Use Debian packaged libraries
set_target_properties(spoa_bin PROPERTIES OUTPUT_NAME spoa)
install(TARGETS spoa_bin DESTINATION ${CMAKE_INSTALL_BINDIR})
@@ -66,10 +62,5 @@ if (spoa_build_tests)
@@ -71,12 +67,5 @@ if (spoa_build_tests)
src/sequence.cpp
test/spoa_test.cpp)
- if (NOT TARGET bioparser)
- add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
- endif()
- if (NOT TARGET gtest_main)
- add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
- endif()
-
- target_link_libraries(spoa_test spoa bioparser gtest_main)
+ target_link_libraries(spoa_test spoa gtest_main gtest z pthread)
endif(spoa_build_tests)
endif()
......@@ -6,7 +6,8 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all
DEB_CMAKE_EXTRA_FLAGS = -DCMAKE_BUILD_TYPE=Release \
-Dspoa_build_executable=ON \
-Dspoa_build_tests=ON
-Dspoa_build_tests=ON \
-Dspoa_optimize_for_native=FALSE
%:
dh $@
......
......@@ -6,7 +6,7 @@
#pragma once
#include <stdint.h>
#include <cstdint>
#include <string>
#include <memory>
#include <vector>
......@@ -20,34 +20,54 @@ enum class AlignmentType {
kOV // Overlap
};
enum class AlignmentSubtype {
kLinear,
kAffine,
kConvex
};
class Graph;
using Alignment = std::vector<std::pair<int32_t, int32_t>>;
using Alignment = std::vector<std::pair<std::int32_t, std::int32_t>>;
class AlignmentEngine;
std::unique_ptr<AlignmentEngine> createAlignmentEngine(
AlignmentType alignment_type, int8_t match, int8_t mismatch, int8_t gap);
std::unique_ptr<AlignmentEngine> createAlignmentEngine(AlignmentType type,
std::int8_t m, std::int8_t n, std::int8_t g);
std::unique_ptr<AlignmentEngine> createAlignmentEngine(AlignmentType type,
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e);
std::unique_ptr<AlignmentEngine> createAlignmentEngine(AlignmentType type,
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e,
std::int8_t q, std::int8_t c);
class AlignmentEngine {
public:
virtual ~AlignmentEngine() {}
virtual void prealloc(uint32_t max_sequence_size, uint32_t alphabet_size) = 0;
virtual void prealloc(std::uint32_t max_sequence_size,
std::uint32_t alphabet_size) = 0;
Alignment align_sequence_with_graph(const std::string& sequence,
Alignment align(const std::string& sequence,
const std::unique_ptr<Graph>& graph);
virtual Alignment align_sequence_with_graph(const char* sequence,
uint32_t sequence_size, const std::unique_ptr<Graph>& graph) = 0;
virtual Alignment align(const char* sequence, std::uint32_t sequence_size,
const std::unique_ptr<Graph>& graph) noexcept = 0;
protected:
AlignmentEngine(AlignmentType alignment_type, int8_t match, int8_t mismatch,
int8_t gap);
AlignmentEngine(AlignmentType type, AlignmentSubtype subtype, std::int8_t m,
std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q,
std::int8_t c);
AlignmentEngine(const AlignmentEngine&) = delete;
const AlignmentEngine& operator=(const AlignmentEngine&) = delete;
AlignmentType alignment_type_;
int8_t match_;
int8_t mismatch_;
int8_t gap_;
AlignmentType type_;
AlignmentSubtype subtype_;
std::int8_t m_;
std::int8_t n_;
std::int8_t g_;
std::int8_t e_;
std::int8_t q_;
std::int8_t c_;
};
}
......@@ -6,7 +6,7 @@
#pragma once
#include <stdint.h>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
......@@ -21,7 +21,7 @@ class Edge;
class Graph;
std::unique_ptr<Graph> createGraph();
using Alignment = std::vector<std::pair<int32_t, int32_t>>;
using Alignment = std::vector<std::pair<std::int32_t, std::int32_t>>;
class Graph {
public:
......@@ -31,72 +31,80 @@ public:
return nodes_;
}
const std::vector<uint32_t>& rank_to_node_id() const {
const std::vector<std::uint32_t>& rank_to_node_id() const {
return rank_to_node_id_;
}
uint32_t num_codes() const {
std::uint32_t num_codes() const {
return num_codes_;
};
uint8_t coder(uint8_t c) const {
std::uint8_t coder(std::uint8_t c) const {
return coder_[c];
}
uint8_t decoder(uint8_t code) const {
std::uint8_t decoder(std::uint8_t code) const {
return decoder_[code];
}
void add_alignment(const Alignment& alignment, const std::string& sequence,
uint32_t weight = 1);
std::uint32_t weight = 1);
void add_alignment(const Alignment& alignment, const char* sequence,
uint32_t sequence_size, uint32_t weight = 1);
std::uint32_t sequence_size, std::uint32_t weight = 1);
void add_alignment(const Alignment& alignment, const std::string& sequence,
const std::string& quality);
void add_alignment(const Alignment& alignment, const char* sequence,
uint32_t sequence_size, const char* quality, uint32_t quality_size);
std::uint32_t sequence_size, const char* quality,
std::uint32_t quality_size);
void add_alignment(const Alignment& alignment, const std::string& sequence,
const std::vector<uint32_t>& weights);
const std::vector<std::uint32_t>& weights);
void add_alignment(const Alignment& alignment, const char* sequence,
uint32_t sequence_size, const std::vector<uint32_t>& weights);
std::uint32_t sequence_size, const std::vector<std::uint32_t>& weights);
void generate_multiple_sequence_alignment(std::vector<std::string>& dst,
bool include_consensus = false);
std::string generate_consensus();
// returns base coverages or complete summary matrix if verbose equals true
std::string generate_consensus(std::vector<uint32_t>& dst, bool verbose = false);
std::string generate_consensus(std::vector<std::uint32_t>& dst,
bool verbose = false);
std::unique_ptr<Graph> subgraph(uint32_t begin_node_id, uint32_t end_node_id,
std::vector<int32_t>& subgraph_to_graph_mapping) const;
std::unique_ptr<Graph> subgraph(std::uint32_t begin_node_id,
std::uint32_t end_node_id,
std::vector<std::int32_t>& subgraph_to_graph_mapping) const;
void update_alignment(Alignment& alignment,
const std::vector<int32_t>& subgraph_to_graph_mapping) const;
const std::vector<std::int32_t>& subgraph_to_graph_mapping) const;
void print_graphviz() const;
void print_dot(const std::string& path) const;
void clear();
friend std::unique_ptr<Graph> createGraph();
private:
Graph();
Graph(const Graph&) = delete;
const Graph& operator=(const Graph&) = delete;
static std::unique_ptr<Node> createNode(uint32_t id, uint32_t code);
static std::unique_ptr<Node> createNode(std::uint32_t id, std::uint32_t code);
static std::unique_ptr<Edge> createEdge(uint32_t begin_node_id,
uint32_t end_node_id, uint32_t label, uint32_t weight);
static std::unique_ptr<Edge> createEdge(std::uint32_t begin_node_id,
std::uint32_t end_node_id, std::uint32_t label, std::uint32_t weight);
uint32_t add_node(uint32_t code);
std::uint32_t add_node(std::uint32_t code);
void add_edge(uint32_t begin_node_id, uint32_t end_node_id, uint32_t weight);
void add_edge(std::uint32_t begin_node_id, std::uint32_t end_node_id,
std::uint32_t weight);
int32_t add_sequence(const char* sequence, const std::vector<uint32_t>& weights,
uint32_t begin, uint32_t end);
std::int32_t add_sequence(const char* sequence,
const std::vector<std::uint32_t>& weights, std::uint32_t begin,
std::uint32_t end);
void topological_sort();
......@@ -104,34 +112,35 @@ private:
void traverse_heaviest_bundle();
uint32_t branch_completion(std::vector<int64_t>& scores,
std::vector<int32_t>& predecessors,
uint32_t rank);
std::uint32_t branch_completion(std::vector<std::int64_t>& scores,
std::vector<std::int32_t>& predecessors,
std::uint32_t rank);
void extract_subgraph_nodes(std::vector<bool>& dst, uint32_t current_node_id,
uint32_t end_node_id) const;
void extract_subgraph_nodes(std::vector<bool>& dst,
std::uint32_t current_node_id, std::uint32_t end_node_id) const;
uint32_t initialize_multiple_sequence_alignment(std::vector<uint32_t>& dst) const;
std::uint32_t initialize_multiple_sequence_alignment(
std::vector<std::uint32_t>& dst) const;
uint32_t num_sequences_;
uint32_t num_codes_;
std::vector<int32_t> coder_;
std::vector<int32_t> decoder_;
std::uint32_t num_sequences_;
std::uint32_t num_codes_;
std::vector<std::int32_t> coder_;
std::vector<std::int32_t> decoder_;
std::vector<std::unique_ptr<Node>> nodes_;
std::vector<uint32_t> rank_to_node_id_;
std::vector<uint32_t> sequences_begin_nodes_ids_;
std::vector<uint32_t> consensus_;
std::vector<std::uint32_t> rank_to_node_id_;
std::vector<std::uint32_t> sequences_begin_nodes_ids_;
std::vector<std::uint32_t> consensus_;
};
class Node {
public:
~Node();
uint32_t id() const {
std::uint32_t id() const {
return id_;
}
uint32_t code() const {
std::uint32_t code() const {
return code_;
}
......@@ -143,53 +152,55 @@ public:
return out_edges_;
}
const std::vector<uint32_t>& aligned_nodes_ids() const {
const std::vector<std::uint32_t>& aligned_nodes_ids() const {
return aligned_nodes_ids_;
}
bool successor(uint32_t& dst, uint32_t label) const;
bool successor(std::uint32_t& dst, std::uint32_t label) const;
uint32_t coverage() const;
std::uint32_t coverage() const;
friend Graph;
private:
Node(uint32_t id, uint32_t code);
Node(std::uint32_t id, std::uint32_t code);
Node(const Node&) = delete;
const Node& operator=(const Node&) = delete;
uint32_t id_;
uint32_t code_;
std::uint32_t id_;
std::uint32_t code_;
std::vector<std::shared_ptr<Edge>> in_edges_;
std::vector<std::shared_ptr<Edge>> out_edges_;
std::vector<uint32_t> aligned_nodes_ids_;
std::vector<std::uint32_t> aligned_nodes_ids_;
};
class Edge {
public:
~Edge();
uint32_t begin_node_id() const {
std::uint32_t begin_node_id() const {
return begin_node_id_;
}
uint32_t end_node_id() const {
std::uint32_t end_node_id() const {
return end_node_id_;
}
friend Graph;
friend Node;
private:
Edge(uint32_t begin_node_id, uint32_t end_node_id, uint32_t label,
uint32_t weight);
Edge(std::uint32_t begin_node_id, std::uint32_t end_node_id,
std::uint32_t label, std::uint32_t weight);
Edge(const Edge&) = delete;
const Edge& operator=(const Edge&) = delete;
void add_sequence(uint32_t label, uint32_t weight = 1);
void add_sequence(std::uint32_t label, std::uint32_t weight = 1);
uint32_t begin_node_id_;
uint32_t end_node_id_;
std::vector<uint32_t> sequence_labels_;
int64_t total_weight_;
std::uint32_t begin_node_id_;
std::uint32_t end_node_id_;
std::vector<std::uint32_t> sequence_labels_;
std::int64_t total_weight_;
};
}
......@@ -4,9 +4,9 @@
* @brief AlignmentEngine class source file
*/
#include <stdlib.h>
#include <limits>
#include <algorithm>
#include <exception>
#include "sisd_alignment_engine.hpp"
#include "simd_alignment_engine.hpp"
......@@ -14,45 +14,69 @@
namespace spoa {
std::unique_ptr<AlignmentEngine> createAlignmentEngine(
AlignmentType alignment_type, int8_t match, int8_t mismatch,
int8_t gap) {
std::unique_ptr<AlignmentEngine> createAlignmentEngine(AlignmentType type,
std::int8_t m, std::int8_t n, std::int8_t g) {
if (alignment_type != AlignmentType::kSW &&
alignment_type != AlignmentType::kNW &&
alignment_type != AlignmentType::kOV) {
return createAlignmentEngine(type, m, n, g, g);
}
std::unique_ptr<AlignmentEngine> createAlignmentEngine(AlignmentType type,
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e) {
fprintf(stderr, "[spoa::createAlignmentEngine] error: "
"invalid alignment type!\n");
exit(1);
return createAlignmentEngine(type, m, n, g, e, g, e);
}
if (gap >= 0) {
fprintf(stderr, "[spoa::createAlignmentEngine] error: "
"gap penalty must be negative!\n");
exit(1);
std::unique_ptr<AlignmentEngine> createAlignmentEngine(AlignmentType type,
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e,
std::int8_t q, std::int8_t c) {
if (type != AlignmentType::kSW &&
type != AlignmentType::kNW &&
type != AlignmentType::kOV) {
throw std::invalid_argument("[spoa::createAlignmentEngine] error: "
"invalid alignment type!");
}
if (g > 0 || q > 0) {
throw std::invalid_argument("[spoa::createAlignmentEngine] error: "
"gap opening penalty must be non-positive!");
}
if (e > 0 || c > 0) {
throw std::invalid_argument("[spoa::createAlignmentEngine] error: "
"gap extension penalty must be non-positive!");
}
AlignmentSubtype subtype = g >= e ?
AlignmentSubtype::kLinear : (g <= q || e >= c ?
AlignmentSubtype::kAffine : AlignmentSubtype::kConvex);
if (subtype == AlignmentSubtype::kLinear) {
e = g;
} else if (subtype == AlignmentSubtype::kAffine) {
q = g;
c = e;
}
auto alignment_engine = createSimdAlignmentEngine(alignment_type, match,
mismatch, gap);
auto alignment_engine = createSimdAlignmentEngine(type, subtype, m, n, g, e,
q, c);
if (alignment_engine == nullptr) {
return createSisdAlignmentEngine(alignment_type, match, mismatch, gap);
return createSisdAlignmentEngine(type, subtype, m, n, g, e, q, c);
}
return alignment_engine;
}
AlignmentEngine::AlignmentEngine(AlignmentType alignment_type, int8_t match,
int8_t mismatch, int8_t gap)
: alignment_type_(alignment_type), match_(match), mismatch_(mismatch),
gap_(gap) {
AlignmentEngine::AlignmentEngine(AlignmentType type, AlignmentSubtype subtype,
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e,
std::int8_t q, std::int8_t c)
: type_(type), subtype_(subtype), m_(m), n_(n), g_(g), e_(e), q_(q), c_(c) {
}
Alignment AlignmentEngine::align_sequence_with_graph(const std::string& sequence,
Alignment AlignmentEngine::align(const std::string& sequence,
const std::unique_ptr<Graph>& graph) {
return this->align_sequence_with_graph(sequence.c_str(), sequence.size(), graph);
return align(sequence.c_str(), sequence.size(), graph);
}
}
This diff is collapsed.
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cstdint>
#include <string>
#include <iostream>
#include <exception>
#include "sequence.hpp"
#include "spoa/spoa.hpp"
#include "bioparser/bioparser.hpp"
static const char* version = "v1.1.3";
static const std::string version = "v3.0.1";
static struct option options[] = {
{"match", required_argument, 0, 'm'},
{"mismatch", required_argument, 0, 'x'},
{"gap", required_argument, 0, 'g'},
{"algorithm", required_argument, 0, 'l'},
{"result", required_argument, 0, 'r'},
{"version", no_argument, 0, 'v'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
{"algorithm", required_argument, nullptr, 'l'},
{"result", required_argument, nullptr, 'r'},
{"dot", required_argument, nullptr, 'd'},
{"version", no_argument, nullptr, 'v'},
{"help", no_argument, nullptr, 'h'},
{nullptr, 0, nullptr, 0}
};
void help();
int main(int argc, char** argv) {
int8_t match = 5;
int8_t mismatch = -4;
int8_t gap = -8;
std::int8_t m = 5;
std::int8_t n = -4;
std::int8_t g = -8;
std::int8_t e = -6;
std::int8_t q = -10;
std::int8_t c = -4;
std::uint8_t algorithm = 0;
std::uint8_t result = 0;
uint8_t algorithm = 0;
uint8_t result = 0;
std::string dot_path = "";
char opt;
while ((opt = getopt_long(argc, argv, "m:x:g:l:r:h", options, nullptr)) != -1) {
while ((opt = getopt_long(argc, argv, "m:n:g:e:q:c:l:r:d:h", options, nullptr)) != -1) {
switch (opt) {
case 'm':
match = atoi(optarg);
break;
case 'x':
mismatch = atoi(optarg);
break;
case 'g':
gap = atoi(optarg);
break;
case 'l':
algorithm = atoi(optarg);
break;
case 'r':
result = atoi(optarg);
break;
case 'v':
printf("%s\n", version);
exit(0);
case 'h':
help();
exit(0);
default:
exit(1);
case 'm': m = atoi(optarg); break;
case 'n': n = atoi(optarg); break;
case 'g': g = atoi(optarg); break;
case 'e': e = atoi(optarg); break;
case 'q': q = atoi(optarg); break;
case 'c': c = atoi(optarg); break;
case 'l': algorithm = atoi(optarg); break;
case 'r': result = atoi(optarg); break;
case 'd': dot_path = optarg; break;
case 'v': std::cout << version << std::endl; return 0;
case 'h': help(); return 0;
default: return 1;
}
}
if (optind >= argc) {
fprintf(stderr, "[spoa::] error: missing input file!\n");
std::cerr << "[spoa::] error: missing input file!" << std::endl;
help();
exit(1);
return 1;
}
std::string input_path = argv[optind];
auto extension = input_path.substr(std::min(input_path.rfind('.'),
input_path.size()));
std::string sequences_path = argv[optind];
auto is_suffix = [](const std::string& src, const std::string& suffix) -> bool {
if (src.size() < suffix.size()) {
return false;
}
return src.compare(src.size() - suffix.size(), suffix.size(), suffix) == 0;
};
std::unique_ptr<bioparser::Parser<spoa::Sequence>> sparser = nullptr;
if (extension == ".fasta" || extension == ".fa") {
if (is_suffix(sequences_path, ".fasta") || is_suffix(sequences_path, ".fa") ||
is_suffix(sequences_path, ".fasta.gz") || is_suffix(sequences_path, ".fa.gz")) {
sparser = bioparser::createParser<bioparser::FastaParser, spoa::Sequence>(
input_path);
} else if (extension == ".fastq" || extension == ".fq") {
sequences_path);
} else if (is_suffix(sequences_path, ".fastq") || is_suffix(sequences_path, ".fq") ||
is_suffix(sequences_path, ".fastq.gz") || is_suffix(sequences_path, ".fq.gz")) {
sparser = bioparser::createParser<bioparser::FastqParser, spoa::Sequence>(
input_path);
sequences_path);
} else {
fprintf(stderr, "[spoa::] error: "
"file %s has unsupported format extension (valid extensions: "
".fasta, .fa, .fastq, .fq)!\n", input_path.c_str());
exit(1);
std::cerr << "[spoa::] error: file " << sequences_path <<
" has unsupported format extension (valid extensions: .fasta, "
".fasta.gz, .fa, .fa.gz, .fastq, .fastq.gz, .fq, .fq.gz)!" <<
std::endl;
return 1;
}
auto alignment_engine = spoa::createAlignmentEngine(
static_cast<spoa::AlignmentType>(algorithm), match, mismatch, gap);
std::unique_ptr<spoa::AlignmentEngine> alignment_engine;
try {
alignment_engine = spoa::createAlignmentEngine(
static_cast<spoa::AlignmentType>(algorithm), m, n, g, e, q, c);
} catch(std::invalid_argument& exception) {
std::cerr << exception.what() << std::endl;
return 1;
}
auto graph = spoa::createGraph();
std::vector<std::unique_ptr<spoa::Sequence>> sequences;
sparser->parse_objects(sequences, -1);
sparser->parse(sequences, -1);
size_t max_sequence_size = 0;
std::size_t max_sequence_size = 0;
for (const auto& it: sequences) {
max_sequence_size = std::max(max_sequence_size, it->data().size());
}
alignment_engine->prealloc(max_sequence_size, 4);
for (const auto& it: sequences) {
auto alignment = alignment_engine->align_sequence_with_graph(it->data(),
graph);
auto alignment = alignment_engine->align(it->data(), graph);
try {
graph->add_alignment(alignment, it->data(), it->quality());
} catch(std::invalid_argument& exception) {
std::cerr << exception.what() << std::endl;
return 1;
}
}
if (result == 0 || result == 2) {
std::string consensus = graph->generate_consensus();
fprintf(stdout, "Consensus (%zu)\n", consensus.size());
fprintf(stdout, "%s\n", consensus.c_str());
std::cout << "Consensus (" << consensus.size() << ")" << std::endl;
std::cout << consensus << std::endl;
}
if (result == 1 || result == 2) {
std::vector<std::string> msa;
graph->generate_multiple_sequence_alignment(msa);
fprintf(stdout, "Multiple sequence alignment\n");
std::cout << "Multiple sequence alignment" << std::endl;
for (const auto& it: msa) {
fprintf(stdout, "%s\n", it.c_str());
std::cout << it << std::endl;
}
}
graph->print_dot(dot_path);
return 0;
}
void help() {
printf(
std::cout <<
"usage: spoa [options ...] <sequences>\n"
"\n"
" <sequences>\n"
" input file in FASTA/FASTQ format containing sequences\n"
" input file in FASTA/FASTQ format (can be compressed with gzip)\n"
" containing sequences\n"
"\n"
" options:\n"
" -m, --match <int>\n"
" -m <int>\n"
" default: 5\n"
" score for matching bases\n"
" -x, --mismatch <int>\n"
" -n <int>\n"
" default: -4\n"
" score for mismatching bases\n"
" -g, --gap <int>\n"
" -g <int>\n"
" default: -8\n"
" gap penalty (must be negative)\n"
" gap opening penalty (must be non-positive)\n"
" -e <int>\n"
" default: -6\n"
" gap extension penalty (must be non-positive)\n"
" -q <int>\n"
" default: -10\n"
" gap opening penalty of the second affine function\n"
" (must be non-positive)\n"
" -c <int>\n"
" default: -4\n"
" gap extension penalty of the second affine function\n"
" (must be non-positive)\n"
" -l, --algorithm <int>\n"
" default: 0\n"
" alignment mode:\n"
......@@ -152,8 +179,15 @@ void help() {
" 0 - consensus\n"
" 1 - multiple sequence alignment\n"
" 2 - 0 & 1\n"
" -d, --dot <file>\n"
" output file for the final POA graph in DOT format\n"
" --version\n"
" prints the version number\n"
" -h, --help\n"
" prints the usage\n");
" prints the usage\n"
"\n"
" gap mode:\n"
" linear if g >= e\n"
" affine if g <= q or e >= c\n"
" convex otherwise (default)\n";
}
......@@ -8,14 +8,15 @@
namespace spoa {
Sequence::Sequence(const char* name, uint32_t name_size, const char* data,
uint32_t data_size)
Sequence::Sequence(const char* name, std::uint32_t name_size,
const char* data, std::uint32_t data_size)
: name_(name, name_size), data_(data, data_size), quality_(
data_size, 34) {
}
Sequence::Sequence(const char* name, uint32_t name_size, const char* data,
uint32_t data_size, const char* quality, uint32_t quality_size)
Sequence::Sequence(const char* name, std::uint32_t name_size,
const char* data, std::uint32_t data_size,
const char* quality, std::uint32_t quality_size)
: name_(name, name_size), data_(data, data_size), quality_(quality,
quality_size) {
}
......
......@@ -6,7 +6,7 @@
#pragma once
#include <stdint.h>
#include <cstdint>
#include <memory>
#include <vector>
#include <string>
......@@ -23,7 +23,6 @@ namespace spoa {
class Sequence {
public:
~Sequence() = default;
const std::string& name() const {
......@@ -42,11 +41,11 @@ public:
friend bioparser::FastqParser<Sequence>;
private:
Sequence(const char* name, uint32_t name_size, const char* data,
uint32_t data_size);
Sequence(const char* name, uint32_t name_size, const char* data,
uint32_t data_size, const char* quality, uint32_t quality_size);
Sequence(const char* name, std::uint32_t name_size,
const char* data, std::uint32_t data_size);
Sequence(const char* name, std::uint32_t name_size,
const char* data, std::uint32_t data_size,
const char* quality, std::uint32_t quality_size);
Sequence(const Sequence&) = delete;
const Sequence& operator=(const Sequence&) = delete;
......
This diff is collapsed.
......@@ -6,11 +6,10 @@
#pragma once
#include <stdint.h>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include "spoa/alignment_engine.hpp"
......@@ -19,37 +18,51 @@ namespace spoa {
class Graph;
class SimdAlignmentEngine;
std::unique_ptr<AlignmentEngine> createSimdAlignmentEngine(
AlignmentType alignment_type, int8_t match, int8_t mismatch, int8_t gap);
std::unique_ptr<AlignmentEngine> createSimdAlignmentEngine(AlignmentType type,
AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g,
std::int8_t e, std::int8_t q, std::int8_t c);
class SimdAlignmentEngine: public AlignmentEngine {
public:
~SimdAlignmentEngine();
void prealloc(uint32_t max_sequence_size, uint32_t alphabet_size) override;
void prealloc(std::uint32_t max_sequence_size,
std::uint32_t alphabet_size) override;
Alignment align_sequence_with_graph(const char* sequence,
uint32_t sequence_size, const std::unique_ptr<Graph>& graph) override;
Alignment align(const char* sequence, std::uint32_t sequence_size,
const std::unique_ptr<Graph>& graph) noexcept override;
friend std::unique_ptr<AlignmentEngine> createSimdAlignmentEngine(
AlignmentType alignment_type, int8_t match, int8_t mismatch, int8_t gap);
AlignmentType type, AlignmentSubtype subtype, std::int8_t m,
std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q,
std::int8_t c);
private:
SimdAlignmentEngine(AlignmentType alignment_type, int8_t match,
int8_t mismatch, int8_t gap);
SimdAlignmentEngine(AlignmentType type, AlignmentSubtype subtype,
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e,
std::int8_t q, std::int8_t c);
SimdAlignmentEngine(const SimdAlignmentEngine&) = delete;
const SimdAlignmentEngine& operator=(const SimdAlignmentEngine&) = delete;
template<typename T>
Alignment align(const char* sequence, uint32_t sequence_size,
Alignment linear(const char* sequence, std::uint32_t sequence_size,
const std::unique_ptr<Graph>& graph) noexcept;
template<typename T>
Alignment affine(const char* sequence, std::uint32_t sequence_size,
const std::unique_ptr<Graph>& graph) noexcept;
template<typename T>
Alignment convex(const char* sequence, std::uint32_t sequence_size,
const std::unique_ptr<Graph>& graph) noexcept;
void realloc(uint32_t matrix_width, uint32_t matrix_height,
uint32_t num_codes);
void realloc(std::uint32_t matrix_width, std::uint32_t matrix_height,
std::uint32_t num_codes);
template<typename T>
void initialize(const char* sequence, const std::unique_ptr<Graph>& graph,
uint32_t normal_matrix_width, uint32_t matrix_width,
uint32_t matrix_height) noexcept;
std::uint32_t normal_matrix_width, std::uint32_t matrix_width,
std::uint32_t matrix_height) noexcept;
struct Implementation;
std::unique_ptr<Implementation> pimpl_;
......