Skip to content
Commits on Source (11)
......@@ -16,3 +16,9 @@
[submodule "vendor/rampler"]
path = vendor/rampler
url = https://github.com/rvaser/rampler
[submodule "vendor/logger"]
path = vendor/logger
url = https://github.com/rvaser/logger
[submodule "vendor/ClaraGenomicsAnalysis"]
path = vendor/ClaraGenomicsAnalysis
url = https://github.com/clara-genomics/ClaraGenomicsAnalysis
dist: trusty
language: cpp
compiler:
......
......@@ -8,17 +8,40 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
option(racon_build_tests "Build racon unit tests" OFF)
option(racon_build_wrapper "Build racon wrapper" OFF)
option(racon_enable_cuda "Build racon with NVIDIA CUDA support" OFF)
add_executable(racon
# Check CUDA compatibility.
if(racon_enable_cuda)
find_package(CUDA 9.0 QUIET REQUIRED)
if(NOT ${CUDA_FOUND})
message(FATAL_ERROR "CUDA not detected on system. Please install")
else()
message(STATUS "Using CUDA ${CUDA_VERSION} from ${CUDA_TOOLKIT_ROOT_DIR}")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo")
endif()
endif()
include_directories(${PROJECT_SOURCE_DIR}/src)
set(racon_sources
src/main.cpp
src/polisher.cpp
src/overlap.cpp
src/sequence.cpp
src/window.cpp)
if(racon_enable_cuda)
list(APPEND racon_sources src/cuda/cudapolisher.cpp src/cuda/cudabatch.cpp src/cuda/cudaaligner.cpp)
cuda_add_executable(racon ${racon_sources})
target_compile_definitions(racon PRIVATE CUDA_ENABLED)
else()
add_executable(racon ${racon_sources})
endif()
if (NOT TARGET bioparser)
add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
endif()
......@@ -31,8 +54,42 @@ endif()
if (NOT TARGET edlib)
add_subdirectory(vendor/edlib EXCLUDE_FROM_ALL)
endif()
if (NOT TARGET logger)
add_subdirectory(vendor/logger EXCLUDE_FROM_ALL)
endif()
if (racon_enable_cuda)
if (DEFINED CLARAGENOMICSANALYSIS_SDK_PATH)
list(APPEND CMAKE_PREFIX_PATH "${CLARAGENOMICSANALYSIS_SDK_PATH}/cmake")
find_package(cudapoa REQUIRED)
find_package(cudaaligner REQUIRED)
elseif (DEFINED CLARAGENOMICSANALYSIS_SRC_PATH)
if (NOT TARGET cudapoa)
add_subdirectory(${CLARAGENOMICSANALYSIS_SRC_PATH} ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
endif()
if (NOT TARGET cudaaligner)
add_subdirectory(${CLARAGENOMICSANALYSIS_SRC_PATH} ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
endif()
elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ClaraGenomicsAnalysis)
if (NOT TARGET cudapoa)
add_subdirectory(vendor/ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
endif()
if (NOT TARGET cudaaligner)
add_subdirectory(vendor/ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
endif()
else()
if (NOT TARGET cudapoa)
add_subdirectory(../ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
endif()
if (NOT TARGET cudaaligner)
add_subdirectory(../ClaraGenomicsAnalysis ${CMAKE_CURRENT_BINARY_DIR}/ClaraGenomicsAnalysis EXCLUDE_FROM_ALL)
endif()
endif()
endif()
target_link_libraries(racon bioparser spoa thread_pool pthread edlib_static)
target_link_libraries(racon bioparser spoa thread_pool edlib_static logger)
if (racon_enable_cuda)
target_link_libraries(racon cudapoa cudaaligner)
endif()
install(TARGETS racon DESTINATION bin)
......@@ -43,18 +100,30 @@ if (racon_build_tests)
include_directories(${PROJECT_BINARY_DIR}/config)
include_directories(${PROJECT_SOURCE_DIR}/src)
add_executable(racon_test
set(racon_test_sources
test/racon_test.cpp
src/polisher.cpp
src/overlap.cpp
src/sequence.cpp
src/window.cpp)
add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
if (racon_enable_cuda)
list(APPEND racon_test_sources src/cuda/cudapolisher.cpp src/cuda/cudabatch.cpp src/cuda/cudaaligner.cpp)
cuda_add_executable(racon_test ${racon_test_sources})
target_compile_definitions(racon_test PRIVATE CUDA_ENABLED)
else()
add_executable(racon_test ${racon_test_sources})
endif()
target_link_libraries(racon_test bioparser spoa thread_pool pthread
edlib_static gtest_main)
endif(racon_build_tests)
if (NOT TARGET gtest_main)
add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
endif()
target_link_libraries(racon_test bioparser spoa thread_pool edlib_static logger gtest_main)
if (racon_enable_cuda)
target_link_libraries(racon_test cudapoa cudaaligner)
endif()
endif()
if (racon_build_wrapper)
set(racon_path ${PROJECT_BINARY_DIR}/bin/racon)
......@@ -66,5 +135,7 @@ if (racon_build_wrapper)
FILE_PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE
WORLD_READ WORLD_EXECUTE)
add_subdirectory(vendor/rampler)
endif(racon_build_wrapper)
if (NOT TARGET rampler)
add_subdirectory(vendor/rampler)
endif()
endif()
......@@ -11,7 +11,7 @@ Racon is intended as a standalone consensus module to correct raw contigs genera
Racon can be used as a polishing tool after the assembly with **either Illumina data or data produced by third generation of sequencing**. The type of input data is detected automatically.
Racon takes as input only three files: contigs in FASTA/FASTQ format, reads in FASTA/FASTQ format and overlaps/alignments between the reads and the contigs in MHAP/PAF/SAM format. Output is a set of polished contigs in FASTA format printed to stdout. All input files **can be compressed with gzip**.
Racon takes as input only three files: contigs in FASTA/FASTQ format, reads in FASTA/FASTQ format and overlaps/alignments between the reads and the contigs in MHAP/PAF/SAM format. Output is a set of polished contigs in FASTA format printed to stdout. All input files **can be compressed with gzip** (which will have impact on parsing time).
Racon can also be used as a read error-correction tool. In this scenario, the MHAP/PAF/SAM file needs to contain pairwise overlaps between reads **including dual overlaps**.
......@@ -21,6 +21,11 @@ A **wrapper script** is also available to enable easier usage to the end-user fo
1. gcc 4.8+ or clang 3.4+
2. cmake 3.2+
### CUDA Support
1. gcc 5.0+
2. cmake 3.10+
3. CUDA 10.0+
## Installation
To install Racon run the following commands:
......@@ -43,6 +48,20 @@ To build unit tests add `-Dracon_build_tests=ON` while running `cmake`. After in
To build the wrapper script add `-Dracon_build_wrapper=ON` while running `cmake`. After installation, an executable named `racon_wrapper` (python script) will be created in `build/bin`.
### CUDA Support
Racon makes use of [NVIDIA's ClaraGenomicsAnalysis SDK](https://github.com/clara-genomics/ClaraGenomicsAnalysis) for CUDA accelerated polishing and alignment.
To build `racon` with CUDA support, add `-Dracon_enable_cuda=ON` while running `cmake`. If CUDA support is unavailable, the `cmake` step will error out.
Note that the CUDA support flag does not produce a new binary target. Instead it augments the existing `racon` binary itself.
```bash
cd build
cmake -DCMAKE_BUILD_TYPE=Release -Dracon_enable_cuda=ON ..
make
```
***Note***: Short read polishing with CUDA is still in development!
## Usage
Usage of `racon` is as following:
......@@ -90,6 +109,15 @@ Usage of `racon` is as following:
-h, --help
prints the usage
only available when built with CUDA:
-c, --cudapoa-batches
default: 1
number of batches for CUDA accelerated polishing
-b, --cuda-banded-alignment
use banding approximation for polishing on GPU. Only applicable when -c is used.
--cudaaligner-batches (experimental)
Number of batches for CUDA accelerated alignment
`racon_test` is run without any parameters.
Usage of `racon_wrapper` equals the one of `racon` with two additional parameters:
......
racon (1.4.3-1) UNRELEASED; urgency=medium
* New upstream version
* debhelper-compat 12
* Standards-Version: 4.4.0
TODO: https://salsa.debian.org/med-team/liblogger
-- Andreas Tille <tille@debian.org> Fri, 02 Aug 2019 22:44:04 +0200
racon (1.3.2-1) unstable; urgency=medium
* Team upload.
......
......@@ -4,7 +4,7 @@ Uploaders: Cédric Lood <cedric.lood@kuleuven.be>,
Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
Build-Depends: debhelper-compat (= 12),
cmake,
libgtest-dev,
libbioparser-dev,
......@@ -12,7 +12,7 @@ Build-Depends: debhelper (>= 11~),
libspoa-dev,
libthread-pool-dev,
rampler
Standards-Version: 4.3.0
Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/racon
Vcs-Git: https://salsa.debian.org/med-team/racon.git
Homepage: https://github.com/isovic/racon
......
......@@ -4,9 +4,9 @@ Description: Use Debian packaged libraries
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,20 +19,7 @@ add_executable(racon
src/sequence.cpp
src/window.cpp)
@@ -42,18 +42,6 @@ else()
add_executable(racon ${racon_sources})
endif()
-if (NOT TARGET bioparser)
- add_subdirectory(vendor/bioparser EXCLUDE_FROM_ALL)
......@@ -20,22 +20,28 @@ Description: Use Debian packaged libraries
-if (NOT TARGET edlib)
- add_subdirectory(vendor/edlib EXCLUDE_FROM_ALL)
-endif()
-
-target_link_libraries(racon bioparser spoa thread_pool pthread edlib_static)
+target_link_libraries(racon spoa thread_pool pthread edlib z)
install(TARGETS racon DESTINATION bin)
if (NOT TARGET logger)
add_subdirectory(vendor/logger EXCLUDE_FROM_ALL)
endif()
@@ -86,7 +74,7 @@ if (racon_enable_cuda)
endif()
endif()
@@ -50,10 +37,8 @@ if (racon_build_tests)
src/sequence.cpp
src/window.cpp)
-target_link_libraries(racon bioparser spoa thread_pool edlib_static logger)
+target_link_libraries(racon spoa thread_pool pthread edlib logger z)
if (racon_enable_cuda)
target_link_libraries(racon cudapoa cudaaligner)
endif()
@@ -115,11 +103,7 @@ if (racon_build_tests)
add_executable(racon_test ${racon_test_sources})
endif()
- add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
- if (NOT TARGET gtest_main)
- add_subdirectory(vendor/googletest/googletest EXCLUDE_FROM_ALL)
- endif()
-
- target_link_libraries(racon_test bioparser spoa thread_pool pthread
- edlib_static gtest_main)
+ target_link_libraries(racon_test spoa thread_pool pthread
+ edlib gtest_main gtest z)
endif(racon_build_tests)
if (racon_build_wrapper)
- target_link_libraries(racon_test bioparser spoa thread_pool edlib_static logger gtest_main)
+ target_link_libraries(racon_test spoa thread_pool pthread edlib logger gtest_main gtest z)
if (racon_enable_cuda)
target_link_libraries(racon_test cudapoa cudaaligner)
endif()
/*!
* @file cudaaligner.cpp
*
* @brief CUDABatchAligner class source file
*/
#include <cudautils/cudautils.hpp>
#include "cudaaligner.hpp"
namespace racon {
std::atomic<uint32_t> CUDABatchAligner::batches;
std::unique_ptr<CUDABatchAligner> createCUDABatchAligner(uint32_t max_query_size,
uint32_t max_target_size,
uint32_t max_alignments,
uint32_t device_id)
{
return std::unique_ptr<CUDABatchAligner>(new CUDABatchAligner(max_query_size,
max_target_size,
max_alignments,
device_id));
}
// Constructs a batch aligner backed by a claragenomics global aligner and a
// dedicated CUDA stream. Each instance receives a unique batch ID from the
// static, atomically incremented counter.
CUDABatchAligner::CUDABatchAligner(uint32_t max_query_size,
                                   uint32_t max_target_size,
                                   uint32_t max_alignments,
                                   uint32_t device_id)
    : aligner_(claragenomics::cudaaligner::create_aligner(max_query_size,
                                                          max_target_size,
                                                          max_alignments,
                                                          claragenomics::cudaaligner::AlignmentType::global,
                                                          device_id))
    , overlaps_()
    , stream_(0)
{
    // Atomic post-increment guarantees unique batch IDs even when aligners
    // are created from multiple threads.
    bid_ = CUDABatchAligner::batches++;

    // Each batch serializes its work on its own stream so that independent
    // batches can overlap execution on the GPU.
    CGA_CU_CHECK_ERR(cudaStreamCreate(&stream_));
    aligner_->set_cuda_stream(stream_);
}
// Releases the per-batch CUDA stream.
// NOTE(review): the stream is destroyed before the implicit destruction of
// aligner_; assumed safe because alignments are synced before results are
// consumed — confirm against the SDK's aligner lifetime requirements.
CUDABatchAligner::~CUDABatchAligner()
{
    CGA_CU_CHECK_ERR(cudaStreamDestroy(stream_));
}
// Queues an overlap for alignment.
//
// The query window is taken from the forward strand or, for reverse-strand
// overlaps, from the reverse complement with coordinates mirrored
// accordingly. An overlap the GPU rejects because the batch is full is NOT
// consumed (returns false); one that exceeds the aligner's length/difference
// limits is consumed but set aside for the CPU fallback path.
//
// @param overlap   Overlap to align; ownership stays with the caller.
// @param sequences Sequence database indexed by q_id_ / t_id_.
// @return True if the overlap was accepted into this batch (GPU or CPU path),
//         false if the batch is full and the caller should retry later.
bool CUDABatchAligner::addOverlap(Overlap* overlap, std::vector<std::unique_ptr<Sequence>>& sequences)
{
    // Reverse-strand query window [q_begin_, q_end_) maps to offset
    // (q_length_ - q_end_) in reverse-complement coordinates.
    const char* q = !overlap->strand_ ? &(sequences[overlap->q_id_]->data()[overlap->q_begin_]) :
        &(sequences[overlap->q_id_]->reverse_complement()[overlap->q_length_ - overlap->q_end_]);
    const char* t = &(sequences[overlap->t_id_]->data()[overlap->t_begin_]);

    claragenomics::cudaaligner::StatusType s =
        aligner_->add_alignment(q, overlap->q_end_ - overlap->q_begin_,
                                t, overlap->t_end_ - overlap->t_begin_);
    if (s == claragenomics::cudaaligner::StatusType::exceeded_max_alignments)
    {
        // Batch is full; the overlap was not consumed.
        return false;
    }
    else if (s == claragenomics::cudaaligner::StatusType::exceeded_max_alignment_difference
             || s == claragenomics::cudaaligner::StatusType::exceeded_max_length)
    {
        // Stay on the CPU for this overlap. Copy the sequence windows so the
        // CPU path does not depend on the lifetime of the pointers above.
        cpu_overlap_data_.emplace_back(std::make_pair<std::string, std::string>(std::string(q, q + overlap->q_end_ - overlap->q_begin_),
                                                                                std::string(t, t + overlap->t_end_ - overlap->t_begin_)));
        cpu_overlaps_.push_back(overlap);
    }
    else if (s != claragenomics::cudaaligner::StatusType::success)
    {
        // Unexpected status: report it, but still treat the overlap as consumed.
        fprintf(stderr, "Unknown error in cuda aligner!\n");
    }
    else
    {
        overlaps_.push_back(overlap);
    }

    return true;
}
// Launches alignment of all queued overlaps: GPU overlaps via the batched
// aligner, then the CPU-fallback overlaps synchronously on the host.
void CUDABatchAligner::alignAll()
{
    aligner_->align_all();
    compute_cpu_overlaps();
}
// Aligns, on the host, every overlap that the GPU aligner rejected for
// exceeding its length/difference limits. cpu_overlaps_ and
// cpu_overlap_data_ are parallel vectors filled together in addOverlap().
void CUDABatchAligner::compute_cpu_overlaps()
{
    for(std::size_t i = 0; i < cpu_overlaps_.size(); i++)
    {
        // Run the CPU implementation on the copied (query, target) windows.
        const std::pair<std::string, std::string>& data = cpu_overlap_data_[i];
        cpu_overlaps_[i]->align_overlaps(data.first.c_str(), data.first.length(),
                                         data.second.c_str(), data.second.length());
    }
}
// Converts finished alignments into CIGAR strings and per-window breaking
// points. Blocks until the GPU alignments for this batch are complete.
//
// @param window_length Window size used when deriving breaking points.
// @throws std::runtime_error if the aligner returned a different number of
//         alignments than overlaps submitted to the GPU path.
void CUDABatchAligner::find_breaking_points(uint32_t window_length)
{
    aligner_->sync_alignments();

    const std::vector<std::shared_ptr<claragenomics::cudaaligner::Alignment>>& alignments = aligner_->get_alignments();
    // Number of alignments should be the same as number of overlaps.
    if (overlaps_.size() != alignments.size())
    {
        throw std::runtime_error("Number of alignments doesn't match number of overlaps in cudaaligner.");
    }
    for(std::size_t a = 0; a < alignments.size(); a++)
    {
        overlaps_[a]->cigar_ = alignments[a]->convert_to_cigar();
        overlaps_[a]->find_breaking_points_from_cigar(window_length);
    }
    // CPU-fallback overlaps presumably already have cigar_ populated by
    // align_overlaps() — TODO confirm; only breaking points remain here.
    for(Overlap* overlap : cpu_overlaps_)
    {
        // Run CPU version of breaking points.
        overlap->find_breaking_points_from_cigar(window_length);
    }
}
// Clears all queued overlaps (GPU and CPU fallback) and resets the underlying
// GPU aligner so this batch object can be reused.
void CUDABatchAligner::reset()
{
    overlaps_.clear();
    cpu_overlaps_.clear();
    cpu_overlap_data_.clear();
    aligner_->reset();
}
}
/*!
* @file cudaaligner.hpp
*
* @brief CUDA aligner class header file
*/
#include "cudaaligner/cudaaligner.hpp"
#include "cudaaligner/aligner.hpp"
#include "cudaaligner/alignment.hpp"
#include "overlap.hpp"
#include "sequence.hpp"
#include <vector>
#include <atomic>
namespace racon {
class CUDABatchAligner;

// Factory for CUDABatchAligner; the constructor is protected, so this is the
// only way to create an instance.
std::unique_ptr<CUDABatchAligner> createCUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);

class CUDABatchAligner
{
    public:
        virtual ~CUDABatchAligner();

        /**
         * @brief Add a new overlap to the batch.
         *
         * @param[in] overlap  : The overlap to add to the batch.
         * @param[in] sequences: Reference to a database of sequences.
         *
         * @return True if overlap could be added to the batch.
         */
        virtual bool addOverlap(Overlap* overlap, std::vector<std::unique_ptr<Sequence>>& sequences);

        /**
         * @brief Checks if batch has any overlaps to process.
         *
         * @return True if there are overlaps in the batch.
         */
        virtual bool hasOverlaps() const {
            return overlaps_.size() > 0;
        };

        /**
         * @brief Runs batched alignment of overlaps on GPU.
         */
        virtual void alignAll();

        /**
         * @brief Find breaking points in alignments.
         */
        virtual void find_breaking_points(uint32_t window_length);

        /**
         * @brief Resets the state of the object, which includes
         *        resetting buffer states and counters.
         */
        virtual void reset();

        /**
         * @brief Get batch ID.
         */
        uint32_t getBatchID() const { return bid_; }

        // Builder function to create a new CUDABatchAligner object.
        friend std::unique_ptr<CUDABatchAligner>
        createCUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);

    protected:
        CUDABatchAligner(uint32_t max_query_size, uint32_t max_target_size, uint32_t max_alignments, uint32_t device_id);
        CUDABatchAligner(const CUDABatchAligner&) = delete;
        const CUDABatchAligner& operator=(const CUDABatchAligner&) = delete;

        // CPU fallback alignment for overlaps the GPU aligner rejected.
        void compute_cpu_overlaps();

        // GPU aligner instance from the ClaraGenomicsAnalysis SDK.
        std::unique_ptr<claragenomics::cudaaligner::Aligner> aligner_;

        // Overlaps queued for GPU alignment.
        std::vector<Overlap*> overlaps_;
        // Overlaps deferred to the CPU fallback path.
        std::vector<Overlap*> cpu_overlaps_;
        // Copied (query, target) substrings for the CPU fallback overlaps.
        std::vector<std::pair<std::string, std::string>> cpu_overlap_data_;

        // Static batch count used to generate batch IDs.
        static std::atomic<uint32_t> batches;

        // Batch ID.
        uint32_t bid_ = 0;

        // CUDA stream for batch.
        cudaStream_t stream_;
};
}
/*!
* @file cudabatch.cpp
*
* @brief CUDABatch class source file
*/
#include <string>
#include <iostream>
#include <cstring>
#include <algorithm>
#include "cudabatch.hpp"
#include "cudautils.hpp"
#include "spoa/spoa.hpp"
#include <cudautils/cudautils.hpp>
namespace racon {
std::atomic<uint32_t> CUDABatchProcessor::batches;
std::unique_ptr<CUDABatchProcessor> createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment)
{
return std::unique_ptr<CUDABatchProcessor>(new CUDABatchProcessor(max_windows, max_window_depth, device, gap, mismatch, match, cuda_banded_alignment));
}
// Constructs a batch processor wrapping a cudapoa batch on the given device,
// with a dedicated CUDA stream. Each instance receives a unique batch ID from
// the static, atomically incremented counter.
CUDABatchProcessor::CUDABatchProcessor(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment)
    : max_windows_(max_windows)
    , cudapoa_batch_(claragenomics::cudapoa::create_batch(max_windows, max_window_depth, device, claragenomics::cudapoa::OutputType::consensus, gap, mismatch, match, cuda_banded_alignment))
    , windows_()
    , seqs_added_per_window_()
{
    bid_ = CUDABatchProcessor::batches++;

    // Create new CUDA stream; all cudapoa work for this batch runs on it.
    CGA_CU_CHECK_ERR(cudaStreamCreate(&stream_));
    cudapoa_batch_->set_cuda_stream(stream_);
}
// Releases the per-batch CUDA stream.
// NOTE(review): the stream is destroyed before the implicit destruction of
// cudapoa_batch_ — assumed safe; confirm against the SDK's batch lifetime
// requirements.
CUDABatchProcessor::~CUDABatchProcessor()
{
    // Destroy CUDA stream.
    CGA_CU_CHECK_ERR(cudaStreamDestroy(stream_));
}
// Adds a window to the batch, up to the configured capacity.
//
// @param window Window to queue for consensus generation.
// @return True if the window was accepted, false if the batch is full.
bool CUDABatchProcessor::addWindow(std::shared_ptr<Window> window)
{
    // Reject the window once the batch has reached its capacity.
    if (windows_.size() >= max_windows_)
    {
        return false;
    }

    windows_.push_back(window);
    // Sequence counter for this window starts at zero; incremented as
    // sequences are successfully added to its POA.
    seqs_added_per_window_.push_back(0);
    return true;
}
// Reports whether any windows are queued in this batch.
bool CUDABatchProcessor::hasWindows() const
{
    return !windows_.empty();
}
// Converts a PHRED+33 quality string into per-base weights by stripping the
// ASCII offset. The output vector is cleared first, then filled with one
// weight per input character.
void CUDABatchProcessor::convertPhredQualityToWeights(const char* qual,
                                                      uint32_t qual_length,
                                                      std::vector<int8_t>& weights)
{
    weights.clear();
    const char* end = qual + qual_length;
    for(const char* q = qual; q != end; ++q)
    {
        // PHRED+33 encoding: subtract 33 to recover the quality score.
        weights.push_back(static_cast<uint8_t>(*q) - 33);
    }
}
// Adds a single sequence (with optional PHRED qualities as weights) to the
// most recently added POA in the cudapoa batch.
//
// @param seq       (pointer, length) pair for the sequence characters.
// @param qualities (pointer, length) pair for the quality string;
//                  first == nullptr means no qualities are available and
//                  nullptr weights are passed to cudapoa.
// @return Status reported by cudapoa for the add operation.
claragenomics::cudapoa::StatusType CUDABatchProcessor::addSequenceToPoa(std::pair<const char*, uint32_t>& seq,
                                                                        std::pair<const char*, uint32_t>& qualities)
{
    // Add sequences to latest poa in batch.
    std::vector<int8_t> weights;
    claragenomics::cudapoa::StatusType status = claragenomics::cudapoa::StatusType::success;
    if (qualities.first == nullptr)
    {
        status = cudapoa_batch_->add_seq_to_poa(seq.first, nullptr, seq.second);
    }
    else
    {
        // Translate PHRED+33 qualities into int8 weights before adding.
        convertPhredQualityToWeights(qualities.first, qualities.second, weights);
        status = cudapoa_batch_->add_seq_to_poa(seq.first, weights.data(), seq.second);
    }
    return status;
}
// Transfers all queued windows into the cudapoa batch: for every window a new
// POA is created, its first sequence is added as the backbone, and the
// remaining sequences are added in increasing order of start position.
// Sequences rejected by cudapoa for exceeding size limits are skipped
// (counted and reported in debug builds); any other failure is fatal.
//
// Fixes vs. previous revision: removed the unused local
// `std::vector<uint8_t> weights;` and corrected the debug fprintf format
// specifiers (`%d` -> `%u`) to match the uint32_t arguments.
void CUDABatchProcessor::generateMemoryMap()
{
    auto num_windows = windows_.size();
    for(uint32_t w = 0; w < num_windows; w++)
    {
        // Add new poa
        claragenomics::cudapoa::StatusType status = cudapoa_batch_->add_poa();
        if (status != claragenomics::cudapoa::StatusType::success)
        {
            fprintf(stderr, "Failed to add new POA to batch %d.\n",
                    cudapoa_batch_->batch_id());
            exit(1);
        }

        std::shared_ptr<Window> window = windows_.at(w);
        uint32_t num_seqs = window->sequences_.size();

        // Add first sequence as backbone to graph.
        std::pair<const char*, uint32_t> seq = window->sequences_.front();
        std::pair<const char*, uint32_t> qualities = window->qualities_.front();
        status = addSequenceToPoa(seq, qualities);
        if (status != claragenomics::cudapoa::StatusType::success)
        {
            fprintf(stderr, "Could not add backbone to window. Fatal error.\n");
            exit(1);
        }

        // Add the rest of the sequences in sorted order of starting positions.
        std::vector<uint32_t> rank;
        rank.reserve(window->sequences_.size());

        for (uint32_t i = 0; i < num_seqs; ++i) {
            rank.emplace_back(i);
        }

        // Keep the backbone (index 0) first; sort only the remaining indices.
        std::sort(rank.begin() + 1, rank.end(), [&](uint32_t lhs, uint32_t rhs) {
            return window->positions_[lhs].first < window->positions_[rhs].first; });

        // Start from index 1 since first sequence has already been added as backbone.
        uint32_t long_seq = 0;
        uint32_t skipped_seq = 0;
        for(uint32_t j = 1; j < num_seqs; j++)
        {
            uint32_t i = rank.at(j);
            seq = window->sequences_.at(i);
            qualities = window->qualities_.at(i);
            // Add sequences to latest poa in batch.
            status = addSequenceToPoa(seq, qualities);
            if (status == claragenomics::cudapoa::StatusType::exceeded_maximum_sequence_size)
            {
                long_seq++;
                continue;
            }
            else if (status == claragenomics::cudapoa::StatusType::exceeded_maximum_sequences_per_poa)
            {
                skipped_seq++;
                continue;
            }
            else if (status != claragenomics::cudapoa::StatusType::success)
            {
                fprintf(stderr, "Could not add sequence to POA in batch %d.\n",
                        cudapoa_batch_->batch_id());
                exit(1);
            }
            // Track how many sequences actually made it into this window's
            // POA; used later for coverage-based consensus trimming.
            seqs_added_per_window_[w] = seqs_added_per_window_[w] + 1;
        }

#ifndef NDEBUG
        if (long_seq > 0)
        {
            fprintf(stderr, "Too long (%u / %u)\n", long_seq, num_seqs);
        }
        if (skipped_seq > 0)
        {
            fprintf(stderr, "Skipped (%u / %u)\n", skipped_seq, num_seqs);
        }
#endif
    }
}
// Launches POA generation on the GPU for all POAs currently in the batch.
void CUDABatchProcessor::generatePOA()
{
    // call generate poa function
    cudapoa_batch_->generate_poa();
}
// Retrieves consensus strings for every window in the batch and stores them
// on the Window objects. Appends one success flag per window to
// window_consensus_status_; windows flagged false are left for the CPU
// polisher to redo.
void CUDABatchProcessor::getConsensus()
{
    std::vector<std::string> consensuses;
    std::vector<std::vector<uint16_t>> coverages;
    std::vector<claragenomics::cudapoa::StatusType> output_status;
    cudapoa_batch_->get_consensus(consensuses, coverages, output_status);
    for(uint32_t i = 0; i < windows_.size(); i++)
    {
        auto window = windows_.at(i);
        if (output_status.at(i) != claragenomics::cudapoa::StatusType::success)
        {
            // leave the failure cases to CPU polisher
            window_consensus_status_.emplace_back(false);
        }
        else
        {
            // This is a special case borrowed from the CPU version.
            // TODO: We still run this case through the GPU, but could take it out.
            if (window->sequences_.size() < 3)
            {
                // Fewer than 3 sequences: the backbone itself is the consensus.
                window->consensus_ = std::string(window->sequences_.front().first,
                                                 window->sequences_.front().second);

                // This status is borrowed from the CPU version which considers this
                // a failed consensus. All other cases are true.
                window_consensus_status_.emplace_back(false);
            }
            else
            {
                window->consensus_ = consensuses.at(i);
                if (window->type_ == WindowType::kTGS)
                {
                    // Trim low-coverage ends: advance from both ends until
                    // coverage reaches half the number of sequences actually
                    // added to this window's POA.
                    uint32_t num_seqs_in_window = seqs_added_per_window_[i];
                    uint32_t average_coverage = num_seqs_in_window / 2;

                    int32_t begin = 0, end = window->consensus_.size() - 1;
                    for (; begin < static_cast<int32_t>( window->consensus_.size()); ++begin) {
                        if (coverages.at(i).at(begin) >= average_coverage) {
                            break;
                        }
                    }
                    for (; end >= 0; --end) {
                        if (coverages.at(i).at(end) >= average_coverage) {
                            break;
                        }
                    }

                    if (begin >= end) {
                        // No well-covered interior remains; keep the untrimmed
                        // consensus and warn.
                        fprintf(stderr, "[CUDABatchProcessor] warning: "
                                "contig might be chimeric in window %lu!\n", window->id_);
                    } else {
                        window->consensus_ = window->consensus_.substr(begin, end - begin + 1);
                    }
                }
                window_consensus_status_.emplace_back(true);
            }
        }
    }
}
// Runs the full consensus pipeline for the batch: map windows into the
// cudapoa batch, launch POA generation, then collect the results.
//
// @return Per-window success flags, parallel to the windows added via addWindow().
const std::vector<bool>& CUDABatchProcessor::generateConsensus()
{
    // Generate consensus for all windows in the batch
    generateMemoryMap();
    generatePOA();
    getConsensus();

    return window_consensus_status_;
}
// Clears all per-batch state (windows, statuses, per-window counters) and
// resets the underlying cudapoa batch so this object can be reused.
void CUDABatchProcessor::reset()
{
    windows_.clear();
    window_consensus_status_.clear();
    seqs_added_per_window_.clear();
    cudapoa_batch_->reset();
}
} // namespace racon
/*!
* @file cudabatch.hpp
*
* @brief CUDA batch class header file
*/
#pragma once
#include <memory>
#include <cuda_runtime_api.h>
#include <atomic>
#include "window.hpp"
#include "cudapoa/batch.hpp"
namespace spoa {
class AlignmentEngine;
}
namespace racon {
class Window;

class CUDABatchProcessor;

// Factory for CUDABatchProcessor; the constructor is protected, so this is
// the only way to create an instance.
std::unique_ptr<CUDABatchProcessor> createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);

class CUDABatchProcessor
{
    public:
        ~CUDABatchProcessor();

        /**
         * @brief Add a new window to the batch.
         *
         * @param[in] window : The window to add to the batch.
         *
         * @return True if the window could be added to the batch.
         */
        bool addWindow(std::shared_ptr<Window> window);

        /**
         * @brief Checks if batch has any windows to process.
         */
        bool hasWindows() const;

        /**
         * @brief Runs the core computation to generate consensus for
         *        all windows in the batch.
         *
         * @return Vector of bool indicating successful generation of consensus
         *         for each window in the batch.
         */
        const std::vector<bool>& generateConsensus();

        /**
         * @brief Resets the state of the object, which includes
         *        resetting buffer states and counters.
         */
        void reset();

        /**
         * @brief Get batch ID.
         */
        uint32_t getBatchID() const { return bid_; }

        // Builder function to create a new CUDABatchProcessor object.
        friend std::unique_ptr<CUDABatchProcessor>
        createCUDABatch(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);

    protected:
        /**
         * @brief Constructor for CUDABatch class.
         *
         * @param[in] max_windows           : Maximum number of windows in batch
         * @param[in] max_window_depth      : Maximum number of sequences per window
         * @param[in] device                : GPU device ID to run this batch on
         * @param[in] gap                   : Gap score passed to cudapoa
         * @param[in] mismatch              : Mismatch score passed to cudapoa
         * @param[in] match                 : Match score passed to cudapoa
         * @param[in] cuda_banded_alignment : Use banded POA alignment
         */
        CUDABatchProcessor(uint32_t max_windows, uint32_t max_window_depth, uint32_t device, int8_t gap, int8_t mismatch, int8_t match, bool cuda_banded_alignment);
        CUDABatchProcessor(const CUDABatchProcessor&) = delete;
        const CUDABatchProcessor& operator=(const CUDABatchProcessor&) = delete;

        /*
         * @brief Process all the windows and re-map them into
         *        memory for more efficient processing in the CUDA
         *        kernels.
         */
        void generateMemoryMap();

        /*
         * @brief Run the CUDA kernel for generating POA on the batch.
         *        This call is asynchronous.
         */
        void generatePOA();

        /*
         * @brief Wait for execution to complete and grab the output
         *        consensus from the device.
         */
        void getConsensus();

        /*
         * @brief Convert PHRED quality scores to weights.
         */
        void convertPhredQualityToWeights(const char* qual,
                                          uint32_t qual_length,
                                          std::vector<int8_t>& weights);

        /*
         * @brief Add sequence and qualities to cudapoa.
         */
        claragenomics::cudapoa::StatusType addSequenceToPoa(std::pair<const char*, uint32_t>& seq,
                                                            std::pair<const char*, uint32_t>& quality);

    protected:
        // Static batch count used to generate batch IDs.
        static std::atomic<uint32_t> batches;

        // Batch ID.
        uint32_t bid_ = 0;

        // Maximum number of windows allowed in batch.
        uint32_t max_windows_;

        // CUDA-POA library object that manages POA batch.
        std::unique_ptr<claragenomics::cudapoa::Batch> cudapoa_batch_;

        // Stream for running POA batch.
        cudaStream_t stream_;

        // Windows belonging to the batch.
        std::vector<std::shared_ptr<Window>> windows_;

        // Consensus generation status for each window.
        std::vector<bool> window_consensus_status_;

        // Number of sequences actually added per window.
        std::vector<uint32_t> seqs_added_per_window_;
};
} // namespace racon
/*!
* @file cudapolisher.cpp
*
* @brief CUDA Polisher class source file
*/
#include <future>
#include <iostream>
#include <chrono>
#include <cuda_profiler_api.h>
#include "sequence.hpp"
#include "cudapolisher.hpp"
#include <cudautils/cudautils.hpp>
#include "bioparser/bioparser.hpp"
#include "logger/logger.hpp"
namespace racon {
// The logger used by racon has a fixed size of 20 bins
// which is used for the progress bar updates. Hence all
// updates need to be broken into 20 bins.
const uint32_t RACON_LOGGER_BIN_SIZE = 20;
// Constructs a CUDA-enabled polisher. Forwards all parsing/polishing
// parameters to the base Polisher, initializes the cudapoa/cudaaligner
// libraries, and warms up every visible GPU device.
//
// @throws std::runtime_error if no CUDA devices are found.
CUDAPolisher::CUDAPolisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
    std::unique_ptr<bioparser::Parser<Overlap>> oparser,
    std::unique_ptr<bioparser::Parser<Sequence>> tparser,
    PolisherType type, uint32_t window_length, double quality_threshold,
    double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
    uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
    uint32_t cudaaligner_batches)
        : Polisher(std::move(sparser), std::move(oparser), std::move(tparser),
                type, window_length, quality_threshold,
                error_threshold, match, mismatch, gap, num_threads)
        , cudapoa_batches_(cudapoa_batches)
        , cudaaligner_batches_(cudaaligner_batches)
        , gap_(gap)
        , mismatch_(mismatch)
        , match_(match)
        , cuda_banded_alignment_(cuda_banded_alignment)
{
    claragenomics::cudapoa::Init();
    claragenomics::cudaaligner::Init();

    CGA_CU_CHECK_ERR(cudaGetDeviceCount(&num_devices_));
    if (num_devices_ < 1)
    {
        throw std::runtime_error("No GPU devices found.");
    }

    std::cerr << "Using " << num_devices_ << " GPU(s) to perform polishing" << std::endl;

    // Run dummy call on each device to initialize CUDA context.
    // cudaFree(0) forces lazy context creation up front so later work is not
    // skewed by first-use initialization cost.
    for(int32_t dev_id = 0; dev_id < num_devices_; dev_id++)
    {
        std::cerr << "Initialize device " << dev_id << std::endl;
        CGA_CU_CHECK_ERR(cudaSetDevice(dev_id));
        CGA_CU_CHECK_ERR(cudaFree(0));
    }

    std::cerr << "[CUDAPolisher] Constructed." << std::endl;
}
// Waits for all outstanding device work to finish and stops the CUDA
// profiler before teardown.
CUDAPolisher::~CUDAPolisher()
{
    cudaDeviceSynchronize();
    cudaProfilerStop();
}
// Distributes `batches` across `gpus` as evenly as possible: every GPU gets
// floor(batches / gpus) batches, and the first (batches % gpus) GPUs get one
// extra.
std::vector<uint32_t> CUDAPolisher::calculate_batches_per_gpu(uint32_t batches, uint32_t gpus)
{
    const uint32_t base = batches / gpus;
    const uint32_t remainder = batches % gpus;

    std::vector<uint32_t> batches_per_gpu;
    batches_per_gpu.reserve(gpus);
    for(uint32_t i = 0; i < gpus; ++i)
    {
        batches_per_gpu.push_back(base + (i < remainder ? 1 : 0));
    }
    return batches_per_gpu;
}
// Computes breaking points for all overlaps. With no CUDA aligner batches
// configured, defers to the CPU implementation in the base class; otherwise
// distributes overlaps across GPU batch aligners driven by the thread pool.
//
// Fix vs. previous revision: `window_idx / logger_step` was executed even when
// logger_step == 0 (i.e. overlaps.size() < RACON_LOGGER_BIN_SIZE), which is an
// integer division by zero; the division is now guarded.
void CUDAPolisher::find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps)
{
    if (cudaaligner_batches_ < 1)
    {
        // TODO: Kept CPU overlap alignment right now while GPU is a dummy implementation.
        Polisher::find_overlap_breaking_points(overlaps);
    }
    else
    {
        // TODO: Experimentally this is giving decent perf
        const uint32_t MAX_ALIGNMENTS = 50;

        logger_->log();

        std::mutex mutex_overlaps;
        uint32_t next_overlap_index = 0;

        // Lambda expression for filling up next batch of alignments.
        // Returns the [first, last) range of overlap indices consumed.
        auto fill_next_batch = [&mutex_overlaps, &next_overlap_index, &overlaps, this](CUDABatchAligner* batch) -> std::pair<uint32_t, uint32_t> {
            batch->reset();

            // Use mutex to read the vector containing windows in a threadsafe manner.
            std::lock_guard<std::mutex> guard(mutex_overlaps);

            uint32_t initial_count = next_overlap_index;
            uint32_t count = overlaps.size();
            while(next_overlap_index < count)
            {
                if (batch->addOverlap(overlaps.at(next_overlap_index).get(), sequences_))
                {
                    next_overlap_index++;
                }
                else
                {
                    // Batch is full; remaining overlaps go to the next fill.
                    break;
                }
            }
            return {initial_count, next_overlap_index};
        };

        // Variables for keeping track of logger progress bar.
        uint32_t logger_step = overlaps.size() / RACON_LOGGER_BIN_SIZE;
        int32_t log_bar_idx = 0, log_bar_idx_prev = -1;
        uint32_t window_idx = 0;
        std::mutex mutex_log_bar_idx;

        // Lambda expression for processing a batch of alignments: fill the
        // batch, align on GPU, derive breaking points, update the progress
        // bar; exits when no overlaps remain.
        auto process_batch = [&fill_next_batch, &logger_step, &log_bar_idx, &log_bar_idx_prev, &window_idx, &mutex_log_bar_idx, this](CUDABatchAligner* batch) -> void {
            while(true)
            {
                auto range = fill_next_batch(batch);
                if (batch->hasOverlaps())
                {
                    // Launch workload.
                    batch->alignAll();
                    batch->find_breaking_points(window_length_);

                    // logging bar
                    {
                        std::lock_guard<std::mutex> guard(mutex_log_bar_idx);
                        window_idx += range.second - range.first;
                        // Guard against division by zero when there are fewer
                        // overlaps than logger bins (logger_step == 0).
                        log_bar_idx = (logger_step == 0) ? 0 : window_idx / logger_step;
                        if (log_bar_idx == log_bar_idx_prev) {
                            continue;
                        }
                        else if (logger_step != 0 && log_bar_idx < static_cast<int32_t>(RACON_LOGGER_BIN_SIZE))
                        {
                            logger_->bar("[racon::CUDAPolisher::initialize] aligning overlaps");
                            std::cerr<<std::endl;
                            log_bar_idx_prev = log_bar_idx;
                        }
                    }
                }
                else
                {
                    break;
                }
            }
        };

        // Bin batches into each GPU.
        std::vector<uint32_t> batches_per_gpu = calculate_batches_per_gpu(cudaaligner_batches_, num_devices_);

        for(int32_t device = 0; device < num_devices_; device++)
        {
            for(uint32_t batch = 0; batch < batches_per_gpu.at(device); batch++)
            {
                batch_aligners_.emplace_back(createCUDABatchAligner(10000, 10000, MAX_ALIGNMENTS, device));
            }
        }

        // Run batched alignment.
        std::vector<std::future<void>> thread_futures;
        for(auto& aligner : batch_aligners_)
        {
            thread_futures.emplace_back(
                thread_pool_->submit(
                    process_batch,
                    aligner.get()
                )
            );
        }

        // Wait for threads to finish, and collect their results.
        for (const auto& future : thread_futures) {
            future.wait();
        }

        batch_aligners_.clear();
    }
}
void CUDAPolisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
bool drop_unpolished_sequences)
{
if (cudapoa_batches_ < 1)
{
Polisher::polish(dst, drop_unpolished_sequences);
}
else
{
// Creation and use of batches.
const uint32_t MAX_WINDOWS = 256;
const uint32_t MAX_DEPTH_PER_WINDOW = 200;
// Bin batches into each GPU.
std::vector<uint32_t> batches_per_gpu = calculate_batches_per_gpu(cudapoa_batches_, num_devices_);
for(int32_t device = 0; device < num_devices_; device++)
{
for(uint32_t batch = 0; batch < batches_per_gpu.at(device); batch++)
{
batch_processors_.emplace_back(createCUDABatch(MAX_WINDOWS, MAX_DEPTH_PER_WINDOW, device, gap_, mismatch_, match_, cuda_banded_alignment_));
}
}
logger_->log("[racon::CUDAPolisher::polish] allocated memory on GPUs");
// Mutex for accessing the vector of windows.
std::mutex mutex_windows;
// Initialize window consensus statuses.
window_consensus_status_.resize(windows_.size(), false);
// Index of next window to be added to a batch.
uint32_t next_window_index = 0;
// Lambda function for adding windows to batches.
auto fill_next_batch = [&mutex_windows, &next_window_index, this](CUDABatchProcessor* batch) -> std::pair<uint32_t, uint32_t> {
batch->reset();
// Use mutex to read the vector containing windows in a threadsafe manner.
std::lock_guard<std::mutex> guard(mutex_windows);
// TODO: Reducing window wize by 10 for debugging.
uint32_t initial_count = next_window_index;
uint32_t count = windows_.size();
while(next_window_index < count)
{
if (batch->addWindow(windows_.at(next_window_index)))
{
next_window_index++;
}
else
{
break;
}
}
return {initial_count, next_window_index};
};
// Variables for keeping track of logger progress bar.
uint32_t logger_step = windows_.size() / RACON_LOGGER_BIN_SIZE;
int32_t log_bar_idx = 0, log_bar_idx_prev = -1;
uint32_t window_idx = 0;
std::mutex mutex_log_bar_idx;
logger_->log();
// Lambda function for processing each batch.
auto process_batch = [&fill_next_batch, &logger_step, &log_bar_idx, &mutex_log_bar_idx, &window_idx, &log_bar_idx_prev, this](CUDABatchProcessor* batch) -> void {
while(true)
{
std::pair<uint32_t, uint32_t> range = fill_next_batch(batch);
if (batch->hasWindows())
{
// Launch workload.
const std::vector<bool>& results = batch->generateConsensus();
// Check if the number of batches processed is same as the range of
// of windows that were added.
if (results.size() != (range.second - range.first))
{
throw std::runtime_error("Windows processed doesn't match \
range of windows passed to batch\n");
}
// Copy over the results from the batch into the per window
// result vector of the CUDAPolisher.
for(uint32_t i = 0; i < results.size(); i++)
{
window_consensus_status_.at(range.first + i) = results.at(i);
}
// logging bar
{
std::lock_guard<std::mutex> guard(mutex_log_bar_idx);
window_idx += results.size();
log_bar_idx = window_idx / logger_step;
if (log_bar_idx == log_bar_idx_prev) {
continue;
}
else if (logger_step != 0 && log_bar_idx < static_cast<int32_t>(RACON_LOGGER_BIN_SIZE))
{
logger_->bar("[racon::CUDAPolisher::polish] generating consensus");
std::cerr<<std::endl;
log_bar_idx_prev = log_bar_idx;
}
}
}
else
{
break;
}
}
};
// Process each of the batches in a separate thread.
std::vector<std::future<void>> thread_futures;
for(auto& batch_processor : batch_processors_)
{
thread_futures.emplace_back(
thread_pool_->submit(
process_batch,
batch_processor.get()
)
);
}
// Wait for threads to finish, and collect their results.
for (const auto& future : thread_futures) {
future.wait();
}
logger_->log("[racon::CUDAPolisher::polish] polished windows on GPU");
// Start timing CPU time for failed windows on GPU
logger_->log();
// Process each failed windows in parallel on CPU
std::vector<std::future<bool>> thread_failed_windows;
for (uint64_t i = 0; i < windows_.size(); ++i) {
if (window_consensus_status_.at(i) == false)
{
thread_failed_windows.emplace_back(thread_pool_->submit(
[&](uint64_t j) -> bool {
auto it = thread_to_id_.find(std::this_thread::get_id());
if (it == thread_to_id_.end()) {
fprintf(stderr, "[racon::CUDAPolisher::polish] error: "
"thread identifier not present!\n");
exit(1);
}
return window_consensus_status_.at(j) = windows_[j]->generate_consensus(
alignment_engines_[it->second]);
}, i));
}
}
// Wait for threads to finish, and collect their results.
for (const auto& t : thread_failed_windows) {
t.wait();
}
if (thread_failed_windows.size() > 0)
{
logger_->log("[racon::CUDAPolisher::polish] polished remaining windows on CPU");
logger_->log();
}
// Collect results from all windows into final output.
std::string polished_data = "";
uint32_t num_polished_windows = 0;
for (uint64_t i = 0; i < windows_.size(); ++i) {
num_polished_windows += window_consensus_status_.at(i) == true ? 1 : 0;
polished_data += windows_[i]->consensus();
if (i == windows_.size() - 1 || windows_[i + 1]->rank() == 0) {
double polished_ratio = num_polished_windows /
static_cast<double>(windows_[i]->rank() + 1);
if (!drop_unpolished_sequences || polished_ratio > 0) {
std::string tags = type_ == PolisherType::kF ? "r" : "";
tags += " LN:i:" + std::to_string(polished_data.size());
tags += " RC:i:" + std::to_string(targets_coverages_[windows_[i]->id()]);
tags += " XC:f:" + std::to_string(polished_ratio);
dst.emplace_back(createSequence(sequences_[windows_[i]->id()]->name() +
tags, polished_data));
}
num_polished_windows = 0;
polished_data.clear();
}
windows_[i].reset();
}
logger_->log("[racon::CUDAPolisher::polish] generated consensus");
// Clear POA processors.
batch_processors_.clear();
}
}
}
/*!
* @file cudapolisher.hpp
*
* @brief CUDA Polisher class header file
*/
#pragma once
#include <mutex>
#include "polisher.hpp"
#include "cudabatch.hpp"
#include "cudaaligner.hpp"
#include "thread_pool/thread_pool.hpp"
namespace racon {
/*!
 * @brief CUDA-accelerated Polisher. Overrides window polishing and overlap
 * breaking-point discovery to run in batches on one or more GPU devices,
 * falling back to the base-class CPU paths when no batches are configured.
 */
class CUDAPolisher : public Polisher {
public:
    ~CUDAPolisher();

    // Polishes windows and appends consensus sequences to dst
    // (see Polisher::polish for the contract).
    virtual void polish(std::vector<std::unique_ptr<Sequence>>& dst,
        bool drop_unpolished_sequences) override;

    // The factory is a friend so it can invoke the protected constructor.
    friend std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
        const std::string& overlaps_path, const std::string& target_path,
        PolisherType type, uint32_t window_length, double quality_threshold,
        double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
        uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
        uint32_t cudaaligner_batches);

protected:
    // Construction only through createPolisher().
    CUDAPolisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
        std::unique_ptr<bioparser::Parser<Overlap>> oparser,
        std::unique_ptr<bioparser::Parser<Sequence>> tparser,
        PolisherType type, uint32_t window_length, double quality_threshold,
        double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
        uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
        uint32_t cudaaligner_batches);
    CUDAPolisher(const CUDAPolisher&) = delete;
    const CUDAPolisher& operator=(const CUDAPolisher&) = delete;

    // Computes overlap breaking points on the GPU aligner batches.
    virtual void find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps) override;

    // Splits the requested number of batches evenly across the given GPUs.
    static std::vector<uint32_t> calculate_batches_per_gpu(uint32_t cudapoa_batches, uint32_t gpus);

    // Vector of POA batches.
    std::vector<std::unique_ptr<CUDABatchProcessor>> batch_processors_;

    // Vector of aligner batches.
    std::vector<std::unique_ptr<CUDABatchAligner>> batch_aligners_;

    // Vector of bool indicating consensus generation status for each window.
    std::vector<bool> window_consensus_status_;

    // Number of batches for POA processing.
    uint32_t cudapoa_batches_;

    // Number of batches for alignment processing.
    uint32_t cudaaligner_batches_;

    // Number of GPU devices to run with.
    int32_t num_devices_;

    // State transition scores.
    int8_t gap_;
    int8_t mismatch_;
    int8_t match_;

    // Use banded POA alignment.
    bool cuda_banded_alignment_;
};
}
// Implementation file for CUDA POA utilities.
#pragma once
#include <stdlib.h>

#include <cstdio>
#include <string>

#include <cuda_runtime_api.h>
namespace racon {
// Checks the last CUDA runtime error and terminates the process with a
// diagnostic message when one occurred.
//
// \param msg  context string printed alongside the CUDA error description.
//
// Declared `inline` because this definition lives in a header that may be
// included from multiple translation units; without it every including TU
// emits its own external definition and linking fails (ODR violation).
// The parameter is taken by const reference so callers may pass temporaries
// and the message is never modified.
inline void cudaCheckError(const std::string& msg)
{
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess)
    {
        fprintf(stderr, "%s (CUDA error %s)\n", msg.c_str(), cudaGetErrorString(error));
        exit(-1);
    }
}
} // namespace racon
......@@ -8,8 +8,12 @@
#include "sequence.hpp"
#include "polisher.hpp"
#ifdef CUDA_ENABLED
#include "cuda/cudapolisher.hpp"
#endif
static const char* version = "v1.3.2";
static const char* version = "v1.4.3";
static const int32_t CUDAALIGNER_INPUT_CODE = 10000;
static struct option options[] = {
{"include-unpolished", no_argument, 0, 'u'},
......@@ -23,6 +27,11 @@ static struct option options[] = {
{"threads", required_argument, 0, 't'},
{"version", no_argument, 0, 'v'},
{"help", no_argument, 0, 'h'},
#ifdef CUDA_ENABLED
{"cudapoa-batches", optional_argument, 0, 'c'},
{"cuda-banded-alignment", no_argument, 0, 'b'},
{"cudaaligner-batches", required_argument, 0, CUDAALIGNER_INPUT_CODE},
#endif
{0, 0, 0, 0}
};
......@@ -44,8 +53,17 @@ int main(int argc, char** argv) {
bool drop_unpolished_sequences = true;
uint32_t num_threads = 1;
char argument;
while ((argument = getopt_long(argc, argv, "ufw:q:e:m:x:g:t:h", options, nullptr)) != -1) {
uint32_t cudapoa_batches = 0;
uint32_t cudaaligner_batches = 0;
bool cuda_banded_alignment = false;
std::string optstring = "ufw:q:e:m:x:g:t:h";
#ifdef CUDA_ENABLED
optstring += "bc::";
#endif
int32_t argument;
while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1) {
switch (argument) {
case 'u':
drop_unpolished_sequences = false;
......@@ -80,6 +98,27 @@ int main(int argc, char** argv) {
case 'h':
help();
exit(0);
#ifdef CUDA_ENABLED
case 'c':
//if option c encountered, cudapoa_batches initialized with a default value of 1.
cudapoa_batches = 1;
// next text entry is not an option, assuming it's the arg for option 'c'
if (optarg == NULL && argv[optind] != NULL
&& argv[optind][0] != '-') {
cudapoa_batches = atoi(argv[optind++]);
}
// optional argument provided in the usual way
if (optarg != NULL) {
cudapoa_batches = atoi(optarg);
}
break;
case 'b':
cuda_banded_alignment = true;
break;
case CUDAALIGNER_INPUT_CODE: // cudaaligner-batches
cudaaligner_batches = atoi(optarg);
break;
#endif
default:
exit(1);
}
......@@ -98,7 +137,8 @@ int main(int argc, char** argv) {
auto polisher = racon::createPolisher(input_paths[0], input_paths[1],
input_paths[2], type == 0 ? racon::PolisherType::kC :
racon::PolisherType::kF, window_length, quality_threshold,
error_threshold, match, mismatch, gap, num_threads);
error_threshold, match, mismatch, gap, num_threads,
cudapoa_batches, cuda_banded_alignment, cudaaligner_batches);
polisher->initialize();
......@@ -156,5 +196,16 @@ void help() {
" --version\n"
" prints the version number\n"
" -h, --help\n"
" prints the usage\n");
" prints the usage\n"
#ifdef CUDA_ENABLED
" -c, --cudapoa-batches\n"
" default: 1\n"
" number of batches for CUDA accelerated polishing\n"
" -b, --cuda-banded-alignment\n"
" use banding approximation for alignment on GPU\n"
" --cudaaligner-batches (experimental)\n"
" Number of batches for CUDA accelerated alignment\n"
#endif
);
}
......@@ -190,29 +190,41 @@ void Overlap::find_breaking_points(const std::vector<std::unique_ptr<Sequence>>&
}
if (cigar_.empty()) {
// align overlaps with edlib
const char* q = !strand_ ? &(sequences[q_id_]->data()[q_begin_]) :
&(sequences[q_id_]->reverse_complement()[q_length_ - q_end_]);
const char* t = &(sequences[t_id_]->data()[t_begin_]);
EdlibAlignResult result = edlibAlign(q, q_end_ - q_begin_, t, t_end_ -
t_begin_, edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_PATH,
nullptr, 0));
align_overlaps(q, q_end_ - q_begin_, t, t_end_ - t_begin_);
}
find_breaking_points_from_cigar(window_length);
std::string().swap(cigar_);
}
void Overlap::align_overlaps(const char* q, uint32_t q_length, const char* t, uint32_t t_length)
{
// align overlaps with edlib
EdlibAlignResult result = edlibAlign(q, q_length, t, t_length,
edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_PATH,
nullptr, 0));
if (result.status == EDLIB_STATUS_OK) {
char* cigar = edlibAlignmentToCigar(result.alignment,
if (result.status == EDLIB_STATUS_OK) {
char* cigar = edlibAlignmentToCigar(result.alignment,
result.alignmentLength, EDLIB_CIGAR_STANDARD);
cigar_ = cigar;
free(cigar);
} else {
fprintf(stderr, "[racon::Overlap::find_breaking_points] error: "
cigar_ = cigar;
free(cigar);
} else {
fprintf(stderr, "[racon::Overlap::find_breaking_points] error: "
"edlib unable to align pair (%zu x %zu)!\n", q_id_, t_id_);
exit(1);
}
edlibFreeAlignResult(result);
exit(1);
}
edlibFreeAlignResult(result);
}
void Overlap::find_breaking_points_from_cigar(uint32_t window_length)
{
// find breaking points from cigar
std::vector<int32_t> window_ends;
for (uint32_t i = 0; i < t_end_; i += window_length) {
......@@ -277,8 +289,6 @@ void Overlap::find_breaking_points(const std::vector<std::unique_ptr<Sequence>>&
j = i + 1;
}
}
std::string().swap(cigar_);
}
}
......@@ -71,6 +71,10 @@ public:
friend bioparser::MhapParser<Overlap>;
friend bioparser::PafParser<Overlap>;
friend bioparser::SamParser<Overlap>;
#ifdef CUDA_ENABLED
friend class CUDABatchAligner;
#endif
private:
Overlap(uint64_t a_id, uint64_t b_id, double accuracy, uint32_t minmers,
uint32_t a_rc, uint32_t a_begin, uint32_t a_end, uint32_t a_length,
......@@ -89,6 +93,8 @@ private:
Overlap();
Overlap(const Overlap&) = delete;
const Overlap& operator=(const Overlap&) = delete;
virtual void find_breaking_points_from_cigar(uint32_t window_length);
virtual void align_overlaps(const char* q, uint32_t q_len, const char* t, uint32_t t_len);
std::string q_name_;
uint64_t q_id_;
......
......@@ -6,15 +6,20 @@
#include <algorithm>
#include <unordered_set>
#include <iostream>
#include "overlap.hpp"
#include "sequence.hpp"
#include "window.hpp"
#include "polisher.hpp"
#ifdef CUDA_ENABLED
#include "cuda/cudapolisher.hpp"
#endif
#include "bioparser/bioparser.hpp"
#include "thread_pool/thread_pool.hpp"
#include "spoa/spoa.hpp"
#include "logger/logger.hpp"
namespace racon {
......@@ -51,7 +56,8 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
const std::string& overlaps_path, const std::string& target_path,
PolisherType type, uint32_t window_length, double quality_threshold,
double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
uint32_t num_threads) {
uint32_t num_threads, uint32_t cudapoa_batches, bool cuda_banded_alignment,
uint32_t cudaaligner_batches) {
if (type != PolisherType::kC && type != PolisherType::kF) {
fprintf(stderr, "[racon::createPolisher] error: invalid polisher type!\n");
......@@ -122,10 +128,30 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
exit(1);
}
return std::unique_ptr<Polisher>(new Polisher(std::move(sparser),
std::move(oparser), std::move(tparser), type, window_length,
quality_threshold, error_threshold, match, mismatch, gap,
num_threads));
if (cudapoa_batches > 0 || cudaaligner_batches > 0)
{
#ifdef CUDA_ENABLED
// If CUDA is enabled, return an instance of the CUDAPolisher object.
return std::unique_ptr<Polisher>(new CUDAPolisher(std::move(sparser),
std::move(oparser), std::move(tparser), type, window_length,
quality_threshold, error_threshold, match, mismatch, gap,
num_threads, cudapoa_batches, cuda_banded_alignment, cudaaligner_batches));
#else
fprintf(stderr, "[racon::createPolisher] error: "
"Attemping to use CUDA when CUDA support is not available.\n"
"Please check logic in %s:%s\n",
__FILE__, __func__);
exit(1);
#endif
}
else
{
(void) cuda_banded_alignment;
return std::unique_ptr<Polisher>(new Polisher(std::move(sparser),
std::move(oparser), std::move(tparser), type, window_length,
quality_threshold, error_threshold, match, mismatch, gap,
num_threads));
}
}
Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
......@@ -140,7 +166,7 @@ Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
alignment_engines_(), sequences_(), dummy_quality_(window_length, '!'),
window_length_(window_length), windows_(),
thread_pool_(thread_pool::createThreadPool(num_threads)),
thread_to_id_() {
thread_to_id_(), logger_(new logger::Logger()) {
uint32_t id = 0;
for (const auto& it: thread_pool_->thread_identifiers()) {
......@@ -155,6 +181,7 @@ Polisher::Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
}
Polisher::~Polisher() {
logger_->total("[racon::Polisher::] total =");
}
void Polisher::initialize() {
......@@ -165,8 +192,10 @@ void Polisher::initialize() {
return;
}
logger_->log();
tparser_->reset();
tparser_->parse_objects(sequences_, -1);
tparser_->parse(sequences_, -1);
uint64_t targets_size = sequences_.size();
if (targets_size == 0) {
......@@ -186,14 +215,15 @@ void Polisher::initialize() {
std::vector<bool> has_data(targets_size, true);
std::vector<bool> has_reverse_data(targets_size, false);
fprintf(stderr, "[racon::Polisher::initialize] loaded target sequences\n");
logger_->log("[racon::Polisher::initialize] loaded target sequences");
logger_->log();
uint64_t sequences_size = 0, total_sequences_length = 0;
sparser_->reset();
while (true) {
uint64_t l = sequences_.size();
auto status = sparser_->parse_objects(sequences_, kChunkSize);
auto status = sparser_->parse(sequences_, kChunkSize);
uint64_t n = 0;
for (uint64_t i = l; i < sequences_.size(); ++i, ++sequences_size) {
......@@ -241,7 +271,8 @@ void Polisher::initialize() {
WindowType window_type = static_cast<double>(total_sequences_length) /
sequences_size <= 1000 ? WindowType::kNGS : WindowType::kTGS;
fprintf(stderr, "[racon::Polisher::initialize] loaded sequences\n");
logger_->log("[racon::Polisher::initialize] loaded sequences");
logger_->log();
std::vector<std::unique_ptr<Overlap>> overlaps;
......@@ -274,7 +305,7 @@ void Polisher::initialize() {
oparser_->reset();
uint64_t l = 0;
while (true) {
auto status = oparser_->parse_objects(overlaps, kChunkSize);
auto status = oparser_->parse(overlaps, kChunkSize);
uint64_t c = l;
for (uint64_t i = l; i < overlaps.size(); ++i) {
......@@ -326,11 +357,13 @@ void Polisher::initialize() {
"empty overlap set!\n");
exit(1);
}
fprintf(stderr, "[racon::Polisher::initialize] loaded overlaps\n");
logger_->log("[racon::Polisher::initialize] loaded overlaps");
logger_->log();
std::vector<std::future<void>> thread_futures;
for (uint64_t i = 0; i < sequences_.size(); ++i) {
thread_futures.emplace_back(thread_pool_->submit_task(
thread_futures.emplace_back(thread_pool_->submit(
[&](uint64_t j) -> void {
sequences_[j]->transmute(has_name[j], has_data[j], has_reverse_data[j]);
}, i));
......@@ -339,19 +372,9 @@ void Polisher::initialize() {
it.wait();
}
thread_futures.clear();
for (uint64_t i = 0; i < overlaps.size(); ++i) {
thread_futures.emplace_back(thread_pool_->submit_task(
[&](uint64_t j) -> void {
overlaps[j]->find_breaking_points(sequences_, window_length_);
}, i));
}
for (uint64_t i = 0; i < thread_futures.size(); ++i) {
thread_futures[i].wait();
fprintf(stderr, "[racon::Polisher::initialize] aligned overlap %zu/%zu\r",
i + 1, overlaps.size());
}
fprintf(stderr, "\n");
find_overlap_breaking_points(overlaps);
logger_->log();
std::vector<uint64_t> id_to_first_window_id(targets_size + 1, 0);
for (uint64_t i = 0; i < targets_size; ++i) {
......@@ -428,15 +451,41 @@ void Polisher::initialize() {
overlaps[i].reset();
}
fprintf(stderr, "[racon::Polisher::initialize] transformed data into windows\n");
logger_->log("[racon::Polisher::initialize] transformed data into windows");
}
// Computes breaking points for every overlap in parallel on the thread pool,
// reporting progress through the logger.
//
// \param overlaps  overlaps to process; each one has
//                  Overlap::find_breaking_points() invoked on it.
//
// Improvement: the progress-bar bin count (20) was repeated as a magic number
// in three places; it is now a single named constant so the step size and the
// bin bound cannot drift apart.
void Polisher::find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps)
{
    // Number of progress-bar bins drawn while waiting on the futures.
    const uint32_t kLoggerBins = 20;

    std::vector<std::future<void>> thread_futures;
    for (uint64_t i = 0; i < overlaps.size(); ++i) {
        thread_futures.emplace_back(thread_pool_->submit(
            [&](uint64_t j) -> void {
                overlaps[j]->find_breaking_points(sequences_, window_length_);
            }, i));
    }

    // logger_step == 0 means there are fewer overlaps than bins; in that case
    // the bar is skipped entirely and a plain log line is emitted instead.
    uint32_t logger_step = thread_futures.size() / kLoggerBins;
    for (uint64_t i = 0; i < thread_futures.size(); ++i) {
        thread_futures[i].wait();
        // Advance the bar once per completed bin (the final bin is drawn
        // after the loop).
        if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < kLoggerBins) {
            logger_->bar("[racon::Polisher::initialize] aligning overlaps");
        }
    }
    if (logger_step != 0) {
        logger_->bar("[racon::Polisher::initialize] aligning overlaps");
    } else {
        logger_->log("[racon::Polisher::initialize] aligned overlaps");
    }
}
void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
bool drop_unpolished_sequences) {
logger_->log();
std::vector<std::future<bool>> thread_futures;
for (uint64_t i = 0; i < windows_.size(); ++i) {
thread_futures.emplace_back(thread_pool_->submit_task(
thread_futures.emplace_back(thread_pool_->submit(
[&](uint64_t j) -> bool {
auto it = thread_to_id_.find(std::this_thread::get_id());
if (it == thread_to_id_.end()) {
......@@ -452,6 +501,8 @@ void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
std::string polished_data = "";
uint32_t num_polished_windows = 0;
uint64_t logger_step = thread_futures.size() / 20;
for (uint64_t i = 0; i < thread_futures.size(); ++i) {
thread_futures[i].wait();
......@@ -476,12 +527,18 @@ void Polisher::polish(std::vector<std::unique_ptr<Sequence>>& dst,
}
windows_[i].reset();
fprintf(stderr, "[racon::Polisher::polish] generated consensus for window %zu/%zu\r",
i + 1, thread_futures.size());
if (logger_step != 0 && (i + 1) % logger_step == 0 && (i + 1) / logger_step < 20) {
logger_->bar("[racon::Polisher::polish] generating consensus");
}
}
if (logger_step != 0) {
logger_->bar("[racon::Polisher::polish] generating consensus");
} else {
logger_->log("[racon::Polisher::polish] generated consensus");
}
fprintf(stderr, "\n");
std::vector<std::unique_ptr<Window>>().swap(windows_);
std::vector<std::shared_ptr<Window>>().swap(windows_);
std::vector<std::unique_ptr<Sequence>>().swap(sequences_);
}
......
......@@ -25,6 +25,10 @@ namespace spoa {
class AlignmentEngine;
}
namespace logger {
class Logger;
}
namespace racon {
class Sequence;
......@@ -41,23 +45,26 @@ std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
const std::string& overlaps_path, const std::string& target_path,
PolisherType type, uint32_t window_length, double quality_threshold,
double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
uint32_t num_threads);
uint32_t num_threads, uint32_t cuda_batches = 0,
bool cuda_banded_alignment = false, uint32_t cudaaligner_batches = 0);
class Polisher {
public:
~Polisher();
virtual ~Polisher();
void initialize();
virtual void initialize();
void polish(std::vector<std::unique_ptr<Sequence>>& dst,
virtual void polish(std::vector<std::unique_ptr<Sequence>>& dst,
bool drop_unpolished_sequences);
friend std::unique_ptr<Polisher> createPolisher(const std::string& sequences_path,
const std::string& overlaps_path, const std::string& target_path,
PolisherType type, uint32_t window_length, double quality_threshold,
double error_threshold, int8_t match, int8_t mismatch, int8_t gap,
uint32_t num_threads);
private:
uint32_t num_threads, uint32_t cuda_batches, bool cuda_banded_alignment,
uint32_t cudaaligner_batches);
protected:
Polisher(std::unique_ptr<bioparser::Parser<Sequence>> sparser,
std::unique_ptr<bioparser::Parser<Overlap>> oparser,
std::unique_ptr<bioparser::Parser<Sequence>> tparser,
......@@ -66,6 +73,7 @@ private:
uint32_t num_threads);
Polisher(const Polisher&) = delete;
const Polisher& operator=(const Polisher&) = delete;
virtual void find_overlap_breaking_points(std::vector<std::unique_ptr<Overlap>>& overlaps);
std::unique_ptr<bioparser::Parser<Sequence>> sparser_;
std::unique_ptr<bioparser::Parser<Overlap>> oparser_;
......@@ -81,10 +89,12 @@ private:
std::string dummy_quality_;
uint32_t window_length_;
std::vector<std::unique_ptr<Window>> windows_;
std::vector<std::shared_ptr<Window>> windows_;
std::unique_ptr<thread_pool::ThreadPool> thread_pool_;
std::unordered_map<std::thread::id, uint32_t> thread_to_id_;
std::unique_ptr<logger::Logger> logger_;
};
}