Skip to content
Commits on Source (3)
cmake_minimum_required (VERSION 3.1)
cmake_minimum_required(VERSION 3.9)
if(DEFINED ENV{CC})
set(CC $ENV{CC})
else()
set(CC gcc)
endif()
message("CC: ${CC}")
set(CC_VERSION "")
if(${CC} MATCHES ^gcc-)
string(REGEX REPLACE "gcc-" "" CC_VERSION ${CC})
endif()
message("CC version: ${CC_VERSION}")
enable_testing()
project (RapMap)
set(CPACK_PACKAGE_VERSION "0.6.0")
SET(CPACK_PACKAGE_VERSION_MAJOR "0")
set(CPACK_PACKAGE_VERSION_MINOR "6")
set(CPACK_PACKAGE_VERSION_PATCH "0")
# auto-populate version:
# from https://stackoverflow.com/questions/47066115/cmake-get-version-from-multi-line-text-file
file(READ "current_version.txt" ver)
string(REGEX MATCH "VERSION_MAJOR ([0-9]*)" _ ${ver})
set(ver_major ${CMAKE_MATCH_1})
string(REGEX MATCH "VERSION_MINOR ([0-9]*)" _ ${ver})
set(ver_minor ${CMAKE_MATCH_1})
string(REGEX MATCH "VERSION_PATCH ([0-9]*)" _ ${ver})
set(ver_patch ${CMAKE_MATCH_1})
set(CPACK_PACKAGE_VERSION_MAJOR ${ver_major})
set(CPACK_PACKAGE_VERSION_MINOR ${ver_minor})
set(CPACK_PACKAGE_VERSION_PATCH ${ver_patch})
set(CPACK_PACKAGE_VERSION "${ver_major}.${ver_minor}.${ver_patch}")
message("version: ${CPACK_PACKAGE_VERSION}")
set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_GENERATOR "TGZ")
set(CPACK_SOURCE_GENERATOR "TGZ")
set(CPACK_PACKAGE_VENDOR "Stony Brook University")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "RapMap - Wicked-fast quasi-mapping")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "rapmap - fast transcriptome mapping")
set(CPACK_PACKAGE_NAME
"${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
set(CPACK_SOURCE_PACKAGE_FILE_NAME
"${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}-Source")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
#include(FindSSE)
#FindSSE ()
#if(SSE4_2_FOUND)
# message("Enabling popcount")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -DEMPHF_USE_POPCOUNT")
#endif(SSE4_2_FOUND)
# Set a default build type if none was specified
set(default_build_type "Release")
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
STRING "Choose the type of build." FORCE)
# Set the possible values of build type for cmake-gui
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
"Debug" "Release")
endif()
include(CheckIPOSupported)
if (APPLE)
set (WARNING_IGNORE_FLAGS "-Wno-deprecated-register -Wno-unknon-pragmas -Wreturn-type -Werror=return-type")
......@@ -51,6 +88,12 @@ endif()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -funroll-loops -fPIC -fomit-frame-pointer -O4 -DHAVE_ANSI_TERM ${WALL} -std=c++14")
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CXXSTDFLAG "-std=c++14")
set(GCCVERSION "5.2")
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
find_package(Threads REQUIRED)
......@@ -74,9 +117,13 @@ endif()
set( BOOST_EXTRA_FLAGS "--layout=tagged" )
## this get's set differently below if we
## are on clang & apple
set (NON_APPLECLANG_LIBS gomp rt)
set (NON_APPLECLANG_LIBS gomp)
set (PTHREAD_LIB)
if(UNIX AND NOT APPLE)
set(LIBRT rt)
endif()
##
# Compiler-specific C++11 activation.
# http://stackoverflow.com/questions/10984442/how-to-detect-c11-support-of-a-compiler-with-cmake
......
......@@ -2,18 +2,11 @@
# What is RapMap?
RapMap is a testing ground for ideas in quasi-mapping / (lightweight / pseudo) transcriptome alignment. That means that, at this point, it is somewhat experimental. The `develop` branch will have the latest improvements and additions, but is not guaranteed to be stable between commits. Breaking changes to the master branch will be accompanied by a tag to the version before the breaking change. Currently, RapMap is a stand-alone quasi-mapper that can be used with other tools. It is also being used as part of [Sailfish](https://github.com/kingsfordgroup/sailfish) and [Salmon](https://github.com/COMBINE-lab/salmon). Eventually, the hope is to create and stabilize an API so that it can be used as a library from other tools.
Quasi-mapping / (lightweight / pseudo)-alignment is the term we are using here for the type of information required for certain tasks (e.g.
transcript quantification) that is less "heavyweight" than what is provided by traditional alignment. For example, one may
only need to know the transcripts / contigs to which a read aligns and, perhaps, the position within those transcripts rather
than the optimal alignment and base-for-base `CIGAR` string that aligns the read and substring of the transcript. For details on RapMap (quasi-mapping in particular), please check out the [associated paper](http://bioinformatics.oxfordjournals.org/content/32/12/i192.full.pdf). Note: RapMap implements both quasi-mapping and pseudo-alignment (originally introduced in [Bray et al. 2016](http://www.nature.com/nbt/journal/v34/n5/full/nbt.3519.html)), these two are not the same thing. They are distinct concepts, and RapMap simply happens to implement algorithms for computing both.
There are a number of different ways to collect such information, and the idea of RapMap (as the repository grows) will be to explore multiple different strategies in how to most rapidly determine all *feasible* / *compatible* locations for a read within the transcriptome. In this sense, it is like an *all-mapper*; the mappings it outputs are intended to be (eventually) disambiguated (*Really, it's more like an "all-best" mapper, since it returns all hits in the top "stratum" of quasi-mapping / (lightweight / pseudo)-alignments*). If there is a need for it, *best-mapper* functionality may be added in the future.
RapMap is a testing ground for ideas in quasi-mapping and selective alignment. That means that, at this point, it is somewhat experimental. The `develop` branch will have the latest improvements and additions, but is not guaranteed to be stable between commits. Breaking changes to the master branch will be accompanied by a tag to the version before the breaking change. Currently, RapMap is a stand-alone quasi-mapper that can be used with other tools. It is also being used as part of [Salmon](https://github.com/COMBINE-lab/salmon) and [Sailfish](https://github.com/kingsfordgroup/sailfish). Eventually, the hope is to create and stabilize an API so that it can be used as a library from other tools.
# Building RapMap
To build RapMap, you need a C++11 compliant compiler (g++ >= 4.7 and clang >= 3.4) and CMake. RapMap is built with the following steps (assuming that `path_to_rapmap` is the toplevel directory where you have cloned this repository):
To build RapMap, you need a C++14 compliant compiler (g++ >= 4.9 and clang >= 3.4) and CMake (>= 3.9). RapMap is built with the following steps (assuming that `path_to_rapmap` is the toplevel directory where you have cloned this repository):
```
[path_to_rapmap] > mkdir build && cd build
......@@ -23,6 +16,7 @@ To build RapMap, you need a C++11 compliant compiler (g++ >= 4.7 and clang >= 3.
[path_to_rapmap/build] > cd ../bin
[path_to_rapmap/bin] > ./rapmap -h
```
This should output the standard help message for rapmap.
# Using RapMap
......@@ -44,48 +38,31 @@ the `-p` option enables the minimum perfect hash and `-x 4` tells RapMap to use
The index itself will record whether it was built with the aid of minimum perfect hashing or not, so no extra information concerning this need be provided when mapping. For the purposes of this example, we'll assume that we wish to map paired-end reads with the first mates in the file `r1.fq.gz` and the second mates in the file `r2.fq.gz`. We can perform the mapping like so:
```
> rapmap quasimap -i ref_index -1 <(gunzip -c r1.fq.gz) -2 <(gunzip -c r2.fq.gz) -t 8 -o mapped_reads.sam
> rapmap quasimap -i ref_index -1 r1.fq.gz -2 r2.fq.gz -s -t 8 -o mapped_reads.sam
```
This will tell RapMap to map the paired-end reads using 8 threads, and to write the resulting `SAM` records to the file `mapped_reads.sam`. The SAM format is rather verbose, and so such output files can be rather large (and slow to write) if you're mapping many reads. For that reason, we recommend that you use [samtools](http://www.htslib.org/) to convert the `SAM` file to a `BAM` file on-the-fly. Assuming `samtools` is installed an in your path, that can be accomplished with the following command:
This will tell RapMap to map the paired-end reads using 8 threads, and to write the resulting `SAM` records to the file `mapped_reads.sam`. The `-s` flag tells RapMap to use selective alignment to generate better mappings and to validate the alignment _score_ of hits. The SAM format is rather verbose, and so such output files can be rather large (and slow to write) if you're mapping many reads. For that reason, we recommend that you use [samtools](http://www.htslib.org/) to convert the `SAM` file to a `BAM` file on-the-fly. Assuming `samtools` is installed an in your path, that can be accomplished with the following command:
```
> rapmap quasimap -i ref_index -1 <(gunzip -c r1.fq.gz) -2 <(gunzip -c r2.fq.gz) -t 8 | samtools view -Sb -@ 4 - > mapped_reads.bam
> rapmap quasimap -i ref_index -1 r1.fq.gz -2 r2.fq.gz -s -t 8 | samtools view -Sb -@ 4 - > mapped_reads.bam
```
This will stream the output from RapMap to standard out, and then convert it into a `BAM` file (using up to an additional 4 threads for `BAM` compression) and write the resulting output to the file `mapped_reads.bam`. To reduce the amount that needs to be typed in the common case, and to prevent the user from having to remember invocations like the above, we inclde a simple wrapper script that simplifies this process. After installing RapMap, there should be a script called `RunRapMap.sh` in the `bin` directory of whereever you have chosen to install RapMap. You can issue a command equivalent to the above using this scrpt as follows:
```
> RunRapMap.sh quasimap -i ref_index -1 <(gunzip -c r1.fq.gz) -2 <(gunzip -c r2.fq.gz) -t 8 --bamOut mapped_reads.sam --bamThreads 4
> RunRapMap.sh quasimap -i ref_index -1 r1.fq.gz -2 r2.fq.gz -s -t 8 --bamOut mapped_reads.sam --bamThreads 4
```
This will run RapMap with a command equivalent to the one mentioned above. If you leave out the `--bamThreads` argument, then a single thread will be used for compression. The `RunRapMap.sh` script can be used even if you don't wish to write the output to `BAM` format; in that case it is simply equivalent to running whichever command you pass with the `rapmap` executable itself.
# Can I use RapMap for genomic alignment?
No, at least not right now. The index and mapping strategy employed by RapMap are highly geared toward mapping to transcriptomes. It may be the case that some of these ideas can be successfully applied to genomic alignment, but
this functionality is not currently suppored (and is not a high priority right now).
# How fast is RapMap?
Speed is relative, but we think it's very fast: On a synthetic test dataset comprised of 75 million 76bp paired-end reads, mapping to a human transcriptome with ~213,000 transcripts, RapMap takes ~ 10 minutes to align all of the reads *on a single core* (on an Intel Xeon E5-2690 @ 3.00 GHz) --- if you actually want to write out the alignments --- it depends on you disk speed, but for us it's ~15 minutes. Again, these mapping times are *on a single core* --- but RapMap is trivially parallelizable and can be run with multiple threads. Additionally, there are other optimizations we are currently exploring.
# OK, that's fast, but is it accurate?
Yes; quasi-mapping seems to provide accurate mapping results. In the above mentioned synthetic dataset (generated *with* sequencing errors), the true location of origin of the read appears in the hits returned by RapMap > 97% of the time. For more details, please refer to [the paper](http://bioinformatics.oxfordjournals.org/content/32/12/i192.full.pdf).
The index and mapping strategy employed by RapMap are highly geared toward mapping to transcriptomes. This means that RapMap will likely use _a lot_ of memory when indexing and mapping to mammalian-sized genomes, though it's possible. We have succesfully applied RapMap to map reads to collections of baterial and viral genomes, however.
# Caveats
RapMap is experimental, and the code, at this point, is subject to me testing out new ideas (see the description above about the master vs. develop branch). This also means that limited effort has been put into size or speed optimizaiton. There are numerous ways that the code can be sped up and the memory footprint reduced, but that hasn't been the focus yet --- it will be eventualy. All of this being said --- RapMap is open to the community because I'd like feedback / help / thoughts. A contribution policy is forthcoming. So, if you're not scared off by any of the above, please *dig in*!
# External dependencies
[tclap](http://tclap.sourceforge.net/)
[cereal](https://github.com/USCiLab/cereal)
[jellyfish](https://github.com/gmarcais/Jellyfish)
# License
Since RapMap uses Jellyfish, it must be released under the GPL. However, this is currently the only GPL dependency. If it can be replaced, I'd like to re-license RapMap under the BSD license. I'd be happy to accept pull-requests that replace the Jellyfish components with a library released under a more liberal license (BSD-compatible), but note that I will *not* accept such pull requests if they reduce the speed or increase the memory consumption over the Jellyfish-based version.
Since RapMap uses Vinga's [rank implementation](https://github.com/COMBINE-lab/RapMap/blob/master/include/rank9b.h), it must be released under the GPL. However, this is currently the only GPL dependency. If it can be replaced, I'd like to re-license RapMap under a BSD license. I'd be happy to accept pull-requests that replace this rank implementation with a library released under a more liberal license (BSD-compatible), but note that I will *not* accept such pull requests if they reduce the speed or increase the memory consumption over the current version.
VERSION_MAJOR 0
VERSION_MINOR 6
VERSION_PATCH 0
rapmap (0.14.1+dfsg-1) unstable; urgency=medium
* New upstream release.
-- Michael R. Crusoe <michael.crusoe@gmail.com> Tue, 23 Jul 2019 12:52:36 +0200
rapmap (0.12.0+dfsg-3) unstable; urgency=medium
* debian/patches/portable_pause: add ppc64el and arm64 alternatives to
......
......@@ -15,7 +15,7 @@ Build-Depends: debhelper (>= 11),
libspdlog-dev,
libtclap-dev,
pkg-config,
python-markdown
python3-markdown
Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/rapmap
Vcs-Git: https://salsa.debian.org/med-team/rapmap.git
......
......@@ -2,12 +2,12 @@ Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Avoid unnecessary linkage to gomp or rt
--- rapmap.orig/CMakeLists.txt
+++ rapmap/CMakeLists.txt
@@ -74,7 +74,7 @@
@@ -117,7 +117,7 @@
set( BOOST_EXTRA_FLAGS "--layout=tagged" )
## this get's set differently below if we
## are on clang & apple
-set (NON_APPLECLANG_LIBS gomp rt)
-set (NON_APPLECLANG_LIBS gomp)
+set (NON_APPLECLANG_LIBS "$ENV{NON_APPLECLANG_LIBS}")
set (PTHREAD_LIB)
##
if(UNIX AND NOT APPLE)
......@@ -2,9 +2,9 @@ Description: make RapMap use Debian's versions of dependencies
This involves disabling downloads and adjusting API usage to the versions
packaged in Debian.
Author: Sascha Steinbiss <sascha@steinbiss.name>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,20 +168,6 @@ if (NOT ZLIB_FOUND)
--- rapmap.orig/CMakeLists.txt
+++ rapmap/CMakeLists.txt
@@ -215,20 +215,6 @@
endif()
......@@ -25,7 +25,7 @@ Author: Sascha Steinbiss <sascha@steinbiss.name>
if (NOT CEREAL_ROOT)
set(CEREAL_ROOT ${GAT_SOURCE_DIR}/external/install)
endif()
@@ -234,24 +220,6 @@ else ()
@@ -281,24 +267,6 @@
set (JEMALLOC_FLAGS "CC=${CMAKE_C_COMPILER}")
endif()
......@@ -50,9 +50,9 @@ Author: Sascha Steinbiss <sascha@steinbiss.name>
###
#
# Done building external dependencies.
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -61,8 +61,8 @@ endif()
--- rapmap.orig/src/CMakeLists.txt
+++ rapmap/src/CMakeLists.txt
@@ -63,8 +63,8 @@
add_executable(rapmap ${RAPMAP_MAIN_SRCS})
# our suffix array construction libraries
......@@ -61,5 +61,5 @@ Author: Sascha Steinbiss <sascha@steinbiss.name>
+set (SUFFARRAY_LIB divsufsort)
+set (SUFFARRAY64_LIB divsufsort64)
# Link the executable
target_link_libraries(rapmap
# build KSW2 library
set (KSW2PP_BASIC_LIB_SRCS
......@@ -2,9 +2,9 @@ Author: Andreas Tille <tille@debian.org>
Last-Update: Sat, 15 Oct 2016 15:33:47 +0200
Description: Prefer dynamic to static libraries
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,7 +33,7 @@ set (WARNING_IGNORE_FLAGS "-Wno-unknown-
--- rapmap.orig/CMakeLists.txt
+++ rapmap/CMakeLists.txt
@@ -70,7 +70,7 @@
endif()
## Prefer static to dynamic libraries
......
......@@ -20,7 +20,7 @@ override_dh_auto_configure:
override_dh_auto_build:
dh_auto_build
markdown_py -f README.html README.md
python3 -m markdown -f README.html README.md
override_dh_clean:
dh_clean README.html sample_data/sample_quasi_index/ sample_data/sample_quasi_index_ph/ sample_data/sample_quasi_map.sam sample_data/sample_quasi_map_ph.sam
......
......@@ -115,22 +115,11 @@ namespace rapmap {
int32_t maxSlack,
SAHitMap& outHits);
template <typename RapMapIndexT>
void intersectSAIntervalWithOutput2(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
RapMapIndexT& rmi,
SAProcessedHitVec& outStructs);
/*
void intersectSAIntervalWithOutput3(SAIntervalHit& h,
RapMapSAIndex& rmi,
SAProcessedHitVec& outHits);
*/
//std::vector<ProcessedHit> intersectHits(
// std::vector<HitInfo>& inHits,
// RapMapIndex& rmi);
template <typename RapMapIndexT>
SAHitMap intersectSAHits(
SAIntervalVector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
......@@ -138,11 +127,6 @@ namespace rapmap {
size_t readLen,
bool strictFilter=false);
template <typename RapMapIndexT>
std::vector<ProcessedSAHit> intersectSAHits2(
std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
RapMapIndexT& rmi);
template <typename RapMapIndexT>
void hitsToMappingsSimple(RapMapIndexT& rmi,
rapmap::utils::MappingConfig& mc,
......
......@@ -64,12 +64,16 @@ class RapMapSAIndex {
// Given a position, p, in the concatenated text,
// return the corresponding transcript
IndexT transcriptAtPosition(IndexT p);
bool isDecoy(IndexT p);
uint64_t getNumDecoys();
bool load(const std::string& indDir);
std::vector<IndexT> SA;
BitArrayPointer bitArray{nullptr};
uint64_t numDecoys{0};
uint64_t firstDecoyIndex{std::numeric_limits<uint64_t>::max()};
//BitArrayPointer decoyArray{nullptr};
std::unique_ptr<rank9b> rankDict{nullptr};
std::string seq;
......
This diff is collapsed.
......@@ -61,8 +61,8 @@ public:
void setStrictCheck(bool sc) { strictCheck_ = sc; }
/** Get/Set usage of MMP chain scoring **/
void enableChainScoring() { doChaining_ = true; }
void disableChainScoring() { doChaining_ = false; }
void enableChainScoring() { doChaining_ = true; strictCheckFraction_ = 0.65; }
void disableChainScoring() { doChaining_ = false; strictCheckFraction_ = 1.0; }
bool getChainScoring() const { return doChaining_; }
/** Get/Set the "mohsen number" that is used to limit the
......@@ -281,11 +281,15 @@ public:
// If we're computing coverage, then we can make use of that info here
//useCoverageCheck = false;
if (useCoverageCheck) {
if (fwdCov > rcCov) {
rcSAInts.clear();
} else if (rcCov > fwdCov) {
auto maxCov = std::max(fwdCov, rcCov);
int32_t requiredNumHits = (strictCheckFraction_ < 1.0) ?
std::max(static_cast<int32_t>(1), static_cast<int32_t>(std::floor(maxCov * strictCheckFraction_))) : maxCov;
if (static_cast<int32_t>(fwdCov) < requiredNumHits) {
fwdSAInts.clear();
}
if (static_cast<int32_t>(rcCov) < requiredNumHits) {
rcSAInts.clear();
}
} else { // use the k-mer "spot check"
// The first two conditions shouldn't happen
// but I'm just being paranoid here
......@@ -684,6 +688,7 @@ private:
bool strictCheck_;
bool doChaining_;
int32_t maxMMPExtension_{7};
double strictCheckFraction_{1.0};
std::string rcBuffer_;
};
......
......@@ -36,7 +36,7 @@ class SASearcher {
using OffsetT = typename RapMapIndexT::IndexType;
SASearcher(RapMapIndexT* rmi) :
rmi_(rmi), seq_(&rmi->seq), sa_(&rmi->SA) {}
rmi_(rmi), seq_(&rmi->seq), sa_(&rmi->SA), textLen_(rmi->seq.length()){}
int cmp(std::string::iterator abeg,
std::string::iterator aend,
......
#ifndef __SELECTIVE_ALIGNMENT_UTILS__
#define __SELECTIVE_ALIGNMENT_UTILS__
#include "ksw2pp/KSW2Aligner.hpp"
#include "metro/metrohash64.h"
#include "tsl/hopscotch_map.h"
#include "edlib.h"
namespace selective_alignment {
namespace utils {
enum class AlignmentPolicy : uint8_t { DEFAULT, BT2, BT2_STRICT };
/// Get alignment score
struct CacheEntry {
uint64_t hashKey;
int32_t alnScore;
CacheEntry(uint64_t hashKeyIn, int32_t alnScoreIn) :
hashKey(hashKeyIn), alnScore(alnScoreIn) {}
};
// Use a passthrough hash for the alignment cache, because
// the key *is* the hash.
struct PassthroughHash {
std::size_t operator()(uint64_t const& u) const { return u; }
};
using AlnCacheMap = tsl::hopscotch_map<uint64_t, int32_t, PassthroughHash>;
#ifndef RAPMAP_SALMON_SUPPORT
template <typename IndexT>
#endif
inline bool recoverOrphans(std::string& leftRead,
std::string& rightRead,
std::string& rc1,
std::string& rc2,
#ifdef RAPMAP_SALMON_SUPPORT // for salmon, we just have a vector of transcript objects
const std::vector<Transcript>& transcripts,
#else // otherwise, we have to pass in this information from the index
const std::string& seq,
const std::vector<std::string>& txpNames,
const std::vector<IndexT>& txpOffsets,
const std::vector<IndexT>& txpLens,
#endif
const std::vector<rapmap::utils::QuasiAlignment>& leftHits,
const std::vector<rapmap::utils::QuasiAlignment>& rightHits,
std::vector<rapmap::utils::QuasiAlignment>& jointHits) {
using QuasiAlignment = rapmap::utils::QuasiAlignment;
auto* r1 = leftRead.data();
auto* r2 = rightRead.data();
auto l1 = static_cast<int32_t>(leftRead.length());
auto l2 = static_cast<int32_t>(rightRead.length());
// We compute the reverse complements below only if we
// need them and don't have them.
char* r1rc = nullptr;
char* r2rc = nullptr;
const char* windowSeq = nullptr;
int32_t windowLength = -1;
int32_t maxDistRight = l2 / 4;
int32_t maxDistLeft = l1 / 4;
constexpr const int32_t signedZero{0};
int32_t lreadLen = l1;
//int32_t rreadLen = l2;
auto recoverSingleOrphan = [&] (const QuasiAlignment& anchorHit, bool anchorIsLeft) -> bool {
bool recovered = false;
auto txpID = anchorHit.tid;
#ifdef RAPMAP_SALMON_SUPPORT
auto& txp = transcripts[txpID];
const char* tseq = txp.Sequence();
#else
const char* tseq = seq.data() + txpOffsets[txpID];
#endif
int32_t anchorPos{anchorHit.allPositions.front()};
bool anchorFwd{anchorHit.fwd};
bool lfwd = false, rfwd = false;
int32_t startPos = -1, maxDist = -1, lpos = -1, rpos = -1, anchorLen = -1, otherLen = -1, rlen = -1;
const char* rptr{nullptr};
std::string* otherReadPtr{nullptr};
const char* otherRead{nullptr};
char* otherReadRC{nullptr};
std::string* otherRCSpace{nullptr};
rapmap::utils::ChainStatus leftChainStatus = rapmap::utils::ChainStatus::REGULAR;
rapmap::utils::ChainStatus rightChainStatus = rapmap::utils::ChainStatus::REGULAR;
if (anchorIsLeft) {
anchorLen = l1;
otherLen = l2;
maxDist = maxDistRight;
lpos = anchorPos;
rpos = -1;
lfwd = anchorFwd;
rfwd = !lfwd;
otherReadPtr = &rightRead;
otherRCSpace = &rc2;
otherRead = r2;
otherReadRC = r2rc;
leftChainStatus = anchorHit.chainStatus.getLeft();
} else {
anchorLen = l2;
otherLen = l1;
maxDist = maxDistLeft;
lpos = -1;
rpos = anchorPos;
rfwd = anchorFwd;
lfwd = !rfwd;
otherReadPtr = &leftRead;
otherRCSpace = &rc1;
otherRead = r1;
otherReadRC = r1rc;
rightChainStatus = anchorHit.chainStatus.getRight();
}
// if this hit is forward, look downstream, else upstream
#ifdef RAPMAP_SALMON_SUPPORT
auto refLength = txp.RefLength;
#else
auto refLength = txpLens[txpID];
#endif
if (anchorFwd) {
if (!otherReadRC){
rapmap::utils::reverseRead(*otherReadPtr, *otherRCSpace);
otherReadRC = const_cast<char*>(otherRCSpace->data());
}
rptr = otherReadRC;
rlen = otherLen;
startPos = std::max(signedZero, static_cast<int32_t>(anchorPos));
windowLength = std::min(1000, static_cast<int32_t>(refLength - startPos));
} else {
rptr = otherRead;
rlen = otherLen;
int32_t endPos = std::min(static_cast<int32_t>(refLength), static_cast<int32_t>(anchorPos) + anchorLen);
startPos = std::max(signedZero, endPos - 1000);
windowLength = std::min(1000, endPos);
}
windowSeq = tseq + startPos;
EdlibAlignResult result = edlibAlign(rptr, rlen, windowSeq, windowLength,
edlibNewAlignConfig(maxDist, EDLIB_MODE_HW, EDLIB_TASK_DISTANCE));
if (result.editDistance > -1) {
if (anchorIsLeft) {
rpos = startPos + result.endLocations[0] - otherLen;
} else {
lpos = startPos + result.endLocations[0] - otherLen;
}
// If we consider only a single position per transcript
int32_t startRead1 = std::max(lpos, signedZero);
int32_t startRead2 = std::max(rpos, signedZero);
bool read1First{(startRead1 < startRead2)};
int32_t fragStartPos = read1First ? startRead1 : startRead2;
int32_t fragEndPos = read1First ?
(startRead2 + l2) : (startRead1 + l1);
uint32_t fragLen = fragEndPos - fragStartPos;
jointHits.emplace_back(txpID,
lpos,
lfwd,
lreadLen,
fragLen, true);
// Fill in the mate info
auto& qaln = jointHits.back();
qaln.mateLen = rlen;
qaln.matePos = rpos;
qaln.mateIsFwd = rfwd;
jointHits.back().mateStatus = rapmap::utils::MateStatus::PAIRED_END_PAIRED;
jointHits.back().chainStatus = rapmap::utils::FragmentChainStatus(leftChainStatus, rightChainStatus);
recovered = true;
}
edlibFreeAlignResult(result);
return recovered;
};
{
auto leftIt = leftHits.begin();
auto leftEnd = leftHits.end();
auto leftLen = std::distance(leftIt, leftEnd);
auto rightIt = rightHits.begin();
auto rightEnd = rightHits.end();
auto rightLen = std::distance(rightIt, rightEnd);
jointHits.reserve(std::min(leftLen, rightLen));
bool didRecover{false};
while ((leftIt != leftEnd) and (rightIt != rightEnd)) {
uint32_t leftTxp, rightTxp;
leftTxp = leftIt->tid;
rightTxp = rightIt->tid;
if (leftTxp < rightTxp) {
didRecover = recoverSingleOrphan(*leftIt, true);
++leftIt;
} else if (rightTxp < leftTxp) {
didRecover = recoverSingleOrphan(*rightIt, false);
++rightIt;
} else if (rightTxp == leftTxp) {
/*
didRecover = recoverSingleOrphan(*leftIt, true);
++leftIt;
if(!didRecover) {
didRecover = recoverSingleOrphan(*rightIt, false);
}
++rightIt;
*/
//++leftIt; ++rightIt;
// Should not happen!
auto log = spdlog::get("jointLog");
log->error("Found a transcript in common between leftHits and rightHits while "
"trying to recover orphans. This should not happen. "
"Please report this via GitHub. ");
std::stringstream ss;
ss << "left hits : [";
for (auto& lh : leftHits) {
ss << "(" <<
#ifdef RAPMAP_SALMON_SUPPORT
transcripts[lh.tid].RefName
#else
txpNames[lh.tid]
#endif
<< ", " << lh.pos << ", " << lh.fwd << ") ; ";
}
ss << "]\n";
ss << "right hits : [";
for (auto& lh : rightHits ) {
ss << "(" <<
#ifdef RAPMAP_SALMON_SUPPORT
transcripts[lh.tid].RefName
#else
txpNames[lh.tid]
#endif
<< ", " << lh.pos << ", " << lh.fwd << ") ; ";
}
log->error(ss.str());
log->flush();
spdlog::drop_all();
std::exit(1);
}
}
while (leftIt != leftEnd) {
didRecover = recoverSingleOrphan(*leftIt, true);
++leftIt;
}
while(rightIt != rightEnd) {
didRecover = recoverSingleOrphan(*rightIt, false);
++rightIt;
}
(void)didRecover;
} // union / merge left and right orphans
return true;
}
inline int32_t getAlnScore(
ksw2pp::KSW2Aligner& aligner,
ksw_extz_t& ez,
int32_t pos, const char* rptr, int32_t rlen,
char* tseq, int32_t tlen,
int8_t mscore,
int8_t mmcost,
int32_t maxScore,
rapmap::utils::ChainStatus chainStat,
bool multiMapping, // was there > 1 hit for this read
AlignmentPolicy ap,
uint32_t buf,
AlnCacheMap& alnCache) {
// If this was a perfect match, don't bother to align or compute the score
if (chainStat == rapmap::utils::ChainStatus::PERFECT) {
return maxScore;
}
auto ungappedAln = [mscore, mmcost](char* ref, const char* query, int32_t len) -> int32_t {
int32_t ungappedScore{0};
for (int32_t i = 0; i < len; ++i) {
char c1 = *(ref + i);
char c2 = *(query + i);
c1 = (c1 == 'N' or c2 == 'N') ? c2 : c1;
ungappedScore += (c1 == c2) ? mscore : mmcost;
}
return ungappedScore;
};
int32_t s{std::numeric_limits<int32_t>::lowest()};
// TODO : Determine what is the most "appropriate" penalty for
// an overhang (based on the scoring function).
bool invalidStart = (pos < 0);
bool invalidEnd = (pos + rlen > tlen);
if (invalidStart) { rptr += -pos; rlen += pos; pos = 0; }
// if we are trying to mimic Bowtie2 with RSEM params
if (invalidStart or invalidEnd) {
switch (ap) {
case AlignmentPolicy::BT2:
case AlignmentPolicy::BT2_STRICT:
return s;
case AlignmentPolicy::DEFAULT:
default:
break;
}
}
if (pos < tlen) {
bool doUngapped{(!invalidStart) and (chainStat == rapmap::utils::ChainStatus::UNGAPPED)};
buf = (doUngapped) ? 0 : buf;
auto lnobuf = static_cast<uint32_t>(tlen - pos);
auto lbuf = static_cast<uint32_t>(rlen+buf);
auto useBuf = (lbuf < lnobuf);
uint32_t tlen1 = std::min(lbuf, lnobuf);
char* tseq1 = tseq + pos;
ez.max_q = ez.max_t = ez.mqe_t = ez.mte_q = -1;
ez.max = 0, ez.mqe = ez.mte = KSW_NEG_INF;
ez.n_cigar = 0;
uint64_t hashKey{0};
bool didHash{false};
if (!alnCache.empty()) {
// hash the reference string
uint32_t keyLen = useBuf ? tlen1 - buf : tlen1;
MetroHash64::Hash(reinterpret_cast<uint8_t*>(tseq1), keyLen, reinterpret_cast<uint8_t*>(&hashKey), 0);
didHash = true;
// see if we have this hash
auto hit = alnCache.find(hashKey);
// if so, we know the alignment score
if (hit != alnCache.end()) {
s = hit->second;
}
}
// If we got here with s == -1, we don't have the score cached
if (s == std::numeric_limits<int32_t>::lowest()) {
if (doUngapped) {
// signed version of tlen1
int32_t tlen1s = tlen1;
int32_t alnLen = rlen < tlen1s ? rlen : tlen1s;
s = ungappedAln(tseq1, rptr, alnLen);
} else {
/*
auto startBuf = std::min(pos, static_cast<int32_t>(buf));
int32_t bpos = pos - startBuf;
char* tseqTemp = tseq + bpos;
uint32_t tlenTemp = tlen1 + startBuf;
EdlibAlignResult result = edlibAlign(rptr, rlen, tseqTemp, tlenTemp,
edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_PATH));
auto spos = result.startLocations[0];
tseq1 = tseq + bpos + spos;
auto* aln = result.alignment;
if (!aln or (aln[0] == 1 or aln[0] == 2)) {
edlibFreeAlignResult(result);
return s;
}
edlibFreeAlignResult(result);
*/
aligner(rptr, rlen, tseq1, tlen1, &ez, ksw2pp::EnumToType<ksw2pp::KSW2AlignmentType::EXTENSION>());
s = std::max(ez.mqe, ez.mte);
}
if (multiMapping) { // don't bother to fill up a cache unless this is a multi-mapping read
if (!didHash) {
uint32_t keyLen = useBuf ? tlen1 - buf : tlen1;
MetroHash64::Hash(reinterpret_cast<uint8_t*>(tseq1), keyLen, reinterpret_cast<uint8_t*>(&hashKey), 0);
}
alnCache[hashKey] = s;
} // was a multi-mapper
}
}
return s;
}
} // namespace utils
} // namespace selective_alignment
#endif //__SELECTIVE_ALIGNMENT_UTILS__
#ifndef EDLIB_H
#define EDLIB_H
#include <cstdint>
#include <vector>
/**
* @file
* @author Martin Sosic
* @brief Main header file, containing all public functions and structures.
*/
//#ifdef __cplusplus
//extern "C" {
//#endif
// Status codes
#define EDLIB_STATUS_OK 0
#define EDLIB_STATUS_ERROR 1
/**
* Alignment methods - how should Edlib treat gaps before and after query?
*/
typedef enum {
/**
* Global method. This is the standard method.
* Useful when you want to find out how similar is first sequence to second sequence.
*/
EDLIB_MODE_NW,
/**
* Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
* What that means is that deleting elements from the end of second sequence is "free"!
* For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end
* of second sequence is "free" and does not count into total edit distance. This method is appropriate
* when you want to find out how well first sequence fits at the beginning of second sequence.
*/
EDLIB_MODE_SHW,
/**
* Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are
* not penalized. What that means is that deleting elements from the start and end of second sequence is "free"!
* For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start
* and GAC from the end of second sequence is "free" and does not count into total edit distance.
* This method is appropriate when you want to find out how well first sequence fits at any part of
* second sequence.
* For example, if your second sequence was a long text and your first sequence was a sentence from that text,
* but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
* that text. In bioinformatics, this method is appropriate for aligning read to a sequence.
*/
EDLIB_MODE_HW
} EdlibAlignMode;
/**
* Alignment tasks - what do you want Edlib to do?
*/
typedef enum {
EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations.
EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations.
EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path.
} EdlibAlignTask;
/**
* Describes cigar format.
* @see http://samtools.github.io/hts-specs/SAMv1.pdf
* @see http://drive5.com/usearch/manual/cigar.html
*/
typedef enum {
EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
} EdlibCigarFormat;
// Edit operations.
#define EDLIB_EDOP_MATCH 0 //!< Match.
#define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query.
#define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query.
#define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
/**
* @brief Configuration object for edlibAlign() function.
*/
typedef struct {
/**
* Set k to non-negative value to tell edlib that edit distance is not larger than k.
* Smaller k can significantly improve speed of computation.
* If edit distance is larger than k, edlib will set edit distance to -1.
* Set k to negative value and edlib will internally auto-adjust k until score is found.
*/
int k;
/**
* Alignment method.
* EDLIB_MODE_NW: global (Needleman-Wunsch)
* EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
* EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
*/
EdlibAlignMode mode;
/**
* Alignment task - tells Edlib what to calculate. Less to calculate, faster it is.
* EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
* EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
* EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
*/
EdlibAlignTask task;
} EdlibAlignConfig;
/**
* Helper method for easy construction of configuration object.
* @return Configuration object filled with given parameters.
*/
EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task);
/**
* @return Default configuration object, with following defaults:
* k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE.
*/
EdlibAlignConfig edlibDefaultAlignConfig(void);
/**
* Container for results of alignment done by edlibAlign() function.
*/
typedef struct {
/**
* -1 if k is non-negative and edit distance is larger than k.
*/
int editDistance;
/**
* Array of zero-based positions in target where optimal alignment paths end.
* If gap after query is penalized, gap counts as part of query (NW), otherwise not.
* Set to NULL if edit distance is larger than k.
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
*/
int* endLocations;
/**
* Array of zero-based positions in target where optimal alignment paths start,
* they correspond to endLocations.
* If gap before query is penalized, gap counts as part of query (NW), otherwise not.
* Set to NULL if not calculated or if edit distance is larger than k.
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
*/
int* startLocations;
/**
* Number of end (and start) locations.
*/
int numLocations;
/**
* Alignment is found for first pair of start and end locations.
* Set to NULL if not calculated.
* Alignment is sequence of numbers: 0, 1, 2, 3.
* 0 stands for match.
* 1 stands for insertion to target.
* 2 stands for insertion to query.
* 3 stands for mismatch.
* Alignment aligns query to target from begining of query till end of query.
* If gaps are not penalized, they are not in alignment.
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
*/
unsigned char* alignment;
/**
* Length of alignment.
*/
int alignmentLength;
/**
* Number of different characters in query and target together.
*/
int alphabetLength;
} EdlibAlignResult;
/**
* Container for results of alignment done by edlibAlign() function.
*/
typedef struct {
/**
* -1 if k is non-negative and edit distance is larger than k.
*/
int editDistance;
/**
* Array of zero-based positions in target where optimal alignment paths end.
* If gap after query is penalized, gap counts as part of query (NW), otherwise not.
* Set to NULL if edit distance is larger than k.
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
*/
std::vector<int> endLocations;
//int* endLocations;
/**
* Array of zero-based positions in target where optimal alignment paths start,
* they correspond to endLocations.
* If gap before query is penalized, gap counts as part of query (NW), otherwise not.
* Set to NULL if not calculated or if edit distance is larger than k.
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
*/
std::vector<int> startLocations;
//int* startLocations;
/**
* Number of end (and start) locations.
*/
int numLocations;
/**
* Alignment is found for first pair of start and end locations.
* Set to NULL if not calculated.
* Alignment is sequence of numbers: 0, 1, 2, 3.
* 0 stands for match.
* 1 stands for insertion to target.
* 2 stands for insertion to query.
* 3 stands for mismatch.
* Alignment aligns query to target from begining of query till end of query.
* If gaps are not penalized, they are not in alignment.
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
*/
std::vector<unsigned char> alignment;
//unsigned char* alignment;
/**
* Length of alignment.
*/
int alignmentLength;
/**
* Number of different characters in query and target together.
*/
int alphabetLength;
} EdlibAlignResultCustom;
/**
* Frees memory in EdlibAlignResult that was allocated by edlib.
* If you do not use it, make sure to free needed members manually using free().
*/
void edlibFreeAlignResult(EdlibAlignResult result);
/**
* Aligns two sequences (query and target) using edit distance (levenshtein distance).
* Through config parameter, this function supports different alignment methods (global, prefix, infix),
* as well as different modes of search (tasks).
* It always returns edit distance and end locations of optimal alignment in target.
* It optionally returns start locations of optimal alignment in target and alignment path,
* if you choose appropriate tasks.
* @param [in] query First sequence.
* @param [in] queryLength Number of characters in first sequence.
* @param [in] target Second sequence.
* @param [in] targetLength Number of characters in second sequence.
* @param [in] config Additional alignment parameters, like alignment method and wanted results.
* @return Result of alignment, which can contain edit distance, start and end locations and alignment path.
* Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
*/
EdlibAlignResult edlibAlign(const char* query, int queryLength,
const char* target, int targetLength,
const EdlibAlignConfig config);
/**
* Builds cigar string from given alignment sequence.
* @param [in] alignment Alignment sequence.
* 0 stands for match.
* 1 stands for insertion to target.
* 2 stands for insertion to query.
* 3 stands for mismatch.
* @param [in] alignmentLength
* @param [in] cigarFormat Cigar will be returned in specified format.
* @return Cigar string.
* I stands for insertion.
* D stands for deletion.
* X stands for mismatch. (used only in extended format)
* = stands for match. (used only in extended format)
* M stands for (mis)match. (used only in standard format)
* String is null terminated.
* Needed memory is allocated and given pointer is set to it.
* Do not forget to free it later using free()!
*/
char* edlibAlignmentToCigar(const unsigned char* alignment, int alignmentLength,
EdlibCigarFormat cigarFormat);
typedef uint64_t Word;
struct Block {
Word P; // Pvin
Word M; // Mvin
int score; // score of last cell in block;
Block() {}
Block(Word P, Word M, int score) :P(P), M(M), score(score) {}
};
class PeqTable {
public:
PeqTable();
void reset(int alphabetLength, std::vector<unsigned char>& query);
Word* getTable() { return peq_.data(); }
private:
int prevQueryLength_;
std::vector<Word> peq_;
};
class AlignerEngine {
public:
AlignerEngine();
void operator()(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength,
const EdlibAlignConfig config);
const EdlibAlignResultCustom& result() { return result_; }
private:
std::vector<unsigned char> query_;
std::vector<unsigned char> target_;
EdlibAlignResultCustom result_;
std::vector<Block> blocks_;
PeqTable peq_;
};
//#ifdef __cplusplus
//}
//#endif
#endif // EDLIB_H
#ifndef __KSW2_ALIGNER_HPP__
#define __KSW2_ALIGNER_HPP__
#include <memory>
#include <vector>
#include <cstdlib>
extern "C" {
#include "ksw2pp/kalloc.h"
#include "ksw2pp/ksw2.h"
}
namespace ksw2pp {
/**
* When we use a unique_ptr to hold a kalloc allocator, this is the
* deleter we use to call the appropriate function to free / destroy the
*allocator.
**/
class KallocDeleter {
public:
void operator()(void* p) { km_destroy(p); }
};
enum class KSW2AlignmentType : uint8_t { GLOBAL = 1, EXTENSION = 2 };
// Just like Int2Type from
// https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Int-To-Type
template <KSW2AlignmentType I> struct EnumToType {
enum { value = static_cast<uint8_t>(I) };
};
// A structure to hold the relvant parameters for the aligner
struct KSW2Config {
int8_t gapo = -1;
int8_t gape = -1;
int bandwidth = -1;
int dropoff = -1;
int flag = 0;
int alphabetSize = 5;
int end_bonus = 10;
KSW2AlignmentType atype = KSW2AlignmentType::EXTENSION;
};
class KSW2Aligner {
public:
KSW2Aligner(int8_t match = 2, int8_t mismatch = -4);
KSW2Aligner(std::vector<int8_t> mat);
int transformSequenceKSW2(const char* const queryOriginal, const int queryLength,
std::vector<unsigned char>& queryTransformed);
int transformSequencesKSW2(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength,
std::vector<unsigned char>& queryTransformed,
std::vector<unsigned char>& targetTransformed);
/**
* Variants of the operator that require both an explicit type tag to
* determine the type of alignment to perform, as well as a pointer to
* an `ksw_extz_t` structure that will be used to hold the output.
*/
int operator()(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength,
ksw_extz_t* ez, EnumToType<KSW2AlignmentType::GLOBAL>);
int operator()(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength,
ksw_extz_t* ez, EnumToType<KSW2AlignmentType::EXTENSION>);
int operator()(const uint8_t* const queryOriginal, const int queryLength,
const uint8_t* const targetOriginal, const int targetLength,
ksw_extz_t* ez, EnumToType<KSW2AlignmentType::GLOBAL>);
int operator()(const uint8_t* const queryOriginal, const int queryLength,
const uint8_t* const targetOriginal, const int targetLength,
ksw_extz_t* ez, EnumToType<KSW2AlignmentType::EXTENSION>);
/**
* Variants of the operator that do not require an output
* `ksw_extz_t*` variable. They will store the result in this object's
* `result_` variable, which can then be queried with the `result()` method.
*/
int operator()(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength,
EnumToType<KSW2AlignmentType::GLOBAL>);
int operator()(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength,
EnumToType<KSW2AlignmentType::EXTENSION>);
int operator()(const uint8_t* const queryOriginal, const int queryLength,
const uint8_t* const targetOriginal, const int targetLength,
EnumToType<KSW2AlignmentType::GLOBAL>);
int operator()(const uint8_t* const queryOriginal, const int queryLength,
const uint8_t* const targetOriginal, const int targetLength,
EnumToType<KSW2AlignmentType::EXTENSION>);
/**
* Variants of the operator that require neither an output
* `ksw_extz_t*` variable or an explicit type tag to select the type of
*alignment
* to perform. These variants will perform whichever type of alignment is
* currently specified in the `atype` member of this class's `config_`
* object.
**/
int operator()(const char* const queryOriginal, const int queryLength,
const char* const targetOriginal, const int targetLength);
int operator()(const uint8_t* const queryOriginal, const int queryLength,
const uint8_t* const targetOriginal, const int targetLength);
KSW2Config& config() { return config_; }
const ksw_extz_t& result() { return result_; }
void freeCIGAR(ksw_extz_t* ez) {
if (ez->cigar and kalloc_allocator_) {
kfree(kalloc_allocator_.get(), ez->cigar);
}
}
private:
std::vector<uint8_t> query_;
std::vector<uint8_t> target_;
ksw_extz_t result_;
std::unique_ptr<void, KallocDeleter> kalloc_allocator_{nullptr,
KallocDeleter()};
std::vector<int8_t> mat_;
KSW2Config config_;
bool haveSSE41{false};
bool haveSSE2{false};
};
} // namespace ksw2pp
#endif //__KSW2_ALIGNER_HPP__
#ifndef _KALLOC_H_
#define _KALLOC_H_
#include <stdlib.h>
#define km_size(x) (*(((size_t*)(x))-1) * sizeof(size_t))
#ifdef __cplusplus
extern "C" {
#endif
void *kmalloc(void *km, size_t size);
void *krealloc(void *km, void *ptr, size_t size);
void *kcalloc(void *km, size_t count, size_t size);
void kfree(void *km, void *ptr);
void *km_init(void);
void km_destroy(void *km);
void km_stat(const void *km); // TODO: return numbers instead of print to stderr
#ifdef __cplusplus
}
#endif
#endif
#ifndef KSW2_H_
#define KSW2_H_
#include <stdint.h>
#define KSW_NEG_INF -0x40000000
#define KSW_EZ_SCORE_ONLY 0x01 // don't record alignment path/cigar
#define KSW_EZ_RIGHT 0x02 // right-align gaps
#define KSW_EZ_GENERIC_SC 0x04 // without this flag: match/mismatch only; last symbol is a wildcard
#define KSW_EZ_APPROX_MAX 0x08 // approximate max; this is faster with sse
#define KSW_EZ_APPROX_DROP 0x10 // approximate Z-drop; faster with sse
#define KSW_EZ_EXTZ_ONLY 0x40 // only perform extension
#define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output
#define KSW_EZ_SPLICE_FOR 0x100
#define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
uint32_t max:31, zdropped:1;
int max_q, max_t; // max extension coordinate
int mqe, mqe_t; // max score when reaching the end of query
int mte, mte_q; // max score when reaching the end of target
int score; // max score reaching both ends; may be KSW_NEG_INF
int m_cigar, n_cigar;
int reach_end;
uint32_t *cigar;
} ksw_extz_t;
/**
* NW-like extension
*
* @param km memory pool, when used with kalloc
* @param qlen query length
* @param query query sequence with 0 <= query[i] < m
* @param tlen target length
* @param target target sequence with 0 <= target[i] < m
* @param m number of residue types
* @param mat m*m scoring mattrix in one-dimension array
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
* @param gape gap extension penalty
* @param w band width (<0 to disable)
* @param zdrop off-diagonal drop-off to stop extension (positive; <0 to disable)
* @param flag flag (see KSW_EZ_* macros)
* @param ez (out) scores and cigar
*/
void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extz2_sse(/*unsigned int simd, */void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
void ksw_extz2_sse41(/*unsigned int simd, */void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
void ksw_extz2_sse2(/*unsigned int simd, */void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extd2_sse(/*unsigned int simd,*/ void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
void ksw_extd2_sse41(/*unsigned int simd,*/ void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
void ksw_extd2_sse2(/*unsigned int simd,*/ void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
void ksw_exts2_sse(/*unsigned int simd,*/void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
void ksw_exts2_sse41(/*unsigned int simd,*/void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
void ksw_exts2_sse2(/*unsigned int simd,*/void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extf2_sse(/*unsigned int simd,*/void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
void ksw_extf2_sse41(/*unsigned int simd,*/void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
void ksw_extf2_sse2(/*unsigned int simd,*/void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
/**
* Global alignment
*
* (first 10 parameters identical to ksw_extz_sse())
* @param m_cigar (modified) max CIGAR length; feed 0 if cigar==0
* @param n_cigar (out) number of CIGAR elements
* @param cigar (out) BAM-encoded CIGAR; caller need to deallocate with kfree(km, )
*
* @return score of the alignment
*/
int ksw_gg(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
int ksw_gg2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat);
int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int *qe, int *te);
#ifdef __cplusplus
}
#endif
/************************************
*** Private macros and functions ***
************************************/
#ifdef HAVE_KALLOC
#include "kalloc.h"
#else
#include <stdlib.h>
#define kmalloc(km, size) malloc((size))
#define kcalloc(km, count, size) calloc((count), (size))
#define krealloc(km, ptr, size) realloc((ptr), (size))
#define kfree(km, ptr) free((ptr))
#endif
static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len)
{
if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) {
if (*n_cigar == *m_cigar) {
*m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
cigar = (uint32_t*)krealloc(km, cigar, (*m_cigar) << 2);
}
cigar[(*n_cigar)++] = len<<4 | op;
} else cigar[(*n_cigar)-1] += len<<4;
return cigar;
}
// In the backtrack matrix, value p[] has the following structure:
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intron_len, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
{ // p[] - lower 3 bits: which type gets the max; bit
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
uint32_t *cigar = *cigar_, tmp;
while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check
int force_state = -1;
if (is_rot) {
r = i + j;
if (i < off[r]) force_state = 2;
if (off_end && i > off_end[r]) force_state = 1;
tmp = force_state < 0? p[r * n_col + i - off[r]] : 0;
} else {
if (j < off[i]) force_state = 2;
if (off_end && j > off_end[i]) force_state = 1;
tmp = force_state < 0? p[i * n_col + j - off[i]] : 0;
}
if (state == 0) state = tmp & 7; // if requesting the H state, find state one maximizes it.
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
if (force_state >= 0) state = force_state;
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
}
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion
if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
if (!is_rev)
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
*m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar;
}
static inline void ksw_reset_extz(ksw_extz_t *ez)
{
ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1;
ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF;
ez->n_cigar = 0, ez->zdropped = 0, ez->reach_end = 0;
}
static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e)
{
int r, t;
if (is_rot) r = a, t = b;
else r = a + b, t = a;
if (H > (int32_t)ez->max) {
ez->max = H, ez->max_t = t, ez->max_q = r - t;
} else if (t >= ez->max_t && r - t >= ez->max_q) {
int tl = t - ez->max_t, ql = (r - t) - ez->max_q, l;
l = tl > ql? tl - ql : ql - tl;
if (zdrop >= 0 && ez->max - H > zdrop + l * e) {
ez->zdropped = 1;
return 1;
}
}
return 0;
}
#endif
// metrohash.h
//
// The MIT License (MIT)
//
// Copyright (c) 2015 J. Andrew Rogers
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#ifndef METROHASH_METROHASH_H
#define METROHASH_METROHASH_H
#include "metrohash64.h"
#include "metrohash128.h"
#include "metrohash128crc.h"
#endif // #ifndef METROHASH_METROHASH_H