Skip to content
Commits on Source (5)
{
"configurations": [
{
"name": "Mac",
"includePath": [
"/usr/local/include",
"/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1",
"/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/9.0.0/include",
"/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include",
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.13.sdk/usr/include",
"${workspaceRoot}",
"${workspaceRoot}/include"
],
"defines": [],
"intelliSenseMode": "clang-x64",
"browse": {
"path": [
"/usr/local/include",
"/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1",
"/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/9.0.0/include",
"/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include",
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.13.sdk/usr/include",
"${workspaceRoot}"
],
"limitSymbolsToIncludedHeaders": true,
"databaseFilename": ""
},
"macFrameworkPath": [
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.13.sdk/System/Library/Frameworks"
]
},
{
"name": "Linux",
"includePath": [
"/usr/include",
"/usr/local/include",
"${workspaceRoot}"
],
"defines": [],
"intelliSenseMode": "clang-x64",
"browse": {
"path": [
"/usr/include",
"/usr/local/include",
"${workspaceRoot}"
],
"limitSymbolsToIncludedHeaders": true,
"databaseFilename": ""
}
},
{
"name": "Win32",
"includePath": [
"C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/include",
"${workspaceRoot}"
],
"defines": [
"_DEBUG",
"UNICODE"
],
"intelliSenseMode": "msvc-x64",
"browse": {
"path": [
"C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/include/*",
"${workspaceRoot}"
],
"limitSymbolsToIncludedHeaders": true,
"databaseFilename": ""
}
}
],
"version": 3
}
\ No newline at end of file
cmake_minimum_required (VERSION 2.8)
cmake_minimum_required (VERSION 3.1)
enable_testing()
project (RapMap)
set(CPACK_PACKAGE_VERSION "0.5.0")
set(CPACK_PACKAGE_VERSION "0.6.0")
SET(CPACK_PACKAGE_VERSION_MAJOR "0")
set(CPACK_PACKAGE_VERSION_MINOR "5")
set(CPACK_PACKAGE_VERSION_MINOR "6")
set(CPACK_PACKAGE_VERSION_PATCH "0")
set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_GENERATOR "TGZ")
set(CPACK_SOURCE_GENERATOR "TGZ")
set(CPACK_PACKAGE_VENDOR "Stony Brook University")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "RapMap - Wicked-fast qasi-mapping")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "RapMap - Wicked-fast quasi-mapping")
set(CPACK_PACKAGE_NAME
"${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
set(CPACK_SOURCE_PACKAGE_FILE_NAME
......@@ -46,10 +46,10 @@ endif()
if (QUIET_BUILD)
set(WALL "")
else()
set(WALL "-Wall")
set(WALL "-Wall -Wno-unused-function -Wno-unused-local-typedef")
endif()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -funroll-loops -fPIC -fomit-frame-pointer -O4 -DHAVE_ANSI_TERM ${WALL} -std=c++11")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -funroll-loops -fPIC -fomit-frame-pointer -O4 -DHAVE_ANSI_TERM ${WALL} -std=c++14")
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
find_package(Threads REQUIRED)
......@@ -89,11 +89,11 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
execute_process(
COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
# If we're on OSX
if (APPLE AND NOT (GCC_VERSION VERSION_GREATER 4.8.2 OR GCC_VERSION VERSION_EQUAL 4.8.2))
if (APPLE AND NOT (GCC_VERSION VERSION_GREATER 5.2 OR GCC_VERSION VERSION_EQUAL 5.2))
message(FATAL_ERROR "When building under OSX, ${PROJECT_NAME} requires "
"either clang or g++ >= 4.8.2")
elseif (NOT (GCC_VERSION VERSION_GREATER 4.7 OR GCC_VERSION VERSION_EQUAL 4.7))
message(FATAL_ERROR "${PROJECT_NAME} requires g++ 4.7 or greater.")
"either clang or g++ >= 5.2")
elseif (NOT (GCC_VERSION VERSION_GREATER 5.2 OR GCC_VERSION VERSION_EQUAL 5.2))
message(FATAL_ERROR "${PROJECT_NAME} requires g++ 5.2 or greater.")
endif ()
set (GCC TRUE)
......@@ -134,7 +134,7 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set (PTHREAD_LIB "pthread")
endif()
else ()
message(FATAL_ERROR "Your C++ compiler does not support C++11.")
message(FATAL_ERROR "Your C++ compiler does not support C++14.")
endif ()
include(ExternalProject)
......@@ -209,50 +209,29 @@ if (NOT CEREAL_FOUND)
endif()
if (NOT JELLYFISH_ROOT)
set(JELLYFISH_ROOT ${GAT_SOURCE_DIR}/external/install)
endif()
find_package(Jellyfish 2.2.6)
if (NOT JELLYFISH_FOUND)
message("Build system will fetch and build Jellyfish")
message("==================================================================")
ExternalProject_Add(libjellyfish
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
DOWNLOAD_COMMAND curl -k -L https://github.com/gmarcais/Jellyfish/releases/download/v2.2.6/jellyfish-2.2.6.tar.gz -o jellyfish-2.2.6.tgz &&
rm -fr jellyfish-2.2.6 &&
tar -xzvf jellyfish-2.2.6.tgz
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.6
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.6/configure --enable-shared=no --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
BUILD_COMMAND make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
BUILD_IN_SOURCE 1
INSTALL_COMMAND make install
)
endif()
set (FAST_MALLOC_LIB "")
set (HAVE_FAST_MALLOC FALSE)
# See if we have Jemalloc
find_package(Jemalloc)
if (Jemalloc_FOUND)
##
# Don't be so stringent about the version yet
##
#if (NOT (${JEMALLOC_VERSION} VERSION_LESS 5.1.0))
message("Found Jemalloc library --- using this memory allocator")
set (FAST_MALLOC_LIB ${JEMALLOC_LIBRARIES})
set (HAVE_FAST_MALLOC TRUE)
#else()
# message("Fond Jemalloc version ${JEMALLOC_VERSION}, but require >= 5.1.0. Downloading newer version")
#endif()
endif()
if (NOT HAVE_FAST_MALLOC)
# See if we have Tcmalloc
find_package(Tcmalloc)
if (Tcmalloc_FOUND)
message("Fount TCMalloc library --- using this memory allocator")
set (TCMALLOC_LIB ${Tcmalloc_LIBRARIES})
set (FAST_MALLOC_LIB ${TCMALLOC_LIB})
set (HAVE_FAST_MALLOC TRUE)
endif()
if(CONDA_BUILD)
set (JEMALLOC_FLAGS "CC=${CMAKE_C_COMPILER} CFLAGS=-fPIC CPPFLAGS=-fPIC")
else ()
set (JEMALLOC_FLAGS "CC=${CMAKE_C_COMPILER}")
endif()
if (NOT HAVE_FAST_MALLOC)
......
......@@ -28,19 +28,26 @@ find_library(JEMALLOC_LIBRARY NAMES jemalloc libjemalloc
${PC_JEMALLOC_LIBRARY_DIRS}
PATH_SUFFIXES lib lib64)
set(JEMALLOC_LIBRARIES ${JEMALLOC_LIBRARY})
set(JEMALLOC_INCLUDE_DIRS ${JEMALLOC_INCLUDE_DIR})
if(JEMALLOC_INCLUDE_DIR)
set(_version_regex "^#define[ \t]+JEMALLOC_VERSION[ \t]+\"([^\"]+)\".*")
file(STRINGS "${JEMALLOC_INCLUDE_DIR}/jemalloc/jemalloc.h"
JEMALLOC_VERSION REGEX "${_version_regex}")
string(REGEX REPLACE "${_version_regex}" "\\1"
JEMALLOC_VERSION "${JEMALLOC_VERSION}")
unset(_version_regex)
endif()
find_package_handle_standard_args(Jemalloc DEFAULT_MSG
JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set JEMALLOC_FOUND to TRUE
# if all listed variables are TRUE and the requested version matches.
find_package_handle_standard_args(Jemalloc REQUIRED_VARS
JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR
VERSION_VAR JEMALLOC_VERSION)
get_property(_type CACHE JEMALLOC_ROOT PROPERTY TYPE)
if(_type)
set_property(CACHE JEMALLOC_ROOT PROPERTY ADVANCED 1)
if("x${_type}" STREQUAL "xUNINITIALIZED")
set_property(CACHE JEMALLOC_ROOT PROPERTY TYPE PATH)
endif()
endif()
mark_as_advanced(JEMALLOC_ROOT JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
if(JEMALLOC_FOUND)
set(JEMALLOC_LIBRARIES ${JEMALLOC_LIBRARY})
set(JEMALLOC_INCLUDE_DIRS ${JEMALLOC_INCLUDE_DIR})
endif()
mark_as_advanced(JEMALLOC_INCLUDE_DIR JEMALLOC_LIBRARY)
rapmap (0.5.0+dfsg-4) UNRELEASED; urgency=medium
rapmap (0.12.0+dfsg-1) UNRELEASED; urgency=medium
* For some reason rapmap releases are prefixed by "salmon-" - adapt
watch file
* New upstream version
-- Andreas Tille <tille@debian.org> Wed, 12 Dec 2018 20:47:27 +0100
rapmap (0.5.0+dfsg-4) unstable; urgency=medium
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.2.1
* Remove trailing whitespace in debian/rules
-- Andreas Tille <tille@debian.org> Fri, 28 Sep 2018 10:36:40 +0200
-- Andreas Tille <tille@debian.org> Tue, 11 Dec 2018 19:47:36 +0100
rapmap (0.5.0+dfsg-3) unstable; urgency=medium
......
Author: Gert Wollny <gewo@debian.org>
Description: Correct instanziation of consoleSink for new spdlog version
Debian-Bug: https://bugs.debian.org/884218
--- a/src/RapMapSAMapper.cpp
+++ b/src/RapMapSAMapper.cpp
@@ -576,7 +576,7 @@
auto rawConsoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
auto consoleSink =
- std::make_shared<spdlog::sinks::ansicolor_sink>(rawConsoleSink);
+ std::make_shared<spdlog::sinks::ansicolor_stderr_sink_mt>();
auto consoleLog = spdlog::create("stderrLog", {consoleSink});
try {
use-debian-libs.patch
use_shared_libs.patch
libspdlog_14_0.patch
......@@ -2,10 +2,8 @@ Description: make RapMap use Debian's versions of dependencies
This involves disabling downloads and adjusting API usage to the versions
packaged in Debian.
Author: Sascha Steinbiss <sascha@steinbiss.name>
Index: rapmap/CMakeLists.txt
===================================================================
--- rapmap.orig/CMakeLists.txt
+++ rapmap/CMakeLists.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,20 +168,6 @@ if (NOT ZLIB_FOUND)
endif()
......@@ -27,34 +25,8 @@ Index: rapmap/CMakeLists.txt
if (NOT CEREAL_ROOT)
set(CEREAL_ROOT ${GAT_SOURCE_DIR}/external/install)
endif()
@@ -214,25 +200,6 @@ if (NOT JELLYFISH_ROOT)
set(JELLYFISH_ROOT ${GAT_SOURCE_DIR}/external/install)
endif()
-find_package(Jellyfish 2.2.6)
-
-if (NOT JELLYFISH_FOUND)
-message("Build system will fetch and build Jellyfish")
-message("==================================================================")
-ExternalProject_Add(libjellyfish
- DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
- DOWNLOAD_COMMAND curl -k -L https://github.com/gmarcais/Jellyfish/releases/download/v2.2.6/jellyfish-2.2.6.tar.gz -o jellyfish-2.2.6.tgz &&
- rm -fr jellyfish-2.2.6 &&
- tar -xzvf jellyfish-2.2.6.tgz
- SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.6
- INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
- CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.6/configure --enable-shared=no --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
- BUILD_COMMAND make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
- BUILD_IN_SOURCE 1
- INSTALL_COMMAND make install
-)
-endif()
-
set (FAST_MALLOC_LIB "")
set (HAVE_FAST_MALLOC FALSE)
@@ -255,24 +222,6 @@ if (NOT HAVE_FAST_MALLOC)
endif()
@@ -234,24 +220,6 @@ else ()
set (JEMALLOC_FLAGS "CC=${CMAKE_C_COMPILER}")
endif()
-if (NOT HAVE_FAST_MALLOC)
......@@ -78,23 +50,16 @@ Index: rapmap/CMakeLists.txt
###
#
# Done building external dependencies.
Index: rapmap/src/CMakeLists.txt
===================================================================
--- rapmap.orig/src/CMakeLists.txt
+++ rapmap/src/CMakeLists.txt
@@ -84,9 +84,12 @@ set (SUFFARRAY64_LIB ${GAT_SOURCE_DIR}/e
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -61,8 +61,8 @@ endif()
add_executable(rapmap ${RAPMAP_MAIN_SRCS})
# our suffix array construction libraries
-set (SUFFARRAY_LIB ${GAT_SOURCE_DIR}/external/install/lib/libdivsufsort.a)
-set (SUFFARRAY64_LIB ${GAT_SOURCE_DIR}/external/install/lib/libdivsufsort64.a)
+set (SUFFARRAY_LIB -ldivsufsort)
+set (SUFFARRAY64_LIB ldivsufsort64)
# Link the executable
target_link_libraries(rapmap
# ${PTHREAD_LIB}
${ZLIB_LIBRARY}
- ${SUFFARRAY_LIB}
- ${SUFFARRAY64_LIB}
- ${GAT_SOURCE_DIR}/external/install/lib/libjellyfish-2.0.a
+ #${SUFFARRAY_LIB}
+ #${SUFFARRAY64_LIB}
+ #${GAT_SOURCE_DIR}/external/install/lib/libjellyfish-2.0.a
+ divsufsort
+ divsufsort64
+ jellyfish-2.0
m
#${LIBLZMA_LIBRARIES}
${NON_APPLECLANG_LIBS}
version=4
opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
https://github.com/COMBINE-lab/RapMap/releases .*/archive/v(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
https://github.com/COMBINE-lab/RapMap/releases .*/archive/[salmon-]*v?(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
......@@ -2,9 +2,7 @@
// intended to be a minimal perfect hash function with fast and low memory construction, at the cost of (slightly) higher bits/elem than other state of the art libraries once built.
// should work with arbitray large number of elements, based on a cascade of "collision-free" bit arrays
#ifndef __BOO_PHF__
#define __BOO_PHF__
#pragma once
#include <stdio.h>
#include <climits>
#include <stdlib.h>
......@@ -18,15 +16,158 @@
#include <sys/time.h>
#include <string.h>
#include <memory> // for make_shared
#include <unistd.h>
namespace boomphf {
inline u_int64_t printPt( pthread_t pt) {
unsigned char *ptc = (unsigned char*)(void*)(&pt);
u_int64_t res =0;
for (size_t i=0; i<sizeof(pt); i++) {
res+= (unsigned)(ptc[i]);
}
return res;
}
////////////////////////////////////////////////////////////////
#pragma mark -
#pragma mark utils
////////////////////////////////////////////////////////////////
// iterator from disk file of u_int64_t with buffered read, todo template
template <typename basetype>
class bfile_iterator : public std::iterator<std::forward_iterator_tag, basetype>{
public:
bfile_iterator()
: _is(nullptr)
, _pos(0) ,_inbuff (0), _cptread(0)
{
_buffsize = 10000;
_buffer = (basetype *) malloc(_buffsize*sizeof(basetype));
}
bfile_iterator(const bfile_iterator& cr)
{
_buffsize = cr._buffsize;
_pos = cr._pos;
_is = cr._is;
_buffer = (basetype *) malloc(_buffsize*sizeof(basetype));
memcpy(_buffer,cr._buffer,_buffsize*sizeof(basetype) );
_inbuff = cr._inbuff;
_cptread = cr._cptread;
_elem = cr._elem;
}
bfile_iterator(FILE* is): _is(is) , _pos(0) ,_inbuff (0), _cptread(0)
{
//printf("bf it %p\n",_is);
_buffsize = 10000;
_buffer = (basetype *) malloc(_buffsize*sizeof(basetype));
int reso = fseek(_is,0,SEEK_SET);
(void)reso;
advance();
}
~bfile_iterator()
{
if(_buffer!=NULL)
free(_buffer);
}
basetype const& operator*() { return _elem; }
bfile_iterator& operator++()
{
advance();
return *this;
}
friend bool operator==(bfile_iterator const& lhs, bfile_iterator const& rhs)
{
if (!lhs._is || !rhs._is) { if (!lhs._is && !rhs._is) { return true; } else { return false; } }
assert(lhs._is == rhs._is);
return rhs._pos == lhs._pos;
}
friend bool operator!=(bfile_iterator const& lhs, bfile_iterator const& rhs) { return !(lhs == rhs); }
private:
void advance()
{
//printf("_cptread %i _inbuff %i \n",_cptread,_inbuff);
_pos++;
if(_cptread >= _inbuff)
{
int res = fread(_buffer,sizeof(basetype),_buffsize,_is);
//printf("read %i new elem last %llu %p\n",res,_buffer[res-1],_is);
_inbuff = res; _cptread = 0;
if(res == 0)
{
_is = nullptr;
_pos = 0;
return;
}
}
_elem = _buffer[_cptread];
_cptread ++;
}
basetype _elem;
FILE * _is;
unsigned long _pos;
basetype * _buffer; // for buffered read
int _inbuff, _cptread;
int _buffsize;
};
template <typename type_elem>
class file_binary{
public:
file_binary(const char* filename)
{
_is = fopen(filename, "rb");
if (!_is) {
throw std::invalid_argument("Error opening " + std::string(filename));
}
}
~file_binary()
{
fclose(_is);
}
bfile_iterator<type_elem> begin() const
{
return bfile_iterator<type_elem>(_is);
}
bfile_iterator<type_elem> end() const {return bfile_iterator<type_elem>(); }
size_t size () const { return 0; }//todo ?
private:
FILE * _is;
};
inline unsigned int popcount_32(unsigned int x)
{
unsigned int m1 = 0x55555555;
......@@ -112,7 +253,7 @@ namespace boomphf {
void finish_threaded()// called by only one of the threads
{
done = 0;
double rem = 0;
//double rem = 0;
for (int ii=0; ii<_nthreads;ii++) done += (done_threaded[ii] );
for (int ii=0; ii<_nthreads;ii++) partial += (partial_threaded[ii] );
......@@ -533,20 +674,20 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
//for debug purposes
void print() const
{
printf("bit array of size %lu: \n",_size);
printf("bit array of size %llu: \n", static_cast<unsigned long long>(_size));
for(uint64_t ii = 0; ii< _size; ii++)
{
if(ii%10==0)
printf(" (%lu) ",ii);
printf(" (%llu) ", static_cast<unsigned long long>(ii));
int val = (_bitArray[ii >> 6] >> (ii & 63 ) ) & 1;
printf("%i",val);
}
printf("\n");
printf("rank array : size %lu \n",_ranks.size());
printf("rank array : size %llu \n", static_cast<unsigned long long>(_ranks.size()));
for (uint64_t ii = 0; ii< _ranks.size(); ii++)
{
printf("%lu : %lu, ",ii,_ranks[ii]);
printf("%llu : %lli, ",static_cast<unsigned long long>(ii), static_cast<long long int>(_ranks[ii]));
}
printf("\n");
}
......@@ -554,7 +695,11 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
//return value at pos
uint64_t operator[](uint64_t pos) const
{
//unsigned char * _bitArray8 = (unsigned char *) _bitArray;
//return (_bitArray8[pos >> 3ULL] >> (pos & 7 ) ) & 1;
return (_bitArray[pos >> 6ULL] >> (pos & 63 ) ) & 1;
}
//atomically return old val and set to 1
......@@ -624,6 +769,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
}
void save(std::ostream& os) const
{
os.write(reinterpret_cast<char const*>(&_size), sizeof(_size));
......@@ -665,6 +811,14 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
#pragma mark level
////////////////////////////////////////////////////////////////
static inline uint64_t fastrange64(uint64_t word, uint64_t p) {
//return word % p;
return (uint64_t)(((__uint128_t)word * (__uint128_t)p) >> 64);
}
class level{
public:
level(){ }
......@@ -674,7 +828,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
uint64_t get(uint64_t hash_raw)
{
uint64_t hashi = hash_raw % hash_domain;
// uint64_t hashi = hash_raw % hash_domain; //
//uint64_t hashi = (uint64_t)( ((__uint128_t) hash_raw * (__uint128_t) hash_domain) >> 64ULL);
uint64_t hashi = fastrange64(hash_raw,hash_domain);
return bitset.get(hashi);
}
......@@ -691,6 +847,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
#define NBBUFF 10000
//#define NBBUFF 2
template<typename Range,typename Iterator>
struct thread_args
......@@ -731,24 +888,51 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
// allow perc_elem_loaded elements to be loaded in ram for faster construction (default 3%), set to 0 to desactivate
template <typename Range>
mphf( size_t n, Range const& input_range,int num_thread = 1, double gamma = 2.0 , bool progress =true, float perc_elem_loaded = 0.03) :
mphf( size_t n, Range const& input_range,int num_thread = 1, double gamma = 2.0 , bool writeEach = true, bool progress =true, float perc_elem_loaded = 0.03) :
_gamma(gamma), _hash_domain(size_t(ceil(double(n) * gamma))), _nelem(n), _num_thread(num_thread), _percent_elem_loaded_for_fastMode (perc_elem_loaded), _withprogress(progress)
{
if(n ==0) return;
_fastmode = false;
if(_percent_elem_loaded_for_fastMode > 0.0 )
_fastmode =true;
if(writeEach)
{
_writeEachLevel =true;
_fastmode = false;
}
else
{
_writeEachLevel = false;
}
setup();
if(_withprogress)
{
_progressBar.timer_mode=1;
if(_fastmode)
_progressBar.init( _nelem * (_fastModeLevel+1) + ( _nelem * pow(_proba_collision,_fastModeLevel)) * (_nb_levels-(_fastModeLevel+1)) ,"Building BooPHF",num_thread);
double total_raw = _nb_levels;
double sum_geom_read = ( 1.0 / (1.0 - _proba_collision));
double total_writeEach = sum_geom_read + 1.0;
double total_fastmode_ram = (_fastModeLevel+1) + ( pow(_proba_collision,_fastModeLevel)) * (_nb_levels-(_fastModeLevel+1)) ;
printf("for info, total work write each : %.3f total work inram from level %i : %.3f total work raw : %.3f \n",total_writeEach,_fastModeLevel,total_fastmode_ram,total_raw);
if(writeEach)
{
_progressBar.init(_nelem * total_writeEach,"Building BooPHF",num_thread);
}
else if(_fastmode)
_progressBar.init( _nelem * total_fastmode_ram ,"Building BooPHF",num_thread);
else
_progressBar.init( _nelem * _nb_levels ,"Building BooPHF",num_thread);
}
......@@ -792,7 +976,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
uint64_t non_minimal_hp,minimal_hp;
hash_pair_t bbhash; int level;
hash_pair_t bbhash{}; int level;
uint64_t level_hash = getLevel(bbhash,elem,&level);
if( level == (_nb_levels-1))
......@@ -806,6 +990,8 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
else
{
minimal_hp = in_final_map->second + _lastbitsetrank;
//printf("lookup %llu level %i --> %llu \n",elem,level,minimal_hp);
return minimal_hp;
}
// minimal_hp = _final_hash[elem] + _lastbitsetrank;
......@@ -813,10 +999,11 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
}
else
{
non_minimal_hp = level_hash % _levels[level].hash_domain; // in fact non minimal hp would be + _levels[level]->idx_begin
//non_minimal_hp = level_hash % _levels[level].hash_domain; // in fact non minimal hp would be + _levels[level]->idx_begin
non_minimal_hp = fastrange64(level_hash,_levels[level].hash_domain);
}
minimal_hp = _levels[level].bitset.rank(non_minimal_hp );
// printf("lookup %llu level %i --> %llu \n",elem,level,minimal_hp);
return minimal_hp;
}
......@@ -853,6 +1040,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
auto until = *until_p;
uint64_t inbuff =0;
uint64_t writebuff =0;
std::vector< elem_t > & myWriteBuff = bufferperThread[tid];
for (bool isRunning=true; isRunning ; )
{
......@@ -868,17 +1058,33 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
//do work on the n elems of the buffer
// printf("filling input buff \n");
for(uint64_t ii=0; ii<inbuff ; ii++)
{
elem_t val = buffer[ii];
//printf("processing %llu level %i\n",val, i);
//auto hashes = _hasher(val);
hash_pair_t bbhash; int level;
uint64_t level_hash = getLevel(bbhash,val,&level, i);
hash_pair_t bbhash{}; int level;
uint64_t level_hash;
if(_writeEachLevel)
getLevel(bbhash,val,&level, i,i-1);
else
getLevel(bbhash,val,&level, i);
//uint64_t level_hash = getLevel(bbhash,val,&level, i);
//__sync_fetch_and_add(& _cptTotalProcessed,1);
if(level == i) //insert into lvl i
{
__sync_fetch_and_add(& _cptLevel,1);
// __sync_fetch_and_add(& _cptLevel,1);
if(_fastmode && i == _fastModeLevel)
{
......@@ -905,6 +1111,30 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
else
{
//ils ont reach ce level
//insert elem into curr level on disk --> sera utilise au level+1 , (mais encore besoin filtre)
if(_writeEachLevel && i > 0 && i < _nb_levels -1)
{
if(writebuff>=NBBUFF)
{
//flush buffer
flockfile(_currlevelFile);
fwrite(myWriteBuff.data(),sizeof(elem_t),writebuff,_currlevelFile);
funlockfile(_currlevelFile);
writebuff = 0;
}
myWriteBuff[writebuff++] = val;
}
//computes next hash
if ( level == 0)
......@@ -927,6 +1157,15 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
inbuff = 0;
}
if(_writeEachLevel && writebuff>0)
{
//flush buffer
flockfile(_currlevelFile);
fwrite(myWriteBuff.data(),sizeof(elem_t),writebuff,_currlevelFile);
funlockfile(_currlevelFile);
writebuff = 0;
}
}
......@@ -1017,13 +1256,30 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
{
pthread_mutex_init(&_mutex, NULL);
_pid = getpid() + printPt(pthread_self()) ;// + pthread_self();
//printf("pt self %llu pid %i \n",printPt(pthread_self()),_pid);
_cptTotalProcessed=0;
if(_fastmode)
{
setLevelFastmode.resize(_percent_elem_loaded_for_fastMode * (double)_nelem );
}
bufferperThread.resize(_num_thread);
if(_writeEachLevel)
{
for(int ii=0; ii<_num_thread; ii++)
{
bufferperThread[ii].resize(NBBUFF);
}
}
_proba_collision = 1.0 - pow(((_gamma*(double)_nelem -1 ) / (_gamma*(double)_nelem)),_nelem-1);
double sum_geom =_gamma * ( 1.0 + _proba_collision / (1.0 - _proba_collision));
//double sum_geom =_gamma * ( 1.0 + _proba_collision / (1.0 - _proba_collision));
//printf("proba collision %f sum_geom %f \n",_proba_collision,sum_geom);
_nb_levels = 25;
......@@ -1059,7 +1315,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
//compute level and returns hash of last level reached
uint64_t getLevel(hash_pair_t & bbhash, elem_t val,int * res_level, int maxlevel = 100)
uint64_t getLevel(hash_pair_t & bbhash, elem_t val,int * res_level, int maxlevel = 100, int minlevel =0)
//uint64_t getLevel(hash_pair_t & bbhash, elem_t val,int * res_level, int maxlevel = 100, int minlevel =0)
{
int level = 0;
uint64_t hash_raw=0;
......@@ -1078,7 +1336,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
}
if( _levels[ii].get(hash_raw) )
if( ii >= minlevel && _levels[ii].get(hash_raw) ) //
//if( _levels[ii].get(hash_raw) ) //
{
break;
}
......@@ -1094,7 +1354,8 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
//insert into bitarray
void insertIntoLevel(uint64_t level_hash, int i)
{
uint64_t hashl = level_hash % _levels[i].hash_domain;
// uint64_t hashl = level_hash % _levels[i].hash_domain;
uint64_t hashl = fastrange64( level_hash,_levels[i].hash_domain);
if( _levels[i].bitset.atomic_test_and_set(hashl) )
{
......@@ -1111,6 +1372,33 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
////alloc the bitset for this level
_levels[i].bitset = bitVector(_levels[i].hash_domain); ;
//printf("---process level %i wr %i fast %i ---\n",i,_writeEachLevel,_fastmode);
char fname_old[1000];
sprintf(fname_old,"temp_p%i_level_%i",_pid,i-2);
char fname_curr[1000];
sprintf(fname_curr,"temp_p%i_level_%i",_pid,i);
char fname_prev[1000];
sprintf(fname_prev,"temp_p%i_level_%i",_pid,i-1);
if(_writeEachLevel)
{
//file management :
if(i>2) //delete previous file
{
unlink(fname_old);
}
if(i< _nb_levels-1 && i > 0 ) //create curr file
{
_currlevelFile = fopen(fname_curr,"w");
}
}
_cptLevel = 0;
_hashidx = 0;
_idxLevelsetLevelFastmode =0;
......@@ -1124,19 +1412,51 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
t_arg.it_p = std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.begin()));
t_arg.until_p = std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.end()));
t_arg.level = i;
if(_writeEachLevel && (i > 1))
{
auto data_iterator_level = file_binary<elem_t>(fname_prev);
typedef decltype(data_iterator_level.begin()) disklevel_it_type;
//data_iterator_level.begin();
t_arg.it_p = std::static_pointer_cast<void>(std::make_shared<disklevel_it_type>(data_iterator_level.begin()));
t_arg.until_p = std::static_pointer_cast<void>(std::make_shared<disklevel_it_type>(data_iterator_level.end()));
for(int ii=0;ii<_num_thread;ii++)
pthread_create (&tab_threads[ii], NULL, thread_processLevel<elem_t, Hasher_t, Range, disklevel_it_type>, &t_arg); //&t_arg[ii]
//must join here before the block is closed and file_binary is destroyed (and closes the file)
for(int ii=0;ii<_num_thread;ii++)
{
pthread_join(tab_threads[ii], NULL);
}
}
else
{
if(_fastmode && i >= (_fastModeLevel+1))
{
auto data_iterator = boomphf::range(static_cast<const elem_t*>( &setLevelFastmode[0]), static_cast<const elem_t*>( (&setLevelFastmode[0]) +setLevelFastmode.size()));
typedef decltype(data_iterator.begin()) fastmode_it_type;
t_arg.it_p = std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(data_iterator.begin()));
t_arg.until_p = std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(data_iterator.end()));
/* we'd like to do t_arg.it = data_iterator.begin() but types are different;
so, casting to (void*) because of that; and we remember the type in the template */
typedef decltype(setLevelFastmode.begin()) fastmode_it_type;
t_arg.it_p = std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(setLevelFastmode.begin()));
t_arg.until_p = std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(setLevelFastmode.end()));
/* we'd like to do t_arg.it = data_iterator.begin() but types are different;
so, casting to (void*) because of that; and we remember the type in the template */
for(int ii=0;ii<_num_thread;ii++)
pthread_create (&tab_threads[ii], NULL, thread_processLevel<elem_t, Hasher_t, Range, fastmode_it_type>, &t_arg); //&t_arg[ii]
}
else
{
......@@ -1148,14 +1468,31 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
{
pthread_join(tab_threads[ii], NULL);
}
}
//printf("\ngoing to level %i : %llu elems %.2f %% expected : %.2f %% \n",i,_cptLevel,100.0* _cptLevel/(float)_nelem,100.0* pow(_proba_collision,i) );
//printf("\ncpt total processed %llu \n",_cptTotalProcessed);
if(_fastmode && i == _fastModeLevel) //shrink to actual number of elements in set
{
//printf("resize setLevelFastmode to %lli \n",_idxLevelsetLevelFastmode);
//printf("\nresize setLevelFastmode to %lli \n",_idxLevelsetLevelFastmode);
setLevelFastmode.resize(_idxLevelsetLevelFastmode);
}
delete [] tab_threads;
if(_writeEachLevel)
{
if(i< _nb_levels-1 && i>0)
{
fflush(_currlevelFile);
fclose(_currlevelFile);
}
if(i== _nb_levels- 1) //delete last file
{
unlink(fname_prev);
}
}
}
private:
......@@ -1177,14 +1514,22 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
uint64_t _lastbitsetrank;
uint64_t _idxLevelsetLevelFastmode;
uint64_t _cptLevel;
uint64_t _cptTotalProcessed;
// fast build mode , requires that _percent_elem_loaded_for_fastMode % elems are loaded in ram
float _percent_elem_loaded_for_fastMode ;
bool _fastmode;
std::vector< elem_t > setLevelFastmode;
// std::vector< elem_t > setLevelFastmode_next; // todo shrinker le set e nram a chaque niveau ?
std::vector< std::vector< elem_t > > bufferperThread;
int _fastModeLevel;
bool _withprogress;
bool _built;
bool _writeEachLevel;
FILE * _currlevelFile;
int _pid;
public:
pthread_mutex_t _mutex;
};
......@@ -1219,5 +1564,3 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
return NULL;
}
}
#endif //__BOO_PHF__
......@@ -21,7 +21,7 @@ extern "C" {
#if __cplusplus >= 201402L
#include <memory>
using std::make_unique
using std::make_unique;
#else
#include <cstddef>
......@@ -121,8 +121,10 @@ public:
FastxParser(std::vector<std::string> files, std::vector<std::string> files2,
uint32_t numConsumers, uint32_t numParsers = 1,
uint32_t chunkSize = 1000);
~FastxParser();
bool start();
bool stop();
ReadGroup<T> getReadGroup();
bool refill(ReadGroup<T>& rg);
void finishedWithGroup(ReadGroup<T>& s);
......@@ -135,7 +137,17 @@ private:
std::vector<std::string> inputStreams2_;
uint32_t numParsers_;
std::atomic<uint32_t> numParsing_;
// NOTE: Would like to use std::future<int> here instead, but that
// solution doesn't seem to work. It's unclear exactly why
// see (https://twitter.com/nomad421/status/917748383321817088)
std::vector<std::unique_ptr<std::thread>> parsingThreads_;
// holds the results of the parsing threads, which is simply equal to
// the return value of kseq_read() for the last call to that function.
// A value < -1 signifies some sort of error.
std::vector<int> threadResults_;
size_t blockSize_;
moodycamel::ConcurrentQueue<std::unique_ptr<ReadChunk<T>>> readQueue_,
seqContainerQueue_;
......@@ -145,6 +157,7 @@ private:
std::vector<std::unique_ptr<moodycamel::ProducerToken>> produceReads_;
std::vector<std::unique_ptr<moodycamel::ConsumerToken>> consumeContainers_;
bool isActive_{false};
};
}
#endif // __FASTX_PARSER__
#ifndef FASTX_PARSER_THREAD_UTILS_HPP
#define FASTX_PARSER_THREAD_UTILS_HPP
#include <cassert>
#include <thread>
#include <chrono>
#include <random>
#include <pthread.h>
// Most of this code is taken directly from https://github.com/geidav/spinlocks-bench/blob/master/os.hpp.
// However, things may be renamed, modified, or randomly mangled over time.
#define ALWAYS_INLINE inline __attribute__((__always_inline__))
namespace fastx_parser {
namespace thread_utils {
static const constexpr size_t MIN_BACKOFF_ITERS = 32;
static const size_t MAX_BACKOFF_ITERS = 1024;
ALWAYS_INLINE static void cpuRelax() {
asm("pause");
}
ALWAYS_INLINE void yieldSleep() {
using namespace std::chrono;
std::chrono::microseconds ytime(500);
std::this_thread::sleep_for(ytime);
}
ALWAYS_INLINE void backoffExp(size_t &curMaxIters) {
thread_local std::uniform_int_distribution<size_t> dist;
thread_local std::minstd_rand gen(std::random_device{}());
const size_t spinIters = dist(gen, decltype(dist)::param_type{0, curMaxIters});
curMaxIters = std::min(2*curMaxIters, MAX_BACKOFF_ITERS);
for (size_t i=0; i<spinIters; i++) { cpuRelax(); }
}
ALWAYS_INLINE void backoffOrYield(size_t& curMaxDelay) {
if (curMaxDelay >= MAX_BACKOFF_ITERS) {
yieldSleep();
curMaxDelay = MIN_BACKOFF_ITERS;
}
backoffExp(curMaxDelay);
}
}
}
#endif // FASTX_PARSER_THREAD_UTILS_HPP
......@@ -251,19 +251,19 @@ public:
inline KeyT getKmerFromInterval_(ValueT& ival) {
rapmap::utils::my_mer m;// copy the global mer to get k-mer object
m.from_chars(txtPtr_ + (*saPtr_)[ival.begin()]);
m.fromChars(txtPtr_ + (*saPtr_)[ival.begin()]);
return m.word(0);
}
// variant where we provide an existing mer object
inline KeyT getKmerFromInterval_(ValueT& ival, rapmap::utils::my_mer& m) {
m.from_chars(txtPtr_ + (*saPtr_)[ival.begin()]);
m.fromChars(txtPtr_ + (*saPtr_)[ival.begin()]);
return m.word(0);
}
// variant where we provide an existing mer object
inline KeyT getKmerFromPos_(IndexT pos, rapmap::utils::my_mer& m) {
m.from_chars(txtPtr_ + (*saPtr_)[pos]);
m.fromChars(static_cast<decltype(txtPtr_)>(txtPtr_ + (*saPtr_)[pos]));
return m.word(0);
}
......
......@@ -23,10 +23,8 @@
#define __HIT_MANAGER_HPP__
#include "RapMapUtils.hpp"
#include "RapMapIndex.hpp"
#include "RapMapSAIndex.hpp"
//#include "eytzinger_array.h"
#include "chobo/small_vector.hpp"
#include <tuple>
#include <vector>
......@@ -46,14 +44,32 @@ namespace rapmap {
template <typename T>
using SAIntervalHit = rapmap::utils::SAIntervalHit<T>;
using SAHitMap = std::map<int, rapmap::utils::ProcessedSAHit>;
using ProcessedSAHit = rapmap::utils::ProcessedSAHit;
using SAHitMap = std::map<int, ProcessedSAHit>;
template <typename SAIntervalHitT>
using SAIntervalVector = std::vector<SAIntervalHitT>;
class SAProcessedHitVec {
public:
std::vector<ProcessedSAHit> hits;
std::vector<uint32_t> txps;
};
template <typename SAIntervalHitT>
class HitCollectorInfo {
public:
void clear() {
readLen = 0;
maxDist = 0;
fwdSAInts.clear();
rcSAInts.clear();
}
size_t readLen{0};
int32_t maxDist{0};
SAIntervalVector<SAIntervalHitT> fwdSAInts;
SAIntervalVector<SAIntervalHitT> rcSAInts;
};
/*
using SAProcessedHitVec = std::tuple<std::vector<ProcessedSAHit>, std::vector<uint32_t>>;
*/
......@@ -70,9 +86,10 @@ namespace rapmap {
// match maxDist
bool collectHitsSimpleSA(SAHitMap& processedHits,
uint32_t readLen,
uint32_t maxDist,
int32_t maxDist,
std::vector<QuasiAlignment>& hits,
MateStatus mateStatus);
MateStatus mateStatus,
rapmap::utils::MappingConfig& mc);
// Return hits from processedHits where position constraints
// match maxDist
......@@ -88,13 +105,14 @@ namespace rapmap {
// entries in outHits that are labeled by the transcripts in
// which h2 appears will have an iterator to the beginning of
// the position list for h2.
void intersectWithOutput(HitInfo& h2, RapMapIndex& rmi,
std::vector<ProcessedHit>& outHits);
//void intersectWithOutput(HitInfo& h2, RapMapIndex& rmi,
// std::vector<ProcessedHit>& outHits);
template <typename RapMapIndexT>
void intersectSAIntervalWithOutput(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
RapMapIndexT& rmi,
uint32_t intervalCounter,
int32_t maxSlack,
SAHitMap& outHits);
......@@ -109,13 +127,13 @@ namespace rapmap {
SAProcessedHitVec& outHits);
*/
std::vector<ProcessedHit> intersectHits(
std::vector<HitInfo>& inHits,
RapMapIndex& rmi);
//std::vector<ProcessedHit> intersectHits(
// std::vector<HitInfo>& inHits,
// RapMapIndex& rmi);
template <typename RapMapIndexT>
SAHitMap intersectSAHits(
std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
SAIntervalVector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
RapMapIndexT& rmi,
size_t readLen,
bool strictFilter=false);
......@@ -124,6 +142,14 @@ namespace rapmap {
std::vector<ProcessedSAHit> intersectSAHits2(
std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
RapMapIndexT& rmi);
template <typename RapMapIndexT>
void hitsToMappingsSimple(RapMapIndexT& rmi,
rapmap::utils::MappingConfig& mc,
rapmap::utils::MateStatus mateStatus,
HitCollectorInfo<SAIntervalHit<typename RapMapIndexT::IndexType>>& hcinfo,
std::vector<rapmap::utils::QuasiAlignment>& hits);
}
}
......
......@@ -50,8 +50,10 @@ class IndexHeader {
ar( cereal::make_nvp("KmerLen", kmerLen_) );
ar( cereal::make_nvp("BigSA", bigSA_) );
ar( cereal::make_nvp("PerfectHash", perfectHash_) );
ar( cereal::make_nvp("SeqHash", seqHash_) );
ar( cereal::make_nvp("NameHash", nameHash_) );
ar( cereal::make_nvp("SeqHash", seqHash256_) );
ar( cereal::make_nvp("NameHash", nameHash256_) );
ar( cereal::make_nvp("SeqHash512", seqHash512_) );
ar( cereal::make_nvp("NameHash512", nameHash512_) );
}
template <typename Archive>
......@@ -63,8 +65,10 @@ class IndexHeader {
ar( cereal::make_nvp("KmerLen", kmerLen_) );
ar( cereal::make_nvp("BigSA", bigSA_) );
ar( cereal::make_nvp("PerfectHash", perfectHash_) );
ar( cereal::make_nvp("SeqHash", seqHash_) );
ar( cereal::make_nvp("NameHash", nameHash_) );
ar( cereal::make_nvp("SeqHash", seqHash256_) );
ar( cereal::make_nvp("NameHash", nameHash256_) );
ar( cereal::make_nvp("SeqHash512", seqHash512_) );
ar( cereal::make_nvp("NameHash512", nameHash512_) );
} catch (const cereal::Exception& e) {
auto cerrLog = spdlog::get("stderrLog");
cerrLog->error("Encountered exception [{}] when loading index.", e.what());
......@@ -82,11 +86,15 @@ class IndexHeader {
bool bigSA() const { return bigSA_; }
bool perfectHash() const { return perfectHash_; }
void setSeqHash(const std::string& seqHash) { seqHash_ = seqHash; }
void setNameHash(const std::string& nameHash) { nameHash_ = nameHash; }
std::string seqHash() const { return seqHash_; }
std::string nameHash() const { return nameHash_; }
void setSeqHash256(const std::string& seqHash) { seqHash256_ = seqHash; }
void setNameHash256(const std::string& nameHash) { nameHash256_ = nameHash; }
std::string seqHash256() const { return seqHash256_; }
std::string nameHash256() const { return nameHash256_; }
void setSeqHash512(const std::string& seqHash) { seqHash512_ = seqHash; }
void setNameHash512(const std::string& nameHash) { nameHash512_ = nameHash; }
std::string seqHash512() const { return seqHash512_; }
std::string nameHash512() const { return nameHash512_; }
private:
// The type of index we have
IndexType type_;
......@@ -102,9 +110,13 @@ class IndexHeader {
// Are we using a perfect hash in the index or not?
bool perfectHash_;
// Hash of sequences in txome
std::string seqHash_;
std::string seqHash256_;
// Hash of sequence names in txome
std::string nameHash_;
std::string nameHash256_;
// Hash of sequences in txome
std::string seqHash512_;
// Hash of sequence names in txome
std::string nameHash512_;
};
......
#ifndef __COMBINELIB_KMERS_HPP__
#define __COMBINELIB_KMERS_HPP__
#include <cassert>
#include <climits>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <type_traits>
namespace combinelib {
namespace kmers {
#ifndef __DEFINE_LIKELY_MACRO__
#define __DEFINE_LIKELY_MACRO__
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#endif
/**
*
* The following lookup tables and reverse complement code is taken from
*Jellyfish
* https://github.com/gmarcais/Jellyfish/blob/master/include/jellyfish/mer_dna.hpp
*
**/
#define R -1
#define I -2
#define O -3
#define A 0
#define C 1
#define G 2
#define T 3
static constexpr int codes[256] = {
O, O, O, O, O, O, O, O, O, O, I, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, R, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, A, R, C, R, O, O, G,
R, O, O, R, O, R, R, O, O, O, R, R, T, O, R, R, R, R, O, O, O, O, O, O,
O, A, R, C, R, O, O, G, R, O, O, R, O, R, R, O, O, O, R, R, T, O, R, R,
R, R, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O};
static constexpr char complements[256] = {
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
'N'};
#undef R
#undef I
#undef O
#undef A
#undef C
#undef G
#undef T
// Checkered mask. cmask<uint16_t, 1> is every other bit on
// (0x55). cmask<uint16_t,2> is two bits one, two bits off (0x33). Etc.
template <typename U, int len, int l = sizeof(U) * 8 / (2 * len)> struct cmask {
static const U v =
(cmask<U, len, l - 1>::v << (2 * len)) | ((static_cast<U>(1) << len) - 1);
};
template <typename U, int len> struct cmask<U, len, 0> {
static const U v = 0;
};
// Fast reverse complement of one word through bit tweedling.
static inline uint64_t word_reverse_complement(uint64_t w, uint16_t k_) {
typedef uint64_t U;
w = ((w >> 2) & cmask<U, 2>::v) | ((w & cmask<U, 2>::v) << 2);
w = ((w >> 4) & cmask<U, 4>::v) | ((w & cmask<U, 4>::v) << 4);
w = ((w >> 8) & cmask<U, 8>::v) | ((w & cmask<U, 8>::v) << 8);
w = ((w >> 16) & cmask<U, 16>::v) | ((w & cmask<U, 16>::v) << 16);
w = (w >> 32) | (w << 32);
return ((static_cast<U>(-1)) - w) >> (2 * (32 - k_));
}
static constexpr char revCodes[4] = {'A', 'C', 'G', 'T'};
/**
* The above from Jellyfish (mer_dna.hpp)
*/
static constexpr int8_t rc_table[128] = {
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, // 15
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, // 31
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, // 787
78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, // 63
78, 84, 78, 71, 78, 78, 78, 67, 78, 78, 78, 78, 78, 78, 78, 78, // 79
78, 78, 78, 78, 65, 65, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, // 95
78, 84, 78, 71, 78, 78, 78, 67, 78, 78, 78, 78, 78, 78, 78, 78, // 101
78, 78, 78, 78, 65, 65, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78 // 127
};
/**
* Since we define these implementations at file scope in a header, we mark them
*constant to
* avoid duplicate symbol errors due to external linkage.
**/
static decltype(codes[0]) codeForChar(char c) {
return codes[static_cast<uint8_t>(c)];
}
static char charForCode(int i) { return revCodes[i]; }
static decltype(complements[0]) complement(char c) {
return complements[static_cast<uint8_t>(c)];
}
static int complement(int i) { return 0x3 - i; }
static bool isValidNuc(int i) { return i >= 0; }
static bool isValidNuc(char c) { return isValidNuc(codeForChar(c)); }
static bool notValidNuc(int i) { return !isValidNuc(i); }
static bool notValidNuc(char c) { return !isValidNuc(c); }
// from :
// https://stackoverflow.com/questions/1392059/algorithm-to-generate-bit-mask
template <typename R> static constexpr R bitmask(unsigned int const onecount) {
return (onecount == 0)
? 0
: (static_cast<R>(-(onecount != 0)) &
(static_cast<R>(-1) >> ((sizeof(R) * CHAR_BIT) - onecount)));
}
// table that contains bit patterns to mask out the top bits of a word.
// The table is such that maskTable[k] will mask out the top (64 - 2*k) bits of
// the word.
static const constexpr uint64_t maskTable[] = {
bitmask<uint64_t>(0), bitmask<uint64_t>(2), bitmask<uint64_t>(4),
bitmask<uint64_t>(6), bitmask<uint64_t>(8), bitmask<uint64_t>(10),
bitmask<uint64_t>(12), bitmask<uint64_t>(14), bitmask<uint64_t>(16),
bitmask<uint64_t>(18), bitmask<uint64_t>(20), bitmask<uint64_t>(22),
bitmask<uint64_t>(24), bitmask<uint64_t>(26), bitmask<uint64_t>(28),
bitmask<uint64_t>(30), bitmask<uint64_t>(32), bitmask<uint64_t>(34),
bitmask<uint64_t>(36), bitmask<uint64_t>(38), bitmask<uint64_t>(40),
bitmask<uint64_t>(42), bitmask<uint64_t>(44), bitmask<uint64_t>(46),
bitmask<uint64_t>(48), bitmask<uint64_t>(50), bitmask<uint64_t>(52),
bitmask<uint64_t>(54), bitmask<uint64_t>(56), bitmask<uint64_t>(58),
bitmask<uint64_t>(60), bitmask<uint64_t>(62)};
constexpr const uint64_t nucleotidesPerByte = 4;
// from :
// https://stackoverflow.com/questions/31952237/looking-for-a-constexpr-ceil-function
constexpr uint64_t ceil(double num) {
return (static_cast<double>(static_cast<uint64_t>(num)) == num)
? static_cast<uint64_t>(num)
: static_cast<uint64_t>(num) + ((num > 0) ? 1 : 0);
}
constexpr uint64_t numWordsRequired(uint64_t K) {
return ceil(K / (1.0 * nucleotidesPerByte * (sizeof(uint64_t))));
}
/**
* @returns the binary encoding for character c
**/
static int64_t doEncodeBinary(char c) { return codes[static_cast<uint8_t>(c)]; }
/**
* @returns true of the character `c` was a valid nucleotide and false
*otherwise. The corresponding
* code for this character is placed in the parameter `code`.
**/
static bool encodeBinary(char c, int64_t& code) {
code = codes[static_cast<uint8_t>(c)];
return code >= 0;
}
static char decodeBinary(uint64_t n) { return revCodes[n]; }
/**
* Convert an ascii character to the corresponding 2-bit encoding
*
* Following the encoding suggested [here](https://www.biostars.org/p/113640/),
*originally
* suggested by G. Rizk:
* A : 0
* C : 1
* G : 3
* T : 2
* N : 4
*
* This function will work with both lower and upper case nucleotides.
* @ASSUMPTION : c is in {A,C,G,T,N,a,c,g,t,n}
**/
static uint64_t charToBitsGATB(char c) {
// Convert to uppercase
// https://stackoverflow.com/questions/10688831/fastest-way-to-capitalize-words
return static_cast<uint64_t>((((c & ~0x20) >> 1) & 0x03) + ((c & 0x08) >> 3));
}
// Adapted from
// https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/blob/8c9933a1685e0ab50c7d8b7926c9068bc0c9d7d2/src/main.c#L36
static void reverseComplement(const std::string& seq, std::string& readWork) {
readWork.resize(seq.length(), 'A');
int32_t end = seq.length() - 1, start = 0;
while (LIKELY(start < end)) {
readWork[start] = (char)rc_table[(int8_t)seq[end]];
readWork[end] = (char)rc_table[(int8_t)seq[start]];
++start;
--end;
}
// If odd # of bases, we still have to complement the middle
if (start == end) {
readWork[start] = (char)rc_table[(int8_t)seq[start]];
}
}
static std::string reverseComplement(const std::string& seq) {
std::string work;
reverseComplement(seq, work);
return work;
}
static std::string stringRevComp(const std::string& seq) {
return reverseComplement(seq);
}
/**
* From https://www.biostars.org/p/113640/. This only works for a given word
*right now;
* will determine how to best generalize later.
**/
static uint64_t word_reverse_complement_gatb(uint64_t x, size_t k_) {
uint64_t res = x;
res = ((res >> 2 & 0x3333333333333333) | (res & 0x3333333333333333) << 2);
res = ((res >> 4 & 0x0F0F0F0F0F0F0F0F) | (res & 0x0F0F0F0F0F0F0F0F) << 4);
res = ((res >> 8 & 0x00FF00FF00FF00FF) | (res & 0x00FF00FF00FF00FF) << 8);
res = ((res >> 16 & 0x0000FFFF0000FFFF) | (res & 0x0000FFFF0000FFFF) << 16);
res = ((res >> 32 & 0x00000000FFFFFFFF) | (res & 0x00000000FFFFFFFF) << 32);
res = res ^ 0xAAAAAAAAAAAAAAAA;
return (res >> (2 * (32 - k_)));
}
// Some template magic to detect if a template type has a ``length()'' function.
template <typename...> using combinelib_void_t = void;
template <typename, typename = void>
struct has_length : public std::false_type {};
template <typename T>
struct has_length<T, combinelib_void_t<decltype(T().length())>>
: public std::true_type {};
/**
* The first template parameter, K, is the maximum length (in nucleotides)
* of the k-mer that can be represented with this class.
*
* The second template parameter, CID, is a class-type specific tag that
* will allow all instances of this particular class to share a value of
* their k. This idea is used in Jellyfish, which inspired the use here.
**/
template <uint64_t K, uint64_t CID = 0> class Kmer {
static_assert(
K <= 32,
"Currently, the Kmer class can only represent k-mers of size <= 32");
public:
using base_type = uint64_t;
explicit Kmer() {}
template <
typename ViewT,
typename = typename std::enable_if<has_length<ViewT>::value, void>::type>
Kmer(ViewT& v) {
fromChars(v);
}
// NOTE: the template below should take care of this, but doesn't on gcc 4.8.2
// try and figure this out.
Kmer(const char* iter) {
fromCharsIter_(iter);
}
// NOTE: the template below should take care of this, but doesn't on gcc 4.8.2
// try and figure this out.
Kmer(std::string::iterator iter) {
fromCharsIter_(iter);
}
template <
typename IterT,
typename = typename std::enable_if<!has_length<IterT>::value, void>::type>
Kmer(IterT v) {
fromCharsIter_(v);
}
Kmer(const Kmer& other) = default;
Kmer(Kmer&& other) = default;
Kmer(Kmer& other) = default;
Kmer& operator=(Kmer& other) = default;
Kmer& operator=(Kmer&& other) = default;
// NOTE: the template below should take care of this, but doesn't on gcc 4.8.2
// try and figure this out.
Kmer& operator=(const char* iter) {
fromCharsIter_(iter);
return *this;
}
template <
typename IterT,
typename = typename std::enable_if<!has_length<IterT>::value, void>::type>
Kmer& operator=(IterT iter) {
fromCharsIter_(iter);
return *this;
}
template <
typename ViewT,
typename = typename std::enable_if<has_length<ViewT>::value, void>::type>
Kmer& operator=(ViewT& v) {
fromChars(v);
return *this;
}
// NOTE: the template below should take care of this, but doesn't on gcc 4.8.2
// try and figure this out.
bool fromChars(const char* iter) {
return fromCharsIter_(iter);
}
// NOTE: the template below should take care of this, but doesn't on gcc 4.8.2
// try and figure this out.
bool fromChars(std::string::iterator iter) {
return fromCharsIter_(iter);
}
/**
* Populate this kmer by consuming characters pointed to by iter.
*
* @ASSUMPTIONS:
* There are at least k_ characters to consume or
* (2) we will encounter a non-nucleotide (i.e. \0) character
**/
template <
typename IterT,
typename = typename std::enable_if<!has_length<IterT>::value, void>::type>
bool fromChars(IterT iter) {
return fromCharsIter_(iter);
}
/**
* This is the same as the above function, but it will be called if the
*argument type
* `ViewT` has a "length()" member. In that case, the function will
*additionally check
* that the length of `v` is >= k_.
**/
template <
typename ViewT,
typename = typename std::enable_if<has_length<ViewT>::value, void>::type>
bool fromCharsSafe(ViewT& v) {
return (v.length() >= k_) ? fromChars(v.begin()) : false;
}
/**
* This is a convenience function taht lets us call fromChars on a string, or
*string_vew (or similar object);
**/
template <
typename ViewT,
typename = typename std::enable_if<has_length<ViewT>::value, void>::type>
bool fromChars(ViewT& v) {
return fromChars(v.begin());
}
bool fromChars(Kmer& k) {
data_[0] = k.data_[0];
return true;
}
/**
* Append the character `c` to the end of the k-mer
**/
uint64_t append(char c) {
auto r = (data_[0] >> (2 * k_ - 2)) & 0x03;
data_[0] = maskTable[k_] & ((data_[0] << 2) | doEncodeBinary(c));
return r;
}
/**
* Prepend the character `c` to the beginning of the k-mer
**/
uint64_t prepend(char c) {
auto r = (data_[0] & 0x03);
data_[0] = (data_[0] >> 2) | (doEncodeBinary(c) << (2 * k_ - 2));
return r;
}
/**
* Append the character `c` to the end of the k-mer
**/
uint64_t append(int i) {
auto r = (data_[0] >> (2 * k_ - 2)) & 0x03;
data_[0] = maskTable[k_] & ((data_[0] << 2) | static_cast<base_type>(i));
return r;
}
/**
* Prepend the character `c` to the beginning of the k-mer
**/
uint64_t prepend(int i) {
auto r = (data_[0] & 0x03);
data_[0] = (data_[0] >> 2) | (static_cast<base_type>(i) << (2 * k_ - 2));
return r;
}
/**
* @returns a `uint64_t` that represents the encoded `idx`-th word of this
*k-mer
**/
uint64_t word(uint32_t idx) const { return data_[idx]; }
/**
* @returns a reference to the `uint64_t` that represents the encoded `idx`-th
*word of this k-mer
**/
uint64_t& word__(uint32_t idx) { return data_[idx]; }
const base_type* data() const { return &data_[0]; }
/**
* @returns the number of bytes required by this k-mer
**/
uint64_t sizeInBytes() const { return sizeof(data_); }
/**
* @returns the number of words required by this k-mer
**/
uint64_t sizeInWords() const { return sizeof(data_) / sizeof(base_type); }
/**
* @returns the number of words required by this k-mer
**/
uint64_t nb_words() const { return sizeInWords(); }
/**
* Set the dynamic length of this k-mer class to be kIn nucleotides.
* @returns the value of k for this class prior to this update.
**/
static uint16_t k(uint16_t kIn) {
assert(kIn < K);
std::swap(k_, kIn);
return kIn;
}
/**
* @returns the value of k used for this k-mer class
**/
static uint16_t k() { return k_; }
std::string toStr() const {
std::string s(k_, 'X');
auto& d = data_[0];
int32_t offset = (2 * k_) - 2;
for (int32_t idx = 0; offset >= 0; offset -= 2, ++idx) {
s[idx] = decodeBinary((d >> offset & 0x03));
}
return s;
}
bool isHomoPolymer() const {
auto nuc = data_[0] & 0x3;
return (data_[0] == (maskTable[k_] & ((data_[0] << 2) | nuc)));
}
bool is_homopolymer() const { return isHomoPolymer(); }
void rc() { data_[0] = word_reverse_complement(data_[0], k_); }
Kmer<K, CID> getRC() const {
Kmer<K, CID> nk;
nk.data_[0] = word_reverse_complement(data_[0], k_);
return nk;
}
void canonicalize() {
auto wrc = word_reverse_complement(data_[0], k_);
data_[0] = (wrc < data_[0]) ? wrc : data_[0];
}
Kmer<K, CID> getCanonical() {
Kmer<K, CID> rck = getRC();
return (rck < *this) ? rck : *this;
}
template <uint64_t KP, uint64_t CIDP>
friend std::ostream& operator<<(std::ostream& os, const Kmer<KP, CIDP>& k);
template <uint64_t KP, uint64_t CIDP>
friend bool operator==(const Kmer<KP, CIDP>& lhs, const Kmer<KP, CIDP>& rhs);
template <uint64_t KP, uint64_t CIDP>
friend bool operator!=(const Kmer<KP, CIDP>& lhs, const Kmer<KP, CIDP>& rhs);
template <uint64_t KP, uint64_t CIDP>
friend bool operator<(const Kmer<KP, CIDP>& lhs, const Kmer<KP, CIDP>& rhs);
template <uint64_t KP, uint64_t CIDP>
friend bool operator>(const Kmer<KP, CIDP>& lhs, const Kmer<KP, CIDP>& rhs);
private:
template <typename IterT>
bool fromCharsIter_(IterT iter) {
data_[0] = 0;
auto toConsume = 1; // numWordsRequired(k_);
int64_t code{0};
bool success = true;
int32_t remK = static_cast<int32_t>(k_);
for (int32_t w = 0; w < toConsume; ++w) {
int32_t shift = std::min((2 * remK) - 2, 62);
auto& currWord = data_[w];
for (; remK > 0 and shift >= 0; ++iter, --remK, shift -= 2) {
// success &= encodeBinary(*iter, code);
if (!encodeBinary(*iter, code))
return false;
currWord |= (code << shift);
}
}
return success;
}
base_type data_[numWordsRequired(K)] = {};
static uint16_t k_;
};
template <uint64_t K, uint64_t CID> uint16_t Kmer<K, CID>::k_ = 0;
template <uint64_t K, uint64_t CID>
std::ostream& operator<<(std::ostream& os, const Kmer<K, CID>& k) {
os << k.toStr();
return os;
}
template <uint64_t K, uint64_t CID>
bool operator==(const Kmer<K, CID>& lhs, const Kmer<K, CID>& rhs) {
return lhs.data_[0] == rhs.data_[0];
}
template <uint64_t K, uint64_t CID>
bool operator!=(const Kmer<K, CID>& lhs, const Kmer<K, CID>& rhs) {
return !(lhs == rhs);
}
template <uint64_t K, uint64_t CID>
bool operator<(const Kmer<K, CID>& lhs, const Kmer<K, CID>& rhs) {
return (lhs.data_[0] < rhs.data_[0]);
}
template <uint64_t K, uint64_t CID>
bool operator>(const Kmer<K, CID>& lhs, const Kmer<K, CID>& rhs) {
return (lhs.data_[0] > rhs.data_[0]);
}
} // namespace kmers
} // namespace combinelib
#endif // __COMBINELIB_KMERS_HPP__
......@@ -26,9 +26,9 @@
namespace rapmap {
constexpr char majorVersion[] = "0";
constexpr char minorVersion[] = "5";
constexpr char minorVersion[] = "6";
constexpr char patchVersion[] = "0";
constexpr char version [] = "0.5.0";
constexpr char version [] = "0.6.0";
constexpr uint32_t indexVersion = 3;
}
......
This diff is collapsed.
......@@ -25,6 +25,8 @@
#include "RapMapSAIndex.hpp"
#include "RapMapUtils.hpp"
#include "SASearcher.hpp"
#include "HitManager.hpp"
#include "chobo/small_vector.hpp"
#include <algorithm>
#include <iostream>
......@@ -58,11 +60,22 @@ public:
bool getStrictCheck() const { return strictCheck_; };
void setStrictCheck(bool sc) { strictCheck_ = sc; }
/** Get/Set usage of MMP chain scoring **/
void enableChainScoring() { doChaining_ = true; }
void disableChainScoring() { doChaining_ = false; }
bool getChainScoring() const { return doChaining_; }
/** Get/Set the "mohsen number" that is used to limit the
maximum MMP length during interval collection **/
// Note --- cannot set an extension less than 1
void setMaxMMPExtension(int32_t ext) { if (ext > 0) { maxMMPExtension_ = ext; } }
int32_t getMaxMMPExtension() const { return maxMMPExtension_; }
/** Construct an SACollector given an index **/
SACollector(RapMapIndexT* rmi)
: rmi_(rmi), hashEnd_(rmi->khash.end()), disableNIP_(false),
covReq_(0.0), maxInterval_(1000),
strictCheck_(false) {}
strictCheck_(false), doChaining_(false) {}
enum HitStatus { ABSENT = -1, UNTESTED = 0, PRESENT = 1 };
// Record if k-mers are hits in the
......@@ -80,7 +93,7 @@ public:
return kpos < other.kpos;
}
void print() {
std::cerr << "{ " << kmer.to_str() << ", " << kpos << ", "
std::cerr << "{ " << kmer.toStr() << ", " << kpos << ", "
<< ((fwdScore) ? "PRESENT" : "ABSENT") << ", "
<< ((rcScore) ? "PRESENT" : "ABSENT") << "}\t";
}
......@@ -90,25 +103,19 @@ public:
HitStatus rcScore;
};
using KmerScoreVec = chobo::small_vector<KmerDirScore, 32, 33>;
bool operator()(std::string& read,
std::vector<rapmap::utils::QuasiAlignment>& hits,
SASearcher<RapMapIndexT>& saSearcher,
rapmap::utils::MateStatus mateStatus,
bool consistentHits = false) {
rapmap::hit_manager::HitCollectorInfo<rapmap::utils::SAIntervalHit<OffsetT>>& hcInfo) {
using QuasiAlignment = rapmap::utils::QuasiAlignment;
using MateStatus = rapmap::utils::MateStatus;
using SAIntervalHit = rapmap::utils::SAIntervalHit<OffsetT>;
auto& rankDict = rmi_->rankDict;
auto& txpStarts = rmi_->txpOffsets;
auto& SA = rmi_->SA;
auto& khash = rmi_->khash;
auto& text = rmi_->seq;
auto salen = SA.size();
//auto hashEnd_ = khash.end();
auto readLen = read.length();
auto maxDist = 1.5 * readLen;
auto maxDist = static_cast<int32_t>(readLen);
auto k = rapmap::utils::my_mer::k();
auto readStartIt = read.begin();
......@@ -124,7 +131,6 @@ public:
size_t rcCov{0};
bool foundHit = false;
bool isRev = false;
rapmap::utils::my_mer mer;
rapmap::utils::my_mer rcMer;
......@@ -133,11 +139,16 @@ public:
// This allows implementing our heurisic for comparing
// forward and reverse-complement strand matches
std::vector<KmerDirScore> kmerScores;
KmerScoreVec kmerScores;
// Set the readLen and maxDist for this read in the
// HitCollectorInfo structure
hcInfo.readLen = readLen;
hcInfo.maxDist = maxDist;
// Where we store the SA intervals for forward and rc hits
std::vector<SAIntervalHit> fwdSAInts;
std::vector<SAIntervalHit> rcSAInts;
auto& fwdSAInts = hcInfo.fwdSAInts;
auto& rcSAInts = hcInfo.rcSAInts;
// Number of nucleotides to skip when encountering a homopolymer k-mer.
OffsetT homoPolymerSkip = 1; // k / 2;
......@@ -177,20 +188,9 @@ public:
if (mer.is_homopolymer()) {
rb += homoPolymerSkip;
re += homoPolymerSkip;
/* Walk base-by-base rather than skipping
// If the first N is within k bases, then this k-mer is invalid
if (invalidPos < pos + k) {
// Skip to the k-mer starting at the next position
// (i.e. right past the N)
rb = read.begin() + invalidPos + 1;
re = rb + k;
// Go to the next iteration of the while loop
continue;
}
*/
continue;
}
rcMer = mer.get_reverse_complement();
rcMer = mer.getRC();
// See if we can find this k-mer in the hash
merIt = khash.find(mer.word(0));//get_bits(0, 2 * k));
......@@ -315,7 +315,7 @@ public:
// If the rc k-mer is untested, then test it
if (kms.rcScore == UNTESTED) {
rcMer = kms.kmer.get_reverse_complement();
rcMer = kms.kmer.getRC();
auto rcMerIt = khash.find(rcMer.word(0));//get_bits(0, 2 * k));
kms.rcScore = (rcMerIt != hashEnd_) ? PRESENT : ABSENT;
}
......@@ -357,103 +357,6 @@ public:
}
}
auto fwdHitsStart = hits.size();
// If we had > 1 forward hit
if (fwdSAInts.size() > 1) {
auto processedHits = rapmap::hit_manager::intersectSAHits(
fwdSAInts, *rmi_, readLen, consistentHits);
rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist,
hits, mateStatus);
} else if (fwdSAInts.size() == 1) { // only 1 hit!
auto& saIntervalHit = fwdSAInts.front();
auto initialSize = hits.size();
for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
auto globalPos = SA[i];
auto txpID = rmi_->transcriptAtPosition(globalPos);
// the offset into this transcript
auto pos = globalPos - txpStarts[txpID];
int32_t hitPos = pos - saIntervalHit.queryPos;
hits.emplace_back(txpID, hitPos, true, readLen);
hits.back().mateStatus = mateStatus;
}
// Now sort by transcript ID (then position) and eliminate
// duplicates
auto sortStartIt = hits.begin() + initialSize;
auto sortEndIt = hits.end();
std::sort(sortStartIt, sortEndIt,
[](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
if (a.tid == b.tid) {
return a.pos < b.pos;
} else {
return a.tid < b.tid;
}
});
auto newEnd = std::unique(
hits.begin() + initialSize, hits.end(),
[](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
return a.tid == b.tid;
});
hits.resize(std::distance(hits.begin(), newEnd));
}
auto fwdHitsEnd = hits.size();
auto rcHitsStart = fwdHitsEnd;
// If we had > 1 rc hit
if (rcSAInts.size() > 1) {
auto processedHits = rapmap::hit_manager::intersectSAHits(
rcSAInts, *rmi_, readLen, consistentHits);
rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist,
hits, mateStatus);
} else if (rcSAInts.size() == 1) { // only 1 hit!
auto& saIntervalHit = rcSAInts.front();
auto initialSize = hits.size();
for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
auto globalPos = SA[i];
auto txpID = rmi_->transcriptAtPosition(globalPos);
// the offset into this transcript
auto pos = globalPos - txpStarts[txpID];
int32_t hitPos = pos - saIntervalHit.queryPos;
hits.emplace_back(txpID, hitPos, false, readLen);
hits.back().mateStatus = mateStatus;
}
// Now sort by transcript ID (then position) and eliminate
// duplicates
auto sortStartIt = hits.begin() + rcHitsStart;
auto sortEndIt = hits.end();
std::sort(sortStartIt, sortEndIt,
[](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
if (a.tid == b.tid) {
return a.pos < b.pos;
} else {
return a.tid < b.tid;
}
});
auto newEnd = std::unique(
sortStartIt, sortEndIt,
[](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
return a.tid == b.tid;
});
hits.resize(std::distance(hits.begin(), newEnd));
}
auto rcHitsEnd = hits.size();
// If we had both forward and RC hits, then merge them
if ((fwdHitsEnd > fwdHitsStart) and (rcHitsEnd > rcHitsStart)) {
// Merge the forward and reverse hits
std::inplace_merge(
hits.begin() + fwdHitsStart, hits.begin() + fwdHitsEnd,
hits.begin() + rcHitsEnd,
[](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
return a.tid < b.tid;
});
// And get rid of duplicate transcript IDs
auto newEnd = std::unique(
hits.begin() + fwdHitsStart, hits.begin() + rcHitsEnd,
[](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
return a.tid == b.tid;
});
hits.resize(std::distance(hits.begin(), newEnd));
}
// Return true if we had any valid hits and false otherwise.
return foundHit;
}
......@@ -469,7 +372,7 @@ private:
IteratorT* complementMerItPtr, // nullptr if we haven't checked yet
bool isRC, // is this being called from the RC of the read
uint32_t& strandHits, uint32_t& otherStrandHits,
std::vector<KmerDirScore>& kmerScores
KmerScoreVec& kmerScores
) {
IteratorT merIt = hashEnd_;
IteratorT complementMerIt = hashEnd_;
......@@ -477,7 +380,7 @@ private:
//auto hashEnd_ = khash.end();
auto k = rapmap::utils::my_mer::k();
auto complementMer = mer.get_reverse_complement();
auto complementMer = mer.getRC();
if (merItPtr == nullptr) {
// We haven't tested this, so do that here
......@@ -530,7 +433,7 @@ private:
// Attempts to find the next valid k-mer (a k-mer that doesn't contain an 'N' and is
// not a homopolymer). If no such k-mer exists within the read, then it returns false.
inline bool getNextValidKmer_(std, size_t& pos, rapmap::utils::my_mer& mer) {
bool validMer = mer.from_chars(read + pos);
bool validMer = mer.fromChars(read + pos);
// if this kmer contains an 'N' then validMer is false, else true
}
*/
......@@ -538,10 +441,10 @@ private:
inline void getSAHits_(
SASearcher<RapMapIndexT>& saSearcher, std::string& read,
std::string::iterator startIt,
rapmap::utils::SAInterval<OffsetT>* startInterval, size_t& cov,
const rapmap::utils::SAInterval<OffsetT>* const startInterval, size_t& cov,
uint32_t& strandHits, uint32_t& otherStrandHits,
std::vector<rapmap::utils::SAIntervalHit<OffsetT>>& saInts,
std::vector<KmerDirScore>& kmerScores,
rapmap::hit_manager::SAIntervalVector<rapmap::utils::SAIntervalHit<OffsetT>>& saInts,
KmerScoreVec& kmerScores,
bool isRC // true if read is the reverse complement, false otherwise
) {
using SAIntervalHit = rapmap::utils::SAIntervalHit<OffsetT>;
......@@ -567,7 +470,7 @@ private:
rapmap::utils::my_mer mer, complementMer;
auto merIt = hashEnd_;
auto complementMerIt = hashEnd_;
//auto complementMerIt = hashEnd_;
size_t pos{0};
size_t sampFactor{1};
bool lastSearch{false};
......@@ -592,7 +495,7 @@ private:
// The distance from the beginning of the read to the
// start of the k-mer
pos = std::distance(readStartIt, rb);
validMer = mer.from_chars(read.c_str() + pos);
validMer = mer.fromChars(read.c_str() + pos);
// Get the next valid k-mer at some position >= pos
//validMer = getNextValidKmer_(read, pos, mer);
//if (!validMer) { return; }
......@@ -600,7 +503,7 @@ private:
// If this k-mer contains an 'N', then find the position
// of this character and skip one past it.
if (!validMer) {
invalidPos = read.find_first_of("nN", pos);
invalidPos = read.find_first_of("Nn", pos);
// If the first N is within k bases, then this k-mer is invalid
if (invalidPos < pos + k) {
// Skip to the k-mer starting at the next position
......@@ -634,7 +537,7 @@ private:
// If it's not a homopolymer, then get the complement
// k-mer and query both in the hash.
complementMer = mer.get_reverse_complement();
complementMer = mer.getRC();
merIt = khash.find(mer.word(0));//get_bits(0, 2 * k));
// If we found the k-mer
......@@ -648,8 +551,28 @@ private:
// lb must be 1 *less* then the current lb
// We can't move any further in the reverse complement direction
lb = std::max(static_cast<OffsetT>(0), lb - 1);
// [Nov 21] NOTE: Attempt to cheaply mimic intruders --- hack for now,
// make it nicer if it works.
bool firstAttempt = doChaining_ ? (rb == readStartIt) : true;
auto endIt = (firstAttempt) ? readEndIt : std::min(rb + k + maxMMPExtension_, readEndIt);
auto lbP = lb;
auto ubP = ub;
std::tie(lb, ub, matchedLen) =
saSearcher.extendSearchNaive(lb, ub, k, rb, endIt);
// [Nov 21] NOTE: Attempt to cheaply mimic intruders --- hack for now,
// make it nicer if it works.
if (doChaining_ and firstAttempt and !(matchedLen >= static_cast<OffsetT>(readLen)) and matchedLen >= static_cast<OffsetT>(k + maxMMPExtension_)) {
firstAttempt = false;
lb = lbP;
ub = ubP;
endIt = std::min(rb + k + maxMMPExtension_, readEndIt);
std::tie(lb, ub, matchedLen) =
saSearcher.extendSearchNaive(lb, ub, k, rb, readEndIt);
saSearcher.extendSearchNaive(lb, ub, k, rb, endIt);
}
OffsetT diff = ub - lb;
if (ub > lb and diff < maxInterval_) {
......@@ -676,7 +599,7 @@ private:
if (rb + matchedLen < readEndIt) {
uint32_t kmerPos = static_cast<uint32_t>(
std::distance(readStartIt, rb + matchedLen - skipOverlapMMP));
bool validNucs = mer.from_chars(read.c_str() + kmerPos);
bool validNucs = mer.fromChars(read.c_str() + kmerPos);
if (validNucs) {
/*
// since the MMP *ended* before the end of the read, we assume
......@@ -759,6 +682,8 @@ private:
double covReq_;
OffsetT maxInterval_;
bool strictCheck_;
bool doChaining_;
int32_t maxMMPExtension_{7};
std::string rcBuffer_;
};
......
......@@ -25,7 +25,7 @@
#include <vector>
#include <algorithm>
#include <iterator>
#include "jellyfish/mer_dna.hpp"
#include "Kmer.hpp"
#include "RapMapUtils.hpp"
#include "RapMapSAIndex.hpp"
......@@ -99,10 +99,10 @@ class SASearcher {
std::string& seq = *seq_;
int64_t m = std::distance(qb, qe);
size_t n = seq.length();
int64_t n = static_cast<int64_t>(seq.length());
auto sb = seq.begin();
auto se = seq.end();
//auto se = seq.end();
// If the bounds are already trivial, just figure how long
// of a prefix we share and return the interval.
......@@ -113,7 +113,7 @@ class SASearcher {
char queryChar = ::toupper(*(qb + i));
// If we're reverse complementing
if (complementBases) {
queryChar = rapmap::utils::my_mer::complement(queryChar);
queryChar = combinelib::kmers::complement(queryChar);
}
if ( queryChar < *(sb + SA[lbIn] + i) ) {
break;
......@@ -128,7 +128,7 @@ class SASearcher {
BoundSearchResult<OffsetT> res1, res2;
char smallest = '#';
char largest = '}';
//char largest = '}';
char sentinel = smallest;
// FIX: these have to be large enough to hold the *sum* of the boundaries!
......@@ -138,12 +138,12 @@ class SASearcher {
int64_t i{0};
int64_t maxI{startAt};
int64_t prevI = startAt;
//int64_t prevI = startAt;
int64_t prevILow = startAt;
int64_t prevIHigh = startAt;
int64_t validBoundLow = ubIn;
int64_t validBoundHigh = lbIn;
int64_t validBound = 0;
//int64_t validBound = 0;
bool plt{true};
// Reduce the search interval until we hit a border
// i.e. until c == r - 1 or c == l + 1
......@@ -155,7 +155,7 @@ class SASearcher {
char queryChar = ::toupper(*(qb + i));
// If we're reverse complementing
if (complementBases) {
queryChar = rapmap::utils::my_mer::complement(queryChar);
queryChar = combinelib::kmers::complement(queryChar);
}
if ( queryChar < *(sb + SA[c] + i) ) {
......@@ -208,7 +208,7 @@ class SASearcher {
}
}
bool knownValid{true};
//bool knownValid{true};
m = res1.maxLen + 1;
// first search for the lower bound
......@@ -229,7 +229,7 @@ class SASearcher {
char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel;
// If we're reverse complementing
if (queryChar != sentinel and complementBases) {
queryChar = rapmap::utils::my_mer::complement(queryChar);
queryChar = combinelib::kmers::complement(queryChar);
}
if ( queryChar < *(sb + SA[c] + i) ) {
......@@ -275,7 +275,7 @@ class SASearcher {
char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel;
// If we're reverse complementing
if (queryChar != sentinel and complementBases) {
queryChar = rapmap::utils::my_mer::complement(queryChar);
queryChar = combinelib::kmers::complement(queryChar);
}
if ( queryChar < *(sb + SA[c] + i) ) {
......@@ -447,7 +447,7 @@ class SASearcher {
char queryChar = ::toupper(*(qb + i));
// If we're reverse complementing
if (complementBases) {
queryChar = rapmap::utils::my_mer::complement(queryChar);
queryChar = combinelib::kmers::complement(queryChar);
}
if ( queryChar < *(sb + SA[lbIn] + i) ) {
break;
......@@ -490,7 +490,7 @@ class SASearcher {
char queryChar = ::toupper(*(qb + i));
// If we're reverse complementing
if (complementBases) {
queryChar = rapmap::utils::my_mer::complement(queryChar);
queryChar = combinelib::kmers::complement(queryChar);
}
if ( queryChar < *(sb + SA[c] + i) ) {
......@@ -612,7 +612,7 @@ class SASearcher {
char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel;
// If we're reverse complementing
if (queryChar != sentinel and complementBases) {
queryChar = rapmap::utils::my_mer::complement(queryChar);
queryChar = combinelib::kmers::complement(queryChar);
}
if ( queryChar < *(sb + SA[c] + i) ) {
......
This diff is collapsed.