Skip to content
Commits on Source (7)
gatb-core (1.4.1+git20181225.44d5a44+dfsg-1) unstable; urgency=medium
* New upstream checkout
* Adapt symbols file to new upstream code
* Bump SOVERSION due to ABI changes
-- Andreas Tille <tille@debian.org> Thu, 24 Jan 2019 11:50:33 +0100
gatb-core (1.4.1+git20180206.6f8fce8+dfsg-2) unstable; urgency=medium
* Provide symbols for amd64 only
......
......@@ -35,7 +35,7 @@ Description: Genome Analysis Toolbox with de-Bruijn graph
exist a set of ready-to-use tools relying on GATB-CORE
library: see https://gatb.inria.fr/software/
Package: libgatbcore1
Package: libgatbcore2
Architecture: any
Section: libs
Depends: ${shlibs:Depends},
......@@ -60,7 +60,7 @@ Architecture: any
Section: libdevel
Depends: ${shlibs:Depends},
${misc:Depends},
libgatbcore1 (= ${binary:Version})
libgatbcore2 (= ${binary:Version})
Description: development library of the Genome Analysis Toolbox
The GATB-CORE project provides a set of highly efficient
algorithms to analyse NGS data sets. These methods enable
......
From 0c580ab9e5f4a848c9c245060415e9acfe1655c7 Mon Sep 17 00:00:00 2001
From: rchikhi <rayan.chikhi@univ-lille1.fr>
Date: Mon, 24 Dec 2018 14:57:48 +0100
Origin: https://github.com/GATB/gatb-core/commit/0c580ab.patch
Subject: [PATCH] fixed bug #37 where unitigs are not output with consecutive
IDs, and also made sure that even if they're not, links are robust to that
---
gatb-core/src/gatb/bcalm2/bglue_algo.cpp | 19 ++++----------
gatb-core/src/gatb/bcalm2/bglue_algo.hpp | 15 +++++++++--
gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp | 25 +++++++++++--------
3 files changed, 32 insertions(+), 27 deletions(-)
diff --git a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
index 6256282e..c12d1f79 100644
--- a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
+++ b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
@@ -904,9 +904,7 @@ void bglue(Storage *storage,
// setup output file
string output_prefix = prefix;
- std::atomic<unsigned long> out_id; // identified for output sequences
- out_id = 0;
- BufferedFasta out (output_prefix, 100000);
+ BufferedFasta out (output_prefix, 100000, true);
auto get_UFclass = [&modelCanon, &ufkmers_vector, &hasher, &uf_mphf]
(const string &kmerBegin, const string &kmerEnd,
@@ -966,7 +964,7 @@ void bglue(Storage *storage,
std::mutex mtx; // lock to avoid a nasty bug when calling output()
auto partitionGlue = [k, &modelCanon /* crashes if copied!*/, \
&get_UFclass, &gluePartitions,
- &out, &outLock, &nb_seqs_in_partition, &out_id, nbGluePartitions, &mtx]
+ &out, &outLock, &nb_seqs_in_partition, nbGluePartitions, &mtx]
(const Sequence& sequence)
{
const string &seq = sequence.toString();
@@ -992,12 +990,8 @@ void bglue(Storage *storage,
float mean_abundance = get_mean_abundance(abundances);
uint32_t sum_abundances = get_sum_abundance(abundances);
- // for some reason i do need that lock_guard here.. even though output is itself lock guarded. maybe some lazyness in the evauation of the to_string(out_id++)? who kon
- // anyway this fixes the problem, i'll understand it some other time.
- std::lock_guard<std::mutex> lock(mtx);
- output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
// km is not a standard GFA field so i'm putting it in lower case as per the spec
- // maybe could optimize by writing to disk using queues, if that's ever a bottleneck
+ output(seq, out, "LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
return;
}
@@ -1051,7 +1045,7 @@ void bglue(Storage *storage,
for (int partition = 0; partition < nbGluePartitions; partition++)
{
auto glue_partition = [&modelCanon, &ufkmers, partition, &gluePartition_prefix, nbGluePartitions, &copy_nb_seqs_in_partition,
- &get_UFclass, &out, &outLock, &out_id, kmerSize, &mtx]( int thread_id)
+ &get_UFclass, &out, &outLock, kmerSize, &mtx]( int thread_id)
{
int k = kmerSize;
@@ -1140,10 +1134,7 @@ void bglue(Storage *storage,
float mean_abundance = get_mean_abundance(abs);
uint32_t sum_abundances = get_sum_abundance(abs);
{
- // for some reason i do need that lock_guard here.. even though output is itself lock guarded. maybe some lazyness in the evauation of the to_string(out_id++)? who kon
- // anyway this fixes the problem, i'll understand it some other time.
- std::lock_guard<std::mutex> lock(mtx);
- output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
+ output(seq, out, "LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
}
}
diff --git a/gatb-core/src/gatb/bcalm2/bglue_algo.hpp b/gatb-core/src/gatb/bcalm2/bglue_algo.hpp
index 50ca9176..7783215f 100644
--- a/gatb-core/src/gatb/bcalm2/bglue_algo.hpp
+++ b/gatb-core/src/gatb/bcalm2/bglue_algo.hpp
@@ -43,16 +43,21 @@ class BufferedFasta
std::string buffer;
unsigned long buffer_length;
FILE* _insertHandle;
+ bool needs_consecutive_ids;
+ unsigned long consecutive_id;
public:
unsigned long max_buffer;
bool threadsafe;
- BufferedFasta(const std::string filename, unsigned long given_max_buffer = 50000)
+ BufferedFasta(const std::string filename, unsigned long given_max_buffer = 50000, bool given_needs_consecutive_ids = false)
{
max_buffer = given_max_buffer; // that much of buffering will be written to the file at once (in bytes)
threadsafe = true;
buffer_length = 0;
+ consecutive_id = 0;
+ needs_consecutive_ids = given_needs_consecutive_ids;
+ if ((!threadsafe) && needs_consecutive_ids) { std::cout << "error, non-threadsafe bufferedfasta with consecutive ids asked" << std::endl; exit(1);}
_insertHandle = fopen (filename.c_str(), "w");
if (!_insertHandle) { std::cout << "error opening " << filename << " for writing." << std::endl; exit(1);}
buffer.reserve(max_buffer+1000/*security*/);
@@ -73,7 +78,13 @@ class BufferedFasta
if (buffer_length + insert_size > max_buffer)
flush();
buffer_length += insert_size;
- buffer += ">" + comment + "\n" + seq + "\n";
+ buffer += ">";
+ if (needs_consecutive_ids)
+ {
+ buffer += std::to_string(consecutive_id) + " ";
+ consecutive_id++;
+ }
+ buffer += comment + "\n" + seq + "\n";
if (threadsafe)
mtx.unlock();
}
diff --git a/gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp b/gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp
index a13668ac..b21d164e 100644
--- a/gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp
+++ b/gatb-core/src/gatb/debruijn/impl/LinkTigs.cpp
@@ -41,6 +41,9 @@ namespace gatb { namespace core { namespace debruijn { namespace impl {
* it uses the disk to store the links for extremities until they're merged into the final unitigs file.
*
* so the memory usage is just that of the hash tables that record kmers, not of the links
+ *
+ * Assumption: FASTA header of unitigs starts with a unique number (unitig ID).
+ * Normally bcalm outputs consecutive unitig ID's but LinkTigs could also work with non-consecutive, non-sorted IDs
*/
template<size_t span>
void link_tigs(string unitigs_filename, int kmerSize, int nb_threads, uint64_t &nb_unitigs, bool verbose)
@@ -219,7 +222,6 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
BankFasta inputBank (unitigs_filename);
BankFasta::Iterator itSeq (inputBank);
- uint64_t utig_counter = 0;
Model modelKminusOne(kmerSize - 1); // it's canonical (defined in the .hpp file)
@@ -235,12 +237,15 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
const string& seq = itSeq->toString();
if (debug) std::cout << "unitig: " << seq << std::endl;
+ const string& comment = itSeq->getComment();
+ unsigned long utig_id = std::stoul(comment.substr(0, comment.find(' ')));
+
if (is_in_pass(seq, pass, UNITIG_BEGIN, kmerSize))
{
if (debug) std::cout << "pass " << pass << " examining beginning" << std::endl;
typename Model::Kmer kmerBegin = modelKminusOne.codeSeed(seq.substr(0, kmerSize-1).c_str(), Data::ASCII);
bool beginInSameOrientation = modelKminusOne.toString(kmerBegin.value()) == seq.substr(0,kmerSize-1);
- ExtremityInfo eBegin(utig_counter, !beginInSameOrientation /* because we record rc*/, UNITIG_BEGIN);
+ ExtremityInfo eBegin(utig_id, !beginInSameOrientation /* because we record rc*/, UNITIG_BEGIN);
utigs_links_map[kmerBegin.value()].push_back(eBegin.pack());
}
if (is_in_pass(seq, pass, UNITIG_END, kmerSize))
@@ -248,11 +253,10 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
if (debug) std::cout << "pass " << pass << " examining end" << std::endl;
typename Model::Kmer kmerEnd = modelKminusOne.codeSeed(seq.substr(seq.size() - kmerSize+1).c_str(), Data::ASCII);
bool endInSameOrientation = modelKminusOne.toString(kmerEnd.value()) == seq.substr(seq.size() - kmerSize+1);
- ExtremityInfo eEnd( utig_counter, !endInSameOrientation, UNITIG_END);
+ ExtremityInfo eEnd( utig_id, !endInSameOrientation, UNITIG_END);
utigs_links_map[kmerEnd.value()].push_back(eEnd.pack());
// there is no UNITIG_BOTH here because we're taking (k-1)-mers.
}
- utig_counter++;
}
std::ofstream links_file(unitigs_filename+".links." +to_string(pass));
@@ -262,12 +266,13 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
nb_hashed_entries += v.second.size();
logging("step 2 (" + to_string(utigs_links_map.size()) + "kmers/" + to_string(nb_hashed_entries) + "extremities)");
- utig_counter = 0;
for (itSeq.first(); !itSeq.isDone(); itSeq.next())
{
const string& seq = itSeq->toString();
-
- if (debug) std::cout << "unitig: " << seq << std::endl;
+ const string& comment = itSeq->getComment();
+ unsigned long utig_id = std::stoul(comment.substr(0, comment.find(' ')));
+
+ if (debug) std::cout << "unitig " << std::to_string(utig_id) << " : " << seq << std::endl;
if (is_in_pass(seq, pass, UNITIG_BEGIN, kmerSize))
{
@@ -324,7 +329,7 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
if (debug) std::cout << std::endl;
}
- record_links(utig_counter, pass, in_links, links_file);
+ record_links(utig_id, pass, in_links, links_file);
}
@@ -371,10 +376,8 @@ void link_unitigs_pass(const string unitigs_filename, bool verbose, const int pa
}
if (debug) std::cout << std::endl;
}
- record_links(utig_counter, pass, out_links, links_file);
+ record_links(utig_id, pass, out_links, links_file);
}
-
- utig_counter++;
}
}
From 44d5a444f7d8b921b66065ef2d6270d3eb0f1446 Mon Sep 17 00:00:00 2001
From: rchikhi <rayan.chikhi@univ-lille1.fr>
Date: Tue, 25 Dec 2018 01:05:20 +0100
Origin: https://github.com/GATB/gatb-core/commit/44d5a44.patch
Subject: [PATCH] fixed kmer counting of IUPAC nucleotides
---
gatb-core/src/gatb/tools/misc/api/Data.cpp | 4 ++++
gatb-core/src/gatb/tools/misc/api/Data.hpp | 21 +++++++++++++++++----
2 files changed, 21 insertions(+), 4 deletions(-)
create mode 100644 gatb-core/src/gatb/tools/misc/api/Data.cpp
diff --git a/gatb-core/src/gatb/tools/misc/api/Data.cpp b/gatb-core/src/gatb/tools/misc/api/Data.cpp
new file mode 100644
index 00000000..cddc7e38
--- /dev/null
+++ b/gatb-core/src/gatb/tools/misc/api/Data.cpp
@@ -0,0 +1,4 @@
+#include <gatb/tools/misc/api/Data.hpp>
+
+const unsigned char gatb::core::tools::misc::Data::validNucleotide[]= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
diff --git a/gatb-core/src/gatb/tools/misc/api/Data.hpp b/gatb-core/src/gatb/tools/misc/api/Data.hpp
index 0c39e726..22e174a3 100644
--- a/gatb-core/src/gatb/tools/misc/api/Data.hpp
+++ b/gatb-core/src/gatb/tools/misc/api/Data.hpp
@@ -170,17 +170,30 @@ class Data : public Vector<char>
* - second : 0 if valid, 1 if invalid (in case of N character for instance) */
typedef std::pair<char,char> ConvertChar;
- /** Note for the ASCII conversion: the 4th bit is used to tell whether it is invalid or not.
- * => it finds out that 'N' character has this 4th bit equals to 1, which is not the case
- * for 'A', 'C', 'G' and 'T'. */
- struct ConvertASCII { static ConvertChar get (const char* buffer, size_t idx) { return ConvertChar((buffer[idx]>>1) & 3, (buffer[idx]>>3) & 1); }};
+ /** Used to have the following trick:
+ * (buffer[idx]>>3)& 1
+ * was equal to 1 for 'A', 'C', 'G' and 'T' (and also the lower case version)
+ * and was equal to 0 for 'N' and 'n'
+ * but unfortunately it doesn't work for some of the IUPAC codes, like 'R'
+ * */
+ struct ConvertASCII { static ConvertChar get (const char* buffer, size_t idx) { return ConvertChar((buffer[idx]>>1) & 3, validNucleotide[(unsigned char)(buffer[idx])]); }};
struct ConvertInteger { static ConvertChar get (const char* buffer, size_t idx) { return ConvertChar(buffer[idx],0); } };
struct ConvertBinary { static ConvertChar get (const char* buffer, size_t idx) { return ConvertChar(((buffer[idx>>2] >> ((3-(idx&3))*2)) & 3),0); } };
+ static const unsigned char validNucleotide[];
private:
/** Encoding scheme of the data instance. */
Encoding_e encoding;
+
+ /* generated using:
+ * codes=[1]*256
+ * codes[ord('A')] = codes[ord('a')] = 0;
+ * codes[ord('C')] = codes[ord('c')] = 0;
+ * codes[ord('G')] = codes[ord('g')] = 0;
+ * codes[ord('T')] = codes[ord('t')] = 0;
+ * print(codes)
+ */
};
/********************************************************************************/
From 5d4566430e0f708b602d26e0b54bd21dd4d7852b Mon Sep 17 00:00:00 2001
From: rchikhi <rayan.chikhi@univ-lille1.fr>
Date: Sat, 20 Oct 2018 10:30:53 +0200
Origin: https://github.com/GATB/gatb-core/commit/5d45664.patch
Subject: [PATCH] fix typo gatb/minia#11
---
gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp b/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
index 96cf3b2f..a86969f5 100644
--- a/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
+++ b/gatb-core/src/gatb/kmer/impl/SortingCountAlgorithm.cpp
@@ -212,7 +212,7 @@ IOptionsParser* SortingCountAlgorithm<span>::getOptionsParser (bool mandatory)
parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN_THRESHOLD,"min abundance hard threshold (only used when min abundance is \"auto\")",false, "2"));
parser->push_back (new OptionOneParam (STR_HISTOGRAM_MAX, "max number of values in kmers histogram", false, "10000"));
parser->push_back (new OptionOneParam (STR_SOLIDITY_KIND, "way to compute counts of several files (sum, min, max, one, all, custom)",false, "sum"));
- parser->push_back (new OptionOneParam (STR_SOLIDITY_CUSTOM, "when solidity-kind is cutom, specifies list of files where kmer must be present",false, ""));
+ parser->push_back (new OptionOneParam (STR_SOLIDITY_CUSTOM, "when solidity-kind is custom, specifies list of files where kmer must be present",false, ""));
parser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (in MBytes)", false, "5000"));
parser->push_back (new OptionOneParam (STR_MAX_DISK, "max disk (in MBytes)", false, "0"));
parser->push_back (new OptionOneParam (STR_URI_SOLID_KMERS, "output file for solid kmers (only when constructing a graph)", false));
From 61c5ee19d44812f506258c22177c684936f8fd6f Mon Sep 17 00:00:00 2001
From: rchikhi <rayan.chikhi@univ-lille1.fr>
Date: Mon, 22 Oct 2018 11:48:08 +0200
Origin: https://github.com/GATB/gatb-core/commit/61c5ee1.patch
Subject: [PATCH] small bugfix
---
gatb-core/test/unit/src/debruijn/TestDebruijn.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp b/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
index 1be96659..8d12e1fa 100644
--- a/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
+++ b/gatb-core/test/unit/src/debruijn/TestDebruijn.cpp
@@ -1298,7 +1298,7 @@ class TestDebruijn : public Test
Node n2 = graph.reverse (n1);
CPPUNIT_ASSERT (n2.strand == STRAND_REVCOMP);
- CPPUNIT_ASSERT (graph.toString(n2).compare ("TCAG") == 0 || graph.toString(n1).compare ("TGGA") == 0);
+ CPPUNIT_ASSERT (graph.toString(n2).compare ("TCAG") == 0 || graph.toString(n2).compare ("TGGA") == 0);
}
From da4ceed164c761ad264b5b0b4e6745a9d9e33ea0 Mon Sep 17 00:00:00 2001
From: rchikhi <rayan.chikhi@univ-lille1.fr>
Date: Mon, 24 Dec 2018 15:12:41 +0100
Origin: https://github.com/GATB/gatb-core/commit/da4ceed.patch
Subject: [PATCH] minor cleanup
---
gatb-core/src/gatb/bcalm2/bglue_algo.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
index c12d1f79..6c1ec575 100644
--- a/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
+++ b/gatb-core/src/gatb/bcalm2/bglue_algo.cpp
@@ -961,10 +961,9 @@ void bglue(Storage *storage,
// partition the glue into many files, à la dsk
- std::mutex mtx; // lock to avoid a nasty bug when calling output()
auto partitionGlue = [k, &modelCanon /* crashes if copied!*/, \
&get_UFclass, &gluePartitions,
- &out, &outLock, &nb_seqs_in_partition, nbGluePartitions, &mtx]
+ &out, &outLock, &nb_seqs_in_partition, nbGluePartitions]
(const Sequence& sequence)
{
const string &seq = sequence.toString();
@@ -1045,7 +1044,7 @@ void bglue(Storage *storage,
for (int partition = 0; partition < nbGluePartitions; partition++)
{
auto glue_partition = [&modelCanon, &ufkmers, partition, &gluePartition_prefix, nbGluePartitions, &copy_nb_seqs_in_partition,
- &get_UFclass, &out, &outLock, kmerSize, &mtx]( int thread_id)
+ &get_UFclass, &out, &outLock, kmerSize]( int thread_id)
{
int k = kmerSize;
......@@ -6,8 +6,3 @@ set_soversion.patch
dynamic_linking_of_tools.patch
multiarch.patch
spelling.patch
#44d5a44.patch
#da4ceed.patch
#0c580ab.patch
#61c5ee1.patch
#5d45664.patch
......@@ -9,7 +9,7 @@ Description: Invent soversion 0 since upstream does not set soversion
set_target_properties (gatbcore-static PROPERTIES OUTPUT_NAME gatbcore clean_direct_output 1)
-set_target_properties (gatbcore-dynamic PROPERTIES OUTPUT_NAME gatbcore clean_direct_output 1)
+set_target_properties (gatbcore-dynamic PROPERTIES OUTPUT_NAME gatbcore SOVERSION 1 clean_direct_output 1)
+set_target_properties (gatbcore-dynamic PROPERTIES OUTPUT_NAME gatbcore SOVERSION 2 clean_direct_output 1)
################################################################################
# INSTALLATION
version=4
# New files on GitHub
opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
https://github.com/GATB/gatb-core/releases .*/archive/v(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
# Minia does not work with the official release - try HEAD
opts="mode=git,pretty=1.4.1+git%cd.%h,repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
https://github.com/GATB/gatb-core.git HEAD
# Regular releases
#opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
# https://github.com/GATB/gatb-core/releases .*/archive/v(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
CMakeLists.txt.user
/build*
/.settings/
/.cproject
*.leon
.DS_Store
......@@ -3,6 +3,7 @@
################################################################################
FIND_PATH (CPPUNIT_INCLUDE_DIR cppunit/extensions/HelperMacros.h
$ENV{HOME}/include
$ENV{HOME}/.linuxbrew/include
/local/include
/usr/include
......@@ -18,10 +19,12 @@ FIND_LIBRARY (CPPUNIT_LIBRARY cppunit
)
# A little hack here... We check whether the static library is reachable too.
IF (EXISTS "${CPPUNIT_INCLUDE_DIR}/cppunit/extensions/HelperMacros.h")
if (NOT EXISTS "${CPPUNIT_INCLUDE_DIR}/../lib/libcppunit.a")
message ("-- CppUnit: found headers (in ${CPPUNIT_INCLUDE_DIR}) but not the static library (${CPPUNIT_INCLUDE_DIR}/../lib/libcppunit.a)")
SET (CPPUNIT_NO_STATIC_LIB_FOUND 1)
endif()
endif()
IF (CPPUNIT_INCLUDE_DIR)
IF (CPPUNIT_LIBRARY)
......
/main.aux
/main.log
/main.pdf
......@@ -42,12 +42,16 @@ int main (int argc, char* argv[])
IBank* bank2 = Bank::open (options->getStr(STR_BANK2)); LOCAL (bank2);
// We iterate the two banks. Note how we provide two iterators from the two banks.
PairedIterator<Sequence> itPair (bank1->iterator(), bank2->iterator());
PairedIterator<Sequence> * itPair = new PairedIterator<Sequence> (bank1->iterator(), bank2->iterator());
LOCAL(itPair);
for (itPair.first(); !itPair.isDone(); itPair.next())
ProgressIterator< std::pair <Sequence, Sequence>> progress_iter (itPair, "coucou la paire", bank1->estimateNbItems());
for (progress_iter.first(); !progress_iter.isDone(); progress_iter.next())
{
Sequence& s1 = itPair.item().first;
Sequence& s2 = itPair.item().second;
Sequence& s1 = itPair->item().first;
Sequence& s2 = itPair->item().second;
std::cout << "seq1.len=" << s1.getDataSize() << " seq2.len=" << s2.getDataSize() << std::endl;
}
......
/*
 * KAT-like analysis of a genome assembly.
 *
 * Given two input datasets — first the read set, then the genome, in that
 * order — outputs the 2D histogram of kmer abundances.
 *
 * Sample command: histo2D -in readset.fasta,genome.fasta -kmer-size 25 > resultfile
 */
#include <gatb/gatb_core.hpp>
using namespace std;
int dim1 = 10000; // max kmer occurence in read set
int dim2 = 10; // max kmer occurence in genome
#define IDX(i,j) ((i) + (j)*dim1)
/********************************************************************************/
template<size_t span>
class CountProcessorCustom : public CountProcessorAbstract<span>
{
public:
// We need the kmer size to dump kmers values as nucl strings
CountProcessorCustom (size_t kmerSize, ISynchronizer* synchro, u_int64_t * histo_table_2D) : kmerSize(kmerSize), synchro(synchro), _histo_table_2D(histo_table_2D) {}
virtual ~CountProcessorCustom () {}
CountProcessorAbstract<span>* clone () { return new CountProcessorCustom<span>(kmerSize, synchro,_histo_table_2D); }
virtual bool process (size_t partId, const typename Kmer<span>::Type& kmer, const CountVector& count, CountNumber sum)
{
if( count[0] < dim1 && count[1] < dim2)
__sync_fetch_and_add ( & _histo_table_2D[IDX( count[0] , count[1] )], 1);
return true;
}
private:
size_t kmerSize;
ISynchronizer *synchro;
u_int64_t * _histo_table_2D;
};
/********************************************************************************/
template<size_t span> struct MainLoop { void operator () (IProperties* options)
{
//check that the user gave 2 input banks
std::string banknames = options->getStr(STR_URI_INPUT);
size_t n = std::count(banknames.begin(), banknames.end(), ',');
if( (n+1) != 2)
{
printf("There must be 2 input banks.\n");
exit(1);
}
u_int64_t * histo_table_2D = (u_int64_t *) calloc( dim1 * dim2 , sizeof(u_int64_t *));
// We force the solidity kind (otherwise default value "sum" will consider N banks as a single one)
options->setStr(STR_SOLIDITY_KIND, "all");
// We create a SortingCountAlgorithm instance.
SortingCountAlgorithm<span> algo (options);
// global synchronization
ISynchronizer* synchro = System::thread().newSynchronizer();
// We create a custom count processor and give it to the sorting count algorithm
algo.addProcessor (new CountProcessorCustom<span> (options->getInt(STR_KMER_SIZE), synchro,histo_table_2D));
// We launch the algorithm
algo.execute();
//outptut the 2D histogram
for(int ii=0; ii< dim1; ii++)
{
printf("%5i:\t",ii);
for(int jj=0; jj< dim2; jj++)
{
printf("\t%6lli",histo_table_2D[IDX(ii,jj)]);
}
printf("\n");
}
free(histo_table_2D);
}};
/********************************************************************************/
int main (int argc, char* argv[])
{
// We create a command line parser for the sorting count algorithm
IOptionsParser* parser = SortingCountAlgorithm<>::getOptionsParser ();
parser->push_back (new OptionOneParam (STR_NB_CORES, "nb cores", false, "1"));
parser->push_back (new OptionOneParam (STR_VERBOSE, "verbosity", false, "1"));
// We launch our functor
return Algorithm::mainloop <MainLoop> (parser, argc, argv);
}
......@@ -138,6 +138,30 @@ struct Sequence
* \param[in] qual : quality string of the sequence. */
void setQuality (const std::string& qual) { _quality = qual; }
/** Returns a string that is the reverse complement of the sequence
* The Sequence object needs to be in ASCII Format
**/
std::string getRevcomp () const {
std::string rc;
std::string s = toString();
for (signed int i = s.length() - 1; i >= 0; i--) {
unsigned char c;
switch(s[i]){
case 'A': c='T';break;
case 'C': c='G';break;
case 'G': c='C';break;
case 'T': c='A';break;
case 'a': c='t';break;
case 'c': c='g';break;
case 'g': c='c';break;
case 't': c='a';break;
default: c='X';break;
}
rc += c;
}
return rc;
}
/** Comment attribute (note: should be private with a setter and getter). */
std::string _comment;
......
......@@ -253,8 +253,9 @@ static void determine_order_sequences(vector<vector<uint32_t>> &res, const vecto
kmerIndex[markedSequences[i].ke].insert(i);
}
auto glue_from_extremity = [&](markedSeq<SPAN> current, uint32_t chain_index, uint32_t markedSequence_index)
auto glue_from_extremity = [&](markedSeq<SPAN> current, uint32_t chain_index, uint32_t markedSequence_index, bool expect_circular=false)
{
uint32_t first_index = markedSequence_index;
vector<uint32_t> chain;
chain.push_back(chain_index);
......@@ -308,10 +309,23 @@ static void determine_order_sequences(vector<vector<uint32_t>> &res, const vecto
current = successor;
markedSequence_index = successor_index;
chain.push_back(chain_index);
rmark = current.rmark;
if (expect_circular)
{
if (usedSeq.find(markedSequence_index) != usedSeq.end())
{
assert(markedSequence_index == first_index);
break;
}
}
else
{
assert((usedSeq.find(markedSequence_index) == usedSeq.end()));
}
usedSeq.insert(markedSequence_index);
chain.push_back(chain_index);
rmark = current.rmark;
}
res.push_back(chain);
......@@ -351,23 +365,26 @@ static void determine_order_sequences(vector<vector<uint32_t>> &res, const vecto
}
/* // attempt to fix circular contigs, but I was in a hurry, so not finished
while (nb_chained < sequences.size())
{
// there might be a special case: a circular unitig, to be glued with multiple sequences all containing doubled kmers at extremities
// special case: a circular unitig, to be glued with multiple sequences all containing doubled kmers at extremities
// my fix plan: we pick an extremity at random, and chop the last nucleotide and mark it to not be glued. also find the corresponding kmer in other extremity, and mark it as not to be glued
while (nb_chained < markedSequences.size())
{
//std::cout << "nb chained " << nb_chained << " markedseq size" << markedSequences.size() << std::endl;
vector<int> remaining_indices;
for (uint32_t i = 0; i < markedSequences.size(); i++)
{
if (usedSeq.find(i) != usedSeq.end())
if (usedSeq.find(i) == usedSeq.end())
remaining_indices.push_back(i);
}
uint32_t chain_index = remaining_indices[0];
string kmer = markedSequences[chain_index].substr(0,kmerSize);
// markedSequences[chain_index] = // TODO continue
assert(remaining_indices.size()>0);
markedSeq<SPAN> current = markedSequences[remaining_indices[0]];
uint32_t chain_index = markedSequences[remaining_indices[0]].index;
glue_from_extremity(current, chain_index, remaining_indices[0], true);
}
*/
if (nb_chained < markedSequences.size())
{
std::cout << " Note: " << markedSequences.size() - nb_chained << " sequence chunks not returned in output unitigs (likely small circular contigs)" << std::endl;
......@@ -732,14 +749,13 @@ void bglue(Storage *storage,
}
if (uf_hashes.size() == 0) // prevent an edge case when there's nothing to glue, boophf doesn't like it
uf_hashes.push_back(0);
for (int pass = 0; pass < nb_prepare_passes; pass++)
System::file().remove (prefix+".glue.hashes." + to_string(pass));
unsigned long nb_uf_keys = uf_hashes.size();
logging("loaded all unique UF elements (" + std::to_string(nb_uf_keys) + ") into a single file vector of size " + to_string(nb_uf_keys* sizeof(partition_t) / 1024/1024) + " MB");
logging("loaded all unique UF elements (" + std::to_string(nb_uf_keys) + ") into a single vector of size " + to_string(nb_uf_keys* sizeof(partition_t) / 1024/1024) + " MB");
int gamma = 3; // make it even faster.
// TODO: why not iterate the UF hashes from disk instead of loading them in memory?
boomphf::mphf<partition_t , /*TODO we don't need hasher_t here now that we're not hashing kmers, but I forgot to change*/ hasher_t< partition_t> > uf_mphf(nb_uf_keys, uf_hashes, nb_threads, gamma, verbose);
free_memory_vector(uf_hashes);
......@@ -750,9 +766,9 @@ void bglue(Storage *storage,
logging("UF MPHF constructed (" + std::to_string(uf_mphf_memory/8/1024/1024) + " MB)" );
}
// create a UF data structure
unionFind<uint32_t> ufkmers(nb_uf_keys);
// this one stores nb_uf_keys * uint64_t (actually, atomic's). so it's bigger than uf_hashes
unionFind ufkmers(nb_uf_keys);
#if 0
unionFind<unsigned int> ufmin;
......@@ -843,6 +859,11 @@ void bglue(Storage *storage,
logging("UF constructed");
// remove the UF hashes, could have done that earlier (right before bbhash), but i'd like to keep them in case of segfault
for (int pass = 0; pass < nb_prepare_passes; pass++)
System::file().remove (prefix+".glue.hashes." + to_string(pass));
if (debug_uf_stats) // for debugging
{
ufkmers.printStats("uf kmers");
......@@ -883,9 +904,7 @@ void bglue(Storage *storage,
// setup output file
string output_prefix = prefix;
std::atomic<unsigned long> out_id; // identified for output sequences
out_id = 0;
BufferedFasta out (output_prefix, 100000);
BufferedFasta out (output_prefix, 100000, true);
auto get_UFclass = [&modelCanon, &ufkmers_vector, &hasher, &uf_mphf]
(const string &kmerBegin, const string &kmerEnd,
......@@ -942,10 +961,9 @@ void bglue(Storage *storage,
// partition the glue into many files, à la dsk
std::mutex mtx; // lock to avoid a nasty bug when calling output()
auto partitionGlue = [k, &modelCanon /* crashes if copied!*/, \
&get_UFclass, &gluePartitions,
&out, &outLock, &nb_seqs_in_partition, &out_id, nbGluePartitions, &mtx]
&out, &outLock, &nb_seqs_in_partition, nbGluePartitions]
(const Sequence& sequence)
{
const string &seq = sequence.toString();
......@@ -971,12 +989,8 @@ void bglue(Storage *storage,
float mean_abundance = get_mean_abundance(abundances);
uint32_t sum_abundances = get_sum_abundance(abundances);
// for some reason i do need that lock_guard here.. even though output is itself lock guarded. maybe some lazyness in the evauation of the to_string(out_id++)? who kon
// anyway this fixes the problem, i'll understand it some other time.
std::lock_guard<std::mutex> lock(mtx);
output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
// km is not a standard GFA field so i'm putting it in lower case as per the spec
// maybe could optimize by writing to disk using queues, if that's ever a bottleneck
output(seq, out, "LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
return;
}
......@@ -1030,7 +1044,7 @@ void bglue(Storage *storage,
for (int partition = 0; partition < nbGluePartitions; partition++)
{
auto glue_partition = [&modelCanon, &ufkmers, partition, &gluePartition_prefix, nbGluePartitions, &copy_nb_seqs_in_partition,
&get_UFclass, &out, &outLock, &out_id, kmerSize, &mtx]( int thread_id)
&get_UFclass, &out, &outLock, kmerSize]( int thread_id)
{
int k = kmerSize;
......@@ -1078,7 +1092,7 @@ void bglue(Storage *storage,
markedSeq<SPAN> ms(seq_index, lmark, rmark, kmmerBegin.value(), kmmerEnd.value());
//if (ufclass == 38145) std::cout << " ufclass " << ufclass << " seq " << seq << " seq index " << seq_index << " " << lmark << rmark << " ks " << kmmerBegin.value() << " ke " << kmmerEnd.value() << std::endl; // debug specific partition
//std::cout << " ufclass " << ufclass << " seq " << seq << " seq index " << seq_index << " " << lmark << rmark << " ks " << kmmerBegin.value() << " ke " << kmmerEnd.value() << std::endl; // debug specific partition
msInPart[ufclass].push_back(ms);
seq_index++;
}
......@@ -1119,10 +1133,7 @@ void bglue(Storage *storage,
float mean_abundance = get_mean_abundance(abs);
uint32_t sum_abundances = get_sum_abundance(abs);
{
// for some reason i do need that lock_guard here.. even though output is itself lock guarded. maybe some lazyness in the evauation of the to_string(out_id++)? who kon
// anyway this fixes the problem, i'll understand it some other time.
std::lock_guard<std::mutex> lock(mtx);
output(seq, out, std::to_string(out_id++) + " LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
output(seq, out, "LN:i:" + to_string(seq.size()) + " KC:i:" + to_string(sum_abundances) + " km:f:" + to_string_with_precision(mean_abundance));
}
}
......
......@@ -43,16 +43,21 @@ class BufferedFasta
std::string buffer;
unsigned long buffer_length;
FILE* _insertHandle;
bool needs_consecutive_ids;
unsigned long consecutive_id;
public:
unsigned long max_buffer;
bool threadsafe;
BufferedFasta(const std::string filename, unsigned long given_max_buffer = 50000)
BufferedFasta(const std::string filename, unsigned long given_max_buffer = 50000, bool given_needs_consecutive_ids = false)
{
max_buffer = given_max_buffer; // that much of buffering will be written to the file at once (in bytes)
threadsafe = true;
buffer_length = 0;
consecutive_id = 0;
needs_consecutive_ids = given_needs_consecutive_ids;
if ((!threadsafe) && needs_consecutive_ids) { std::cout << "error, non-threadsafe bufferedfasta with consecutive ids asked" << std::endl; exit(1);}
_insertHandle = fopen (filename.c_str(), "w");
if (!_insertHandle) { std::cout << "error opening " << filename << " for writing." << std::endl; exit(1);}
buffer.reserve(max_buffer+1000/*security*/);
......@@ -73,7 +78,13 @@ class BufferedFasta
if (buffer_length + insert_size > max_buffer)
flush();
buffer_length += insert_size;
buffer += ">" + comment + "\n" + seq + "\n";
buffer += ">";
if (needs_consecutive_ids)
{
buffer += std::to_string(consecutive_id) + " ";
consecutive_id++;
}
buffer += comment + "\n" + seq + "\n";
if (threadsafe)
mtx.unlock();
}
......