Commit e0f90724 authored by Andreas Tille's avatar Andreas Tille

New upstream version 1.1.2+dfsg

parent ff988fa7
......@@ -3,8 +3,8 @@
url = https://github.com/jwalabroad/fermi-lite
[submodule "htslib"]
path = htslib
url = https://github.com/walaj/htslib
branch = develop
url = https://github.com/samtools/htslib
branch = master
[submodule "bwa"]
path = bwa
url = https://github.com/jwalabroad/bwa
......
......@@ -19,7 +19,7 @@ then
## download the test data
mkdir test_data
cd test_data
wget -r -nH -nd -np -R index.html* https://data.broadinstitute.org/snowman/SeqLibTest/
wget -r -nH -nd -np -R index.html* https://data.broadinstitute.org/snowman/SeqLib/
cd ..
export LD_LIBRARY_PATH=${BOOST_ROOT}/lib:${LD_LIBRARY_PATH}
......
......@@ -285,7 +285,6 @@ pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
......
......@@ -9,6 +9,13 @@ API Documentation
-----------------
[API Documentation][htmldoc]
Citation
--------
If you use SeqLib in your applications, please cite: http://bioinformatics.oxfordjournals.org/content/early/2016/12/21/bioinformatics.btw741.full.pdf+html
Note that the values for the SeqAn benchmarking in Table 2 should be corrected to 7.7 Gb memory and 33.92 seconds in CPU time, when compiling SeqAn with ``-O3 -DNDEBUG``. SeqAn also does full string decompression.
Wall times for SeqAn may be shorter than CPU time because it uses embedded multi-threading during BAM IO.
Table of contents
=================
......@@ -51,7 +58,7 @@ C_INCLUDE_PATH=$C_INCLUDE_PATH:$SEQ:$SEQ/htslib
And need to link the SeqLib static library and Fermi, BWA and HTSlib libraries
```bash
SEQ=<path_to_seqlib>
LDFLAGS="$LDFLAGS -L$SEQ/bin/libseqlib.a -L$SEQ/bin/libbwa.a -L$SEQ/bin/libfml.a -L$SEQ/bin/libhts.a"
LDFLAGS="$LDFLAGS $SEQ/bin/libseqlib.a $SEQ/bin/libbwa.a $SEQ/bin/libfml.a $SEQ/bin/libhts.a"
```
To add support for reading BAMs, etc with HTTPS, FTP, S3, Google cloud, etc, you must compile and link with libcurl.
......@@ -101,13 +108,14 @@ provide excellent and high quality APIs. SeqLib provides further performance enh
bioinformatics problems.
Some differences:
* SeqLib has ~2-4x faster read/write speed over BamTools and SeqAn, and lower memory footprint.
* SeqLib has ~2-4x faster read/write speed over BamTools and lower memory footprint.
* SeqLib has support for CRAM file
* SeqLib provides in memory access to BWA-MEM, BLAT, a chromosome aware interval tree and range operations, and to read correction and sequence assembly with Fermi. BamTools has more support currently for network access.
* SeqAn provide a substantial amount of additional capabilites not in SeqLib, including graph operations and a more expanded suite of multi-sequence alignments.
* SeqLib provides in memory access to BWA-MEM, BLAT, chromosome aware interval tree, read correction, and sequence assembly with Fermi.
* SeqAn provide a substantial amount of additional capabilites not in SeqLib, including graph operations and an expanded suite of multi-sequence alignments.
* SeqAn embeds multi-threading into some functionality like BAM IO to improve wall times.
For your particular application, our hope is that SeqLib will provide a comprehensive and powerful envrionment to develop
bioinformatics tools. Feature requests and comments are welcomed.
bioinformatics tools, or to be used in conjuction with the capablities in SeqAn and BamTools. Feature requests and comments are welcomed.
Command Line Usage
------------------
......@@ -152,7 +160,7 @@ bwa.ConstructIndex(usv);
std::string querySeq = "CAGCCTCACCCAGGAAAGCAGCTGGGGGTCCACTGGGCTCAGGGAAG";
BamRecordVector results;
// hardclip=false, secondary score cutoff=0.9, max secondary alignments=10
bwa.AlignSequence("my_seq", querySeq, results, false, 0.9, 10);
bwa.AlignSequence(querySeq, "my_seq", results, false, 0.9, 10);
// print results to stdout
for (auto& i : results)
......@@ -293,21 +301,22 @@ w.Close(); // Optional. Will close on destruction
using namespace SeqLib;
// brv is some set of reads to train the error corrector
b.TrainCorrection(brv);
for (BamRecordVector::const_iterator r = brv.begin(); r != brv.end(); ++r)
b.AddSequence(r->Sequence().c_str(), r->Qualities().c_str(), r->Qname().c_str());
b.Train();
b.clear(); // clear the training sequences. Training parameters saved in BFC object
// brv2 is some set to correct
b.ErrorCorrect(brv2);
for (BamRecordVector::const_iterator r = brv2.begin(); r != brv2.end(); ++r)
b.AddSequence(r->Sequence().c_str(), r->Qualities().c_str(), r->Qname().c_str());
b.ErrorCorrect();
// retrieve the sequences
UnalignedSequenceVector v;
b.GetSequences(v);
// alternatively, to train and correct the same set of reads
b.TrainAndCorrect(brv);
b.GetSequences(v);
std::string name, seq;
while (b.GetSequences(seq, name))
v.push_back({name, seq});
// alternatively, train and correct, and modify the sequence in-place
b.TrainCorrection(brv);
b.ErrorCorrectInPlace(brv);
```
Support
......
......@@ -24,6 +24,7 @@ namespace SeqLib {
public:
/** Construct a new BFC engine */
BFC() {
m_idx = 0;
bfc_opt_init(&bfc_opt);
ch = NULL;
kmer = 0;
......@@ -43,57 +44,31 @@ namespace SeqLib {
bfc_ch_destroy(ch);
}
/** Allocate a block of memory for the reads if the amount to enter is known
* @note This is not necessary, as reads will dynamically reallocate
*/
bool AllocateMemory(size_t n);
/** Peform BFC error correction on the sequences stored in this object */
bool ErrorCorrect();
/** Train the error corrector using the reads stored in this object */
bool Train();
/** Add a sequence for either training or correction */
bool AddSequence(const BamRecord& r);
/** Add a sequence for either training or correction */
/** Add a sequence for either training or correction
* @param seq A sequence to be copied into this object (A, T, C, G)
*/
bool AddSequence(const char* seq, const char* qual, const char* name);
/** Set the k-mer size */
/** Set the k-mer size for training
* @note zero is auto
*/
void SetKmer(int k) { kmer = k; }
/** Train error correction using sequences from aligned reads */
void TrainCorrection(const BamRecordVector& brv);
/** Train error correction from raw character strings */
void TrainCorrection(const std::vector<char*>& v);
/** Train and error correction on same reads */
void TrainAndCorrect(const BamRecordVector& brv);
/** Error correct a collection of reads */
void ErrorCorrect(const BamRecordVector& brv);
/** Error correct in place, modify sequence, and the clear memory from this object */
void ErrorCorrectInPlace(BamRecordVector& brv);
/** Correct a single new sequence not stored in object
* @param str Sequence of string to correct (ACTG)
* @param q Quality score of sequence to correct
* @value Returns true if corrected */
bool CorrectSequence(std::string& str, const std::string& q);
/** Error correct and add tag with the corrected sequence data, and the clear memory from this object
* @param brv Aligned reads to error correct
* @param tag Tag to assign error corrected sequence to (eg KC)
* @exception Throws an invalid_argument if tag is not length 2
*/
void ErrorCorrectToTag(BamRecordVector& brv, const std::string& tag);
/** Return the reads (error corrected if ran ErrorCorrect) */
void GetSequences(UnalignedSequenceVector& v) const;
/** Clear the stored reads */
void clear();
/** Filter reads with unique k-mers. Do after error correction */
void FilterUnique();
/** Return the calculated kcov */
float GetKCov() const { return kcov; }
......@@ -103,8 +78,21 @@ namespace SeqLib {
/** Return the number of sequences controlled by this */
int NumSequences() const { return n_seqs; }
/** Return the next sequence stored in object
* @param s Empty string to be filled.
* @param q Empty string name to be filled.
* @value True if string was filled with sequence. False if no more sequences.
*/
bool GetSequence(std::string& s, std::string& q);
/** Reset the sequence iterator inside GetSequence
*/
void ResetGetSequence() { m_idx = 0; };
private:
size_t m_idx;
// the amount of memory allocated
size_t m_seqs_size;
......@@ -146,6 +134,8 @@ namespace SeqLib {
// assign names, qualities and seq to m_seqs
void allocate_sequences_from_char(const std::vector<char*>& v);
void allocate_sequences_from_strings(const std::vector<std::string>& v);
// do the actual read correction
void correct_reads();
......
......@@ -42,7 +42,8 @@ namespace SeqLib {
private:
// do the read loading
bool load_read(BamRecord& r);
// the return value here is just passed along from sam_read1
int32_t load_read(BamRecord& r);
void reset() {
empty = true;
......
......@@ -108,6 +108,14 @@ class CigarField {
public:
/** Construct an empty CIGAR */
Cigar() {}
/** Construct from a CIGAR string
* @param cig CIGAR string, e.g. 54M46S
*/
Cigar(const std::string& cig);
typedef std::vector<CigarField>::iterator iterator; ///< Iterator for move between CigarField ops
typedef std::vector<CigarField>::const_iterator const_iterator; ///< Iterator (const) for move between CigarField ops
iterator begin() { return m_data.begin(); } ///< Iterator (aka std::vector<CigarField>.begin()
......@@ -185,11 +193,8 @@ class CigarField {
};
//typedef std::vector<CigarField> Cigar;
typedef SeqHashMap<std::string, size_t> CigarMap;
Cigar cigarFromString(const std::string& cig);
/** Class to store and interact with a SAM alignment record
*
* HTSLibrary reads are stored in the bam1_t struct. Memory allocation
......@@ -345,7 +350,10 @@ class BamRecord {
int32_t CountBWAChimericAlignments() const;
/** Get the end of the alignment */
inline int32_t PositionEnd() const { return b ? bam_endpos(b.get()) : -1; }
int32_t PositionEnd() const;
/** Get the end of the aligment mate pair */
int32_t PositionEndMate() const;
/** Get the chromosome ID of the read */
inline int32_t ChrID() const { return b ? b->core.tid : -1; }
......@@ -398,8 +406,8 @@ class BamRecord {
inline std::string ParseReadGroup() const {
// try to get from RG tag first
std::string RG = GetZTag("RG");
if (!RG.empty())
std::string RG;
if (GetZTag("RG", RG))
return RG;
// try to get the read group tag from qname second
......@@ -414,7 +422,7 @@ class BamRecord {
if (b->core.tid != b->core.mtid || !PairMappedFlag())
return 0;
return std::abs(b->core.pos - b->core.mpos) + Length();
return std::abs(b->core.pos - b->core.mpos) + GetCigar().NumQueryConsumed();
}
......@@ -559,8 +567,8 @@ class BamRecord {
uint8_t * p = bam_get_qual(b);
if (!p)
return std::string();
if (!p[0])
return std::string();
//if (!p[0])
// return std::string();
std::string out(b->core.l_qseq, ' ');
for (int32_t i = 0; i < b->core.l_qseq; ++i)
out[i] = (char)(p[i] + offset);
......@@ -659,9 +667,17 @@ class BamRecord {
/** Get a string (Z) tag
* @param tag Name of the tag. eg "XP"
* @return The value stored in the tag. Returns empty string if it does not exist.
* @param s The string to be filled in with the tag information
* @return Returns true if the tag is present, even if empty. Return false if no tag or not a Z tag.
*/
std::string GetZTag(const std::string& tag) const;
bool GetZTag(const std::string& tag, std::string& s) const;
/** Get a string of either Z, f or i type. Useful if tag type not known at compile time.
* @param tag Name of the tag. eg "XP"
* @param s The string to be filled in with the tag information
* @return Returns true if the tag is present and is either Z or i, even if empty. Return false if no tag or not Z or i.
*/
bool GetTag(const std::string& tag, std::string& s) const;
/** Get a vector of type int from a Z tag delimited by "^"
* Smart-tags allow one to store vectors of strings, ints or doubles in the alignment tags, and
......@@ -691,13 +707,28 @@ class BamRecord {
/** Get an int (i) tag
* @param tag Name of the tag. eg "XP"
* @return The value stored in the tag. Returns 0 if it does not exist.
* @param t Value to be filled in with the tag value.
* @return Return true if the tag exists.
*/
inline int32_t GetIntTag(const std::string& tag) const {
inline bool GetIntTag(const std::string& tag, int32_t& t) const {
uint8_t* p = bam_aux_get(b.get(),tag.c_str());
if (!p)
return 0;
return bam_aux2i(p);
return false;
t = bam_aux2i(p);
return true;
}
/** Get a float (f) tag
* @param tag Name of the tag. eg "AS"
* @param t Value to be filled in with the tag value.
* @return Return true if the tag exists.
*/
inline bool GetFloatTag(const std::string& tag, float& t) const {
uint8_t* p = bam_aux_get(b.get(),tag.c_str());
if (!p)
return false;
t = bam_aux2f(p);
return true;
}
/** Add a string (Z) tag
......@@ -825,7 +856,10 @@ class BamRecord {
*/
int OverlappingCoverage(const BamRecord& r) const;
private:
/** Return the shared pointer */
SeqPointer<bam1_t> shared_pointer() const { return b; }
protected:
SeqPointer<bam1_t> b; // bam1_t shared pointer
......
#ifndef SNOWUTILS_H
#define SNOWUTILS_H
#ifndef SEQLIB_UTILS_H
#define SEQLIB_UTILS_H
#include <string>
#include <time.h>
......
#ifndef SEQLIB_UNALIGNED_SEQ_H__
#define SEQLIB_UNALIGNED_SEQ_H__
#ifndef SEQLIB_UNALIGNED_SEQ_H
#define SEQLIB_UNALIGNED_SEQ_H
extern "C" {
#include "bwa/bwa.h"
......@@ -26,20 +26,20 @@ namespace SeqLib {
/** Construct an empty sequence */
UnalignedSequence() {}
/** Construct an unaliged sequence with name and sequence
/** Construct an unaligned sequence with name and sequence
* @param n Name of the sequence
* @param s Sequence, stored as ACTG or N characters
*/
UnalignedSequence(const std::string& n, const std::string& s) : Name(n), Seq(s), Qual(std::string()), Strand('*') {}
/** Construct an unaliged sequence with name, sequence and quality score
/** Construct an unaligned sequence with name, sequence and quality score
* @param n Name of the sequence
* @param s Sequence, stored as ACTG or N characters
* @param q Quality string
*/
UnalignedSequence(const std::string& n, const std::string& s, const std::string& q) : Name(n), Seq(s), Qual(q), Strand('*') {}
/** Construct an unaliged sequence with name, sequence and quality score
/** Construct an unaligned sequence with name, sequence, quality score and strand
* @param n Name of the sequence
* @param s Sequence, stored as ACTG or N characters
* @param q Quality string
......
##INCLUDES=-I/xchip/gistic/Jeremiah/software/seqan-library-2.0.2/include -I /xchip/gistic/Jeremiah/GIT/SeqLib/src -I/xchip/gistic/Jeremiah/GIT/SeqLib/htslib -I/xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1 -I/xchip/gistic/Jeremiah/software/bamtools-2.4.0/include
INCLUDES=-I/xchip/gistic/Jeremiah/software/seqan-library-2.2.0/include -I /xchip/gistic/Jeremiah/GIT/SeqLib/src -I/xchip/gistic/Jeremiah/GIT/SeqLib/htslib -I/xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1 -I/xchip/gistic/Jeremiah/software/bamtools-2.4.0/include -I/xchip/gistic/Jeremiah/software/bzip2-1.0.6
INCLUDES=-I/xchip/gistic/Jeremiah/software/seqan-library-2.2.0/include -I /xchip/gistic/Jeremiah/GIT/SeqLib -I/xchip/gistic/Jeremiah/GIT/SeqLib/htslib -I/xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1 -I/xchip/gistic/Jeremiah/software/bamtools-2.4.0/include -I/xchip/gistic/Jeremiah/software/bzip2-1.0.6
LIBS=/xchip/gistic/Jeremiah/software/bamtools-2.4.0/lib/libbamtools.a /xchip/gistic/Jeremiah/GIT/SeqLib/src/libseqlib.a /xchip/gistic/Jeremiah/GIT/SeqLib/htslib/libhts.a /xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1/stage/lib/libboost_timer.a /xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1/stage/lib/libboost_chrono.a /xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1/stage/lib/libboost_system.a /xchip/gistic/Jeremiah/software/bzip2-1.0.6/libbz2.a
CFLAGS=-W -Wall -pedantic -std=c++14 -DSEQAN_HAS_ZLIB=1 -DSEQAN_HAS_BZIP2=1
CXXFLAGS=-W -Wall -pedantic -std=c++14 -DSEQAN_HAS_ZLIB=1 -DSEQAN_HAS_BZIP2=1 -O3 -DNDEBUG -DSEQAN_ENABLE_DEBUG=0 -DSEQAN_ENABLE_TESTING=0
binaries=benchmark
all: benchmark.o
g++ benchmark.o -g -o benchmark $(LIBS) -lrt -lpthread -lz -lm
g++ benchmark.o -o benchmark $(LIBS) -lrt -lpthread -lz -lm
benchmark.o: benchmark.cpp
g++ -g -c benchmark.cpp $(INCLUDES) $(CFLAGS)
g++ -c benchmark.cpp $(INCLUDES) $(CXXFLAGS)
.PHONY: clean
......
#define USE_BOOST
#define JUMPING_TEST 1
//#define READ_TEST 1
//#define JUMPING_TEST 1
#define READ_TEST 1
#include "SeqLib/SeqLibUtils.h"
......@@ -14,6 +14,7 @@
//#define RUN_SEQAN 1
//#define RUN_BAMTOOLS 1
#define RUN_SEQLIB 1
//#define RUN_HTSLIB 1
#ifdef RUN_SEQAN
#include <seqan/bam_io.h>
......@@ -26,7 +27,18 @@ using namespace seqan;
#include "SeqLib/BamWriter.h"
#endif
#define BAMTOOLS_GET_CORE 1
#ifdef RUN_HTSLIB
#include <iostream>
extern "C" {
#include "htslib/htslib/hts.h"
#include "htslib/htslib/sam.h"
#include "htslib/htslib/bgzf.h"
#include "htslib/htslib/kstring.h"
#include "htslib/htslib/faidx.h"
}
#endif
//#define BAMTOOLS_GET_CORE 1
#ifdef RUN_BAMTOOLS
#include "api/BamReader.h"
......@@ -37,7 +49,9 @@ int main()
const size_t limit = 5000000;
const size_t print_limit = 1000000;
#ifdef JUMPING_TEST
const size_t jump_limit = 1000;
#endif
size_t count = 0;
//std::string bam = "/xchip/gistic/Jeremiah/GIT/SeqLib/seq_test/test_data/small.bam";
......@@ -99,13 +113,18 @@ int main()
SeqLib::BamRecord rec;
SeqLib::BamRecordVector bav;
std::string dummy;
std::stringstream ss;
#ifdef READ_TEST
std::vector<std::string> sq;
while(r.GetNextRecord(rec) && count++ < limit) {
if (count % print_limit == 0)
std::cerr << "...at read " << SeqLib::AddCommas(count) << std::endl;
bav.push_back(rec);
//dummy = rec.Sequence();
//sq.push_back(rec.Sequence());
//sq.push_back(rec.Qname());
//sq.push_back(rec.CigarString());
}
#endif
......@@ -162,6 +181,7 @@ int main()
}
bool hasAlignments = false;
long l = 0;
for (int i = 0; i < jump_limit; ++i) {
int chr = rand() % 22;
int pos = rand() % 1000000 + 1000000;
......@@ -171,7 +191,8 @@ int main()
}
if (hasAlignments) {
readRecord(record, bamFileIn);
bav.push_back(record);
l += getAlignmentLengthInRef(record);
//bav.push_back(record);
} else {
std::cerr << "no alignments here " << std::endl;
}
......@@ -200,6 +221,47 @@ int main()
}
#endif
#endif
#ifdef RUN_HTSLIB
std::cerr << " **** RUNNING HTSLIB **** " << std::endl;
bam1_t* b = bam_init1();
htsFile* in = hts_open(bam.c_str(), "r");
bam_hdr_t* header;
if (in == NULL)
return -1;
if (b == NULL)
return -1;
header = sam_hdr_read(in);
int i = 0;
std::vector<bam1_t*> bav;
while (sam_read1(in, header, b) >= 0 && count++ < limit) {
if (count % print_limit == 0)
std::cerr << "...at read " << SeqLib::AddCommas(count) << std::endl;
bav.push_back(b);
/*
int i;
const bam1_core_t* c = &b->core;
uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
fwrite(bam1_qname(b), c->l_qname - 1, sizeof(char), stdout);
fputc('\t', stdout);
for (i = 0; i < c->l_qseq; ++i)
fputc(bam_nt16_rev_table[bam1_seqi(s, i)], stdout);
fputc('\t', stdout);
if (t[0] == 0xff) {
fputs("*", stdout);
}
else {
for (i = 0; i < c->l_qseq; ++i)
fputc(t[i] + 33, stdout);
}
fputc('\n', stdout);
*/
}
printf("%d", i);
bam_hdr_destroy(header);
hts_close(in);
#endif
std::cerr << " Copied " << bav.size() << " records " << std::endl;
......
......@@ -703,7 +703,6 @@ infodir
docdir
oldincludedir
includedir
runstatedir
localstatedir
sharedstatedir
sysconfdir
......@@ -781,7 +780,6 @@ datadir='${datarootdir}'
sysconfdir='${prefix}/etc'
sharedstatedir='${prefix}/com'
localstatedir='${prefix}/var'
runstatedir='${localstatedir}/run'
includedir='${prefix}/include'
oldincludedir='/usr/include'
docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
......@@ -1034,15 +1032,6 @@ do
| -silent | --silent | --silen | --sile | --sil)
silent=yes ;;
-runstatedir | --runstatedir | --runstatedi | --runstated \
| --runstate | --runstat | --runsta | --runst | --runs \
| --run | --ru | --r)
ac_prev=runstatedir ;;
-runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
| --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
| --run=* | --ru=* | --r=*)
runstatedir=$ac_optarg ;;
-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
ac_prev=sbindir ;;
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
......@@ -1180,7 +1169,7 @@ fi
for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
libdir localedir mandir runstatedir
libdir localedir mandir
do
eval ac_val=\$$ac_var
# Remove trailing slashes.
......@@ -1333,7 +1322,6 @@ Fine tuning of the installation directories:
--sysconfdir=DIR read-only single-machine data [PREFIX/etc]
--sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
--localstatedir=DIR modifiable single-machine data [PREFIX/var]
--runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
--libdir=DIR object code libraries [EPREFIX/lib]
--includedir=DIR C header files [PREFIX/include]
--oldincludedir=DIR C header files for non-gcc [/usr/include]
......
......@@ -12,7 +12,7 @@ seq_test_LDADD = \
../htslib/libhts.a \
-lboost_unit_test_framework -lboost_system -lboost_timer -lboost_chrono
seq_test_LDFLAGS = --coverage
seq_test_LDFLAGS = --coverage -pthread
seq_test_SOURCES = seq_test.cpp \
../src/BFC.cpp ../src/GenomicRegion.cpp \
......
This diff is collapsed.
......@@ -48,6 +48,9 @@
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
......
This diff is collapsed.
......@@ -37,14 +37,15 @@ BOOST_AUTO_TEST_CASE( read_gzbed ) {
br.Open("test_data/small.bam");
SeqLib::GRC g(GZBED, br.Header());
BOOST_CHECK_EQUAL(g.size(), 3);
BOOST_CHECK_EQUAL(g[2].chr, 22);
BOOST_CHECK_EQUAL(g[2].chr, 21);
SeqLib::GRC v(GZVCF, br.Header());
BOOST_CHECK_EQUAL(v.size(), 31);
BOOST_CHECK_EQUAL(v.size(), 57);
BOOST_CHECK_EQUAL(v[29].chr, 22);
BOOST_CHECK_EQUAL(v[29].chr, 0);
}
BOOST_AUTO_TEST_CASE ( bfc ) {
......@@ -80,29 +81,29 @@ BOOST_AUTO_TEST_CASE ( bfc ) {
b.Train();
b.clear();
b.ErrorCorrectToTag(brv2, "KC");
//b.ErrorCorrectToTag(brv2, "KC");
UnalignedSequenceVector v;
b.GetSequences(v);
//UnalignedSequenceVector v;
//b.GetSequences(v);
// write to corrected
for (auto& i : v) {
corr << ">" << i.Name << std::endl << i.Seq << std::endl;
}
orig.close();
corr.close();
//for (auto& i : v) {
// corr << ">" << i.Name << std::endl << i.Seq << std::endl;
//}
//orig.close();
//corr.close();
//
v.clear();
b.FilterUnique();
b.GetSequences(v);
//v.clear();
//b.FilterUnique();
//b.GetSequences(v);
// do everything at once
b.TrainAndCorrect(brv2);
//b.TrainAndCorrect(brv2);
// do everything in place
b.TrainCorrection(brv2);
b.ErrorCorrectInPlace(brv2);
//b.TrainCorrection(brv2);
//b.ErrorCorrectInPlace(brv2);
}
BOOST_AUTO_TEST_CASE( correct_and_assemble ) {
......@@ -116,30 +117,26 @@ BOOST_AUTO_TEST_CASE( correct_and_assemble ) {
BamRecordVector brv, brv2;
size_t count = 0;
while(br.GetNextRecord(rec) && count++ < 10000)
brv.push_back(rec);
b.TrainAndCorrect(brv);
b.AddSequence(rec.Sequence().c_str(), rec.Qualities().c_str(), rec.Qname().c_str());
b.Train();
b.ErrorCorrect();
float kcov = b.GetKCov();
int kmer = b.GetKMer();
UnalignedSequenceVector v;
b.GetSequences(v);
std::ofstream corr("corr.fa");
for (auto& i : v)
corr << ">" << i.Name << std::endl << i.Seq << std::endl;
corr.close();
v.clear();
b.FilterUnique();
b.GetSequences(v);
std::string seq, name;
while (b.GetSequence(seq, name))
v.push_back({name, seq});
std::ofstream filt("filt.fa");
for (auto& i : v) {
filt << ">" << i.Name << std::endl << i.Seq << std::endl;
}
filt.close();
//std::ofstream filt("filt.fa");
//for (auto& i : v) {
// filt << ">" << i.Name << std::endl << i.Seq << std::endl;
//}
//filt.close();
FermiAssembler f;
f.AddReads(v);
......@@ -367,11 +364,10 @@ BOOST_AUTO_TEST_CASE( read_filter_1 ) {
assert(false);
}
// test nm rule
if (!(rec.GetIntTag("NM") != 1)) {
std::cerr << rec.GetIntTag("NM") << std::endl;
int32_t nm;
rec.GetIntTag("NM", nm);
if (nm == 1)
assert(false);
}
}
}
......@@ -391,8 +387,6 @@ BOOST_AUTO_TEST_CASE ( fermi_add_reads ) {
f.CorrectReads();
f.PerformAssembly();
std::vector<std::string> out = f.GetContigs();
}
......@@ -433,17 +427,17 @@ BOOST_AUTO_TEST_CASE( bam_record ) {
size_t count = 0;
br.GetNextRecord(r);
BOOST_CHECK_EQUAL(r.AsGenomicRegion().chr, 22);