Skip to content
Commits on Source (4)
......@@ -18,3 +18,6 @@ CTestTestfile.cmake
debug/
release/
.vscode/
ext/htslib/src/htslib-stamp
ext/htslib/tmp/
.snakemake
......@@ -5,11 +5,11 @@ RNA-Seq data, or more generally of target sequences using high-throughput
sequencing reads. It is based on the novel idea of _pseudoalignment_ for
rapidly determining the compatibility of reads with targets, without the need
for alignment. On benchmarks with standard RNA-Seq data, __kallisto__ can
quantify 30 million human reads in less than 3 minutes on a Mac desktop
quantify 30 million human bulk RNA-seq reads in less than 3 minutes on a Mac desktop
computer using only the read sequences and a transcriptome index that
itself takes less than 10 minutes to build. Pseudoalignment of reads
itself takes than 10 minutes to build. Pseudoalignment of reads
preserves the key information needed for quantification, and __kallisto__
is therefore not only fast, but also as accurate than existing
is therefore not only fast, but also comparably accurate to other existing
quantification tools. In fact, because the pseudoalignment procedure is
robust to errors in the reads, in many benchmarks __kallisto__
significantly outperforms existing tools. The __kallisto__ algorithms are described in more detail in:
......@@ -18,7 +18,9 @@ NL Bray, H Pimentel, P Melsted and L Pachter, [Near optimal probabilistic RNA-se
Scripts reproducing all the results of the paper are available [here](https://github.com/pachterlab/kallisto_paper_analysis).
__kallisto__ quantified RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/).
__kallisto__ quantified bulk RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/).
__kallisto__ can be used together with [__bustools__](https://bustools.github.io/) to pre-process single-cell RNA-seq data. See the [kallistobus.tools](https://www.kallistobus.tools/) website for instructions.
## Manual
......@@ -26,24 +28,15 @@ Please visit http://pachterlab.github.io/kallisto/manual.html for the manual.
## License
Please read the license before using kallisto. The license is distributed with __kallisto__ in the file `license.txt` also viewable [here](https://pachterlab.github.io/kallisto/download).
## Announcements
There is a low traffic Google Group,
[kallisto-sleuth-announcements](https://groups.google.com/d/forum/kallisto-sleuth-announcements)
where we make announcements about new releases. This is a read-only mailing
list.
__kallisto__ is distributed under the BSD-2 license. The license is distributed with __kallisto__ in the file `license.txt`, which is also viewable [here](https://pachterlab.github.io/kallisto/download). Please read the license before using __kallisto__.
## Getting help
For help running __kallisto__, please post to the [kallisto-sleuth-users
Google Group](https://groups.google.com/d/forum/kallisto-sleuth-users).
For help running __kallisto__, please post to the [kallisto-and-applications Google Group](https://groups.google.com/forum/#!forum/kallisto-and-applications).
## Reporting bugs
Please report bugs to the [Github issues
page](https://github.com/pachterlab/kallisto/issues)
Please report bugs to the [Github issues page](https://github.com/pachterlab/kallisto/issues)
## Development and pull requests
......
kallisto (0.46.1+dfsg-1) unstable; urgency=medium
* Team upload.
* New upstream version
-- Steffen Moeller <moeller@debian.org> Fri, 08 Nov 2019 23:14:36 +0100
kallisto (0.46.0+dfsg-1) unstable; urgency=medium
* New upstream version
......
......@@ -7,7 +7,7 @@ Build-Depends: debhelper-compat (= 12),
cmake,
libhdf5-dev,
libhts-dev
Standards-Version: 4.4.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/kallisto
Vcs-Git: https://salsa.debian.org/med-team/kallisto.git
Homepage: https://pachterlab.github.io/kallisto
......
......@@ -565,7 +565,7 @@ void MasterProcessor::processAln(const EMAlgorithm& em, bool useEM = true) {
assert(opt.pseudobam);
pseudobatchf_in.open(opt.output + "/pseudoaln.bin", std::ios::in | std::ios::binary);
SR.reset();
SR->reset();
std::vector<std::thread> workers;
for (int i = 0; i < opt.threads; i++) {
......@@ -893,6 +893,7 @@ ReadProcessor::ReadProcessor(ReadProcessor && o) :
seqs(std::move(o.seqs)),
names(std::move(o.names)),
quals(std::move(o.quals)),
flags(std::move(o.flags)),
umis(std::move(o.umis)),
newEcs(std::move(o.newEcs)),
flens(std::move(o.flens)),
......@@ -919,16 +920,16 @@ void ReadProcessor::operator()() {
if (batchSR.empty()) {
return;
} else {
batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam );
batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam );
}
} else {
std::lock_guard<std::mutex> lock(mp.reader_lock);
if (mp.SR.empty()) {
if (mp.SR->empty()) {
// nothing to do
return;
} else {
// get new sequences
mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
}
// release the reader lock
}
......@@ -1222,7 +1223,7 @@ void ReadProcessor::clear() {
BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp, int _id, int _local_id) :
paired(!opt.single_end), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
paired(!opt.single_end), bam(opt.bam), num(opt.num), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
// initialize buffer
bufsize = mp.bufsize;
buffer = new char[bufsize];
......@@ -1238,6 +1239,8 @@ BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, co
BUSProcessor::BUSProcessor(BUSProcessor && o) :
paired(o.paired),
bam(o.bam),
num(o.num),
tc(o.tc),
index(o.index),
mp(o.mp),
......@@ -1248,6 +1251,7 @@ BUSProcessor::BUSProcessor(BUSProcessor && o) :
seqs(std::move(o.seqs)),
names(std::move(o.names)),
quals(std::move(o.quals)),
flags(std::move(o.flags)),
newEcs(std::move(o.newEcs)),
flens(std::move(o.flens)),
bias5(std::move(o.bias5)),
......@@ -1273,13 +1277,13 @@ void BUSProcessor::operator()() {
// grab the reader lock
{
std::lock_guard<std::mutex> lock(mp.reader_lock);
if (mp.SR.empty()) {
if (mp.SR->empty()) {
// nothing to do
return;
} else {
// get new sequences
std::vector<std::string> umis;
mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, false);
mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, false);
}
// release the reader lock
}
......@@ -1326,9 +1330,17 @@ void BUSProcessor::processBuffer() {
// actually process the sequences
const BUSOptions& busopt = mp.opt.busOptions;
int incf = busopt.nfiles-1;
int incf, jmax;
if (bam) {
incf = 1;
jmax = 2;
} else {
incf = busopt.nfiles-1;
jmax = busopt.nfiles;
}
//int incf = (bam) ? 1 : busopt.nfiles-1;
for (int i = 0; i + incf < seqs.size(); i++) {
for (int j = 0; j < busopt.nfiles; j++) {
for (int j = 0; j < jmax /*(bam) ? 2 : busopt.nfiles*/; j++) {
s[j] = seqs[i+j].first;
l[j] = seqs[i+j].second;
}
......@@ -1350,7 +1362,7 @@ void BUSProcessor::processBuffer() {
}
auto &bcc = busopt.bc[0];
// auto &bcc = busopt.bc[0];
int blen = 0;
bool bad_bc = false;
for (auto &bcc : busopt.bc) {
......@@ -1401,6 +1413,9 @@ void BUSProcessor::processBuffer() {
b.flags |= (f) << 8;
b.count = 1;
//std::cout << std::string(s1,10) << "\t" << b.barcode << "\t" << std::string(s1+10,16) << "\t" << b.UMI << "\n";
if (num) {
b.flags = (uint32_t) flags[i / jmax];
}
//ec = tc.findEC(u);
......@@ -1439,7 +1454,6 @@ AlnProcessor::AlnProcessor(const KmerIndex& index, const ProgramOptions& opt, Ma
bambufsize = 1<<20;
bambuffer = new char[bambufsize]; // refactor this?
if (opt.batch_mode) {
/* need to check this later */
assert(id != -1);
......@@ -1475,6 +1489,7 @@ AlnProcessor::AlnProcessor(AlnProcessor && o) :
seqs(std::move(o.seqs)),
names(std::move(o.names)),
quals(std::move(o.quals)),
flags(std::move(o.flags)),
umis(std::move(o.umis)),
batchSR(std::move(o.batchSR)) {
buffer = o.buffer;
......@@ -1515,19 +1530,19 @@ void AlnProcessor::operator()() {
if (batchSR.empty()) {
return;
} else {
batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true );
batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true );
readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
assert(pseudobatch.batch_id == readbatch_id);
assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
}
} else {
std::lock_guard<std::mutex> lock(mp.reader_lock);
if (mp.SR.empty()) {
if (mp.SR->empty()) {
// nothing to do
return;
} else {
// get new sequences
mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true);
mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true);
readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
assert(pseudobatch.batch_id == readbatch_id);
assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
......@@ -2557,9 +2572,14 @@ int fillBamRecord(bam1_t &b, uint8_t* buf, const char *seq, const char *name, co
}
/** -- sequence reader -- **/
/** -- sequence readers -- **/
SequenceReader::~SequenceReader() {
void SequenceReader::reset() {
state = false;
readbatch_id = -1;
}
FastqSequenceReader::~FastqSequenceReader() {
for (auto &f : fp) {
if (f) {
gzclose(f);
......@@ -2578,14 +2598,13 @@ SequenceReader::~SequenceReader() {
}
void SequenceReader::reserveNfiles(int n) {
fp.resize(nfiles);
seq.resize(nfiles, nullptr);
l.resize(nfiles, 0);
nl.resize(nfiles, 0);
bool FastqSequenceReader::empty() {
return (!state && current_file >= files.size());
}
void SequenceReader::reset() {
void FastqSequenceReader::reset() {
SequenceReader::reset();
for (auto &f : fp) {
if (f) {
gzclose(f);
......@@ -2593,11 +2612,6 @@ void SequenceReader::reset() {
f = nullptr;
}
for (auto &s : seq) {
kseq_destroy(s);
s = nullptr;
}
if (f_umi && f_umi->is_open()) {
f_umi->close();
}
......@@ -2612,15 +2626,24 @@ void SequenceReader::reset() {
}
current_file = 0;
state = false;
readbatch_id = -1;
for (auto &s : seq) {
kseq_destroy(s);
s = nullptr;
}
}
void FastqSequenceReader::reserveNfiles(int n) {
fp.resize(nfiles);
l.resize(nfiles, 0);
nl.resize(nfiles, 0);
seq.resize(nfiles, nullptr);
}
// returns true if there is more left to read from the files
bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
std::vector<std::pair<const char *, int> > &names,
std::vector<std::pair<const char *, int> > &quals,
std::vector<uint32_t>& flags,
std::vector<std::string> &umis, int& read_id,
bool full) {
......@@ -2634,6 +2657,7 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
names.clear();
quals.clear();
}
flags.clear();
bool usingUMIfiles = !umi_files.empty();
int umis_read = 0;
......@@ -2678,7 +2702,7 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
bool all_l = true;
int bufadd = nfiles;
for (int i = 0; i < nfiles; i++) {
all_l = all_l && l[i] > 0;
all_l = all_l && l[i] >= 0;
bufadd += l[i]; // includes seq
}
if (all_l) {
......@@ -2718,6 +2742,8 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
ss >> umi;
umis.emplace_back(std::move(umi));
}
flags.push_back(numreads++);
} else {
return true; // read it next time
}
......@@ -2732,27 +2758,146 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
}
}
bool SequenceReader::empty() {
return (!state && current_file >= files.size());
}
// move constructor
SequenceReader::SequenceReader(SequenceReader&& o) :
#if 1
FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) :
nfiles(o.nfiles),
numreads(o.numreads),
fp(std::move(o.fp)),
seq(std::move(o.seq)),
l(std::move(o.l)),
nl(std::move(o.nl)),
paired(o.paired),
nfiles(o.nfiles),
files(std::move(o.files)),
umi_files(std::move(o.umi_files)),
f_umi(std::move(o.f_umi)),
current_file(o.current_file),
state(o.state) {
current_file(o.current_file) {
o.fp.resize(nfiles);
o.seq.resize(nfiles, nullptr);
o.l.resize(nfiles, 0);
o.nl.resize(nfiles, 0);
o.state = false;
}
#endif
#if 0
FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) {
fp = (std::move(o.fp));
l = (std::move(o.l));
nl = (std::move(o.nl));
paired = (o.paired);
nfiles = (o.nfiles);
files = (std::move(o.files)),
umi_files = (std::move(o.umi_files));
f_umi = (std::move(o.f_umi));
current_file= (o.current_file);
seq = (std::move(o.seq));
o.fp.resize(nfiles);
o.l.resize(nfiles, 0);
o.nl.resize(nfiles, 0);
o.state = false;
o.seq.resize(nfiles, nullptr);
}
#endif
const std::string BamSequenceReader::seq_enc = "=ACMGRSVTWYHKDBN";
BamSequenceReader::~BamSequenceReader() {
if (fp) {
bgzf_close(fp);
}
if (head) {
bam_hdr_destroy(head);
}
if (rec) {
bam_destroy1(rec);
}
}
bool BamSequenceReader::empty() {
return !state;
}
void BamSequenceReader::reset() {
SequenceReader::reset();
}
void BamSequenceReader::reserveNfiles(int n) {
}
// returns true if there is more left to read from the files
bool BamSequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
std::vector<std::pair<const char *, int> > &names,
std::vector<std::pair<const char *, int> > &quals,
std::vector<uint32_t>& flags,
std::vector<std::string> &umis, int& read_id,
bool full) {
readbatch_id += 1; // increase the batch id
read_id = readbatch_id; // copy now because we are inside a lock
seqs.clear();
flags.clear();
int bufpos = 0;
while (true) {
if (!state) { // should we open a file
return false;
}
// the file is open and we have read into seq1 and seq2
if (err >= 0) {
if (!(rec->core.flag & BAM_FSECONDARY)) { // record is primary alignment
int bufadd = l_seq + l_bc + l_umi + 2;
// fits into the buffer
if (bufpos+bufadd < limit) {
char *pi;
pi = buf + bufpos;
memcpy(pi, bc, l_bc);
memcpy(pi + l_bc, umi, l_umi + 1);
seqs.emplace_back(pi, l_bc + l_umi);
bufpos += l_bc + l_umi + 1;
pi = buf + bufpos;
int len = (l_seq + 1) / 2;
if (l_seq % 2) --len;
int j = 0;
for (int i = 0; i < len; ++i, ++eseq) {
buf[bufpos++] = seq_enc[*eseq >> 4];
buf[bufpos++] = seq_enc[*eseq & 0x0F];
}
if (l_seq % 2) {
buf[bufpos++] = seq_enc[*eseq >> 4];
}
buf[bufpos++] = '\0';
seqs.emplace_back(pi, l_seq);
} else {
return true; // read it next time
}
}
// read for the next one
err = bam_read1(fp, rec);
if (err < 0) {
return true;
}
eseq = bam_get_seq(rec);
l_seq = rec->core.l_qseq;
bc = bam_aux2Z(bam_aux_get(rec, "CR"));
l_bc = 0;
for (char *pbc = bc; *pbc != '\0'; ++pbc) {
++l_bc;
}
umi = bam_aux2Z(bam_aux_get(rec, "UR"));
l_umi = 0;
for (char *pumi = umi; *pumi != '\0'; ++pumi) {
++l_umi;
}
} else {
state = false; // haven't opened file yet
}
}
}
......@@ -23,6 +23,7 @@
#include "GeneModel.h"
#include "BUSData.h"
#include "BUSTools.h"
#include <htslib/sam.h>
#ifndef KSEQ_INIT_READY
......@@ -41,9 +42,35 @@ class SequenceReader {
public:
SequenceReader(const ProgramOptions& opt) :
paired(!opt.single_end), files(opt.files),
f_umi(new std::ifstream{}),
current_file(0), state(false), readbatch_id(-1) {
readbatch_id(-1) {};
SequenceReader() : state(false), readbatch_id(-1) {};
virtual ~SequenceReader() {}
// SequenceReader(SequenceReader&& o);
virtual bool empty() = 0;
virtual void reset();
virtual void reserveNfiles(int n) = 0;
virtual bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
std::vector<std::pair<const char*, int>>& names,
std::vector<std::pair<const char*, int>>& quals,
std::vector<uint32_t>& flags,
std::vector<std::string>& umis, int &readbatch_id,
bool full=false) = 0;
public:
bool state; // is the file open
int readbatch_id = -1;
};
class FastqSequenceReader : public SequenceReader {
public:
FastqSequenceReader(const ProgramOptions& opt) : SequenceReader(opt),
current_file(0), paired(!opt.single_end), files(opt.files),
f_umi(new std::ifstream{}) {
SequenceReader::state = false;
if (opt.bus_mode) {
nfiles = opt.busOptions.nfiles;
} else {
......@@ -51,46 +78,106 @@ public:
}
reserveNfiles(nfiles);
}
SequenceReader() :
FastqSequenceReader() : SequenceReader(),
paired(false),
f_umi(new std::ifstream{}),
current_file(0), state(false), readbatch_id(-1) {}
SequenceReader(SequenceReader&& o);
current_file(0) {};
FastqSequenceReader(FastqSequenceReader &&o);
~FastqSequenceReader();
bool empty();
void reset();
void reserveNfiles(int n);
~SequenceReader();
bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
std::vector<std::pair<const char*, int>>& names,
std::vector<std::pair<const char*, int>>& quals,
std::vector<uint32_t>& flags,
std::vector<std::string>& umis, int &readbatch_id,
bool full=false);
public:
int nfiles = 1;
uint32_t numreads = 0;
std::vector<gzFile> fp;
//gzFile fp1 = 0, fp2 = 0;
std::vector<kseq_t*> seq;
std::vector<int> l;
std::vector<int> nl;
//kseq_t *seq1 = 0, *seq2 = 0;
//int l1,l2,nl1,nl2;
bool paired;
std::vector<std::string> files;
std::vector<std::string> umi_files;
std::unique_ptr<std::ifstream> f_umi;
int current_file;
bool state; // is the file open
int readbatch_id = -1;
std::vector<kseq_t*> seq;
};
class BamSequenceReader : public SequenceReader {
public:
BamSequenceReader(const ProgramOptions& opt) :
SequenceReader(opt) {
SequenceReader::state = true;
fp = bgzf_open(opt.files[0].c_str(), "rb");
head = bam_hdr_read(fp);
rec = bam_init1();
err = bam_read1(fp, rec);
eseq = bam_get_seq(rec);
l_seq = rec->core.l_qseq;
bc = bam_aux2Z(bam_aux_get(rec, "CR"));
l_bc = 0;
for (char *pbc = bc; *pbc != '\0'; ++pbc) {
++l_bc;
}
umi = bam_aux2Z(bam_aux_get(rec, "UR"));
l_umi = 0;
for (char *pumi = umi; *pumi != '\0'; ++pumi) {
++l_umi;
}
}
BamSequenceReader() : SequenceReader() {};
BamSequenceReader(BamSequenceReader &&o);
~BamSequenceReader();
bool empty();
void reset();
void reserveNfiles(int n);
bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
std::vector<std::pair<const char*, int>>& names,
std::vector<std::pair<const char*, int>>& quals,
std::vector<uint32_t>& flags,
std::vector<std::string>& umis, int &readbatch_id,
bool full=false);
public:
BGZF *fp;
bam_hdr_t *head;
bam1_t *rec;
uint8_t *eseq;
int32_t l_seq;
char *bc;
int l_bc;
char *umi;
int l_umi;
int err;
char seq[256]; // Is there a better limit?
private:
static const std::string seq_enc;
};
class MasterProcessor {
public:
MasterProcessor (KmerIndex &index, const ProgramOptions& opt, MinCollector &tc, const Transcriptome& model)
: tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), SR(opt), numreads(0)
: tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), numreads(0)
,nummapped(0), num_umi(0), bufsize(1ULL<<23), tlencount(0), biasCount(0), maxBiasCount((opt.bias) ? 1000000 : 0), last_pseudobatch_id (-1) {
if (opt.bam) {
SR = new BamSequenceReader(opt);
} else {
SR = new FastqSequenceReader(opt);
}
if (opt.batch_mode) {
memset(&bus_bc_len[0],0,33);
memset(&bus_umi_len[0],0,33);
......@@ -136,13 +223,14 @@ public:
bam_hdr_destroy(bamh);
bamh = nullptr;
}
delete SR;
}
std::mutex reader_lock;
std::mutex writer_lock;
SequenceReader SR;
SequenceReader *SR;
MinCollector& tc;
KmerIndex& index;
const Transcriptome& model;
......@@ -205,7 +293,7 @@ public:
std::vector<std::pair<std::vector<int>, std::string>> new_ec_umi;
const KmerIndex& index;
MasterProcessor& mp;
SequenceReader batchSR;
FastqSequenceReader batchSR;
int64_t numreads;
int id;
int local_id;
......@@ -215,6 +303,7 @@ public:
std::vector<std::pair<const char*, int>> seqs;
std::vector<std::pair<const char*, int>> names;
std::vector<std::pair<const char*, int>> quals;
std::vector<uint32_t> flags;
std::vector<std::string> umis;
std::vector<std::vector<int>> newEcs;
std::vector<int> flens;
......@@ -237,6 +326,8 @@ public:
size_t bufsize;
bool paired;
bool bam;
bool num;
const MinCollector& tc;
const KmerIndex& index;
MasterProcessor& mp;
......@@ -251,6 +342,7 @@ public:
std::vector<std::pair<const char*, int>> seqs;
std::vector<std::pair<const char*, int>> names;
std::vector<std::pair<const char*, int>> quals;
std::vector<uint32_t> flags;
std::vector<std::vector<int>> newEcs;
std::vector<int> flens;
......@@ -279,7 +371,7 @@ public:
const KmerIndex& index;
const EMAlgorithm& em;
MasterProcessor& mp;
SequenceReader batchSR;
FastqSequenceReader batchSR;
int64_t numreads;
int id;
PseudoAlignmentBatch pseudobatch;
......@@ -291,6 +383,7 @@ public:
std::vector<std::pair<const char*, int>> seqs;
std::vector<std::pair<const char*, int>> names;
std::vector<std::pair<const char*, int>> quals;
std::vector<uint32_t> flags;
std::vector<std::string> umis;
void operator()();
......
#ifndef KALLISTO_COMMON_H
#define KALLISTO_COMMON_H
#define KALLISTO_VERSION "0.46.0"
#define KALLISTO_VERSION "0.46.1"
#include <string>
#include <vector>
#include <iostream>
#ifdef _WIN64
typedef unsigned int uint;
......@@ -67,6 +67,8 @@ struct ProgramOptions {
bool bus_mode;
BUSOptions busOptions;
bool pseudo_quant;
bool bam;
bool num;
std::string batch_file_name;
std::vector<std::vector<std::string>> batch_files;
std::vector<std::string> batch_ids;
......@@ -107,6 +109,8 @@ ProgramOptions() :
batch_mode(false),
bus_mode(false),
pseudo_quant(false),
bam(false),
num(false),
plaintext(false),
write_index(false),
single_end(false),
......
......@@ -533,20 +533,24 @@ void ListSingleCellTechnologies() {
<< "CELSeq CEL-Seq" << endl
<< "CELSeq2 CEL-Seq version 2" << endl
<< "DropSeq DropSeq" << endl
<< "inDrops inDrops" << endl
<< "inDropsv1 inDrops version 1 chemistry" << endl
<< "inDropsv2 inDrops version 2 chemistry" << endl
<< "inDropsv3 inDrops version 3 chemistry" << endl
<< "SCRBSeq SCRB-Seq" << endl
<< "SureCell SureCell for ddSEQ" << endl
<< endl;
}
void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
const char *opt_string = "i:o:x:t:l";
const char *opt_string = "i:o:x:t:lbn";
static struct option long_options[] = {
{"index", required_argument, 0, 'i'},
{"output-dir", required_argument, 0, 'o'},
{"technology", required_argument, 0, 'x'},
{"list", no_argument, 0, 'l'},
{"threads", required_argument, 0, 't'},
{"bam", no_argument, 0, 'b'},
{"num", no_argument, 0, 'n'},
{0,0,0,0}
};
......@@ -584,6 +588,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
stringstream(optarg) >> opt.threads;
break;
}
case 'b': {
opt.bam = true;
break;
}
case 'n': {
opt.num = true;
break;
}
default: break;
}
}
......@@ -813,6 +825,82 @@ bool CheckOptionsBus(ProgramOptions& opt) {
} else {
auto& busopt = opt.busOptions;
if (opt.bam) { // Note: only 10xV2 has been tested
busopt.nfiles = 1;
if (opt.technology == "10XV2") {
busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
} else if (opt.technology == "10XV3") {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,16,28);
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
// } else if (opt.technology == "10XV1") {
} else if (opt.technology == "SURECELL") {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,18,26);
busopt.bc.push_back(BUSOptionSubstr(0,0,18));
} else if (opt.technology == "DROPSEQ") {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,12,20);
busopt.bc.push_back(BUSOptionSubstr(0,0,12));
} else if (opt.technology == "INDROPSV1") {
busopt.nfiles = 2;
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,42,48);
busopt.bc.push_back(BUSOptionSubstr(0,0,11));
busopt.bc.push_back(BUSOptionSubstr(0,30,38));
} else if (opt.technology == "INDROPSV2") {
busopt.nfiles = 2;
busopt.seq = BUSOptionSubstr(0,0,0);
busopt.umi = BUSOptionSubstr(1,42,48);
busopt.bc.push_back(BUSOptionSubstr(1,0,11));
busopt.bc.push_back(BUSOptionSubstr(1,30,38));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
busopt.seq = BUSOptionSubstr(2,0,0);
busopt.umi = BUSOptionSubstr(1,8,14);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else if (opt.technology == "CELSEQ") {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,8,12);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
} else if (opt.technology == "CELSEQ2") {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,0,6);
busopt.bc.push_back(BUSOptionSubstr(0,6,12));
} else if (opt.technology == "SCRBSEQ") {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,6,16);
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
} else if (opt.technology == "INDROPSV3") {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,0,6);
busopt.bc.push_back(BUSOptionSubstr(0,6,16));
} else {
vector<int> files;
vector<BUSOptionSubstr> values;
vector<BUSOptionSubstr> bcValues;
vector<std::string> errorList;
bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
if(!invalid) {
busopt.nfiles = files.size();
for(int i = 0; i < bcValues.size(); i++) {
busopt.bc.push_back(bcValues[i]);
}
busopt.umi = values[0];
busopt.seq = values[1];
} else {
for(int j = 0; j < errorList.size(); j++) {
cerr << errorList[j] << endl;
}
cerr << "Unable to create technology: " << opt.technology << endl;
ret = false;
}
}
} else {
if (opt.technology == "10XV2") {
busopt.nfiles = 2;
busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
......@@ -840,12 +928,24 @@ bool CheckOptionsBus(ProgramOptions& opt) {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,12,20);
busopt.bc.push_back(BUSOptionSubstr(0,0,12));
} else if (opt.technology == "INDROPS") {
} else if (opt.technology == "INDROPSV1") {
busopt.nfiles = 2;
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,42,48);
busopt.bc.push_back(BUSOptionSubstr(0,0,11));
busopt.bc.push_back(BUSOptionSubstr(0,30,38));
} else if (opt.technology == "INDROPSV2") {
busopt.nfiles = 2;
busopt.seq = BUSOptionSubstr(0,0,0);
busopt.umi = BUSOptionSubstr(1,42,48);
busopt.bc.push_back(BUSOptionSubstr(1,0,11));
busopt.bc.push_back(BUSOptionSubstr(1,30,38));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
busopt.seq = BUSOptionSubstr(2,0,0);
busopt.umi = BUSOptionSubstr(1,8,14);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else if (opt.technology == "CELSEQ") {
busopt.nfiles = 2;
busopt.seq = BUSOptionSubstr(1,0,0);
......@@ -861,6 +961,12 @@ bool CheckOptionsBus(ProgramOptions& opt) {
busopt.seq = BUSOptionSubstr(1,0,0);
busopt.umi = BUSOptionSubstr(0,6,16);
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
busopt.seq = BUSOptionSubstr(2,0,0);
busopt.umi = BUSOptionSubstr(1,8,14);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else {
vector<int> files;
vector<BUSOptionSubstr> values;
......@@ -883,13 +989,18 @@ bool CheckOptionsBus(ProgramOptions& opt) {
}
}
}
}
if (ret && opt.files.size() % opt.busOptions.nfiles != 0) {
if (ret && !opt.bam && opt.files.size() % opt.busOptions.nfiles != 0) {
cerr << "Error: Number of files (" << opt.files.size() << ") does not match number of input files required by "
<< "technology " << opt.technology << " (" << opt.busOptions.nfiles << ")" << endl;
ret = false;
}
if (opt.bam && opt.num) {
cerr << "Warning: --bam option was used, so --num option will be ignored" << endl;
}
return ret;
}
......@@ -1535,7 +1646,9 @@ void usageBus() {
<< "-x, --technology=STRING Single-cell technology used " << endl << endl
<< "Optional arguments:" << endl
<< "-l, --list List all single-cell technologies supported" << endl
<< "-t, --threads=INT Number of threads to use (default: 1)" << endl;
<< "-t, --threads=INT Number of threads to use (default: 1)" << endl
<< "-b, --bam Input file is a BAM file" << endl
<< "-n, --num Output number of read in flag column (incompatible with --bam)" << endl;
}
void usageIndex() {
......@@ -1808,6 +1921,7 @@ int main(int argc, char *argv[]) {
for (int i = 0; i < index.num_trans; i++) {
num_unique += collection.counts[i];
}
num_pseudoaligned = 0;
for (int i = 0; i < collection.counts.size(); i++) {
num_pseudoaligned += collection.counts[i];
}
......@@ -1953,6 +2067,10 @@ int main(int argc, char *argv[]) {
num_pseudoaligned += collection.counts[i];
}
if (num_pseudoaligned == 0) {
std::cerr << "[~warn] Warning, zero reads pseudoaligned check your input files and index" << std::endl;
}
plaintext_aux(
opt.output + "/run_info.json",
std::string(std::to_string(index.num_trans)),
......@@ -1968,7 +2086,17 @@ int main(int argc, char *argv[]) {
plaintext_writer(opt.output + "/abundance.tsv", em.target_names_,
em.alpha_, em.eff_lens_, index.target_lens_);
if (opt.bootstrap > 0) {
if (opt.bootstrap > 0 && num_pseudoaligned == 0) {
// this happens if nothing aligns, then we write an empty bootstrap file
for (int b = 0; b < opt.bootstrap; b++) {
if (!opt.plaintext) {
writer.write_bootstrap(em, b); // em is empty
} else {
plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
}
}
} else if (opt.bootstrap > 0 && num_pseudoaligned > 0) {
auto B = opt.bootstrap;
std::mt19937_64 rand;
rand.seed( opt.seed );
......@@ -2016,6 +2144,9 @@ int main(int argc, char *argv[]) {
cerr << endl;
if (num_pseudoaligned == 0) {
exit(1); // exit with error
}
}
} else if (cmd == "quant-only") {
if (argc==2) {
......@@ -2086,7 +2217,27 @@ int main(int argc, char *argv[]) {
em.alpha_, em.eff_lens_, index.target_lens_);
}
if (opt.bootstrap > 0) {
int64_t num_pseudoaligned =0;
for (int i = 0; i < collection.counts.size(); i++) {
num_pseudoaligned += collection.counts[i];
}
if (num_pseudoaligned == 0) {
std::cerr << "[~warn] Warning, zero reads pseudoaligned check your input files and index" << std::endl;
}
if (opt.bootstrap > 0 && num_pseudoaligned == 0) {
// this happens if nothing aligns, then we write an empty bootstrap file
for (int b = 0; b < opt.bootstrap; b++) {
if (!opt.plaintext) {
writer.write_bootstrap(em, b); // em is empty
} else {
plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
}
}
} else if (opt.bootstrap > 0 && num_pseudoaligned > 0) {
std::cerr << "Bootstrapping!" << std::endl;
auto B = opt.bootstrap;
std::mt19937_64 rand;
......@@ -2125,6 +2276,10 @@ int main(int argc, char *argv[]) {
}
}
cerr << endl;
if (num_pseudoaligned == 0) {
exit(1);
}
}
} else if (cmd == "pseudo") {
if (argc==2) {
......