Skip to content
Commits on Source (6)
......@@ -3,3 +3,7 @@ docs.json
__dummy.html
*.o
*.obj
dub.selections.json
bin/
old_biod/
build/
# Simple Makefile
D_COMPILER=ldc2
DFLAGS = -wi -g -relocation-model=pic -unittest -main -Icontrib/undead
ifndef GUIX
ifdef GUIX_ENVIRONMENT
GUIX=$(GUIX_ENVIRONMENT)
endif
endif
ifdef GUIX
LIBRARY_PATH=$(GUIX)/lib
endif
DLIBS = $(LIBRARY_PATH)/libphobos2-ldc.a $(LIBRARY_PATH)/libdruntime-ldc.a
DLIBS_DEBUG = $(LIBRARY_PATH)/libphobos2-ldc-debug.a $(LIBRARY_PATH)/libdruntime-ldc-debug.a
SRC = $(wildcard contrib/undead/*.d) contrib/undead/*/*.d $(wildcard bio/*.d bio/*/*.d bio/*/*/*.d bio/*/*/*/*.d bio/*/*/*/*/*.d bio/*/*/*/*/*/*.d)
OBJ = $(SRC:.d=.o)
BIN = bin/biod_tests
debug: DFLAGS += -O0 -d-debug -link-debuglib
release static: DFLAGS += -O3 -release -enable-inlining -boundscheck=off
static: DFLAGS += -static -L-Bstatic
all: debug
default: all
default debug release static: $(BIN)
%.o: %.d
$(D_COMPILER) $(DFLAGS) -c $< -od=$(dir $@)
$(BIN): $(OBJ)
$(info linking...)
$(D_COMPILER) $(DFLAGS) $(OBJ) -of=$(BIN)
check: $(BIN)
$(info running tests...)
$(BIN)
clean:
rm -vf $(OBJ)
rm -vf $(BIN)
# find -name '*.o' -exec rm \{\} \;
# BioD [![Build Status](https://travis-ci.org/biod/BioD.svg?branch=master)](https://travis-ci.org/biod/BioD) [![DUB Package](https://img.shields.io/badge/dub-v0.1.0-red.svg)](https://code.dlang.org/packages/biod)
[BioD](https://github.com/biod/BioD) is a fast and memory efficient bioinformatics library written in the [D programming language](http://dlang.org).
[BioD](https://github.com/biod/BioD) is a fast and memory efficient bioinformatics library written in the [D programming language](http://www.dlang.org)
whose aim is to:
BioD aims to:
* Provide a platform for developing high-performance computational biology applications using the [D programming language](http://www.dlang.org) through
- Automatic parallelization of tasks where possible
- Avoiding unnecessary memory allocations
* Provide a platform for writing high-performance bioinformatics applications in D. BioD achieves this by:
- automatic parallelization of tasks where possible for example reading and writing BAM files
- reducing the GC overhead by avoiding unnecessary memory allocations
* Offer support for manipulating common biological data formats
## Why BioD?
## Why D?
BioD leverages on [D programming language](http://www.dlang.org)
features to develop high performance bioinformatics tools
(e.g. [sambamba](https://github.com/biod/sambamba)). The D programming
language is both a low and high-level hybrid object orientated and
functional (OOP/FP) programming language with templating/generic
features are far easier than that of C++.
D is a language that suits parallel programming because of guarantees
the compiler provides. D is both a low-level language and a high-level
hybrid OOP/FP language. There is no other programming language that
matches those features. Also, D templating/generics is far easier that
that of C++ or, say, Scala.
## D programming language resources
That is not to say that D is an easy language. A powerful toolbox will
be complicated. If you want to do everything with a hammer, maybe
better choose Java instead ;).
For more information about D find Andrei Alexandrecu's D book. It is a
classic. Ali Çehreli's book also is recommended.
* [Programming in D](http://ddili.org/ders/d.en/index.html) is online by Ali Çehreli.
* [The D Programming Language](https://www.amazon.com/D-Programming-Language-Andrei-Alexandrescu/dp/0321635361) by Andrei Alexandrecu (great book, slightly out of date)
* [The D Cookbook](https://www.amazon.com/D-Cookbook-Adam-D-Ruppe/dp/1783287217) by Adam D. Ruppe
## Current development
Our current focus is to provide a bamreader and bamwriter that is
really fast and easy to use. We believe the BAM format is here to stay
for the foreseeable future in pipelines. With D we have an good way to
write performance parsers, particularly with three typical scenarios:
Our aim is to provide a set of D modules to manipulate and work with
biological datasets. BioD provides modules for manipulating high
throughput data formats by provifing fast and easy to use native BAM
file reader and writer with ability to iterate a BAM file a read at a
time,a nucleotide at a time (pileup) or via a sliding window.
1. Go through a BAM file a read at a time
2. Go through a BAM file a nucleotide at a time (pileup)
3. Go through a BAM file with a sliding window
## Install
The sliding window is a derivation of the first - a read at a time or
a nucleotide at a time.
The current default is to provide the path to the checked out repo to
the D-compiler. For example,
At this point this functionality is mostly in BioD, but not in an
intuitive way. We are building up this functionality and will give
examples (WIP).
DFLAGS = -wi -I. -IBioD -g
# Install
## Build environment
The current default is to provide the path to the checked out repo to the D-compiler. For example
in sambamba we use
After installing ldc and dub
DFLAGS = -wi -I. -IBioD -g
dub
dub test
It is possible to create a recent build container with the
[GNU guix](https://www.gnu.org/software/guix/) transactional package
manager
guix environment -C guix --ad-hoc ldc dub zlib gdb binutils-gold --network
after getting dropped in the container simply run dub.
If you want to use the make file instead (not requiring the network) use
# Usage
guix environment -C guix --ad-hoc ldc zlib gdb make binutils-gold --no-grafts
make -j 4
make check
See the [examples directory](https://github.com/biod/BioD/tree/master/examples)
## Debugging
When using gdb, switch off these handlers
`handle SIGUSR1 SIGUSR2 nostop noprint`
It can be passed in from the command line
`gdb -iex "handle SIGUSR1 SIGUSR2 no stop noprint" biod_test`
## Usage
See the [examples directory](examples)
for examples and usage.
BioD is also a crucial part of the [sambamba](https://github.com/biod/sambamba) tool.
## Mailing list
# Contributing
[The BioD mailing list](https://groups.google.com/forum/#!forum/dlang_biod)
## Contributing
Simply clone the repository on github and put in a pull request.
# BioD contributors and support
## BioD contributors and support
See
[contributors](https://github.com/biod/BioD/graphs/contributors). For
support use the [issue tracker](https://github.com/biod/BioD/issues) or contact
* [Pjotr Prins](https://github.com/pjotrp)
* [Artem Tarasov](https://github.com/lomereiter)
* [George Githinji](https://github.com/George-Githinji)
* [Prasun Anand](https://github.com/prasunanand)
## License
# License
BioD is free software and licensed under the MIT (expat) [license](./LICENSE).
BioD is licensed under the liberal MIT (expat) [license](./LICENSE).
BioD includes some files from the
[undeaD project](https://github.com/dlang/undeaD) in ./contrib which
are published under a Boost license. This code should be phased out in time.
## ChangeLog v0.2.2 (20190316)
+ Restored make so we can compile without dub
+ Fasta and fasta indexing (.fai) support added (thanks Emilio Palumbo https://github.com/emi80)
+ Mate pair comparison and HI tag support added for BAM (thanks https://github.com/emi80)
+ Added Picard-style comparison for BAM (thanks https://github.com/TimurIs)
+ Added fast whitespace line splitter/tokenizer, a Phobos-style version and a faster C-style version (thanks https://github.com/pjotrp)
+ Added multi-allelic frequencies (MAF) support (thanks https://github.com/pjotrp)
+ Name spaces and directories reorganised (thanks George Gethinji https://github.com/george-githinji)
+ Pulled in D's undead repo (dropped dependency) and minimalised it to actual used files (@pjotrp)
## ChangeLog v0.2.1 (20181004)
+ Fix bunch of deprecation warnings
......
module bio.bam.md.core;
public import bio.bam.md.operation;
public import bio.bam.md.parse;
......@@ -23,7 +23,7 @@
*/
module bio.core.bgzf.block;
import bio.bam.constants;
import bio.std.hts.bam.constants;
import bio.core.utils.memoize;
import bio.core.utils.zlib;
......
......@@ -23,7 +23,7 @@
*/
module bio.core.bgzf.compress;
import bio.bam.constants;
import bio.std.hts.bam.constants;
import bio.core.utils.zlib;
import std.array;
......
......@@ -27,10 +27,10 @@ import bio.core.bgzf.block;
import bio.core.bgzf.virtualoffset;
import bio.core.bgzf.constants;
import bio.core.bgzf.chunk;
import bio.bam.constants;
import bio.std.hts.bam.constants;
import bio.core.utils.roundbuf;
import undead.stream;
import contrib.undead.stream;
import std.exception;
import std.conv;
import std.parallelism;
......
......@@ -28,7 +28,7 @@ import bio.core.bgzf.compress;
import bio.core.utils.roundbuf;
import undead.stream;
import contrib.undead.stream;
import std.exception;
import std.parallelism;
import std.array;
......
/*
This file is part of BioD.
Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
*/
module bio.std.decompress;
/**
Streaming line reader which can be used for gzipped files. Note the
current edition (still) uses the garbage collector. It may help to
switch it off or to use the BioD decompressor used by bgzf.
For a comparison with gzip a 2GB file decompressed with
real 0m53.701s
user 0m53.820s
sys 0m0.572s
while gzip took
real 0m11.528s
user 0m10.288s
sys 0m0.936s
So, that is something to aim for.
Conversion can happen between different encodings, provided the
line terminator is ubyte = '\n'. GzipbyLine logic is modeled on
ByLineImpl and readln function from std.stdio.
*/
import std.algorithm;
// import std.concurrency;
import std.conv;
import std.exception;
import std.file;
import std.parallelism;
import std.stdio: File;
import std.zlib: UnCompress;
struct GzipbyLine(R) {
File f;
UnCompress decompress;
R line;
uint _bufsize;
this(string gzipfn, uint bufsize=0x4000) {
enforce(gzipfn.isFile);
f = File(gzipfn,"r");
decompress = new UnCompress();
_bufsize = bufsize;
}
@disable this(this); // disable copy semantics;
int opApply(scope int delegate(int line, R) dg) {
int line = 0;
// chunk_byLine takes a buffer and splits on \n.
R chunk_byLine(R head, R rest) {
auto split = findSplitAfter(rest,"\n");
// If a new line is found split the in left and right.
auto left = split[0]; // includes eol splitter
auto right = split[1];
if (left.length > 0) { // we have a match!
dg(line++, head ~ left);
return chunk_byLine([], right);
}
// no match
return head ~ right;
}
R tail; // tail of previous buffer
foreach (ubyte[] buffer; f.byChunk(_bufsize))
{
auto buf = cast(R)decompress.uncompress(buffer);
tail = chunk_byLine(tail,buf);
}
if (tail.length > 0) dg(line++, tail);
return 0;
}
}
unittest {
import std.algorithm.comparison : equal;
// writeln("Testing GzipbyLine");
int[] a = [ 1, 2, 4, 7, 7, 2, 4, 7, 3, 5];
auto b = findSplitAfter(a, [7]);
assert(equal(b[0],[1, 2, 4, 7]));
assert(equal(b[1],[7, 2, 4, 7, 3, 5]));
auto b1 = findSplitAfter(b[1], [7]);
assert(equal(b1[0],[7]));
assert(equal(b1[1],[2, 4, 7, 3, 5]));
auto b2 = findSplitAfter([2, 4, 3], [7]);
assert(equal(b2[0],cast(ubyte[])[]));
assert(equal(b2[1],[2,4,3]));
uint chars = 0;
int lines = 0;
foreach(line, ubyte[] s; GzipbyLine!(ubyte[])("test/data/BXD_geno.txt.gz")) {
// test file contains 7320 lines 4707218 characters
// write(cast(string)s);
chars += s.length;
lines = line;
}
assert(lines == 7319,"genotype lines " ~ to!string(lines+1)); // fails with ldc2 < 1.10!
assert(chars == 4707218,"chars " ~ to!string(chars));
}
/**
Mmfile threaded version of streaming line reader which can be used
for gzipped files. Note the current edition is slower than
GzipbyLine above and (still) uses the garbage collector. It may
help to switch it off or to use the BioD decompressor used by bgzf.
Conversion can happen between different encodings, provided the
line terminator is ubyte = '\n'. GzipbyLine logic is modeled on
ByLineImpl and readln function from std.stdio.
*/
import std.mmfile;
import core.thread;
struct GzipbyLineThreaded(R) {
string fn;
UnCompress decompress;
R line;
// Nullable!ubyte[] uncompressed_buf;
uint _bufsize;
this(string gzipfn, uint bufsize=0x4000) {
enforce(gzipfn.isFile);
fn = gzipfn;
decompress = new UnCompress();
_bufsize = bufsize;
}
@disable this(this); // disable copy semantics;
int opApply(scope int delegate(int line, R) dg) {
int line = 0;
// chunk_byLine takes a buffer and splits on \n.
R chunk_byLine(R head, R rest) {
auto split = findSplitAfter(rest,"\n");
// If a new line is found split the in left and right.
auto left = split[0]; // includes eol splitter
auto right = split[1];
if (left.length > 0) { // we have a match!
dg(line++, head ~ left);
return chunk_byLine([], right);
}
// no match
return head ~ right;
}
R decompressor(ubyte[] buffer) {
return cast(R)decompress.uncompress(buffer);
}
auto mmf = new MmFile(fn);
immutable mmf_length = mmf.length();
long rest = mmf_length;
R tail; // tail of previous buffer
// Decompress the first chunk
auto buffer1 = cast(ubyte[])mmf[0.._bufsize];
rest -= buffer1.length;
auto buf = decompressor(buffer1);
uint chunknum = 1;
while(rest>0) {
// Get the next chunk
ulong pos2 = (chunknum+1)*_bufsize;
if (pos2 > mmf_length) pos2 = cast(ulong)mmf_length;
auto buffer2 = cast(ubyte[])mmf[chunknum*_bufsize..mmf_length];
rest -= buffer2.length;
// Set up decompressing the next chunk
auto t = task(&decompressor, buffer2);
// auto t = task!decompressor(buffer2);
t.executeInNewThread();
// now invoke the delegate
tail = chunk_byLine(tail,buf);
buf = t.yieldForce();
chunknum += 1;
}
tail = chunk_byLine(tail,buf);
if (tail.length > 0) dg(line++, tail);
return 0;
}
}
unittest {
int lines = 0;
uint chars = 0;
foreach(line, ubyte[] s; GzipbyLineThreaded!(ubyte[])("test/data/BXD_geno.txt.gz")) {
// test file contains 7320 lines 4707218 characters
// write(cast(string)s);
chars += s.length;
lines = line;
}
assert(lines == 7319,"genotype lines " ~ to!string(lines+1));
assert(chars == 4707218,"chars " ~ to!string(chars));
}
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.core.fasta;
import std.file;
import std.exception;
import std.algorithm;
import std.string;
struct FastaRecord {
string header;
string sequence;
}
auto fastaRecords(string filename) {
static auto toFastaRecord(S)(S str) {
auto res = findSplit(str, "\n");
auto header = res[0];
auto seq = res[2];
return FastaRecord(header, removechars(seq, "\n"));
}
string text = cast(string)std.file.read(filename);
enforce(text.length > 0 && text[0] == '>');
text = text[1 .. $];
auto records = splitter(text, '>');
return map!toFastaRecord(records);
}
module bio.core.utils.stream;
public import undead.stream;
public import contrib.undead.stream;
import core.stdc.stdio;
import core.stdc.errno;
import core.stdc.string;
......@@ -48,7 +48,7 @@ FileMode toFileMode(string mode) {
return result;
}
final class File: undead.stream.File {
final class File: contrib.undead.stream.File {
this(string filename, string mode="rb") {
version (Posix) {
// Issue 8528 workaround
......
/**
(Almost) a copy-paste from undead.stream.d
(Almost) a copy-paste from contrib.undead.stream.d
*/
module bio.core.utils.switchendianness;
......
......@@ -418,10 +418,10 @@
write data;
}%%
import bio.sam.header;
import bio.bam.cigar;
import bio.bam.read;
import bio.bam.bai.bin;
import bio.std.hts.sam.header;
import bio.std.hts.bam.cigar;
import bio.std.hts.bam.read;
import bio.std.hts.bam.bai.bin;
import bio.core.utils.outbuffer;
import bio.core.base;
import std.conv;
......@@ -508,7 +508,7 @@ unittest {
assert(equal!approxEqual(to!(float[])(alignment["Y1"]), [13.263, -3.1415, 52.63461]));
assert(to!char(alignment["XT"]) == 'U');
import bio.bam.reference;
import bio.std.hts.bam.reference;
auto info = ReferenceSequenceInfo("20", 1234567);
......