Skip to content
Commits on Source (7)
.dub/
dub.selections.json
BioD/
BioD
./BioD
undeaD/
lz4/
core
bin/
test*
build/
shunit*
/*.sam
......@@ -6,4 +16,9 @@ shunit*
/*.cram
/*.crai
/*.txt
*.hex
*.zcat
*.out
/utils/ldc_version_info_.d
profile.data
profile.raw
[submodule "BioD"]
path = BioD
url = https://github.com/biod/BioD.git
[submodule "htslib"]
path = htslib
url = https://github.com/lomereiter/htslib.git
[submodule "lz4"]
path = lz4
url = https://github.com/Cyan4973/lz4
[submodule "undeaD"]
path = undeaD
url = https://github.com/dlang/undeaD
[submodule "BioD"]
path = BioD
url = https://github.com/biod/BioD.git
[submodule "lz4"]
path = lz4
url = https://github.com/lz4/lz4.git
language: d
d:
- ldc
matrix:
# OSX testing is under development
allow_failures:
- os: osx
include:
- os: linux
compiler: gcc
addons:
apt:
packages:
# Our dev environment is a more recent GNU C++
# note that Debian liblz4-dev no longer supports LZ4 frames
# - liblz4-dev
- shunit2
- os: osx
compiler: clang
script:
- make
# - make test - disable tests because shunit2 we use is older
\ No newline at end of file
# INSTALL SAMBAMBA
## Sambamba dependencies
* D compiler
* gcc tool chain
* BioD
* htslib
* undeaD
* libz
* liblz4
D_COMPILER=dmd
D_FLAGS=--compiler=dmd -IBioD -IundeaD/src -g -d#-O -release -inline # -version=serial
LDMD=ldmd2
# This is a minimalistic make file to build sambamba with ldc2 as per instructions on
# https://github.com/biod/sambamba#compiling-sambamba
#
# Typical usage:
#
# make LIBRARY_PATH=~/opt/ldc2-1.7.0-linux-x86_64/lib debug|profile|release|static
#
# Static release with optimization (for releases):
#
# make LIBRARY_PATH=~/opt/ldc2-1.7.0-linux-x86_64/lib pgo-static
#
# Debug version
#
# make LIBRARY_PATH=~/opt/ldc2-1.7.0-linux-x86_64/lib debug
STATIC_LIB_PATH=-Lhtslib -Llz4/lib
STATIC_LIB_SUBCMD=$(STATIC_LIB_PATH) -Wl,-Bstatic -lhts -llz4 -Wl,-Bdynamic
RDMD_FLAGS=--force --build-only --compiler=$(D_COMPILER) $(D_FLAGS)
D_COMPILER=ldc2
DFLAGS = -wi -I. -IBioD -IundeaD/src -g
PLATFORM := $(shell uname -s)
DLIBS = $(LIBRARY_PATH)/libphobos2-ldc.a $(LIBRARY_PATH)/libdruntime-ldc.a
DLIBS_DEBUG = $(LIBRARY_PATH)/libphobos2-ldc-debug.a $(LIBRARY_PATH)/libdruntime-ldc-debug.a
LIBS = htslib/libhts.a lz4/lib/liblz4.a -L-L$(LIBRARY_PATH) -L-lrt -L-lpthread -L-lm
LIBS_STATIC = $(LIBRARY_PATH)/libc.a $(DLIBS) htslib/libhts.a $(LIBRARY_PATH)/liblz4.a
SRC = $(wildcard main.d utils/*.d thirdparty/*.d cram/*.d) $(wildcard undeaD/src/undead/*.d) $(wildcard BioD/bio/*/*.d BioD/bio/*/*/*.d BioD/bio2/*.d BioD/bio2/*/*.d) $(wildcard sambamba/*.d sambamba/*/*.d sambamba/*/*/*.d)
OBJ = $(SRC:.d=.o) utils/ldc_version_info_.o
OUT = bin/sambamba
ifeq "$(PLATFORM)" "Darwin"
STATIC_LIB_PATH=-Lhtslib -Llz4
LINK_CMD=gcc -dead_strip -lphobos2-ldc -ldruntime-ldc -lm -lpthread htslib/libhts.a lz4/lib/liblz4.a build/sambamba.o -o build/sambamba
DMD_STATIC_LIBS=htslib/libhts.a lz4/lib/liblz4.a
.PHONY: all debug release static clean test
define split-debug
dsymutil build/sambamba -o build/sambamba.dSYM
strip -S build/sambamba
endef
debug: DFLAGS += -O0 -d-debug -link-debuglib
else
profile: DFLAGS += -fprofile-instr-generate=profile.raw
LINK_CMD=gcc -Wl,--gc-sections -o build/sambamba build/sambamba.o $(STATIC_LIB_SUBCMD) -lphobos2-ldc -ldruntime-ldc -lrt -lpthread -lm -ldl
DMD_STATIC_LIBS=-L-Lhtslib -L-l:libhts.a -L-l:libphobos2.a -L-Llz4/lib -L-l:liblz4.a
release static profile pgo-static: DFLAGS += -O3 -release -enable-inlining -boundscheck=off
define split-debug
objcopy --only-keep-debug build/sambamba sambamba.debug
objcopy --strip-debug build/sambamba
objcopy --add-gnu-debuglink=sambamba.debug build/sambamba
mv sambamba.debug build/
endef
static: DFLAGS += -static -L-Bstatic
endif
pgo-static: DFLAGS += -fprofile-instr-use=profile.data
PREREQS := ldc-version-info htslib-static lz4-static
all: release
# DMD only - this goal is used because of fast compilation speed, during development
all: $(PREREQS)
mkdir -p build/
rdmd --force --build-only $(D_FLAGS) $(DMD_STATIC_LIBS) -ofbuild/sambamba main.d
lz4-static: lz4/lib/liblz4.a
# This is the main Makefile goal, used for building releases (best performance)
sambamba-ldmd2-64: $(PREREQS)
mkdir -p build/
$(LDMD) @sambamba-ldmd-release.rsp
$(LINK_CMD)
$(split-debug)
lz4/lib/liblz4.a: lz4/lib/lz4.c lz4/lib/lz4hc.c lz4/lib/lz4frame.c lz4/lib/xxhash.c
cd lz4/lib && gcc -O3 -c lz4.c lz4hc.c lz4frame.c xxhash.c && $(AR) rcs liblz4.a lz4.o lz4hc.o lz4frame.o xxhash.o
# For debugging; GDB & Valgrind are more friendly to executables created using LDC/GDC than DMD
sambamba-ldmd2-debug: $(PREREQS)
mkdir -p build/
$(LDMD) @sambamba-ldmd-debug.rsp
$(LINK_CMD)
htslib-static:
cd htslib && $(MAKE)
ldc-version-info:
./gen_ldc_version_info.py $(shell which $(LDMD)) > utils/ldc_version_info_.d
./gen_ldc_version_info.py $(shell which ldmd2) > utils/ldc_version_info_.d
cat utils/ldc_version_info_.d
htslib-static:
cd htslib && $(MAKE)
utils/ldc_version_info_.o: ldc-version-info
$(D_COMPILER) $(DFLAGS) -c utils/ldc_version_info_.d -od=$(dir $@)
lz4-static: lz4/lib/liblz4.a
build-setup: lz4-static htslib-static ldc-version-info
mkdir -p bin/
lz4/lib/liblz4.a: lz4/lib/lz4.c lz4/lib/lz4hc.c lz4/lib/lz4frame.c lz4/lib/xxhash.c
cd lz4/lib && $(CC) -O3 -c lz4.c lz4hc.c lz4frame.c xxhash.c && $(AR) rcs liblz4.a lz4.o lz4hc.o lz4frame.o xxhash.o
default debug release static: $(OUT)
# all below link to libhts dynamically for simplicity
profile: release
./bin/sambamba sort /gnu/data/in_raw.bam -p > /dev/null
ldc-profdata merge -output=profile.data profile.raw
rm ./bin/sambamba ./bin/sambamba.o # trigger rebuild
sambamba-flagstat:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-flagstat sambamba/flagstat.d
default: all
sambamba-merge:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-merge sambamba/merge.d
# ---- Compile step
%.o: %.d
$(D_COMPILER) $(DFLAGS) -c $< -od=$(dir $@)
sambamba-index:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-index sambamba/index.d
singleobj:
$(info compile single object...)
$(D_COMPILER) -singleobj $(DFLAGS) -c -of=bin/sambamba.o $(SRC)
sambamba-sort:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-sort sambamba/sort.d
# ---- Link step
$(OUT): build-setup singleobj utils/ldc_version_info_.o
$(info linking...)
$(D_COMPILER) $(DFLAGS) -of=bin/sambamba bin/sambamba.o utils/ldc_version_info_.o $(LIBS)
sambamba-view:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-view sambamba/view.d
test:
./run_tests.sh
sambamba-slice:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-slice sambamba/slice.d
check: test
sambamba-markdup:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-markdup sambamba/markdup.d
debug-strip:
objcopy --only-keep-debug bin/sambamba sambamba.debug
objcopy --strip-debug bin/sambamba
objcopy --add-gnu-debuglink=sambamba.debug bin/sambamba
mv sambamba.debug bin/
sambamba-depth:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-depth sambamba/depth.d
pgo-static: profile static debug-strip
sambamba-pileup:
mkdir -p build/
rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/sambamba-pileup sambamba/pileup.d
install:
install -m 0755 bin/sambamba $(prefix)/bin
.PHONY: clean ldc-version-info
# Remove everything the build produced: D objects and binaries (via clean-d),
# htslib's own build products, and the PGO profile data written by the
# profile / pgo-static targets.
clean: clean-d
	# Use $(MAKE) -C instead of `cd htslib ; make clean`: if the cd failed, the
	# old form ran `make clean` in this directory and re-entered this very rule.
	# $(MAKE) also propagates -j / -n to the sub-make.
	$(MAKE) -C htslib clean
	rm -f profile.data profile.raw
clean:
rm -rf build/ ; $(MAKE) -C htslib clean ; $(MAKE) -C lz4 clean
# Remove the D build products: everything under bin/, the object files and
# the final binary, plus the trace.def / trace.log profiling files.
clean-d:
	rm -rf bin/*
	# The two trace files are listed explicitly: `trace.{def,log}` is a bash
	# brace expansion that Make's default /bin/sh (e.g. dash) passes through
	# literally, so the files were never actually removed.
	rm -f $(OBJ) $(OUT) trace.def trace.log
# GNU Guix makefile
# GNU Guix makefile. GNU Guix is the package manager for GNU and we use it for
# sambamba development and deployment. Normally use the standard Makefile
# instead.
#
# To build sambamba on GNU Guix:
#
......@@ -6,33 +8,46 @@
#
# run with
#
# ./build/sambamba
# ./bin/sambamba
#
# certain paths may need to be set, e.g.
#
# make UNDEAD_PATH=../undeaD/src -f Makefile.guix
#
# For more information see INSTALL.md
#
# The following two are modified by the Guix package:
D_COMPILER=ldc2
LDC_LIB_PATH=$(HOME)/.guix-profile/lib
DFLAGS = -wi -I. -IBioD -IundeaD/src
DLIBS = $(LDC_LIB_PATH)/libphobos2-ldc.a $(LDC_LIB_PATH)/libdruntime-ldc.a
DLIBS_DEBUG = $(LDC_LIB_PATH)/libphobos2-ldc-debug.a $(LDC_LIB_PATH)/libdruntime-ldc-debug.a
UNDEAD_PATH=../undeaD/src
BIOD_PATH=../BioD
DFLAGS = -wi -I. -I$(BIOD_PATH) -IundeaD/src -I$(UNDEAD_PATH)
# DLIBS = $(LDC_LIB_PATH)/libphobos2-ldc.a $(LDC_LIB_PATH)/libdruntime-ldc.a
# DLIBS_DEBUG = -debuglib=phobos2-ldc-debug,druntime-ldc-debug $(LDC_LIB_PATH)/libphobos2-ldc-debug.a $(LDC_LIB_PATH)/libdruntime-ldc-debug.a -link-debuglib
DLIBS_DEBUG = -debuglib=phobos2-ldc-debug-shared,druntime-ldc-debug-shared -link-debuglib -L-lphobos2-ldc-debug-shared -L-ldruntime-ldc-debug-shared
RPATH = -L--rpath=$(dir $(realpath $(LDC_LIB_PATH)/libz.so)):$(dir $(realpath $(LDC_LIB_PATH)/liblz4.so))
LIBS = htslib/libhts.a -L-L$(LDC_LIB_PATH) -L-lrt -L-lpthread -L-lm -L-lz -L-llz4
LIBS_STATIC = $(LDC_LIB_PATH)/libc.a $(DLIBS) htslib/libhts.a $(LDC_LIB_PATH)/liblz4.a
SRC = $(wildcard main.d utils/*.d thirdparty/*.d cram/*.d) $(wildcard undeaD/src/undead/*.d) $(wildcard BioD/bio/*/*.d BioD/bio/*/*/*.d) $(wildcard sambamba/*.d sambamba/*/*.d sambamba/*/*/*.d)
OBJ = $(SRC:.d=.o) utils/ldc_version_info_.o
OUT = build/sambamba
# LIBS = htslib/libhts.a -L-lrt -L-lpthread -L-lm -L-lz -L-llz4
LIBS_STATIC = $(DLIBS) htslib/libhts.a $(LDC_LIB_PATH)/liblz4.a
SRC = $(wildcard utils/*.d thirdparty/*.d cram/*.d) $(wildcard $(UNDEAD_PATH)/undead/**/*.d) $(wildcard $(UNDEAD_PATH)/undead/*.d) $(wildcard $(BIOD_PATH)/bio/*/*.d $(BIOD_PATH)/bio/*/*/*.d) $(wildcard sambamba/*.d sambamba/*/*.d sambamba/*/*/*.d) utils/ldc_version_info_.d
OBJ = $(SRC:.d=.o)
OUT = bin/sambamba
.PHONY: all guix guix-debug debug release static profile clean test
# The Guix targets resolve the RPATH automatically
guix: DFLAGS += -O -release -g # Guix strips debug flags
guix-debug: DFLAGS += -O0 -g -d-debug
guix-debug: DFLAGS += -O0 -g -d-debug -unittest
# The following options are run in development from ~/.guix-profile and need to inject the RPATH
debug: DFLAGS += -O0 -g -d-debug $(RPATH) -link-debuglib
debug: DFLAGS += -O0 -g -d-debug $(RPATH) -link-debuglib -unittest
release: DFLAGS += -O -release $(RPATH)
release static: DFLAGS += -O3 -release $(RPATH) -enable-inlining -Hkeep-all-bodies -boundscheck=off
static: DFLAGS += -O -release -static -L-Bstatic
static: DFLAGS += -static -L-Bstatic -L-L/gnu/store/rmjlycdgiq8pfy5hfi42qhw3k7p6kdav-glibc-2.25/lib/
profile: DFLAGS += -g -O -profile $(RPATH)
......@@ -42,21 +57,17 @@ static: LIBS = $(LIBS_STATIC)
guix-debug debug profile: LIBS += $(DLIBS_DEBUG)
.PHONY: all guix guix-debug debug release static profile clean test
all: debug
htslib-static:
cd htslib && $(MAKE)
# bio2/pileup.d: bio2/bam/reader.d
ldc-version-info:
./gen_ldc_version_info.py $(shell which ldmd2) > utils/ldc_version_info_.d
# sambamba/subsample.d: bio2/pileup.d
utils/ldc_version_info_.o: ldc-version-info
$(D_COMPILER) $(DFLAGS) -c utils/ldc_version_info_.d -od=$(dir $@)
utils/ldc_version_info_.d:
./gen_ldc_version_info.py $(shell which ldc2) > utils/ldc_version_info_.d
build-setup: htslib-static ldc-version-info
mkdir -p build/
utils/ldc_version_info_.o: utils/ldc_version_info_.d
$(D_COMPILER) $(DFLAGS) -c utils/ldc_version_info_.d -od=$(dir $@)
guix guix-debug default debug release static profile: $(OUT)
......@@ -65,26 +76,36 @@ guix guix-debug default debug release static profile: $(OUT)
$(D_COMPILER) $(DFLAGS) -c $< -od=$(dir $@)
# ---- Link step
$(OUT): build-setup $(OBJ)
$(D_COMPILER) $(DFLAGS) -of=build/sambamba $(OBJ) $(LIBS)
$(OUT): $(OBJ)
cd htslib && $(MAKE)
mkdir -p bin/
$(D_COMPILER) $(DFLAGS) -of=bin/sambamba $(OBJ) $(LIBS)
test:
test: clean-tests
./run_tests.sh
check: test
# Run the default target of the Makefile in BioD's src_ragel directory.
# $(MAKE) (rather than a literal `make`) propagates -j / -n and command-line
# variables to the sub-make.
biod:
	$(MAKE) -C $(BIOD_PATH)/src_ragel
check: all test
debug-strip: debug
objcopy --only-keep-debug build/sambamba sambamba.debug
objcopy --strip-debug build/sambamba
objcopy --add-gnu-debuglink=sambamba.debug build/sambamba
mv sambamba.debug build/
objcopy --only-keep-debug bin/sambamba sambamba.debug
objcopy --strip-debug bin/sambamba
objcopy --add-gnu-debuglink=sambamba.debug bin/sambamba
mv sambamba.debug bin/
install:
install -m 0755 build/sambamba $(prefix)/bin
install -m 0755 bin/sambamba $(prefix)/bin
clean: clean-d
# Clean the bundled htslib (C) build products.
# $(MAKE) -C replaces `cd htslib ; make clean`: with `;` a failed cd would run
# `make clean` in the current directory instead of in htslib.
clean-c:
	$(MAKE) -C htslib clean
clean-d:
rm -rf build/*
rm -f $(OBJ) $(OUT) trace.{def,log}
rm -v $(OBJ) $(OUT) trace.{def,log}
rm -v bin/*
clean-tests:
rm -rf output/*
clean: clean-d clean-tests
# Compiler that rdmd is told to drive for the standalone tool builds
# (passed as --compiler= in RDMD_FLAGS below).
D_COMPILER=dmd
# Flags for the rdmd-based builds.
# NOTE: the first `#` starts a Make comment, so the effective value ends at
# `-d`; the optimisation flags that follow it (-O -release -inline) and
# -version=serial are currently commented out.
D_FLAGS=--compiler=dmd -IBioD -IundeaD/src -g -d#-O -release -inline # -version=serial
# ldmd2 driver, used by the sambamba-ldmd2-* targets and by ldc-version-info.
LDMD=ldmd2
# Library search paths for the bundled htslib and lz4 static archives.
STATIC_LIB_PATH=-Lhtslib -Llz4/lib
# Linker arguments: switch to static linking for libhts and liblz4 only, then
# back to dynamic linking for whatever follows on the command line.
STATIC_LIB_SUBCMD=$(STATIC_LIB_PATH) -Wl,-Bstatic -lhts -llz4 -Wl,-Bdynamic
# Common rdmd options for the standalone sambamba-<tool> targets.
# NOTE(review): --compiler is given here and again inside D_FLAGS; overriding
# D_COMPILER alone therefore leaves a second --compiler=dmd on the command line.
RDMD_FLAGS=--force --build-only --compiler=$(D_COMPILER) $(D_FLAGS)
# Host platform as reported by `uname -s`; selects the link command, the
# static-library flags for the DMD build, and the debug-symbol splitting steps.
PLATFORM := $(shell uname -s)

ifeq "$(PLATFORM)" "Darwin"

# macOS: link build/sambamba.o against the bundled static archives, letting
# the linker drop unreferenced code (-dead_strip).
# $(CC) is used instead of a hard-coded gcc so the driver can be overridden
# from the command line, consistent with the lz4 rule in this file.
LINK_CMD=$(CC) -dead_strip -lphobos2-ldc -ldruntime-ldc -lm -lpthread htslib/libhts.a lz4/lib/liblz4.a build/sambamba.o -o build/sambamba
DMD_STATIC_LIBS=htslib/libhts.a lz4/lib/liblz4.a

# Write the debug symbols to build/sambamba.dSYM, then strip them from the binary.
# (No comments inside the define: they would become part of the recipe text.)
define split-debug
dsymutil build/sambamba -o build/sambamba.dSYM
strip -S build/sambamba
endef

else

# Everything else (Linux): libhts and liblz4 are linked statically through
# STATIC_LIB_SUBCMD, the D runtime and system libraries dynamically;
# --gc-sections lets the linker discard unused sections.
LINK_CMD=$(CC) -Wl,--gc-sections -o build/sambamba build/sambamba.o $(STATIC_LIB_SUBCMD) -lphobos2-ldc -ldruntime-ldc -lrt -lpthread -lm -ldl
DMD_STATIC_LIBS=-L-Lhtslib -L-l:libhts.a -L-l:libphobos2.a -L-Llz4/lib -L-l:liblz4.a

# Copy the debug info into sambamba.debug, strip it from the binary, record a
# debuglink to the separate file, and move that file next to the binary.
define split-debug
objcopy --only-keep-debug build/sambamba sambamba.debug
objcopy --strip-debug build/sambamba
objcopy --add-gnu-debuglink=sambamba.debug build/sambamba
mv sambamba.debug build/
endef

endif
# Targets that every full build needs first: the generated version-info
# module and the two bundled C libraries.
PREREQS := ldc-version-info htslib-static lz4-static

# Default goal. Builds build/sambamba from main.d with rdmd using D_FLAGS;
# intended for development, where fast compilation matters most.
all: $(PREREQS)
	mkdir -p build/
	rdmd --force --build-only \
	    $(D_FLAGS) $(DMD_STATIC_LIBS) \
	    -ofbuild/sambamba main.d
# Release build (best performance): compile with the options listed in
# sambamba-ldmd-release.rsp, link with the platform's LINK_CMD, then split
# the debug symbols out of the binary.
sambamba-ldmd2-64: $(PREREQS)
	mkdir -p build/
	$(LDMD) @sambamba-ldmd-release.rsp
	$(LINK_CMD)
	$(split-debug)

# Debug build: same steps with sambamba-ldmd-debug.rsp, but the debug symbols
# stay in the binary. GDB and Valgrind cope better with LDC/GDC executables
# than with DMD ones.
sambamba-ldmd2-debug: $(PREREQS)
	mkdir -p build/
	$(LDMD) @sambamba-ldmd-debug.rsp
	$(LINK_CMD)
# Run gen_ldc_version_info.py on the $(LDMD) executable found on PATH and
# store its output as utils/ldc_version_info_.d.
ldc-version-info:
	./gen_ldc_version_info.py $(shell which $(LDMD)) > utils/ldc_version_info_.d

# Build the bundled htslib with its own Makefile.
htslib-static:
	$(MAKE) -C htslib

# Convenience alias for the bundled lz4 static archive.
lz4-static: lz4/lib/liblz4.a

# Compile the four lz4 sources in place and archive the objects into liblz4.a.
lz4/lib/liblz4.a: $(addprefix lz4/lib/,lz4.c lz4hc.c lz4frame.c xxhash.c)
	cd lz4/lib && \
	    $(CC) -O3 -c lz4.c lz4hc.c lz4frame.c xxhash.c && \
	    $(AR) rcs liblz4.a lz4.o lz4hc.o lz4frame.o xxhash.o
# Standalone single-tool builds; all of them link to libhts dynamically for
# simplicity.
#
# `make sambamba-<tool>` compiles sambamba/<tool>.d with -version=standalone
# into build/sambamba-<tool>.
STANDALONE_TOOLS := flagstat merge index sort view slice markdup depth pileup
STANDALONE_TARGETS := $(addprefix sambamba-,$(STANDALONE_TOOLS))

# One static pattern rule replaces nine copy-pasted rules. $@ is the full
# target name (e.g. sambamba-sort) and $< the matching source file. Listing
# the source as a prerequisite makes a missing file an immediate make error.
# The targets never exist as files, so the recipe still runs on every request.
$(STANDALONE_TARGETS): sambamba-%: sambamba/%.d
	mkdir -p build/
	rdmd $(RDMD_FLAGS) -L-lhts -version=standalone -ofbuild/$@ $<
# Every target here is a command name, not a file this Makefile creates.
# Declaring them phony keeps a stray file of the same name from silently
# disabling the target.
.PHONY: all clean ldc-version-info htslib-static lz4-static \
        sambamba-ldmd2-64 sambamba-ldmd2-debug \
        sambamba-flagstat sambamba-merge sambamba-index sambamba-sort \
        sambamba-view sambamba-slice sambamba-markdup sambamba-depth \
        sambamba-pileup

# Remove build/ and clean both bundled libraries. The steps are joined with
# `;` so each one is attempted even if an earlier one fails.
clean:
	rm -rf build/ ; $(MAKE) -C htslib clean ; $(MAKE) -C lz4 clean
[![Build Status](https://travis-ci.org/biod/sambamba.svg?branch=master)](https://travis-ci.org/biod/sambamba) [![Anaconda-Server Badge](https://anaconda.org/bioconda/sambamba/badges/installer/conda.svg)](https://conda.anaconda.org/bioconda) [![DL](https://anaconda.org/bioconda/sambamba/badges/downloads.svg)](https://anaconda.org/bioconda/sambamba)
# sambamba
[![Anaconda-Server Badge](https://anaconda.org/bioconda/sambamba/badges/installer/conda.svg)](https://conda.anaconda.org/bioconda) [![DL](https://anaconda.org/bioconda/sambamba/badges/downloads.svg)](https://anaconda.org/bioconda/sambamba)
## Table of Contents
......@@ -14,30 +15,50 @@
<a name="intro"></a>
# Introduction
Sambamba is a high performance modern robust and fast tool (and
library), written in the D programming language, for working with SAM
and BAM files. Current functionality is an important
subset of samtools functionality, including view, index, sort,
markdup, and depth. Most tools support piping: just specify `/dev/stdin`
or `/dev/stdout` as filenames.
For almost 5 years the main advantage over `samtools` was parallelized BAM reading.
Finally in March 2017 `samtools` 1.4 was released, reaching parity on this.
That said, we still have quite a few interesting features to offer:
- faster `sort` (no benchmarks yet, sorry)
Sambamba is a high performance highly parallel robust and fast tool
(and library), written in the D programming language, for working with
SAM and BAM files. Because of its efficiency it is an important
workhorse, running in many sequencing centres around the world
today.
Current functionality is an important subset of samtools
functionality, including view, index, sort, markdup, and depth. Most
tools support piping: just specify `/dev/stdin` or `/dev/stdout` as
filenames. When we started writing sambamba (in 2012) the main
advantage over `samtools` was parallelized BAM reading and writing.
In March 2017 `samtools` 1.4 was released, reaching parity on this. A
[recent performance comparison](https://github.com/guigolab/sambamBench-nf)
shows that sambamba holds its ground and can do better in different
configurations. Here are some comparison
[metrics](https://public-docs.crg.es/rguigo/Data/epalumbo/sambamba_ws_report.html). For
example for flagstat sambamba is 1.4x faster than samtools. For index
they are similar. For Markdup almost 6x faster and for view 4x
faster. For sort sambamba has been beaten generally, though sambamba
is up to 2x faster on large RAM machines.
In addition sambamba has a few interesting features to offer, in particular
- faster large machine `sort`, see [performance](./test/benchmark/stats.org)
- automatic index creation when writing any coordinate-sorted file
- `view -L <bed file>` utilizes BAM index to skip unrelated chunks
- `depth` allows measuring base, sliding window, or region coverage
- [Chanjo](https://www.chanjo.co/) builds upon this and gets you to exon/gene levels of abstraction
- `markdup`, a fast implementation of Picard algorithm
- `slice` quickly extracts a region into a new file, tweaking only first/last chunks
- and more
Even though Sambamba started out as a samtools clone we are now in the
process of adding new functionality - also in the
[BioD project](https://github.com/biod/BioD). The D language is
extremely suitable for high performance computing. At this point we
think that the BAM format is here to stay for processing sequencing
data and we aim to make it easy to parse and process BAM files.
Sambamba is free and open source software, licensed under GPLv2+.
See manual pages [online](https://lomereiter.github.io/sambamba/docs/sambamba-view.html)
to know more about what is available and how to use it.
For more information on Sambamba you can contact Artem Tarasov and Pjotr Prins.
For more information on Sambamba contact the mailing list (see below).
<a name="install"></a>
# Binary installation
......@@ -49,11 +70,11 @@ are Github source and binary
the tarball, unpack it and run it. For example
```sh
wget https://github.com/biod/sambamba/releases/download/v0.6.6/sambamba_v0.6.6_linux.tar.bz2
tar xvjf sambamba_v0.6.6_linux.tar.bz2
./sambamba_v0.6.6
wget https://github.com/biod/sambamba/releases/download/v0.6.8/sambamba_v0.6.8_linux.tar.bz2
tar xvjf sambamba_v0.6.8_linux.tar.bz2
./sambamba_v0.6.8
sambamba 0.6.6
sambamba 0.6.8
Usage: sambamba [command] [args...]
......@@ -62,35 +83,6 @@ tar xvjf sambamba_v0.6.6_linux.tar.bz2
To get help on a particular command, just call it without args.
```
## Install latest pre-release
A *latest* pre-release of sambamba 0.6.7 for Linux that includes debug
information and *all* dependencies is available from this
[link](http://test-gn2.genenetwork.org/ipfs/QmakasNfZhdbPA3xJYNxNX7at5FtYnS4hUNnvDbzxhZf2J). This
24Mb download reflects the development edition and includes recent
versions of libraries, samtools and bcftools. It should install on any
Linux distribution, including old ones on HPC clusters.
Install the tarball by unpacking it and running the contained install
script with a target directory e.g.
```sh
wget http://test-gn2.genenetwork.org/ipfs/QmakasNfZhdbPA3xJYNxNX7at5FtYnS4hUNnvDbzxhZf2J/hb13hjys1064jmb6z17yc1f822hv9zsz-sambamba-0.6.7-pre1-7cff065-x86_64.tar.bz2
tar xvjf QmakasNfZhdbPA3xJYNxNX7at5FtYnS4hUNnvDbzxhZf2J/hb13hjys1064jmb6z17yc1f822hv9zsz-sambamba-0.6.7-pre1-7cff065-x86_64.tar.bz2
./install.sh ~/sambamba-test
~/sambamba-test/bin/sambamba
sambamba 0.6.7-pre1
Usage: sambamba [command] [args...]
Available commands: 'view', 'index', 'merge', 'sort',
'flagstat', 'slice', 'markdup', 'depth', 'mpileup'
```
Binaries are also available through the following packaging tools (note the version numbers):
## Bioconda install
[![Install with CONDA](https://anaconda.org/bioconda/sambamba/badges/installer/conda.svg)](https://anaconda.org/bioconda/sambamba)
......@@ -126,8 +118,7 @@ similar. Also try running the latest version of sambamba to make sure
it has not been fixed already. Support/installation questions should
be aimed at the mailing list. The issue tracker is for development
issues around the software itself. When reporting an issue include the
output of the program and the contents of the .log.txt file in the
output directory.
output of the program and the contents of the output directory.
## Check list:
......@@ -139,7 +130,7 @@ output directory.
6. [ ] If it is a support/install question I have posted it to the [mailing list](https://groups.google.com/forum/#!forum/sambamba-discussion)
7. [ ] If it is software development related I have posted a new issue on the [issue tracker](https://github.com/biod/sambamba/issues) or added to an existing one
8. [ ] In the message I have included the output of my sambamba run
9. [ ] In the message I have included the relevant .log.txt file in the output directory
9. [ ] In the message I have included the relevant files in the output directory
10. [ ] I have made available the data to reproduce the problem (optional)
To find bugs the sambamba software developers may ask to install a
......@@ -168,35 +159,48 @@ which targets LLVM.
## Compilation dependencies
- zlib
- git (to check out the repo)
- gcc compiler 4.9 or later (for htslib)
- D compiler 1.7.0 or later (ldc2, see below)
- python2 (parses D-compiler header for version info)
- zlib (library)
- lz4 (library)
- htslib (submodule)
- lz4 (submodule)
- BioD (submodule)
- undeaD (submodule)
- BioD (source)
- undeaD (source)
- python2
## Compiling for Linux
The LDC compiler's github repository also provides binary images. The current
preferred release for sambamba is LDC - the LLVM D compiler (>= 1.1.0). After
installing LDC:
The LDC compiler's github repository provides binary images. The current
preferred release for sambamba is LDC - the LLVM D compiler (>= 1.6.1). After
installing LDC from https://github.com/ldc-developers/ldc/releases/ with, for example
```sh
cd
wget https://github.com/ldc-developers/ldc/releases/download/v1.7.0/ldc2-1.7.0-linux-x86_64.tar.xz
tar xvJf ldc2-1.7.0-linux-x86_64.tar.xz
export PATH=$HOME/ldc2-1.7.0-linux-x86_64/bin:$PATH
export LIBRARY_PATH=$HOME/ldc2-1.7.0-linux-x86_64/lib
```
```sh
git clone --recursive https://github.com/biod/sambamba.git
cd sambamba
git clone https://github.com/dlang/undeaD
make sambamba-ldmd2-64
make
```
Installing LDC only means unpacking an archive and setting some
environmental variables, e.g. unpacking into `$HOME`:
To build a debug release run
```sh
cd
wget https://github.com/ldc-developers/ldc/releases/download/v$ver/ldc2-$ver-linux-x86_64.tar.xz
tar xJf ldc2-$ver-linux-x86_64.tar.xz
export PATH=~/ldc2-$ver-linux-x86_64/bin/:$PATH
export LIBRARY_PATH=~/ldc2-$ver-linux-x86_64/lib/
make clean && make debug
```
To run the tests, fetch shunit2 from https://github.com/kward/shunit2 and put it on your `PATH`;
you can then run
```sh
make check
```
### GNU Guix
......@@ -209,6 +213,9 @@ guix package -i ldc
## Compiling for Mac OS X
Note: the Makefile does not currently work on Mac OS X. Would someone like to fix that,
using the Makefile.old version as a starting point? See also https://github.com/biod/sambamba/issues/338.
```sh
brew install ldc
git clone --recursive https://github.com/biod/sambamba.git
......@@ -227,9 +234,16 @@ documentation](https://github.com/biod/sambamba-dev-docs).
<a name="debug"></a>
# Debugging and troubleshooting
## Segfaults on certain Intel Xeons
Important note: some popular Xeon processors segfault under heavy
hyper threading - which Sambamba utilizes. Please read
[this](https://blog.cloudflare.com/however-improbable-the-story-of-a-processor-bug/)
when encountering seemingly random crashes.
## Dump core
In a crash sambamba can dump a core. To make this happen set
In a crash sambamba can dump a core file. To make this happen set
```sh
ulimit -c unlimited
......@@ -274,7 +288,8 @@ the tarball and run the contained install.sh script with TARGET
Run sambamba in gdb with
```
gdb --args ~/sambamba-test/sambamba-*/bin/sambamba view --throw-error
gdb -ex 'handle SIGUSR1 SIGUSR2 nostop noprint' \
--args ~/sambamba-test/sambamba-*/bin/sambamba view --throw-error
```
<a name="license"></a>
......
## ChangeLog v0.6.8-pre1 (20180207)
Minor release with a much faster binary. 10-20% faster than v0.6.6,
due to ldc and LLVM improvements. Fixes speed regression of v0.6.7 for
large files. See also [performance](https://github.com/biod/sambamba/blob/master/test/benchmark/stats.org)
+ Fixed Makefile for general use, see #332
+ Started benchmarking, see #283 and https://github.com/biod/sambamba/blob/master/test/benchmark/stats.org
+ Readded [Travis-ci support](https://travis-ci.org/biod/sambamba) for Linux (MacOS is disabled #338)
+ Updated BioD to latest https://github.com/biod/BioD/commit/5e56b2bb45324af2194b3339d298fd827c8003ae
+ Bug fixes:
* #328 Debug version: SAM output of CRAM file is populated with debug on pipe
* #331 Segmentation fault attempting to view header in json format
* #335 Intel Xeon bug may segfault Sambamba - this was tracked down to an Intel Xeon bug
+ Documentation updates
......@@ -23,13 +23,13 @@ struct RcPtr(T, alias Free) {
this.ptr = ptr;
debug {
payload_id = ++payload_counter;
stderr.writeln("Init ", T.stringof, "* #", payload_id);
// stderr.writeln("Init ", T.stringof, "* #", payload_id);
}
}
~this() {
debug {
stderr.writeln("Free ", T.stringof, "* #", payload_id);
// stderr.writeln("Free ", T.stringof, "* #", payload_id);
}
Free(ptr);
}
......@@ -51,7 +51,7 @@ struct RcPtr(T, alias Free) {
this(this)
{
static if (is(T == cram_slice)) {
debug writeln("COPIED #", data.payload_id + 1);
// debug stderr.writeln("COPIED #", data.payload_id + 1);
}
}
......@@ -159,7 +159,7 @@ struct CramContainerRange {
auto err_msg = "Failed to read container header";
while (true) {
// read container header
debug writeln("cram_read_container");
// debug stderr.writeln("cram_read_container");
auto ptr = nullChecked!cram_read_container(err_msg, _fd);
if (ptr is null) {
empty = true;
......@@ -240,8 +240,8 @@ class UndecodedSliceRange {
_container.curr_slice++;
auto err_msg = "Failure in cram_read_slice";
debug stderr.writeln("cram_read_slice (", _container.curr_slice,
"/", _container.max_slice, ")");
// debug stderr.writeln("cram_read_slice (", _container.curr_slice,
// "/", _container.max_slice, ")");
auto ptr = cram_read_slice(_fd);
if (ptr is null) {
throw new CramException(err_msg);
......
sambamba (0.6.7-3) UNRELEASED; urgency=medium
sambamba (0.6.8-pre3-1) UNRELEASED; urgency=medium
* New upstream pre-release
Closes: #907489
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.2.1
* Fix normalize(weight) (applied patch from upstream)
Should close #907489 but it does not work - wait for new upstream release
* Add build time test
* Allow pre-releases in watch file
-- Andreas Tille <tille@debian.org> Tue, 28 Aug 2018 20:07:55 +0200
-- Andreas Tille <tille@debian.org> Sun, 23 Sep 2018 07:42:58 +0200
sambamba (0.6.7-2) unstable; urgency=medium
......
......@@ -10,7 +10,7 @@ Subject: [PATCH] Add Meson build file
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,135 @@
@@ -0,0 +1,138 @@
+project('Sambamba', 'd')
+
+project_version = '0.6.6'
......@@ -31,10 +31,13 @@ Subject: [PATCH] Add Meson build file
+ 'sambamba/flagstat.d',
+ 'sambamba/index.d',
+ 'sambamba/markdup.d',
+ 'sambamba/markdup2.d',
+ 'sambamba/merge.d',
+ 'sambamba/pileup.d',
+ 'sambamba/slice.d',
+ 'sambamba/sort.d',
+ 'sambamba/subsample.d',
+ 'sambamba/validate.d',
+ 'sambamba/utils/common/bed.d',
+ 'sambamba/utils/common/file.d',
+ 'sambamba/utils/common/filtering.d',
......@@ -114,7 +117,7 @@ Subject: [PATCH] Add Meson build file
+# Targets
+#
+sambamba_exe = executable('sambamba',
+ ['main.d',
+ ['sambamba/main.d',
+ sambamba_src,
+ utils_src,
+ cram_src,
......
From: Pjotr Prins <pjotr.public01@thebird.nl>
Origin: https://github.com/biod/sambamba/commit/cb170d641c21f5aabeb04cedab3ced5b7262d007
Date: Fri, 27 Jul 2018 23:28:29 +0000
Bug-Debian: https://bugs.debian.org/907489
Subject: [PATCH] Fixes normalize(weight)
/gnu/store/4snsi4vg06bdfi6qhdjfbhss16kvzxj7-ldc-1.10.0/include/d/std/numeric.d(1845):
Error: read-modify-write operations are not allowed for shared variables. Use core.atomic.atomicOp!"+="(s, e) instead.
---
Makefile | 2 +-
Makefile.guix | 6 +++---
sambamba/merge.d | 8 ++++++--
3 files changed, 10 insertions(+), 6 deletions(-)
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
D_COMPILER=dmd
-D_FLAGS=--compiler=dmd -IBioD -IundeaD/src -g -d#-O -release -inline # -version=serial
+D_FLAGS=--compiler=dmd -I../BioD -IBioD -IundeaD/src -g -d#-O -release -inline # -version=serial
LDMD=ldmd2
STATIC_LIB_PATH=-Lhtslib -Llz4/lib
--- a/sambamba/merge.d
+++ b/sambamba/merge.d
@@ -1,6 +1,7 @@
/*
This file is part of Sambamba.
Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com>
+ Copyright (C) 2012-2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -403,8 +404,11 @@ int merge_main(string[] args) {
alias ReturnType!(BamReader.readsWithProgress!withoutOffsets) AlignmentRangePB;
auto alignmentranges_with_file_ids = new Tuple!(AlignmentRangePB, size_t)[files.length];
- auto weights = cast(shared)array(map!(pipe!(getSize, to!float))(filenames));
- normalize(cast()weights);
+ // auto weights = cast(shared)array(map!(pipe!(getSize, to!float))(filenames));
+ auto weights1 = array(map!(pipe!(getSize, to!float))(filenames));
+ normalize(weights1);
+ // auto weights = cast(shared)weights1;
+ immutable weights = cast(immutable)weights1;
foreach (i; 0 .. files.length) {
alignmentranges_with_file_ids[i] = tuple(
01_add_meson.patch
disable-assert.patch
03_fix_normalize.patch
version=4
https://github.com/lomereiter/sambamba/releases .*/archive/v(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
https://github.com/lomereiter/sambamba/releases .*/archive/v(\d[\d.pre-]+)@ARCHIVE_EXT@
* Sambamba new design
** Introduction
Because of its great multi-core performance Sambamba has served over
five years in sequencing centers [[https://groups.google.com/d/msg/sambamba-discussion/fIgrrUa441o/XG7Rt3dFAQAJ][around the world]]. Here we start on a
new design (sambamba2) that should improve performance and, perhaps
more importantly, make the building blocks more composable. D has
proven to be a great language for multi-core performance and code
clarity, so we are happy to build on its latest language features. For
example, see this [[http://forum.dlang.org/thread/gvtjhpxdqpboppoodmxm@forum.dlang.org][discussion]] on streaming data, which, if you read it
carefully, suggests we should use multiple implementations for file
access.
Here we document some of the choices we are making for the new
design. Starting with markdup2, the new version is a prototype for the new
sambamba architecture using more canonical D language features,
including immutable and improved laziness and a more functional
programming style. It should provide improved performance and minimize
RAM use, as well as better composability. Also we are preparing it for
CRAM input.
Another point to consider is chunking and built-in parallelism which
may give different results on different hardware platforms. When an
algorithm allows for it and there is enough RAM we should allow for
chunking the file and running chunks in parallel. Both current sambamba
and samtools provide a taskpool, but have few options for tuning
for hardware variations. Other hardware considerations are providing
the latest LLVM support for just in time (JIT) compilation and GPUs.
** Streaming input data
Every input file should be on its own streaming thread. If there is
only one input file, it should be possible to use a stdin pipe. When
using a pipe we will pass reads in an uncompressed format (it makes no
sense to compress and decompress in a pipe). This means we should be
able to do something like this on a sorted BAM file:
#+BEGIN_SRC bash
sambamba unpack in.bam | sambamba markdup2 |sambamba pack -f bam > out.bam
#+END_SRC
Because of the pipes these tools can run in parallel.
** Streaming output data
The main thread is the output thread and writes data to stdout, or
optionally a file. Compression may be handed out to other threads.
** Composability
Composability happens at two levels. First at the tool level. By
providing support for Unix pipes we'll make sambamba easier to plug
into other solutions. In particular we are interested in providing
support for other programming languages, such as Python and Ruby.
The second composability level is within the code base. It should be
easy to plug in different file readers, for example. Using a more
functional programming style and getting rid of (deprecated)
std.stream should help there.
** Tooling
*** Metrics
To get the best performance having metrics is extremely
important. Luckily LLVM provides great tooling for metrics and we
should use that.
*** Logging
Because of the pipes it is crucial that sambamba gives clear error messages
and is capable of writing to a log file.
{
"name": "sambamba",
"targetType": "none",
"authors": [
"pjotrp"
],
"dependencies": {
"undead": "~>1.0.9"
},
"description": "Sambamba",
"copyright": "Copyright © 2012-2017, Artem Tarasov and Pjotr Prins",
"license": "GPL3"
}
......@@ -3,4 +3,4 @@
# download shunit2 in order to run tests:
# curl -L "https://dl.dropboxusercontent.com/u/7916095/shunit2-2.0.3.tgz" | tar zx --overwrite
./test_suite.sh | tee /dev/stderr | grep -q 'success rate: 100%'
./test/test_suite.sh | tee /dev/stderr | grep -q 'success rate: 100%'
/*
This file is part of Sambamba.
Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com>
Copyright (C) 2012-2017 Artem Tarasov <lomereiter@gmail.com>
Copyright (C) 2012-2018 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -17,16 +18,22 @@
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
import sambamba.view;
import std.experimental.logger;
import sambamba.depth;
import sambamba.index;
import sambamba.merge;
import sambamba.sort;
import sambamba.fixbins;
import sambamba.flagstat;
import sambamba.slice;
import sambamba.markdup;
import sambamba.depth;
import sambamba.markdup2;
import sambamba.merge;
import sambamba.pileup;
import sambamba.fixbins;
import sambamba.sort;
import sambamba.slice;
import sambamba.subsample;
import sambamba.validate;
import sambamba.view;
import sambamba.utils.common.ldc_gc_workaround;
......@@ -37,32 +44,53 @@ import utils.ldc_version_info_ : LDC_VERSION_STRING, DMD_VERSION_STRING, LLVM_VE
import std.stdio;
// Write the sambamba banner and the command overview to stderr.
// Takes no arguments and returns nothing; all output goes to stderr.
// NOTE(review): this span appears to come from a diff rendering that
// interleaves the older line-by-line usage text with the newer multi-line
// string literal. Both sets of statements are kept exactly as found;
// confirm which one is current before relying on the combined output.
void printUsage() {
stderr.writeln("sambamba " ~ VERSION);
stderr.writeln();
stderr.writeln("Usage: sambamba [command] [args...]");
stderr.writeln();
stderr.writeln(" Available commands: 'view', 'index', 'merge', 'sort',");
stderr.writeln(" 'flagstat', 'slice', 'markdup', 'depth', 'mpileup'");
stderr.writeln(" To get help on a particular command, just call it without args.");
stderr.writeln();
stderr.writeln("Leave bug reports and feature requests at");
stderr.writeln("https://github.com/lomereiter/sambamba/issues");
stderr.writeln();
// The literal below spans several source lines; its embedded newlines are
// part of the emitted text, so its contents must not be re-indented.
stderr.writeln("
Usage: sambamba [command] [args...]
Available commands:
view view contents and convert from one format
to another (SAM/BAM/CRAM/JSON/UNPACK)
index build index (BAI)
merge merge files (BAM)
sort sort file (BAM)
slice slice file (BAM using BED)
markdup mark or remove duplicates (BAM)
subsample subsample (BAM)
flagstat output statistics (BAM)
depth output statistics (BAM)
validate simple validator (BAM)
Work in progress (WIP):
markdup2 mark or remove duplicates v2 (BAM)
No longer recommended:
mpileup parallel execution of samtools (BAM)
To get help on a particular command, call it without args.
For bug reports and feature requests see
https://github.com/biod/
");
}
// Write the sambamba version together with the compiler toolchain versions
// (LDC, DMD, LLVM and the bootstrap compiler) to stderr.
// Takes no arguments and returns nothing; all output goes to stderr.
// NOTE(review): this span appears to come from a diff rendering that
// interleaves the older multi-line report with the newer compact one.
// Both sets of statements are kept as found; confirm which one is current.
void printVersion() {
stderr.writeln("sambamba " ~ VERSION);
stderr.writeln();
stderr.writeln("This version was built with:");
stderr.writeln(" LDC " ~ LDC_VERSION_STRING);
stderr.writeln(" using DMD " ~ DMD_VERSION_STRING);
stderr.writeln(" using LLVM " ~ LLVM_VERSION_STRING);
stderr.writeln(" bootstrapped with " ~ BOOTSTRAP_VERSION_STRING);
stderr.writeln("sambamba " ~ VERSION ~ " by Artem Tarasov and Pjotr Prins (C) 2012-2017");
// Every label ends in a space so it is separated from its version string;
// the LLVM label previously lacked it and ran into the version number.
stderr.writeln(" LDC " ~ LDC_VERSION_STRING ~ " / DMD " ~ DMD_VERSION_STRING ~
" / LLVM " ~ LLVM_VERSION_STRING ~ " / bootstrap " ~ BOOTSTRAP_VERSION_STRING);
stderr.writeln();
}
int main(string[] args) {
globalLogLevel(LogLevel.info);
printVersion();
if (args.length == 1) {
printUsage();
return 1;
......@@ -77,9 +105,12 @@ int main(string[] args) {
case "sort": return sort_main(_args);
case "flagstat": return flagstat_main(_args);
case "slice": return slice_main(_args);
case "markdup": return markdup_main(_args);
case "markdup": return sambamba.markdup.markdup_main(_args);
case "markdup2": return sambamba.markdup2.markdup_main(_args);
case "subsample": return subsample_main(_args);
case "depth": return depth_main(_args);
case "mpileup": return pileup_main(_args);
case "validate": return validate_main(_args);
// hidden commands
case "fixbins": return fixbins_main(_args);
......