Skip to content
Commits on Source (9)
# version format.
# you can use {branch} name in version format too
# version: 1.0.{build}-{branch}
version: 'vers.{build}'
# branches to build
branches:
# Blacklist
except:
- gh-pages
# Do not build on tags (GitHub and BitBucket)
skip_tags: true
# Skipping commits affecting specific files (GitHub only). More details here: /docs/appveyor-yml
#skip_commits:
# files:
# - docs/*
# - '**/*.html'
# We use Mingw/Msys, so use pacman for installs
install:
- set HOME=.
- set MSYSTEM=MINGW64
- set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
- set MINGWPREFIX=x86_64-w64-mingw32
- "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\""
build_script:
- set HOME=.
- set MSYSTEM=MINGW64
- set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
- "sh -lc \"aclocal && autoheader && autoconf && ./configure && make -j2\""
#build_script:
# - make
test_script:
- "sh -lc \"make test\""
......@@ -149,6 +149,12 @@ various features and specify further optional external requirements:
by default. It can be disabled with --disable-lzma, but be aware
that not all CRAM files may be possible to decode.
--with-libdeflate
Libdeflate is a heavily optimized library for DEFLATE-based compression
and decompression. It also includes a fast crc32 implementation.
By default, ./configure will probe for libdeflate and use it if
available. To prevent this, use --without-libdeflate.
The configure script also accepts the usual options and environment variables
for tuning installation locations and compilers: type './configure --help'
for details. For example,
......@@ -158,6 +164,16 @@ for details. For example,
would specify that HTSlib is to be built with icc and installed into bin,
lib, etc subdirectories under /opt/icc-compiled.
If dependencies have been installed in non-standard locations (i.e. not on
the normal include and library search paths) then the CPPFLAGS and LDFLAGS
environment variables can be used to set the options needed to find them.
For example, NetBSD users may use:
./configure CPPFLAGS=-I/usr/pkg/include \
LDFLAGS='-L/usr/pkg/lib -Wl,-R/usr/pkg/lib'
to allow compiling and linking against dependencies installed via the ports
collection.
Installation Locations
======================
......
......@@ -72,6 +72,7 @@ BUILT_TEST_PROGRAMS = \
test/hfile \
test/sam \
test/test_bgzf \
test/test_realn \
test/test-regidx \
test/test_view \
test/test-vcf-api \
......@@ -377,6 +378,9 @@ test/sam: test/sam.o libhts.a
test/test_bgzf: test/test_bgzf.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread
test/test_realn: test/test_realn.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread
test/test-regidx: test/test-regidx.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread
......@@ -400,6 +404,7 @@ test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h)
test/hfile.o: test/hfile.c config.h $(htslib_hfile_h) $(htslib_hts_defs_h)
test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h)
test/test_bgzf.o: test/test_bgzf.c $(htslib_bgzf_h) $(htslib_hfile_h)
test/test-realn.o: test/test_realn.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h)
test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h)
test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h)
......@@ -434,7 +439,7 @@ install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB
if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi
$(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib
$(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a
$(INSTALL_MAN) htsfile.1 tabix.1 $(DESTDIR)$(man1dir)
$(INSTALL_MAN) bgzip.1 htsfile.1 tabix.1 $(DESTDIR)$(man1dir)
$(INSTALL_MAN) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir)
installdirs:
......
Noteworthy changes in release 1.8 (3rd April 2018)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* The URL to get sequences from the EBI reference server has been changed
to https://. This is because the EBI no longer serve sequences via
plain HTTP - requests to the http:// endpoint just get redirected.
HTSlib needs to be linked against libcurl to download https:// URLs,
so CRAM users who want to get references from the EBI will need to
run configure and ensure libcurl support is enabled using the
--enable-libcurl option.
* Added libdeflate as a build option for alternative faster compression and
decompression. Results vary by CPU but compression should be twice as fast
and decompression faster.
* It is now possible to set the compression level in bgzip. (#675; thanks
to Nathan Weeks).
* bgzip now gets its own manual page.
* CRAM encoding now stored MD and NM tags verbatim where the reference
contains 'N' characters, to work around ambiguities in the SAM
specification (samtools #717/762).
Also added "store_md" and "store_nm" cram-options for forcing these
tags to be stored at all locations. This is best when combined with
a subsequent decode_md=0 option while reading CRAM.
* Multiple CRAM bug fixes, including a fix to free and the subsequent reuse of
references with `-T ref.fa`. (#654; reported by Chris Saunders)
* CRAM multi-threading bugs fixed: don't try to call flush on reading;
processing of multiple range queries; problems with multi-slice containers.
* Fixed crashes caused when decoding some cramtools produced CRAM files.
* Fixed a couple of minor rANS issues with handling invalid data.
* Fixed bug where probaln_glocal() tried to allocate far more memory than
needed when the query sequence was much longer than the reference. This
caused crashes in samtools and bcftools mpileup when used on data with very
long reads. (#572, problem reported by Felix Bemm via minimap2).
* sam_prop_realn() now returns -1 (the same value as for unmapped reads)
on reads that do not include at least one 'M', 'X' or '=' CIGAR operator,
and no longer adds BQ or ZQ tags. BAQ adjustments are only made to bases
covered by these operators so there is no point in trying to align
reads that do not have them. (#572)
Noteworthy changes in release 1.7 (26th January 2018)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
......@@ -627,12 +627,18 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int mi
return nret;
}
void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
{
//vcf_buf is allocated only in bcf_sr_sort_next
//So, a call to bcf_sr_add_reader() followed immediately by bcf_sr_remove_reader()
//would cause the program to crash in this segment
if (srt->vcf_buf)
{
free(srt->vcf_buf[i].rec);
if ( i+1 < srt->nsr )
memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
}
}
sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
{
if ( !srt ) return calloc(1,sizeof(sr_sort_t));
......
......@@ -35,6 +35,10 @@
#include <sys/types.h>
#include <inttypes.h>
#ifdef HAVE_LIBDEFLATE
#include <libdeflate.h>
#endif
#include "htslib/hts.h"
#include "htslib/bgzf.h"
#include "htslib/hfile.h"
......@@ -359,6 +363,64 @@ BGZF *bgzf_hopen(hFILE *hfp, const char *mode)
return fp;
}
#ifdef HAVE_LIBDEFLATE
int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int level)
{
if (slen == 0) {
// EOF block
if (*dlen < 28) return -1;
memcpy(_dst, "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", 28);
*dlen = 28;
return 0;
}
uint8_t *dst = (uint8_t*)_dst;
if (level == 0) {
// Uncompressed data
if (*dlen < slen+5 + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) return -1;
dst[BLOCK_HEADER_LENGTH] = 1; // BFINAL=1, BTYPE=00; see RFC1951
u16_to_le(slen, &dst[BLOCK_HEADER_LENGTH+1]); // length
u16_to_le(~slen, &dst[BLOCK_HEADER_LENGTH+3]); // ones-complement length
memcpy(dst + BLOCK_HEADER_LENGTH+5, src, slen);
*dlen = slen+5 + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
} else {
level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
// NB levels go up to 12 here.
struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
if (!z) return -1;
// Raw deflate
size_t clen =
libdeflate_deflate_compress(z, src, slen,
dst + BLOCK_HEADER_LENGTH,
*dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH);
if (clen <= 0) {
hts_log_error("Call to libdeflate_deflate_compress failed");
libdeflate_free_compressor(z);
return -1;
}
*dlen = clen + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
libdeflate_free_compressor(z);
}
// write the header
memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
// write the footer
uint32_t crc = libdeflate_crc32(0, src, slen);
packInt32((uint8_t*)&dst[*dlen - 8], crc);
packInt32((uint8_t*)&dst[*dlen - 4], slen);
return 0;
}
#else
int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int level)
{
uint32_t crc;
......@@ -395,6 +457,7 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le
packInt32((uint8_t*)&dst[*dlen - 4], slen);
return 0;
}
#endif // HAVE_LIBDEFLATE
static int bgzf_gzip_compress(BGZF *fp, void *_dst, size_t *dlen, const void *src, size_t slen, int level)
{
......@@ -438,6 +501,28 @@ static int deflate_block(BGZF *fp, int block_length)
return comp_size;
}
#ifdef HAVE_LIBDEFLATE
static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_t slen) {
struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
if (!z) {
hts_log_error("Call to libdeflate_alloc_decompressor failed");
return -1;
}
int ret = libdeflate_deflate_decompress(z, src, slen, dst, *dlen, dlen);
libdeflate_free_decompressor(z);
if (ret != LIBDEFLATE_SUCCESS) {
hts_log_error("Inflate operation failed: %d", ret);
return -1;
}
return 0;
}
#else
static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_t slen) {
z_stream zs;
zs.zalloc = NULL;
......@@ -467,6 +552,7 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, const uint8_t *src, size_
*dlen = *dlen - zs.avail_out;
return 0;
}
#endif // HAVE_LIBDEFLATE
// Inflate the block in fp->compressed_block into fp->uncompressed_block
static int inflate_block(BGZF* fp, int block_length)
......@@ -482,7 +568,11 @@ static int inflate_block(BGZF* fp, int block_length)
// Check CRC of uncompressed block matches the gzip header.
// NB: we may wish to switch out the zlib crc32 for something more performant.
// See PR#361 and issue#467
#ifdef HAVE_LIBDEFLATE
uint32_t c1 = libdeflate_crc32(0L, (unsigned char *)fp->uncompressed_block, dlen);
#else
uint32_t c1 = crc32(0L, (unsigned char *)fp->uncompressed_block, dlen);
#endif
uint32_t c2 = le_to_u32((uint8_t *)fp->compressed_block + block_length-8);
if (c1 != c2) {
fp->errcode |= BGZF_ERR_CRC;
......@@ -1160,7 +1250,7 @@ restart:
pthread_cond_signal(&mt->command_c);
pthread_mutex_unlock(&mt->command_m);
hts_tpool_process_destroy(mt->out_queue);
pthread_exit(NULL);
return NULL;
default:
break;
......@@ -1182,7 +1272,7 @@ restart:
// We tear down the multi-threaded decoder and revert to the old code.
hts_tpool_dispatch(mt->pool, mt->out_queue, bgzf_nul_func, j);
hts_tpool_process_ref_decr(mt->out_queue);
pthread_exit(&j->errcode);
return &j->errcode;
}
// Dispatch an empty block so EOF is spotted.
......@@ -1193,7 +1283,7 @@ restart:
hts_tpool_dispatch(mt->pool, mt->out_queue, bgzf_nul_func, j);
if (j->errcode != 0) {
hts_tpool_process_destroy(mt->out_queue);
pthread_exit(&j->errcode);
return &j->errcode;
}
// We hit EOF so can stop reading, but we may get a subsequent
......@@ -1224,10 +1314,9 @@ restart:
pthread_cond_signal(&mt->command_c);
pthread_mutex_unlock(&mt->command_m);
hts_tpool_process_destroy(mt->out_queue);
pthread_exit(NULL);
return NULL;
}
}
return NULL;
}
int bgzf_thread_pool(BGZF *fp, hts_tpool *pool, int qsize) {
......@@ -1452,7 +1541,7 @@ ssize_t bgzf_block_write(BGZF *fp, const void *data, size_t length)
uint64_t ublock_size; // amount of uncompressed data to be fed into next block
while (remaining > 0) {
current_block = fp->idx->moffs - fp->idx->noffs;
ublock_size = fp->idx->offs[current_block+1].uaddr-fp->idx->offs[current_block].uaddr;
ublock_size = current_block + 1 < fp->idx->moffs ? fp->idx->offs[current_block+1].uaddr-fp->idx->offs[current_block].uaddr : BGZF_MAX_BLOCK_SIZE;
uint8_t* buffer = (uint8_t*)fp->uncompressed_block;
int copy_length = ublock_size - fp->block_offset;
if (copy_length > remaining) copy_length = remaining;
......@@ -1462,6 +1551,7 @@ ssize_t bgzf_block_write(BGZF *fp, const void *data, size_t length)
remaining -= copy_length;
if (fp->block_offset == ublock_size) {
if (lazy_flush(fp) != 0) return -1;
if (fp->idx->noffs > 0)
fp->idx->noffs--; // decrement noffs to track the blocks
}
}
......
.TH bgzip 1 "3 April 2018" "htslib-1.8" "Bioinformatics tools"
.SH NAME
.PP
bgzip \- Block compression/decompression utility
.\"
.\" Copyright (C) 2009-2011 Broad Institute.
.\" Copyright (C) 2018 Genome Research Limited.
.\"
.\" Author: Heng Li <lh3@sanger.ac.uk>
.\"
.\" Permission is hereby granted, free of charge, to any person obtaining a
.\" copy of this software and associated documentation files (the "Software"),
.\" to deal in the Software without restriction, including without limitation
.\" the rights to use, copy, modify, merge, publish, distribute, sublicense,
.\" and/or sell copies of the Software, and to permit persons to whom the
.\" Software is furnished to do so, subject to the following conditions:
.\"
.\" The above copyright notice and this permission notice shall be included in
.\" all copies or substantial portions of the Software.
.\"
.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
.\" IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
.\" FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
.\" THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
.\" LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
.\" DEALINGS IN THE SOFTWARE.
.\"
.
.\" For code blocks and examples (cf groff's Ultrix-specific man macros)
.de EX
. in +\\$1
. nf
. ft CR
..
.de EE
. ft
. fi
. in
..
.SH SYNOPSIS
.PP
.B bgzip
.RB [ -cdfhir ]
.RB [ -b
.IR virtualOffset ]
.RB [ -I
.IR index_name ]
.RB [ -l
.IR compression_level ]
.RB [ -s
.IR size ]
.RB [ -@
.IR threads ]
.RI [ file ]
.PP
.SH DESCRIPTION
.PP
Bgzip compresses files in a similar manner to, and compatible with, gzip(1).
The file is compressed into a series of small (less than 64K) 'BGZF' blocks.
This allows indexes to be built against the compressed file and used to
retrieve portions of the data without having to decompress the entire file.
If no files are specified on the command line, bgzip will compress (or
decompress if the -d option is used) standard input to standard output.
If a file is specified, it will be compressed (or decompressed with -d).
If the -c option is used, the result will be written to standard output,
otherwise when compressing bgzip will write to a new file with a .gz
suffix and remove the original. When decompressing the input file must
have a .gz suffix, which will be removed to make the output name. Again
after decompression completes the input file will be removed.
.SH OPTIONS
.TP 10
.BI "-b, --offset " INT
Decompress to standard output from virtual file position (0-based uncompressed
offset).
Implies -c and -d.
.TP
.B "-c, --stdout"
Write to standard output, keep original files unchanged.
.TP
.B "-d, --decompress"
Decompress.
.TP
.B "-f, --force"
Overwrite files without asking.
.TP
.B "-h, --help"
Displays a help message.
.TP
.B "-i, --index"
Create a BGZF index while compressing.
Unless the -I option is used, this will have the name of the compressed
file with .gzi appended to it.
.TP
.BI "-I, --index-name " FILE
Index file name.
.TP
.BI "-l, --compress-level " INT
Compression level to use when compressing.
From 0 to 9, or -1 for the default level set by the compression library. [-1]
.TP
.B "-r, --reindex"
Rebuild the index on an existing compressed file.
.TP
.B "-g, --rebgzip"
Try to use an existing index to create a compressed file with matching
block offsets.
Note that this assumes that the same compression library and level are in use
as when making the original file.
Don't use it unless you know what you're doing.
.TP
.BI "-s, --size " INT
Decompress INT bytes (uncompressed size) to standard output.
Implies -c.
.TP
.BI "-@, --threads " INT
Number of threads to use [1].
.PP
.SH BGZF FORMAT
The BGZF format written by bgzip is described in the SAM format specification
available from http://samtools.github.io/hts-specs/SAMv1.pdf.
It makes use of a gzip feature which allows compressed files to be
concatenated.
The input data is divided into blocks which are no larger than 64 kilobytes
both before and after compression (including compression headers).
Each block is compressed into a gzip file.
The gzip header includes an extra sub-field with identifier 'BC' and the length
of the compressed block, including all headers.
.SH GZI FORMAT
The index format is a binary file listing pairs of compressed and
uncompressed offsets in a BGZF file.
Each compressed offset points to the start of a BGZF block.
The uncompressed offset is the corresponding location in the uncompressed
data stream.
All values are stored as little-endian 64-bit unsigned integers.
The file contents are:
.EX 4
uint64_t number_entries
.EE
followed by number_entries pairs of:
.EX 4
uint64_t compressed_offset
uint64_t uncompressed_offset
.EE
.SH EXAMPLES
.EX 4
# Compress stdin to stdout
bgzip < /usr/share/dict/words > /tmp/words.gz
# Make a .gzi index
bgzip -r /tmp/words.gz
# Extract part of the data using the index
bgzip -b 367635 -s 4 /tmp/words.gz
# Uncompress the whole file, removing the compressed copy
bgzip -d /tmp/words.gz
.EE
.SH AUTHOR
.PP
The BGZF library was originally implemented by Bob Handsaker and modified
by Heng Li for remote file access and in-memory caching.
.SH SEE ALSO
.PP
.BR gzip (1), tabix (1)
......@@ -80,6 +80,7 @@ static int bgzip_main_usage(void)
fprintf(stderr, " -h, --help give this help\n");
fprintf(stderr, " -i, --index compress and create BGZF index\n");
fprintf(stderr, " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n");
fprintf(stderr, " -l, --compress-level INT Compression level to use when compressing; 0 to 9, or -1 for default [-1]\n");
fprintf(stderr, " -r, --reindex (re)index compressed file\n");
fprintf(stderr, " -g, --rebgzip use an index file to bgzip a file\n");
fprintf(stderr, " -s, --size INT decompress INT bytes (uncompressed size)\n");
......@@ -90,7 +91,7 @@ static int bgzip_main_usage(void)
int main(int argc, char **argv)
{
int c, compress, pstdout, is_forced, index = 0, rebgzip = 0, reindex = 0;
int c, compress, compress_level = -1, pstdout, is_forced, index = 0, rebgzip = 0, reindex = 0;
BGZF *fp;
void *buffer;
long start, end, size;
......@@ -106,6 +107,7 @@ int main(int argc, char **argv)
{"force", no_argument, NULL, 'f'},
{"index", no_argument, NULL, 'i'},
{"index-name", required_argument, NULL, 'I'},
{"compress-level", required_argument, NULL, 'l'},
{"reindex", no_argument, NULL, 'r'},
{"rebgzip",no_argument,NULL,'g'},
{"size", required_argument, NULL, 's'},
......@@ -115,7 +117,7 @@ int main(int argc, char **argv)
};
compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:gr",loptions,NULL)) >= 0){
while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:gr",loptions,NULL)) >= 0){
switch(c){
case 'd': compress = 0; break;
case 'c': pstdout = 1; break;
......@@ -124,6 +126,7 @@ int main(int argc, char **argv)
case 'f': is_forced = 1; break;
case 'i': index = 1; break;
case 'I': index_fname = optarg; break;
case 'l': compress_level = atol(optarg); break;
case 'g': rebgzip = 1; break;
case 'r': reindex = 1; compress = 0; break;
case '@': threads = atoi(optarg); break;
......@@ -144,6 +147,17 @@ int main(int argc, char **argv)
if (compress == 1) {
struct stat sbuf;
int f_src = fileno(stdin);
char out_mode[3] = "w\0";
char out_mode_exclusive[4] = "wx\0";
if (compress_level < -1 || compress_level > 9) {
fprintf(stderr, "[bgzip] Invalid compress-level: %d\n", compress_level);
return 1;
}
if (compress_level >= 0) {
out_mode[1] = compress_level + '0';
out_mode_exclusive[2] = compress_level + '0';
}
if ( argc>optind )
{
......@@ -159,15 +173,15 @@ int main(int argc, char **argv)
}
if (pstdout)
fp = bgzf_open("-", "w");
fp = bgzf_open("-", out_mode);
else
{
char *name = malloc(strlen(argv[optind]) + 5);
strcpy(name, argv[optind]);
strcat(name, ".gz");
fp = bgzf_open(name, is_forced? "w" : "wx");
fp = bgzf_open(name, is_forced? out_mode : out_mode_exclusive);
if (fp == NULL && errno == EEXIST && confirm_overwrite(name))
fp = bgzf_open(name, "w");
fp = bgzf_open(name, out_mode);
if (fp == NULL) {
fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
free(name);
......@@ -184,7 +198,7 @@ int main(int argc, char **argv)
return 1;
}
else
fp = bgzf_open("-", "w");
fp = bgzf_open("-", out_mode);
if ( index && rebgzip )
{
......
......@@ -64,9 +64,9 @@ m4_ifdef([PKG_PROG_PKG_CONFIG], [PKG_PROG_PKG_CONFIG], [PKG_CONFIG=""])
need_crypto=no
pc_requires=
static_LDFLAGS=
static_LIBS='-lz -lm'
private_LIBS=
static_LDFLAGS=$LDFLAGS
static_LIBS='-lpthread -lz -lm'
private_LIBS=$LDFLAGS
AC_ARG_ENABLE([bz2],
[AS_HELP_STRING([--disable-bz2],
......@@ -97,6 +97,11 @@ AC_ARG_ENABLE([plugins],
[], [enable_plugins=no])
AC_SUBST(enable_plugins)
AC_ARG_WITH([libdeflate],
[AS_HELP_STRING([--with-libdeflate],
[use libdeflate for faster crc and deflate algorithms])],
[], [with_libdeflate=check])
AC_ARG_WITH([plugin-dir],
[AS_HELP_STRING([--with-plugin-dir=DIR],
[plugin installation location [LIBEXECDIR/htslib]])],
......@@ -257,6 +262,26 @@ produced elsewhere unreadable) or resolve this error to build HTSlib.])
static_LIBS="$static_LIBS -llzma"
fi
AS_IF([test "x$with_libdeflate" != "xno"],
[libdeflate=ok
AC_CHECK_HEADER([libdeflate.h],[],[libdeflate='missing header'],[;])
AC_CHECK_LIB([deflate], [libdeflate_deflate_compress],[],[libdeflate='missing library'])
AS_IF([test "$libdeflate" = "ok"],
[AC_DEFINE([HAVE_LIBDEFLATE], 1, [Define if libdeflate is available.])
private_LIBS="$private_LIBS -ldeflate"
static_LIBS="$static_LIBS -ldeflate"],
[AS_IF([test "x$with_libdeflate" != "xcheck"],
[AC_MSG_ERROR([libdeflate development files not found: $libdeflate
You requested libdeflate, but do not have the required header / library
files. The source for libdeflate is available from
<https://github.com/ebiggers/libdeflate>. You may have to adjust
search paths in CPPFLAGS and/or LDFLAGS if the header and library
are not currently on them.
Either configure with --without-libdeflate or resolve this error to build
HTSlib.])])])])
libcurl=disabled
if test "$enable_libcurl" != no; then
AC_CHECK_LIB([curl], [curl_easy_pause],
......
......@@ -311,10 +311,6 @@ static char *cram_extract_block(cram_block *b, int size) {
* ---------------------------------------------------------------------------
* EXTERNAL
*/
static void cram_external_decode_reset(cram_codec *c) {
c->external.b = NULL;
}
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
cram_block *in, char *out, int *out_size) {
int l;
......@@ -322,10 +318,7 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c,
cram_block *b;
/* Find the external block */
if (!(b = c->external.b)){
b = cram_get_block_by_id(slice, c->external.content_id);
c->external.b = b;
}
if (!b)
return *out_size?-1:0;
......@@ -345,10 +338,7 @@ int cram_external_decode_char(cram_slice *slice, cram_codec *c,
cram_block *b;
/* Find the external block */
if (!(b = c->external.b)){
b = cram_get_block_by_id(slice, c->external.content_id);
c->external.b = b;
}
if (!b)
return *out_size?-1:0;
......@@ -369,10 +359,7 @@ static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
cram_block *b = NULL;
/* Find the external block */
if (!(b = c->external.b)){
b = cram_get_block_by_id(slice, c->external.content_id);
c->external.b = b;
}
if (!b)
return *out_size?-1:0;
......@@ -416,8 +403,6 @@ cram_codec *cram_external_decode_init(char *data, int size,
goto malformed;
c->external.type = option;
c->external.b = NULL;
c->reset = cram_external_decode_reset;
return c;
......@@ -495,8 +480,6 @@ cram_codec *cram_external_encode_init(cram_stats *st,
* ---------------------------------------------------------------------------
* BETA
*/
void cram_nop_decode_reset(cram_codec *c) {}
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
int32_t *out_i = (int32_t *)out;
int i, n = *out_size;
......@@ -575,8 +558,6 @@ cram_codec *cram_beta_decode_init(char *data, int size,
return NULL;
}
c->reset = cram_nop_decode_reset;
return c;
}
......@@ -769,8 +750,6 @@ cram_codec *cram_subexp_decode_init(char *data, int size,
return NULL;
}
c->reset = cram_nop_decode_reset;
return c;
}
......@@ -833,8 +812,6 @@ cram_codec *cram_gamma_decode_init(char *data, int size,
if (cp - data != size)
goto malformed;
c->reset = cram_nop_decode_reset;
return c;
malformed:
......@@ -1031,8 +1008,6 @@ cram_codec *cram_huffman_decode_init(char *data, int size,
if (i != ncodes)
goto malformed;
h->reset = cram_nop_decode_reset;
if (ncodes == 0) {
/* NULL huffman stream. Ensure it returns an error if
anything tries to use it. */
......@@ -1472,11 +1447,6 @@ void cram_byte_array_len_decode_free(cram_codec *c) {
free(c);
}
static void cram_byte_array_len_decode_reset(cram_codec *c) {
c->byte_array_len.len_codec->reset(c->byte_array_len.len_codec);
c->byte_array_len.val_codec->reset(c->byte_array_len.val_codec);
}
cram_codec *cram_byte_array_len_decode_init(char *data, int size,
enum cram_external_type option,
int version) {
......@@ -1517,8 +1487,6 @@ cram_codec *cram_byte_array_len_decode_init(char *data, int size,
if (cp - data != size)
goto malformed;
c->reset = cram_byte_array_len_decode_reset;
return c;
malformed:
......@@ -1617,20 +1585,13 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
* ---------------------------------------------------------------------------
* BYTE_ARRAY_STOP
*/
static void cram_byte_array_stop_decode_reset(cram_codec *c) {
c->byte_array_stop.b = NULL;
}
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
cram_block *in, char *out,
int *out_size) {
char *cp, ch;
cram_block *b = NULL;
if (!(b = c->byte_array_stop.b)){
b = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
c->byte_array_stop.b = b;
}
if (!b)
return *out_size?-1:0;
......@@ -1668,10 +1629,7 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
char *cp, *out_cp, *cp_end;
char stop;
if (!(b = c->byte_array_stop.b)){
b = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
c->byte_array_stop.b = b;
}
if (!b)
return *out_size?-1:0;
......@@ -1746,9 +1704,6 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
if ((char *)cp - data != size)
goto malformed;
c->byte_array_stop.b = NULL;
c->reset = cram_byte_array_stop_decode_reset;
return c;
malformed:
......
......@@ -84,7 +84,6 @@ typedef struct {
typedef struct {
int32_t content_id;
enum cram_external_type type;
cram_block *b;
} cram_external_decoder;
typedef struct {
......@@ -95,7 +94,6 @@ typedef struct {
typedef struct {
unsigned char stop;
int32_t content_id;
cram_block *b;
} cram_byte_array_stop_decoder;
typedef struct {
......@@ -110,6 +108,9 @@ typedef struct {
/*
* A generic codec structure.
*/
#ifdef __SUNPRO_C
# pragma error_messages(off, E_ANONYMOUS_UNION_DECL)
#endif
typedef struct cram_codec {
enum cram_encoding codec;
cram_block *out;
......@@ -120,7 +121,6 @@ typedef struct cram_codec {
char *in, int in_size);
int (*store)(struct cram_codec *codec, cram_block *b, char *prefix,
int version);
void (*reset)(struct cram_codec *codec); // used between slices in a container
union {
cram_huffman_decoder huffman;
......@@ -138,6 +138,9 @@ typedef struct cram_codec {
cram_beta_decoder e_beta;
};
} cram_codec;
#ifdef __SUNPRO_C
# pragma error_messages(default, E_ANONYMOUS_UNION_DECL)
#endif
const char *cram_encoding2str(enum cram_encoding t);
......
This diff is collapsed.
......@@ -105,6 +105,11 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
SAM_hdr *hdr);
/*
* Drains and frees the decode read-queue for a multi-threaded reader.
*/
void cram_drain_rqueue(cram_fd *fd);
#ifdef __cplusplus
}
#endif
......
......@@ -1152,11 +1152,11 @@ static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
uint64_t i64;
struct {
int32_t e,c; // expected & observed counts.
};
} counts;
} u;
e = expected_template_count(b);
u.e = e; u.c = 1;
u.counts.e = e; u.counts.c = 1;
k = kh_put(m_s2u64, names, bam_name(b), &n);
if (n == -1)
......@@ -1165,13 +1165,13 @@ static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
if (n == 0) {
// not a new name
u.i64 = kh_val(names, k);
if (u.e != e) {
if (u.counts.e != e) {
// different expectation or already hit the max
//fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
kh_val(names, k) = 0;
} else {
u.c++;
if (u.e == u.c) {
u.counts.c++;
if (u.counts.e == u.counts.c) {
// Reached expected count.
kh_val(names, k) = -1;
} else {
......@@ -2072,7 +2072,8 @@ static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
* NULL on failure or no rg present (FIXME)
*/
static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
cram_slice *s, cram_record *cr) {
cram_slice *s, cram_record *cr,
int verbatim_NM, int verbatim_MD) {
char *aux, *orig, *rg = NULL;
int aux_size = bam_get_l_aux(b);
cram_block *td_b = c->comp_hdr->TD_blk;
......@@ -2095,7 +2096,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
// MD:Z
if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
while (*aux++);
continue;
}
......@@ -2103,7 +2104,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
// NM:i
if (aux[0] == 'N' && aux[1] == 'M') {
if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
switch(aux[2]) {
case 'A': case 'C': case 'c': aux+=4; break;
case 'S': case 's': aux+=5; break;
......@@ -2242,7 +2243,9 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
m->codec = c;
// Link to fd-global tag metrics
pthread_mutex_lock(&fd->metrics_lock);
m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
pthread_mutex_unlock(&fd->metrics_lock);
}
cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
......@@ -2514,6 +2517,14 @@ static int process_one_read(cram_fd *fd, cram_container *c,
char *cp, *rg;
char *ref, *seq, *qual;
// Any places with N in seq and/or reference can lead to ambiguous
// interpretation of the SAM NM:i tag. So we store these verbatim
// to ensure valid data round-trips the same regardless of who
// defines it as valid.
// Similarly when alignments go beyond end of the reference.
int verbatim_NM = fd->store_nm;
int verbatim_MD = fd->store_md;
// FIXME: multi-ref containers
ref = c->ref;
......@@ -2522,33 +2533,6 @@ static int process_one_read(cram_fd *fd, cram_container *c,
//fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
// Fields to resolve later
//cr->mate_line; // index to another cram_record
//cr->mate_flags; // MF
//cr->ntags; // TC
cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
if (CRAM_MAJOR_VERS(fd->version) == 1)
rg = cram_encode_aux_1_0(fd, b, c, s, cr);
else
rg = cram_encode_aux(fd, b, c, s, cr);
//cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b));
//cr->aux = DSTRING_LEN(s->aux_ds);
//dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size);
/* Read group, identified earlier */
if (rg) {
SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
cr->rg = brg ? brg->id : -1;
} else if (CRAM_MAJOR_VERS(fd->version) == 1) {
SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
assert(brg);
} else {
cr->rg = -1;
}
cram_stats_add(c->stats[DS_RG], cr->rg);
cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id);
cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]);
......@@ -2689,6 +2673,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
return -1;
}
for (l = 0; l < end; l++) {
if (rp[l] == 'N' && sp[l] == 'N')
verbatim_NM = verbatim_MD = 1;
if (rp[l] != sp[l]) {
if (!sp[l])
break;
......@@ -2737,6 +2723,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
}
} else {
/* off end of sequence or non-ref based output */
verbatim_NM = verbatim_MD = 1;
for (; l < cig_len && seq[spos]; l++, spos++) {
if (cram_add_base(fd, c, s, cr, spos,
seq[spos], qual[spos]))
......@@ -2746,6 +2733,7 @@ static int process_one_read(cram_fd *fd, cram_container *c,
apos += cig_len;
} else if (!cr->len) {
/* Seq "*" */
verbatim_NM = verbatim_MD = 1;
apos += cig_len;
spos += cig_len;
}
......@@ -2832,6 +2820,24 @@ static int process_one_read(cram_fd *fd, cram_container *c,
fake_qual = 0;
}
cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
if (CRAM_MAJOR_VERS(fd->version) == 1)
rg = cram_encode_aux_1_0(fd, b, c, s, cr);
else
rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD);
/* Read group, identified earlier */
if (rg) {
SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
cr->rg = brg ? brg->id : -1;
} else if (CRAM_MAJOR_VERS(fd->version) == 1) {
SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
assert(brg);
} else {
cr->rg = -1;
}
cram_stats_add(c->stats[DS_RG], cr->rg);
/*
* Append to the qual block now. We do this here as
* cram_add_substitution() can generate BA/QS events which need to
......
......@@ -142,10 +142,10 @@ static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch,
return 0;
default:
return -1;
break;
}
return 0;
return -1;
}
int cram_block_compression_hdr_set_rg(cram_block_compression_hdr *ch, int new_rg) {
......
......@@ -463,36 +463,57 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
* and then read from then on, skipping decoding of slices and/or
* whole containers when they don't overlap the specified cram_range.
*
* This function also updates the cram_fd range field.
*
* Returns 0 on success
* -1 on general failure
* -2 on no-data (empty chromosome)
*/
int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
int ret = 0;
cram_index *e;
if (r->refid == HTS_IDX_NONE)
return -2;
if (r->refid == HTS_IDX_NONE) {
ret = -2; goto err;
}
// Ideally use an index, so see if we have one.
if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
if (0 != cram_seek(fd, e->offset, SEEK_SET))
if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
return -1;
if (0 != cram_seek(fd, e->offset, SEEK_SET)) {
if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) {
ret = -1; goto err;
}
}
} else {
// Absent from index, but this most likely means it simply has no data.
return -2;
ret = -2; goto err;
}
pthread_mutex_lock(&fd->range_lock);
fd->range = *r;
if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST)
fd->range.refid = -2; // special case in cram_next_slice
pthread_mutex_unlock(&fd->range_lock);
if (fd->ctr) {
cram_free_container(fd->ctr);
if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
cram_free_container(fd->ctr_mt);
fd->ctr = NULL;
fd->ctr_mt = NULL;
fd->ooc = 0;
fd->eof = 0;
}
return 0;
err:
// It's unlikely fd->range will be accessed after EOF or error,
// but this maintains identical behaviour to the previous code.
pthread_mutex_lock(&fd->range_lock);
fd->range = *r;
pthread_mutex_unlock(&fd->range_lock);
return ret;
}
......
......@@ -69,6 +69,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <time.h>
#include <stdint.h>
#ifdef HAVE_LIBDEFLATE
#include <libdeflate.h>
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
#endif
#include "cram/cram.h"
#include "cram/os.h"
#include "htslib/hts.h"
......@@ -2036,7 +2041,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
* If we have no ref path, we use the EBI server.
* However to avoid spamming it we require a local ref cache too.
*/
ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
if (!local_cache || *local_cache == '\0') {
const char *extra;
const char *base = get_cache_basedir(&extra);
......@@ -2257,7 +2262,7 @@ static void cram_ref_decr_locked(refs_t *r, int id) {
RP("%d FREE REF %d (%p)\n", gettid(),
r->last_id, r->ref_id[r->last_id]->seq);
ref_entry_free_seq(r->ref_id[r->last_id]);
r->ref_id[r->last_id]->length = 0;
if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
}
}
r->last_id = id;
......@@ -2741,13 +2746,23 @@ void cram_free_container(cram_container *c) {
if (c->comp_hdr_block)
cram_free_block(c->comp_hdr_block);
// Free the slices; filled out by encoder only
if (c->slices) {
for (i = 0; i < c->max_slice; i++)
for (i = 0; i < c->max_slice; i++) {
if (c->slices[i])
cram_free_slice(c->slices[i]);
if (c->slices[i] == c->slice)
c->slice = NULL;
}
free(c->slices);
}
// Free the current slice; set by both encoder & decoder
if (c->slice) {
cram_free_slice(c->slice);
c->slice = NULL;
}
for (id = DS_RN; id < DS_TN; id++)
if (c->stats[id]) cram_stats_free(c->stats[id]);
......@@ -2880,6 +2895,7 @@ cram_container *cram_read_container(cram_fd *fd) {
c->offset = rd;
c->slices = NULL;
c->slice = NULL;
c->curr_slice = 0;
c->max_slice = c->num_landmarks;
c->slice_rec = 0;
......@@ -3102,7 +3118,11 @@ void *cram_flush_thread(void *arg) {
static int cram_flush_result(cram_fd *fd) {
int i, ret = 0;
hts_tpool_result *r;
cram_container *lc = NULL;
// NB: we can have one result per slice, not per container,
// so we need to free the container only after all slices
// within it have been freed. (Automatic via reference counting.)
while ((r = hts_tpool_next_result(fd->rqueue))) {
cram_job *j = (cram_job *)hts_tpool_result_data(r);
cram_container *c;
......@@ -3119,23 +3139,49 @@ static int cram_flush_result(cram_fd *fd) {
if (0 != cram_flush_container2(fd, c))
return -1;
/* Free the container */
// Free the slices; filled out by encoder only
if (c->slices) {
for (i = 0; i < c->max_slice; i++) {
if (c->slices && c->slices[i]) {
if (c->slices[i])
cram_free_slice(c->slices[i]);
if (c->slices[i] == c->slice)
c->slice = NULL;
c->slices[i] = NULL;
}
}
// Free the current slice; set by both encoder & decoder
if (c->slice) {
cram_free_slice(c->slice);
c->slice = NULL;
}
c->curr_slice = 0;
cram_free_container(c);
// Our jobs will be in order, so we free the last
// container when our job has switched to a new one.
if (c != lc) {
if (lc) {
if (fd->ctr == lc)
fd->ctr = NULL;
if (fd->ctr_mt == lc)
fd->ctr_mt = NULL;
cram_free_container(lc);
}
lc = c;
}
if (fd->mode == 'w')
ret |= hflush(fd->fp) == 0 ? 0 : -1;
hts_tpool_delete_result(r, 1);
}
if (lc) {
if (fd->ctr == lc)
fd->ctr = NULL;
if (fd->ctr_mt == lc)
fd->ctr_mt = NULL;
cram_free_container(lc);
}
return ret;
}
......@@ -3447,15 +3493,17 @@ cram_slice *cram_read_slice(cram_fd *fd) {
min_id = s->block[i]->content_id;
}
}
if (min_id >= 0 && max_id < 1024) {
if (!(s->block_by_id = calloc(1024, sizeof(s->block[0]))))
if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
goto err;
for (i = 0; i < n; i++) {
if (s->block[i]->content_type != EXTERNAL)
continue;
s->block_by_id[s->block[i]->content_id] = s->block[i];
}
int v = s->block[i]->content_id;
if (v < 0 || v >= 256)
v = 256 + (v > 0 ? v % 251 : (-v) % 251);
s->block_by_id[v] = s->block[i];
}
/* Initialise encoding/decoding tables */
......@@ -3473,6 +3521,7 @@ cram_slice *cram_read_slice(cram_fd *fd) {
s->crecs = NULL;
s->last_apos = s->hdr->ref_seq_start;
s->decode_md = fd->decode_md;
return s;
......@@ -3664,7 +3713,7 @@ static void full_path(char *out, char *in) {
size_t in_l = strlen(in);
if (*in == '/' ||
// Windows paths
(in_l > 3 && toupper(*in) >= 'A' && toupper(*in) <= 'Z' &&
(in_l > 3 && toupper_c(*in) >= 'A' && toupper_c(*in) <= 'Z' &&
in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
strncpy(out, in, PATH_MAX);
out[PATH_MAX-1] = 0;
......@@ -4068,6 +4117,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
fd->record_counter = 0;
fd->ctr = NULL;
fd->ctr_mt = NULL;
fd->refs = refs_create();
if (!fd->refs)
goto err;
......@@ -4088,6 +4138,8 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
fd->multi_seq = -1;
fd->unsorted = 0;
fd->shared_ref = 0;
fd->store_md = 0;
fd->store_nm = 0;
fd->index = NULL;
fd->own_pool = 0;
......@@ -4133,6 +4185,8 @@ int cram_seek(cram_fd *fd, off_t offset, int whence) {
fd->ooc = 0;
cram_drain_rqueue(fd);
if (hseek(fd->fp, offset, whence) >= 0) {
return 0;
}
......@@ -4193,18 +4247,22 @@ int cram_close(cram_fd *fd) {
return -1;
}
if (fd->mode != 'w')
cram_drain_rqueue(fd);
if (fd->pool && fd->eof >= 0) {
hts_tpool_process_flush(fd->rqueue);
if (0 != cram_flush_result(fd))
return -1;
if (fd->mode == 'w')
fd->ctr = NULL; // prevent double freeing
pthread_mutex_destroy(&fd->metrics_lock);
pthread_mutex_destroy(&fd->ref_lock);
pthread_mutex_destroy(&fd->bam_list_lock);
fd->ctr = NULL; // prevent double freeing
//fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
hts_tpool_process_destroy(fd->rqueue);
......@@ -4259,6 +4317,9 @@ int cram_close(cram_fd *fd) {
if (fd->ctr)
cram_free_container(fd->ctr);
if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
cram_free_container(fd->ctr_mt);
if (fd->refs)
refs_free(fd->refs);
if (fd->ref_free)
......@@ -4402,9 +4463,14 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
}
break;
case CRAM_OPT_RANGE:
fd->range = *va_arg(args, cram_range *);
return cram_seek_to_refpos(fd, &fd->range);
case CRAM_OPT_RANGE: {
int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
pthread_mutex_lock(&fd->range_lock);
if (fd->range.refid != -2)
fd->required_fields |= SAM_POS;
pthread_mutex_unlock(&fd->range_lock);
return r;
}
case CRAM_OPT_REFERENCE:
return cram_load_reference(fd, va_arg(args, char *));
......@@ -4443,6 +4509,7 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
pthread_mutex_init(&fd->metrics_lock, NULL);
pthread_mutex_init(&fd->ref_lock, NULL);
pthread_mutex_init(&fd->range_lock, NULL);
pthread_mutex_init(&fd->bam_list_lock, NULL);
fd->shared_ref = 1;
fd->own_pool = 1;
......@@ -4459,6 +4526,7 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
0);
pthread_mutex_init(&fd->metrics_lock, NULL);
pthread_mutex_init(&fd->ref_lock, NULL);
pthread_mutex_init(&fd->range_lock, NULL);
pthread_mutex_init(&fd->bam_list_lock, NULL);
}
fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
......@@ -4472,6 +4540,16 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
case CRAM_OPT_REQUIRED_FIELDS:
fd->required_fields = va_arg(args, int);
if (fd->range.refid != -2)
fd->required_fields |= SAM_POS;
break;
case CRAM_OPT_STORE_MD:
fd->store_md = va_arg(args, int);
break;
case CRAM_OPT_STORE_NM:
fd->store_nm = va_arg(args, int);
break;
case HTS_OPT_COMPRESSION_LEVEL:
......
......@@ -475,9 +475,17 @@ char *cram_content_type2str(enum cram_content_type t);
*/
static inline cram_block *cram_get_block_by_id(cram_slice *slice, int id) {
if (slice->block_by_id && id >= 0 && id < 1024) {
//fprintf(stderr, "%d\t%p\n", id, slice->block_by_id);
if (slice->block_by_id && id >= 0 && id < 256) {
return slice->block_by_id[id];
} else {
int v = 256 + (id > 0 ? id % 251 : (-id) % 251);
if (slice->block_by_id &&
slice->block_by_id[v] &&
slice->block_by_id[v]->content_id == id)
return slice->block_by_id[v];
// Otherwise a linear search in case of collision
int i;
for (i = 0; i < slice->hdr->num_blocks; i++) {
cram_block *b = slice->block[i];
......
......@@ -296,6 +296,7 @@ typedef struct cram_block_compression_hdr {
int AP_delta;
// indexed by ref-base and subst. code
char substitution_matrix[5][4];
int no_ref;
// TD Dictionary as a concatenated block
cram_block *TD_blk; // Tag Dictionary
......@@ -312,8 +313,6 @@ typedef struct cram_block_compression_hdr {
char *uncomp; // A single block of uncompressed data
size_t uncomp_size, uncomp_alloc;
unsigned int data_series; // See cram_fields enum below
} cram_block_compression_hdr;
typedef struct cram_map {
......@@ -381,6 +380,7 @@ typedef struct cram_container {
/* For construction purposes */
int max_slice, curr_slice; // maximum number of slices
int curr_slice_mt; // Curr_slice when reading ahead (via threads)
int max_rec, curr_rec; // current and max recs per slice
int max_c_rec, curr_c_rec; // current and max recs per container
int slice_rec; // rec no. for start of this slice
......@@ -469,8 +469,7 @@ typedef struct cram_record {
* A feature is a base difference, used for the sequence reference encoding.
* (We generate these internally when writing CRAM.)
*/
typedef struct cram_feature {
union {
typedef union cram_feature {
struct {
int pos;
int code;
......@@ -530,7 +529,6 @@ typedef struct cram_feature {
int code;
int len;
} H;
};
} cram_feature;
/*
......@@ -589,6 +587,12 @@ typedef struct cram_slice {
// For going from BAM to CRAM; an array of auxiliary blocks per type
int naux_block;
cram_block **aux_block;
unsigned int data_series; // See cram_fields enum
int decode_md;
int max_rec, curr_rec; // current and max recs per slice
int slice_num; // To be copied into c->curr_slice in decode
} cram_slice;
/*-----------------------------------------------------------------------------
......@@ -685,9 +689,12 @@ typedef struct cram_fd {
//cram_block_compression_hdr *comp_hdr;
//cram_block_slice_hdr *slice_hdr;
// Current container being processed.
// Current container being processed
cram_container *ctr;
// Current container used for decoder threads
cram_container *ctr_mt;
// positions for encoding or decoding
int first_base, last_base;
......@@ -717,6 +724,8 @@ typedef struct cram_fd {
int use_lzma;
int shared_ref;
unsigned int required_fields;
int store_md;
int store_nm;
cram_range range;
// lookup tables, stored here so we can be trivially multi-threaded
......@@ -741,6 +750,7 @@ typedef struct cram_fd {
hts_tpool_process *rqueue;
pthread_mutex_t metrics_lock;
pthread_mutex_t ref_lock;
pthread_mutex_t range_lock;
spare_bams *bl;
pthread_mutex_t bam_list_lock;
void *job_pending;
......@@ -834,7 +844,7 @@ enum cram_fields {
/* Internal only */
#define CRAM_FLAG_STATS_ADDED (1<<30)
#define CRAM_FLAG_DISCARD_NAME (1<<31)
#define CRAM_FLAG_DISCARD_NAME (1U<<31)
#ifdef __cplusplus
}
......
......@@ -120,10 +120,13 @@ char *tokenise_search_path(char *searchpath) {
if (path_sep == ':') {
if ((i == 0 || (i > 0 && searchpath[i-1] == ':')) &&
(!strncmp(&searchpath[i], "http:", 5) ||
!strncmp(&searchpath[i], "https:", 6) ||
!strncmp(&searchpath[i], "ftp:", 4) ||
!strncmp(&searchpath[i], "|http:", 6) ||
!strncmp(&searchpath[i], "|https:", 7) ||
!strncmp(&searchpath[i], "|ftp:", 5) ||
!strncmp(&searchpath[i], "URL=http:", 9) ||
!strncmp(&searchpath[i], "URL=https:",10)||
!strncmp(&searchpath[i], "URL=ftp:", 8))) {
do {
newsearch[j++] = searchpath[i];
......@@ -342,6 +345,7 @@ mFILE *open_path_mfile(char *file, char *path, char *relative_to) {
return fp;
}
} else if (!strncmp(ele2, "http:", 5) ||
!strncmp(ele2, "https:", 6) ||
!strncmp(ele2, "ftp:", 4)) {
if ((fp = find_file_url(file, ele2))) {
free(newsearch);
......@@ -394,6 +398,7 @@ char *find_path(char *file, char *path) {
if (!strncmp(ele2, "URL=", 4) ||
!strncmp(ele2, "http:", 5) ||
!strncmp(ele2, "https:", 6) ||
!strncmp(ele2, "ftp:", 4)) {
continue;
} else {
......