Skip to content
Commits on Source (4)
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
*.o
#./setup.py sdist creates this
MANIFEST
*.swp
language: python
python:
- "3.5"
- "3.6"
install:
- pip install numpy && python ./setup.py install
matrix:
include:
- python: 3.7
dist: xenial
script: nosetests -sv
Metadata-Version: 2.1
Name: pyBigWig
Version: 0.3.12
Summary: A package for accessing bigWig files using libBigWig
Home-page: https://github.com/dpryan79/pyBigWig
Author: Devon P. Ryan
Author-email: ryan@ie-freiburg.mpg.de
License: UNKNOWN
Download-URL: https://github.com/dpryan79/pyBigWig/tarball/0.3.11
Description: UNKNOWN
Keywords: bioinformatics,bigWig,bigBed
Platform: UNKNOWN
Provides-Extra: numpy input
pybigwig (0.3.16+dfsg-1) unstable; urgency=medium
* Team upload.
* Not an initial release (Closes: #928665)
Was not aware of Diane's package at the time.
Now merged both efforts.
* Asked upstream https://github.com/deeptools/pyBigWig/issues/84
about not redistributing the wrapped C library
which is now a dependency.
* No longer providing Python 2 package
-- Steffen Moeller <moeller@debian.org> Fri, 03 May 2019 17:55:38 +0200
pybigwig (0.3.12-1) unstable; urgency=medium
* New upstream release.
......
......@@ -5,28 +5,29 @@ Build-Depends: debhelper (>= 11),
dh-python,
libcurl4-gnutls-dev,
zlib1g-dev,
python-all (>= 2.6.6-3~),
python-all-dev,
python-numpy,
python-setuptools,
libbigwig-dev,
# python-all (>= 2.6.6-3~),
# python-all-dev,
# python-numpy,
# python-setuptools,
python3-all,
python3-all-dev,
python3-numpy,
python3-setuptools
Standards-Version: 4.2.1
Standards-Version: 4.3.0
Vcs-Git: https://salsa.debian.org/med-team/pybigwig.git
Vcs-Browser: https://salsa.debian.org/med-team/pybigwig
Section: science
Homepage: https://github.com/dpryan79/pyBigWig
Package: python-pybigwig
Architecture: any
Depends: ${misc:Depends}, ${python:Depends}, ${shlibs:Depends}
Description: Python 2 module for quick access to bigBed and bigWig files
This is a Python extension, written in C, for quick access to bigBed files,
and access to and creation of bigWig files.
.
This contains the Python 2 version
#Package: python-pybigwig
#Architecture: any
#Depends: ${misc:Depends}, ${python:Depends}, ${shlibs:Depends}
#Description: Python 2 module for quick access to bigBed and bigWig files
# This is a Python extension, written in C, for quick access to bigBed files,
# and access to and creation of bigWig files.
# .
# This contains the Python 2 version
Package: python3-pybigwig
Architecture: any
......@@ -35,4 +36,20 @@ Description: Python 3 module for quick access to bigBed and bigWig files
This is a Python extension, written in C, for quick access to bigBed files,
and access to and creation of bigWig files.
.
This contains the Python 3 version
The bigWig format was originally created in the context of genome
browsers. There, computing exact summary statistics for a given interval
is less important than quickly being able to compute an approximate
statistic. Because of this, bigWig files contain not only interval-value
associations, but also `sum of values`/`sum of squared values`/`minimum
value`/`maximum value`/`number of bases covered` for equally sized
bins of various sizes. These different sizes are referred to as "zoom
levels". The smallest zoom level has bins that are 16 times the mean
interval size in the file and each subsequent zoom level has bins 4 times
larger than the previous. This methodology is used in Kent's tools and,
therefore, likely used in almost every currently existing bigWig file.
.
When a bigWig file is queried for a summary statistic, the size of the
interval is used to determine whether to use a zoom level and, if so,
which one. The optimal zoom level is that which has the largest bins no
more than half the width of the desired interval. If no such zoom level
exists, the original intervals are instead used for the calculation.
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: pyBigWig
Source: https://github.com/dpryan79/pyBigWig
Upstream-Name: python-pybigwig
Source: https://github.com/deeptools/pyBigWig/releases
Files-Excluded: libBigWig
Files: *
Copyright: 2015 Devon Ryan
License: Expat
License: MIT
Files: debian/*
Copyright: 2016 Diane Trout <diane@ghic.org>
License: Expat
Copyright: 2016-2019 Diane Trout <diane@ghic.org>
2019 Steffen Moeller <moeller@debian.org>
License: MIT
License: MIT
.
The MIT License (MIT)
.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
License: Expat
The MIT License (MIT)
.
Copyright (c) 2015 Devon Ryan
.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Index: python-pybigwig-0.3.14/setup.py
===================================================================
--- python-pybigwig-0.3.14.orig/setup.py
+++ python-pybigwig-0.3.14/setup.py
@@ -11,11 +11,9 @@ try:
except:
WITHNUMPY = False
-srcs = [x for x in
- glob.glob("libBigWig/*.c")]
-srcs.append("pyBigWig.c")
+srcs = ["pyBigWig.c"]
-libs=["m", "z"]
+libs=["BigWig", "m", "z"]
# do not link to python on mac, see https://github.com/deeptools/pyBigWig/issues/58
if 'dynamic_lookup' not in (sysconfig.get_config_var('LDSHARED') or ''):
prepareForExternalLibBigWig.patch
......@@ -8,9 +8,8 @@ include /usr/share/dpkg/buildflags.mk
export PYBUILD_NAME=pybigwig
%:
dh $@ --with python2,python3 --buildsystem=pybuild
dh $@ --with python3 --buildsystem=pybuild
override_dh_auto_install:
dh_auto_install
dh_numpy
dh_numpy3
version=4
opts="uversionmangle=s/pyBigWig/pybigwig/"
https://pypi.python.org/packages/source/p/pyBigWig/ \
pyBigWig-(.+)\.tar\.gz debian uupdate
# Diane started with .gz, did not want to change albeit now repacking -- Steffen
opts="dversionmangle=auto,oversionmangle=s/$/+dfsg/,compression=gz,repack,filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%python-pybigwig-$1.tar.gz%" \
https://github.com/deeptools/pyBigWig/tags \
(?:.*?/)?v?(\d[\d.]*)\.tar\.gz debian uupdate
The MIT License (MIT)
Copyright (c) 2015 Devon Ryan
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
![Master build status](https://travis-ci.org/dpryan79/libBigWig.svg?branch=master) [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.45278.svg)](http://dx.doi.org/10.5281/zenodo.45278)
A C library for reading/parsing local and remote bigWig and bigBed files. While Kent's source code is free to use for these purposes, it's really inappropriate as library code since it has the unfortunate habit of calling `exit()` whenever there's an error. If that's then used inside of something like python then the python interpreter gets killed. This library is aimed at resolving these sorts of issues and should also use more standard things like curl and has a friendlier license to boot.
Documentation is automatically generated by doxygen and can be found under `docs/html` or online [here](https://cdn.rawgit.com/dpryan79/libBigWig/master/docs/html/index.html).
# Example
The only functions and structures that end users need to care about are in "bigWig.h". Below is a commented example. You can see the files under `test/` for further examples.
#include "bigWig.h"
int main(int argc, char *argv[]) {
bigWigFile_t *fp = NULL;
bwOverlappingIntervals_t *intervals = NULL;
double *stats = NULL;
if(argc != 2) {
fprintf(stderr, "Usage: %s {file.bw|URL://path/file.bw}\n", argv[0]);
return 1;
}
//Initialize enough space to hold 128KiB (1<<17) of data at a time
if(bwInit(1<<17) != 0) {
fprintf(stderr, "Received an error in bwInit\n");
return 1;
}
//Open the local/remote file
fp = bwOpen(argv[1], NULL, "r");
if(!fp) {
fprintf(stderr, "An error occurred while opening %s\n", argv[1]);
return 1;
}
//Get values in a range (0-based, half open) without NAs
intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 0);
bwDestroyOverlappingIntervals(intervals); //Free allocated memory
//Get values in a range (0-based, half open) with NAs
intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 1);
bwDestroyOverlappingIntervals(intervals); //Free allocated memory
//Get the full intervals that overlap
intervals = bwGetOverlappingIntervals(fp, "chr1", 10000000, 10000100);
bwDestroyOverlappingIntervals(intervals);
//Get an example statistic - standard deviation
//We want ~4 bins in the range
stats = bwStats(fp, "chr1", 10000000, 10000100, 4, dev);
if(stats) {
printf("chr1:10000000-10000100 std. dev.: %f %f %f %f\n", stats[0], stats[1], stats[2], stats[3]);
free(stats);
}
bwClose(fp);
bwCleanup();
return 0;
}
##Writing example
N.B., creation of bigBed files is not supported (there are no plans to change this).
Below is an example of how to write bigWig files. You can also find this file under `test/exampleWrite.c`. Unlike with Kent's tools, you can create bigWig files entry by entry without needing an intermediate wiggle or bedGraph file. Entries in bigWig files are stored in blocks with each entry in a block referring to the same chromosome and having the same type, of which there are three (see the [wiggle specification](http://genome.ucsc.edu/goldenpath/help/wiggle.html) for more information on this).
#include "bigWig.h"
int main(int argc, char *argv[]) {
bigWigFile_t *fp = NULL;
char *chroms[] = {"1", "2"};
char *chromsUse[] = {"1", "1", "1"};
uint32_t chrLens[] = {1000000, 1500000};
uint32_t starts[] = {0, 100, 125,
200, 220, 230,
500, 600, 625,
700, 800, 850};
uint32_t ends[] = {5, 120, 126,
205, 226, 231};
float values[] = {0.0f, 1.0f, 200.0f,
-2.0f, 150.0f, 25.0f,
0.0f, 1.0f, 200.0f,
-2.0f, 150.0f, 25.0f,
-5.0f, -20.0f, 25.0f,
-5.0f, -20.0f, 25.0f};
if(bwInit(1<<17) != 0) {
fprintf(stderr, "Received an error in bwInit\n");
return 1;
}
fp = bwOpen("example_output.bw", NULL, "w");
if(!fp) {
fprintf(stderr, "An error occurred while opening example_output.bw for writingn\n");
return 1;
}
//Allow up to 10 zoom levels, though fewer will be used in practice
if(bwCreateHdr(fp, 10)) goto error;
//Create the chromosome lists
fp->cl = bwCreateChromList(chroms, chrLens, 2);
if(!fp->cl) goto error;
//Write the header
if(bwWriteHdr(fp)) goto error;
//Some example bedGraph-like entries
if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error;
//We can continue appending similarly formatted entries
//N.B. you can't append a different chromosome (those always go into different
if(bwAppendIntervals(fp, starts+3, ends+3, values+3, 3)) goto error;
//Add a new block of entries with a span. Since bwAdd/AppendIntervals was just used we MUST create a new block
if(bwAddIntervalSpans(fp, "1", starts+6, 20, values+6, 3)) goto error;
//We can continue appending similarly formatted entries
if(bwAppendIntervalSpans(fp, starts+9, values+9, 3)) goto error;
//Add a new block of fixed-step entries
if(bwAddIntervalSpanSteps(fp, "1", 900, 20, 30, values+12, 3)) goto error;
//The start is then 760, since that's where the previous step ended
if(bwAppendIntervalSpanSteps(fp, values+15, 3)) goto error;
//Add a new chromosome
chromsUse[0] = "2";
chromsUse[1] = "2";
chromsUse[2] = "2";
if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error;
//Closing the file causes the zoom levels to be created
bwClose(fp);
bwCleanup();
return 0;
error:
fprintf(stderr, "Received an error somewhere!\n");
bwClose(fp);
bwCleanup();
return 1;
}
# Testing file types
As of version 0.3.0, this library supports accessing bigBed files, which are related to bigWig files. Applications that need to support both bigWig and bigBed input can use the `bwIsBigWig` and `bbIsBigBed` functions to determine if their inputs are bigWig/bigBed files:
...code...
if(bwIsBigWig(input_file_name, NULL)) {
//do something
} else if(bbIsBigBed(input_file_name, NULL)) {
//do something else
} else {
//handle unknown input
}
Note that these two functions rely on the "magic number" at the beginning of each file, which differs between bigWig and bigBed files.
# bigBed support
Support for accessing bigBed files was added in version 0.3.0. The function names used for accessing bigBed files are similar to those used for bigWig files.
Function | Use
--- | ---
bbOpen | Opens a bigBed file
bbGetSQL | Returns the SQL string (if it exists) in a bigBed file
bbGetOverlappingEntries | Returns all entries overlapping an interval (either with or without their associated strings
bbDestroyOverlappingEntries | Free memory allocated by the above command
Other functions, such as `bwClose` and `bwInit`, are shared between bigWig and bigBed files. See `test/testBigBed.c` for a full example.
# A note on bigBed entries
Inside bigBed files, entries are stored as chromosome, start, and end coordinates with an (optional) associated string. For example, a "bedRNAElements" file from Encode has name, score, strand, "level", "significance", and "score2" values associated with each entry. These are stored inside the bigBed files as a single tab-separated character vector (char \*), which makes parsing difficult. The names of the various fields inside of bigBed files is stored as an SQL string, for example:
table RnaElements
"BED6 + 3 scores for RNA Elements data "
(
string chrom; "Reference sequence chromosome or scaffold"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "Name of item"
uint score; "Normalized score from 0-1000"
char[1] strand; "+ or - or . for unknown"
float level; "Expression level such as RPKM or FPKM. Set to -1 for no data."
float signif; "Statistical significance such as IDR. Set to -1 for no data."
uint score2; "Additional measurement/count e.g. number of reads. Set to 0 for no data."
)
Entries will then be of the form (one per line):
59426 115 - 0.021 0.48 218
51 209 + 0.071 0.74 130
52 170 + 0.045 0.61 171
59433 178 - 0.049 0.34 296
53 156 + 0.038 0.19 593
59436 186 - 0.054 0.15 1010
59437 506 - 1.560 0.00 430611
Note that chromosome and start/end intervals are stored separately, so there's no need to parse them out of string. libBigWig can return these entries, either with or without the above associated strings. Parsing these string is left to the application requiring them and is currently outside the scope of this library.
# Interval/Entry iterators
Sometimes it is desirable to request a large number of intervals from a bigWig file or entries from a bigBed file, but not hold them all in memory at once (e.g., due to saving memory). To support this, libBigWig (since version 0.3.0) supports two kinds of iterators. The general process of using iterators is: (1) iterator creation, (2) traversal, and finally (3) iterator destruction. Only iterator creation differs between bigWig and bigBed files.
Importantly, iterators return results by one or more blocks. This is for convenience, since bigWig intervals and bigBed entries are stored in together in fixed-size groups, called blocks. The number of blocks of entries returned, therefore, is an option that can be specified to balance performance and memory usage.
## Iterator creation
For bigwig files, iterators are created with the `bwOverlappingIntervalsIterator()`. This function takes chromosomal bounds (chromosome name, start, and end position) as well as a number of blocks. The equivalent function for bigBed files is `bbOverlappingEntriesIterator()`, which additionally takes a `withString` argutment, which dictates whether the returned entries include the associated string values or not.
Each of the aforementioned files returns a pointer to a `bwOverlapIterator_t` object. The only important parts of this structure for end users are the following members: `entries`, `intervals`, and `data`. `entries` is a pointer to a `bbOverlappingEntries_t` object, or `NULL` if a bigWig file is being used. Likewise, `intervals` is a pointer to a `bwOverlappingIntervals_t` object, or `NULL` if a bigBed file is being used. `data` is a special pointer, used to signify the end of iteration. Thus, when `data` is a `NULL` pointer, iteration has ended.
## Iterator traversal
Regardless of whether a bigWig or bigBed file is being used, the `bwIteratorNext()` function will free currently used memory and load the appropriate intervals or entries for the next block(s). On error, this will return a NULL pointer (memory is already internally freed in this case).
## Iterator destruction
`bwOverlapIterator_t` objects MUST be destroyed after use. This can be done with the `bwIteratorDestroy()` function.
## Example
A full example is provided in `tests/testIterator.c`, but a small example of iterating over all bigWig intervals in `chr1:0-10000000` in chunks of 5 blocks follows:
iter = bwOverlappingIntervalsIterator(fp, "chr1", 0, 10000000, 5);
while(iter->data) {
//Do stuff with iter->intervals
iter = bwIteratorNext(iter);
}
bwIteratorDestroy(iter);
# A note on bigWig statistics
The results of `min`, `max`, and `mean` should be the same as those from `BigWigSummary`. `stdev` and `coverage`, however, may differ due to Kent's tools producing incorrect results (at least for `coverage`, though the same appears to be the case for `stdev`).
# Python interface
There are currently two python interfaces that make use of libBigWig: [pyBigWig](https://github.com/dpryan79/pyBigWig) by me and [bw-python](https://github.com/brentp/bw-python) by Brent Pederson. Those interested are encouraged to give both a try!
This diff is collapsed.
#ifndef NOCURL
#include <curl/curl.h>
#else
#include <stdio.h>
typedef int CURLcode;
typedef void CURL;
#define CURLE_OK 0
#define CURLE_FAILED_INIT 1
#endif
/*! \file bigWigIO.h
* These are (typically internal) IO functions, so there's generally no need for you to directly use them!
*/
/*!
* The size of the buffer used for remote files.
*/
extern size_t GLOBAL_DEFAULTBUFFERSIZE;
/*!
* The enumerated values that indicate the connection type used to access a file.
*/
enum bigWigFile_type_enum {
BWG_FILE = 0,
BWG_HTTP = 1,
BWG_HTTPS = 2,
BWG_FTP = 3
};
/*!
* @brief This structure holds the file pointers and buffers needed for raw access to local and remote files.
*/
typedef struct {
union {
#ifndef NOCURL
CURL *curl; /**<The CURL * file pointer for remote files.*/
#endif
FILE *fp; /**<The FILE * file pointer for local files.**/
} x; /**<A union holding curl and fp.*/
void *memBuf; /**<A void * pointing to memory of size bufSize.*/
size_t filePos; /**<Current position inside the file.*/
size_t bufPos; /**<Curent position inside the buffer.*/
size_t bufSize; /**<The size of the buffer.*/
size_t bufLen; /**<The actual size of the buffer used.*/
enum bigWigFile_type_enum type; /**<The connection type*/
int isCompressed; /**<1 if the file is compressed, otherwise 0*/
char *fname; /**<Only needed for remote connections. The original URL/filename requested, since we need to make multiple connections.*/
} URL_t;
/*!
* @brief Reads data into the given buffer.
*
* This function will store bufSize data into buf for both local and remote files. For remote files an internal buffer is used to store a (typically larger) segment of the remote file.
*
* @param URL A URL_t * pointing to a valid opened file or remote URL.
* @param buf The buffer in memory that you would like filled. It must be able to hold bufSize bytes!
* @param bufSize The number of bytes to transfer to buf.
*
* @return Returns the number of bytes stored in buf, which should be bufSize on success and something else on error.
*
* @warning Note that on error, URL for remote files is left in an unusable state. You can get around this by running urlSeek() to a position outside of the range held by the internal buffer.
*/
size_t urlRead(URL_t *URL, void *buf, size_t bufSize);
/*!
* @brief Seeks to a given position in a local or remote file.
*
* For local files, this will set the file position indicator for the file pointer to the desired position. For remote files, it sets the position to start downloading data for the next urlRead(). Note that for remote files that running urlSeek() with a pos within the current buffer will simply modify the internal offset.
*
* @param URL A URL_t * pointing to a valid opened file or remote URL.
* @param pos The position to seek to.
*
* @return CURLE_OK on success and a different CURLE_XXX on error. For local files, the error return value is always CURLE_FAILED_INIT
*/
CURLcode urlSeek(URL_t *URL, size_t pos);
/*!
* @brief Open a local or remote file
*
* Opens a local or remote file. Currently, http, https, and ftp are the only supported protocols and the URL must then begin with "http://", "https://", or "ftp://" as appropriate.
*
* For remote files, an internal buffer is used to hold file contents, to avoid downloading entire files before starting. The size of this buffer and various variable related to connection timeout are set with bwInit().
*
* Note that you **must** run urlClose() on this when finished. However, you would typically just use bwOpen() rather than directly calling this function.
*
* @param fname The file name or URL to open.
* @param callBack An optional user-supplied function. This is applied to remote connections so users can specify things like proxy and password information.
* @param mode "r", "w" or NULL. If and only if the mode contains the character "w" will the file be opened for writing.
*
* @return A URL_t * or NULL on error.
*/
URL_t *urlOpen(char *fname, CURLcode (*callBack)(CURL*), const char* mode);
/*!
* @brief Close a local/remote file
*
* This will perform the cleanup required on a URL_t*, releasing memory as needed.
*
* @param URL A URL_t * pointing to a valid opened file or remote URL.
*
* @warning URL will no longer point to a valid location in memory!
*/
void urlClose(URL_t *URL);
/*! \file bwCommon.h
*
* You have no reason to use these functions. They may change without warning because there's no reason for them to be used outside of libBigWig's internals.
*
* These are structures and functions from a variety of files that are used across files internally but don't need to be see by libBigWig users.
*/
/*!
* @brief Like fsetpos, but for local or remote bigWig files.
* This will set the file position indicator to the specified point. For local files this literally is `fsetpos`, while for remote files it fills a memory buffer with data starting at the desired position.
* @param fp A valid opened bigWigFile_t.
* @param pos The position within the file to seek to.
* @return 0 on success and -1 on error.
*/
int bwSetPos(bigWigFile_t *fp, size_t pos);
/*!
* @brief A local/remote version of `fread`.
* Reads data from either local or remote bigWig files.
* @param data An allocated memory block big enough to hold the data.
* @param sz The size of each member that should be copied.
* @param nmemb The number of members to copy.
* @param fp The bigWigFile_t * from which to copy the data.
* @see bwSetPos
* @return For nmemb==1, the size of the copied data. For nmemb>1, the number of members fully copied (this is equivalent to `fread`).
*/
size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp);
/*!
* @brief Determine what the file position indicator say.
* This is equivalent to `ftell` for local or remote files.
* @param fp The file.
* @return The position in the file.
*/
long bwTell(bigWigFile_t *fp);
/*!
* @brief Reads a data index (either full data or a zoom level) from a bigWig file.
* There is little reason for end users to use this function. This must be freed with `bwDestroyIndex`
* @param fp A valid bigWigFile_t pointer
* @param offset The file offset where the index begins
* @return A bwRTree_t pointer or NULL on error.
*/
bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset);
/*!
* @brief Destroy an bwRTreeNode_t and all of its children.
* @param node The node to destroy.
*/
void bwDestroyIndexNode(bwRTreeNode_t *node);
/*!
* @brief Frees space allocated by `bwReadIndex`
* There is generally little reason to use this, since end users should typically not need to run `bwReadIndex` themselves.
* @param idx A bwRTree_t pointer allocated by `bwReadIndex`.
*/
void bwDestroyIndex(bwRTree_t *idx);
/// @cond SKIP
bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end);
void destroyBWOverlapBlock(bwOverlapBlock_t *b);
/// @endcond
/*!
* @brief Finishes what's needed to write a bigWigFile
* Flushes the buffer, converts the index linked list to a tree, writes that to disk, handles zoom level stuff, writes magic at the end
* @param fp A valid bigWigFile_t pointer
* @return 0 on success
*/
int bwFinalize(bigWigFile_t *fp);
#include "bigWig.h"
#include "bwCommon.h"
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdio.h>
static uint64_t readChromBlock(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize);
//Return the position in the file
long bwTell(bigWigFile_t *fp) {
if(fp->URL->type == BWG_FILE) return ftell(fp->URL->x.fp);
return (long) (fp->URL->filePos + fp->URL->bufPos);
}
//Seek to a given position, always from the beginning of the file
//Return 0 on success and -1 on error
//To do, use the return code of urlSeek() in a more useful way.
int bwSetPos(bigWigFile_t *fp, size_t pos) {
CURLcode rv = urlSeek(fp->URL, pos);
if(rv == CURLE_OK) return 0;
return -1;
}
//returns the number of full members read (nmemb on success, something less on error)
size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp) {
size_t i, rv;
for(i=0; i<nmemb; i++) {
rv = urlRead(fp->URL, data+i*sz, sz);
if(rv != sz) return i;
}
return nmemb;
}
//Initializes curl and sets global variables
//Returns 0 on success and 1 on error
//This should be called only once and bwCleanup() must be called when finished.
int bwInit(size_t defaultBufSize) {
//set the buffer size, number of iterations, sleep time between iterations, etc.
GLOBAL_DEFAULTBUFFERSIZE = defaultBufSize;
//call curl_global_init()
#ifndef NOCURL
CURLcode rv;
rv = curl_global_init(CURL_GLOBAL_ALL);
if(rv != CURLE_OK) return 1;
#endif
return 0;
}
//This should be called before quiting, to release memory acquired by curl
void bwCleanup() {
#ifndef NOCURL
curl_global_cleanup();
#endif
}
static bwZoomHdr_t *bwReadZoomHdrs(bigWigFile_t *bw) {
if(bw->isWrite) return NULL;
uint16_t i;
bwZoomHdr_t *zhdr = malloc(sizeof(bwZoomHdr_t));
if(!zhdr) return NULL;
uint32_t *level = malloc(bw->hdr->nLevels * sizeof(uint64_t));
if(!level) {
free(zhdr);
return NULL;
}
uint32_t padding = 0;
uint64_t *dataOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels);
if(!dataOffset) {
free(zhdr);
free(level);
return NULL;
}
uint64_t *indexOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels);
if(!dataOffset) {
free(zhdr);
free(level);
free(dataOffset);
return NULL;
}
for(i=0; i<bw->hdr->nLevels; i++) {
if(bwRead((void*) &(level[i]), sizeof(uint32_t), 1, bw) != 1) goto error;
if(bwRead((void*) &padding, sizeof(uint32_t), 1, bw) != 1) goto error;
if(bwRead((void*) &(dataOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error;
if(bwRead((void*) &(indexOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error;
}
zhdr->level = level;
zhdr->dataOffset = dataOffset;
zhdr->indexOffset = indexOffset;
zhdr->idx = calloc(bw->hdr->nLevels, sizeof(bwRTree_t*));
if(!zhdr->idx) goto error;
return zhdr;
error:
for(i=0; i<bw->hdr->nLevels; i++) {
if(zhdr->idx[i]) bwDestroyIndex(zhdr->idx[i]);
}
free(zhdr);
free(level);
free(dataOffset);
free(indexOffset);
return NULL;
}
static void bwHdrDestroy(bigWigHdr_t *hdr) {
int i;
if(hdr->zoomHdrs) {
free(hdr->zoomHdrs->level);
free(hdr->zoomHdrs->dataOffset);
free(hdr->zoomHdrs->indexOffset);
for(i=0; i<hdr->nLevels; i++) {
if(hdr->zoomHdrs->idx[i]) bwDestroyIndex(hdr->zoomHdrs->idx[i]);
}
free(hdr->zoomHdrs->idx);
free(hdr->zoomHdrs);
}
free(hdr);
}
static void bwHdrRead(bigWigFile_t *bw) {
uint32_t magic;
if(bw->isWrite) return;
bw->hdr = calloc(1, sizeof(bigWigHdr_t));
if(!bw->hdr) return;
if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; //0x0
if(magic != BIGWIG_MAGIC && magic != BIGBED_MAGIC) goto error;
if(bwRead((void*) &(bw->hdr->version), sizeof(uint16_t), 1, bw) != 1) goto error; //0x4
if(bwRead((void*) &(bw->hdr->nLevels), sizeof(uint16_t), 1, bw) != 1) goto error; //0x6
if(bwRead((void*) &(bw->hdr->ctOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x8
if(bwRead((void*) &(bw->hdr->dataOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x10
if(bwRead((void*) &(bw->hdr->indexOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x18
if(bwRead((void*) &(bw->hdr->fieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x20
if(bwRead((void*) &(bw->hdr->definedFieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x22
if(bwRead((void*) &(bw->hdr->sqlOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x24
if(bwRead((void*) &(bw->hdr->summaryOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x2c
if(bwRead((void*) &(bw->hdr->bufSize), sizeof(uint32_t), 1, bw) != 1) goto error; //0x34
if(bwRead((void*) &(bw->hdr->extensionOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x38
//zoom headers
if(bw->hdr->nLevels) {
if(!(bw->hdr->zoomHdrs = bwReadZoomHdrs(bw))) goto error;
}
//File summary information
if(bw->hdr->summaryOffset) {
if(urlSeek(bw->URL, bw->hdr->summaryOffset) != CURLE_OK) goto error;
if(bwRead((void*) &(bw->hdr->nBasesCovered), sizeof(uint64_t), 1, bw) != 1) goto error;
if(bwRead((void*) &(bw->hdr->minVal), sizeof(uint64_t), 1, bw) != 1) goto error;
if(bwRead((void*) &(bw->hdr->maxVal), sizeof(uint64_t), 1, bw) != 1) goto error;
if(bwRead((void*) &(bw->hdr->sumData), sizeof(uint64_t), 1, bw) != 1) goto error;
if(bwRead((void*) &(bw->hdr->sumSquared), sizeof(uint64_t), 1, bw) != 1) goto error;
}
//In case of uncompressed remote files, let the IO functions know to request larger chunks
bw->URL->isCompressed = (bw->hdr->bufSize > 0)?1:0;
return;
error:
bwHdrDestroy(bw->hdr);
fprintf(stderr, "[bwHdrRead] There was an error while reading in the header!\n");
bw->hdr = NULL;
}
static void destroyChromList(chromList_t *cl) {
uint32_t i;
if(!cl) return;
if(cl->nKeys && cl->chrom) {
for(i=0; i<cl->nKeys; i++) {
if(cl->chrom[i]) free(cl->chrom[i]);
}
}
if(cl->chrom) free(cl->chrom);
if(cl->len) free(cl->len);
free(cl);
}
static uint64_t readChromLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t valueSize) {
uint16_t nVals, i;
uint32_t idx;
char *chrom = NULL;
if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) != 1) return -1;
chrom = calloc(valueSize+1, sizeof(char));
if(!chrom) return -1;
for(i=0; i<nVals; i++) {
if(bwRead((void*) chrom, sizeof(char), valueSize, bw) != valueSize) goto error;
if(bwRead((void*) &idx, sizeof(uint32_t), 1, bw) != 1) goto error;
if(bwRead((void*) &(cl->len[idx]), sizeof(uint32_t), 1, bw) != 1) goto error;
cl->chrom[idx] = strdup(chrom);
if(!(cl->chrom[idx])) goto error;
}
free(chrom);
return nVals;
error:
free(chrom);
return -1;
}
static uint64_t readChromNonLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize) {
uint64_t offset , rv = 0, previous;
uint16_t nVals, i;
if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) != 1) return -1;
previous = bwTell(bw) + keySize;
for(i=0; i<nVals; i++) {
if(bwSetPos(bw, previous)) return -1;
if(bwRead((void*) &offset, sizeof(uint64_t), 1, bw) != 1) return -1;
if(bwSetPos(bw, offset)) return -1;
rv += readChromBlock(bw, cl, keySize);
previous += 8 + keySize;
}
return rv;
}
static uint64_t readChromBlock(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize) {
uint8_t isLeaf, padding;
if(bwRead((void*) &isLeaf, sizeof(uint8_t), 1, bw) != 1) return -1;
if(bwRead((void*) &padding, sizeof(uint8_t), 1, bw) != 1) return -1;
if(isLeaf) {
return readChromLeaf(bw, cl, keySize);
} else { //I've never actually observed one of these, which is good since they're pointless
return readChromNonLeaf(bw, cl, keySize);
}
}
static chromList_t *bwReadChromList(bigWigFile_t *bw) {
chromList_t *cl = NULL;
uint32_t magic, keySize, valueSize, itemsPerBlock;
uint64_t rv, itemCount;
if(bw->isWrite) return NULL;
if(bwSetPos(bw, bw->hdr->ctOffset)) return NULL;
cl = calloc(1, sizeof(chromList_t));
if(!cl) return NULL;
if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error;
if(magic != CIRTREE_MAGIC) goto error;
if(bwRead((void*) &itemsPerBlock, sizeof(uint32_t), 1, bw) != 1) goto error;
if(bwRead((void*) &keySize, sizeof(uint32_t), 1, bw) != 1) goto error;
if(bwRead((void*) &valueSize, sizeof(uint32_t), 1, bw) != 1) goto error;
if(bwRead((void*) &itemCount, sizeof(uint64_t), 1, bw) != 1) goto error;
cl->nKeys = itemCount;
cl->chrom = calloc(itemCount, sizeof(char*));
cl->len = calloc(itemCount, sizeof(uint32_t));
if(!cl->chrom) goto error;
if(!cl->len) goto error;
if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error;
if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error;
//Read in the blocks
rv = readChromBlock(bw, cl, keySize);
if(rv == (uint64_t) -1) goto error;
if(rv != itemCount) goto error;
return cl;
error:
destroyChromList(cl);
return NULL;
}
//This is here mostly for convenience
static void bwDestroyWriteBuffer(bwWriteBuffer_t *wb) {
if(wb->p) free(wb->p);
if(wb->compressP) free(wb->compressP);
if(wb->firstZoomBuffer) free(wb->firstZoomBuffer);
if(wb->lastZoomBuffer) free(wb->lastZoomBuffer);
if(wb->nNodes) free(wb->nNodes);
free(wb);
}
void bwClose(bigWigFile_t *fp) {
if(!fp) return;
if(bwFinalize(fp)) {
fprintf(stderr, "[bwClose] There was an error while finishing writing a bigWig file! The output is likely truncated.\n");
}
if(fp->URL) urlClose(fp->URL);
if(fp->hdr) bwHdrDestroy(fp->hdr);
if(fp->cl) destroyChromList(fp->cl);
if(fp->idx) bwDestroyIndex(fp->idx);
if(fp->writeBuffer) bwDestroyWriteBuffer(fp->writeBuffer);
free(fp);
}
int bwIsBigWig(char *fname, CURLcode (*callBack) (CURL*)) {
uint32_t magic = 0;
URL_t *URL = NULL;
URL = urlOpen(fname, *callBack, NULL);
if(!URL) return 0;
if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0;
urlClose(URL);
if(magic == BIGWIG_MAGIC) return 1;
return 0;
}
char *bbGetSQL(bigWigFile_t *bw) {
char *o = NULL;
uint64_t len;
if(!bw->hdr->sqlOffset) return NULL;
len = bw->hdr->summaryOffset - bw->hdr->sqlOffset; //This includes the NULL terminator
o = malloc(sizeof(char) * len);
if(!o) goto error;
if(bwSetPos(bw, bw->hdr->sqlOffset)) goto error;
if(bwRead((void*) o, len, 1, bw) != 1) goto error;
return o;
error:
if(o) free(o);
printf("Got an error in bbGetSQL!\n");
return NULL;
}
int bbIsBigBed(char *fname, CURLcode (*callBack) (CURL*)) {
uint32_t magic = 0;
URL_t *URL = NULL;
URL = urlOpen(fname, *callBack, NULL);
if(!URL) return 0;
if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0;
urlClose(URL);
if(magic == BIGBED_MAGIC) return 1;
return 0;
}
bigWigFile_t *bwOpen(char *fname, CURLcode (*callBack) (CURL*), const char *mode) {
bigWigFile_t *bwg = calloc(1, sizeof(bigWigFile_t));
if(!bwg) {
fprintf(stderr, "[bwOpen] Couldn't allocate space to create the output object!\n");
return NULL;
}
if((!mode) || (strchr(mode, 'w') == NULL)) {
bwg->isWrite = 0;
bwg->URL = urlOpen(fname, *callBack, NULL);
if(!bwg->URL) {
fprintf(stderr, "[bwOpen] urlOpen is NULL!\n");
goto error;
}
//Attempt to read in the fixed header
bwHdrRead(bwg);
if(!bwg->hdr) {
fprintf(stderr, "[bwOpen] bwg->hdr is NULL!\n");
goto error;
}
//Read in the chromosome list
bwg->cl = bwReadChromList(bwg);
if(!bwg->cl) {
fprintf(stderr, "[bwOpen] bwg->cl is NULL (%s)!\n", fname);
goto error;
}
//Read in the index
if(bwg->hdr->nBasesCovered) {
bwg->idx = bwReadIndex(bwg, 0);
if(!bwg->idx) {
fprintf(stderr, "[bwOpen] bwg->idx is NULL bwg->hdr->dataOffset 0x%"PRIx64"!\n", bwg->hdr->dataOffset);
goto error;
}
}
} else {
bwg->isWrite = 1;
bwg->URL = urlOpen(fname, NULL, "w+");
if(!bwg->URL) goto error;
bwg->writeBuffer = calloc(1,sizeof(bwWriteBuffer_t));
if(!bwg->writeBuffer) goto error;
bwg->writeBuffer->l = 24;
}
return bwg;
error:
bwClose(bwg);
return NULL;
}
bigWigFile_t *bbOpen(char *fname, CURLcode (*callBack) (CURL*)) {
bigWigFile_t *bb = calloc(1, sizeof(bigWigFile_t));
if(!bb) {
fprintf(stderr, "[bbOpen] Couldn't allocate space to create the output object!\n");
return NULL;
}
//Set the type to 1 for bigBed
bb->type = 1;
bb->URL = urlOpen(fname, *callBack, NULL);
if(!bb->URL) goto error;
//Attempt to read in the fixed header
bwHdrRead(bb);
if(!bb->hdr) goto error;
//Read in the chromosome list
bb->cl = bwReadChromList(bb);
if(!bb->cl) goto error;
//Read in the index
bb->idx = bwReadIndex(bb, 0);
if(!bb->idx) goto error;
return bb;
error:
bwClose(bb);
return NULL;
}
#include "bigWig.h"
#include "bwCommon.h"
#include <errno.h>
#include <stdlib.h>
#include <zlib.h>
#include <math.h>
#include <string.h>
//Returns -1 if there are no applicable levels, otherwise an integer indicating the most appropriate level.
//Like Kent's library, this divides the desired bin size by 2 to minimize the effect of blocks overlapping multiple bins
static int32_t determineZoomLevel(bigWigFile_t *fp, int basesPerBin) {
int32_t out = -1;
int64_t diff;
uint32_t bestDiff = -1;
uint16_t i;
basesPerBin/=2;
for(i=0; i<fp->hdr->nLevels; i++) {
diff = basesPerBin - (int64_t) fp->hdr->zoomHdrs->level[i];
if(diff >= 0 && diff < bestDiff) {
bestDiff = diff;
out = i;
}
}
return out;
}
/// @cond SKIP
struct val_t {
uint32_t nBases;
float min, max, sum, sumsq;
double scalar;
};
struct vals_t {
uint32_t n;
struct val_t **vals;
};
/// @endcond
void destroyVals_t(struct vals_t *v) {
uint32_t i;
if(!v) return;
for(i=0; i<v->n; i++) free(v->vals[i]);
if(v->vals) free(v->vals);
free(v);
}
//Determine the base-pair overlap between an interval and a block
double getScalar(uint32_t i_start, uint32_t i_end, uint32_t b_start, uint32_t b_end) {
double rv = 0.0;
if(b_start <= i_start) {
if(b_end > i_start) rv = ((double)(b_end - i_start))/(b_end-b_start);
} else if(b_start < i_end) {
if(b_end < i_end) rv = ((double)(b_end - b_start))/(b_end-b_start);
else rv = ((double)(i_end - b_start))/(b_end-b_start);
}
return rv;
}
//Returns NULL on error
static struct vals_t *getVals(bigWigFile_t *fp, bwOverlapBlock_t *o, int i, uint32_t tid, uint32_t start, uint32_t end) {
void *buf = NULL, *compBuf = NULL;
uLongf sz = fp->hdr->bufSize;
int compressed = 0, rv;
uint32_t *p, vtid, vstart, vend;
struct vals_t *vals = NULL;
struct val_t *v = NULL;
if(sz) {
compressed = 1;
buf = malloc(sz);
}
sz = 0; //This is now the size of the compressed buffer
if(bwSetPos(fp, o->offset[i])) goto error;
vals = calloc(1,sizeof(struct vals_t));
if(!vals) goto error;
v = malloc(sizeof(struct val_t));
if(!v) goto error;
if(sz < o->size[i]) compBuf = malloc(o->size[i]);
if(!compBuf) goto error;
if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error;
if(compressed) {
sz = fp->hdr->bufSize;
rv = uncompress(buf, &sz, compBuf, o->size[i]);
if(rv != Z_OK) goto error;
} else {
buf = compBuf;
sz = o->size[i];
}
p = buf;
while(((uLongf) ((void*)p-buf)) < sz) {
vtid = p[0];
vstart = p[1];
vend = p[2];
v->nBases = p[3];
v->min = ((float*) p)[4];
v->max = ((float*) p)[5];
v->sum = ((float*) p)[6];
v->sumsq = ((float*) p)[7];
v->scalar = getScalar(start, end, vstart, vend);
if(tid == vtid) {
if((start <= vstart && end > vstart) || (start < vend && start >= vstart)) {
vals->vals = realloc(vals->vals, sizeof(struct val_t*)*(vals->n+1));
if(!vals->vals) goto error;
vals->vals[vals->n++] = v;
v = malloc(sizeof(struct val_t));
if(!v) goto error;
}
if(vstart > end) break;
} else if(vtid > tid) {
break;
}
p+=8;
}
free(v);
free(buf);
if(compressed) free(compBuf);
return vals;
error:
if(buf) free(buf);
if(compBuf && compressed) free(compBuf);
if(v) free(v);
destroyVals_t(vals);
return NULL;
}
//On error, errno is set to ENOMEM and NaN is returned (though NaN can be returned normally)
static double blockMean(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
uint32_t i, j;
double output = 0.0, coverage = 0.0;
struct vals_t *v = NULL;
if(!blocks->n) return strtod("NaN", NULL);
//Iterate over the blocks
for(i=0; i<blocks->n; i++) {
v = getVals(fp, blocks, i, tid, start, end);
if(!v) goto error;
for(j=0; j<v->n; j++) {
output += v->vals[j]->sum * v->vals[j]->scalar;
coverage += v->vals[j]->nBases * v->vals[j]->scalar;
}
destroyVals_t(v);
}
if(!coverage) return strtod("NaN", NULL);
return output/coverage;
error:
if(v) free(v);
errno = ENOMEM;
return strtod("NaN", NULL);
}
static double intMean(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
double sum = 0.0;
uint32_t nBases = 0, i, start_use, end_use;
if(!ints->l) return strtod("NaN", NULL);
for(i=0; i<ints->l; i++) {
start_use = ints->start[i];
end_use = ints->end[i];
if(ints->start[i] < start) start_use = start;
if(ints->end[i] > end) end_use = end;
nBases += end_use-start_use;
sum += (end_use-start_use)*((double) ints->value[i]);
}
return sum/nBases;
}
//Does UCSC compensate for partial block/range overlap?
static double blockDev(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
uint32_t i, j;
double mean = 0.0, ssq = 0.0, coverage = 0.0, diff;
struct vals_t *v = NULL;
if(!blocks->n) return strtod("NaN", NULL);
//Iterate over the blocks
for(i=0; i<blocks->n; i++) {
v = getVals(fp, blocks, i, tid, start, end);
if(!v) goto error;
for(j=0; j<v->n; j++) {
coverage += v->vals[j]->nBases * v->vals[j]->scalar;
mean += v->vals[j]->sum * v->vals[j]->scalar;
ssq += v->vals[j]->sumsq * v->vals[j]->scalar;
}
destroyVals_t(v);
v = NULL;
}
if(coverage<=1.0) return strtod("NaN", NULL);
diff = ssq-mean*mean/coverage;
if(coverage > 1.0) diff /= coverage-1;
if(fabs(diff) > 1e-8) { //Ignore floating point differences
return sqrt(diff);
} else {
return 0.0;
}
error:
if(v) destroyVals_t(v);
errno = ENOMEM;
return strtod("NaN", NULL);
}
//This uses compensated summation to account for finite precision math
static double intDev(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
double v1 = 0.0, mean, rv;
uint32_t nBases = 0, i, start_use, end_use;
if(!ints->l) return strtod("NaN", NULL);
mean = intMean(ints, start, end);
for(i=0; i<ints->l; i++) {
start_use = ints->start[i];
end_use = ints->end[i];
if(ints->start[i] < start) start_use = start;
if(ints->end[i] > end) end_use = end;
nBases += end_use-start_use;
v1 += (end_use-start_use) * pow(ints->value[i]-mean, 2.0); //running sum of squared difference
}
if(nBases>=2) rv = sqrt(v1/(nBases-1));
else if(nBases==1) rv = sqrt(v1);
else rv = strtod("NaN", NULL);
return rv;
}
static double blockMax(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
uint32_t i, j, isNA = 1;
double o = strtod("NaN", NULL);
struct vals_t *v = NULL;
if(!blocks->n) return o;
//Iterate the blocks
for(i=0; i<blocks->n; i++) {
v = getVals(fp, blocks, i, tid, start, end);
if(!v) goto error;
for(j=0; j<v->n; j++) {
if(isNA) {
o = v->vals[j]->max;
isNA = 0;
} else if(v->vals[j]->max > o) {
o = v->vals[j]->max;
}
}
destroyVals_t(v);
}
return o;
error:
destroyVals_t(v);
errno = ENOMEM;
return strtod("NaN", NULL);
}
static double intMax(bwOverlappingIntervals_t* ints) {
uint32_t i;
double o;
if(ints->l < 1) return strtod("NaN", NULL);
o = ints->value[0];
for(i=1; i<ints->l; i++) {
if(ints->value[i] > o) o = ints->value[i];
}
return o;
}
static double blockMin(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
uint32_t i, j, isNA = 1;
double o = strtod("NaN", NULL);
struct vals_t *v = NULL;
if(!blocks->n) return o;
//Iterate the blocks
for(i=0; i<blocks->n; i++) {
v = getVals(fp, blocks, i, tid, start, end);
if(!v) goto error;
for(j=0; j<v->n; j++) {
if(isNA) {
o = v->vals[j]->min;
isNA = 0;
} else if(v->vals[j]->min < o) o = v->vals[j]->min;
}
destroyVals_t(v);
}
return o;
error:
destroyVals_t(v);
errno = ENOMEM;
return strtod("NaN", NULL);
}
static double intMin(bwOverlappingIntervals_t* ints) {
uint32_t i;
double o;
if(ints->l < 1) return strtod("NaN", NULL);
o = ints->value[0];
for(i=1; i<ints->l; i++) {
if(ints->value[i] < o) o = ints->value[i];
}
return o;
}
//Does UCSC compensate for only partial block/interval overlap?
static double blockCoverage(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
uint32_t i, j;
double o = 0.0;
struct vals_t *v = NULL;
if(!blocks->n) return strtod("NaN", NULL);
//Iterate over the blocks
for(i=0; i<blocks->n; i++) {
v = getVals(fp, blocks, i, tid, start, end);
if(!v) goto error;
for(j=0; j<v->n; j++) {
o+= v->vals[j]->nBases * v->vals[j]->scalar;
}
destroyVals_t(v);
}
if(o == 0.0) return strtod("NaN", NULL);
return o;
error:
destroyVals_t(v);
errno = ENOMEM;
return strtod("NaN", NULL);
}
static double intCoverage(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
uint32_t i, start_use, end_use;
double o = 0.0;
if(!ints->l) return strtod("NaN", NULL);
for(i=0; i<ints->l; i++) {
start_use = ints->start[i];
end_use = ints->end[i];
if(start_use < start) start_use = start;
if(end_use > end) end_use = end;
o += end_use - start_use;
}
return o/(end-start);
}
static double blockSum(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
uint32_t i, j, sizeUse;
double o = 0.0;
struct vals_t *v = NULL;
if(!blocks->n) return strtod("NaN", NULL);
//Iterate over the blocks
for(i=0; i<blocks->n; i++) {
v = getVals(fp, blocks, i, tid, start, end);
if(!v) goto error;
for(j=0; j<v->n; j++) {
//Multiply the block average by min(bases covered, block overlap with interval)
sizeUse = v->vals[j]->scalar;
if(sizeUse > v->vals[j]->nBases) sizeUse = v->vals[j]->nBases;
o+= (v->vals[j]->sum * sizeUse) / v->vals[j]->nBases;
}
destroyVals_t(v);
}
if(o == 0.0) return strtod("NaN", NULL);
return o;
error:
destroyVals_t(v);
errno = ENOMEM;
return strtod("NaN", NULL);
}
static double intSum(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
uint32_t i, start_use, end_use;
double o = 0.0;
if(!ints->l) return strtod("NaN", NULL);
for(i=0; i<ints->l; i++) {
start_use = ints->start[i];
end_use = ints->end[i];
if(start_use < start) start_use = start;
if(end_use > end) end_use = end;
o += (end_use - start_use) * ints->value[i];
}
return o;
}
//Returns NULL on error, otherwise a double* that needs to be free()d
double *bwStatsFromZoom(bigWigFile_t *fp, int32_t level, uint32_t tid, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
bwOverlapBlock_t *blocks = NULL;
double *output = NULL;
uint32_t pos = start, i, end2;
if(!fp->hdr->zoomHdrs->idx[level]) {
fp->hdr->zoomHdrs->idx[level] = bwReadIndex(fp, fp->hdr->zoomHdrs->indexOffset[level]);
if(!fp->hdr->zoomHdrs->idx[level]) return NULL;
}
errno = 0; //Sometimes libCurls sets and then doesn't unset errno on errors
output = malloc(sizeof(double)*nBins);
if(!output) return NULL;
for(i=0, pos=start; i<nBins; i++) {
end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
blocks = walkRTreeNodes(fp, fp->hdr->zoomHdrs->idx[level]->root, tid, pos, end2);
if(!blocks) goto error;
switch(type) {
case 0:
//mean
output[i] = blockMean(fp, blocks, tid, pos, end2);
break;
case 1:
//stdev
output[i] = blockDev(fp, blocks, tid, pos, end2);
break;
case 2:
//max
output[i] = blockMax(fp, blocks, tid, pos, end2);
break;
case 3:
//min
output[i] = blockMin(fp, blocks, tid, pos, end2);
break;
case 4:
//cov
output[i] = blockCoverage(fp, blocks, tid, pos, end2)/(end2-pos);
break;
case 5:
//sum
output[i] = blockSum(fp, blocks, tid, pos, end2);
break;
default:
goto error;
break;
}
if(errno) goto error;
destroyBWOverlapBlock(blocks);
pos = end2;
}
return output;
error:
fprintf(stderr, "got an error in bwStatsFromZoom in the range %"PRIu32"-%"PRIu32": %s\n", pos, end2, strerror(errno));
if(blocks) destroyBWOverlapBlock(blocks);
if(output) free(output);
return NULL;
}
double *bwStatsFromFull(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
bwOverlappingIntervals_t *ints = NULL;
double *output = malloc(sizeof(double)*nBins);
uint32_t i, pos = start, end2;
if(!output) return NULL;
for(i=0; i<nBins; i++) {
end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
ints = bwGetOverlappingIntervals(fp, chrom, pos, end2);
if(!ints) {
output[i] = strtod("NaN", NULL);
continue;
}
switch(type) {
default :
case 0:
output[i] = intMean(ints, pos, end2);
break;
case 1:
output[i] = intDev(ints, pos, end2);
break;
case 2:
output[i] = intMax(ints);
break;
case 3:
output[i] = intMin(ints);
break;
case 4:
output[i] = intCoverage(ints, pos, end2);
break;
case 5:
output[i] = intSum(ints, pos, end2);
break;
}
bwDestroyOverlappingIntervals(ints);
pos = end2;
}
return output;
}
//Returns a list of floats of length nBins that must be free()d
//On error, NULL is returned
double *bwStats(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
int32_t level = determineZoomLevel(fp, ((double)(end-start))/((int) nBins));
uint32_t tid = bwGetTid(fp, chrom);
if(tid == (uint32_t) -1) return NULL;
if(level == -1) return bwStatsFromFull(fp, chrom, start, end, nBins, type);
return bwStatsFromZoom(fp, level, tid, start, end, nBins, type);
}
This diff is collapsed.
#include <inttypes.h>
/*! \file bwValues.h
*
* You should not directly use functions and structures defined here. They're really meant for internal use only.
*
* All of the structures here need to be destroyed or you'll leak memory! There are methods available to destroy anything that you need to take care of yourself.
*/
//N.B., coordinates are still 0-based half open!
/*!
* @brief A node within an R-tree holding the index for data.
*
* Note that there are two types of nodes: leaf and twig. Leaf nodes point to where data actually is. Twig nodes point to additional index nodes, which may or may not be leaves. Each of these nodes has additional children, which may span multiple chromosomes/contigs.
*
* With the start/end position, these positions refer specifically to the chromosomes specified in chrIdxStart/chrIdxEnd. Any chromosomes between these are completely spanned by a given child.
*/
typedef struct bwRTreeNode_t {
uint8_t isLeaf; /**<Is this node a leaf?*/
//1 byte of padding
uint16_t nChildren; /**<The number of children of this node, all lists have this length.*/
uint32_t *chrIdxStart; /**<A list of the starting chromosome indices of each child.*/
uint32_t *baseStart; /**<A list of the start position of each child.*/
uint32_t *chrIdxEnd; /**<A list of the end chromosome indices of each child.*/
uint32_t *baseEnd; /**<A list of the end position of each child.*/
uint64_t *dataOffset; /**<For leaves, the offset to the on-disk data. For twigs, the offset to the child node.*/
union {
uint64_t *size; /**<Leaves only: The size of the data block.*/
struct bwRTreeNode_t **child; /**<Twigs only: The child node(s).*/
} x; /**<A union holding either size or child*/
} bwRTreeNode_t;
/*!
* A header and index that points to an R-tree that in turn points to data blocks.
*/
//TODO rootOffset is pointless, it's 48bytes after the indexOffset
typedef struct {
uint32_t blockSize; /**<The maximum number of children a node can have*/
uint64_t nItems; /**<The total number of data blocks pointed to by the tree. This is completely redundant.*/
uint32_t chrIdxStart; /**<The index to the first chromosome described.*/
uint32_t baseStart; /**<The first position on chrIdxStart with a value.*/
uint32_t chrIdxEnd; /**<The index of the last chromosome with an entry.*/
uint32_t baseEnd; /**<The last position on chrIdxEnd with an entry.*/
uint64_t idxSize; /**<This is actually the offset of the index rather than the size?!? Yes, it's completely redundant.*/
uint32_t nItemsPerSlot; /**<This is always 1!*/
//There's 4 bytes of padding in the file here
uint64_t rootOffset; /**<The offset to the root node of the R-Tree (on disk). Yes, this is redundant.*/
bwRTreeNode_t *root; /**<A pointer to the root node.*/
} bwRTree_t;
/*!
* @brief This structure holds the data blocks that overlap a given interval.
*/
typedef struct {
uint64_t n; /**<The number of blocks that overlap. This *MAY* be 0!.*/
uint64_t *offset; /**<The offset to the on-disk position of the block.*/
uint64_t *size; /**<The size of each block on disk (in bytes).*/
} bwOverlapBlock_t;
/*!
* @brief The header section of a given data block.
*
* There are 3 types of data blocks in bigWig files, each with slightly different needs. This is all taken care of internally.
*/
typedef struct {
uint32_t tid; /**<The chromosome ID.*/
uint32_t start; /**<The start position of a block*/
uint32_t end; /**<The end position of a block*/
uint32_t step; /**<The step size of the values*/
uint32_t span; /**<The span of each data value*/
uint8_t type; /**<The block type: 1, bedGraph; 2, variable step; 3, fixed step.*/
uint16_t nItems; /**<The number of values in a given block.*/
} bwDataHeader_t;