Commit c0833a2d authored by Andreas Tille's avatar Andreas Tille

New upstream version 0.2.7

parent 7f525bf5
Package: DelayedArray
Title: Delayed operations on array-like objects
Description: Wrapping an array-like object (typically an on-disk object) in
a DelayedArray object allows one to perform common array operations
on it without loading the object in memory. In order to reduce memory
usage and optimize performance, operations on the object are either
delayed or executed using a block processing mechanism. Note that this
also works on in-memory array-like objects like DataFrame objects
(typically with Rle columns), Matrix objects, and ordinary arrays and
data frames.
Version: 0.2.7
Encoding: UTF-8
Author: Hervé Pagès
Maintainer: Hervé Pagès <hpages@fredhutch.org>
biocViews: Infrastructure, DataRepresentation, Annotation,
GenomeAnnotation
Depends: R (>= 3.4), methods, BiocGenerics, S4Vectors (>= 0.14.3),
IRanges, matrixStats
Imports: stats
Suggests: Matrix, HDF5Array, genefilter, BiocStyle
License: Artistic-2.0
Collate: utils.R ArrayBlocks-class.R show-utils.R DelayedArray-class.R
cbind-methods.R realize.R block_processing.R
DelayedArray-utils.R DelayedMatrix-utils.R DelayedArray-stats.R
DelayedMatrix-stats.R RleArray-class.R zzz.R
NeedsCompilation: no
Packaged: 2017-06-03 00:46:21 UTC; biocbuild
import(methods)
importFrom(stats, setNames, dlogis, plogis, qlogis)
import(BiocGenerics)
import(S4Vectors)
import(IRanges)
import(matrixStats)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S4 classes
###
exportClasses(
ArrayBlocks,
DelayedArray, DelayedMatrix,
RealizationSink, arrayRealizationSink,
RleArray, RleMatrix, RleRealizationSink
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S3 methods
###
S3method(as.array, DelayedArray)
S3method(as.character, DelayedArray)
S3method(as.complex, DelayedArray)
S3method(as.data.frame, DelayedArray)
S3method(as.integer, DelayedArray)
S3method(as.logical, DelayedArray)
S3method(as.matrix, DelayedArray)
S3method(as.numeric, DelayedArray)
S3method(as.raw, DelayedArray)
S3method(as.vector, DelayedArray)
S3method(mean, DelayedArray)
S3method(split, DelayedArray)
### We also export them thru the export() directive so that (a) they can be
### called directly, (b) tab-completion on the name of the generic shows them,
### and (c) methods() doesn't asterisk them.
export(
as.array.DelayedArray,
as.character.DelayedArray,
as.complex.DelayedArray,
as.data.frame.DelayedArray,
as.integer.DelayedArray,
as.logical.DelayedArray,
as.matrix.DelayedArray,
as.numeric.DelayedArray,
as.raw.DelayedArray,
as.vector.DelayedArray,
mean.DelayedArray,
split.DelayedArray
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S4 methods for generics not defined in DelayedArray
###
exportMethods(
## Methods for generics defined in the base package:
length, names, "names<-",
dim, "dim<-", dimnames, "dimnames<-",
as.array, as.vector, as.matrix, as.data.frame,
"[", "[[",
drop,
t,
c,
split,
is.na, is.finite, is.infinite, is.nan,
"!",
#"+", "-", "*", "/", "^", "%%", "%/%", # "Arith" group generic
"==", "!=", "<=", ">=", "<", ">", # "Compare" group generic
anyNA, which,
max, min, range, sum, prod, any, all, # "Summary" group generic
mean,
round, signif,
rowSums, colSums, rowMeans, colMeans,
nchar, tolower, toupper,
## Methods for generics defined in the methods package:
coerce, show,
## Methods for generics defined in the stats package:
dlogis, plogis, qlogis,
## Methods for generics defined in the BiocGenerics package:
cbind, rbind,
## Methods for generics defined in the S4Vectors package:
isEmpty,
## Methods for generics defined in the IRanges package:
splitAsList,
arbind, acbind
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export non-generic functions
###
export(
ArrayBlocks,
supportedRealizationBackends, getRealizationBackend, setRealizationBackend,
RealizationSink, arrayRealizationSink,
RleArray, RleRealizationSink
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S4 generics defined in DelayedArray + export corresponding methods
###
export(
matrixClass, DelayedArray, seed, subset_seed_as_array, type,
realize,
write_to_sink, close,
pmax2, pmin2, apply,
rowMaxs, colMaxs, rowMins, colMins, rowRanges, colRanges
)
### Exactly the same list as above.
exportMethods(
matrixClass, DelayedArray, seed, subset_seed_as_array, type,
realize,
write_to_sink, close,
pmax2, pmin2, apply,
rowMaxs, colMaxs, rowMins, colMins, rowRanges, colRanges
)
## Should this go in the SummarizedExperiment package? As an additional section
## in the vignette? As a separate vignette? As a man page? Probably the former.
## The problem
## ===========
##
## When trying to create a SummarizedExperiment object with big dimensions it's
## critical to use a memory-efficient container for the assay data. Depending
## on the nature of the data, in-memory containers that compress the data (e.g.
## a DataFrame of Rle's or a sparse matrix from the Matrix package) might help
## to a certain extent. However, even after compression some data might remain
## too big to fit in memory. In that case, one solution is to split the
## SummarizedExperiment object in smaller objects, then process the smaller
## objects separately, and finally combine the results. A disadvantage of this
## approach is that the split/process/combine mechanism is the responsibility
## of the SummarizedExperiment-based application so it makes the development of
## such applications more complicated. Having the assay data stored in an
## on-disk container like HDF5Matrix should greatly simplify this: the goal is
## to make it possible for the end-user to manipulate the big
## SummarizedExperiment object as a whole and have the split/process/combine
## mechanism automatically and transparently happen behind the scene .
## Comparison of assay data containers
## ===================================
##
## Each container has its strengths and weaknesses and which one to use exactly
## depends on several factors.
##
## DataFrame of Rle's
## ------------------
## Works great for coverage data. See ?GPos in GenomicRanges for an example.
## Sparse matrix object from the Matrix package
## --------------------------------------------
## This sounds like a natural candidate for RNA-seq count data which tends to
## be sparse. Unfortunately, because the Matrix package can only store the
## counts as doubles and not as integers, trying to use it on real RNA-seq
## count data actually increases the size of the matrix of counts:
library(Matrix)
library(airway)
data(airway)
head(assay(airway))
object.size(assay(airway))
object.size(Matrix(assay(airway), sparse=TRUE))
### =========================================================================
### ArrayBlocks objects
### -------------------------------------------------------------------------
###
setClass("ArrayBlocks",
contains="List",
representation(
dim="integer",
max_block_len="integer",
N="integer",
by="integer"
),
prototype(elementType="list")
)
### Return an ArrayBlocks object i.e. a collection of subarrays of the
### original array with the following properties:
### (a) The collection of blocks is a partitioning of the original array
### i.e. the blocks fully cover it and don't overlap each other.
### (b) Each block is made of adjacent elements in the original array.
### (c) Each block has a length (i.e. nb of elements) <= 'max_block_len'.
ArrayBlocks <- function(dim, max_block_len)
{
p <- cumprod(dim)
w <- which(p <= max_block_len)
N <- if (length(w) == 0L) 1L else w[[length(w)]] + 1L
if (N > length(dim)) {
by <- 1L
} else if (N == 1L) {
by <- max_block_len
} else {
by <- max_block_len %/% as.integer(p[[N - 1L]])
}
new("ArrayBlocks", dim=dim, max_block_len=max_block_len, N=N, by=by)
}
.get_ArrayBlocks_inner_length <- function(blocks)
{
ndim <- length(blocks@dim)
if (blocks@N > ndim)
return(if (any(blocks@dim == 0L)) 0L else 1L)
inner_len <- blocks@dim[[blocks@N]] %/% blocks@by
by2 <- blocks@dim[[blocks@N]] %% blocks@by
if (by2 != 0L) # 'blocks' contains truncated blocks
inner_len <- inner_len + 1L
inner_len
}
.get_ArrayBlocks_outer_length <- function(blocks)
{
ndim <- length(blocks@dim)
if (blocks@N >= ndim)
return(1L)
outer_dim <- blocks@dim[(blocks@N + 1L):ndim]
prod(outer_dim)
}
### Return the number of blocks in 'x'.
setMethod("length", "ArrayBlocks",
function(x)
.get_ArrayBlocks_inner_length(x) * .get_ArrayBlocks_outer_length(x)
)
get_block_lengths <- function(blocks)
{
p <- prod(blocks@dim[seq_len(blocks@N - 1L)])
ndim <- length(blocks@dim)
if (blocks@N > ndim)
return(p)
fb_len <- p * blocks@by # full block length
lens <- rep.int(fb_len, blocks@dim[[blocks@N]] %/% blocks@by)
by2 <- blocks@dim[[blocks@N]] %% blocks@by
if (by2 != 0L) { # 'blocks' contains truncated blocks
tb_len <- p * by2 # truncated block length
lens <- c(lens, tb_len)
}
rep.int(lens, .get_ArrayBlocks_outer_length(blocks))
}
### Return an IRanges object with 1 range per dimension.
get_block_ranges <- function(blocks, i)
{
nblock <- length(blocks)
stopifnot(isSingleInteger(i), i >= 1L, i <= nblock)
ndim <- length(blocks@dim)
ans <- IRanges(rep.int(1L, ndim), blocks@dim)
if (blocks@N > ndim)
return(ans)
i <- i - 1L
if (blocks@N < ndim) {
inner_len <- .get_ArrayBlocks_inner_length(blocks)
i1 <- i %% inner_len
i2 <- i %/% inner_len
} else {
i1 <- i
}
k1 <- i1 * blocks@by
k2 <- k1 + blocks@by
k1 <- k1 + 1L
upper_bound <- blocks@dim[[blocks@N]]
if (k2 > upper_bound)
k2 <- upper_bound
start(ans)[[blocks@N]] <- k1
end(ans)[[blocks@N]] <- k2
if (blocks@N < ndim) {
outer_dim <- blocks@dim[(blocks@N + 1L):ndim]
subindex <- arrayInd(i2 + 1L, outer_dim)
ans[(blocks@N + 1L):ndim] <- IRanges(subindex, width=1L)
}
ans
}
get_array_block_Nindex <- function(blocks, i)
{
make_Nindex_from_block_ranges(get_block_ranges(blocks, i), blocks@dim)
}
setMethod("getListElement", "ArrayBlocks",
function(x, i, exact=TRUE)
{
i <- normalizeDoubleBracketSubscript(i, x, exact=exact,
error.if.nomatch=TRUE)
get_array_block_Nindex(x, i)
}
)
setMethod("show", "ArrayBlocks",
function(object)
{
dim_in1string <- paste0(object@dim, collapse=" x ")
cat(class(object), " object with ", length(object), " blocks ",
"of length <= ", object@max_block_len, " on a ",
dim_in1string, " array:\n", sep="")
for (i in seq_along(object)) {
Nindex <- object[[i]]
cat("[[", i, "]]: [", Nindex_as_string(Nindex), "]\n",
sep="")
}
}
)
extract_array_block <- function(x, blocks, i)
{
Nindex <- get_array_block_Nindex(blocks, i)
subset_by_Nindex(x, Nindex)
}
### NOT exported but used in unit tests.
split_array_in_blocks <- function(x, max_block_len)
{
blocks <- ArrayBlocks(dim(x), max_block_len)
lapply(seq_along(blocks),
function(i) extract_array_block(x, blocks, i))
}
### NOT exported but used in unit tests.
### Rebuild the original array from the subarrays obtained by
### split_array_in_blocks() as an *ordinary* array.
### So if 'x' is an ordinary array, then:
###
### unsplit_array_from_blocks(split_array_in_blocks(x, max_block_len), x)
###
### should be a no-op for any 'max_block_len' < 'length(x)'.
unsplit_array_from_blocks <- function(subarrays, x)
{
ans <- combine_array_objects(subarrays)
dim(ans) <- dim(x)
ans
}
This diff is collapsed.
### =========================================================================
### Statistical methods for DelayedArray objects
### -------------------------------------------------------------------------
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Logistic
###
### All these methods return a DelayedArray object of the same dimensions
### as their first argument.
###
setMethod("dlogis", "DelayedArray",
function(x, location=0, scale=1, log=FALSE)
register_delayed_op(x, "dlogis",
Rargs=list(location=location, scale=scale, log=log))
)
setMethod("plogis", "DelayedArray",
function(q, location=0, scale=1, lower.tail=TRUE, log.p=FALSE)
register_delayed_op(q, "plogis",
Rargs=list(location=location, scale=scale,
lower.tail=lower.tail, log.p=log.p))
)
setMethod("qlogis", "DelayedArray",
function(p, location=0, scale=1, lower.tail=TRUE, log.p=FALSE)
register_delayed_op(p, "qlogis",
Rargs=list(location=location, scale=scale,
lower.tail=lower.tail, log.p=log.p))
)
This diff is collapsed.
### =========================================================================
### Statistical/summarization methods for DelayedMatrix objects
### -------------------------------------------------------------------------
###
### Raise an error if invalid input type. Otherwise return "integer",
### "numeric", "double", or "complex".
.get_ans_type <- function(x, must.be.numeric=FALSE)
{
x_type <- type(x)
ans_type <- switch(x_type,
logical="integer",
integer=, numeric=, double=, complex=x_type,
stop(wmsg("operation not supported on matrices of type ", x_type)))
if (must.be.numeric && !is.numeric(get(ans_type)(0)))
stop(wmsg("operation not supported on matrices of type ", x_type))
ans_type
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### row/colSums() and row/colMeans()
###
.normarg_dims <- function(dims, method)
{
if (!identical(dims, 1))
stop("\"", method, "\" method for DelayedMatrix objects ",
"does not support the 'dims' argument yet")
}
### row/colSums()
.DelayedMatrix_block_rowSums <- function(x, na.rm=FALSE, dims=1)
{
.normarg_dims(dims, "rowSums")
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_colSums(t(x), na.rm=na.rm, dims=dims))
.get_ans_type(x) # check input type
APPLY <- function(m) rowSums(m, na.rm=na.rm)
COMBINE <- function(b, m, init, reduced) { init + reduced }
init <- numeric(nrow(x))
ans <- colblock_APPLY_and_COMBINE(x, APPLY, COMBINE, init)
setNames(ans, rownames(x))
}
.DelayedMatrix_block_colSums <- function(x, na.rm=FALSE, dims=1)
{
.normarg_dims(dims, "colSums")
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_rowSums(t(x), na.rm=na.rm, dims=dims))
.get_ans_type(x) # check input type
colsums_list <- colblock_APPLY(x, colSums, na.rm=na.rm)
if (length(colsums_list) == 0L)
return(numeric(ncol(x)))
unlist(colsums_list, recursive=FALSE)
}
setMethod("rowSums", "DelayedMatrix", .DelayedMatrix_block_rowSums)
setMethod("colSums", "DelayedMatrix", .DelayedMatrix_block_colSums)
### row/colMeans()
.DelayedMatrix_block_rowMeans <- function(x, na.rm=FALSE, dims=1)
{
.normarg_dims(dims, "rowMeans")
if (is(x, "DelayedMatrix") && x@is_transposed)
return(.DelayedMatrix_block_colMeans(t(x), na.rm=na.rm, dims=dims))
.get_ans_type(x) # check input type
APPLY <- function(m) {
m_sums <- rowSums(m, na.rm=na.rm)
m_nvals <- ncol(m)
if (na.rm)
m_nvals <- m_nvals - rowSums(is.na(m))
cbind(m_sums, m_nvals)
}
COMBINE <- function(b, m, init, reduced) { init + reduced }
init <- cbind(
numeric(nrow(x)), # sums
numeric(nrow(x)) # nvals
)
ans <- colblock_APPLY_and_COMBINE(x, APPLY, COMBINE, init)
setNames(ans[ , 1L] / ans[ , 2L], rownames(x))
}
.DelayedMatrix_block_colMeans <- function(x, na.rm=FALSE, dims=1)
{
.normarg_dims(dims, "colMeans")
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_rowMeans(t(x), na.rm=na.rm, dims=dims))
.get_ans_type(x) # check input type
colmeans_list <- colblock_APPLY(x, colMeans, na.rm=na.rm)
if (length(colmeans_list) == 0L)
return(rep.int(NaN, ncol(x)))
unlist(colmeans_list, recursive=FALSE)
}
setMethod("rowMeans", "DelayedMatrix", .DelayedMatrix_block_rowMeans)
setMethod("colMeans", "DelayedMatrix", .DelayedMatrix_block_colMeans)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Row/column summarization from the matrixStats package
###
### row/colMaxs(), row/colMins(), row/colRanges(),
### row/colProds(), row/colAnys(), row/colAlls(), row/colMedians()
###
.fix_type <- function(x, ans_type)
{
if (ans_type == "integer" && !is.integer(x) && all(is.finite(x)))
storage.mode(x) <- ans_type
x
}
### row/colMaxs()
.DelayedMatrix_block_rowMaxs <- function(x, rows=NULL, cols=NULL,
na.rm=FALSE, dim.=dim(x))
{
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_colMaxs(t(x), rows=rows, cols=cols,
na.rm=na.rm, dim.=dim.))
ans_type <- .get_ans_type(x, must.be.numeric=TRUE)
APPLY <- function(m) rowMaxs(m, na.rm=na.rm)
COMBINE <- function(b, m, init, reduced)
.fix_type(pmax(init, reduced), ans_type)
init <- .fix_type(rep.int(-Inf, nrow(x)), ans_type)
ans <- colblock_APPLY_and_COMBINE(x, APPLY, COMBINE, init)
setNames(ans, rownames(x))
}
.DelayedMatrix_block_colMaxs <- function(x, rows=NULL, cols=NULL,
na.rm=FALSE, dim.=dim(x))
{
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_rowMaxs(t(x), rows=rows, cols=cols,
na.rm=na.rm, dim.=dim.))
ans_type <- .get_ans_type(x, must.be.numeric=TRUE)
colmaxs_list <- colblock_APPLY(x, colMaxs, na.rm=na.rm)
if (length(colmaxs_list) == 0L)
return(.fix_type(rep.int(-Inf, ncol(x)), ans_type))
unlist(colmaxs_list, recursive=FALSE)
}
setGeneric("rowMaxs", signature="x")
setGeneric("colMaxs", signature="x")
setMethod("rowMaxs", "DelayedMatrix", .DelayedMatrix_block_rowMaxs)
setMethod("colMaxs", "DelayedMatrix", .DelayedMatrix_block_colMaxs)
### row/colMins()
.DelayedMatrix_block_rowMins <- function(x, rows=NULL, cols=NULL,
na.rm=FALSE, dim.=dim(x))
{
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_colMins(t(x), rows=rows, cols=cols,
na.rm=na.rm, dim.=dim.))
ans_type <- .get_ans_type(x, must.be.numeric=TRUE)
APPLY <- function(m) rowMins(m, na.rm=na.rm)
COMBINE <- function(b, m, init, reduced)
.fix_type(pmin(init, reduced), ans_type)
init <- .fix_type(rep.int(Inf, nrow(x)), ans_type)
ans <- colblock_APPLY_and_COMBINE(x, APPLY, COMBINE, init)
setNames(ans, rownames(x))
}
.DelayedMatrix_block_colMins <- function(x, rows=NULL, cols=NULL,
na.rm=FALSE, dim.=dim(x))
{
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_rowMins(t(x), rows=rows, cols=cols,
na.rm=na.rm, dim.=dim.))
ans_type <- .get_ans_type(x, must.be.numeric=TRUE)
colmins_list <- colblock_APPLY(x, colMins, na.rm=na.rm)
if (length(colmins_list) == 0L)
return(.fix_type(rep.int(Inf, ncol(x)), ans_type))
unlist(colmins_list, recursive=FALSE)
}
setGeneric("rowMins", signature="x")
setGeneric("colMins", signature="x")
setMethod("rowMins", "DelayedMatrix", .DelayedMatrix_block_rowMins)
setMethod("colMins", "DelayedMatrix", .DelayedMatrix_block_colMins)
### row/colRanges()
.DelayedMatrix_block_rowRanges <- function(x, rows=NULL, cols=NULL,
na.rm=FALSE, dim.=dim(x))
{
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_colRanges(t(x), rows=rows, cols=cols,
na.rm=na.rm, dim.=dim.))
ans_type <- .get_ans_type(x, must.be.numeric=TRUE)
APPLY <- function(m) rowRanges(m, na.rm=na.rm)
COMBINE <- function(b, m, init, reduced) {
.fix_type(cbind(pmin(init[ , 1L], reduced[ , 1L]),
pmax(init[ , 2L], reduced[ , 2L])),
ans_type)
}
init <- .fix_type(matrix(rep(c(Inf, -Inf), each=nrow(x)), ncol=2L),
ans_type)
ans <- colblock_APPLY_and_COMBINE(x, APPLY, COMBINE, init)
setNames(ans, rownames(x))
}
.DelayedMatrix_block_colRanges <- function(x, rows=NULL, cols=NULL,
na.rm=FALSE, dim.=dim(x))
{
if (is(x, "DelayedArray") && x@is_transposed)
return(.DelayedMatrix_block_rowRanges(t(x), rows=rows, cols=cols,
na.rm=na.rm, dim.=dim.))
ans_type <- .get_ans_type(x, must.be.numeric=TRUE)
colranges_list <- colblock_APPLY(x, colRanges, na.rm=na.rm)
if (length(colranges_list) == 0L)
return(.fix_type(matrix(rep(c(Inf, -Inf), each=ncol(x)), ncol=2L),
ans_type))
do.call(rbind, colranges_list)
}
.rowRanges.useAsDefault <- function(x, ...) matrixStats::rowRanges(x, ...)
setGeneric("rowRanges", signature="x",
function(x, ...) standardGeneric("rowRanges"),
useAsDefault=.rowRanges.useAsDefault
)
setGeneric("colRanges", signature="x")
setMethod("rowRanges", "DelayedMatrix", .DelayedMatrix_block_rowRanges)
setMethod("colRanges", "DelayedMatrix", .DelayedMatrix_block_colRanges)
### TODO: Add more row/column summarization generics/methods.
### =========================================================================
### Common operations on DelayedMatrix objects
### -------------------------------------------------------------------------
###
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Matrix multiplication
###
### We only support multiplication of an ordinary matrix (typically
### small) by a DelayedMatrix object (typically big). Multiplication of 2
### DelayedMatrix objects is not supported.
###
.DelayedMatrix_block_mult_by_left_matrix <- function(x, y)
{
stopifnot(is.matrix(x),
is(y, "DelayedMatrix") || is.matrix(y),
ncol(x) == nrow(y))
ans_dim <- c(nrow(x), ncol(y))
ans_dimnames <- list(rownames(x), colnames(y))