Commit 864c6537 authored by Andreas Tille's avatar Andreas Tille

New upstream version 1.12.2

parent 0bd46bdf
Package: GenomeInfoDb
Title: Utilities for manipulating chromosome and other 'seqname'
identifiers
Description: Contains data and functions that
define and allow translation between different chromosome
sequence naming conventions (e.g., "chr1" versus "1"),
including a function that attempts to place sequence names in
their natural, rather than lexicographic, order.
Version: 1.12.2
Encoding: UTF-8
Author: Sonali Arora, Martin Morgan, Marc Carlson, H. Pagès
Maintainer: Bioconductor Package Maintainer <maintainer@bioconductor.org>
biocViews: Genetics, DataRepresentation, Annotation, GenomeAnnotation
Depends: R (>= 3.1), methods, BiocGenerics (>= 0.13.8), S4Vectors (>=
0.9.25), IRanges (>= 1.99.26)
Imports: stats, stats4, utils, RCurl, GenomeInfoDbData
Suggests: GenomicRanges, Rsamtools, GenomicAlignments, BSgenome,
GenomicFeatures, BSgenome.Scerevisiae.UCSC.sacCer2,
BSgenome.Celegans.UCSC.ce2, BSgenome.Hsapiens.NCBI.GRCh38,
TxDb.Dmelanogaster.UCSC.dm3.ensGene, RUnit, BiocStyle, knitr
License: Artistic-2.0
Collate: utils.R rankSeqlevels.R assembly-utils.R available.species.R
fetchExtendedChromInfoFromUCSC.R fetchSequenceInfo.R seqinfo.R
seqlevelsStyle.R seqlevels-wrappers.R Seqinfo-class.R
GenomeDescription-class.R mapGenomeBuilds.R zzz.R
VignetteBuilder: knitr
Video: http://youtu.be/wdEjCYSXa7w
NeedsCompilation: no
Packaged: 2017-06-08 23:29:48 UTC; biocbuild
import(methods)
importFrom(utils, data, download.file, read.delim, read.table, read.csv)
importFrom(stats, setNames)
importFrom(stats4, summary)
importFrom(RCurl, getURL)
import(BiocGenerics)
import(S4Vectors)
import(IRanges) # for CompressedList
import(GenomeInfoDbData)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export non-generic functions
###
export(
## rankSeqlevels.R:
orderSeqlevels,
rankSeqlevels,
## fetchExtendedChromInfoFromUCSC.R:
fetchExtendedChromInfoFromUCSC,
## mapGenomeBuilds.R
mapGenomeBuilds,
genomeBuilds,
listOrganisms,
## seqlevelsStyle.R:
genomeStyles,
extractSeqlevels,
extractSeqlevelsByGroup,
mapSeqlevels,
seqlevelsInGroup,
## seqlevels-wrappers.R:
keepSeqlevels, dropSeqlevels, renameSeqlevels, restoreSeqlevels,
standardChromosomes, keepStandardChromosomes,
## Seqinfo-class.R:
Seqinfo,
## GenomeDescription-class.R:
GenomeDescription
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S4 classes
###
exportClasses(
Seqinfo,
GenomeDescription
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S3 methods
###
S3method(summary, Seqinfo)
### We also export them thru the export() directive so that (a) they can be
### called directly, (b) tab-completion on the name of the generic shows them,
### and (c) methods() doesn't asterisk them.
export(
summary.Seqinfo
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S4 methods for generics not defined in GenomeInfoDb
###
exportMethods(
length,
names, "names<-",
"[",
as.data.frame,
show,
merge,
## Generics defined in the stats4 package:
summary,
## Generics defined in the BiocGenerics package:
intersect, organism, species
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Export S4 generics defined in GenomeInfoDb + export corresponding methods
###
export(
## seqinfo.R:
seqinfo, "seqinfo<-",
seqnames, "seqnames<-",
seqlevels, "seqlevels<-",
sortSeqlevels,
seqlevelsInUse,
seqlevels0,
seqlengths, "seqlengths<-",
isCircular, "isCircular<-",
genome, "genome<-",
## seqlevelsStyle.R:
seqlevelsStyle, "seqlevelsStyle<-",
## GenomeDescription-class.R:
commonName, provider, providerVersion,
releaseDate, releaseName, bsgenomeName,
available.species
)
exportMethods(
seqinfo,
seqnames, "seqnames<-",
seqlevels, "seqlevels<-",
sortSeqlevels,
seqlevelsInUse,
seqlengths, "seqlengths<-",
isCircular, "isCircular<-",
genome, "genome<-",
seqlevelsStyle, "seqlevelsStyle<-",
commonName, provider, providerVersion,
releaseDate, releaseName, bsgenomeName
)
CHANGES IN VERSION 1.12.0
------------------------------
NEW FEATURES
o Add function standardChromosomes()
o seqlevels() setter now supports "fine" and "tidy" modes on GRangesList and
GAlignmentsList objects
o Add assembly_accessions dataset
MODIFICATIONS
o Updated mapping table between UCSC and Ensembl to include recent builds
o Use https instead of http to fetch stuff from NCBI
o Replace 'force=TRUE' with 'pruning.mode="coarse"' in seqlevels() setter
o Add 'pruning.mode' argument to the keepSeqlevels(), dropSeqlevels(), and
keepStandardChromosomes() functions. IMPORTANT NOTE: Like for the
seqlevels() setter, the default pruning mode is "error", which means that
now these functions fail when some of the seqlevels to drop from 'x' are in
use. The old behavior was to silently prune 'x' (doing "coarse" pruning)
o Update files in data directory
o Updated internal functions .lookup_refseq_assembly_accession() and
fetch_assembly_report() for speed and efficiency
o move some files from GenomeInfoDb/data/ to GenomeInfoDbData annotation package
BUG FIXES
o fetch_assembly_summary() updated to work with recent changes to format of
files assembly_summary_genbank.txt and assembly_summary_refseq.txt
CHANGES IN VERSION 1.10.0
------------------------------
NEW FEATURES
o Add function mapGenomeBuilds() that maps between UCSC and Ensembl
builds.
o Add function genomeBuilds() that lists all the available UCSC or Ensembl
builds for a given organism[s] that can be used in mapGenomeBuilds()
o Add listOrganisms() that lists all currently available organism[s] included
for use in genomeBuilds()
DEPRECATED AND DEFUNCT
o After being deprecated, the species() method for GenomeDescription objects
is now defunct
MODIFICATIONS
o Zebra finch is removed as an option for
fetchExtendedChromInfoFromUCSC() as it is not supported yet
o keepStandardChromosomes() chooses first style when multiple are matched
BUG FIXES
o Fix WARNING occurring when determining style in keepStandardChromosomes()
CHANGES IN VERSION 0.99.7
----------------------------------
MODIFICATIONS
o rename:
isSupportedSeqnames -> .isSupportedSeqnames
supportedSeqnameStyles -> .supportedSeqnameStyles
supportedSeqnameMappings -> .supportedSeqnameMappings
isSupportedSeqnamesStyle -> .isSupportedSeqnamesStyle
CHANGES IN VERSION 0.99.6
----------------------------------
NEW FEATURES
o add new functions()
seqnamesInGroup which will take a character vector of chromosomes and
return the chromosomes specified by the group parameter supplied by the
user. The user can also give the species and the style.
seqnamesOrder() internally calls Herve's function makeSeqnameIds()
o add seqnameStyles generic and method from GenomicRanges
MODIFICATIONS
o rename:
testSeqnames -> isSupportedSeqnames
o move SeqnamesStyle generic from GenomicRanges and define a new method which
works on a character vector.
DEPRECATED AND DEFUNCT
o deprecate listAllSupportedStylesBySpecies(),
listAllSupportedSeqnameStyles(), supportedOrganisms()
supportedSeqnameMappingsWithGroup()
o deprecate supportedSeqnameMappings(), supportedSeqnameStyles(),
isSupportedSeqnamesStyle(), isSupportedSeqnames()
CHANGES IN VERSION 0.99.17
----------------------------------
MODIFICATIONS
o keepStandardChromosomes: Make 'species' argument optional and remove
'style' argument.
CHANGES IN VERSION 0.99.14
----------------------------------
MODIFICATIONS
o rename:
package: Seqnames --> GenomeInfoDb
supportedStyles -> genomeStyles
makeSeqnameIds --> rankSeqlevels (add to export)
seqnamesOrder --> orderSeqlevels
extractSeqnameSet -> extractSeqlevels
extractSeqnameSetByGroup -> extractSeqlevelsByGroup
findSequenceRenamingMaps --> mapSeqlevels
seqnamesInGroup --> seqlevelsInGroup
seqnamesStyle --> seqlevelsStyle
"seqnameStyle<-" --> "seqlevelsStyle<-"
CHANGES IN VERSION 0.99.1
------------------------------
NEW FEATURES
o added new functions:
supportedOrganisms()
supportedSeqnameMappingsWithGroup()
extractSeqnameSetByGroup()
MODIFICATIONS
o The Seqnames package will have functions which will be moved from
AnnotationDbi , GenomicRanges
o List of 9 functions moved from AnnotationDbi
supportedSeqnameMappings, findSequenceRenamingMaps,
supportedSeqnameStyles, supportedSeqnames,
extractSeqnameSet, testSeqnames, isSupportedSeqnamesStyle,
listAllSupportedStylesBySpecies, listAllSupportedSeqnameStyles.
o makeSeqnameIds moved from GenomicRanges
o keepStandardChromosomes moved from GenomicRanges
o rename:
keepStandardChromosomes -> keepChromosomes
### =========================================================================
### The "GenomeDescription" class
### -------------------------------------------------------------------------
### S4 class bundling the descriptive metadata of a genome together with
### the Seqinfo object describing its sequences. All slots except 'seqinfo'
### are declared as character.
setClass("GenomeDescription",
    slots=c(
        ## Scientific name, e.g. "Homo sapiens", "Mus musculus".
        organism="character",

        ## Vernacular name, e.g. "Human", "Mouse".
        common_name="character",

        ## Source of the assembly, e.g. "UCSC", "BDGP".
        provider="character",

        ## Provider-side identifier, e.g. "hg18", "mm8", "sacCer1".
        provider_version="character",

        ## Release date as free text, e.g. "Mar. 2006", "Oct. 2003".
        release_date="character",

        ## Release name, e.g. "NCBI Build 36.1", "NCBI Build 36",
        ## "SGD 1 Oct 2003 sequence".
        release_name="character",

        ## Names, lengths, and circularity flags of the genome sequences.
        seqinfo="Seqinfo"
    )
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Accessor methods.
###
### Slot getters. Each method simply returns the content of the slot it is
### named after; the generics that are not already available are created
### right before their method.

## organism(): method for an existing generic.
setMethod("organism", "GenomeDescription",
    function(object) object@organism
)

## commonName(): returns the 'common_name' slot.
setGeneric("commonName",
    function(object) standardGeneric("commonName")
)
setMethod("commonName", "GenomeDescription",
    function(object) object@common_name
)

## provider(): returns the 'provider' slot.
setGeneric("provider",
    function(x) standardGeneric("provider")
)
setMethod("provider", "GenomeDescription",
    function(x) x@provider
)

## providerVersion(): returns the 'provider_version' slot.
setGeneric("providerVersion",
    function(x) standardGeneric("providerVersion")
)
setMethod("providerVersion", "GenomeDescription",
    function(x) x@provider_version
)

## releaseDate(): returns the 'release_date' slot.
setGeneric("releaseDate",
    function(x) standardGeneric("releaseDate")
)
setMethod("releaseDate", "GenomeDescription",
    function(x) x@release_date
)

## releaseName(): returns the 'release_name' slot.
setGeneric("releaseName",
    function(x) standardGeneric("releaseName")
)
setMethod("releaseName", "GenomeDescription",
    function(x) x@release_name
)
### bsgenomeName(): builds a dot-separated name of the form
###   BSgenome.<Xyyy>.<provider>.<provider version>
### where <Xyyy> is the first letter of the first word of organism(x)
### immediately followed by its second word (words are split on single
### spaces).
setGeneric("bsgenomeName", function(x) standardGeneric("bsgenomeName"))

setMethod("bsgenomeName", "GenomeDescription",
    function(x)
    {
        words <- strsplit(organism(x), " ", fixed=TRUE)[[1L]]
        abbrev <- paste0(substr(words[1L], start=1L, stop=1L), words[2L])
        paste("BSgenome", abbrev, provider(x), providerVersion(x), sep=".")
    }
)
### seqinfo(): returns the content of the 'seqinfo' slot.
setMethod("seqinfo", "GenomeDescription",
    function(x) x@seqinfo
)

### seqnames(): delegates to seqnames() on the object's seqinfo, but
### returns NULL rather than a zero-length result.
setMethod("seqnames", "GenomeDescription",
    function(x)
    {
        seq_ids <- seqnames(seqinfo(x))
        if (length(seq_ids) == 0L)
            return(NULL)
        seq_ids
    }
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Validity.
###
### Validity method: every slot other than 'seqinfo' must pass
### isSingleStringOrNA(). Returns NULL when all slots pass, otherwise a
### character vector with one message per offending slot.
setValidity("GenomeDescription",
    function(object)
    {
        string_slots <- setdiff(slotNames("GenomeDescription"), "seqinfo")
        problems <- lapply(string_slots,
            function(nm) {
                if (isSingleStringOrNA(slot(object, nm)))
                    return(NULL)
                paste0("slot '", nm, "' must be a ",
                       "single string (or NA)")
            }
        )
        ## unlist() drops the NULLs; an all-NULL list collapses to NULL.
        unlist(problems)
    }
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Constructor-like functions
###
### Constructor for GenomeDescription objects.
###
### NOTE: In BioC 3.1, the 'species' argument was replaced with the
### 'common_name' argument but the former was kept for backward compatibility
### (essentially with existing SNPlocs and XtraSNPlocs packages).
### TODO: At some point the 'species' argument needs to be deprecated.
###
### The literal string "NA" supplied for 'organism', 'common_name',
### 'release_date' or 'release_name' is replaced with NA_character_.
### When 'common_name' is not supplied, the value of 'species' is used.
GenomeDescription <- function(organism, common_name,
                              provider, provider_version,
                              release_date, release_name,
                              seqinfo,
                              species=NA_character_)
{
    ## Turn the string "NA" into a real missing string; leave anything
    ## else untouched.
    .string_NA_to_NA <- function(value)
    {
        if (identical(value, "NA")) NA_character_ else value
    }

    organism <- .string_NA_to_NA(organism)
    if (missing(common_name))
        common_name <- species
    common_name <- .string_NA_to_NA(common_name)
    release_date <- .string_NA_to_NA(release_date)
    release_name <- .string_NA_to_NA(release_name)

    new("GenomeDescription",
        organism=organism,
        common_name=common_name,
        provider=provider,
        provider_version=provider_version,
        release_date=release_date,
        release_name=release_name,
        seqinfo=seqinfo)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The 'show' method
###
### NOT exported but used in BSgenome package.
### Kind of very low-level. Could go into S4Vectors if someone else needed
### this...
###
### Prints a named atomic vector on two lines (names on the first, values
### on the second), each prefixed with 'margin'. At most
### max(1, (getOption("width") - nchar(margin)) %/% 2) leading elements are
### taken, plus trailing elements when 'x' is longer than that; the final
### rendering of each line is delegated to S4Vectors:::labeledLine().
compactPrintNamedAtomicVector <- function(x, margin="")
{
    n <- length(x)
    half <- (getOption("width") - nchar(margin)) %/% 2L
    n_head <- max(1L, half)

    labels <- as.character(head(names(x), n_head))
    values <- as.character(head(x, n_head))
    if (n > n_head) {
        ## Append elements taken from the end of 'x'.
        n_tail <- min(n - n_head, half)
        labels <- c(labels, as.character(tail(names(x), n_tail)))
        values <- c(values, as.character(tail(x, n_tail)))
    }

    ## Row 1 holds the names, row 2 the values; formatting the matrix as a
    ## whole pads all cells to a common width.
    cells <- format(rbind(labels, values, deparse.level=0),
                    justify="right")
    for (i in 1:2)
        cat(S4Vectors:::labeledLine(margin, cells[i, ], count=FALSE,
                                    labelSep=""), sep="")
}
### NOT exported (but used in the BSgenome package).
###
### Prints the organism (with common name), provider, provider version,
### release date and release name of 'x', one per line, each line prefixed
### with 'margin'. When 'print.seqlengths' is TRUE, a separator and the
### seqlengths of 'x' (via compactPrintNamedAtomicVector()) follow.
showGenomeDescription <- function(x, margin="", print.seqlengths=FALSE)
{
    ## Emit one chunk of output, always starting with the margin.
    emit <- function(...) cat(margin, ..., sep="")

    emit("organism: ", organism(x), " (", commonName(x), ")\n")
    emit("provider: ", provider(x), "\n")
    emit("provider version: ", providerVersion(x), "\n")
    emit("release date: ", releaseDate(x), "\n")
    emit("release name: ", releaseName(x), "\n")

    if (!print.seqlengths)
        return(invisible(NULL))

    emit("---\n")
    emit("seqlengths:\n")
    compactPrintNamedAtomicVector(seqlengths(x), margin=margin)
}
### show(): prints the description with a "| " margin on every line and
### with the seqlengths section included.
setMethod("show", "GenomeDescription",
    function(object)
        showGenomeDescription(object, margin="| ", print.seqlengths=TRUE)
)
This diff is collapsed.
This diff is collapsed.
### =========================================================================
### Helpers to map between genus, species and taxonomy ID
### -------------------------------------------------------------------------
## In February 2017 the mapping files in GenomeInfoDb/data/ were moved to the
## GenomeInfoDbData annotation package.
### Returns the row of 'specData' whose first column equals 'id'.
### 'specData' is loaded from GenomeInfoDbData if no object of that name
### is visible yet. When several rows match, the first row whose 'species'
### value does not split into more than one piece on a single space is
### preferred; if every matching row splits, the first match is returned.
### Raises an error when nothing matches.
.lookupSpeciesFromTaxId <- function(id) {
    if (!exists("specData")) {
        data(specData, package = "GenomeInfoDbData")
    }

    hits <- specData[specData[,1] == id,]
    n_hits <- dim(hits)[1]
    if (n_hits < 1)
        stop(paste0("Cannot find a species to match the requested",
                    " taxonomy id. Please provide the genus and species",
                    " manually."))
    if (n_hits == 1)
        return(hits[1,])

    ## More than one candidate: flag those whose species string is made
    ## of several space-separated pieces.
    pieces <- strsplit(as.character(hits$species), split=" ")
    multi_word <- vapply(pieces, length, integer(1)) > 1L
    if (!all(multi_word))
        hits <- hits[!multi_word,]
    hits[1,]
}
### Returns the 'speciesMap' table, loading it from GenomeInfoDbData first
### if no object of that name is visible yet.
available.species <- function()
{
    map_is_loaded <- exists("speciesMap")
    if (!map_is_loaded) {
        data(speciesMap, package="GenomeInfoDbData")
    }
    return(speciesMap)
}
### Maps a species name to its integer taxonomy id using the 'speciesMap'
### table (columns 'species' and 'taxon'), which is loaded from
### GenomeInfoDbData if no object of that name is visible yet.
###
### 'species' is expected to be a single string or NA (use .taxonomyId()
### for vectors). NA is returned unchanged as NA. Before the lookup, commas
### are turned into spaces and runs of spaces are collapsed to one space.
### An error is raised when the normalized name is not in 'speciesMap'.
.getTaxonomyId <- function(species) {
    if (is.na(species))
        return(NA)
    if (!exists("speciesMap"))
        data(speciesMap, package="GenomeInfoDbData")

    ## Replace commas BEFORE collapsing repeated spaces: doing it the other
    ## way round turns "Genus, species" into "Genus  species" (two spaces),
    ## which can never match.
    species <- gsub(",", " ", species, fixed=TRUE)
    species <- gsub(" {2,}", " ", species)

    idx <- match(species, speciesMap$species)
    unknown <- is.na(idx)
    if (any(unknown))
        ## List the offending names once, followed by a single hint.
        stop(sum(unknown), " unknown species: ",
             paste(sQuote(head(species[unknown])), collapse=" "),
             " ",
             paste0("Please use 'available.species' to see viable",
                    " species names or tax Ids"))
    as.integer(speciesMap$taxon[idx])
}
### Vectorized front end to .getTaxonomyId(): looks up each element of
### 'species' separately and flattens the results with unlist().
.taxonomyId <- function(species)
{
    ids <- lapply(species, .getTaxonomyId)
    unlist(ids)
}
### Raises an error unless 'taxId' is NA or belongs to 'validTaxIds'
### (loaded from GenomeInfoDbData if no object of that name is visible
### yet). Returns NULL invisibly when the id is accepted.
.checkForAValidTaxonomyId <- function(taxId)
{
    if (!exists("validTaxIds")) {
        data(validTaxIds, package = "GenomeInfoDbData")
    }

    ## NA is always accepted.
    accepted_ids <- c(validTaxIds, NA_integer_)
    if (taxId %in% accepted_ids)
        return(invisible(NULL))

    stop(wmsg(paste0("The taxonomy Id you have provided (",taxId,")",
                     " is not in our list of valid Tax Ids.",
                     " Please check to make sure that your tax ID",
                     " is really legitimate and if so, then please tell",
                     " us about it so that we can update our list.")))
}
This diff is collapsed.
### =========================================================================
### fetchSequenceInfo()
### -------------------------------------------------------------------------
### Calls fetchExtendedChromInfoFromUCSC() (quietly) for 'genome' and
### repackages three of its columns into a data frame with columns
### 'seqnames', 'seqlengths', 'is_circular' and 'genome'.
.fetch_sequence_info_for_UCSC_genome <- function(genome,
                goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath")
{
    chrominfo <- fetchExtendedChromInfoFromUCSC(genome,
                                                goldenPath_url=goldenPath_url,
                                                quiet=TRUE)
    seq_ids <- chrominfo[ , "UCSC_seqlevel"]
    seq_sizes <- chrominfo[ , "UCSC_seqlength"]
    circ_flags <- chrominfo[ , "circular"]
    data.frame(seqnames=seq_ids,
               seqlengths=seq_sizes,
               is_circular=circ_flags,
               genome=genome,
               stringsAsFactors=FALSE)
}
#.fetch_sequence_info_for_NCBI_genome <- function(refseq_assembly_accession,
# AssemblyUnits,
# circ_seqs)
#{
# assembly_report <- fetch_assembly_report(refseq_assembly_accession,
# AssemblyUnits=AssemblyUnits)
# ans_seqnames <- as.character(assembly_report[ , "SequenceName"])
#
#}
#
#SUPPORTED_NCBI_GENOMES <- list(
# GRCh38=
# list(refseq_assembly_accession="GCF_000001405.26", circular="MT")
#)
### Returns a data frame.
### Only supports UCSC genomes for now (the same genomes that are supported
### by fetchExtendedChromInfoFromUCSC()).
### NOT exported.
###
### 'genome' must be a single non-empty string. It is looked up in the
### names of SUPPORTED_UCSC_GENOMES; a hit is forwarded to
### .fetch_sequence_info_for_UCSC_genome(), anything else is an error.
fetchSequenceInfo <- function(genome)
{
    if (!isSingleString(genome) || genome == "")
        stop("'genome' must be a single non-empty string")

    is_ucsc_genome <- genome %in% names(SUPPORTED_UCSC_GENOMES)
    if (is_ucsc_genome)
        return(.fetch_sequence_info_for_UCSC_genome(genome))

    ## Support for NCBI genomes is not enabled yet:
    #idx <- match(genome, names(SUPPORTED_NCBI_GENOMES))
    #if (!is.na(idx)) {
    #    supported_genome <- SUPPORTED_NCBI_GENOMES[[idx]]
    #    refseq_assembly_accession <- supported_genome$refseq_assembly_accession
    #    AssemblyUnits <- supported_genome$AssemblyUnits
    #    circ_seqs <- supported_genome$circular
    #    return(.fetch_sequence_info_for_NCBI_genome(refseq_assembly_accession,
    #                                                AssemblyUnits,
    #                                                circ_seqs))
    #}

    stop("genome \"", genome, "\" is not supported")
}
### Reads extdata/dataFiles/genomeMappingTbl.csv from the GenomeInfoDb
### installation and returns the unique combinations found in its first
### two columns, with the first letter of the second column upper-cased
### and row names reset.
listOrganisms <- function()
{
    csv_path <- system.file(package="GenomeInfoDb", "extdata",
                            "dataFiles", "genomeMappingTbl.csv")
    mapping <- read.csv(csv_path, header=TRUE, stringsAsFactors=FALSE)

    orgs <- unique(mapping[,1:2])
    rownames(orgs) <- NULL

    ## Capitalize the first character of the second column.
    second <- orgs[,2]
    orgs[,2] <- paste0(toupper(substring(second, 1, 1)),
                       substring(second, 2, nchar(second)))
    orgs
}
### Lists the genome builds recorded in extdata/dataFiles/genomeMappingTbl.csv
### for one or more organisms.
###
### Arguments:
###   organism  Character vector (required). Each element is compared,
###             case-insensitively, with the 'commonName' and 'organism'
###             columns of the mapping table.
###   style     "UCSC" (default) or "Ensembl"; selects whether the 'ucscID'
###             or the 'ensemblID' column is reported.
###
### Value: a data frame with columns 'commonName', 'organism' and the
### selected id column, restricted to rows whose common name or organism
### equals one of the requested values, with incomplete and duplicated rows
### removed and the first letter of 'organism' upper-cased. An empty
### data.frame() is returned when no row is selected.
###
### A warning is issued for every requested value that does not match any
### common name or organism, even as a regular expression.
genomeBuilds <- function(organism, style = c("UCSC", "Ensembl")) {

    ## Note that this forces 'organism', so omitting it is an error.
    if (!is.character(organism))
        stop("'organism' must be a character vector")
    style <- match.arg(style)

    filename <- system.file(package="GenomeInfoDb", "extdata",
                            "dataFiles", "genomeMappingTbl.csv")
    tbl <- read.csv(filename, header=TRUE, stringsAsFactors=FALSE)

    colkeep <- switch(style,
                      UCSC="ucscID",
                      Ensembl="ensemblID")

    query <- tolower(organism)
    common_names <- tolower(tbl$commonName)
    organisms <- tolower(tbl$organism)

    ## Count regular-expression hits per requested value. vapply() keeps
    ## the result a plain integer vector whatever the number of hits, so no
    ## recycling between differently shaped results can occur.
    n_hits <- vapply(query,
                     function(pattern)
                         length(grep(pattern, common_names)) +
                         length(grep(pattern, organisms)),
                     integer(1))
    notFnd <- query[n_hits == 0L]
    if (length(notFnd) > 0L)
        warning("'organism' not found: ", paste(notFnd, collapse=", "),
                call.=FALSE)

    ## Rows are selected on exact (case-insensitive) equality.
    rowkeep <- common_names %in% query | organisms %in% query
    tbl <- tbl[rowkeep, c("commonName", "organism", colkeep)]
    if (nrow(tbl) == 0L)
        return(data.frame())

    tbl <- unique(na.omit(tbl))
    rownames(tbl) <- NULL
    tbl[,2] <- paste0(toupper(substring(tbl[,2], 1, 1)),
                      substring(tbl[,2], 2, nchar(tbl[,2])))
    tbl
}
mapGenomeBuilds <- function(genome, style = c("UCSC", "Ensembl") ){
if (!is.character(genome))
stop("'genome' must be a character vector")
genome <- tolower(genome)
style <- match.arg(style)
filename <- system.file(package="GenomeInfoDb", "extdata",
"dataFiles", "genomeMappingTbl.csv")
tbl <- read.csv(filename, header=TRUE, stringsAsFactors=FALSE)
colkeep <- switch(tolower(style),
ucsc=c("ucscID","ucscDate","ensemblID"),