Commit ae5c4984 authored by Andreas Tille's avatar Andreas Tille

New upstream version 1.16.0

parent 1f678aee
......@@ -6,24 +6,24 @@ Description: Contains data and functions that
sequence naming conventions (e.g., "chr1" versus "1"),
including a function that attempts to place sequence names in
their natural, rather than lexicographic, order.
Version: 1.14.0
Version: 1.16.0
Encoding: UTF-8
Author: Sonali Arora, Martin Morgan, Marc Carlson, H. Pagès
Maintainer: Bioconductor Package Maintainer <maintainer@bioconductor.org>
biocViews: Genetics, DataRepresentation, Annotation, GenomeAnnotation
Depends: R (>= 3.1), methods, BiocGenerics (>= 0.13.8), S4Vectors (>=
0.9.25), IRanges (>= 1.99.26)
0.17.25), IRanges (>= 2.13.12)
Imports: stats, stats4, utils, RCurl, GenomeInfoDbData
Suggests: GenomicRanges, Rsamtools, GenomicAlignments, BSgenome,
GenomicFeatures, BSgenome.Scerevisiae.UCSC.sacCer2,
BSgenome.Celegans.UCSC.ce2, BSgenome.Hsapiens.NCBI.GRCh38,
TxDb.Dmelanogaster.UCSC.dm3.ensGene, RUnit, BiocStyle, knitr
License: Artistic-2.0
Collate: utils.R rankSeqlevels.R assembly-utils.R available.species.R
Collate: utils.R rankSeqlevels.R assembly-utils.R loadTaxonomyDb.R
fetchExtendedChromInfoFromUCSC.R fetchSequenceInfo.R seqinfo.R
seqlevelsStyle.R seqlevels-wrappers.R Seqinfo-class.R
GenomeDescription-class.R mapGenomeBuilds.R zzz.R
VignetteBuilder: knitr
Video: http://youtu.be/wdEjCYSXa7w
NeedsCompilation: no
Packaged: 2017-10-30 23:51:42 UTC; biocbuild
Packaged: 2018-04-30 23:56:00 UTC; biocbuild
......@@ -27,7 +27,7 @@ export(
mapGenomeBuilds,
genomeBuilds,
listOrganisms,
## seqlevelsStyle.R:
genomeStyles,
extractSeqlevels,
......@@ -96,6 +96,9 @@ exportMethods(
###
export(
## loadTaxonomyDb.R:
loadTaxonomyDb, available.species,
## seqinfo.R:
seqinfo, "seqinfo<-",
seqnames, "seqnames<-",
......@@ -112,8 +115,7 @@ export(
## GenomeDescription-class.R:
commonName, provider, providerVersion,
releaseDate, releaseName, bsgenomeName,
available.species
releaseDate, releaseName, bsgenomeName
)
exportMethods(
......
### =========================================================================
### Helpers to map between genus, species and taxonomy ID
### -------------------------------------------------------------------------
## In February 2017 the mapping files in GenomeInfoDb/data/ were moved to the
## GenomeInfoDbData annotation package.
## Look up the row(s) of the 'specData' table whose first column equals 'id'.
##   id:  the taxonomy ID to search for.
##   all: when several single-word species match, return all of them (TRUE)
##        or only the first one (FALSE, the default).
## Returns a subset of 'specData'. Raises an error when nothing matches.
.lookupSpeciesFromTaxId <- function(id, all=FALSE) {
    ## Load the table on demand when it is not already visible.
    if (!exists("specData"))
        data(specData, package = "GenomeInfoDbData")
    ## Then find matches
    hits <- specData[specData[,1] == id, ]
    nhits <- dim(hits)[1]
    if (nhits < 1)
        stop(paste0("Cannot find a species to match the requested",
                    " taxonomy id. Please provide the genus and species",
                    " manually."))
    if (nhits == 1)
        return(hits[1,])
    ## More than one match: flag the species names made of several words.
    nwords <- lengths(strsplit(as.character(hits$species), split=" "))
    multiWord <- nwords > 1
    ## Nothing but multi-word names: settle for the first match.
    if (all(multiWord))
        return(hits[1,])
    ## Otherwise only the single-word names are kept.
    hits <- hits[!multiWord,]
    if (all) hits else hits[1,]
}
## Return the 'speciesMap' table, loading it from the GenomeInfoDbData
## package first when it is not already visible.
available.species <- function(){
    if (!exists("speciesMap")) {
        data(speciesMap, package="GenomeInfoDbData")
    }
    return(speciesMap)
}
## Map one species name to its taxonomy ID using the 'speciesMap' table.
##   species: a single species name (e.g. "Homo sapiens") or NA. The NA test
##            below expects a single value; see .taxonomyId() for the
##            element-by-element wrapper.
## Returns NA when 'species' is NA, otherwise the taxonomy ID as an integer.
## Raises an error when the name is not found in 'speciesMap$species'.
.getTaxonomyId <- function(species) {
    if (is.na(species)) {return(NA)}
    if (!exists("speciesMap"))
        data(speciesMap, package="GenomeInfoDbData")
    ## Turn commas into spaces BEFORE collapsing runs of spaces. Doing it the
    ## other way round leaves "Homo, sapiens" as "Homo  sapiens" (2 spaces),
    ## which can never match an entry of the table.
    species <- gsub(",", " ", species, fixed=TRUE)
    species <- gsub(" {2,}", " ", species)
    idx <- match(species, speciesMap$species)
    if (any(is.na(idx)))
        stop(sum(is.na(idx)), " unknown species: ",
             paste(sQuote(head(species[is.na(idx)])),
                   paste0("Please use 'available.species' to see viable",
                          " species names or tax Ids"),
                   collapse=" "))
    as.integer(speciesMap$taxon[idx])
}
## Element-by-element wrapper around .getTaxonomyId(): look up each entry of
## 'species' separately and flatten the results into a single vector.
.taxonomyId <- function(species){
    ids <- lapply(species, .getTaxonomyId)
    unlist(ids)
}
## Raise an error unless 'taxId' is listed in the 'validTaxIds' dataset.
## NA is always accepted. Returns NULL invisibly on success.
.checkForAValidTaxonomyId <- function(taxId) {
    if (!exists("validTaxIds"))
        data(validTaxIds, package = "GenomeInfoDbData")
    ## Extend the list of known IDs with NA so that a missing ID passes.
    accepted <- c(validTaxIds, NA_integer_)
    if (taxId %in% accepted)
        return(invisible(NULL))
    stop(wmsg(paste0("The taxonomy Id you have provided (",taxId,")",
                     " is not in our list of valid Tax Ids.",
                     " Please check to make sure that your tax ID",
                     " is really legitimate and if so, then please tell",
                     " us about it so that we can update our list.")))
}
### =========================================================================
### Helpers to map between taxonomy ID and organism
### -------------------------------------------------------------------------
### In February 2017 the mapping files in GenomeInfoDb/data/ were moved to the
### GenomeInfoDbData annotation package.
### Environment used by loadTaxonomyDb() to cache the taxonomy table. Its
### parent is the empty environment, so lookups in it never fall through to
### other environments.
.TAXONOMY_DB_cache <- new.env(parent=emptyenv())
### Return a data.frame with 3 columns: tax_id, genus, species.
### Number of rows: 1820543 (as of Jan 30, 2018).
### The table is loaded from the GenomeInfoDbData package on the first call
### and served from the .TAXONOMY_DB_cache environment on later calls.
### TODO: Rename specData dataset -> TAXONOMY_DB in GenomeInfoDbData.
loadTaxonomyDb <- function()
{
    ## Serve the cached copy when a previous call already built it.
    if (exists("TAXONOMY_DB", envir=.TAXONOMY_DB_cache, inherits=FALSE))
        return(get("TAXONOMY_DB", envir=.TAXONOMY_DB_cache, inherits=FALSE))
    ## First call: load the raw dataset straight into the cache environment.
    data(specData, package="GenomeInfoDbData", envir=.TAXONOMY_DB_cache)
    taxdb <- get("specData", envir=.TAXONOMY_DB_cache, inherits=FALSE)
    ## Sanity checks on the layout of the dataset.
    stopifnot(identical(colnames(taxdb), c("tax_id", "genus", "species")),
              is.integer(taxdb[["tax_id"]]),
              is.factor(taxdb[["genus"]]),
              is.character(taxdb[["species"]]))
    ## Replace NAs in the "species" column with empty strings.
    ## Shouldn't we clean the dataset in GenomeInfoDbData instead?
    no_species <- is.na(taxdb[["species"]])
    taxdb[["species"]][no_species] <- ""
    assign("TAXONOMY_DB", taxdb, envir=.TAXONOMY_DB_cache)
    taxdb
}
### Deprecated alias for loadTaxonomyDb(): emits a deprecation warning, then
### returns whatever loadTaxonomyDb() returns.
available.species <- function()
{
    .Deprecated(new="loadTaxonomyDb")
    taxdb <- loadTaxonomyDb()
    taxdb
}
### NOT exported but used in the GenomicFeatures package.
### Not vectorized.
### Return the row(s) of the taxonomy table matching a single taxonomy ID.
###   tax_id: a single number.
###   all:    if TRUE, return all the matching rows. If FALSE (the default),
###           return exactly one row.
### Raise an error if no row matches.
lookup_organism_by_tax_id <- function(tax_id, all=FALSE)
{
    stopifnot(isSingleNumber(tax_id))
    taxdb <- loadTaxonomyDb()
    ## Find matches.
    hits <- taxdb[which(taxdb[["tax_id"]] == tax_id), , drop=FALSE]
    if (nrow(hits) == 0L)
        stop(wmsg("Cannot find an organism to match the requested ",
                  "taxonomy ID. Please provide the genus and species ",
                  "manually."))
    if (nrow(hits) == 1L || all)
        return(hits)
    ## Several matches and 'all' is FALSE: restrict to the entries with a
    ## single word species (if there are any), then keep the first entry.
    nwords <- lengths(strsplit(hits[["species"]], split=" "))
    single_word <- which(nwords == 1L)
    if (length(single_word) >= 1L)
        hits <- hits[single_word, , drop=FALSE]
    hits[1L, , drop=FALSE]
}
### NOT exported but used in the GenomicFeatures package.
### Not vectorized.
### Return the taxonomy ID of a single organism.
###   organism: a single string or factor of the form "Genus species". Commas
###             and runs of spaces are treated as a single space.
### Return NA if 'organism' is NA, otherwise the taxonomy ID as an integer.
### Raise an error if the organism is not found in the taxonomy table.
lookup_tax_id_by_organism <- function(organism)
{
    stopifnot(is.character(organism) || is.factor(organism),
              length(organism) == 1L)
    if (is.na(organism)) return(NA)
    taxdb <- loadTaxonomyDb()
    species <- taxdb[["species"]]
    ## trimws() drops the trailing space left by entries with an empty
    ## species.
    organisms <- trimws(paste(taxdb[["genus"]], species))
    ## Turn commas into spaces BEFORE collapsing runs of spaces. Doing it the
    ## other way round leaves "Homo, sapiens" as "Homo  sapiens" (2 spaces),
    ## which can never match an entry of the table.
    organism <- gsub(",", " ", organism, fixed=TRUE)
    organism <- gsub(" {2,}", " ", organism)
    idx <- match(organism, organisms)
    if (is.na(idx))
        stop(wmsg(organism, ": unknown organism. ",
                  "Please use 'loadTaxonomyDb()' to see viable ",
                  "genus/species and taxonomy IDs."))
    as.integer(taxdb[["tax_id"]][[idx]])
}
### NOT exported but used in the GenomicFeatures package.
### Vectorized.
### Raise an error listing the supplied taxonomy IDs that are not in the
### "tax_id" column of the taxonomy table. Return NULL invisibly if they
### are all found.
check_tax_id <- function(tax_id)
{
    stopifnot(is.numeric(tax_id))
    if (!is.integer(tax_id))
        tax_id <- as.integer(tax_id)
    known_ids <- loadTaxonomyDb()[["tax_id"]]
    unknown_ids <- tax_id[!(tax_id %in% known_ids)]
    if (length(unknown_ids) == 0L)
        return(invisible(NULL))
    ## Report each unknown ID only once.
    bad_ids <- paste0(unique(unknown_ids), collapse=", ")
    stop(wmsg("Unknown taxonomy IDs: ", bad_ids,
              "\n\n These taxonomy IDs are not in our list of valid ",
              "taxonomy IDs. Please check to make sure that the ",
              "supplied taxonomy IDs are legitimate and if so, then ",
              "please tell us about it so that we can update our list."))
}
No preview for this file type
\name{available.species}
\name{loadTaxonomyDb}
\alias{loadTaxonomyDb}
\alias{available.species}
\title{
Returns a data.frame that lists the available species strings and
their taxonomy Ids.
Return a data.frame that lists the known taxonomy IDs and their
corresponding organisms.
}
\description{
NCBI maintains a collection of unique taxonomy Ids and pairs these
NCBI maintains a collection of unique taxonomy IDs and pairs these
with associated genus and species designations. This function returns
the set of pre-processed values that we use to check that something is
a valid Taxonomy ID (or species name)
a valid Taxonomy ID (or organism).
}
\usage{
available.species()
loadTaxonomyDb()
}
\value{
A data frame with 1 row per species designation and two columns. The
1st column is the taxonomy Id. The second column is the species name.
A data frame with 1 row per genus/species designation and three columns.
The 1st column is the taxonomy ID. The second column is the genus and the
third is the species name.
}
\author{
......@@ -27,11 +30,11 @@
}
\examples{
## get the data
spec <- available.species()
tail(spec)
## which can then be searched etc.
spec[grepl('yoelii',spec$species),]
## get the data
taxdb <- loadTaxonomyDb()
tail(taxdb)
## which can then be searched etc.
taxdb[grepl('yoelii', taxdb$species), ]
}
\keyword{manip}
......@@ -195,15 +195,14 @@ genome(x) <- value
The \pkg{GenomicRanges} package defines \code{seqinfo} and
\code{seqinfo<-} methods for these low-level data types:
\code{List}, \code{RangesList} and \code{RangedData}. Those
objects do not have the means to formally store sequence
information. Thus, the wrappers simply store the \code{Seqinfo}
object within \code{metadata(x)}. Initially, the metadata
\code{List} and \code{IntegerRangesList}. Those objects do not
have the means to formally store sequence information. Thus,
the wrappers simply store the \code{Seqinfo} object
within \code{metadata(x)}. Initially, the metadata
is empty, so there is some effort to generate a reasonable
default \code{Seqinfo}. The names of any \code{List} are
taken as the \code{seqnames}, and the \code{universe} of
\code{RangesList} or \code{RangedData} is taken as the
\code{genome}.
\code{IntegerRangesList} is taken as the \code{genome}.
}
\note{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment