Skip to content
Commits on Source (4)
blib/
.build/
_build/
cover_db/
inc/
Build
!Build/
Build.bat
.last_cover_stats
MANIFEST.bak
META.yml
MYMETA.yml
nytprof.out
pm_to_blib
.DS_Store
Thumbs.db
*.swp
*.swo
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
a.out
*.o
*.obj
*.class
MultiTestDB.conf
MultiTestDB.*.conf
*.frozen.conf
bioperl*.zip
/modules/t/CLEAN.t
/misc-scripts/xref_mapping/sql/populate_metadata.sql
language: perl
os:
- linux
services:
- mysql
perl:
- '5.14'
- '5.26'
env:
matrix:
- COVERALLS=true DB=mysql
- COVERALLS=false DB=mysql
- COVERALLS=false DB=sqlite
global:
- secure: Ju069PzB8QZG3302emIhyCEEQfVfVsiXy0nGcR6hue+vW9nE82NnOEZHbZIwUCXEjUaZRMVQ31Em70Ky22OrLK4D59bs2ClH21u8URDGD/cn7JNPGWFrgxuaXQKMQrw72doeB0+w1+ShURtqM41vITjinyU3y34RZ1NcbDwYSZI=
sudo: false
addons:
apt:
packages:
- unzip
before_install:
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-test.git
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-io.git
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-variation.git
- git clone --branch master --depth 1 https://github.com/Ensembl/ensembl-compara.git
- git clone -b release-1-6-924 --depth 1 https://github.com/bioperl/bioperl-live.git
install:
- cpanm -v --installdeps --notest .
- cpanm -n Devel::Cover::Report::Coveralls
- cpanm -n DBD::SQLite
- cp travisci/MultiTestDB.conf.travisci.mysql modules/t/MultiTestDB.conf.mysql
- cp travisci/MultiTestDB.conf.travisci.SQLite modules/t/MultiTestDB.conf.SQLite
- mysql -u root -h localhost -e 'GRANT ALL PRIVILEGES ON *.* TO "travis"@"%"'
script: "./travisci/harness.sh"
jobs:
include:
- stage: trigger_dependent_builds
script: "./travisci/trigger-dependent-build.sh"
matrix:
exclude:
- perl: '5.14'
env: COVERALLS=false DB=mysql
- perl: '5.14'
env: COVERALLS=true DB=mysql
- perl: '5.26'
env: COVERALLS=false DB=sqlite
- perl: '5.26'
env: COVERALLS=false DB=mysql
notifications:
email:
on_failure: change
slack:
rooms:
secure: AbIJIPtituqEBGPKO47+Mp+KdFFocT5xJ0oXa1yOFROQz9m03uJPWpMdQ6qol7ftTNLQQChhq8Bek+OJvgZPzvwfsOjgcMrgycaLHsXpqb1S+JRYRHvqQqv0MHFtFLCxnM+R43BdUak8GJmp+lzY96higiLO0ffhu/ovrqmf2VM=
on_failure: change
# Contribution Guide
The Ensembl development team welcomes outside contributions, in fact we moved to Git to facilitate this. However, to ensure legibility for other users, we ask contributors to take a few moments to clean up their code, its comments, and its history before submitting a pull request. It takes a little bit of effort from everyone, but nobody likes to decipher cryptic comments, review commits overloaded with minor typesetting changes or re-trace the history of a file across fragmented commits. Together, let's keep Ensembl tidy!
This guide covers how to contribute changes to an Ensembl project. Please do not create a pull request without reading this guide first.
We also invite you to read our code of conduct (http://www.ensembl.org/info/about/code_of_conduct.html) before continuing.
## Quick Guide - Using Forks & Pull Requests
1. Fork the ensembl repository
2. Switch to the branch you want to edit
* Consider using feature branches over editing master
3. Hack away and commit with a useful commit message
* First line is short but descriptive
* Other lines are more explanatory
4. Make sure your forked master is up-to date with origin and rebase if needed
5. Push
6. Create a pull request
7. Communicate what the change is
8. Wait and celebrate as your change goes in
## Quick Guide - Using Patches
**Whilst patch submission is supported we really prefer users to submit pull requests. Patches also take longer to integrate**
1. Clone the repository
2. Switch to the branch you want to edit
3. Hack away and commit
4. Use `git format-patch` to create the patches
5. Send them to helpdesk for the attention of the team
# Why Could My Pull Request Be Rejected?
We attempt to integrate as many pull requests into Ensembl as possible but do reserve some rights to reject pull requests
* The pull request brought in a non-useful change
- Single line comments which serve no help to the code
* The pull request removes essential code
* The pull request's history was too complex
- We want to see as close to a linear set of commits resulting in your final change
- We will ask for multiple internal merges to be squashed into a sensible set of commits
* Poor commit messages
- Do not repeat the same message in multiple commits
* Excessive whitespace changes
- Do not reformat someone else's code; it's just bad manners
* The pull request modifies code without testing for regression
- Please provide simple unit tests to test the changes
* The pull request fails unit testing
- Please ensure the test suite runs successfully and update the test data if necessary
# License
Ensembl code is licensed under our Apache 2.0 license. Our expectation is that contributing code is made available under the same license. Any copyright assertion to other organisations should be declared in the modified file and in the root LICENSE section.
# Using Forks and Pull Requests
This is our preferred method of accepting code contributions. It is the cleanest way and allows us to review your changes quickly before integration.
## Fork and Clone The Repository
You must fork the repository before contributing changes. The big _Fork_ button at the top right of GitHub's interface will automate the process. There is more information from GitHub at https://help.github.com/articles/fork-a-repo.
Once forked clone your new repository to your machine to start hacking. After cloning remember to add the upstream repository as a remote. For example to add the original Ensembl core project repo back do the following:
```
git remote add upstream https://github.com/Ensembl/ensembl.git
```
## Switching Branch
By default Ensembl projects have a default branch of the **latest stable release**. If you are contributing a fix for a specific release then please remain there; otherwise switch to master.
```
git checkout --track -b master origin/master
```
To help improve your hacking time consider developing on a branch from master. This will allow you to bring in changes from upstream and integrate them into your fork without fear of merge conflicts. The following prefixes are available to use:
* _feature/_ - A new feature going into a repository
* _hotfix/_ - Fixes to be integrated into a branch
* _experimental/_ - Experimental feature with no guarantee about hash stability
Switch to a new branch once you are on master:
```
git checkout -b hotfix/quickfixtoadaptor
```
## Hacking and Committing
Go forth and hack. Be aware that commit messages should attempt to summarise the changes in a single line and then describe the changes in more depth. For example the following commit message is bad:
```
Fixed insertion code
```
Compared to:
```
Fixed insertion code when storing a Gene.
GeneAdaptor's store method was not using the DBI insert id
retrieval system but rather the MySQL last insert id variable.
```
Try also to minimise branches within your code base. If we see too many we will ask you to rebase/squash.
## Syncing master with upstream, rebasing your changes and pushing
First switch to master and pull in new changes from upstream held on master. This will bring those changes down and attempt to merge your local master with _upstream/master_. If you have changes on master be aware that this will probably require a merge commit. Staying away from master is a good idea.
```
git checkout master
git pull upstream master
```
Once the changes are down rebase your branch against master:
```
git checkout hotfix/quickfixtoadaptor
git rebase master
```
Now push to origin:
```
git push -u origin hotfix/quickfixtoadaptor
```
## Creating the Pull Request
https://help.github.com/articles/using-pull-requests
Go to your GitHub fork's page, switch to your branch and click on the _Compare and Review_ button. This will start the merge. Then click on the top left +- file icon and edit accordingly:
* Switch the base branch to _master_
This ensures you are submitting your change against the right branch in Ensembl. For more information see [GitHub's documentation on doing this](https://help.github.com/articles/using-pull-requests#changing-the-branch-range-and-destination-repository).
Now click "Start a discussion" and give a brief description of what your pull request is. Also if you are not using an obvious username let us know who you are.
## Updating a Pull Request
Should you be told to perform some fixes for your pull request you should perform them on your local repo. To update the pull request run `git push -f origin hotfix/quickfixtoadaptor` but never do this on a branch you have shared with more than just us.
# Updating the schema
Any change that affects the underlying SQL schema needs to come with the following:
- a patch file, patch_oldrelease_newrelease_version.sql
See https://github.com/Ensembl/ensembl/blob/release/81/sql/patch_79_80_b.sql for an example
This patch describes the actual schema change and updates the meta table to record the change
- an update to the table.sql file
The change from the patch should also be included in the table.sql file
This is to ensure that a newly created database is identical to an existing database to which the patch was applied
- an update to the test databases
The databases used for testing are in https://github.com/Ensembl/ensembl/tree/release/81/modules/t/test-genome-DBs
These should be patched with the latest changes to ensure data consistency.
This can be done using the patching script provided in https://github.com/Ensembl/ensembl-test/blob/master/scripts/patch_test_databases.pl
# The test suite
The Ensembl code comes with a series of unit tests that check methods behave as expected.
This test suite is run as part of the TravisCi integration for each pull request.
See https://travis-ci.org/Ensembl/ensembl
This ensures that any change does not affect existing functionality.
A pull request can only be integrated if the test suite passes successfully.
If no tests are available for a new functionality, please provide some basic tests.
Ensembl Deprecated Methods
===================
This file contains the list of methods deprecated in the Ensembl core API.
A method is deprecated when it is not functional any more (schema/data change) or has been replaced by a better one.
Backwards compatibility is provided whenever possible.
When a method is deprecated, a deprecation warning is thrown whenever the method is used.
The warning also contains instructions on replacing the deprecated method and when it will be removed.
A year after deprecation (4 Ensembl releases), the method is removed from the API.
### Removed in Ensembl Release 100 ###
- Bio::EnsEMBL::DBFile::**FileAdaptor**::*get_filehandle()*
- Bio::EnsEMBL::DBFile::**FileAdaptor**::*open_file()*
- Bio::EnsEMBL::DBFile::**FileAdaptor**::*validate_file_length()*
- Bio::EnsEMBL::DBFile::**FileAdaptor**::*initialise_filehandle()*
- Bio::EnsEMBL::DBFile::**CollectionAdaptor**::*initialise_filehandle()*
- Bio::EnsEMBL::DBFile::**CollectionAdaptor**::*read_collection_blob()*
- Bio::EnsEMBL::Utils::**Collector**::*new()*
- Bio::EnsEMBL::Utils::**Collector**::*new_assembly()*
- Bio::EnsEMBL::Utils::**Collector**::*max_data_type_size()*
- Bio::EnsEMBL::Utils::**Collector**::*max_view_width()*
- Bio::EnsEMBL::Utils::**Collector**::*bin_method()*
- Bio::EnsEMBL::Utils::**Collector**::*bin_model()*
- Bio::EnsEMBL::Utils::**Collector**::*window_sizes()*
- Bio::EnsEMBL::Utils::**Collector**::*has_window_size()*
- Bio::EnsEMBL::Utils::**Collector**::*pack_template()*
- Bio::EnsEMBL::Utils::**Collector**::*packed_size()*
- Bio::EnsEMBL::Utils::**Collector**::*bins_per_record()*
- Bio::EnsEMBL::Utils::**Collector**::*current_packed_size()*
- Bio::EnsEMBL::Utils::**Collector**::*score_cache()*
- Bio::EnsEMBL::Utils::**Collector**::*collection_start()*
- Bio::EnsEMBL::Utils::**Collector**::*collection_end()*
- Bio::EnsEMBL::Utils::**Collector**::*collection_strand()*
- Bio::EnsEMBL::Utils::**Collector**::*_get_Slice_chunks()*
- Bio::EnsEMBL::Utils::**Collector**::*set_config()*
- Bio::EnsEMBL::Utils::**Collector**::*store_window_bins_by_Slice()*
- Bio::EnsEMBL::Utils::**Collector**::*_bin_features_by_Slice_window_sizes()*
- Bio::EnsEMBL::Utils::**Collector**::*_calculate_count()*
- Bio::EnsEMBL::Utils::**Collector**::*_calculate_average_score()*
- Bio::EnsEMBL::Utils::**Collector**::*_post_process_average_score()*
- Bio::EnsEMBL::Utils::**Collector**::*_calculate_max_magnitude()*
- Bio::EnsEMBL::Utils::**Collector**::*_post_process_max_magnitude()*
- Bio::EnsEMBL::Utils::**Collector**::*_calculate_RPKM()*
- Bio::EnsEMBL::Utils::**Collector**::*_post_process_RPKM()*
- Bio::EnsEMBL::Utils::**Collector**::*_set_up_RPKM()*
- Bio::EnsEMBL::Utils::**Collector**::*_RPKM_factor()*
- Bio::EnsEMBL::Utils::**Collector**::*get_diploid_genome_length_by_gender()*
### Removed in Ensembl Release 98 ###
- Bio::EnsEMBL::Utils::**SequenceOntologyMapper**::*new()*
- Bio::EnsEMBL::Utils::**SequenceOntologyMapper**::*to_accession()*
- Bio::EnsEMBL::Utils::**SequenceOntologyMapper**::*to_name()*
- Bio::EnsEMBL::Utils::**SequenceOntologyMapper**::*gene_biotype_to_name()*
- Bio::EnsEMBL::Utils::**SequenceOntologyMapper**::*transcript_biotype_to_name()*
- Bio::EnsEMBL::Utils::**SequenceOntologyMapper**::*_fetch_SO_name_by_accession()*
### Removed in Ensembl Release 95 ###
- Bio::EnsEMBL::**AlignStrainSlice**::*alignFeature()*
- Bio::EnsEMBL::**AlignStrainSlice**::*get_all_Slices()*
- Bio::EnsEMBL::**AlignStrainSlice**::*length()*
- Bio::EnsEMBL::**AlignStrainSlice**::*mapper()*
- Bio::EnsEMBL::**AlignStrainSlice**::*new()*
- Bio::EnsEMBL::**AlignStrainSlice**::*Slice()*
- Bio::EnsEMBL::**AlignStrainSlice**::*strains()*
- Bio::EnsEMBL::**AlignStrainSlice**::*_get_indels()*
- Bio::EnsEMBL::**IndividualSlice**::*add_AlleleFeature()*
- Bio::EnsEMBL::**IndividualSlice**::*alleleFeatures()*
- Bio::EnsEMBL::**IndividualSlice**::*get_all_differences_IndividualSlice()*
- Bio::EnsEMBL::**IndividualSlice**::*get_all_differences_Slice()*
- Bio::EnsEMBL::**IndividualSlice**::*get_all_Exons()*
- Bio::EnsEMBL::**IndividualSlice**::*get_all_Genes()*
- Bio::EnsEMBL::**IndividualSlice**::*get_all_Transcripts()*
- Bio::EnsEMBL::**IndividualSlice**::*individual_name()*
- Bio::EnsEMBL::**IndividualSlice**::*map_to_Individual()*
- Bio::EnsEMBL::**IndividualSlice**::*mapper()*
- Bio::EnsEMBL::**IndividualSlice**::*new()*
- Bio::EnsEMBL::**IndividualSlice**::*seq()*
- Bio::EnsEMBL::**IndividualSlice**::*sub_Slice()*
- Bio::EnsEMBL::**IndividualSlice**::*subseq()*
- Bio::EnsEMBL::**IndividualSlice**::*_convert_difference()*
- Bio::EnsEMBL::**IndividualSliceFactory**::*adaptor()*
- Bio::EnsEMBL::**IndividualSliceFactory**::*get_all_IndividualSlice()*
- Bio::EnsEMBL::**IndividualSliceFactory**::*new()*
- Bio::EnsEMBL::**IndividualSliceFactory**::*_rearrange_Individuals_Alleles()*
- Bio::EnsEMBL::**Slice**::*calculate_pi()*
- Bio::EnsEMBL::**Slice**::*calculate_theta()*
- Bio::EnsEMBL::**Slice**::*get_all_IndividualSlice()*
- Bio::EnsEMBL::**Slice**::*get_by_Individual()*
- Bio::EnsEMBL::**Slice**::*get_all_somatic_VariationFeatures()*
- Bio::EnsEMBL::**Slice**::*get_all_somatic_VariationFeatures_by_source()*
- Bio::EnsEMBL::**Slice**::*get_all_somatic_VariationFeatures_with_phenotype()*
- Bio::EnsEMBL::**Slice**::*get_all_VariationFeatures_by_Population()*
- Bio::EnsEMBL::**Slice**::*get_by_strain()*
- Bio::EnsEMBL::**Slice**::*_calculate_a*
### Removed in Ensembl Release 91 ###
- Bio::EnsEMBL::DBSQL::**BaseAdaptor**::*dump_data()*
- Bio::EnsEMBL::DBSQL::**BaseAdaptor**::*get_dumped_data()*
### Removed in Ensembl Release 90 ###
- Bio::EnsEMBL::**Gene**::*is_known()*
- Bio::EnsEMBL::**Gene**::*status()*
- Bio::EnsEMBL::**Transcript**::*is_known()*
- Bio::EnsEMBL::**Transcript**::*status()*
### Removed in Ensembl Release 88 ###
- Bio::EnsEMBL::**Slice**::*get_all_VariationFeatures_with_phenotype()*
- Bio::EnsEMBL::**Slice**::*get_all_StructuralVariationFeatures()*
- Bio::EnsEMBL::**Slice**::*get_all_somatic_StructuralVariationFeatures()*
- Bio::EnsEMBL::**Slice**::*get_all_StructuralVariationFeatures_by_size_range()*
- Bio::EnsEMBL::**Slice**::*get_all_somatic_StructuralVariationFeatures_by_size_range()*
- Bio::EnsEMBL::**Slice**::*get_all_StructuralVariationFeatures_by_Study()*
- Bio::EnsEMBL::**Slice**::*get_all_StructuralVariationFeatures_by_source()*
- Bio::EnsEMBL::**Slice**::*get_all_somatic_StructuralVariationFeatures_by_source()*
- Bio::EnsEMBL::**Slice**::*get_all_VariationFeatures_by_VariationSet()*
- Bio::EnsEMBL::**Slice**::*get_all_StructuralVariationFeatures_by_VariationSet()*
- Bio::EnsEMBL::**Slice**::*get_all_PhenotypeFeatures()*
- Bio::EnsEMBL::**Slice**::*get_all_CopyNumberVariantProbeFeatures()*
- Bio::EnsEMBL::**Slice**::*get_all_genotyped_VariationFeatures()*
- Bio::EnsEMBL::**StrainSlice**::*remove_indels()*
- Bio::EnsEMBL::**StrainSlice**::*get_original_seq_region_position()*
- Bio::EnsEMBL::**StrainSlice**::*get_all_VariationFeatures()*
- Bio::EnsEMBL::**StrainSlice**::*mapper()*
- Bio::EnsEMBL::**StrainSlice**::*subseq()*
- Bio::EnsEMBL::**StrainSlice**::*ref_subseq()*
- Bio::EnsEMBL::**StrainSlice**::*sub_Slice()*
- Bio::EnsEMBL::**StrainSlice**::*_convert_difference()*
- Bio::EnsEMBL::**StrainSlice**::*get_all_differences_StrainSlice()*
- Bio::EnsEMBL::**StrainSlice**::*get_all_AlleleFeatures_Slice()*
- Bio::EnsEMBL::**StrainSlice**::*get_all_AlleleFeature()*
- Bio::EnsEMBL::**StrainSlice**::*_add_coverage_information()*
- Bio::EnsEMBL::**StrainSlice**::*expanded_length()*
- Bio::EnsEMBL::**StrainSlice**::*seq()*
- Bio::EnsEMBL::**StrainSlice**::*display_Slice_name()*
- Bio::EnsEMBL::**StrainSlice**::*sample()*
- Bio::EnsEMBL::**StrainSlice**::*strain_name()*
- Bio::EnsEMBL::**StrainSlice**::*_filter_af_by_coverage()*
- Bio::EnsEMBL::**StrainSlice**::*new()*
- Bio::EnsEMBL::DBSQL::**StrainSliceAdaptor**::*new()*
- Bio::EnsEMBL::DBSQL::**StrainSliceAdaptor**::*fetch_by_name()*
### Removed in Ensembl Release 87 ###
- Bio::EnsEMBL::**AssemblyMapper**::*in_assembly()*
- Bio::EnsEMBL::**AssemblyMapper**::*map_coordinates_to_assembly()*
- Bio::EnsEMBL::**AssemblyMapper**::*fast_to_assembly()*
- Bio::EnsEMBL::**AssemblyMapper**::*map_coordinates_to_rawcontig()*
- Bio::EnsEMBL::**AssemblyMapper**::*list_contig_ids()*
- Bio::EnsEMBL::**ChainedAssemblyMapper**::*in_assembly()*
- Bio::EnsEMBL::**ChainedAssemblyMapper**::*map_coordinates_to_assembly()*
- Bio::EnsEMBL::**ChainedAssemblyMapper**::*fast_to_assembly()*
- Bio::EnsEMBL::**ChainedAssemblyMapper**::*map_coordinates_to_rawcontig()*
- Bio::EnsEMBL::**ChainedAssemblyMapper**::*list_contig_ids()*
- Bio::EnsEMBL::**DBEntry**::*get_synonyms()*
- Bio::EnsEMBL::DBSQL::**AssemblyMapperAdaptor**::*register_region()*
- Bio::EnsEMBL::DBSQL::**AssemblyMapperAdaptor**::*register_contig()*
- Bio::EnsEMBL::DBSQL::**AssemblyMapperAdaptor**::*fetch_by_type()*
- Bio::EnsEMBL::DBSQL::**KaryotypeBandAdaptor**::*fetch_by_chr_band()*
- Bio::EnsEMBL::DBSQL::**TranslationAdaptor**::*fetch_all_by_**DBEntry**()*
- Bio::EnsEMBL::DBSQL::**TranslationAdaptor**::*get_stable_entry_info()*
- Bio::EnsEMBL::DBSQL::**AltAlleleGroupAdaptor**::*fetch_all_Groups()*
- Bio::EnsEMBL::DBSQL::**AltAlleleGroupAdaptor**::*fetch_all_Groups_by_type()*
- Bio::EnsEMBL::DBSQL::**AltAlleleGroupAdaptor**::*fetch_Group_by_id()*
- Bio::EnsEMBL::DBSQL::**AltAlleleGroupAdaptor**::*fetch_Group_by_Gene_dbID()*
- Bio::EnsEMBL::DBSQL::**AnalysisAdaptor**::*feature_classes()*
- Bio::EnsEMBL::DBSQL::**BaseAlignFeatureAdaptor**::*fetch_all_by_RawContig_and_pid()*
- Bio::EnsEMBL::DBSQL::**BaseFeatureAdaptor**::*fetch_all_by_RawContig_constraint()*
- Bio::EnsEMBL::DBSQL::**BaseFeatureAdaptor**::*fetch_all_by_RawContig()*
- Bio::EnsEMBL::DBSQL::**BaseFeatureAdaptor**::*fetch_all_by_RawContig_and_score()*
- Bio::EnsEMBL::DBSQL::**BaseFeatureAdaptor**::*remove_by_RawContig()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*db_handle()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*port()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*driver()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*password()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*username()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*host()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*reconnect_when_lost()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*disconnect_when_inactive()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*dbname()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*prepare()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*list_supported_assemblies()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*assembly_type()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*db()*
- Bio::EnsEMBL::DBSQL::**DBConnection**::*group()*
- Bio::EnsEMBL::DBSQL::**DBConnection**::*species()*
- Bio::EnsEMBL::DBSQL::**DBEntryAdaptor**::*geneids_by_extids()*
- Bio::EnsEMBL::DBSQL::**DBEntryAdaptor**::*translationids_by_extids()*
- Bio::EnsEMBL::DBSQL::**DBEntryAdaptor**::*transcriptids_by_extids()*
- Bio::EnsEMBL::DBSQL::**DataFileAdaptor**::*DataFile_to_extension()*
- Bio::EnsEMBL::DBSQL::**ExonAdaptor**::*get_stable_entry_info()*
- Bio::EnsEMBL::DBSQL::**ExonAdaptor**::*fetch_all_by_gene_id()*
- Bio::EnsEMBL::DBSQL::**GeneAdaptor**::*fetch_nearest_Gene_by_Feature()*
- Bio::EnsEMBL::DBSQL::**GeneAdaptor**::*fetch_by_maximum_DBLink()*
- Bio::EnsEMBL::DBSQL::**GeneAdaptor**::*get_display_xref()*
- Bio::EnsEMBL::DBSQL::**GeneAdaptor**::*get_description()*
- Bio::EnsEMBL::DBSQL::**GeneAdaptor**::*fetch_all_by_**DBEntry**()*
- Bio::EnsEMBL::DBSQL::**GeneAdaptor**::*get_stable_entry_info()*
- Bio::EnsEMBL::DBSQL::**GeneAdaptor**::*fetch_by_Peptide_id()*
- Bio::EnsEMBL::DBSQL::**MetaContainer**::*get_Species()*
- Bio::EnsEMBL::DBSQL::**MetaContainer**::*get_default_assembly()*
- Bio::EnsEMBL::DBSQL::**ProteinFeatureAdaptor**::*fetch_by_translation_id()*
- Bio::EnsEMBL::DBSQL::**ProteinFeatureAdaptor**::*fetch_all_by_feature_and_dbID()*
- Bio::EnsEMBL::DBSQL::**RepeatConsensusAdaptor**::*fetch_by_class_seq()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*fetch_by_mapfrag()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*fetch_by_chr_start_end()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*fetch_by_contig_name()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*fetch_by_clone_accession()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*fetch_by_supercontig_name()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*list_overlapping_supercontigs()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*fetch_by_chr_name()*
- Bio::EnsEMBL::DBSQL::**TranscriptAdaptor**::*get_display_xref()*
- Bio::EnsEMBL::DBSQL::**TranscriptAdaptor**::*get_stable_entry_info()*
- Bio::EnsEMBL::DBSQL::**TranscriptAdaptor**::*fetch_all_by_DBEntry()*
- Bio::EnsEMBL::DBSQL::**SequenceAdaptor**::*fetch_by_assembly_location()*
- Bio::EnsEMBL::DBSQL::**SequenceAdaptor**::*fetch_by_RawContig_start_end_strand()*
- Bio::EnsEMBL::**Exon**::*temporary_id()*
- Bio::EnsEMBL::**Exon**::*created()*
- Bio::EnsEMBL::**Exon**::*modified()*
- Bio::EnsEMBL::**Exon**::*type()*
- Bio::EnsEMBL::**FeaturePair**::*feature1()*
- Bio::EnsEMBL::**FeaturePair**::*feature2()*
- Bio::EnsEMBL::**FeaturePair**::*set_featurepair_fields()*
- Bio::EnsEMBL::**FeaturePair**::*gffstring()*
- Bio::EnsEMBL::**FeaturePair**::*hphase()*
- Bio::EnsEMBL::**FeaturePair**::*hend_phase()*
- Bio::EnsEMBL::**Feature**::*contig()*
- Bio::EnsEMBL::**Feature**::*id()*
- Bio::EnsEMBL::**Gene**::*add_DBLink()*
- Bio::EnsEMBL::**Gene**::*temporary_id()*
- Bio::EnsEMBL::**Gene**::*chr_name()*
- Bio::EnsEMBL::**Gene**::*type()*
- Bio::EnsEMBL::**Gene**::*confidence()*
- Bio::EnsEMBL::**IdentityXref**::*query_identity()*
- Bio::EnsEMBL::**IdentityXref**::*target_identity()*
- Bio::EnsEMBL::**IdentityXref**::*translation_start()*
- Bio::EnsEMBL::**IdentityXref**::*translation_end()*
- Bio::EnsEMBL::**IdentityXref**::*query_start()*
- Bio::EnsEMBL::**IdentityXref**::*query_end()*
- Bio::EnsEMBL::**KaryotypeBand**::*chr_name()*
- Bio::EnsEMBL::Map::DBSQL::**MarkerFeatureAdaptor**::*fetch_all_by_RawContig_and_priority()*
- Bio::EnsEMBL::Map::**DitagFeature**::*fetch_ditag()*
- Bio::EnsEMBL::Map::**MapLocation**::*chromosome()*
- Bio::EnsEMBL::**OperonTranscript**::*add_gene()*
- Bio::EnsEMBL::**PredictionTranscript**::*get_exon_count()*
- Bio::EnsEMBL::**PredictionTranscript**::*get_cdna()*
- Bio::EnsEMBL::**Registry**::*load_registry_with_web_adaptors()*
- Bio::EnsEMBL::**Root**::*throw()*
- Bio::EnsEMBL::**Root**::*warn()*
- Bio::EnsEMBL::**Root**::*verbose()*
- Bio::EnsEMBL::**Root**::*stack_trace_dump()*
- Bio::EnsEMBL::**Root**::*stack_trace()*
- Bio::EnsEMBL::**Slice**::*get_all_SNPs()*
- Bio::EnsEMBL::**Slice**::*get_all_genotyped_SNPs()*
- Bio::EnsEMBL::**Slice**::*get_all_supercontig_Slices()*
- Bio::EnsEMBL::**Slice**::*get_Chromosome()*
- Bio::EnsEMBL::**Slice**::*chr_name()*
- Bio::EnsEMBL::**Slice**::*chr_start()*
- Bio::EnsEMBL::**Slice**::*chr_end()*
- Bio::EnsEMBL::**Slice**::*assembly_type()*
- Bio::EnsEMBL::**Slice**::*dbID()*
- Bio::EnsEMBL::**Slice**::*get_all_MapFrags()*
- Bio::EnsEMBL::**Slice**::*has_MapSet()*
- Bio::EnsEMBL::**StrainSlice**::*get_all_differences_Slice()*
- Bio::EnsEMBL::**Transcript**::*created()*
- Bio::EnsEMBL::**Transcript**::*modified()*
- Bio::EnsEMBL::**Transcript**::*temporary_id()*
- Bio::EnsEMBL::**Transcript**::*type()*
- Bio::EnsEMBL::**Transcript**::*confidence()*
- Bio::EnsEMBL::**Translation**::*temporary_id()*
- Bio::EnsEMBL::Utils::**ConversionSupport**::*user_confirm()*
### Removed in Ensembl Release 84 ###
- Bio::EnsEMBL::DBSQL::**CoordSystemAdaptor**::*_fetch_by_attrib()*
- Bio::EnsEMBL::DBSQL::**CoordSystemAdaptor**::*_fetch_all_by_attrib()*
- Bio::EnsEMBL::DBSQL::**DBAdaptor**::*source()*
- Bio::EnsEMBL::DBSQL::**SliceAdaptor**::*fetch_by_band()*
- Bio::EnsEMBL::DBSQL::**MetaContainer**::*get_short_name()*
- Bio::EnsEMBL::DBSQL::**MetaContainer**::*get_max_assembly_contig()*
- Bio::EnsEMBL::**DBEntry**::*ensembl_object_type()*
- Bio::EnsEMBL::**DBEntry**::*ensembl_id()*
- Bio::EnsEMBL::**Exon**::*_get_stable_entry_info()*
- Bio::EnsEMBL::**FeaturePair**::*validate()*
- Bio::EnsEMBL::**FeaturePair**::*validate_prot_feature()*
- Bio::EnsEMBL::**PredictionTranscript**::*set_exon_count()*
- Bio::EnsEMBL::**Root**::*_rearrange()*
- Bio::EnsEMBL::**SeqFeatureI**::*analysis()*
- Bio::EnsEMBL::**SeqFeatureI**::*validate()*
- Bio::EnsEMBL::**SeqFeatureI**::*id()*
- Bio::EnsEMBL::**SeqFeatureI**::*percent_id()*
- Bio::EnsEMBL::**SeqFeatureI**::*e_value()*
- Bio::EnsEMBL::**SeqFeatureI**::*phase()*
- Bio::EnsEMBL::**SeqFeatureI**::*end_phase()*
- Bio::EnsEMBL::**SeqFeatureI**::*location()*
- Bio::EnsEMBL::**Slice**::*get_all_SNPs_transcripts()*
- Bio::EnsEMBL::**Slice**::*get_all_AffyFeatures()*
- Bio::EnsEMBL::**Slice**::*get_all_OligoFeatures()*
- Bio::EnsEMBL::**Slice**::*get_all_OligoFeatures_by_type()*
- Bio::EnsEMBL::**Slice**::*get_tiling_path()*
- Bio::EnsEMBL::**Transcript**::*sort()*
- Bio::EnsEMBL::**Transcript**::*_translation_id()*
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2019] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
ENSEMBL CORE PROJECT SUBCOMPONENTS
The Ensembl Core Project includes a number of subcomponents with
separate copyright notices and license terms. Your use of the source
code for these subcomponents is subject to the terms and
conditions of the following licenses.
COMPONENT: modules/Bio/EnsEMBL/Utils/Cache.pm
Code has been renamed from Tie::Cache to
Bio::EnsEMBL::Utils::Cache and originally distributed on CPAN.
Copyright (c) 1999-2002 Joshua Chamas, Chamas Enterprises Inc.
Sponsored by development on NodeWorks http://www.nodeworks.com
All rights reserved. This program is free software;
you can redistribute it and/or modify it under the same
terms as Perl itself.
COMPONENT: misc-scripts/doxygen_filter/EnsEMBL/Filter.pm and
misc-scripts/doxygen_filter/EnsEMBL/PerlFilter.pm.
Both portions of code were based on the following agreement:
PerlFilter.pm has seen significant re-writing but derives from the
original source.
Doxygen Pre-Processor for Perl
Copyright (C) 2002 Bart Schuller
Copyright (C) 2006 Phinex Informatik AG
All Rights Reserved
Doxygen Filter is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
Larry Wall's 'Artistic License' for perl can be found in
http://www.perl.com/pub/a/language/misc/Artistic.html
## Requirements
- Filling out the template is required. Any pull request that does not include enough information to be reviewed in a timely manner may be closed at the maintainers' discretion;
- Review the [contributing guidelines](https://github.com/Ensembl/ensembl/blob/master/CONTRIBUTING.md#why-could-my-pull-request-be-rejected) for this repository; remember in particular:
- do not modify code without testing for regression
- provide simple unit tests to test the changes
- if you change the schema you must patch the test databases as well, see [Updating the schema](https://github.com/Ensembl/ensembl/blob/master/CONTRIBUTING.md#updating-the-schema)
- the PR must not fail unit testing
## Description
_Using one or more sentences, describe in detail the proposed changes._
## Use case
_Describe the problem. Please provide an example representing the motivation behind the need for having these changes in place._
## Benefits
_If applicable, describe the advantages the changes will have._
## Possible Drawbacks
_If applicable, describe any possible undesirable consequence of the changes._
## Testing
_Have you added/modified unit tests to test the changes?_
_If so, do the tests pass/fail?_
_Have you run the entire test suite and no regression was detected?_
# Ensembl Core API
[![Build Status](https://travis-ci.org/Ensembl/ensembl.svg?branch=release/98)][travis]
[![Coverage Status](https://coveralls.io/repos/github/Ensembl/ensembl/badge.svg?branch=release/98)][coveralls]
[travis]: https://travis-ci.org/Ensembl/ensembl
[coveralls]: https://coveralls.io/github/Ensembl/ensembl
# Runtime dependencies for the Ensembl Core API (cpanfile syntax,
# consumed by Module::CPANfile / cpanm --installdeps).
requires 'DBI';
requires 'DBD::mysql';
requires 'HTTP::Tiny';
requires 'IO::Compress::Gzip';
requires 'URI::Escape';
requires 'Config::IniFiles';
# Modules required only when running the test suite.
test_requires 'Test::Warnings';
test_requires 'Test::Differences';
test_requires 'Test::Exception';
test_requires 'Test::MockObject';
test_requires 'Test::Deep';
test_requires 'Test::More';
test_requires 'Devel::Peek';
test_requires 'Devel::Cycle';
test_requires 'Error';
test_requires 'PadWalker';
test_requires 'Test::Builder::Module';
test_requires 'IO::String';
test_requires 'Test::Perl::Critic';
test_requires 'Perl::Critic::Utils';
=cut
# NOTE(review): the two optional feature blocks below are enclosed between
# bare '=cut' markers, which Perl treats as a POD region, so they are
# effectively commented out and never evaluated — confirm this is intentional.
feature 'assembly_mapping', 'Assembly mapper support' => sub {
    requires 'Algorithm::Diff';
    requires 'Tie::IxHash';
};
feature 'xref_mapping', 'Xref mapping pipeline' => sub {
    requires 'Config::IniFiles';
    requires 'Digest::MD5';
    requires 'Text::Glob';
    requires 'XML::LibXML';
};
=cut
ensembl (65-1) UNRELEASED; urgency=low
ensembl (98+git20190619.e98e194-1) UNRELEASED; urgency=low
* New upstream release
* New upstream release based on latest Github commit
* debian/control:
- Fixed Vcs fields
- Added myself to uploaders
......
Source: ensembl
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Richard Holland <holland@eaglegenomics.com>,
Steffen Moeller <moeller@debian.org>,
William Spooner <whs@eaglegenomics.com>,
Madhu Donepudi <madhu@eaglegenomics.com>,
Uploaders: Steffen Moeller <moeller@debian.org>,
Andreas Tille <tille@debian.org>
Section: non-free/science
XS-Autobuild: no
Section: science
Priority: optional
Build-Depends: debhelper (>= 10),
po-debconf
Standards-Version: 3.9.8
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/ensembl.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/ensembl.git
Vcs-Browser: https://salsa.debian.org/med-team/ensembl
Vcs-Git: https://salsa.debian.org/med-team/ensembl.git
Homepage: http://www.ensembl.org
Package: ensembl
......
version=4
http://ftp.ensembl.org/pub/release-(\d+)
opts="mode=git,pretty=98+git%cd.%h" \
https://github.com/Ensembl/ensembl.git HEAD
# asked helpdesk for release tags
Ensembl API Connection FAQ
==========================
### MSG: human is not a valid species name (check DB and API version)
Usually this is caused by using the wrong Ensembl API version to access a server. The API can only find species from the same version as it.
The master branch of the Ensembl git repository is typically one release ahead of the public servers, and will always fail to find a species by default. The master branch is in development, and is not guaranteed to work. To access an older release, the Registry option DB_VERSION can be set, but it is preferable to use the correct API version to avoid unintended consequences.
You can also try the systematic name of the species.
### MSG: (insert fungi/protist/etc here) is not a valid species name (check DB and API version)
Try running perl ensembl/misc-scripts/ping_ensembl.pl and check the output.
Ensembl Genomes provides these species, and releases roughly two weeks later than Ensembl. If you have just updated your API and Ensembl recently announced a release, your software may be too new for the Ensembl Genomes servers. You can wait until they release, or roll back your API version. This is easy if you installed from Github.
```bash
VERSION=`perl -e 'use Bio::EnsEMBL::ApiVersion qw/software_version/; print software_version'`
git checkout release/`expr ${VERSION} - 1`
```
If you installed a downloaded package, then you will need to download an older Ensembl API release.
### error 2006 (MySQL server has gone away)
In a long-running process, it is possible to hit database server time limits for connections. Typically after 8 hours the server will close the connection, and your Perl code will die.
#### Strategies for avoiding timeouts
1 - Use the nearest database server to improve efficiency. Ensembl has mirrors in Asia and the USA as well as the main servers hosted in the UK. See the [Mirrors page](http://www.ensembl.org/info/about/mirrors.html) for specifics.
2 - For intense database access and high frequency querying, choose a good time to disconnect manually
```perl
...
# discrete work completed that takes an hour or two
$gene_adaptor->dbc->disconnect_if_idle;
# API re-opens connection automatically
$gene_adaptor->fetch_by_stable_id($stable_id);
```
3 - For scripts which occasionally consult Ensembl while working on a big problem for several hours
```perl
...
# For all code using Ensembl
Bio::EnsEMBL::Registry->set_disconnect_when_inactive(1);
# For just one adaptor
$adaptor->dbc->disconnect_when_inactive(1);
# For just one occasion
# This causes the connection to close whenever it is not being used. This is costly if there are very frequent database requests
$adaptor->dbc->disconnect_if_idle;
# In combination with disconnecting after every request, you can hold the connection open for the duration of a code block
my @gene_ids = ('ENSG0000001',...);
my %external_refs;
$gene_adaptor->dbc->prevent_disconnect(sub {
while (my $id = shift @gene_ids ) {
my $gene = $gene_adaptor->fetch_by_stable_id($id);
my $xrefs = $gene->get_all_DBEntries;
foreach my $xref (@$xrefs) {
$external_refs{$id} = $xref->display_id;
}
}
});
# This will finish faster than if it continues to disconnect and reconnect all the time
```
4 - For scripts which access Ensembl a lot and have no easy opportunity to behave as in option 2 above.
```perl
# For all code using Ensembl
Bio::EnsEMBL::Registry->set_reconnect_when_lost(1);
# For one adaptor
$adaptor->dbc->reconnect_when_lost(1);
# This option adds an additional message to every call to the database, checking that the connection is still up
# It increases network traffic and latency of each request, but can restore a broken connection
# Not necessary if you call disconnect_if_idle
```
Option 2 is both fastest and makes best use of Ensembl servers. Option 4 is next quickest, and option 3 is slow for heavy access, but suitable for occasional requests.
### Error 99
The server can't host any more connections. Users are connecting too many times in a short time period. Contact Ensembl Helpdesk (helpdesk@ensembl.org) to let us know there is a problem, and try again later.
### DBI/DBD::mysql not in PERL5LIB
The Ensembl API requires both DBI and DBD::mysql packages, typically via cpan or cpanm. If you have installed these libraries but still have this problem, you will need to add them to your PERL5LIB environment variable.
```bash
echo $PERL5LIB
# Can I see where my libs are installed?
export PERL5LIB=$PERL5LIB:/path/to/perl/lib
```
### MSG: Cannot connect to the Ensembl MySQL server at ensembldb.ensembl.org:3306;
Firstly, check your connection parameters. Run perl ensembl/scripts/ping_ensembl.pl and see what it says. If both ping_ensembl and your script cannot connect, the most likely cause is that your local network prohibits this kind of traffic. Ask your sysadmins if they allow outbound database traffic on port 3306/5306.
experimental/db-portability/master 2014-02-17
==================================
cliHelper is not supported; test is skipped.
t/dbEntries.t has a single failure.
experimental/db-portability/74
==============================
t/cliHelper.t is not supported.
t/dbEntries.t fails but does so as for MySQL.
branch-e73-db-portability &
branch-e72-db-portability
=========================
No major changes, see below for previous items.
branch-e71-db-portability
=========================
See below for notes from the original work for porting EnsEMBL 70 to
SQLite, and especially see the bottom of this file for an important
disclaimer.
t/circularSlice.t
now passes for SQLite.
t/dbEntries.t
now also passes for SQLite, but further work is needed in
DBEntryAdaptor.pm to replicate the changes made in
_store_or_fetch_xref() to allow for the differences in behaviour
between MySQL 'INSERT IGNORE' and SQLite 'INSERT OR IGNORE'.
branch-e70-db-portability
=========================
Overview
--------
The first revision to be considered of useful quality is that reached
at 2013-02-15 18:06:00 (registry.t: don't hardcode db driver in config
file).
The development up to that point has been rearranged to group related
changes into single commits, detailed below.
For the complete story, see also changes on the
branch-e70-db-portability branch of ensembl-test.
The main area of missing functionality under SQLite is registry
support.
Test results
------------
All tests still pass for MySQL, except for those requiring a threaded
perl, which the author has not yet tried to configure (and which thus
also fail for the unmodified EnsEMBL).
The following tests fail for SQLite:
t/circularSlice.t: test-genome-DBs/circ/*.txt need patching.
* t/dbEntries.t: needs threads.
* t/registry.t: no tests run (needs threads?)
t/schema.t: need to work around 'create database' in test
t/schemaPatches.t: need to work around 'create database' in test
*: these tests also fail on MySQL for want of a threaded perl.
Development commits
-------------------
Schema conversion:
Date: Tue Feb 12 17:42:31 2013 +0000
Run ensembl-test/scripts/convert_test_schemas.sh.
Converts MySQL test schemas to SQLite test schemas.
modules/t/test-genome-DBs/circ/core/SQLite/table.sql
modules/t/test-genome-DBs/homo_sapiens/core/SQLite/table.sql
modules/t/test-genome-DBs/homo_sapiens/empty/SQLite/table.sql
Cherry-picked patches already applied to MAIN:
Date: Wed Feb 13 09:40:42 2013 +0000
Patched out a pair of CVS keywords that interfere with Git
compatibility for benefit of Anacode. This in no way implies
endorsement of Git for Ensembl.
misc-scripts/doxygen_filter/EnsEMBL/Filter.pm
misc-scripts/utilities/dna_compress.pl
Date: Wed Feb 13 14:12:25 2013 +0000
proteinFeatureAdaptor now cleans up after itself
fix sent by mg13
modules/t/proteinFeatureAdaptor.t
Date: Wed Feb 13 14:31:08 2013 +0000
Applying Michael Gray's patches for the purposes of future
database independence.
Extended support for different SQL backends through specific
return values. See Michael Gray's efforts.
See JIRA ticket ENSCORESW-349
modules/Bio/EnsEMBL/DBSQL/IntronSupportingEvidenceAdaptor.pm
modules/Bio/EnsEMBL/Utils/SqlHelper.pm
Date: Wed Feb 13 14:57:03 2013 +0000
ENSCORESW-348
also save and restore operon_transcript_gene table
entries in the operon_transcript_gene table are not cleaned up
modules/t/operon_transcript.t
Portability patches:
(*) indicates that the fix has been re-factored below under
"Portability architecture"
Date: Fri Feb 15 17:14:16 2013 +0000
rows() doesn't work on SQLite for SELECT unless all rows actually fetched!
[test scripts]
modules/t/MultiTestDB.t
modules/t/dbConnection.t
Date: Fri Feb 15 17:16:23 2013 +0000
rows() doesn't work on SQLite for SELECT unless all rows actually fetched!
[Adaptors]
modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/MiscSetAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/ProteinFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm
Date: Fri Feb 15 17:18:09 2013 +0000
BaseAdaptor.pm: properly unpack _tables().
modules/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm
Date: Fri Feb 15 17:21:02 2013 +0000
"INSERT INTO table SET col1=val1, col2=val2" does not work for SQLite.
Use "INSERT INTO table (col1, col2) VALUES (val1, val2)" instead.
modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/DBEntryAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/MiscFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/MiscSetAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/OperonAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/OperonTranscriptAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/ProteinFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/TranscriptAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/TranslationAdaptor.pm
Date: Fri Feb 15 17:23:48 2013 +0000
"INSERT IGNORE" vs "INSERT OR IGNORE" portability. (*)
modules/Bio/EnsEMBL/DBSQL/AnalysisAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/DBEntryAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/DensityTypeAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/IntronSupportingEvidenceAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/MetaCoordContainer.pm
modules/Bio/EnsEMBL/DBSQL/MiscFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/MiscSetAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SeqRegionSynonymAdaptor.pm
Date: Fri Feb 15 17:26:11 2013 +0000
Need to use last_insert_id() instead of {mysql_insertid}.
modules/Bio/EnsEMBL/DBSQL/AnalysisAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/AssemblyExceptionFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/CoordSystemAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/DBEntryAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/DensityTypeAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/DnaAlignFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/ExonAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/MiscFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/MiscSetAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/OperonAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/OperonTranscriptAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/PredictionExonAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/PredictionTranscriptAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/ProteinAlignFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/ProteinFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/RepeatConsensusAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/RepeatFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SeqRegionSynonymAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SimpleFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/TranscriptAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/TranslationAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/UnmappedObjectAdaptor.pm
modules/Bio/EnsEMBL/Map/DBSQL/DitagAdaptor.pm
modules/Bio/EnsEMBL/Map/DBSQL/DitagFeatureAdaptor.pm
modules/Bio/EnsEMBL/Map/DBSQL/MarkerAdaptor.pm
modules/Bio/EnsEMBL/Map/DBSQL/MarkerFeatureAdaptor.pm
Date: Fri Feb 15 17:27:37 2013 +0000
port() and host() may not be set for all drivers.
modules/Bio/EnsEMBL/Storable.pm
modules/t/dbConnection.t
Date: Fri Feb 15 17:30:06 2013 +0000
DBConnection.pm: date conversions for SQLite. (*)
modules/Bio/EnsEMBL/DBSQL/DBConnection.pm
Date: Fri Feb 15 17:31:06 2013 +0000
dbConnection.t: quoting for SQLite.
modules/t/dbConnection.t
Date: Fri Feb 15 17:31:50 2013 +0000
sqlHelper.t: only attempt to alter engine for MySQL.
modules/t/sqlHelper.t
Date: Fri Feb 15 17:34:25 2013 +0000
AttributeAdaptor.pm: portable SQL.
This may be sub-optimal for MySQL.
modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
Date: Fri Feb 15 17:35:41 2013 +0000
Make SQL more portable, where this has no side-effects.
modules/Bio/EnsEMBL/DBSQL/ArchiveStableIdAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/CompressedSequenceAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
Date: Fri Feb 15 17:42:34 2013 +0000
STRAIGHT_JOIN optimisation only works on MySQL. (*)
modules/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/ExonAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/TranscriptAdaptor.pm
Date: Fri Feb 15 17:43:31 2013 +0000
No support for the registry in SQLite yet.
modules/t/exon.t
modules/t/gene.t
modules/t/operon.t
modules/t/operon_transcript.t
modules/t/transcript.t
modules/t/translation.t
Portability architecture:
Date: Fri Feb 15 17:48:12 2013 +0000
Split out driver-specific code into delegated subclasses.
modules/Bio/EnsEMBL/DBSQL/DBConnection.pm
modules/Bio/EnsEMBL/DBSQL/Driver.pm
modules/Bio/EnsEMBL/DBSQL/Driver/SQLite.pm
modules/Bio/EnsEMBL/DBSQL/Driver/TestDummy.pm
modules/Bio/EnsEMBL/DBSQL/Driver/mysql.pm
modules/Bio/EnsEMBL/DBSQL/Driver/odbc.pm
modules/t/dbConnection.t
Date: Fri Feb 15 17:52:23 2013 +0000
Move DBD connection details into Driver::<type> subclasses.
modules/Bio/EnsEMBL/DBSQL/DBConnection.pm
modules/Bio/EnsEMBL/DBSQL/Driver/Oracle.pm
modules/Bio/EnsEMBL/DBSQL/Driver/SQLite.pm
modules/Bio/EnsEMBL/DBSQL/Driver/Sybase.pm
modules/Bio/EnsEMBL/DBSQL/Driver/mysql.pm
modules/Bio/EnsEMBL/DBSQL/Driver/odbc.pm
Date: Fri Feb 15 17:56:03 2013 +0000
last_insert_id(): move driver-specifics to Driver::<type> subclasses.
modules/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/Driver.pm
modules/Bio/EnsEMBL/DBSQL/Driver/mysql.pm
Date: Fri Feb 15 17:57:14 2013 +0000
insert_ignore_clause(): move driver-specifics to Driver::<type> subclasses.
modules/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/Driver/SQLite.pm
modules/Bio/EnsEMBL/DBSQL/Driver/mysql.pm
Date: Fri Feb 15 17:59:34 2013 +0000
Move can_straight_join() to Driver::<type> subclasses.
modules/Bio/EnsEMBL/DBSQL/BaseAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/Driver.pm
modules/Bio/EnsEMBL/DBSQL/Driver/mysql.pm
Further portability work:
Date: Fri Feb 15 18:00:31 2013 +0000
Setting wait_timeout is MySQL-specific.
modules/Bio/EnsEMBL/DBSQL/DBConnection.pm
modules/Bio/EnsEMBL/DBSQL/Driver.pm
modules/Bio/EnsEMBL/DBSQL/Driver/mysql.pm
Date: Fri Feb 15 18:01:28 2013 +0000
dbConnection.t: under DBI, true is not always 1.
(Can be 0E0, for example.)
modules/t/dbConnection.t
Date: Fri Feb 15 18:02:33 2013 +0000
sliceAdaptor.t: save/restore assembly table.
Not saving it relies on MySQL not re-using
primary keys after a restore. SQLite starts
where it left off after the restore.
modules/t/sliceAdaptor.t
Date: Fri Feb 15 18:03:30 2013 +0000
operon_transcript.t: avoid datatype mismatch warnings on translation coords.
modules/t/operon_transcript.t
Date: Fri Feb 15 18:04:22 2013 +0000
qtl.t: do not depend on ordering of list_traits().
modules/t/qtl.t
Date: Fri Feb 15 18:05:16 2013 +0000
registry.t: don't hardcode db driver in config file.
modules/t/registry.t
Further work required
---------------------
DnaAlignFeatureAdaptor & ProteinFeatureAdaptor:
I don't like use of "${tablename}_id" in last_insert_id().
Consider extending the _table() tuple to include id column.
DBEntryAdaptor:
Will need fixups for $sth->rows() called before fetching.
Possible test:
---vvvvv--- cut here ---vvvvv---
# test fetch_all_by_name
$xrefs = $dbEntryAdaptor->fetch_all_by_name('NM_030815');
ok(@{$xrefs} == 1); # test 61
---^^^^^--- cut here ---^^^^^---
OntologyTermAdaptor: FIND_IN_SET() is not available in SQLite.
ProxyDBConnection: Not considered.
Disclaimer
----------
To date this work has been steered by the ensembl test suite. I have
attempted to identify other instances of identified problems
throughout the adaptor code, but can make no guarantees about the
correct functionality under SQLite of any code not covered by the test
suite.
Michael Gray
mg13@sanger.ac.uk
February 2013
<p>h1. FASTA Pipeline</p>
<p>This is a re-implementation of an existing pipeline developed originally by core and the webteam. The new version uses eHive, so familiarity with this system is essential, and has been written to use as little memory as possible.</p>
<p>h2. The Registry File</p>
<p>This is the way we retrieve the database connections to work with. The registry file should specify:</p>
<ul>
<li>The core (and any other) databases to dump from</li>
<li>A production database
<em>* <em>species = multi</em>
<em></em> <em>group = production</em>
*</em> Used to find which species require new DNA</li>
<li>A web database
<em>* <em>species = multi</em>
<em></em> <em>group = web</em>
*</em> Used to name BLAT index files</li>
</ul>
<p>Here is an example of a file for v67 of Ensembl. Note the use of the Registry object within a registry file and the scoping of the package. If you omit the <em>-db_version</em> parameter and only use HEAD checkouts of Ensembl then this will automatically select the latest version of the API. Any change to version here must be reflected in the configuration file.</p>
<p>bc.
package Reg;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
Bio::EnsEMBL::Registry->no_version_check(1);
Bio::EnsEMBL::Registry->no_cache_warnings(1);
{
my $version = 67;
Bio::EnsEMBL::Registry->load_registry_from_multiple_dbs(
{
-host => &#8220;mydb-1&#8221;,
-port => 3306,
-db_version => $version,
-user => &#8220;user&#8221;,
-NO_CACHE => 1,
},
{ <br />
-host => &#8220;mydb-2&#8221;,
-port => 3306,
-db_version => $version,
-user => &#8220;user&#8221;,
-NO_CACHE => 1,
},
);
Bio::EnsEMBL::DBSQL::DBAdaptor->new(
-HOST => &#8216;mydb-2&#8217;,
-PORT => 3306,
-USER => &#8216;user&#8217;,
-DBNAME => &#8216;ensembl_website&#8217;,
-SPECIES => &#8216;multi&#8217;,
-GROUP => &#8216;web&#8217;
);
Bio::EnsEMBL::DBSQL::DBAdaptor->new(
-HOST => &#8216;mydb-2&#8217;,
-PORT => 3306,
-USER => &#8216;user&#8217;,
-DBNAME => &#8216;ensembl_production&#8217;,
-SPECIES => &#8216;multi&#8217;,
-GROUP => &#8216;production&#8217;
);
}
1;</p>
<p>You give the registry to the <em>init_pipeline.pl</em> script via the <em>-registry</em> option</p>
<p>h2. Overriding Defaults Using a New Config File </p>
<p>We recommend if you have a number of parameters which do not change between releases to create a configuration file which inherits from the root config file e.g.</p>
<p>bc.
package MyCnf;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::FASTA_conf/;
sub default_options {
my ($self) = @<em>;
return {
%{ $self->SUPER::default</em>options() },
#Override of options
};
}
1;</p>
<p>If you do override the config then you should use the package name for your overridden config in the upcoming example commands.</p>
<p>h2. Environment</p>
<p>h3. PERL5LIB</p>
<ul>
<li>ensembl</li>
<li>ensembl-hive</li>
<li>bioperl</li>
</ul>
<p>h3. PATH</p>
<ul>
<li>ensembl-hive/scripts</li>
<li>faToTwoBit (if not using a custom location)</li>
<li>xdformat (if not using a custom location)</li>
<li>sendmail (for emailing reports)</li>
</ul>
<p>h3. ENSEMBL_CVS_ROOT_DIR</p>
<p>Set to the base checkout of Ensembl. We should be able to add <em>ensembl-hive/sql</em> onto this path to find the SQL directory for hive e.g.</p>
<p>bc.
export ENSEMBL_CVS_ROOT_DIR=$HOME/work/ensembl-checkouts</p>
<p>h3. ENSADMIN_PSW</p>
<p>Give the password to use to log into a database server e.g.</p>
<p>bc.
export ENSADMIN_PSW=wibble</p>
<p>h2. Command Line Arguments</p>
<p>Where <em>Multiple Supported</em> is supported we allow the user to specify the parameter more than once on the command line. For example species is one of these options e.g. </p>
<p>bc. -species human -species cele -species yeast</p>
<p>|<em>. Name |</em>. Type|<em>. Multiple Supported|</em>. Description|<em>. Default|</em>. Required|
|@-registry@|String|No|Location of the Ensembl registry to use with this pipeline|-|<em>YES</em>|
|@-base_path@|String|No|Location of the dumps|-|<em>YES</em>|
|@-pipeline_db -host=@|String|No|Specify a host for the hive database e.g. @-pipeline_db -host=myserver.mysql@|See hive generic config|<em>YES</em>|
|@-pipeline_db -dbname=@|String|No|Specify a different database to use as the hive DB e.g. @-pipeline_db -dbname=my_dumps_test@|Uses pipeline name by default|<em>NO</em>|
|@-ftp_dir@|String|No|Location of the current FTP directory with the previous release&#8217;s files. We will use this to copy DNA files from one release to another. If not given we do not do any reuse|-|<em>NO</em>|
|@-run_all_@|Boolean|No|Ignores any kind of reuse and forces the dump of all DNAs|-|<em>NO</em>|
|@-species@|String|Yes|Specify one or more species to process. Pipeline will only <em>consider</em> these species. Use <em>-force_species</em> if you want to force a species run|-|<em>NO</em>|
|@-force_species@|String|Yes|Specify one or more species to force through the pipeline. This is useful to force a dump when you know reuse will not do the <em>&#8220;right thing&#8221;</em>|-|<em>NO</em>|
|@-dump_types@|String|Yes|Specify each type of dump you want to produce. Supported values are <em>dna</em>, <em>cdna</em> and <em>ncrna</em>|All|<em>NO</em>|
|@-db_types@|String|Yes|The database types to use. Supports the normal db adaptor groups e.g. <em>core</em>, <em>otherfeatures</em> etc.|core|<em>NO</em>|
|@-process_logic_names@|String|Yes|Provide a set of logic names whose models should be dumped|-|<em>NO</em>|
|@-skip_logic_names@|String|Yes|Provide a set of logic names to skip when creating dumps. These are evaluated <em>after</em> @-process_logic_names@|core|<em>NO</em>|
|@-release@|Integer|No|The release to dump|Software version|<em>NO</em>|
|@-previous_release@|Integer|No|The previous release to use. Used to calculate reuse|Software version minus 1|<em>NO</em>|
|@-blast_servers@|String|Yes|The servers to copy blast indexes to|-|<em>NO</em>|
|@-blast_genomic_dir@|String|No|Location to copy the DNA blast indexes to|-|<em>NO</em>|
|@-blast_genes_dir@|String|No|Location to copy the DNA gene (cdna, ncrna and protein) indexes to|-|<em>NO</em>|
|@-scp_user@|String|No|User to perform the SCP as. Defaults to the current user|Current user|<em>NO</em>|
|@-scp_identity@|String|No|The SSH identity file to use when performing SCPs. Normally used in conjunction with <em>-scp_user</em>|-|<em>NO</em>|
|@-no_scp@|Boolean|No|Skip SCP altogether|0|<em>NO</em>|
|@-pipeline_name@|String|No|Name to use for the pipeline|fasta_dump_$release|<em>NO</em>|
|@-wublast_exe@|String|No|Location of the WUBlast indexing binary|xdformat|<em>NO</em>|
|@-blat_exe@|String|No|Location of the Blat indexing binary|faToTwoBit|<em>NO</em>|
|@-port_offset@|Integer|No|The offset of the ports to use when generating blat indexes. This figure is added onto the web database species ID|30000|<em>NO</em>|
|@-email@|String|No|Email to send pipeline summaries to upon its successful completion|$USER@sanger.ac.uk|<em>NO</em>|</p>
<p>h2. Example Commands</p>
<p>h3. To load use normally:</p>
<p>bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -base_path /path/to/dumps -registry reg.pm</p>
<p>h3. Run a subset of species (no forcing &amp; supports registry aliases):</p>
<p>bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species anolis -species celegans -species human \
-base_path /path/to/dumps -registry reg.pm</p>
<p>h3. Specifying species to force (supports all registry aliases):</p>
<p>bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -force_species anolis -force_species celegans -force_species human \
-base_path /path/to/dumps -registry reg.pm</p>
<p>h3. Running &amp; forcing a species:</p>
<p>bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species celegans -force_species celegans \
-base_path /path/to/dumps -registry reg.pm</p>
<p>h3. Running everything:</p>
<p>bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -run_all 1 \
-base_path /path/to/dumps -registry reg.pm</p>
<p>h3. Dumping just gene data (no DNA or ncRNA):</p>
<p>bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -dump_type cdna \
-base_path /path/to/dumps -registry reg.pm</p>
<p>h3. Using a different SCP user &amp; identity:</p>
<p>bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -scp_user anotherusr -scp_identity /users/anotherusr/.pri/identity \
-base_path /path/to/dumps -registry reg.pm</p>
<p>h2. Running the Pipeline</p>
<h1 id="start_a_screen_session_or_get_ready_to_run_the_beekeeper_with_a_nohup">Start a screen session or get ready to run the beekeeper with a @nohup@</h1>
<h1 id="choose_a_dump_location">Choose a dump location</h1>
<h1 id="_a_fasta_blast_and_blat_directory_will_be_created_1_level_below">* A fasta, blast and blat directory will be created 1 level below</h1>
<h1 id="use_an_init_pipelinepl_configuration_from_above">Use an @init_pipeline.pl@ configuration from above</h1>
<h1 id="_make_sure_to_give_it_the_base_path_parameter">* Make sure to give it the @-base_path@ parameter</h1>
<h1 id="sync_the_database_using_one_of_the_displayed_from_init_pipelinepl">Sync the database using one of the displayed from @init_pipeline.pl@</h1>
<h1 id="run_the_pipeline_in_a_loop_with_a_good_sleep_between_submissions_and_redirect_log_output_the_following_assumes_you_are_using_bash">Run the pipeline in a loop with a good sleep between submissions and redirect log output (the following assumes you are using <em>bash</em>)</h1>
<h1 id="_21_is_important_as_this_clobbers_stderr_into_stdout">* @2>&amp;1@ is important as this clobbers STDERR into STDOUT</h1>
<h1 id="_my_runlog_then_sends_the_output_to_this_file_use_tail_f_to_track_the_pipeline">* @> my_run.log@ then sends the output to this file. Use @tail -f@ to track the pipeline</h1>
<h1 id="beekeeperpl_url_mysql_usrpassserverport_db_reg_conf_regpm_loop_sleep_5_21_my_runlog_">@beekeeper.pl -url mysql://usr:pass@server:port/db -reg_conf reg.pm -loop -sleep 5 2>&amp;1 > my_run.log &amp;@</h1>
<h1 id="wait">Wait</h1>
<p>h2. But I Don&#8217;t Want a Pipeline</p>
<p>Hive gives us the ability to run any Process outside of a database pipeline
run using @standaloneJob.pl@. We will list some useful commands to run</p>
<p>h3. Running DNA Dumping</p>
<p>bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::FASTA::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -sequence_type_list '["dna"]' \
-base_path /path/to/dumps</p>
<p>h3. Running Gene Dumping</p>
<p>bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::FASTA::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -sequence_type_list '["cdna"]' \
-base_path /path/to/dumps</p>
<p>h3. Running ncRNA Dumping</p>
<p>bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::FASTA::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -sequence_type_list '["ncrna"]' \
-base_path /path/to/dumps</p>
h1. FASTA Pipeline
This is a re-implementation of an existing pipeline developed originally by core and the webteam. The new version uses eHive, so familiarity with this system is essential, and has been written to use as little memory as possible.
h2. The Registry File
This is the way we retrieve the database connections to work with. The registry file should specify:
* The core (and any other) databases to dump from
* A production database
** *species = multi*
** *group = production*
** Used to find which species require new DNA
* A web database
** *species = multi*
** *group = web*
** Used to name BLAT index files
Here is an example of a file for v67 of Ensembl. Note the use of the Registry object within a registry file and the scoping of the package. If you omit the *-db_version* parameter and only use HEAD checkouts of Ensembl then this will automatically select the latest version of the API. Any change to version here must be reflected in the configuration file.
bc.
package Reg;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
Bio::EnsEMBL::Registry->no_version_check(1);
Bio::EnsEMBL::Registry->no_cache_warnings(1);
{
my $version = 67;
Bio::EnsEMBL::Registry->load_registry_from_multiple_dbs(
{
-host => "mydb-1",
-port => 3306,
-db_version => $version,
-user => "user",
-NO_CACHE => 1,
},
{
-host => "mydb-2",
-port => 3306,
-db_version => $version,
-user => "user",
-NO_CACHE => 1,
},
);
Bio::EnsEMBL::DBSQL::DBAdaptor->new(
-HOST => 'mydb-2',
-PORT => 3306,
-USER => 'user',
-DBNAME => 'ensembl_website',
-SPECIES => 'multi',
-GROUP => 'web'
);
Bio::EnsEMBL::DBSQL::DBAdaptor->new(
-HOST => 'mydb-2',
-PORT => 3306,
-USER => 'user',
-DBNAME => 'ensembl_production',
-SPECIES => 'multi',
-GROUP => 'production'
);
}
1;
You give the registry to the *init_pipeline.pl* script via the *-registry* option
h2. Overriding Defaults Using a New Config File
We recommend if you have a number of parameters which do not change between releases to create a configuration file which inherits from the root config file e.g.
bc.
package MyCnf;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::FASTA_conf/;
sub default_options {
my ($self) = @_;
return {
%{ $self->SUPER::default_options() },
#Override of options
};
}
1;
If you do override the config then you should use the package name for your overridden config in the upcoming example commands.
h2. Environment
h3. PERL5LIB
* ensembl
* ensembl-hive
* bioperl
h3. PATH
* ensembl-hive/scripts
* faToTwoBit (if not using a custom location)
* xdformat (if not using a custom location)
* sendmail (for emailing reports)
h3. ENSEMBL_CVS_ROOT_DIR
Set to the base checkout of Ensembl. We should be able to add *ensembl-hive/sql* onto this path to find the SQL directory for hive e.g.
bc.
export ENSEMBL_CVS_ROOT_DIR=$HOME/work/ensembl-checkouts
h3. ENSADMIN_PSW
Give the password to use to log into a database server e.g.
bc.
export ENSADMIN_PSW=wibble
h2. Command Line Arguments
Where *Multiple Supported* is supported we allow the user to specify the parameter more than once on the command line. For example species is one of these options e.g.
bc. -species human -species cele -species yeast
|_. Name |_. Type|_. Multiple Supported|_. Description|_. Default|_. Required|
|@-registry@|String|No|Location of the Ensembl registry to use with this pipeline|-|*YES*|
|@-base_path@|String|No|Location of the dumps|-|*YES*|
|@-pipeline_db -host=@|String|No|Specify a host for the hive database e.g. @-pipeline_db -host=myserver.mysql@|See hive generic config|*YES*|
|@-pipeline_db -dbname=@|String|No|Specify a different database to use as the hive DB e.g. @-pipeline_db -dbname=my_dumps_test@|Uses pipeline name by default|*NO*|
|@-ftp_dir@|String|No|Location of the current FTP directory with the previous release's files. We will use this to copy DNA files from one release to another. If not given we do not do any reuse|-|*NO*|
|@-run_all_@|Boolean|No|Ignores any kind of reuse and forces the dump of all DNAs|-|*NO*|
|@-species@|String|Yes|Specify one or more species to process. Pipeline will only _consider_ these species. Use *-force_species* if you want to force a species run|-|*NO*|
|@-force_species@|String|Yes|Specify one or more species to force through the pipeline. This is useful to force a dump when you know reuse will not do the _"right thing"_|-|*NO*|
|@-dump_types@|String|Yes|Specify each type of dump you want to produce. Supported values are *dna*, *cdna* and *ncrna*|All|*NO*|
|@-db_types@|String|Yes|The database types to use. Supports the normal db adaptor groups e.g. *core*, *otherfeatures* etc.|core|*NO*|
|@-process_logic_names@|String|Yes|Provide a set of logic names whose models should be dumped|-|*NO*|
|@-skip_logic_names@|String|Yes|Provide a set of logic names to skip when creating dumps. These are evaluated *after* @-process_logic_names@|core|*NO*|
|@-release@|Integer|No|The release to dump|Software version|*NO*|
|@-previous_release@|Integer|No|The previous release to use. Used to calculate reuse|Software version minus 1|*NO*|
|@-blast_servers@|String|Yes|The servers to copy blast indexes to|-|*NO*|
|@-blast_genomic_dir@|String|No|Location to copy the DNA blast indexes to|-|*NO*|
|@-blast_genes_dir@|String|No|Location to copy the DNA gene (cdna, ncrna and protein) indexes to|-|*NO*|
|@-scp_user@|String|No|User to perform the SCP as. Defaults to the current user|Current user|*NO*|
|@-scp_identity@|String|No|The SSH identity file to use when performing SCPs. Normally used in conjunction with *-scp_user*|-|*NO*|
|@-no_scp@|Boolean|No|Skip SCP altogether|0|*NO*|
|@-pipeline_name@|String|No|Name to use for the pipeline|fasta_dump_$release|*NO*|
|@-wublast_exe@|String|No|Location of the WUBlast indexing binary|xdformat|*NO*|
|@-blat_exe@|String|No|Location of the Blat indexing binary|faToTwoBit|*NO*|
|@-port_offset@|Integer|No|The offset of the ports to use when generating blat indexes. This figure is added onto the web database species ID|30000|*NO*|
|@-email@|String|No|Email to send pipeline summaries to upon its successful completion|$USER@sanger.ac.uk|*NO*|
h2. Example Commands
h3. To load use normally:
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -base_path /path/to/dumps -registry reg.pm
h3. Run a subset of species (no forcing & supports registry aliases):
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species anolis -species celegans -species human \
-base_path /path/to/dumps -registry reg.pm
h3. Specifying species to force (supports all registry aliases):
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -force_species anolis -force_species celegans -force_species human \
-base_path /path/to/dumps -registry reg.pm
h3. Running & forcing a species:
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -species celegans -force_species celegans \
-base_path /path/to/dumps -registry reg.pm
h3. Running everything:
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -run_all 1 \
-base_path /path/to/dumps -registry reg.pm
h3. Dumping just gene data (no DNA or ncRNA):
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -dump_type cdna \
-base_path /path/to/dumps -registry reg.pm
h3. Using a different SCP user & identity:
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FASTA_conf \
-pipeline_db -host=my-db-host -scp_user anotherusr -scp_identity /users/anotherusr/.pri/identity \
-base_path /path/to/dumps -registry reg.pm
h2. Running the Pipeline
# Start a screen session or get ready to run the beekeeper with a @nohup@
# Choose a dump location
#* A fasta, blast and blat directory will be created 1 level below
# Use an @init_pipeline.pl@ configuration from above
#* Make sure to give it the @-base_path@ parameter
# Sync the database using one of the displayed commands from @init_pipeline.pl@
# Run the pipeline in a loop with a good sleep between submissions and redirect log output (the following assumes you are using *bash*)
#* @2>&1@ is important as this clobbers STDERR into STDOUT
#* @> my_run.log@ then sends the output to this file. Use @tail -f@ to track the pipeline
# @beekeeper.pl -url mysql://usr:pass@server:port/db -reg_conf reg.pm -loop -sleep 5 2>&1 > my_run.log &@
# Wait
h2. But I Don't Want a Pipeline
Hive gives us the ability to run any Process outside of a database pipeline
run using @standaloneJob.pl@. We will list some useful commands to run
h3. Running DNA Dumping
bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::FASTA::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -sequence_type_list '["dna"]' \
-base_path /path/to/dumps
h3. Running Gene Dumping
bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::FASTA::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -sequence_type_list '["cdna"]' \
-base_path /path/to/dumps
h3. Running ncRNA Dumping
bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::FASTA::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -sequence_type_list '["ncrna"]' \
-base_path /path/to/dumps
\ No newline at end of file
<?xml version='1.0' encoding='utf-8' ?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body><h1 id="FlatFilePipeline">FlatFile Pipeline</h1><p>This is a re-implementation of an existing pipeline developed originally by core and the webteam. The new version uses eHive, so familiarity with this system is essential, and has been written to use as little memory as possible.</p><h2 id="TheRegistryFile">The Registry File</h2><p>This is the way we retrieve the database connections to work with. The registry file should specify:</p><ul><li>The core (and any other) databases to dump from</li></ul><p>Here is an example of a file for v67 of Ensembl. Note the use of the Registry object within a registry file and the scoping of the package. If you omit the <strong>-db_version</strong> parameter and only use HEAD checkouts of Ensembl then this will automatically select the latest version of the API. Any change to version here must be reflected in the configuration file.</p><pre><code> package Reg;
use Bio::EnsEMBL::Registry;
Bio::EnsEMBL::Registry-&gt;no_version_check(1);
Bio::EnsEMBL::Registry-&gt;no_cache_warnings(1);
{
my $version = 67;
Bio::EnsEMBL::Registry-&gt;load_registry_from_multiple_dbs(
{
-host =&gt; "mydb-1",
-port =&gt; 3306,
-db_version =&gt; $version,
-user =&gt; "user",
-NO_CACHE =&gt; 1,
},
{
-host =&gt; "mydb-2",
-port =&gt; 3306,
-db_version =&gt; $version,
-user =&gt; "user",
-NO_CACHE =&gt; 1,
},
);
}
1;
</code></pre><p>You give the registry to the <strong>init_pipeline.pl</strong> script via the <strong>-registry</strong> option</p><h2 id="OverridingDefaultsUsingaNewConfigFile">Overriding Defaults Using a New Config File </h2><p>We recommend if you have a number of parameters which do not change between releases to create a configuration file which inherits from the root config file e.g.</p><pre><code> package MyCnf;
use base qw/Bio::EnsEMBL::Pipeline::Flatfile::Flatfile_conf/;
sub default_options {
my ($self) = @_;
return {
%{ $self-&gt;SUPER::default_options() },
#Override of options
};
}
1;
</code></pre><p>If you do override the config then you should use the package name for your overridden config in the upcoming example commands.</p><h2 id="Environment">Environment</h2><h3 id="PERL5LIB">PERL5LIB</h3><ul><li>ensembl</li><li>ensembl-hive</li><li>bioperl</li></ul><h3 id="PATH">PATH</h3><ul><li>ensembl-hive/scripts</li></ul><h3 id="ENSEMBLCVSROOTDIR">ENSEMBL_CVS_ROOT_DIR</h3><p>Set to the base checkout of Ensembl. We should be able to add <strong>ensembl-hive/sql</strong> onto this path to find the SQL directory for hive e.g.</p><pre><code> export ENSEMBL_CVS_ROOT_DIR=$HOME/work/ensembl-checkouts
</code></pre><h3 id="ENSADMINPSW">ENSADMIN_PSW</h3><p>Give the password to use to log into a database server e.g.</p><pre><code> export ENSADMIN_PSW=wibble
</code></pre><h2 id="CommandLineArguments">Command Line Arguments</h2><p>Where <strong>Multiple Supported</strong> is supported we allow the user to specify the parameter more than once on the command line. For example species is one of these options e.g. </p><pre><code>-species human -species cele -species yeast
</code></pre><table><tr><th>Name </th><th> Type</th><th>Multiple Supported</th><th> Description</th><th>Default</th><th> Required</th></tr><tr><td><code>-registry</code></td><td>String</td><td>No</td><td>Location of the Ensembl registry to use with this pipeline</td><td>-</td><td><strong>YES</strong></td></tr><tr><td><code>-base_path</code></td><td>String</td><td>No</td><td>Location of the dumps</td><td>-</td><td><strong>YES</strong></td></tr><tr><td><code>-pipeline_db -host=</code></td><td>String</td><td>No</td><td>Specify a host for the hive database e.g. <code>-pipeline_db -host=myserver.mysql</code></td><td>See hive generic config</td><td><strong>YES</strong></td></tr><tr><td><code>-pipeline_db -dbname=</code></td><td>String</td><td>No</td><td>Specify a different database to use as the hive DB e.g. <code>-pipeline_db -dbname=my_dumps_test</code></td><td>Uses pipeline name by default</td><td><strong>NO</strong></td></tr><tr><td><code>-species</code></td><td>String</td><td>Yes</td><td>Specify one or more species to process. Pipeline will only <em>consider</em> these species</td><td>-</td><td><strong>NO</strong></td></tr><tr><td><code>-types</code></td><td>String</td><td>Yes</td><td>Specify each type of dump you want to produce. Supported values are <strong>embl</strong> and <strong>genbank</strong></td><td>All</td><td><strong>NO</strong></td></tr><tr><td><code>-db_types</code></td><td>String</td><td>Yes</td><td>The database types to use. Supports the normal db adaptor groups e.g. 
<strong>core</strong>, <strong>otherfeatures</strong> etc.</td><td>core</td><td><strong>NO</strong></td></tr><tr><td><code>-pipeline_name</code></td><td>String</td><td>No</td><td>Name to use for the pipeline</td><td>flatfile_dump_$release</td><td><strong>NO</strong></td></tr><tr><td><code>-email</code></td><td>String</td><td>No</td><td>Email to send pipeline summaries to upon its successful completion</td><td>$USER@sanger.ac.uk</td><td><strong>NO</strong></td></tr></table><h2 id="ExampleCommands">Example Commands</h2><h3 id="Toloadusenormally">To load use normally:</h3><pre><code> init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf \
-pipeline_db -host=my-db-host -base_path /path/to/dumps -registry reg.pm
</code></pre><h3 id="Runasubsetofspeciesnoforcingsupportsregistryaliases">Run a subset of species (no forcing &amp; supports registry aliases):</h3><pre><code> init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf \
-pipeline_db -host=my-db-host -species anolis -species celegans -species human \
-base_path /path/to/dumps -registry reg.pm
</code></pre><h3 id="DumpingjustEMBLdatanogenbank">Dumping just EMBL data (no genbank):</h3><pre><code> init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf \
-pipeline_db -host=my-db-host -type embl \
-base_path /path/to/dumps -registry reg.pm
</code></pre><h2 id="RunningthePipeline">Running the Pipeline</h2><ol><li>Start a screen session or get ready to run the beekeeper with a <code>nohup</code></li><li>Choose a dump location<ul><li>A fasta, blast and blat directory will be created 1 level below</li></ul></li><li>Use an <code>init_pipeline.pl</code> configuration from above<ul><li>Make sure to give it the <code>-base_path</code> parameter</li></ul></li><li>Sync the database using one of the displayed from <code>init_pipeline.pl</code></li><li>Run the pipeline in a loop with a good sleep between submissions and redirect log output (the following assumes you are using <strong>bash</strong>)<ul><li><code>2&gt;&amp;1</code> is important as this clobbers STDERR into STDOUT</li><li><code>&gt; my_run.log</code> then sends the output to this file. Use <code>tail -f</code> to track the pipeline</li></ul></li><li><code>beekeeper.pl -url mysql://usr:pass@server:port/db -reg_conf reg.pm -loop -sleep 5 2&gt;&amp;1 &gt; my_run.log &amp;</code></li><li>Wait</li></ol><h2 id="RunningwithoutaPipeline">Running without a Pipeline</h2><p>Hive gives us the ability to run any Process outside of a database pipeline <br/>run using <code>standaloneJob.pl</code>. We will list some useful commands to run</p><h3 id="DumpingaSingleSpecies">Dumping a Single Species</h3><pre><code> standaloneJob.pl Bio::EnsEMBL::Pipeline::Flatfile::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -type embl \
-base_path /path/to/dumps
</code></pre><h2 id="Verification">Verification</h2><p>Another pipeline is provided which can verify the files produced by this <br/>pipeline. Nothing else other than a basic prodding of file contents is<br/>attempted.</p><h3 id="RunningwithaPipeline">Running with a Pipeline</h3><p>The code works with a SQLite database so you do not need a MySQL database<br/>to schedule these jobs. You will have to schedule two pipelines; one<br/>to work with embl and another to work with genbank.</p><p>The pipeline searches for all files matching the format *.dat.gz.</p><pre><code> init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FlatfileChecker_conf \
-base_path /path/to/embl/dumps -type embl
</code></pre><pre><code> init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FlatfileChecker_conf \
-base_path /path/to/genbank/dumps -type genbank
</code></pre><h3 id="RunningwithoutaPipeline2">Running without a Pipeline</h3><p>You can run this module without a pipeline if you need to check a single<br/>file.</p><pre><code> standaloneJob.pl Bio::EnsEMBL::Pipeline::Flatfile::CheckFlatfile \
-file /path/to/embl/dumps/homo_sapiens/Homo_sapiens.chromosome.1.dat.gz \
-type embl
</code></pre></body></html>
\ No newline at end of file
h1. FlatFile Pipeline
This is a re-implementation of an existing pipeline developed originally by core and the webteam. The new version uses eHive, so familiarity with this system is essential, and has been written to use as little memory as possible.
h2. The Registry File
This is the way we retrieve the database connections to work with. The registry file should specify:
* The core (and any other) databases to dump from
Here is an example of a file for v67 of Ensembl. Note the use of the Registry object within a registry file and the scoping of the package. If you omit the *-db_version* parameter and only use HEAD checkouts of Ensembl then this will automatically select the latest version of the API. Any change to version here must be reflected in the configuration file.
bc.
package Reg;
use Bio::EnsEMBL::Registry;
Bio::EnsEMBL::Registry->no_version_check(1);
Bio::EnsEMBL::Registry->no_cache_warnings(1);
{
my $version = 67;
Bio::EnsEMBL::Registry->load_registry_from_multiple_dbs(
{
-host => "mydb-1",
-port => 3306,
-db_version => $version,
-user => "user",
-NO_CACHE => 1,
},
{
-host => "mydb-2",
-port => 3306,
-db_version => $version,
-user => "user",
-NO_CACHE => 1,
},
);
}
1;
You give the registry to the *init_pipeline.pl* script via the *-registry* option
h2. Overriding Defaults Using a New Config File
We recommend if you have a number of parameters which do not change between releases to create a configuration file which inherits from the root config file e.g.
bc.
package MyCnf;
use base qw/Bio::EnsEMBL::Pipeline::Flatfile::Flatfile_conf/;
sub default_options {
my ($self) = @_;
return {
%{ $self->SUPER::default_options() },
#Override of options
};
}
1;
If you do override the config then you should use the package name for your overridden config in the upcoming example commands.
h2. Environment
h3. PERL5LIB
* ensembl
* ensembl-hive
* bioperl
h3. PATH
* ensembl-hive/scripts
h3. ENSEMBL_CVS_ROOT_DIR
Set to the base checkout of Ensembl. We should be able to add *ensembl-hive/sql* onto this path to find the SQL directory for hive e.g.
bc.
export ENSEMBL_CVS_ROOT_DIR=$HOME/work/ensembl-checkouts
h3. ENSADMIN_PSW
Give the password to use to log into a database server e.g.
bc.
export ENSADMIN_PSW=wibble
h2. Command Line Arguments
Where *Multiple Supported* is supported we allow the user to specify the parameter more than once on the command line. For example species is one of these options e.g.
bc. -species human -species cele -species yeast
|_. Name |_. Type|_. Multiple Supported|_. Description|_. Default|_. Required|
|@-registry@|String|No|Location of the Ensembl registry to use with this pipeline|-|*YES*|
|@-base_path@|String|No|Location of the dumps|-|*YES*|
|@-pipeline_db -host=@|String|No|Specify a host for the hive database e.g. @-pipeline_db -host=myserver.mysql@|See hive generic config|*YES*|
|@-pipeline_db -dbname=@|String|No|Specify a different database to use as the hive DB e.g. @-pipeline_db -dbname=my_dumps_test@|Uses pipeline name by default|*NO*|
|@-species@|String|Yes|Specify one or more species to process. Pipeline will only _consider_ these species|-|*NO*|
|@-types@|String|Yes|Specify each type of dump you want to produce. Supported values are *embl* and *genbank*|All|*NO*|
|@-db_types@|String|Yes|The database types to use. Supports the normal db adaptor groups e.g. *core*, *otherfeatures* etc.|core|*NO*|
|@-pipeline_name@|String|No|Name to use for the pipeline|flatfile_dump_$release|*NO*|
|@-email@|String|No|Email to send pipeline summaries to upon its successful completion|$USER@sanger.ac.uk|*NO*|
h2. Example Commands
h3. To load use normally:
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf \
-pipeline_db -host=my-db-host -base_path /path/to/dumps -registry reg.pm
h3. Run a subset of species (no forcing & supports registry aliases):
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf \
-pipeline_db -host=my-db-host -species anolis -species celegans -species human \
-base_path /path/to/dumps -registry reg.pm
h3. Dumping just EMBL data (no genbank):
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf \
-pipeline_db -host=my-db-host -type embl \
-base_path /path/to/dumps -registry reg.pm
h2. Running the Pipeline
# Start a screen session or get ready to run the beekeeper with a @nohup@
# Choose a dump location
#* A fasta, blast and blat directory will be created 1 level below
# Use an @init_pipeline.pl@ configuration from above
#* Make sure to give it the @-base_path@ parameter
# Sync the database using one of the displayed commands from @init_pipeline.pl@
# Run the pipeline in a loop with a good sleep between submissions and redirect log output (the following assumes you are using *bash*)
#* @2>&1@ is important as this clobbers STDERR into STDOUT
#* @> my_run.log@ then sends the output to this file. Use @tail -f@ to track the pipeline
# @beekeeper.pl -url mysql://usr:pass@server:port/db -reg_conf reg.pm -loop -sleep 5 2>&1 > my_run.log &@
# Wait
h2. Running without a Pipeline
Hive gives us the ability to run any Process outside of a database pipeline
run using @standaloneJob.pl@. We will list some useful commands to run
h3. Dumping a Single Species
bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::Flatfile::DumpFile \
-reg_conf reg.pm -debug 2 \
-release 67 -species homo_sapiens -type embl \
-base_path /path/to/dumps
h2. Verification
Another pipeline is provided which can verify the files produced by this
pipeline. Nothing else other than a basic prodding of file contents is
attempted.
h3. Running with a Pipeline
The code works with a SQLite database so you do not need a MySQL database
to schedule these jobs. You will have to schedule two pipelines; one
to work with embl and another to work with genbank.
The pipeline searches for all files matching the format *.dat.gz.
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FlatfileChecker_conf \
-base_path /path/to/embl/dumps -type embl
bc.
init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::FlatfileChecker_conf \
-base_path /path/to/genbank/dumps -type genbank
h3. Running without a Pipeline
You can run this module without a pipeline if you need to check a single
file.
bc.
standaloneJob.pl Bio::EnsEMBL::Pipeline::Flatfile::CheckFlatfile \
-file /path/to/embl/dumps/homo_sapiens/Homo_sapiens.chromosome.1.dat.gz \
-type embl
#!/usr/bin/env perl
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##########################################
#
# Simple but handy script that generates the input file
# needed by the script CopyDBoverServer.pl
#
# It lists the databases on a source MySQL host (optionally filtered by a
# LIKE pattern) and prints one "source host/port/db  destination
# host/port/db [target_location]" line per database.
#
# Please post comments/questions to the Ensembl development list
# <http://lists.ensembl.org/mailman/listinfo/dev>
#
#########################################
use strict;
use warnings;

use DBI;
use Getopt::Long;

# Command-line options and their defaults.
my $help            = 0;
my $sourceHost      = "ens-staging";
my $sourcePort      = "3306";
my $sourceUser      = "";
my $sourcePwd       = "";
my $destinationHost = "mart2";
my $destinationPort = "3306";
my $limit           = '';
my $target_location = '';

my $usage = "\nUsage: $0 -sourceHost mart1 -sourceUser xxx -destinationHost mart2 -limit %42%\n
-help or -h [for help]
-sourceHost [default: ens-staging]
-sourcePort [default: 3306 ]
-sourceUser [default: ]
-destinationHost [default: mart2 ]
-destinationPort [default: 3306 ]
-limit [eg. %core_42% ]
-target_location [default: blank if you want standard data locations]\n
The limit option will limit the databases being copied according to your limit criteria.
With -limit %core_42% only ensembl core 42 databases will be copied\n\n
";

# Remember whether any arguments were supplied BEFORE GetOptions() runs:
# GetOptions() consumes recognised options from @ARGV, so testing
# "scalar @ARGV == 0" afterwards would always be true for a normal
# options-only invocation and the script would unconditionally print the
# usage message and exit.
my $argc = scalar @ARGV;

GetOptions('help|h'            => \$help,
           'sourceHost=s'      => \$sourceHost,
           'sourcePort=s'      => \$sourcePort,
           'sourceUser=s'      => \$sourceUser,
           'sourcePwd=s'       => \$sourcePwd,
           'destinationHost=s' => \$destinationHost,
           'destinationPort=s' => \$destinationPort,
           'target_location=s' => \$target_location,
           'limit=s'           => \$limit)
  or die $usage;

if ($help || $argc == 0) {
  print $usage;
  exit 0;
}

#--------------connect to MySQL Source Host
my $dbh = DBI->connect("DBI:mysql:host=$sourceHost:port=$sourcePort",
                       $sourceUser,
                       $sourcePwd)
  or die "Can't connect to database: $DBI::errstr\n";

#--------------prepare and execute the query
# Bind the LIKE pattern as a placeholder rather than interpolating it
# into the statement, so quotes/odd characters in -limit cannot break
# (or alter) the SQL.
my ($sql, @bind);
if (!$limit) {
  $sql = "show databases";
}
else {
  $sql  = "show databases like ?";
  @bind = ($limit);
}
my $sth = $dbh->prepare($sql);
$sth->execute(@bind);

# One line per database, in the format expected by CopyDBoverServer.pl:
#   <source FQDN> <source port> <db name> <dest FQDN> <dest port> <db name> [target_location]
while ( my @row = $sth->fetchrow_array ) {
  my $result = sprintf ("%s %d %50s %s %d %s",
                        $sourceHost . ".internal.sanger.ac.uk", $sourcePort,
                        $row[0],
                        $destinationHost . ".internal.sanger.ac.uk ", $destinationPort,
                        $row[0]);
  $result .= " $target_location" if $target_location;
  print $result . "\n";
}
$sth->finish( );
$dbh->disconnect();
*** Schema patching using schema_patcher.pl ***
This document describes the schema patch procedure for Ensembl core, core like,
variation and regulation/funcgen databases.
-------------------------------------------------------------------------------
Applying patches
-------------------------------------------------------------------------------
Schema patches are split into multiple files, each containing one "task". The
convention for patch files (in ensembl/sql) is:
patch_FROM_TO_[a-z].sql
where FROM is the schema version before patching, TO the version to patch to,
and [a-z] is a letter specifying the respective patch.
Each patch will put an entry into the meta table (meta_key = 'patch') to
indicate that it has been applied successfully. This should make it easier to
track which patches have been applied to which dbs.
This meta information is also used by the new script to determine which patches
need to be applied to a db (note that you can still apply the patches manually
if you prefer). The script is ensembl/misc-scripts/schema_patcher.pl; please
consult the help and extended help that comes with this script for more
advanced use cases.
Typical use for core databases:
-------------------------------
Please note this will only apply patches to core and core like
databases (rnaseq etc.). Please look at the help for other examples:
1. Check which patches need to be applied:
This step is entirely optional, you can proceed to step (2) immediately if you
like.
$ ensembl/misc-scripts/schema_patcher.pl --host HOST --port PORT \
--user xxx --pass xxx --type core --from 65 --release 66 --verbose --dryrun
2. Patch the databases:
$ ensembl/misc-scripts/schema_patcher.pl --host HOST --port PORT \
--user xxx --pass xxx --type core --from 65 --release 66 --verbose
Patching other databases:
-------------------------
The schema_patcher.pl has been modified to work as well to patch funcgen and
variation databases. If you want to patch the variation DBs then run as
follows
$ ensembl/misc-scripts/schema_patcher.pl --host HOST --port PORT \
--user xxx --pass xxx --type variation --from 65 --release 66 --verbose
Running against a single database:
----------------------------------
Some circumstances require the running of the code against a single database
(though the script is intelligent enough to avoid running a patch which has
already been reported as run in the meta table).
$ ensembl/misc-scripts/schema_patcher.pl --host HOST --port PORT \
--user xxx --pass xxx --database homo_sapiens_core_66_37 --from 65 --release 66 --verbose
Checking if you are up to date:
-------------------------------
The following will check all core databases on the current server and will
check that all patches have been applied and reports back when they are not.
$ ensembl/misc-scripts/schema_patcher.pl --host HOST --port PORT \
--user xxx --pass xxx --type core --release 66 --dryrun
It is then left to the user to look at those missing patches & attempt to
recreate the patch as best they can (natural schema churn can result in
old patches becoming redundant or unsafe to run on later DBs).
-------------------------------------------------------------------------------
Writing patches
-------------------------------------------------------------------------------
Each patch file *must* add an entry to the meta table to indicate that it has
been run. This should be the last SQL statement in your patch, and look for
example like this:
# patch identifier
INSERT INTO meta (meta_key, meta_value) VALUES ('patch', 'patch_39_40_a.sql|rationalise_key_columns');
The meta_value is the name of the patch file plus a short string describing the
purpose of the patch, separated by a '|'.
For each patch, the same line should be added to table.sql if it contains this
patch. These entries get cleared from table.sql at the beginning of each
release cycle.
-------------------------------------------------------------------------------
Related files
-------------------------------------------------------------------------------
The patch script:
ensembl/misc-scripts/schema_patcher.pl
Schema definition and patch files:
ensembl/sql/table.sql
ensembl/sql/patch*.sql
ensembl-variation/sql/table.sql
ensembl-variation/sql/patch*.sql
ensembl-functgenomics/sql/table.sql
ensembl-functgenomics/sql/patch*.sql
#!/usr/bin/env perl
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
# Load ApiVersion explicitly: software_version() is invoked on it below as
# a class method, and relying on one of the other Bio::EnsEMBL modules to
# have pulled it in transitively is fragile.
use Bio::EnsEMBL::ApiVersion;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::AltAlleleGroup;
use Getopt::Long qw(:config pass_through);
# (make sure the api version is correct)
# Usage:
# perl alt_alleles.pl -cpass XXXX > & human_release_63_alt_alleles
#
#
# long way
# perl alt_alleles.pl -vhost ens-staging1 -vport 3306 -vdbname homo_sapiens_vega_63_37 -cdbname homo_sapiens_core_63_37 -chost ens-staging1 -cpass XXXX > & human_release_63_alt_alleles
#
my ($vhost, $vpass, $vport, $vdbname, $vuser, $chost, $cpass, $cport, $cdbname, $cuser);
GetOptions(
    'vuser=s'   => \$vuser,
    'vpass=s'   => \$vpass,
    'vhost=s'   => \$vhost,
    'vport=i'   => \$vport,
    'vdbname=s' => \$vdbname,
    'cuser=s'   => \$cuser,
    'cpass=s'   => \$cpass,
    'chost=s'   => \$chost,
    'cport=i'   => \$cport,
    'cdbname=s' => \$cdbname);
#
# Default the vega and core database names to the current API version
# when they were not supplied on the command line.
#
my $api_version = Bio::EnsEMBL::ApiVersion->software_version();
if(!defined($vdbname)){
  $vdbname = "homo_sapiens_vega_".$api_version."_37";
}
if(!defined($cdbname)){
  $cdbname = "homo_sapiens_core_".$api_version."_37";
}
#
# Connect to the core & vega databases.
#
my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
  -host   => $chost||'ens-staging1',
  -user   => $cuser||'ensadmin',
  -pass   => $cpass,
  -group  => 'core',
  -dbname => $cdbname,
  -port   => $cport
);
my $vega_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
  -host   => $vhost||'ens-staging1',
  -user   => $vuser||'ensadmin',
  -pass   => $vpass,
  -group  => 'vega',
  -dbname => $vdbname,
  -port   => $vport
);
#
# Get ensembl gene ids and vega stable ids from the *core* database
# (OTTG xrefs on Gene objects).
#
my $vega_core_sql = <<'SQL';
select display_label, ensembl_id
from object_xref
join xref using(xref_id)
join external_db using(external_db_id)
where db_name = 'OTTG'
and ensembl_object_type = 'Gene'
SQL
# Sometimes we will see more than one gene associated with an OTTG;
# this happens when an OTTG on the primary assembly has been projected to a patch.
my %vega_to_ensembl_core_gene_id;
$core_dba->dbc->sql_helper()->execute_no_return(-SQL => $vega_core_sql, -CALLBACK => sub {
  my ($row) = @_;
  my ($vega_stable_id, $gene_id) = @{$row};
  $vega_to_ensembl_core_gene_id{$vega_stable_id}{$gene_id} = $gene_id;
});
print "\nFetched ".(scalar(keys %vega_to_ensembl_core_gene_id))." Vega Stable IDs\n";
#
# Get AltAlleleGroups from the vega database and translate their members
# into core gene ids.
#
my $vega_aaga = $vega_dba->get_AltAlleleGroupAdaptor();
my $vega_groups = $vega_aaga->fetch_all();
my $cnt_vega_rows = @{$vega_groups};
print STDERR "Fetched $cnt_vega_rows rows from the vega db alt_allele table\n";
my %no_gene_id;
my @new_groups;
foreach my $group (@{$vega_groups}) {
  my $members = $group->get_all_Genes_types();
  my $new_core_group = Bio::EnsEMBL::AltAlleleGroup->new();
  foreach my $member (@{$members}) {
    my ($vega_gene, $attribs_hash) = @{$member};
    my $vega_stable_id = $vega_gene->stable_id();
    if(exists $vega_to_ensembl_core_gene_id{$vega_stable_id}) {
      # Add each gene in. If we had a 1:m relationship then we copy the
      # attributes already assigned across.
      foreach my $gene_id (keys %{$vega_to_ensembl_core_gene_id{$vega_stable_id}} ) {
        $new_core_group->add_member($gene_id, $attribs_hash);
      }
    }
    else {
      push @{$no_gene_id{$group->dbID()}}, $vega_stable_id;
      print STDERR "no ensembl gene_id found for vega stable id $vega_stable_id in core\n";
    }
  }
  # Only keep groups that gained at least one core member.
  if($new_core_group->size() > 0) {
    push(@new_groups, $new_core_group);
  }
}
#
# Delete the old data.
#
print STDERR "\n\nDeleting all alt_alleles...\n\n";
$core_dba->dbc->do("delete from alt_allele");
$core_dba->dbc->do("delete from alt_allele_attrib");
$core_dba->dbc->do("delete from alt_allele_group");
#
# Store the new alt allele groups in the core database.
#
print STDERR "Storing new alt alleles...\n\n";
my $alt_allele_count = 0;
my $gene_count = 0;
my $core_aaga = $core_dba->get_AltAlleleGroupAdaptor();
foreach my $group (@new_groups) {
  my $alt_allele_id = $core_aaga->store($group);
  $alt_allele_count++;
  $gene_count += $group->size();
}
print "Added $alt_allele_count alt_allele ids for $gene_count genes\nDONE\n";