Skip to content
Commits on Source (8)
......@@ -7,18 +7,27 @@ environment:
matrix:
- PY_MAJOR_VER: 2
PYTHON_ARCH: "x86"
- PY_MAJOR_VER: 3
- PY_MAJOR_VER: 2
PYTHON_ARCH: "x86_64"
- PY_MAJOR_VER: 3
PYTHON_ARCH: "x86"
- PY_MAJOR_VER: 3
PYTHON_ARCH: "x86_64"
matrix:
fast_finish: true
# Enable MySQL and PostgreSQL for BioSQL tests
services:
- mysql
- postgresql
- mysql
before_build:
- SET PGUSER=postgres
- SET PGPASSWORD=Password12!
- SET PATH=C:\Program Files\PostgreSQL\9.6\bin\;%PATH%
- psql -U postgres -c "CREATE USER biosql_user WITH PASSWORD 'biosql_pass';"
- psql -U postgres -c "CREATE DATABASE biosql_test OWNER biosql_user;"
build_script:
# If there's a newer build queued for the same PR, cancel this one
......@@ -33,7 +42,7 @@ build_script:
- conda update conda
- conda install setuptools numpy mysql-connector-python psycopg2 matplotlib networkx reportlab scipy coverage
# unittest2 is only needed on Python 2. The matrix variable must be expanded
# with %...% or cmd compares the literal text and the install never runs.
- if "%PY_MAJOR_VER%"=="2" conda install unittest2
- pip install . -vvv
- python setup.py build
test_script:
- python -c "import sys; print(sys.version)"
......
# See https://help.github.com/articles/about-codeowners/
# and https://github.com/blog/2392-introducing-code-owners
#
# Lines starting with '#' are comments.
# Each line is a file pattern followed by one or more owners.
#
# These owners will be the default owners for everything in the repo.
# * @defunkt
#
# Order is important. The last matching pattern has the most precedence.
# So if a pull request only touches javascript files, only these owners
# will be requested to review.
# *.js @octocat @github/js
#
# You can also use email addresses if you prefer.
# docs/* docs@example.com
#
Bio/Alphabet @peterjc
Bio/Align/* @peterjc
Bio/AlignIO/* @peterjc
Tests/test_AlignIO* @peterjc
Bio/bgzf.py @peterjc
Tests/test_bgzf*.py @peterjc
Bio/Cluster/* @mdehoon
Tests/test_Cluster*.py @mdehoon
Bio/codonalign/* @zruan
Tests/test_codonalign*.py @zruan
Bio/Entrez/* @mdehoon
Tests/test_Entrez*.py @mdehoon
Bio/GenBank/* @peterjc
Tests/test_GenBank*.py @peterjc
Tests/test_EMBL*.py @peterjc
Bio/Graphics/GenomeDiagram/* @widdowquinn @peterjc
Tests/test_GenomeDiagram*.py @widdowquinn @peterjc
Bio/motifs/* @mdehoon
Tests/test_motifs*.py @mdehoon
Bio/PDB/* @joaorodrigues @lennax
Tests/test_PDB*.py @joaorodrigues @lennax
Bio/Phylo/* @etal
Tests/test_Phylo*.py @etal
Bio/PopGen/* @tiagoantao
Tests/test_PopGen*.py @tiagoantao
Bio/SearchIO/* @bow
Tests/test_SearchIO*.py @bow
Bio/Seq*.py @peterjc
Bio/SeqIO/* @peterjc
Tests/test_Seq* @peterjc
Tests/test_seq* @peterjc
Bio/SeqIO/AbiIO.py @peterjc @bow
Tests/test_SeqIO_AbiIO.py @peterjc @bow
Bio/TogoWS/* @peterjc
Tests/test_TogoWS*.py @peterjc
### Setup
I am reporting a problem with Biopython version, Python version, and operating
system as follows:
```python
import sys; print(sys.version)
import platform; print(platform.python_implementation()); print(platform.platform())
import Bio; print(Bio.__version__)
```
(*Please copy and run the above in your Python, and copy-and-paste the output*)
### Expected behaviour
(*Please fill this in*)
### Actual behaviour
(*Please fill this in, and provide any exception message in full*)
### Steps to reproduce
(*Please fill this in*)
This pull request addresses issue #...
I hereby agree to dual licence this and any previous contributions under both
the _Biopython License Agreement_ **AND** the _BSD 3-Clause License_.
I have read the ``CONTRIBUTING.rst`` file and understand that AppVeyor and
TravisCI will be used to confirm the Biopython unit tests and ``flake8`` style
checks pass with these changes.
I am happy to be thanked by name in the ``NEWS.rst`` and ``CONTRIB.rst`` files,
and have added myself to those files as part of this pull request. (*This
acknowledgement is optional. Note we list the names sorted alphabetically.*)
......@@ -5,6 +5,7 @@ build
dist
#Ignore backup files from some Unix editors,
\#*.py\#
*~
*.swp
*.bak
......@@ -94,3 +95,6 @@ Doc/*/*/hevea.sty
#Ignore IntelliJ IDEA directory and project files
.idea
*.iml
#Ignore unittest cache directory
.cache/
......@@ -61,8 +61,8 @@ deps =
{py27}: mysql-python
{py27,py36}: mmtf-python
{py27,py35}: reportlab
{py27,py35}: psycopg2
{py27,py35,pypy}: mysql-connector-python-rf
{py27,py34,py35,py36}: psycopg2-binary
{py27,py34,py35,py35}: mysql-connector-python-rf
{py27,py35,pypy}: rdflib
{pypy,pypy3}: numpy==1.12.1
{py27,py34,py35,py36}: numpy
......@@ -91,6 +91,7 @@ deps =
flake8
flake8-docstrings
flake8-blind-except
flake8-rst-docstrings
restructuredtext_lint
commands =
# These folders each have their own .flake8 file:
......@@ -108,6 +109,10 @@ commands =
bash -c \'grep "1999-`date +'%Y'`" LICENSE.rst\'
# Check no __docformat__ lines
bash -c "if grep --include '*.py' -rn '^__docformat__ ' Bio BioSQL Tests Scripts Doc ; then echo 'Remove __docformat__ line(s), we assume restructuredtext.'; false; fi"
# Check DOI link style, see https://www.crossref.org/display-guidelines/
bash -c "if grep --include '*.py' --include '*.rst' --include '*.tex' -rni 'doi:' Bio BioSQL Scripts Doc ; then echo 'Please use https://doi.org/... not the doi: or DOI: style.'; false; fi"
bash -c "if grep --include '*.py' --include '*.rst' --include '*.tex' -rn 'dx\.doi\.org' Bio BioSQL Tests Scripts Doc ; then echo 'Please use https://doi.org/... not the dx.doi.org style.'; false; fi"
bash -c "if grep --include '*.py' --include '*.rst' --include '*.tex' -rn 'http://doi\.org' Bio BioSQL Tests Scripts Doc ; then echo 'Please use https://doi.org/... not http://doi.org/...'; false; fi"
[testenv:sdist]
# This does not need to install Biopython or any of its dependencies
......
......@@ -13,24 +13,27 @@
# - basics - quick things like style and packaging
# - test - the actual functional tests which are slow
dist: trusty
language: python
cache: pip
matrix:
include:
- stage: basics
python: 2.7
env: TOXENV=style
addons:
apt:
packages:
before_install: echo "Going to run basic checks"
- stage: basics
env: TOXENV=sdist
python: 3.6
env: TOXENV=style
addons:
apt:
packages:
before_install: echo "Going to run basic checks"
- stage: basics
env: TOXENV=bdist_wheel
env: TOXENV=sdist,bdist_wheel
addons:
apt:
packages:
......@@ -76,45 +79,68 @@ addons:
#
# There is no GenePop Ubuntu package, although it is in BioConda.
#
# There are TravisCI provided versions of PyPy and PyPy3, but currently too old.
# We therefore deactivate that, and download and unzip portable PyPy binaries.
#
# We also need DSSP for testing but it is not available in the repositories.
# Try to download the binary for Linux and place it in $HOME/bin
#
# This before_install list is only used for the test stage.
before_install:
- cd $HOME
- mkdir bin
- pushd $HOME
- mkdir -p bin
- export PATH=$HOME/bin:$PATH
- echo "Installing PhyML"
- curl -L -O http://www.atgc-montpellier.fr/download/binaries/phyml/PhyML-3.1.zip
- unzip PhyML-3.1.zip
- mv PhyML-3.1/PhyML-3.1_linux64 bin/phyml
- cd $HOME
- "if [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]]; then deactivate && wget https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.7.1-linux_x86_64-portable.tar.bz2 && tar -jxvf pypy-5.7.1-linux_x86_64-portable.tar.bz2 && echo 'Setting up aliases...' && cd pypy-5.7.1-linux_x86_64-portable/bin/ && export PATH=$PWD:$PATH && ln -s pypy python && echo 'Setting up pip...' && ./pypy -m ensurepip ; fi"
- "if [[ $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then deactivate && wget https://bitbucket.org/squeaky/portable-pypy/downloads/pypy3.5-5.8-beta-linux_x86_64-portable.tar.bz2 && tar -jxvf pypy3.5-5.8-beta-linux_x86_64-portable.tar.bz2 && echo 'Setting up aliases...' && cd pypy3.5-5.8-beta-linux_x86_64-portable/bin/ && export PATH=$PWD:$PATH && ln -s pypy3 python && echo 'Setting up pip...' && ./pypy3 -m ensurepip && ln -s pip3 pip ; fi"
- cd $HOME
- echo "Installing dssp"
- curl -L -O ftp://ftp.cmbi.ru.nl/pub/software/dssp/dssp-2.0.4-linux-amd64
- mv dssp-2.0.4-linux-amd64 bin/dssp
- chmod a+x bin/dssp
#- echo "Installing dssp"
#- curl -L -O ftp://ftp.cmbi.ru.nl/pub/software/dssp/dssp-2.0.4-linux-amd64
#- mv dssp-2.0.4-linux-amd64 bin/dssp
#- chmod a+x bin/dssp
- echo "Installing Genepop"
- curl -L -O https://anaconda.org/bioconda/genepop/4.5.1/download/linux-64/genepop-4.5.1-0.tar.bz2
# This will create ./bin/Genepop and a harmless ./info/ folder.
- tar -jxvf genepop-4.5.1-0.tar.bz2
- cd $TRAVIS_BUILD_DIR
- "cp Tests/biosql.ini.sample Tests/biosql.ini"
# There are TravisCI provided versions of PyPy and PyPy3, but currently too old.
# We therefore deactivate that, and download and unzip portable PyPy binaries.
- |
if [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]]; then
deactivate
wget https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-5.10.0-linux_x86_64-portable.tar.bz2
tar -jxvf pypy-5.10.0-linux_x86_64-portable.tar.bz2
echo 'Setting up aliases...'
cd pypy-5.10.0-linux_x86_64-portable/bin/
export PATH=$PWD:$PATH
ln -s pypy python
echo 'Setting up pip...'
./pypy -m ensurepip
fi
- |
if [[ $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then
deactivate
wget https://bitbucket.org/squeaky/portable-pypy/downloads/pypy3.5-5.10.1-linux_x86_64-portable.tar.bz2
tar -jxvf pypy3.5-5.10.1-linux_x86_64-portable.tar.bz2
echo 'Setting up aliases...'
cd pypy3.5-5.10.1-linux_x86_64-portable/bin/
export PATH=$PWD:$PATH
ln -s pypy3 python
echo 'Setting up pip...'
./pypy3 -m ensurepip
ln -s pip3 pip
fi
- popd
- cp Tests/biosql.ini.sample Tests/biosql.ini
- psql -c "create database biosql_test;" -U postgres
- psql -c "create user biosql_user with encrypted password 'biosql_pass';" -U postgres
- psql -c "grant all privileges on database biosql_test to biosql_user;" -U postgres
# This is minimal and used under all stages
install:
- "pip install --upgrade pip setuptools"
- "pip install tox"
- "tox -c .travis-tox.ini -e $TOXENV --notest"
- pip install --upgrade pip setuptools
- pip install tox
- tox -c .travis-tox.ini -e $TOXENV --notest
script:
- "tox -c .travis-tox.ini -e $TOXENV"
- travis_wait tox -c .travis-tox.ini -e $TOXENV
notifications:
email: false
......@@ -7,7 +7,18 @@ ignore =
# =======================
# pycodestyle v2.3.1 default ignore is E121,E123,E126,E226,E24,E704,W503
# flake8 v3.3.0 default ignore is E121,E123,E126,E226,E24,E704,W503,W504
E122,E123,E126,E127,E128,E501,E731,F401,F812,F841,
#
# These are ignored by default:
E122,E123,E126,W503,
# These are not ignored by default:
# E127 continuation line over-indented for visual indent
# E128 continuation line under-indented for visual indent
# E501 line too long (XX > 79 characters)
# E731 do not assign a lambda expression, use a def
# F401 module imported but unused
# F841 local variable name is assigned to but never used
# TODO: Fix some of these?
E127,E128,E501,E731,F401,F841,
# =====================================
# pydocstyle: D1## - Missing Docstrings
# =====================================
......@@ -15,41 +26,16 @@ ignore =
# D101 Missing docstring in public class
# D102 Missing docstring in public method
# D103 Missing docstring in public function
# D104 Missing docstring in public package
# D105 Missing docstring in magic method
# TODO: Fix some of these?
D100,D101,D102,D103,D104,D105,
D100,D101,D102,D103,D105,
# ====================================
# pydocstyle: D2## - Whitespace Issues
# ====================================
# D200 One-line docstring should fit on one line with quotes
# D202 No blank lines allowed after function docstring
# D203 1 blank line required before class docstring
# D204 1 blank line required after class docstring
# D205 1 blank line required between summary line and description
# D207 Docstring is under-indented
# D208 Docstring is over-indented
# D210 No whitespaces allowed surrounding docstring text
# TODO: Fix these:
D205,
# We ignore D203 deliberately in favour of passing D211,
# D211 No blank lines allowed before class docstring
# We ignore D203 deliberately in favour of passing D211,
D203,
# ================================
# pydocstyle: D3## - Quotes Issues
# ================================
# D300 Use """triple double quotes"""
# D301 Use r""" if any backslashes in a docstring
# TODO: Fix this?:
D301,
# ===========================================
# pydocstyle: D4## - Docstring Content Issues
# ===========================================
# D400 First line should end with a period
# D401 First line should be in imperative mood
# D402 First line should not be the function’s "signature"
# D403 First word of the first line should be properly capitalized
# D412 No blank lines allowed between a section header and its content
D400,D401,D402,D403,
# ================================================
# flake8-commas: C#### (in case installed locally)
# ================================================
......
......@@ -19,7 +19,10 @@ except ImportError:
class ParserError(ValueError):
"""Affymetrix parser error."""
def __init__(self, *args):
"""Initialise class."""
super(ParserError, self).__init__(*args)
......@@ -33,7 +36,7 @@ _modeError = ParserError("You're trying to open an Affymetrix v4"
class Record(object):
"""Stores the information in a cel file
"""Stores the information in a cel file.
Example usage:
......@@ -65,6 +68,7 @@ class Record(object):
"""
def __init__(self):
"""Initialize class."""
self.version = None
self.GridCornerUL = None
self.GridCornerUR = None
......@@ -87,7 +91,7 @@ class Record(object):
def read(handle):
"""Reads Affymetrix CEL file and returns Record object.
"""Read Affymetrix CEL file and return Record object.
CEL files version 3 and 4 are supported, and the parser attempts version detection.
......@@ -99,6 +103,7 @@ def read(handle):
...
>>> c.version == 4
True
"""
# If we fail to read the magic number, then it will remain None, and thus
# we will invoke read_v3 (if mode is not strict), or raise IOError if mode
......@@ -135,8 +140,7 @@ def read(handle):
# read Affymetrix files version 4.
def read_v4(f):
"""Reads Affymetrix CEL file, version 4, and returns a corresponding Record
object.
"""Read version 4 Affymetrix CEL file, return corresponding Record object.
Most importantly record.intensities correspond to intensities from the CEL
file.
......@@ -151,8 +155,9 @@ def read_v4(f):
...
>>> c.version == 4
True
>>> print(c.intensities.shape)
(5, 5)
>>> print("%i by %i array" % c.intensities.shape)
5 by 5 array
"""
# We follow the documentation here:
# http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx
......@@ -278,7 +283,7 @@ def read_v4(f):
def read_v3(handle):
"""Reads Affymetrix CEL file, version 3, and returns a corresponding Record object.
"""Read version 3 Affymetrix CEL file, and return corresponding Record object.
Example Usage:
......@@ -288,6 +293,7 @@ def read_v3(handle):
...
>>> c.version == 3
True
"""
# Needs error handling.
# Needs to know the chip design.
......
......@@ -20,9 +20,20 @@ class ClustalOmegaCommandline(AbstractCommandline):
http://www.clustal.org/omega
Example:
--------
Notes
-----
Last checked against version: 1.2.0
References
----------
Sievers F, Wilm A, Dineen DG, Gibson TJ, Karplus K, Li W, Lopez R,
McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG (2011).
Fast, scalable generation of high-quality protein multiple
sequence alignments using Clustal Omega.
Molecular Systems Biology 7:539 https://doi.org/10.1038/msb.2011.75
Examples
--------
>>> from Bio.Align.Applications import ClustalOmegaCommandline
>>> in_file = "unaligned.fasta"
>>> out_file = "aligned.fasta"
......@@ -30,23 +41,13 @@ class ClustalOmegaCommandline(AbstractCommandline):
>>> print(clustalomega_cline)
clustalo -i unaligned.fasta -o aligned.fasta --auto -v
You would typically run the command line with clustalomega_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Citation:
---------
Sievers F, Wilm A, Dineen DG, Gibson TJ, Karplus K, Li W, Lopez R,
McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG (2011).
Fast, scalable generation of high-quality protein multiple
sequence alignments using Clustal Omega.
Molecular Systems Biology 7:539 doi:10.1038/msb.2011.75
Last checked against versions: 1.2.0
"""
def __init__(self, cmd="clustalo", **kwargs):
"""Initialize the class."""
# order parameters in the same order as clustalo --help
self.parameters = \
[
......
......@@ -15,9 +15,19 @@ class ClustalwCommandline(AbstractCommandline):
http://www.clustal.org/
Example:
--------
Notes
-----
Last checked against versions: 1.83 and 2.1
References
----------
Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,
McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD,
Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0.
Bioinformatics, 23, 2947-2948.
Examples
--------
>>> from Bio.Align.Applications import ClustalwCommandline
>>> in_file = "unaligned.fasta"
>>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file)
......@@ -27,21 +37,12 @@ class ClustalwCommandline(AbstractCommandline):
You would typically run the command line with clustalw_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Citation:
---------
Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,
McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD,
Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0.
Bioinformatics, 23, 2947-2948.
Last checked against versions: 1.83 and 2.1
"""
# TODO - Should we default to cmd="clustalw2" now?
def __init__(self, cmd="clustalw", **kwargs):
self.parameters = \
[
"""Initialize the class."""
self.parameters = [
_Option(["-infile", "-INFILE", "INFILE", "infile"],
"Input sequences.",
filename=True),
......@@ -127,20 +128,20 @@ class ClustalwCommandline(AbstractCommandline):
# ***Fast Pairwise Alignments:***
_Option(["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"],
"Word size",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"],
"Number of best diags.",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-window", "-WINDOW", "WINDOW", "window"],
"Window around best diags.",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"],
"Gap penalty",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-score", "-SCORE", "SCORE", "score"],
"Either: PERCENT or ABSOLUTE",
checker_function=lambda x: x in ["percent", "PERCENT",
......@@ -148,26 +149,26 @@ class ClustalwCommandline(AbstractCommandline):
# ***Slow Pairwise Alignments:***
_Option(["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"],
"Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
checker_function=lambda x: x in ["BLOSUM", "PAM",
"GONNET", "ID",
"blosum", "pam",
"gonnet", "id"] or
os.path.exists(x),
checker_function=lambda x: (x in ["BLOSUM", "PAM",
"GONNET", "ID",
"blosum", "pam",
"gonnet", "id"]
or os.path.exists(x)),
filename=True),
_Option(["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"],
"DNA weight matrix=IUB, CLUSTALW or filename",
checker_function=lambda x: x in ["IUB", "CLUSTALW",
"iub", "clustalw"] or
os.path.exists(x),
checker_function=lambda x: (x in ["IUB", "CLUSTALW",
"iub", "clustalw"]
or os.path.exists(x)),
filename=True),
_Option(["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"],
"Gap opening penalty",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"],
"Gap extension penalty",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
# ***Multiple Alignments:***
_Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"],
"Output file name for newly created guide tree",
......@@ -178,32 +179,32 @@ class ClustalwCommandline(AbstractCommandline):
filename=True),
_Option(["-matrix", "-MATRIX", "MATRIX", "matrix"],
"Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
checker_function=lambda x: x in ["BLOSUM", "PAM",
"GONNET", "ID",
"blosum", "pam",
"gonnet", "id"] or
os.path.exists(x),
checker_function=lambda x: (x in ["BLOSUM", "PAM",
"GONNET", "ID",
"blosum", "pam",
"gonnet", "id"]
or os.path.exists(x)),
filename=True),
_Option(["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"],
"DNA weight matrix=IUB, CLUSTALW or filename",
checker_function=lambda x: x in ["IUB", "CLUSTALW",
"iub", "clustalw"] or
os.path.exists(x),
checker_function=lambda x: (x in ["IUB", "CLUSTALW",
"iub", "clustalw"]
or os.path.exists(x)),
filename=True),
_Option(["-gapopen", "-GAPOPEN", "GAPOPEN", "gapopen"],
"Gap opening penalty",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-gapext", "-GAPEXT", "GAPEXT", "gapext"],
"Gap extension penalty",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Switch(["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"],
"No end gap separation pen."),
_Option(["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"],
"Gap separation pen. range",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Switch(["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"],
"Residue-specific gaps off"),
_Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"],
......@@ -212,8 +213,8 @@ class ClustalwCommandline(AbstractCommandline):
"List hydrophilic res."),
_Option(["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"],
"% ident. for delay",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
# Already handled in General Settings section, but appears a second
# time under Multiple Alignments in the help
# _Option(["-type", "-TYPE", "TYPE", "type"],
......@@ -222,8 +223,8 @@ class ClustalwCommandline(AbstractCommandline):
# "protein", "dna"]),
_Option(["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"],
"Transitions weighting",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-iteration", "-ITERATION", "ITERATION", "iteration"],
"NONE or TREE or ALIGNMENT",
checker_function=lambda x: x in ["NONE", "TREE",
......@@ -277,20 +278,20 @@ class ClustalwCommandline(AbstractCommandline):
"both", "none"]),
_Option(["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"],
"Gap penalty for helix core residues",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"],
"gap penalty for strand core residues",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"],
"Gap penalty for loop regions",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"],
"Gap penalty for structure termini",
checker_function=lambda x: isinstance(x, int) or
isinstance(x, float)),
checker_function=lambda x: (isinstance(x, int) or
isinstance(x, float))),
_Option(["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"],
"Number of residues inside helix to be treated as terminal",
checker_function=lambda x: isinstance(x, int)),
......
......@@ -15,9 +15,17 @@ class DialignCommandline(AbstractCommandline):
http://bibiserv.techfak.uni-bielefeld.de/dialign/welcome.html
Example:
--------
Notes
-----
Last checked against version: 2.2
References
----------
B. Morgenstern (2004). DIALIGN: Multiple DNA and Protein Sequence
Alignment at BiBiServ. Nucleic Acids Research 32, W33-W36.
Examples
--------
To align a FASTA file (unaligned.fasta) with the output files names
aligned.* including a FASTA output file (aligned.fa), use:
......@@ -30,16 +38,10 @@ class DialignCommandline(AbstractCommandline):
You would typically run the command line with dialign_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Citation:
---------
B. Morgenstern (2004). DIALIGN: Multiple DNA and Protein Sequence
Alignment at BiBiServ. Nucleic Acids Research 32, W33-W36.
Last checked against version: 2.2
"""
def __init__(self, cmd="dialign2-2", **kwargs):
"""Initialize the class."""
self.program_name = cmd
self.parameters = \
[
......
......@@ -16,9 +16,18 @@ class MSAProbsCommandline(AbstractCommandline):
http://msaprobs.sourceforge.net
Example:
--------
Notes
-----
Last checked against version: 0.9.7
References
----------
Yongchao Liu, Bertil Schmidt, Douglas L. Maskell: "MSAProbs: multiple
sequence alignment based on pair hidden Markov models and partition
function posterior probabilities". Bioinformatics, 2010, 26(16): 1958 -1964
Examples
--------
>>> from Bio.Align.Applications import MSAProbsCommandline
>>> in_file = "unaligned.fasta"
>>> out_file = "aligned.cla"
......@@ -29,17 +38,10 @@ class MSAProbsCommandline(AbstractCommandline):
You would typically run the command line with cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Citation:
---------
Yongchao Liu, Bertil Schmidt, Douglas L. Maskell: "MSAProbs: multiple
sequence alignment based on pair hidden Markov models and partition
function posterior probabilities". Bioinformatics, 2010, 26(16): 1958 -1964
Last checked against version: 0.9.7
"""
def __init__(self, cmd="msaprobs", **kwargs):
"""Initialize the class."""
# order of parameters is the same as in msaprobs -help
self.parameters = \
[
......
......@@ -6,8 +6,6 @@
from __future__ import print_function
import os
from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
......@@ -16,9 +14,33 @@ class MafftCommandline(AbstractCommandline):
http://align.bmr.kyushu-u.ac.jp/mafft/software/
Example:
--------
Notes
-----
Last checked against version: MAFFT v6.717b (2009/12/03)
References
----------
Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
multiple ncRNA alignment by incorporating structural information into
a MAFFT-based framework (describes RNA structural alignment methods)
Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
developments in the MAFFT multiple sequence alignment program
(outlines version 6)
Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
algorithm to build an approximate tree from a large number of
unaligned sequences (describes the PartTree algorithm)
Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
version 5: improvement in accuracy of multiple sequence alignment
(describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
strategies)
Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
Examples
--------
>>> from Bio.Align.Applications import MafftCommandline
>>> mafft_exe = "/opt/local/mafft"
>>> in_file = "../Doc/examples/opuntia.fasta"
......@@ -37,6 +59,7 @@ class MafftCommandline(AbstractCommandline):
You would typically run the command line with mafft_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Note that MAFFT will write the alignment to stdout, which you may
want to save to a file and then parse, e.g.::
......@@ -54,32 +77,10 @@ class MafftCommandline(AbstractCommandline):
from Bio import AlignIO
align = AlignIO.read(StringIO(stdout), "fasta")
Citations:
----------
Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
multiple ncRNA alignment by incorporating structural information into
a MAFFT-based framework (describes RNA structural alignment methods)
Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
developments in the MAFFT multiple sequence alignment program
(outlines version 6)
Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
algorithm to build an approximate tree from a large number of
unaligned sequences (describes the PartTree algorithm)
Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
version 5: improvement in accuracy of multiple sequence alignment
(describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
strategies)
Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
Last checked against version: MAFFT v6.717b (2009/12/03)
"""
def __init__(self, cmd="mafft", **kwargs):
"""Initialize the class."""
BLOSUM_MATRICES = ["30", "45", "62", "80"]
self.parameters = \
[
......
......@@ -14,9 +14,20 @@ class MuscleCommandline(AbstractCommandline):
http://www.drive5.com/muscle/
Example:
--------
Notes
-----
Last checked against version: 3.7, briefly against 3.8
References
----------
Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.
Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
reduced time and space complexity. BMC Bioinformatics 5(1): 113.
Examples
--------
>>> from Bio.Align.Applications import MuscleCommandline
>>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe"
>>> in_file = r"C:\My Documents\unaligned.fasta"
......@@ -28,19 +39,10 @@ class MuscleCommandline(AbstractCommandline):
You would typically run the command line with muscle_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Citations:
----------
Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.
Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
reduced time and space complexity. BMC Bioinformatics 5(1): 113.
Last checked against version: 3.7, briefly against 3.8
"""
def __init__(self, cmd="muscle", **kwargs):
"""Initialize the class."""
CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4",
"kbit20_3", "kmer4_6"]
......
......@@ -15,9 +15,22 @@ class PrankCommandline(AbstractCommandline):
http://www.ebi.ac.uk/goldman-srv/prank/prank/
Example:
--------
Notes
-----
Last checked against version: 081202
References
----------
Loytynoja, A. and Goldman, N. 2005. An algorithm for progressive
multiple alignment of sequences with insertions. Proceedings of
the National Academy of Sciences, 102: 10557--10562.
Loytynoja, A. and Goldman, N. 2008. Phylogeny-aware gap placement
prevents errors in sequence alignment and evolutionary analysis.
Science, 320: 1632.
Examples
--------
To align a FASTA file (unaligned.fasta) with the output in aligned
FASTA format with the output filename starting with "aligned" (you
can't pick the filename explicitly), no tree output and no XML output,
......@@ -34,21 +47,10 @@ class PrankCommandline(AbstractCommandline):
You would typically run the command line with prank_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Citations:
----------
Loytynoja, A. and Goldman, N. 2005. An algorithm for progressive
multiple alignment of sequences with insertions. Proceedings of
the National Academy of Sciences, 102: 10557--10562.
Loytynoja, A. and Goldman, N. 2008. Phylogeny-aware gap placement
prevents errors in sequence alignment and evolutionary analysis.
Science, 320: 1632.
Last checked against version: 081202
"""
def __init__(self, cmd="prank", **kwargs):
"""Initialize the class."""
OUTPUT_FORMAT_VALUES = list(range(1, 18))
self.parameters = [
# ################# input/output parameters: ##################
......@@ -160,14 +162,14 @@ class PrankCommandline(AbstractCommandline):
# Doesn't specify type but Float and Int work
_Option(["-matresize", "matresize"],
"Matrix resizing multiplier",
checker_function=lambda x: isinstance(x, float) or
isinstance(x, int)),
checker_function=lambda x: (isinstance(x, float) or
isinstance(x, int))),
# -matinitsize=# [matrix initial size multiplier]
# Doesn't specify type but Float and Int work
_Option(["-matinitsize", "matinitsize"],
"Matrix initial size multiplier",
checker_function=lambda x: isinstance(x, float) or
isinstance(x, int)),
checker_function=lambda x: (isinstance(x, float) or
isinstance(x, int))),
_Switch(["-longseq", "longseq"],
"Save space in pairwise alignments"),
_Switch(["-pwgenomic", "pwgenomic"],
......
......@@ -15,9 +15,18 @@ class ProbconsCommandline(AbstractCommandline):
http://probcons.stanford.edu/
Example:
--------
Notes
-----
Last checked against version: 1.12
References
----------
Do, C.B., Mahabhashyam, M.S.P., Brudno, M., and Batzoglou, S. 2005.
PROBCONS: Probabilistic Consistency-based Multiple Sequence Alignment.
Genome Research 15: 330-340.
Examples
--------
To align a FASTA file (unaligned.fasta) with the output in ClustalW
format, and otherwise default settings, use:
......@@ -29,6 +38,7 @@ class ProbconsCommandline(AbstractCommandline):
You would typically run the command line with probcons_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Note that PROBCONS will write the alignment to stdout, which you may
want to save to a file and then parse, e.g.::
......@@ -46,17 +56,10 @@ class ProbconsCommandline(AbstractCommandline):
from Bio import AlignIO
align = AlignIO.read(StringIO(stdout), "clustalw")
Citations:
----------
Do, C.B., Mahabhashyam, M.S.P., Brudno, M., and Batzoglou, S. 2005.
PROBCONS: Probabilistic Consistency-based Multiple Sequence Alignment.
Genome Research 15: 330-340.
Last checked against version: 1.12
"""
def __init__(self, cmd="probcons", **kwargs):
"""Initialize the class."""
self.parameters = \
[
# Note that some options cannot be assigned via properties using the
......
......@@ -18,9 +18,17 @@ class TCoffeeCommandline(AbstractCommandline):
This wrapper implements a VERY limited number of options - if you
would like to help improve it please get in touch.
Example:
--------
Notes
-----
Last checked against: Version_6.92
References
----------
T-Coffee: A novel method for multiple sequence alignments.
Notredame, Higgins, Heringa, JMB,302(205-217) 2000
Examples
--------
To align a FASTA file (unaligned.fasta) with the output in ClustalW
format (file aligned.aln), and otherwise default settings, use:
......@@ -34,18 +42,12 @@ class TCoffeeCommandline(AbstractCommandline):
You would typically run the command line with tcoffee_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
Citation:
---------
T-Coffee: A novel method for multiple sequence alignments.
Notredame, Higgins, Heringa, JMB,302(205-217) 2000
Last checked against: Version_6.92
"""
SEQ_TYPES = ["dna", "protein", "dna_protein"]
def __init__(self, cmd="t_coffee", **kwargs):
"""Initialize the class."""
self.parameters = [
_Option(["-output", "output"],
"""Specify the output type.
......
......@@ -12,7 +12,7 @@ class, used in the Bio.AlignIO module.
from __future__ import print_function
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqRecord import SeqRecord, _RestrictedDict
from Bio import Alphabet
......@@ -103,7 +103,7 @@ class MultipleSeqAlignment(object):
"""
def __init__(self, records, alphabet=None,
annotations=None):
annotations=None, column_annotations=None):
"""Initialize a new MultipleSeqAlignment object.
Arguments:
......@@ -115,6 +115,10 @@ class MultipleSeqAlignment(object):
record alphabets. If omitted, a consensus alphabet is
used.
- annotations - Information about the whole alignment (dictionary).
- column_annotations - Per column annotation (restricted dictionary).
This holds Python sequences (lists, strings, tuples)
whose length matches the number of columns. A typical
use would be a secondary structure consensus string.
You would normally load a MSA from a file using Bio.AlignIO, but you
can do this from a list of SeqRecord objects too:
......@@ -126,7 +130,9 @@ class MultipleSeqAlignment(object):
>>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
>>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
>>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
>>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"})
>>> align = MultipleSeqAlignment([a, b, c],
... annotations={"tool": "demo"},
... column_annotations={"stats": "CCCXCCC"})
>>> print(align)
DNAAlphabet() alignment with 3 rows and 7 columns
AAAACGT Alpha
......@@ -134,6 +140,8 @@ class MultipleSeqAlignment(object):
AAAAGGT Gamma
>>> align.annotations
{'tool': 'demo'}
>>> align.column_annotations
{'stats': 'CCCXCCC'}
NOTE - The older Bio.Align.Generic.Alignment class only accepted a
single argument, an alphabet. This is still supported via a backwards
......@@ -180,6 +188,45 @@ class MultipleSeqAlignment(object):
raise TypeError("annotations argument should be a dict")
self.annotations = annotations
# Annotations about each column of the alignment
if column_annotations is None:
column_annotations = {}
# Handle this via the property set function which will validate it
self.column_annotations = column_annotations
def _set_per_column_annotations(self, value):
if not isinstance(value, dict):
raise TypeError("The per-column-annotations should be a "
"(restricted) dictionary.")
# Turn this into a restricted-dictionary (and check the entries)
if len(self):
# Use the standard method to get the length
expected_length = self.get_alignment_length()
self._per_col_annotations = _RestrictedDict(length=expected_length)
self._per_col_annotations.update(value)
else:
# Bit of a problem case... number of columns is undefined
self._per_col_annotations = None
if value:
raise ValueError("Can't set per-column-annotations without an alignment")
def _get_per_column_annotations(self):
if self._per_col_annotations is None:
# This happens if empty at initialisation
if len(self):
# Use the standard method to get the length
expected_length = self.get_alignment_length()
else:
# Should this raise an exception? Compare SeqRecord behaviour...
expected_length = 0
self._per_col_annotations = _RestrictedDict(length=expected_length)
return self._per_col_annotations
# Expose the per-column annotations as an attribute: reads go through
# _get_per_column_annotations and assignments through
# _set_per_column_annotations, which checks the new value.
column_annotations = property(
    fget=_get_per_column_annotations,
    fset=_set_per_column_annotations,
    doc="""Dictionary of per-column annotation for the alignment.""")
def _str_line(self, record, length=50):
"""Return a truncated string representation of a SeqRecord (PRIVATE).
......@@ -398,7 +445,7 @@ class MultipleSeqAlignment(object):
By default, all sequences have the same weight. (0.0 =>
no weight, 1.0 => highest weight)
In general providing a SeqRecord and calling .append is prefered.
In general providing a SeqRecord and calling .append is preferred.
"""
new_seq = Seq(sequence, self._alphabet)
......@@ -481,6 +528,9 @@ class MultipleSeqAlignment(object):
return
expected_length = len(rec)
self._append(rec, expected_length)
# Can now setup the per-column-annotations as well, set to None
# while missing the length:
self.column_annotations = {}
# Now continue to the rest of the records as usual
for rec in records:
......@@ -573,9 +623,11 @@ class MultipleSeqAlignment(object):
>>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta")
>>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma")
>>> left = MultipleSeqAlignment([a1, b1, c1],
... annotations={"tool": "demo", "name": "start"})
... annotations={"tool": "demo", "name": "start"},
... column_annotations={"stats": "CCCXC"})
>>> right = MultipleSeqAlignment([a2, b2, c2],
... annotations={"tool": "demo", "name": "end"})
... annotations={"tool": "demo", "name": "end"},
... column_annotations={"stats": "CC"})
Now, let's look at these two alignments:
......@@ -621,6 +673,11 @@ class MultipleSeqAlignment(object):
>>> combined.annotations
{'tool': 'demo'}
Similarly any common per-column-annotations are combined:
>>> combined.column_annotations
{'stats': 'CCCXCCC'}
"""
if not isinstance(other, MultipleSeqAlignment):
raise NotImplementedError
......@@ -634,7 +691,11 @@ class MultipleSeqAlignment(object):
for k, v in self.annotations.items():
if k in other.annotations and other.annotations[k] == v:
annotations[k] = v
return MultipleSeqAlignment(merged, alpha, annotations)
column_annotations = dict()
for k, v in self.column_annotations.items():
if k in other.column_annotations:
column_annotations[k] = v + other.column_annotations[k]
return MultipleSeqAlignment(merged, alpha, annotations, column_annotations)
def __getitem__(self, index):
"""Access part of the alignment.
......@@ -754,7 +815,13 @@ class MultipleSeqAlignment(object):
return self._records[index]
elif isinstance(index, slice):
# e.g. sub_align = align[i:j:k]
return MultipleSeqAlignment(self._records[index], self._alphabet)
new = MultipleSeqAlignment(self._records[index], self._alphabet)
if self.column_annotations and len(new) == len(self):
# All rows kept (although could have been reversed)
# Preserve the column annotations too,
for k, v in self.column_annotations.items():
new.column_annotations[k] = v
return new
elif len(index) != 2:
raise TypeError("Invalid index type.")
......@@ -768,8 +835,14 @@ class MultipleSeqAlignment(object):
return "".join(rec[col_index] for rec in self._records[row_index])
else:
# e.g. sub_align = align[1:4, 5:7], gives another alignment
return MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]),
self._alphabet)
new = MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]),
self._alphabet)
if self.column_annotations and len(new) == len(self):
# All rows kept (although could have been reversed)
# Preserve the column annotations too,
for k, v in self.column_annotations.items():
new.column_annotations[k] = v[col_index]
return new
def sort(self, key=None, reverse=False):
"""Sort the rows (SeqRecord objects) of the alignment in place.
......
......@@ -49,6 +49,14 @@ class ClustalWriter(SequentialAlignmentWriter):
if max_length <= 0:
raise ValueError("Non-empty sequences are required")
if "clustal_consensus" in alignment.column_annotations:
star_info = alignment.column_annotations["clustal_consensus"]
elif hasattr(alignment, "_star_info"):
# This was originally stored by Bio.Clustalw as ._star_info
star_info = alignment._star_info
else:
star_info = None
# keep displaying sequences until we reach the end
while cur_char != max_length:
# calculate the number of sequences to show, which will
......@@ -70,10 +78,8 @@ class ClustalWriter(SequentialAlignmentWriter):
output += line + "\n"
# now we need to print out the star info, if we've got it
# This was stored by Bio.Clustalw using a ._star_info property.
if hasattr(alignment, "_star_info") and alignment._star_info != '':
output += (" " * 36) + \
alignment._star_info[cur_char:(cur_char + show_num)] + "\n"
if star_info:
output += (" " * 36) + star_info[cur_char:(cur_char + show_num)] + "\n"
output += "\n"
cur_char += show_num
......@@ -284,5 +290,7 @@ class ClustalIterator(AlignmentIterator):
assert len(consensus) == alignment_length, \
"Alignment length is %i, consensus length is %i, '%s'" \
% (alignment_length, len(consensus), consensus)
alignment.column_annotations["clustal_consensus"] = consensus
# For backward compatibility prior to .column_annotations:
alignment._star_info = consensus
return alignment