Skip to content
Commits on Source (12)
......@@ -40,8 +40,14 @@ build_script:
- SET PATH=C:\Py;C:\Py\Scripts;C:\Py\Library\bin;%PATH%
- conda config --set always_yes yes
- conda update conda
- conda install setuptools numpy mysql-connector-python psycopg2 matplotlib networkx reportlab scipy coverage
- if "PY_MAJOR_VER"=="2" conda install unittest2
# Pinning pillow==5.4.1 as version 6 breaks on current latest reportlab
# https://bitbucket.org/rptlab/reportlab/issues/176/incompatibility-with-pillow-600
- conda install setuptools numpy psycopg2 matplotlib networkx reportlab scipy coverage pillow==5.4.1
# Pinning mysql-connector-python==8.0.13 for Python 3 as 8.0.16 breaks our tests
# https://github.com/biopython/biopython/issues/2120
# We don't install mysql-connector-python for Python 2
- if "%PY_MAJOR_VER%"=="3" conda install mysql-connector-python==8.0.13
- python setup.py build
test_script:
......
# =================================================
# flake8:
# pycodestyle: E### (error), W### (warning)
# pyflake: F### (error)
# pydocstyle: D1## - Missing Docstrings
# D2## - Whitespace Issues
# D4## - Docstring Content issues
# flake8-bugbear: B###
# flake8-quotes: Q###
# flake8-commas: C#### (in case installed locally)
# flake8-black : BLK### (in case installed locally)
# flake8-pie : PIE### (in case installed locally)
# =================================================
[flake8]
doctests = True
# Exclude some file types and folders that shouldn't be checked:
exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.github,build,
ignore =
# =============================================================
# Biopython's 'standard' ignores we can agree to always accept:
# =============================================================
D203, # 1 blank line required before class docstring
# deliberately ignore in favour of passing D211: No blank lines
# allowed before class docstring
W503, # line-break before binary operator
# deliberately ignore (in favour of some day enforcing W504?)
# ===========================================
# Ignores that we have to accept for a while:
# ===========================================
E123, # closing bracket does not match indentation of opening bracket's line
# TODO? (main/Bio/Tests: 3/91/31 occurrences)
E203, # whitespace before ':'
# gives false positives after running black, see
# https://github.com/PyCQA/pycodestyle/issues/373
E501, # line too long
# Maybe we find a sensible limit, e.g. 88 (black) and enforce it
W504, # line break after binary operator (Bio/Tests/Scripts: 225/119/7) TODO?
B007, # Loop control variable not used within the loop body.
# If this is intended, start the name with an underscore
# =========================================
# Optional ignores for local installations:
# =========================================
BLK100, # Black would make changes, only on local installations (so far)
PIE781, # Assigning to temp variable and then returning, not enforcing
# ========================
# Folder specific ignores:
# ========================
per-file-ignores =
Bio/*:E122,E126,F401,F841,D105,B009,B010,B011,C812,C815
Tests/*:F401,F841,D101,D102,D103,B009,B010,B011,C812
# Due to a bug in flake8, we need the following lines for running the
# pre-commit hook. If you made edits above, please change also here!
/Bio/*:E122,E126,F401,F841,D105,B009,B010,B011,C812,C815
/Tests/*:F401,F841,D101,D102,D103,B009,B010,B011,C812
# =============================
# per-file-ignores error codes:
# =============================
#Bio/*:E122 continuation line missing indentation or outdented TODO? (264 occurrences)
# E126 continuation line over-indented for hanging indent TODO? (54 occurrences)
# F401 module imported but unused TODO? (107 occurrences)
# F841 local variable is assigned to but never used TODO? (55 occurrences)
# D105 missing docstring magic method (121 occurrences)
# B009 do not call getattr with a constant attribute value,
# it is not any safer than normal property access
# B010 do not call setattr with a constant attribute value,
# it is not any safer than normal property access
# B011 do not call assert False since python -O removes these calls;
# instead callers should raise AssertionError().
# C812 missing trailing comma
# C815 missing trailing comma in Python 3.5+
#Tests/*:F401 module imported but unused TODO? (88 occurrences)
# F841 local variable is assigned to but never used TODO? (64 occurrences)
# D101 missing docstring in public class (207 occurrences)
# D102 missing docstring in public method (956 occurrences)
# D103 missing docstring in public functions (52 occurrences)
# B009 do not call getattr with a constant attribute value,
# it is not any safer than normal property access
# B010 do not call setattr with a constant attribute value,
# it is not any safer than normal property access
# B011 do not call assert False since python -O removes these calls;
# instead callers should raise AssertionError()
# C812 missing trailing comma
# =======================
# flake8-quotes settings:
# =======================
inline-quotes = double
......@@ -21,3 +21,7 @@ Tests/SubsMat/acc_rep_mat.pik binary
# MMTF is a binary file format,
Tests/PDB/4CUP.mmtf binary
# UCSC Nib files are binary:
Tests/Nib/test_bigendian.nib binary
Tests/Nib/test_littleendian.nib binary
......@@ -19,6 +19,7 @@
Bio/Alphabet @peterjc
Bio/Align/* @peterjc
Bio/Align/_aligners.c @mdehoon
Bio/AlignIO/* @peterjc
Tests/test_AlignIO* @peterjc
......
......@@ -6,9 +6,9 @@ This pull request addresses issue #...
- [ ] I hereby agree to dual licence this and any previous contributions under both
the _Biopython License Agreement_ **AND** the _BSD 3-Clause License_.
- [ ] I have read the ``CONTRIBUTING.rst`` file and understand that AppVeyor and
TravisCI will be used to confirm the Biopython unit tests and ``flake8`` style
checks pass with these changes.
- [ ] I have read the ``CONTRIBUTING.rst`` file, have run ``flake8`` locally, and
understand that AppVeyor and TravisCI will be used to confirm the Biopython unit
tests and style checks pass with these changes.
- [ ] I have added my name to the alphabetical contributors listings in the files
``NEWS.rst`` and ``CONTRIB.rst`` as part of this pull request, am listed
......
#!/bin/bash
# Assumes being called from the Biopython repository's root folder,
# (i.e. a clone of https://github.com/biopython/biopython) as part
# of our continuous integration testing to save the compiled docs
# to https://github.com/biopython/docs
#
# In order to have write permissions, we put a private key into the
# TravisCI settings as a secure environment variable, and put the
# matching public key into the GitHub documentation repository's
# settings as a deploy key with write permissions.
#
# Key creation,
#
# $ ssh-keygen -t rsa -b 4096 -C "biopython documentation deploy key" -f biopython_doc_key -N ""
# Generating public/private rsa key pair.
# Your identification has been saved in biopython_doc_key.
# Your public key has been saved in biopython_doc_key.pub.
# The key fingerprint is:
# SHA256:nFfhbwryDLDz8eDEHa4sjdH0gOgwyXGGDUBGfDi5luQ biopython documentation deploy key
# The key's randomart image is:
# +---[RSA 4096]----+
# |===+o . |
# |.B.*.. . . . |
# |o X . o o . o |
# | E + B * o . |
# |. . + S * o |
# | X @ . o |
# | o * + . |
# | . |
# | |
# +----[SHA256]-----+
#
# Next, we add the public key to https://github.com/biopython/docs as
# a deployment key with write permission,
#
# $ cat biopython_doc_key.pub
# ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDpQ3I6ZpL9cqUpqkHMPALTQg9ya3sL1MVXYjbTnuWnDoRml5UYXVgD8hgOJxwaxDo1BV+fKn68LXPEwlZ5FC6eRSCJz20SvWPkMDhAwChJJ+nE7f/vvK18R3Ge9ksWra8LFSR3EL7joQTN+c1VyaJH22qj1OED3G68Ix+bgvnUpZgeurV8vDV06FVx7H1Q5a5MoTWFdMa9wzJn5o6m7khditOTDKqznFULoOONpw7CsTiJD6drQPk1pwftDxEBMEAG7cKwux/dRWJtzsRQ7IO0d/AhzsqnLJJIgkHzQwmvpGpffWfoomNwF4bWJuWzu6tRcGcX16fLMyGK8kFJaL1zY6gQFkAbfsIdA2G28S79mIC4jT1JtiNYBOV9wIjxyZUyvzSeQGVC7yBafHE5eEb267dgGnDl654XzyIImLSKv/nx8No16UK/e5F+ds3hp0DPTknzeVOGBUEt1k8pEp47J9JVKoeceph0cJbfzFNv9pgOgyaHb1mhs9pI4kIQ3R+ibeAZbPWT709n26Y99Q2MSSZyPuZvX8VBA1NfoENmuTrEn/qqGlvZez3Blh4MIvYg24DFv/rHN92Edk5S7xY0eB7E6D6X/N80ThuBSqxlJpxSQlA+LICcq/EPd37/WT7exiheXysN5oIOvwNgUNNFftDWv2gPBu2bf/foHfAQKQ== biopython documentation deploy key
#
# Finally, we add the private key to TravisCI by going to
# https://travis-ci.org/biopython/biopython/settings or any authorised
# fork like https://travis-ci.org/peterjc/biopython/settings and
# setting DOC_KEY to the following (secret) value:
#
# $ python -c "print(open('biopython_doc_key').read().strip().replace(' ', r'\ ').replace('\n', r'\\\n'))"
# ...
#
# TravisCI requires we escape spaces as '\ ' and newlines as '\\n', and
# we explicitly strip the trailing new line so that we don't get an extra
# one when rebuilding the key later.
#
# Make sure "DISPLAY VALUE IN BUILD LOG" is off (the default).
#
# For testing locally, set local environment $DOC_KEY to this value.
# Thereafter, when ever this script gets run on TravisCI it should
# be able to deploy the HTML documentation to our documentation
# repository (which will display on biopython.org via GitHub pages).
# Abort immediately on any command failure (the bare "false" calls
# below rely on this to act as explicit error exits after the echos):
set -e
if [ -z "$DOC_KEY" ]; then
echo "Missing (secret) environment variable DOC_KEY,"
echo "which should hold the private SSH deployment key."
false
fi
# Now that we know $DOC_KEY is set, it is safe to also treat unset
# variables and pipeline failures as errors for the rest of the script:
set -euo pipefail
DEST_SLUG=biopython/docs
# Could look at $TRAVIS_TAG, e.g. DEST_DIR=${TRAVIS_TAG:-dev}
# However, tags lack the dots in the version number. Since
# Biopython was installed to run Sphinx and build the docs,
# can use this:
DEST_DIR=`python -c "import Bio; v=Bio.__version__; print('dev' if 'dev' in v else v)"`
SOURCE_DIR=${TRAVIS_BUILD_DIR:-$PWD}/Doc/api/_build/html
WORKING_DIR=/tmp/deploy_biopython_docs
if [ -z "$DEST_DIR" ]; then
echo "ERROR: Failed to get Biopython version, is it not installed?"
# Re-run the import un-captured so its traceback appears in the log:
python -c "import Bio; print(Bio.__version__)"
false
fi
DEST_DIR=$DEST_DIR/api
echo "Aiming to deploy $SOURCE_DIR to $DEST_SLUG branch gh-pages as $DEST_DIR"
# On TravisCI, must create the variable using '\ ' and '\n', so
# here we must unescape the whitespace to recover the SSH deploy key:
python -c "import os; print(os.environ['DOC_KEY'].strip().replace(r'\ ', ' ').replace(r'\n', '\n'))" > $HOME/.biopython_doc_deploy.key
# Check we have a sane looking line structure: a PEM private key has
# exactly two lines starting with dashes (the BEGIN and END markers).
if [ `grep -c "^\-\-\-\-\-" $HOME/.biopython_doc_deploy.key` -ne 2 ]; then
echo "ERROR: Failed to rebuild the SSH key,"
# Deliberately only report line count and checksum, never the key itself:
wc -l $HOME/.biopython_doc_deploy.key
md5sum $HOME/.biopython_doc_deploy.key
false
fi
# ssh refuses keys readable by other users, so restrict permissions:
chmod 600 $HOME/.biopython_doc_deploy.key
# GIT_SSH points git at our wrapper script which supplies the deploy key:
export GIT_SSH=${TRAVIS_BUILD_DIR:-$PWD}/.github/ssh_via_deploy_key.sh
if ! [[ -f "$GIT_SSH" ]]; then
echo "Error, set GIT_SSH="$GIT_SSH" but does not exist"
false
elif ! [[ -x "$GIT_SSH" ]]; then
echo "Error, set GIT_SSH="$GIT_SSH" but not executable"
false
fi;
echo "Setting up clone of $DEST_SLUG locally at $WORKING_DIR"
# Clone the destination under /tmp (public URL, no key needed)
rm -rf $WORKING_DIR
git clone https://github.com/$DEST_SLUG.git $WORKING_DIR
pushd $WORKING_DIR
git checkout gh-pages
# Switch the git protocol to SSH based so we can use our key
git remote set-url origin --push git@github.com:$DEST_SLUG.git
popd
echo "Copying $SOURCE_DIR/* to $WORKING_DIR/$DEST_DIR/ next"
if [ ! -d $SOURCE_DIR ]; then
echo "ERROR: Directory $SOURCE_DIR/ does not exist."
false
fi
# Remove any old files
pushd $WORKING_DIR
if [ -d $DEST_DIR ]; then
echo "Removing old files"
git rm -r $DEST_DIR/
fi
mkdir -p $DEST_DIR
echo "Copying files"
cp -R $SOURCE_DIR/* $DEST_DIR/
echo "Staging files in git"
git add $DEST_DIR/
# Only commit and push if the staged tree actually differs, so that
# re-runs on unchanged docs do not create empty commits:
if [[ -z $(git status --porcelain) ]]; then
echo "Nothing has changed, nothing needs pushing."
else
echo "Making commit of new files"
git commit -m "Automated update ${TRAVIS_COMMIT:-}" --author "TravisCI <travisci@example.org>"
echo "Finally, pushing to $DEST_SLUG gh-pages branch"
git push origin gh-pages
echo "Documentation deployed!"
fi
popd
#!/bin/bash
# Wrapper used as $GIT_SSH so that git's SSH transport authenticates
# with our GitHub repository deploy key (set via -i), using
# -F /dev/null to make sure this ignores ~/.ssh/config and
# -p 22 to pin the standard SSH port.
#
# Use "$@" rather than $* so that any argument containing whitespace
# is forwarded to ssh as a single word instead of being re-split.
ssh -i "$HOME/.biopython_doc_deploy.key" -F /dev/null -p 22 "$@"
......@@ -31,6 +31,9 @@ dist
#Ignore all Jython class files (present if using Jython)
*.class
#Ignore any compiled C code
*.so
#Ignore the .tox directory from running tox locally
.tox/
......@@ -47,6 +50,9 @@ Tests/Graphics/*.eps
Tests/Graphics/*.svg
Tests/Graphics/*.png
# This file is downloaded when testing the Bio.SeqIO.GckIO module.
Tests/Gck/DGVC_GCK.zip
#Ignore the local BioSQL test settings:
Tests/biosql.ini
......@@ -62,6 +68,7 @@ Doc/examples/tree1.nwk
#Ignore LaTeX temp files, and compiled output
Doc/*.aux
Doc/*.gz
Doc/*.log
Doc/*.out
Doc/*.toc
......@@ -79,6 +86,7 @@ Doc/*/*/*.out
Doc/*/*/*.toc
Doc/*/*/*.haux
Doc/*/*/*.htoc
Doc/_minted-Tutorial/
Doc/Tutorial.txt
Doc/Tutorial.pdf
Doc/Tutorial.html
......@@ -94,9 +102,19 @@ Doc/hevea.sty
Doc/*/hevea.sty
Doc/*/*/hevea.sty
#Ignore artifacts of building the docs
_build
Doc/api/*.rst
#Ignore IntelliJ IDEA directory and project files
.idea
*.iml
#Ignore unittest cache dirctory
#Ignore unittest cache directory
.cache/
#Ignore vscode directory
.vscode
#Ignore mypy cache directory
.mypy_cache
\ No newline at end of file
# This is a configuration file for tox, used to test
# Biopython on various versions of Python etc under
# the Travis Continous Integration service which is
# the Travis Continuous Integration service which is
# configured in the file .travis.yml
#
# By default tox will look for tox.ini, so this file
......@@ -34,8 +34,8 @@ envlist =
sdist
bdist_wheel
api
{py27,py34,py35,py36,pypy,pypy3}-cover
{py27,py34,py35,py36,pypy,pypy3}-nocov
{py27,py35,py36,py37,py38,pypy,pypy3}-cover
{py27,py35,py36,py37,py38,pypy,pypy3}-nocov
[testenv]
# TODO: Try tox default sdist based install instead:
......@@ -56,21 +56,25 @@ whitelist_externals =
install_command = pip install --only-binary=scipy {opts} {packages}
deps =
#Lines starting xxx: are filtered by the environment.
#Leaving py34 without any soft dependencies (just numpy)
#Leaving py36 without any dependencies (even numpy)
cover: coverage
cover: codecov
py27: unittest2
py27: mysql-python
py27,py36: mmtf-python
py27,py37: mmtf-python
# https://bitbucket.org/rptlab/reportlab/issues/176/incompatibility-with-pillow-600
py27,py35: reportlab
py27,py34,py35,py36: psycopg2-binary
py27,py34,py35,py35: mysql-connector-python-rf
py27,py35: pillow==5.4
py27,py35,py37: psycopg2-binary
py27,py35: mysql-connector-python-rf
py35,py37: mysqlclient
py27,py35,pypy: rdflib
pypy,pypy3: numpy==1.12.1
py27,py34,py36: numpy
py36: scipy
pypy,pypy3: mysqlclient
py27,py35,py37: numpy
py37: scipy
py27: networkx
py36: matplotlib
py37: matplotlib
commands =
#The bash call is a work around for special characters
#The /dev/null is to hide the verbose output but leave warnings
......@@ -95,17 +99,23 @@ deps =
flake8-docstrings
flake8-blind-except
flake8-rst-docstrings
py34,py35,py36: flake8-bugbear
flake8-comprehensions
flake8-bugbear;python_version>="3.5"
flake8-quotes
restructuredtext_lint
doc8
pygments
# doc8 needs docutils, but docutils==0.15 has an import order bug
# https://bugs.launchpad.net/doc8/+bug/1837515
# https://sourceforge.net/p/docutils/bugs/366/
docutils==0.14
# flake8-docstrings uses a function removed in pydocstyle 4.0.0; once a fix
# is released in flake8-docstrings we can remove the following constraint:
pydocstyle<4.0.0
# See https://gitlab.com/pycqa/flake8-docstrings/issues/36
commands =
flake8 --max-line-length 82 setup.py
# These folders each have their own .flake8 file:
flake8 BioSQL/
flake8 Scripts/
flake8 Doc/examples/
flake8 Bio/
flake8 Tests/
# PEP-8 and PEP-257 style checks:
flake8
# Now do various checks on our RST files:
# Calling via bash to get it to expand the wildcard for us
bash -c \'rst-lint --level warning *.rst\'
......@@ -153,7 +163,8 @@ deps =
scipy
sphinx>=1.8.0
numpydoc
pygments
sphinx_rtd_theme
commands =
bash -c \'python setup.py install > /dev/null\'
bash -c \'mkdir -p Doc/api/_templates Doc/api/_static Doc/api/_build\'
make -C Doc/api/ html
......@@ -13,7 +13,10 @@
# - basics - quick things like style and packaging
# - test - the actual functional tests which are slow
dist: trusty
dist: xenial
services:
- mysql
- postgresql
language: python
cache: pip
matrix:
......@@ -21,42 +24,57 @@ matrix:
- stage: basics
python: 2.7
env: TOXENV=style
services:
addons:
apt:
packages:
before_install: echo "Going to run basic checks"
- stage: basics
python: 3.6
python: 3.7
env: TOXENV=style
services:
addons:
apt:
packages:
before_install: echo "Going to run basic checks"
- stage: basics
python: 3.7
env: TOXENV=sdist,bdist_wheel
services:
addons:
apt:
packages:
before_install: echo "Going to run basic checks"
- stage: test
python: 2.7
python: 3.7
env: TOXENV=api
services:
addons:
apt:
packages:
before_install: echo "Going to build API docs"
deploy:
provider: script
script: .github/deploy_docs.sh
skip_cleanup: true
on:
branch: master
- stage: test
python: 2.7
env: TOXENV=py27-cover
- stage: test
python: 3.4
env: TOXENV=py34-cover
- stage: test
python: 3.5
env: TOXENV=py35-cover
- stage: test
python: 3.6
env: TOXENV=py36-cover
- stage: test
python: 3.7
env: TOXENV=py37-cover
- stage: test
# TODO: Change this once a stable Python 3.8 is on TravisCI:
python: 3.8-dev
env: TOXENV=py38-cover
- stage: test
python: pypy
env: TOXENV=pypy-nocov
......@@ -111,33 +129,6 @@ before_install:
# Setup environment for t-coffee
- mkdir -p $HOME/tcoffee_temp
- export HOME_4_TCOFFEE=$HOME/tcoffee_temp
# There are TravisCI provided versions of PyPy and PyPy3, but currently too old.
# We therefore deactivate that, and download and unzip portable PyPy binaries.
- |
if [[ $TRAVIS_PYTHON_VERSION == 'pypy' ]]; then
deactivate
wget https://bitbucket.org/squeaky/portable-pypy/downloads/pypy-6.0.0-linux_x86_64-portable.tar.bz2
tar -jxvf pypy-6.0.0-linux_x86_64-portable.tar.bz2
echo 'Setting up aliases...'
cd pypy-6.0.0-linux_x86_64-portable/bin/
export PATH=$PWD:$PATH
ln -s pypy python
echo 'Setting up pip...'
./pypy -m ensurepip
fi
- |
if [[ $TRAVIS_PYTHON_VERSION == 'pypy3' ]]; then
deactivate
wget https://bitbucket.org/squeaky/portable-pypy/downloads/pypy3.5-6.0.0-linux_x86_64-portable.tar.bz2
tar -jxvf pypy3.5-6.0.0-linux_x86_64-portable.tar.bz2
echo 'Setting up aliases...'
cd pypy3.5-6.0.0-linux_x86_64-portable/bin/
export PATH=$PWD:$PATH
ln -s pypy3 python
echo 'Setting up pip...'
./pypy3 -m ensurepip
ln -s pip3 pip
fi
- popd
- cp Tests/biosql.ini.sample Tests/biosql.ini
- psql -c "create database biosql_test;" -U postgres
......@@ -152,6 +143,7 @@ install:
- tox -c .travis-tox.ini -e $TOXENV --notest
script:
- python --version
- travis_wait 30 tox -c .travis-tox.ini -e $TOXENV
notifications:
......
[flake8]
# Would like to enable this in future...
# doctests = True
ignore =
# =======================
# flake: E###, F###, W###
# =======================
# pycodestyle v2.4.0 default ignore is E121,E123,E126,E226,E24,E704,W503,W504
# flake8 v3.6.0 default ignore is E121,E123,E126,E226,E24,E704,W503,W504
# These are ignored by default:
E122,E123,E126,W503,W504,
# These are not ignored by default:
# E127 continuation line over-indented for visual indent
# E128 continuation line under-indented for visual indent
# E501 line too long (XX > 79 characters)
# E731 do not assign a lambda expression, use a def
# F401 module imported but unused
# F841 local variable name is assigned to but never used
# TODO: Fix some of these?
E501,E731,F401,F841,
# =====================================
# pydocstyle: D1## - Missing Docstrings
# =====================================
# D100 Missing docstring in public module
# D101 Missing docstring in public class
# D102 Missing docstring in public method
# D103 Missing docstring in public function
# D105 Missing docstring in magic method
# TODO: Fix some of these?
D100,D101,D102,D103,D105,
# ====================================
# pydocstyle: D2## - Whitespace Issues
# ====================================
# D203 1 blank line required before class docstring
# D211 No blank lines allowed before class docstring
# We ignore D203 deliberately in favour of passing D211,
D203,
# ====================
# flake8-bugbear: B###
# ====================
# B007 Loop control variable not used within the loop body.
# If this is intended, start the name with an underscore.
B007,
# ================================================
# flake8-commas: C#### (in case installed locally)
# ================================================
# C812 missing trailing comma
# C815 missing trailing comma in Python 3.5+
C812,C815
......@@ -29,7 +29,7 @@ class ParserError(ValueError):
_modeError = ParserError("You're trying to open an Affymetrix v4"
" CEL file. You have to use a read binary mode,"
" like this `open(filename \"rb\")`.")
" like this: open(filename, 'rb')")
# for debugging
# import pprint
......@@ -164,14 +164,14 @@ def read_v4(f):
# http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx
record = Record()
preHeaders = ["magic", "version", "columns", "rows", "cellNo", "headerLen"]
preHeadersMap = dict()
headersMap = dict()
preHeadersMap = {}
headersMap = {}
# load pre-headers
try:
for name in preHeaders:
preHeadersMap[name] = struct.unpack("<i", f.read(4))[0]
except UnicodeDecodeError as e:
except UnicodeDecodeError:
raise _modeError
char = f.read(preHeadersMap["headerLen"])
......@@ -232,7 +232,7 @@ def read_v4(f):
raiseBadHeader("OffsetY", 0)
# This is unfortunately undocumented, but it turns out that real data has
# the `record.AlgorithmParameters` repeated in the data section, until an
# the record.AlgorithmParameters repeated in the data section, until an
# EOF, i.e. b"\x04".
char = b"\x00"
safetyValve = 10**4
......
......@@ -71,7 +71,7 @@ class SummaryInfo(object):
"""
# Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
consensus = ''
consensus = ""
# find the length of the consensus we are creating
con_len = self.alignment.get_alignment_length()
......@@ -86,7 +86,7 @@ class SummaryInfo(object):
# make sure we haven't run past the end of any sequences
# if they are of different lengths
if n < len(record.seq):
if record.seq[n] != '-' and record.seq[n] != '.':
if record.seq[n] != "-" and record.seq[n] != ".":
if record.seq[n] not in atom_dict:
atom_dict[record.seq[n]] = 1
else:
......@@ -132,7 +132,7 @@ class SummaryInfo(object):
"""
# Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
consensus = ''
consensus = ""
# find the length of the consensus we are creating
con_len = self.alignment.get_alignment_length()
......@@ -270,8 +270,8 @@ class SummaryInfo(object):
rep_dict = self._pair_replacement(
self.alignment[rec_num1].seq,
self.alignment[rec_num2].seq,
self.alignment[rec_num1].annotations.get('weight', 1.0),
self.alignment[rec_num2].annotations.get('weight', 1.0),
self.alignment[rec_num1].annotations.get("weight", 1.0),
self.alignment[rec_num2].annotations.get("weight", 1.0),
rep_dict, skip_items)
return rep_dict
......@@ -363,7 +363,7 @@ class SummaryInfo(object):
if isinstance(self.alignment._alphabet, Alphabet.Gapped):
skip_items.append(self.alignment._alphabet.gap_char)
all_letters = all_letters.replace(
self.alignment._alphabet.gap_char, '')
self.alignment._alphabet.gap_char, "")
# now create the dictionary
for first_letter in all_letters:
......@@ -408,7 +408,7 @@ class SummaryInfo(object):
chars_to_ignore.append(self.alignment._alphabet.gap_char)
for char in chars_to_ignore:
all_letters = all_letters.replace(char, '')
all_letters = all_letters.replace(char, "")
if axis_seq:
left_seq = axis_seq
......@@ -429,7 +429,7 @@ class SummaryInfo(object):
this_residue = None
if this_residue and this_residue not in chars_to_ignore:
weight = record.annotations.get('weight', 1.0)
weight = record.annotations.get("weight", 1.0)
try:
score_dict[this_residue] += weight
# if we get a KeyError then we have an alphabet problem
......@@ -521,7 +521,7 @@ class SummaryInfo(object):
# determine all of the letters we have to deal with
all_letters = self._get_all_letters()
for char in chars_to_ignore:
all_letters = all_letters.replace(char, '')
all_letters = all_letters.replace(char, "")
info_content = {}
for residue_num in range(start, end):
......@@ -584,7 +584,7 @@ class SummaryInfo(object):
for record in all_records:
try:
if record.seq[residue_num] not in to_ignore:
weight = record.annotations.get('weight', 1.0)
weight = record.annotations.get("weight", 1.0)
freq_info[record.seq[residue_num]] += weight
total_count += weight
# getting a key error means we've got a problem with the alphabet
......@@ -676,6 +676,7 @@ class SummaryInfo(object):
return total_info
def get_column(self, col):
"""Return column of alignment."""
# TODO - Deprecate this and implement slicing?
return self.alignment[:, col]
......
......@@ -58,12 +58,12 @@ class DialignCommandline(AbstractCommandline):
"Anchored alignment. Requires a file <seq_file>.anc "
"containing anchor points."),
_Switch(["-cs", "cs"],
"If segments are translated, not only the `Watson "
"strand' but also the `Crick strand' is looked at."),
"If segments are translated, not only the 'Watson "
"strand' but also the 'Crick strand' is looked at."),
_Switch(["-cw", "cw"],
"Additional output file in CLUSTAL W format."),
_Switch(["-ds", "ds"],
"`dna alignment speed up' - non-translated nucleic acid "
"'dna alignment speed up' - non-translated nucleic acid "
"fragments are taken into account only if they start "
"with at least two matches. Speeds up DNA alignment at "
"the expense of sensitivity."),
......@@ -89,7 +89,7 @@ class DialignCommandline(AbstractCommandline):
"This option speeds up the alignment but may lead "
"to reduced alignment quality."),
_Switch(["-lgs", "lgs"],
"`long genomic sequences' - combines the following "
"'long genomic sequences' - combines the following "
"options: -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff, "
"-fop, -ff, -cs, -ds, -pst "),
_Switch(["-lgs_t", "lgs_t"],
......@@ -99,7 +99,7 @@ class DialignCommandline(AbstractCommandline):
"-lgs but not very sensitive for non-coding regions."),
_Option(["-lmax", "lmax"],
"Maximum fragment length = x (default: x = 40 or "
"x = 120 for `translated' fragments). Shorter x "
"x = 120 for 'translated' fragments). Shorter x "
"speeds up the program but may affect alignment quality.",
checker_function=lambda x: isinstance(x, int),
equate=False),
......@@ -108,11 +108,11 @@ class DialignCommandline(AbstractCommandline):
"about fragments selected for pairwise alignment and "
"about consistency in multi-alignment procedure."),
_Switch(["-ma", "ma"],
"`mixed alignments' consisting of P-fragments and "
"'mixed alignments' consisting of P-fragments and "
"N-fragments if nucleic acid sequences are aligned."),
_Switch(["-mask", "mask"],
"Residues not belonging to selected fragments are "
"replaced by `*' characters in output alignment "
"replaced by '*' characters in output alignment "
"(rather than being printed in lower-case characters)"),
_Switch(["-mat", "mat"],
"Creates file *mat with substitution counts derived "
......@@ -135,10 +135,10 @@ class DialignCommandline(AbstractCommandline):
"No translation of fragments."),
_Switch(["-nt", "nt"],
"Input sequences are nucleic acid sequences and "
"`nucleic acid segments' are translated to `peptide "
"'nucleic acid segments' are translated to 'peptide "
"segments'."),
_Switch(["-nta", "nta"],
"`no textual alignment' - textual alignment suppressed. "
"'no textual alignment' - textual alignment suppressed. "
"This option makes sense if other output files are of "
"interest -- e.g. the fragment files created with -ff, "
"-fop, -fsm or -lo."),
......@@ -161,7 +161,7 @@ class DialignCommandline(AbstractCommandline):
"alignment or alignment of translated DNA fragments "
"at the expense of sensitivity."),
_Option(["-stars", "stars"],
"Maximum number of `*' characters indicating degree "
"Maximum number of '*' characters indicating degree "
"of local similarity among sequences. By default, no "
"stars are used but numbers between 0 and 9, instead.",
checker_function=lambda x: x in range(0, 10),
......
......@@ -281,7 +281,7 @@ class MafftCommandline(AbstractCommandline):
_Option(["--tm", "tm"],
"Transmembrane PAM number (Jones et al. 1994) "
"matrix is used. number>0. Default: BLOSUM62",
filename=True,
filename=True, # to ensure spaced inputs are quoted
equate=False),
# Use a user-defined AA scoring matrix. The format of matrixfile is
# the same to that of BLAST. Ignored when nucleotide sequences are
......@@ -289,7 +289,7 @@ class MafftCommandline(AbstractCommandline):
_Option(["--aamatrix", "aamatrix"],
"Use a user-defined AA scoring matrix. "
"Default: BLOSUM62",
filename=True,
filename=True, # to ensure spaced inputs are quoted
equate=False),
# Incorporate the AA/nuc composition information into the scoring
# matrix. Default: off
......
......@@ -52,7 +52,12 @@ class MuscleCommandline(AbstractCommandline):
["pctid_kimura", "pctid_log"]
OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
# The nucleotide arguments for the sequence type parameter in MUSCLE (-seqtype)
# were updated at some point in MUSCLE version 3.8. Prior to the update
# 'nucleo' was used for nucleotide. This has been updated to 'rna' and 'dna'.
# 'nucleo' is kept for backwards compatibility with older MUSCLE versions.
SEQUENCE_TYPES = ["protein", "rna", "dna", "nucleo", "auto"]
WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb",
"gsc", "threeway"]
self.parameters = [
......@@ -304,8 +309,10 @@ class MuscleCommandline(AbstractCommandline):
filename=True,
equate=False),
# seqtype protein auto Sequence type.
# nucleo
# dna (MUSCLE version > 3.8)
# rna (MUSCLE version > 3.8)
# auto
# nucleo (only valid for MUSCLE versions < 3.8)
_Option(["-seqtype", "seqtype"],
"Sequence type",
checker_function=lambda x: x in SEQUENCE_TYPES,
......
......@@ -57,10 +57,7 @@ class TCoffeeCommandline(AbstractCommandline):
One (or more separated by a comma) of:
'clustalw_aln', 'clustalw', 'gcg', 'msf_aln',
'pir_aln', 'fasta_aln', 'phylip', 'pir_seq', 'fasta_seq'
Note that of these Biopython's AlignIO module will only
read clustalw, pir, and fasta.
""", # TODO - Can we read the PHYLIP output?
""",
equate=False),
_Option(["-infile", "infile"],
"Specify the input file.",
......
......@@ -15,18 +15,17 @@ class, used in the Bio.AlignIO module.
from __future__ import print_function
import sys # Only needed to check if we are using Python 2 or 3
from Bio._py3k import raise_from
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord, _RestrictedDict
from Bio import Alphabet
try:
from Bio.Align import _aligners
except ImportError as e:
new_exc = ImportError("{}: you should not import directly from the "
"biopython source directory; please exit the source "
"tree and re-launch your code from there".format(e))
new_exc.__cause__ = None
raise new_exc
# Import errors may occur here if a compiled aligners.c file
# (_aligners.pyd or _aligners.so) is missing or if the user is
# importing from within the Biopython source tree, see PR #2007:
# https://github.com/biopython/biopython/pull/2007
class MultipleSeqAlignment(object):
......@@ -460,12 +459,12 @@ class MultipleSeqAlignment(object):
# in BioPerl, but I'm not positive what the best thing to do on
# this is...
if start:
new_record.annotations['start'] = start
new_record.annotations["start"] = start
if end:
new_record.annotations['end'] = end
new_record.annotations["end"] = end
# another hack to add weight information to the sequence
new_record.annotations['weight'] = weight
new_record.annotations["weight"] = weight
self._records.append(new_record)
......@@ -681,11 +680,11 @@ class MultipleSeqAlignment(object):
alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet])
merged = (left + right for left, right in zip(self, other))
# Take any common annotation:
annotations = dict()
annotations = {}
for k, v in self.annotations.items():
if k in other.annotations and other.annotations[k] == v:
annotations[k] = v
column_annotations = dict()
column_annotations = {}
for k, v in self.column_annotations.items():
if k in other.column_annotations:
column_annotations[k] = v + other.column_annotations[k]
......@@ -970,11 +969,18 @@ class PairwiseAlignment(object):
return self.path >= other.path
def __format__(self, format_spec):
if format_spec == 'psl':
if format_spec == "psl":
return self._format_psl()
return str(self)
def __str__(self):
    """Return a human-readable representation of the alignment."""
    # Plain-string sequences use the compact three-line formatter;
    # anything else (e.g. sequences of arbitrary objects) goes through
    # the generalized formatter.
    plain = isinstance(self.query, str) and isinstance(self.target, str)
    return self.format() if plain else self._format_generalized()
def format(self):
"""Create a human-readable representation of the alignment."""
query = self.query
target = self.target
try:
......@@ -1000,23 +1006,23 @@ class PairwiseAlignment(object):
end1, end2 = path[0]
if end1 > 0 or end2 > 0:
end = max(end1, end2)
aligned_seq1 += "." * (end - end1) + seq1[:end1]
aligned_seq2 += "." * (end - end2) + seq2[:end2]
pattern += '.' * end
aligned_seq1 += " " * (end - end1) + seq1[:end1]
aligned_seq2 += " " * (end - end2) + seq2[:end2]
pattern += " " * end
start1 = end1
start2 = end2
for end1, end2 in path[1:]:
gap = 0
if end1 == start1:
gap = end2 - start2
aligned_seq1 += '-' * gap
aligned_seq1 += "-" * gap
aligned_seq2 += seq2[start2:end2]
pattern += '-' * gap
pattern += "-" * gap
elif end2 == start2:
gap = end1 - start1
aligned_seq1 += seq1[start1:end1]
aligned_seq2 += '-' * gap
pattern += '-' * gap
aligned_seq2 += "-" * gap
pattern += "-" * gap
else:
s1 = seq1[start1:end1]
s2 = seq2[start2:end2]
......@@ -1024,17 +1030,90 @@ class PairwiseAlignment(object):
aligned_seq2 += s2
for c1, c2 in zip(s1, s2):
if c1 == c2:
pattern += '|'
pattern += "|"
else:
pattern += 'X'
pattern += "."
start1 = end1
start2 = end2
n1 -= end1
n2 -= end2
n = max(n1, n2)
aligned_seq1 += seq1[end1:] + '.' * (n - n1)
aligned_seq2 += seq2[end2:] + '.' * (n - n2)
pattern += '.' * n
aligned_seq1 += seq1[end1:] + " " * (n - n1)
aligned_seq2 += seq2[end2:] + " " * (n - n2)
pattern += " " * n
return "%s\n%s\n%s\n" % (aligned_seq1, pattern, aligned_seq2)
def _format_generalized(self):
    """Return a printable three-line representation of the alignment.

    Used when target and/or query are not plain strings (e.g. lists of
    residue names): each element is rendered with str() and columns are
    joined with single spaces so multi-character elements stay aligned.

    NOTE(review): n1 and n2 are computed but never used, and - unlike
    format() - any unaligned trailing region is not rendered; confirm
    whether that is intentional.
    """
    seq1 = self.target
    seq2 = self.query
    n1 = len(seq1)
    n2 = len(seq2)
    aligned_seq1 = []
    aligned_seq2 = []
    pattern = []
    path = self.path
    # Leading unaligned region: render the overhang of the longer
    # sequence against blank padding on the other two lines.
    end1, end2 = path[0]
    if end1 > 0 or end2 > 0:
        if end1 <= end2:
            for c2 in seq2[:end2 - end1]:
                s2 = str(c2)
                s1 = " " * len(s2)
                aligned_seq1.append(s1)
                aligned_seq2.append(s2)
                pattern.append(s1)
        else:  # end1 > end2
            for c1 in seq1[:end1 - end2]:
                s1 = str(c1)
                s2 = " " * len(s1)
                aligned_seq1.append(s1)
                aligned_seq2.append(s2)
                pattern.append(s2)
    start1 = end1
    start2 = end2
    # Walk the alignment path: each step is either a gap in one of the
    # two sequences, or an aligned (possibly mismatching) chunk.
    for end1, end2 in path[1:]:
        if end1 == start1:
            # Gap in the target: dashes on line 1 and in the pattern.
            for c2 in seq2[start2:end2]:
                s2 = str(c2)
                s1 = "-" * len(s2)
                aligned_seq1.append(s1)
                aligned_seq2.append(s2)
                pattern.append(s1)
            start2 = end2
        elif end2 == start2:
            # Gap in the query: dashes on line 2 and in the pattern.
            for c1 in seq1[start1:end1]:
                s1 = str(c1)
                s2 = "-" * len(s1)
                aligned_seq1.append(s1)
                aligned_seq2.append(s2)
                pattern.append(s2)
            start1 = end1
        else:
            # Aligned chunk: '|' marks a match, '.' a mismatch; pad the
            # shorter rendering so columns keep the same width.
            for c1, c2 in zip(seq1[start1:end1], seq2[start2:end2]):
                s1 = str(c1)
                s2 = str(c2)
                m1 = len(s1)
                m2 = len(s2)
                if c1 == c2:
                    p = "|"
                else:
                    p = "."
                if m1 < m2:
                    space = (m2 - m1) * " "
                    s1 += space
                    pattern.append(p * m1 + space)
                elif m1 > m2:
                    space = (m1 - m2) * " "
                    s2 += space
                    pattern.append(p * m2 + space)
                else:
                    pattern.append(p * m1)
                aligned_seq1.append(s1)
                aligned_seq2.append(s2)
            start1 = end1
            start2 = end2
    aligned_seq1 = " ".join(aligned_seq1)
    aligned_seq2 = " ".join(aligned_seq2)
    pattern = " ".join(pattern)
    return "%s\n%s\n%s\n" % (aligned_seq1, pattern, aligned_seq2)
def _format_psl(self):
......@@ -1073,7 +1152,7 @@ class PairwiseAlignment(object):
blockSizes = []
qStarts = []
tStarts = []
strand = '+'
strand = "+"
start1 = 0
start2 = 0
start1, start2 = self.path[0]
......@@ -1104,7 +1183,7 @@ class PairwiseAlignment(object):
qStarts.append(start2)
blockSizes.append(count1)
for c1, c2 in zip(seq1[start1:end1], seq2[start2:end2]):
if c1 == 'N' or c2 == 'N':
if c1 == "N" or c2 == "N":
Ns += 1
elif c1 == c2:
match += 1
......@@ -1141,6 +1220,96 @@ class PairwiseAlignment(object):
line = "\t".join(words) + "\n"
return line
@property
def aligned(self):
    """Return the indices of subsequences aligned to each other.

    This property returns the start and end indices of subsequences
    in the target and query sequence that were aligned to each other.
    If the alignment between target (t) and query (q) consists of N
    chunks, you get two tuples of length N:

    (((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)),
     ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN)))

    For example,

    >>> from Bio import Align
    >>> aligner = Align.PairwiseAligner()
    >>> alignments = aligner.align("GAACT", "GAT")
    >>> alignment = alignments[0]
    >>> print(alignment)
    GAACT
    ||--|
    GA--T
    <BLANKLINE>
    >>> alignment.aligned
    (((0, 2), (4, 5)), ((0, 2), (2, 3)))
    >>> alignment = alignments[1]
    >>> print(alignment)
    GAACT
    |-|-|
    G-A-T
    <BLANKLINE>
    >>> alignment.aligned
    (((0, 1), (2, 3), (4, 5)), ((0, 1), (1, 2), (2, 3)))

    Note that different alignments may have the same subsequences
    aligned to each other. In particular, this may occur if alignments
    differ from each other in terms of their gap placement only:

    >>> aligner.mismatch_score = -10
    >>> alignments = aligner.align("AAACAAA", "AAAGAAA")
    >>> len(alignments)
    2
    >>> print(alignments[0])
    AAAC-AAA
    |||--|||
    AAA-GAAA
    <BLANKLINE>
    >>> alignments[0].aligned
    (((0, 3), (4, 7)), ((0, 3), (4, 7)))
    >>> print(alignments[1])
    AAA-CAAA
    |||--|||
    AAAG-AAA
    <BLANKLINE>
    >>> alignments[1].aligned
    (((0, 3), (4, 7)), ((0, 3), (4, 7)))

    The property can be used to identify alignments that are identical
    to each other in terms of their aligned sequences.
    """
    # self.path is a sequence of (target_index, query_index) nodes.
    # A step that advances in BOTH sequences is an aligned (ungapped)
    # chunk; a step that advances in only one of them is a gap and is
    # skipped.  The Python-2/3 split below only normalizes the node
    # values; the merging loop itself is shared (previously it was
    # duplicated verbatim in both branches).
    if sys.version_info[0] > 2:
        path = self.path
    else:
        # Python 2: convert all long ints to ints to be consistent
        # with the doctests.
        path = [(int(i1), int(i2)) for i1, i2 in self.path]
    segments1 = []
    segments2 = []
    i1, i2 = path[0]
    for j1, j2 in path[1:]:
        if j1 > i1 and j2 > i2:
            segments1.append((i1, j1))
            segments2.append((i2, j2))
        i1, i2 = j1, j2
    return tuple(segments1), tuple(segments2)
class PairwiseAlignments(object):
"""Implements an iterator over pairwise alignments returned by the aligner.
......@@ -1187,7 +1356,7 @@ class PairwiseAlignments(object):
try:
alignment = next(self)
except StopIteration:
raise IndexError('index out of range')
raise_from(IndexError("index out of range"), None)
return alignment
def __iter__(self):
......@@ -1233,60 +1402,60 @@ class PairwiseAligner(_aligners.PairwiseAligner):
>>> from Bio import Align
>>> aligner = Align.PairwiseAligner()
>>> alignments = aligner.align("ACCGT", "ACG")
>>> alignments = aligner.align("TACCG", "ACG")
>>> for alignment in sorted(alignments):
... print("Score = %.1f:" % alignment.score)
... print(alignment)
...
Score = 3.0:
ACCGT
|-||-
A-CG-
TACCG
-|-||
-A-CG
<BLANKLINE>
Score = 3.0:
ACCGT
||-|-
AC-G-
TACCG
-||-|
-AC-G
<BLANKLINE>
Specify the aligner mode as local to generate local alignments:
>>> aligner.mode = 'local'
>>> alignments = aligner.align("ACCGT", "ACG")
>>> alignments = aligner.align("TACCG", "ACG")
>>> for alignment in sorted(alignments):
... print("Score = %.1f:" % alignment.score)
... print(alignment)
...
Score = 3.0:
ACCGT
|-||.
A-CG.
TACCG
|-||
A-CG
<BLANKLINE>
Score = 3.0:
ACCGT
||-|.
AC-G.
TACCG
||-|
AC-G
<BLANKLINE>
Do a global alignment. Identical characters are given 2 points,
1 point is deducted for each non-identical character.
>>> aligner.mode = 'global'
>>> aligner.match = 2
>>> aligner.mismatch = -1
>>> for alignment in aligner.align("ACCGT", "ACG"):
>>> aligner.match_score = 2
>>> aligner.mismatch_score = -1
>>> for alignment in aligner.align("TACCG", "ACG"):
... print("Score = %.1f:" % alignment.score)
... print(alignment)
...
Score = 6.0:
ACCGT
||-|-
AC-G-
TACCG
-||-|
-AC-G
<BLANKLINE>
Score = 6.0:
ACCGT
|-||-
A-CG-
TACCG
-|-||
-A-CG
<BLANKLINE>
Same as above, except now 0.5 points are deducted when opening a
......@@ -1296,27 +1465,27 @@ class PairwiseAligner(_aligners.PairwiseAligner):
>>> aligner.extend_gap_score = -0.1
>>> aligner.target_end_gap_score = 0.0
>>> aligner.query_end_gap_score = 0.0
>>> for alignment in aligner.align("ACCGT", "ACG"):
>>> for alignment in aligner.align("TACCG", "ACG"):
... print("Score = %.1f:" % alignment.score)
... print(alignment)
...
Score = 5.5:
ACCGT
|-||-
A-CG-
TACCG
-|-||
-A-CG
<BLANKLINE>
Score = 5.5:
ACCGT
||-|-
AC-G-
TACCG
-||-|
-AC-G
<BLANKLINE>
The alignment function can also use known matrices already included in
Biopython:
>>> from Bio.SubsMat import MatrixInfo
>>> from Bio.Align import substitution_matrices
>>> aligner = Align.PairwiseAligner()
>>> aligner.substitution_matrix = MatrixInfo.blosum62
>>> aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
>>> alignments = aligner.align("KEVLA", "EVL")
>>> alignments = list(alignments)
>>> print("Number of alignments: %d" % len(alignments))
......@@ -1332,15 +1501,28 @@ class PairwiseAligner(_aligners.PairwiseAligner):
"""
def __setattr__(self, key, value):
    """Set an existing aligner attribute; refuse to create new ones.

    Restricting attribute creation catches typos such as
    ``aligner.mach_score = 2`` early instead of silently ignoring them.
    """
    allowed = dir(_aligners.PairwiseAligner)
    if key in allowed:
        _aligners.PairwiseAligner.__setattr__(self, key, value)
    else:
        raise AttributeError(
            "'PairwiseAligner' object has no attribute '%s'" % key
        )
def align(self, seqA, seqB):
    """Return the alignments of two sequences using PairwiseAligner."""
    # The C extension works on plain strings, so unwrap Seq objects.
    strA = str(seqA) if isinstance(seqA, Seq) else seqA
    strB = str(seqB) if isinstance(seqB, Seq) else seqB
    score, paths = _aligners.PairwiseAligner.align(self, strA, strB)
    return PairwiseAlignments(strA, strB, score, paths)
def score(self, seqA, seqB):
    """Return the alignment score of two sequences using PairwiseAligner."""
    # The C extension works on plain strings, so unwrap Seq objects.
    strA = str(seqA) if isinstance(seqA, Seq) else seqA
    strB = str(seqB) if isinstance(seqB, Seq) else seqB
    return _aligners.PairwiseAligner.score(self, strA, strB)
......
This diff is collapsed.
"""Substitution matrices."""
import os
import string
import numpy
from Bio import File
from Bio import BiopythonExperimentalWarning
from Bio._py3k import raise_from
# These two can be removed once we drop python2:
import sys
import platform
import warnings
warnings.warn("Bio.Align.substitution_matrices is an experimental module "
"which may still undergo significant changes. In particular, "
"the location of this module may change, and the Array class "
"defined in this module may be moved to other existing or new "
"modules in Biopython.",
BiopythonExperimentalWarning)
class Array(numpy.ndarray):
"""numpy array subclass indexed by integers and by letters."""
def __new__(cls, alphabet=None, dims=None, data=None, dtype=float):
"""Create a new Array instance."""
if isinstance(data, dict):
if alphabet is not None:
raise ValueError("alphabet should be None if data is a dict")
if dims is not None:
raise ValueError("dims should be None if data is a dict")
alphabet = []
for key in data:
if isinstance(key, str):
if dims is None:
dims = 1
elif dims != 1:
raise ValueError("inconsistent dimensions in data")
alphabet.append(key)
elif isinstance(key, tuple):
single_letters = True
if dims is None:
dims = len(key)
elif dims != len(key):
raise ValueError("inconsistent dimensions in data")
if dims == 1:
if not isinstance(key, str):
raise ValueError("expected string")
if len(key) > 1:
single_letters = False
alphabet.append(key)
elif dims == 2:
for letter in key:
if not isinstance(letter, str):
raise ValueError("expected string")
if len(letter) > 1:
single_letters = False
alphabet.append(letter)
else:
raise ValueError("data array should be 1- or 2- "
"dimensional (found %d dimensions) "
"in key" % dims)
alphabet = sorted(set(alphabet))
if single_letters:
alphabet = "".join(alphabet)
else:
alphabet = tuple(alphabet)
n = len(alphabet)
if dims == 1:
shape = (n, )
elif dims == 2:
shape = (n, n)
else: # dims is None
raise ValueError("data is an empty dictionary")
obj = super(Array, cls).__new__(cls, shape, dtype)
if dims == 1:
for i, key in enumerate(alphabet):
obj[i] = data.get(letter, 0.0)
elif dims == 2:
for i1, letter1 in enumerate(alphabet):
for i2, letter2 in enumerate(alphabet):
key = (letter1, letter2)
value = data.get(key, 0.0)
obj[i1, i2] = value
obj._alphabet = alphabet
return obj
if alphabet is None:
alphabet = string.ascii_uppercase
elif not (isinstance(alphabet, str) or isinstance(alphabet, tuple)):
raise ValueError("alphabet should be a string or a tuple")
n = len(alphabet)
if data is None:
if dims is None:
dims = 1
elif dims not in (1, 2):
raise ValueError("dims should be 1 or 2 (found %d)" % dims)
shape = (n, ) * dims
else:
if dims is None:
shape = data.shape
dims = len(shape)
if dims == 1:
pass
elif dims == 2:
if shape[0] != shape[1]:
raise ValueError("data array is not square")
else:
raise ValueError("data array should be 1- or 2- "
"dimensional (found %d dimensions) "
% dims)
else:
shape = (n, ) * dims
if data.shape != shape:
raise ValueError("data shape has inconsistent shape "
"(expected (%s), found (%s))"
% (shape, data.shape))
obj = super(Array, cls).__new__(cls, shape, dtype)
if data is None:
obj[:] = 0.0
else:
obj[:] = data
obj._alphabet = alphabet
return obj
def __array_finalize__(self, obj):
if obj is None:
return
self._alphabet = getattr(obj, "_alphabet", None)
def _convert_key(self, key):
if isinstance(key, tuple):
indices = []
for index in key:
if isinstance(index, str):
try:
index = self._alphabet.index(index)
except ValueError:
raise_from(IndexError("'%s'" % index), None)
indices.append(index)
key = tuple(indices)
elif isinstance(key, str):
try:
key = self._alphabet.index(key)
except ValueError:
raise_from(IndexError("'%s'" % key), None)
return key
def __getitem__(self, key):
    """Return self[key], supporting letter-based indexing.

    String keys (or tuples of strings) are converted to integer
    indices via the alphabet before delegating to numpy.
    """
    key = self._convert_key(key)
    value = numpy.ndarray.__getitem__(self, key)
    if value.ndim == 2:
        if self.ndim == 2:
            # A 2-D selection from a 2-D array must keep the full
            # shape; anything smaller would no longer match the
            # alphabet.
            if value.shape != self.shape:
                raise IndexError("Requesting truncated array")
        elif self.ndim == 1:
            # A 1-D array may only yield a 2-D result as a full-length
            # row or column vector.
            length = self.shape[0]
            if value.shape[0] == length and value.shape[1] == 1:
                pass
            elif value.shape[0] == 1 and value.shape[1] == length:
                pass
            else:
                raise IndexError("Requesting truncated array")
    elif value.ndim == 1:
        if value.shape[0] != self.shape[0]:
            # NOTE(review): this branch looks suspicious -- a truncated
            # 1-D selection is given a sliced alphabet rather than
            # raising IndexError like the 2-D branches do; confirm
            # against the upstream implementation.
            value._alphabet = self.alphabet[key]
    return value.view(Array)
def __setitem__(self, key, value):
key = self._convert_key(key)
numpy.ndarray.__setitem__(self, key, value)
def __contains__(self, key):
# Follow dict definition of __contains__
return key in self.keys()
def __array_prepare__(self, out_arr, context=None):
    """Check alphabet consistency before a ufunc runs.

    NOTE(review): the default context=None would fail the tuple
    unpacking below -- presumably numpy always supplies a context when
    invoking this hook; confirm.
    """
    # needed for numpy older than 1.13.0
    ufunc, inputs, i = context
    alphabet = self.alphabet
    # Refuse to mix Arrays that are indexed by different alphabets.
    for arg in inputs:
        if isinstance(arg, Array):
            if arg.alphabet != alphabet:
                raise ValueError("alphabets are inconsistent")
    return numpy.ndarray.__array_prepare__(self, out_arr, context)
def __array_wrap__(self, out_arr, context=None):
    """Unwrap single-element ufunc results to a scalar-like value.

    NOTE(review): len() raises TypeError on 0-d arrays -- presumably
    out_arr always has at least one dimension here; confirm.
    """
    if len(out_arr) == 1:
        return out_arr[0]
    return numpy.ndarray.__array_wrap__(self, out_arr, context)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Apply a numpy ufunc, keeping alphabets consistent.

    All Array operands (inputs and explicit outputs) must share this
    array's alphabet.  They are unwrapped to plain ndarrays before the
    ufunc runs; non-scalar results are re-wrapped as Arrays carrying
    this array's alphabet.
    """
    args = []
    alphabet = self._alphabet
    # Unwrap Array inputs, verifying their alphabets first.
    for arg in inputs:
        if isinstance(arg, Array):
            if arg.alphabet != alphabet:
                raise ValueError("alphabets are inconsistent")
            args.append(arg.view(numpy.ndarray))
        else:
            args.append(arg)
    outputs = kwargs.pop("out", None)
    if outputs:
        # Same unwrapping/validation for explicit output arrays.
        out_args = []
        for arg in outputs:
            if isinstance(arg, Array):
                if arg.alphabet != alphabet:
                    raise ValueError("alphabets are inconsistent")
                out_args.append(arg.view(numpy.ndarray))
            else:
                out_args.append(arg)
        kwargs["out"] = tuple(out_args)
    else:
        outputs = (None,) * ufunc.nout
    raw_results = super(Array, self).__array_ufunc__(ufunc, method,
                                                     *args, **kwargs)
    if raw_results is NotImplemented:
        return NotImplemented
    if method == "at":
        # In-place "at" operations return nothing.
        return
    if ufunc.nout == 1:
        raw_results = (raw_results,)
    results = []
    for raw_result, output in zip(raw_results, outputs):
        if raw_result.ndim == 0:
            # Scalar result: return it unwrapped.
            result = raw_result
        elif output is None:
            result = numpy.asarray(raw_result).view(Array)
            result._alphabet = self._alphabet
        else:
            result = output
            result._alphabet = self._alphabet
        results.append(result)
    return results[0] if len(results) == 1 else results
def transpose(self, axes=None):
"""Transpose the array."""
other = numpy.ndarray.transpose(self, axes)
other._alphabet = self._alphabet
return other
@property
def alphabet(self):
"""Return the alphabet property."""
return self._alphabet
def copy(self):
"""Create and return a copy of the array."""
other = Array(alphabet=self._alphabet, data=self)
return other
def get(self, key, value=None):
"""Return the value of the key if found; return value otherwise."""
try:
return self[key]
except IndexError:
return value
def items(self):
"""Return an iterator of (key, value) pairs in the array."""
dims = len(self.shape)
if dims == 1:
for index, key in enumerate(self._alphabet):
value = numpy.ndarray.__getitem__(self, index)
yield key, value
elif dims == 2:
for i1, c1 in enumerate(self._alphabet):
for i2, c2 in enumerate(self._alphabet):
key = (c1, c2)
value = numpy.ndarray.__getitem__(self, (i1, i2))
yield key, value
else:
raise RuntimeError("array has unexpected shape %s" % self.shape)
def keys(self):
"""Return a tuple with the keys associated with the array."""
dims = len(self.shape)
alphabet = self._alphabet
if dims == 1:
return tuple(alphabet)
elif dims == 2:
return tuple((c1, c2) for c2 in alphabet for c1 in alphabet)
else:
raise RuntimeError("array has unexpected shape %s" % self.shape)
def values(self):
"""Return a tuple with the values stored in the array."""
dims = len(self.shape)
alphabet = self._alphabet
if dims == 1:
return tuple(self)
elif dims == 2:
n1, n2 = self.shape
return tuple(numpy.ndarray.__getitem__(self, (i1, i2)) for i2 in range(n2) for i1 in range(n1))
else:
raise RuntimeError("array has unexpected shape %s" % self.shape)
def update(self, E=None, **F):
"""Update the array from dict/iterable E and F."""
if E is not None:
try:
alphabet = E.keys()
except AttributeError:
for key, value in E:
self[key] = value
else:
for key in E:
self[key] = E[key]
for key in F:
self[key] = F[key]
def _format_1D(self, fmt):
    """Render a 1-D array as one 'letter value' line per key.

    fmt is the %-style format applied to each value.  Header lines
    (if self.header was set, e.g. by read()) are emitted first as
    '# ...' lines.
    """
    _alphabet = self._alphabet
    n = len(_alphabet)
    words = [None for i in range(n)]
    lines = []
    try:
        header = self.header
    except AttributeError:
        pass
    else:
        for line in header:
            line = "# %s\n" % line
            lines.append(line)
    maxwidth = 0
    # First pass: format every value to determine the column width.
    for i, key in enumerate(_alphabet):
        value = self[key]
        word = fmt % value
        width = len(word)
        if width > maxwidth:
            maxwidth = width
        words[i] = word
    fmt2 = " %" + str(maxwidth) + "s"
    # Second pass: emit the right-aligned values.
    for letter, word in zip(_alphabet, words):
        word = fmt2 % word
        line = letter + word + "\n"
        lines.append(line)
    text = "".join(lines)
    return text
def _format_2D(self, fmt):
    """Render a 2-D array as a table with letter row/column labels.

    fmt is the %-style format applied to each value.  Header lines
    (if self.header was set, e.g. by read()) are emitted first as
    '# ...' lines.  Each column is sized independently and values are
    right-aligned.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-mangled source; the header row is assumed to be
    finalized once, after all columns have been formatted -- confirm
    against the upstream implementation.
    """
    alphabet = self.alphabet
    n = len(alphabet)
    words = [[None for j in range(n)] for i in range(n)]
    lines = []
    try:
        header = self.header
    except AttributeError:
        pass
    else:
        for line in header:
            line = "# %s\n" % line
            lines.append(line)
    width = max(len(c) for c in alphabet)
    line = " " * width
    # Format one column at a time so each column gets its own width.
    for j, c2 in enumerate(alphabet):
        maxwidth = 0
        for i, c1 in enumerate(alphabet):
            key = (c1, c2)
            value = self[key]
            word = fmt % value
            width = len(word)
            if width > maxwidth:
                maxwidth = width
            words[i][j] = word
        fmt2 = " %" + str(maxwidth) + "s"
        word = fmt2 % c2
        line += word
        for i, c1 in enumerate(alphabet):
            word = words[i][j]
            words[i][j] = fmt2 % word
    line = line.rstrip() + "\n"
    lines.append(line)
    for letter, row in zip(alphabet, words):
        line = letter + "".join(row) + "\n"
        lines.append(line)
    text = "".join(lines)
    return text
def __format__(self, fmt):
    """Render the array as text; fmt is a %-style per-value format.

    An empty fmt defaults to "%i" for integer dtypes and "%.1f"
    otherwise.
    """
    if not fmt:
        fmt = "%i" if numpy.issubdtype(self.dtype, numpy.integer) else "%.1f"
    rank = len(self.shape)
    if rank == 1:
        return self._format_1D(fmt)
    if rank == 2:
        return self._format_2D(fmt)
    raise RuntimeError("Array has unexpected rank %d" % rank)
def __str__(self):
    """Return the default text rendering of the array."""
    return format(self, "")
def __repr__(self):
text = numpy.ndarray.__repr__(self)
alphabet = self._alphabet
if isinstance(alphabet, str):
assert text.endswith(")")
text = text[:-1] + ",\n alphabet='%s')" % self._alphabet
return text
if sys.version_info[0] < 3 and platform.python_implementation() == "PyPy":
# For python2 on PyPy, subclassing from a numpy array, which supports the
# buffer protocol, loses the Py_TPFLAGS_HAVE_NEWBUFFER flag on tp_flags on
# the class type, although the subclass still supports the buffer protocol.
# Adding this flag by hand here, as a temporary hack until we drop python2.
from .. import _aligners
_aligners.add_buffer_protocol_flag(Array)
def read(handle, dtype=float):
    """Parse a substitution-matrix file and return an Array object.

    Leading '#' lines are collected (without the '#') into the
    returned matrix's .header attribute.  A file whose first two data
    rows have two columns each is read as a 1-D array of key/value
    lines; otherwise the first data row is taken to be the alphabet of
    a 2-D matrix whose following rows each start with their row letter.

    NOTE(review): a file consisting only of comment lines, or with a
    single data row, would fail here (stale 'line' / IndexError on
    rows[1]); confirm that inputs are always well-formed.
    """
    header = []
    with File.as_handle(handle) as fp:
        for line in fp:
            if not line.startswith("#"):
                break
            header.append(line[1:].strip())
        # 'line' is now the first non-comment line; the rest of the
        # handle holds the remaining data rows.
        row = line.split()
        rows = [row]
        for line in fp:
            row = line.split()
            rows.append(row)
    if len(rows[0]) == len(rows[1]) == 2:
        # 1-D format: each row is "<key> <value>".
        alphabet = [key for key, value in rows]
        for key in alphabet:
            if len(key) > 1:
                # Any multi-letter key forces a tuple alphabet.
                alphabet = tuple(alphabet)
                break
        else:
            alphabet = "".join(alphabet)
        matrix = Array(alphabet=alphabet, dims=1, dtype=dtype)
        matrix.update(rows)
    else:
        # 2-D format: the first row is the alphabet; each following
        # row starts with its row letter.
        alphabet = rows.pop(0)
        for key in alphabet:
            if len(key) > 1:
                alphabet = tuple(alphabet)
                break
        else:
            alphabet = "".join(alphabet)
        matrix = Array(alphabet=alphabet, dims=2, dtype=dtype)
        for letter1, row in zip(alphabet, rows):
            assert letter1 == row.pop(0)
            for letter2, word in zip(alphabet, row):
                matrix[letter1, letter2] = float(word)
    matrix.header = header
    return matrix
def load(name=None):
    """Load and return a precalculated substitution matrix.

    With no argument, return the sorted list of available matrix names
    instead:

    >>> from Bio.Align import substitution_matrices
    >>> names = substitution_matrices.load()
    """
    # Matrix files ship in the "data" subdirectory next to this module.
    data_directory = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "data"
    )
    if name is None:
        return sorted(os.listdir(data_directory))
    return read(os.path.join(data_directory, name))