Skip to content
Commits on Source (5)
# Check on http://lint.travis-ci.org/ after modifying it! Originally
# modified from https://gist.github.com/dan-blanchard/7045057
sudo: false
language: python
language: c
os:
- linux
- osx
env:
- PYTHON_VERSION=3.6 MAKE_DOC=TRUE
- PYTHON_VERSION=3.5 MAKE_DOC=TRUE
- PYTHON_VERSION=3.4 USE_CYTHON=TRUE
- CONDA_PY=3.6 MAKE_DOC=TRUE
- CONDA_PY=3.5 MAKE_DOC=TRUE USE_CYTHON=TRUE
before_install:
- "export DISPLAY=:99.0"
- "sh -e /etc/init.d/xvfb start"
- wget http://repo.continuum.io/miniconda/Miniconda3-3.7.3-Linux-x86_64.sh -O miniconda.sh
- chmod +x miniconda.sh
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda3/bin:$PATH
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; fi
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- hash -r
# Update conda itself
- conda update --yes conda
install:
- conda create --yes -n env_name python=$PYTHON_VERSION
- conda create --yes -n env_name python=$CONDA_PY
- conda install --yes -n env_name --file ci/conda_requirements.txt
- if [ ${USE_CYTHON} ]; then conda install --yes -n env_name cython; fi
- source activate env_name
......@@ -24,11 +27,14 @@ install:
- python --version
- pip install -r ci/pip_requirements.txt
- pip install . --no-deps
script:
- WITH_COVERAGE=TRUE make test
- if [ ${MAKE_DOC} ]; then make -C doc clean html; fi
after_success:
- coveralls
notifications:
webhooks:
urls:
......
# scikit-bio changelog
## Version 0.5.3 (2018-08-07)
### Features
* Added `unpack` and `unpack_by_func` to `skbio.tree.TreeNode` to unpack one or multiple internal nodes. The "unpack" operation removes an internal node and regrafts its children to its parent while retaining the overall length.
* Added `support` to `skbio.tree.TreeNode` to return the support value of a node.
* Added `permdisp` to `skbio.stats.distance` to test for the homogeneity of group dispersions. ([#1228](https://github.com/biocore/scikit-bio/issues/1228)).
* Added `pcoa_biplot` to `skbio.stats.ordination` to project descriptors into a PCoA plot.
* Restricted pandas to versions earlier than 0.23.0 (i.e. 0.22.x at most) because of this issue: https://github.com/pandas-dev/pandas/issues/20527
### Backward-incompatible changes [stable]
### Backward-incompatible changes [experimental]
### Performance enhancements
### Bug fixes
* Relaxing type checking in diversity calculations. ([#1583](https://github.com/biocore/scikit-bio/issues/1583)).
### Deprecated functionality [stable]
### Deprecated functionality [experimental]
### Miscellaneous
## Version 0.5.2 (2018-04-18)
### Features
......
......@@ -3,7 +3,7 @@
:target: http://scikit-bio.org
:alt: scikit-bio logo
|Build Status| |Coverage Status| |ASV Benchmarks| |Gitter Badge| |Depsy Badge| |Anaconda Cloud Build| |Anaconda Cloud| |License| |Downloads| |Install|
|Build Status| |Coverage Status| |ASV Benchmarks| |Gitter Badge| |Depsy Badge| |Anaconda Build Platforms| |Anaconda Build Version| |License| |Downloads| |Install|
scikit-bio is an open-source, BSD-licensed Python 3 package providing data structures, algorithms and educational resources for bioinformatics.
......@@ -118,9 +118,9 @@ scikit-bio's logo was created by `Alina Prassas <http://cargocollective.com/alin
:target: https://gitter.im/biocore/scikit-bio?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. |Depsy Badge| image:: http://depsy.org/api/package/pypi/scikit-bio/badge.svg
:target: http://depsy.org/package/python/scikit-bio
.. |Anaconda Cloud Build| image:: https://anaconda.org/conda-forge/scikit-bio/badges/build.svg
:target: https://anaconda.org/biocore/scikit-bio/builds
.. |Anaconda Cloud| image:: https://anaconda.org/conda-forge/scikit-bio/badges/version.svg
.. |Anaconda Build Platforms| image:: https://anaconda.org/conda-forge/scikit-bio/badges/platforms.svg
:target: https://anaconda.org/conda-forge/scikit-bio
.. |Anaconda Build Version| image:: https://anaconda.org/conda-forge/scikit-bio/badges/version.svg
:target: https://anaconda.org/conda-forge/scikit-bio
.. |License| image:: https://anaconda.org/conda-forge/scikit-bio/badges/license.svg
:target: https://anaconda.org/conda-forge/scikit-bio
......
......@@ -3,7 +3,7 @@ pip
numpy<1.14.0
scipy
matplotlib
pandas
pandas<0.23.0
nose
pep8
ipython
......
......@@ -7,3 +7,4 @@ Sphinx
sphinx-bootstrap-theme
numpydoc < 0.8.0
check-manifest
hdmedians
python-skbio (0.5.3-1) UNRELEASED; urgency=medium
* New upstream version
* Standards-Version: 4.2.0
TODO: Needs python-hdmedians (to be packaged)
-- Andreas Tille <tille@debian.org> Wed, 15 Aug 2018 15:21:02 +0200
python-skbio (0.5.2-1) unstable; urgency=medium
* New upstream version
......
......@@ -27,7 +27,7 @@ Build-Depends: debhelper (>= 11~),
python3-sphinx,
python3-sphinx-bootstrap-theme,
libssw-dev
Standards-Version: 4.1.4
Standards-Version: 4.2.0
Vcs-Browser: https://salsa.debian.org/med-team/python-skbio
Vcs-Git: https://salsa.debian.org/med-team/python-skbio.git
Homepage: https://github.com/biocore/scikit-bio
......
......@@ -120,9 +120,10 @@ setup(name='scikit-bio',
# a numpy 1.14.0 conda package on `defaults` or `conda-forge`
# channels.
'numpy >= 1.9.2, < 1.14.0',
'pandas >= 0.19.2',
'pandas >= 0.19.2, < 0.23.0',
'scipy >= 0.15.1',
'nose >= 1.3.7',
'hdmedians >= 0.13',
'scikit-learn >= 0.19.1'
],
test_suite='nose.collector',
......
......@@ -26,7 +26,7 @@ __all__ = ['Sequence', 'DNA', 'RNA', 'Protein', 'GeneticCode',
'TreeNode', 'nj', 'read', 'write', 'OrdinationResults']
__credits__ = "https://github.com/biocore/scikit-bio/graphs/contributors"
__version__ = "0.5.2"
__version__ = "0.5.3"
mottos = [
# 03/15/2014
......
......@@ -735,7 +735,7 @@ def local_pairwise_align_ssw(sequence1, sequence2, **kwargs):
return msa, alignment.optimal_alignment_score, start_end
@deprecated(as_of="0.4.0", until="0.5.3",
@deprecated(as_of="0.4.0", until="0.5.4",
reason="Will be replaced by a SubstitutionMatrix class. To track "
"progress, see [#161]"
"(https://github.com/biocore/scikit-bio/issues/161).")
......
......@@ -182,7 +182,7 @@ def alpha_diversity(metric, counts, ids=None, validate=True, **kwargs):
return pd.Series(results, index=ids)
@deprecated(as_of='0.5.0', until='0.5.3',
@deprecated(as_of='0.5.0', until='0.6.0',
reason=('The return type is unstable. Developer caution is '
'advised. The resulting DistanceMatrix object will '
'include zeros when distance has not been calculated, and '
......
......@@ -21,10 +21,8 @@ def _validate_counts_vector(counts, suppress_cast=False):
"""
counts = np.asarray(counts)
if not suppress_cast:
counts = counts.astype(int, casting='safe', copy=False)
if not np.all(np.isreal(counts)):
raise ValueError("Counts vector must contain real-valued entries.")
if counts.ndim != 1:
raise ValueError("Only 1-D vectors are supported.")
elif (counts < 0).any():
......
......@@ -57,9 +57,6 @@ class ValidationTests(TestCase):
self.assertEqual(obs.dtype, int)
def test_validate_counts_vector_invalid_input(self):
# wrong dtype
with self.assertRaises(TypeError):
_validate_counts_vector([0, 2, 1.2, 3])
# wrong number of dimensions (2-D)
with self.assertRaises(ValueError):
......@@ -73,6 +70,10 @@ class ValidationTests(TestCase):
with self.assertRaises(ValueError):
_validate_counts_vector([0, 0, 2, -1, 3])
# strings
with self.assertRaises(ValueError):
_validate_counts_vector([0, 0, 'a', -1, 3])
def test_validate_counts_matrix(self):
# basic valid input (n=2)
obs = _validate_counts_matrix([[0, 1, 1, 0, 2],
......@@ -101,8 +102,6 @@ class ValidationTests(TestCase):
npt.assert_array_equal(obs[1], np.array([42.2, 42.1, 1.0]))
self.assertEqual(obs[0].dtype, float)
self.assertEqual(obs[1].dtype, float)
with self.assertRaises(TypeError):
_validate_counts_matrix([[0.0], [1]], suppress_cast=False)
def test_validate_counts_matrix_negative_counts(self):
with self.assertRaises(ValueError):
......
......@@ -281,7 +281,7 @@ class GrammaredSequence(Sequence, metaclass=GrammaredSequenceMeta):
return set(cls.degenerate_map)
@classproperty
@deprecated(as_of='0.5.0', until='0.5.3',
@deprecated(as_of='0.5.0', until='0.6.0',
reason='Renamed to definite_chars')
def nondegenerate_chars(cls):
"""Return non-degenerate characters.
......@@ -492,7 +492,7 @@ class GrammaredSequence(Sequence, metaclass=GrammaredSequenceMeta):
"""
return np.in1d(self._bytes, self._definite_char_codes)
@deprecated(as_of='0.5.0', until='0.5.3',
@deprecated(as_of='0.5.0', until='0.6.0',
reason='Renamed to definites')
def nondegenerates(self):
"""Find positions containing non-degenerate characters in the sequence.
......@@ -548,7 +548,7 @@ class GrammaredSequence(Sequence, metaclass=GrammaredSequenceMeta):
# TODO: cache results
return bool(self.definites().any())
@deprecated(as_of='0.5.0', until='0.5.3',
@deprecated(as_of='0.5.0', until='0.6.0',
reason='Renamed to has_definites')
def has_nondegenerates(self):
"""Determine if sequence contains one or more non-degenerate characters
......
......@@ -154,6 +154,7 @@ Categorical Variable Stats
anosim
permanova
permdisp
Continuous Variable Stats
^^^^^^^^^^^^^^^^^^^^^^^^^
......@@ -196,9 +197,10 @@ from ._bioenv import bioenv
from ._anosim import anosim
from ._permanova import permanova
from ._mantel import mantel, pwmantel
from ._permdisp import permdisp
__all__ = ['DissimilarityMatrixError', 'DistanceMatrixError', 'MissingIDError',
'DissimilarityMatrix', 'DistanceMatrix', 'randdm', 'anosim',
'permanova', 'bioenv', 'mantel', 'pwmantel']
'permanova', 'bioenv', 'mantel', 'pwmantel', 'permdisp']
test = TestRunner(__file__).test
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from functools import partial
import numpy as np
from scipy.stats import f_oneway
from scipy.spatial.distance import cdist
import hdmedians as hd
from ._base import (_preprocess_input, _run_monte_carlo_stats, _build_results)
from skbio.stats.ordination import pcoa
from skbio.util._decorator import experimental
@experimental(as_of="0.5.3")
def permdisp(distance_matrix, grouping, column=None, test='median',
             permutations=999):
    """Test for Homogeneity of Multivariate Groups Dispersions using Marti
    Anderson's PERMDISP2 procedure.

    PERMDISP is a multivariate analogue of Levene's test for homogeneity of
    multivariate variances. Distances are handled by reducing the
    original distances to principal coordinates. PERMDISP calculates an
    F-statistic to assess whether the dispersions between groups are
    significant.

    Parameters
    ----------
    distance_matrix : DistanceMatrix
        Distance matrix containing distances between objects (e.g., distances
        between samples of microbial communities).
    grouping : 1-D array_like or pandas.DataFrame
        Vector indicating the assignment of objects to groups. For example,
        these could be strings or integers denoting which group an object
        belongs to. If `grouping` is 1-D ``array_like``, it must be the same
        length and in the same order as the objects in `distance_matrix`. If
        `grouping` is a ``DataFrame``, the column specified by `column` will be
        used as the grouping vector. The ``DataFrame`` must be indexed by the
        IDs in `distance_matrix` (i.e., the row labels must be distance matrix
        IDs), but the order of IDs between `distance_matrix` and the
        ``DataFrame`` need not be the same. All IDs in the distance matrix must
        be present in the ``DataFrame``. Extra IDs in the ``DataFrame`` are
        allowed (they are ignored in the calculations).
    column : str, optional
        Column name to use as the grouping vector if `grouping` is a
        ``DataFrame``. Must be provided if `grouping` is a ``DataFrame``.
        Cannot be provided if `grouping` is 1-D ``array_like``.
    test : {'centroid', 'median'}, optional
        Determines whether the analysis is done using the centroid or the
        spatial median of each group. Defaults to ``'median'``.
    permutations : int, optional
        Number of permutations to use when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the p-value
        will be ``np.nan``.

    Returns
    -------
    pandas.Series
        Results of the statistical test, including ``test statistic`` and
        ``p-value``.

    Raises
    ------
    TypeError
        If, when using the spatial median test, the pcoa ordination is not of
        type np.float32 or np.float64, the spatial median function will fail
        and the centroid test should be used instead
    ValueError
        If the test is not centroid or median.
    TypeError
        If the distance matrix is not an instance of a
        ``skbio.DistanceMatrix``.
    ValueError
        If there is only one group
    ValueError
        If a list and a column name are both provided
    ValueError
        If a list is provided for `grouping` and its length does not match
        the number of ids in distance_matrix
    ValueError
        If all of the values in the grouping vector are unique
    KeyError
        If there are ids in grouping that are not in distance_matrix

    See Also
    --------
    permanova
    anosim

    Notes
    -----
    The significance of the results from this function will be the same as the
    results found in vegan's betadisper, however due to floating point
    variability the F-statistic results may vary slightly.

    See [1]_ for the original method reference, as well as
    ``vegan::betadisper``, available in R's vegan package [2]_.

    References
    ----------
    .. [1] Anderson, Marti J. "Distance-Based Tests for Homogeneity of
        Multivariate Dispersions." Biometrics 62 (2006):245-253

    .. [2] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    Load a 6x6 distance matrix and grouping vector denoting 2 groups of
    objects:

    >>> from skbio import DistanceMatrix
    >>> dm = DistanceMatrix([[0,    0.5,  0.75, 1, 0.66, 0.33],
    ...                       [0.5,  0,    0.25, 0.33, 0.77, 0.61],
    ...                       [0.75, 0.25, 0,    0.1,  0.44, 0.55],
    ...                       [1,    0.33, 0.1,  0,    0.75, 0.88],
    ...                       [0.66, 0.77, 0.44, 0.75, 0,    0.77],
    ...                       [0.33, 0.61, 0.55, 0.88, 0.77, 0]],
    ...                       ['s1', 's2', 's3', 's4', 's5', 's6'])
    >>> grouping = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

    Run PERMDISP using 99 permutations to calculate the p-value:

    >>> from skbio.stats.distance import permdisp
    >>> import numpy as np
    >>> # make output deterministic; should not be included during normal use
    >>> np.random.seed(0)
    >>> permdisp(dm, grouping, permutations=99)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             1.03296
    p-value                       0.35
    number of permutations          99
    Name: PERMDISP results, dtype: object

    The return value is a ``pandas.Series`` object containing the results of
    the statistical test.

    To suppress calculation of the p-value and only obtain the F statistic,
    specify zero permutations:

    >>> permdisp(dm, grouping, permutations=0)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             1.03296
    p-value                        NaN
    number of permutations           0
    Name: PERMDISP results, dtype: object

    PERMDISP computes variances based on two types of tests, using either
    centroids or spatial medians, also commonly referred to as a geometric
    median. The spatial median is thought to yield a more robust test
    statistic, and this test is used by default. Spatial medians are computed
    using an iterative algorithm to find the optimally minimum point from all
    other points in a group while centroids are computed using a deterministic
    formula. As such the two different tests yield slightly different F
    statistics.

    >>> np.random.seed(0)
    >>> permdisp(dm, grouping, test='centroid', permutations=99)
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             3.67082
    p-value                       0.29
    number of permutations          99
    Name: PERMDISP results, dtype: object

    You can also provide a ``pandas.DataFrame`` and a column denoting the
    grouping instead of a grouping vector. The following DataFrame's
    Grouping column specifies the same grouping as the vector we used in the
    previous examples:

    >>> import pandas as pd
    >>> df = pd.DataFrame.from_dict(
    ...      {'Grouping': {'s1': 'G1', 's2': 'G1', 's3': 'G1', 's4': 'G2',
    ...                    's5': 'G2', 's6': 'G2'}})
    >>> # make output deterministic; should not be included during normal use
    >>> np.random.seed(0)
    >>> permdisp(dm, df, 'Grouping', permutations=99, test='centroid')
    method name               PERMDISP
    test statistic name        F-value
    sample size                      6
    number of groups                 2
    test statistic             3.67082
    p-value                       0.29
    number of permutations          99
    Name: PERMDISP results, dtype: object

    Note that when providing a ``DataFrame``, the ordering of rows and/or
    columns does not affect the grouping vector that is extracted. The
    ``DataFrame`` must be indexed by the distance matrix IDs (i.e., the row
    labels must be distance matrix IDs).

    If IDs (rows) are present in the ``DataFrame`` but not in the distance
    matrix, they are ignored, so the ``DataFrame`` can be a superset of the
    distance matrix IDs. Note that the reverse is not true: IDs in the
    distance matrix *must* be present in the ``DataFrame`` or an error will be
    raised.

    PERMDISP should be used to determine whether the dispersions between the
    groups in your distance matrix are significantly separated.
    A non-significant test result indicates that group dispersions are similar
    to each other. PERMANOVA or ANOSIM should then be used in conjunction to
    determine whether clustering within groups is significant.

    """
    if test not in ['centroid', 'median']:
        raise ValueError('Test must be centroid or median')

    # Validate the distance matrix and grouping before running the PCoA so
    # that malformed arguments are rejected up front. Only the sample size,
    # number of groups and normalised grouping vector are needed here.
    sample_size, num_groups, grouping, tri_idxs, distances = _preprocess_input(
        distance_matrix, grouping, column)

    # Reduce the distances to principal coordinates; the per-sample
    # coordinates are what the group centres are computed from.
    ordination = pcoa(distance_matrix)
    samples = ordination.samples

    # Bind everything except the grouping vector, which is what gets permuted
    # by the Monte Carlo driver.
    test_stat_function = partial(_compute_groups, samples, test)

    stat, p_value = _run_monte_carlo_stats(test_stat_function, grouping,
                                           permutations)

    return _build_results('PERMDISP', 'F-value', sample_size, num_groups,
                          stat, p_value, permutations)
def _compute_groups(samples, test_type, grouping):
groups = []
samples['grouping'] = grouping
if test_type == 'centroid':
centroids = samples.groupby('grouping').aggregate('mean')
elif test_type == 'median':
centroids = samples.groupby('grouping').aggregate(_config_med)
for label, df in samples.groupby('grouping'):
groups.append(cdist(df.values[:, :-1], [centroids.loc[label].values],
metric='euclidean'))
stat, _ = f_oneway(*groups)
stat = stat[0]
return stat
def _config_med(x):
    """Return the geometric median of one group of samples.

    The last column of `x` (the grouping column) is dropped, and the
    remaining values are transposed before being handed to
    ``hd.geomedian``. The result is returned as a NumPy array.
    """
    # Everything except the trailing grouping column.
    coordinates = x.values[:, :-1]
    transposed = coordinates.T
    median = hd.geomedian(transposed)
    return np.array(median)
This diff is collapsed.
#SampleID BarcodeSequence LinkerPrimerSequence BodySite Year Month Day Subject ReportedAntibioticUsage DaysSinceExperimentStart Description
L1S8 AGCTGACTAGTC GTGCCAGCMGCCGCGGTAA gut 2008 10 28 subject-1 Yes 0 subject-1.gut.2008-10-28
L1S57 ACACACTATGGC GTGCCAGCMGCCGCGGTAA gut 2009 1 20 subject-1 No 84 subject-1.gut.2009-1-20
L1S76 ACTACGTGTGGT GTGCCAGCMGCCGCGGTAA gut 2009 2 17 subject-1 No 112 subject-1.gut.2009-2-17
L1S105 AGTGCGATGCGT GTGCCAGCMGCCGCGGTAA gut 2009 3 17 subject-1 No 140 subject-1.gut.2009-3-17
L2S155 ACGATGCGACCA GTGCCAGCMGCCGCGGTAA left palm 2009 1 20 subject-1 No 84 subject-1.left-palm.2009-1-20
L2S175 AGCTATCCACGA GTGCCAGCMGCCGCGGTAA left palm 2009 2 17 subject-1 No 112 subject-1.left-palm.2009-2-17
L2S204 ATGCAGCTCAGT GTGCCAGCMGCCGCGGTAA left palm 2009 3 17 subject-1 No 140 subject-1.left-palm.2009-3-17
L2S222 CACGTGACATGT GTGCCAGCMGCCGCGGTAA left palm 2009 4 14 subject-1 No 168 subject-1.left-palm.2009-4-14
L3S242 ACAGTTGCGCGA GTGCCAGCMGCCGCGGTAA right palm 2008 10 28 subject-1 Yes 0 subject-1.right-palm.2008-10-28
L3S294 CACGACAGGCTA GTGCCAGCMGCCGCGGTAA right palm 2009 1 20 subject-1 No 84 subject-1.right-palm.2009-1-20
L3S313 AGTGTCACGGTG GTGCCAGCMGCCGCGGTAA right palm 2009 2 17 subject-1 No 112 subject-1.right-palm.2009-2-17
L3S341 CAAGTGAGAGAG GTGCCAGCMGCCGCGGTAA right palm 2009 3 17 subject-1 No 140 subject-1.right-palm.2009-3-17
L3S360 CATCGTATCAAC GTGCCAGCMGCCGCGGTAA right palm 2009 4 14 subject-1 No 168 subject-1.right-palm.2009-4-14
L5S104 CAGTGTCAGGAC GTGCCAGCMGCCGCGGTAA tongue 2008 10 28 subject-1 Yes 0 subject-1.tongue.2008-10-28
L5S155 ATCTTAGACTGC GTGCCAGCMGCCGCGGTAA tongue 2009 1 20 subject-1 No 84 subject-1.tongue.2009-1-20
L5S174 CAGACATTGCGT GTGCCAGCMGCCGCGGTAA tongue 2009 2 17 subject-1 No 112 subject-1.tongue.2009-2-17
L5S203 CGATGCACCAGA GTGCCAGCMGCCGCGGTAA tongue 2009 3 17 subject-1 No 140 subject-1.tongue.2009-3-17
L5S222 CTAGAGACTCTT GTGCCAGCMGCCGCGGTAA tongue 2009 4 14 subject-1 No 168 subject-1.tongue.2009-4-14
L1S140 ATGGCAGCTCTA GTGCCAGCMGCCGCGGTAA gut 2008 10 28 subject-2 Yes 0 subject-2.gut.2008-10-28
L1S208 CTGAGATACGCG GTGCCAGCMGCCGCGGTAA gut 2009 1 20 subject-2 No 84 subject-2.gut.2009-1-20
L1S257 CCGACTGAGATG GTGCCAGCMGCCGCGGTAA gut 2009 3 17 subject-2 No 140 subject-2.gut.2009-3-17
L1S281 CCTCTCGTGATC GTGCCAGCMGCCGCGGTAA gut 2009 4 14 subject-2 No 168 subject-2.gut.2009-4-14
L2S240 CATATCGCAGTT GTGCCAGCMGCCGCGGTAA left palm 2008 10 28 subject-2 Yes 0 subject-2.left-palm.2008-10-28
L2S309 CGTGCATTATCA GTGCCAGCMGCCGCGGTAA left palm 2009 1 20 subject-2 No 84 subject-2.left-palm.2009-1-20
L2S357 CTAACGCAGTCA GTGCCAGCMGCCGCGGTAA left palm 2009 3 17 subject-2 No 140 subject-2.left-palm.2009-3-17
L2S382 CTCAATGACTCA GTGCCAGCMGCCGCGGTAA left palm 2009 4 14 subject-2 No 168 subject-2.left-palm.2009-4-14
L3S378 ATCGATCTGTGG GTGCCAGCMGCCGCGGTAA right palm 2008 10 28 subject-2 Yes 0 subject-2.right-palm.2008-10-28
L4S63 CTCGTGGAGTAG GTGCCAGCMGCCGCGGTAA right palm 2009 1 20 subject-2 No 84 subject-2.right-palm.2009-1-20
L4S112 GCGTTACACACA GTGCCAGCMGCCGCGGTAA right palm 2009 3 17 subject-2 No 140 subject-2.right-palm.2009-3-17
L4S137 GAACTGTATCTC GTGCCAGCMGCCGCGGTAA right palm 2009 4 14 subject-2 No 168 subject-2.right-palm.2009-4-14
L5S240 CTGGACTCATAG GTGCCAGCMGCCGCGGTAA tongue 2008 10 28 subject-2 Yes 0 subject-2.tongue.2008-10-28
L6S20 GAGGCTCATCAT GTGCCAGCMGCCGCGGTAA tongue 2009 1 20 subject-2 No 84 subject-2.tongue.2009-1-20
L6S68 GATACGTCCTGA GTGCCAGCMGCCGCGGTAA tongue 2009 3 17 subject-2 No 140 subject-2.tongue.2009-3-17
L6S93 GATTAGCACTCT GTGCCAGCMGCCGCGGTAA tongue 2009 4 14 subject-2 No 168 subject-2.tongue.2009-4-14
\ No newline at end of file
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from functools import partial
from unittest import TestCase, main
import numpy as np
import numpy.testing as npt
import pandas as pd
from pandas.util.testing import assert_series_equal
from scipy.stats import f_oneway
import hdmedians as hd
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa
from skbio.stats.distance import permdisp
from skbio.stats.distance._permdisp import _compute_groups
from skbio.util import get_data_path
class testPERMDISP(TestCase):
    """Tests for ``permdisp`` and its ``_compute_groups`` helper."""

    def setUp(self):
        # test with 2 groups of equal size
        # when assigned different labels, results should be the same
        self.grouping_eq = ['foo', 'foo', 'foo', 'bar', 'bar', 'bar']
        self.grouping_eq_relab = ['pyt', 'pyt', 'pyt', 'hon', 'hon', 'hon']
        self.exp_index = ['method name', 'test statistic name', 'sample size',
                          'number of groups', 'test statistic', 'p-value',
                          'number of permutations']
        # test with 3 groups of different sizes
        # when assigned different labels results should be the same
        self.grouping_uneq = ['foo', 'foo', 'bar', 'bar', 'bar',
                              'qw', 'qw', 'qw', 'qw']

        self.grouping_uneq_relab = [12, 12, 7, 7, 7, 23, 23, 23, 23]

        self.grouping_un_mixed = ['a', 'a', 7, 7, 7, 'b', 'b', 'b', 'b']

        eq_ids = ['s1', 's2', 's3', 's4', 's5', 's6']
        uneq_ids = ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9']
        # matrix for equal grouping
        self.eq_mat = DistanceMatrix([[0, 4, 0, 0, 4, 2],
                                      [4, 0, 2, 0, 3, 1],
                                      [0, 2, 0, 5, 2, 5],
                                      [0, 0, 5, 0, 0, 2],
                                      [4, 3, 2, 0, 0, 2],
                                      [2, 1, 5, 2, 2, 0]], eq_ids)

        # matrix for unequal grouping
        self.uneq_mat = DistanceMatrix([[0, 0, 4, 0, 0, 3, 5, 3, 0],
                                        [0, 0, 0, 3, 4, 5, 3, 0, 3],
                                        [4, 0, 0, 4, 3, 1, 0, 5, 2],
                                        [0, 3, 4, 0, 0, 2, 1, 3, 5],
                                        [0, 4, 3, 0, 0, 1, 1, 5, 0],
                                        [3, 5, 1, 2, 1, 0, 2, 0, 5],
                                        [5, 3, 0, 1, 1, 2, 0, 4, 3],
                                        [3, 0, 5, 3, 5, 0, 4, 0, 4],
                                        [0, 3, 2, 5, 0, 5, 3, 4, 0]], uneq_ids)

        # null matrix for equal grouping
        self.null_mat = DistanceMatrix([[0, 0, 0, 0, 0, 0],
                                        [0, 0, 0, 0, 0, 0],
                                        [0, 0, 0, 0, 0, 0],
                                        [0, 0, 0, 0, 0, 0],
                                        [0, 0, 0, 0, 0, 0],
                                        [0, 0, 0, 0, 0, 0]], eq_ids)

        unif_ids = ['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.593', 'PC.607',
                    'PC.634', 'PC.635', 'PC.636']

        self.unifrac_dm = DistanceMatrix(
            [[0.0, 0.595483768391, 0.618074717633, 0.582763100909,
              0.566949022108, 0.714717232268, 0.772001731764, 0.690237118413,
              0.740681707488],
             [0.595483768391, 0.0, 0.581427669668, 0.613726772383,
              0.65945132763, 0.745176523638, 0.733836123821, 0.720305073505,
              0.680785600439],
             [0.618074717633, 0.581427669668, 0.0, 0.672149021573,
              0.699416863323, 0.71405573754, 0.759178215168, 0.689701276341,
              0.725100672826],
             [0.582763100909, 0.613726772383, 0.672149021573, 0.0,
              0.64756120797, 0.666018240373, 0.66532968784, 0.650464714994,
              0.632524644216],
             [0.566949022108, 0.65945132763, 0.699416863323, 0.64756120797,
              0.0, 0.703720200713, 0.748240937349, 0.73416971958,
              0.727154987937],
             [0.714717232268, 0.745176523638, 0.71405573754, 0.666018240373,
              0.703720200713, 0.0, 0.707316869557, 0.636288883818,
              0.699880573956],
             [0.772001731764, 0.733836123821, 0.759178215168, 0.66532968784,
              0.748240937349, 0.707316869557, 0.0, 0.565875193399,
              0.560605525642],
             [0.690237118413, 0.720305073505, 0.689701276341, 0.650464714994,
              0.73416971958, 0.636288883818, 0.565875193399, 0.0,
              0.575788039321],
             [0.740681707488, 0.680785600439, 0.725100672826, 0.632524644216,
              0.727154987937, 0.699880573956, 0.560605525642, 0.575788039321,
              0.0]], unif_ids)

        self.unif_grouping = ['Control', 'Control', 'Control', 'Control',
                              'Control', 'Fast', 'Fast', 'Fast', 'Fast']

        self.assert_series_equal = partial(assert_series_equal,
                                           check_index_type=True,
                                           check_series_type=True)

    def test_centroids_eq_groups(self):
        exp = [[1.2886811963240687, 1.890538910062923, 1.490527658097728],
               [2.17349240061718, 2.3192679626679946, 2.028338553903792]]
        exp_stat, _ = f_oneway(*exp)

        dm = pcoa(self.eq_mat)
        dm = dm.samples

        obs = _compute_groups(dm, 'centroid', self.grouping_eq)
        self.assertAlmostEqual(obs, exp_stat, places=6)

        obs_relab = _compute_groups(dm, 'centroid', self.grouping_eq_relab)
        self.assertAlmostEqual(obs_relab, obs, places=6)

    def test_centroids_uneq_groups(self):
        """
        the expected result here was calculated by hand
        """
        exp = [[2.5847022428144935, 2.285624595858895,
                1.7022431146340287],
               [1.724817266046108, 1.724817266046108],
               [2.4333280644972795, 2.389000390879655,
                2.8547180589306036, 3.218568759338847]]
        exp_stat, _ = f_oneway(*exp)

        dm = pcoa(self.uneq_mat)
        dm = dm.samples

        obs = _compute_groups(dm, 'centroid', self.grouping_uneq)
        self.assertAlmostEqual(obs, exp_stat, places=6)

        obs_relab = _compute_groups(dm, 'centroid', self.grouping_uneq_relab)
        self.assertAlmostEqual(obs, obs_relab, places=6)

    def test_centroids_mixedgroups(self):
        exp = [[2.5847022428144935, 2.285624595858895,
                1.7022431146340287],
               [1.724817266046108, 1.724817266046108],
               [2.4333280644972795, 2.389000390879655,
                2.8547180589306036, 3.218568759338847]]
        dm = pcoa(self.uneq_mat)
        dm = dm.samples

        exp_stat, _ = f_oneway(*exp)

        obs_mixed = _compute_groups(dm, 'centroid', self.grouping_un_mixed)
        self.assertAlmostEqual(exp_stat, obs_mixed, places=6)

    def test_centroids_null(self):
        dm = pcoa(self.null_mat)
        dm = dm.samples

        obs_null = _compute_groups(dm, 'centroid', self.grouping_eq)
        # The bare np.isnan() call used previously discarded its result, so
        # the test could never fail; assert on it explicitly.
        self.assertTrue(np.isnan(obs_null))

    def test_centroid_normal(self):
        exp = pd.Series(index=self.exp_index,
                        data=['PERMDISP', 'F-value', 9, 2, 0.244501519876,
                              0.63, 99],
                        name='PERMDISP results')

        grouping = ['Control', 'Control', 'Control', 'Control', 'Control',
                    'Fast', 'Fast', 'Fast', 'Fast']

        np.random.seed(0)
        obs = permdisp(self.unifrac_dm, grouping, test='centroid',
                       permutations=99)

        self.assert_series_equal(obs, exp)

    def test_median_normal(self):
        exp = pd.Series(index=self.exp_index,
                        data=['PERMDISP', 'F-value', 9, 2, 0.139475441876,
                              0.61, 99],
                        name='PERMDISP results')

        np.random.seed(0)
        obs = permdisp(self.unifrac_dm, self.unif_grouping, test='median',
                       permutations=99)

        self.assert_series_equal(obs, exp)

    def test_not_distance_matrix(self):
        dm = []
        grouping = ['Control', 'Control', 'Control', 'Control', 'Control',
                    'Fast', 'Fast', 'Fast', 'Fast']

        npt.assert_raises(TypeError, permdisp, dm, grouping, permutations=0)

    def test_mismatched_group(self):
        gr = ['foo', 'bar']
        npt.assert_raises(ValueError, permdisp, self.unifrac_dm, gr)

    def test_single_group(self):
        gr = ['f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']
        npt.assert_raises(ValueError, permdisp, self.unifrac_dm, gr)

    def test_no_permuations(self):
        obs = permdisp(self.eq_mat, self.grouping_eq, permutations=0)

        pval = obs['p-value']
        # As above: actually assert that the p-value is NaN instead of
        # computing np.isnan() and throwing the answer away.
        self.assertTrue(np.isnan(pval))

    def test_hdmedians(self):
        exp = np.array([2.01956244, 1.53164546, 2.60571752, 0.91424179,
                        1.76214416, 1.69943057])
        obs = np.array(hd.geomedian(self.eq_mat.data))
        npt.assert_almost_equal(obs, exp, decimal=6)

    def test_confirm_betadispr_results(self):
        mp_dm = DistanceMatrix.read(get_data_path('moving_pictures_dm.tsv'))
        mp_mf = pd.read_csv(get_data_path('moving_pictures_mf.tsv'), sep='\t')
        mp_mf.set_index('#SampleID', inplace=True)

        obs_med_mp = permdisp(mp_dm, mp_mf,
                              column='BodySite')
        obs_cen_mp = permdisp(mp_dm, mp_mf, column='BodySite',
                              test='centroid')

        exp_data_m = ['PERMDISP', 'F-value', 33, 4, 10.1956, 0.001, 999]
        exp_data_c = ['PERMDISP', 'F-value', 33, 4, 17.4242, 0.001, 999]
        exp_ind = ['method name', 'test statistic name', 'sample size',
                   'number of groups', 'test statistic', 'p-value',
                   'number of permutations']

        exp_med_mp = pd.Series(data=exp_data_m, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        exp_cen_mp = pd.Series(data=exp_data_c, index=exp_ind, dtype='object',
                               name='PERMDISP results')

        self.assert_series_equal(exp_med_mp, obs_med_mp)

        self.assert_series_equal(exp_cen_mp, obs_cen_mp)
if __name__ == '__main__':
main()
......@@ -17,6 +17,7 @@ Functions
ca
pcoa
pcoa_biplot
cca
rda
mean_and_std
......@@ -126,11 +127,11 @@ from skbio.util import TestRunner
from ._redundancy_analysis import rda
from ._correspondence_analysis import ca
from ._canonical_correspondence_analysis import cca
from ._principal_coordinate_analysis import pcoa
from ._principal_coordinate_analysis import pcoa, pcoa_biplot
from ._ordination_results import OrdinationResults
from ._utils import (mean_and_std, scale, svd_rank, corr, e_matrix, f_matrix)
__all__ = ['ca', 'rda', 'cca', 'pcoa', 'OrdinationResults',
__all__ = ['ca', 'rda', 'cca', 'pcoa', 'pcoa_biplot', 'OrdinationResults',
'mean_and_std', 'scale', 'svd_rank', 'corr',
'e_matrix', 'f_matrix']
......