Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille
--- a/.travis.yml
+++ b/.travis.yml
-# Modified from https://github.com/biocore/scikit-bio/
+# Modified from https://github.com/biocore/scikit-bio
 language: python
 env:
-  - PYTHON_VERSION=2.7 WITH_DOCTEST=False USE_CYTHON=True
-  - PYTHON_VERSION=3.5 WITH_DOCTEST=True USE_CYTHON=True
-  - PYTHON_VERSION=3.6 WITH_DOCTEST=True USE_CYTHON=True
-  - PYTHON_VERSION=3.7 WITH_DOCTEST=True USE_CYTHON=True
+  - PYTHON_VERSION=3.6 WITH_DOCTEST=True 
+  - PYTHON_VERSION=3.7 WITH_DOCTEST=True 
+  - PYTHON_VERSION=3.8 WITH_DOCTEST=True 
 before_install:
  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
  - chmod +x miniconda.sh
  - ./miniconda.sh -b
  - export PATH=/home/travis/miniconda3/bin:$PATH
 install:
-  - conda create --yes -n env_name python=$PYTHON_VERSION pip click numpy scipy pep8 flake8 coverage future six "pandas>=0.20.0" nose h5py>=2.2.0 cython
+  - conda create --yes -n env_name python=$PYTHON_VERSION pip click numpy "scipy>=1.3.1" pep8 flake8 coverage future six "pandas>=0.20.0" nose h5py>=2.2.0 cython
  - rm biom/*.c
  - source activate env_name
-  - if [ ${PYTHON_VERSION} = "2.7" ]; then pip install pyqi; fi
-  - if [ ${PYTHON_VERSION} = "2.7" ]; then conda install --yes Sphinx=1.2.2; fi
+  - if [ ${PYTHON_VERSION} = "3.6" ]; then pip install sphinx==1.2.2; fi
  - pip install coveralls
  - pip install -e . --no-deps
 script:
  - make test 
  - biom show-install-info
-  - if [ ${PYTHON_VERSION} = "2.7" ]; then make -C doc html; fi
+  - if [ ${PYTHON_VERSION} = "3.6" ]; then make -C doc html; fi
  # we can only validate the tables if we have H5PY
  - for table in examples/*hdf5.biom; do echo ${table}; biom validate-table -i ${table}; done
  # validate JSON formatted tables

--- a/ChangeLog.md
+++ b/ChangeLog.md
 BIOM-Format ChangeLog
 =====================

+biom 2.1.8
+----------
+
+New features and bug fixes, released on 6 January 2020.
+
+Important:
+
+* Python 2.7 and 3.5 support has been dropped.
+* Python 3.8 support has been added to Travis CI.
+* The default for `Table.nonzero_counts` has changed: it now counts the number of nonzero features by default. See [issue #685](https://github.com/biocore/biom-format/issues/685).
+* We now require SciPy >= 1.3.1. See [issue #816](https://github.com/biocore/biom-format/issues/816).
+
+New Features:
+
+* The detailed report is no longer part of the table validator. See [issue #378](https://github.com/biocore/biom-format/issues/378).
+* `load_table` now accepts open file handles. See [issue #481](https://github.com/biocore/biom-format/issues/481).
+* `biom export-metadata` has been added to export metadata as TSV. See [issue #820](https://github.com/biocore/biom-format/issues/820).
+
+Bug fixes:
+
+* `Table.to_dataframe(dense=False)` now correctly produces sparse data frames (and not accidentally dense ones as before). See [issue #808](https://github.com/biocore/biom-format/issues/808).
+* Order of error evaluations was unstable in Python versions without implicit `OrderedDict`. See [issue #813](https://github.com/biocore/biom-format/issues/813). Thanks @gwarmstrong for identifying this bug.
+* `Table._extract_data_from_tsv` would fail if taxonomy was provided, and if the first row had the empty string for taxonomy. See [issue #827](https://github.com/biocore/biom-format/issues/827). Thanks @KasperSkytte for identifying this bug.
+
 biom 2.1.7
 ----------


--- a/biom/cli/__init__.py
+++ b/biom/cli/__init__.py
@@ -30,6 +30,7 @@ def cli(ctx):

 import_module('biom.cli.table_summarizer')
 import_module('biom.cli.metadata_adder')
+import_module('biom.cli.metadata_exporter')
 import_module('biom.cli.table_converter')
 import_module('biom.cli.installation_informer')
 import_module('biom.cli.table_subsetter')

--- a/biom/cli/metadata_exporter.py
+++ b/biom/cli/metadata_exporter.py
+# -----------------------------------------------------------------------------
+# Copyright (c) 2011-2017, The BIOM Format Development Team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# -----------------------------------------------------------------------------
+
+import click
+
+from biom import load_table
+from biom.cli import cli
+
+
+@cli.command(name='export-metadata')
+@click.option('-i', '--input-fp', required=True,
+              type=click.Path(exists=True, dir_okay=False),
+              help='The input BIOM table')
+@click.option('-m', '--sample-metadata-fp', required=False,
+              type=click.Path(exists=False, dir_okay=False),
+              help='The sample metadata output file.')
+@click.option('--observation-metadata-fp', required=False,
+              type=click.Path(exists=False, dir_okay=False),
+              help='The observation metadata output file.')
+def export_metadata(input_fp, sample_metadata_fp, observation_metadata_fp):
+    """Export metadata as TSV.
+
+    Example usage:
+
+    Export metadata as TSV:
+
+    $ biom export-metadata -i otu_table.biom
+      --sample-metadata-fp sample.tsv
+      --observation-metadata-fp observation.tsv
+    """
+    # Load the table once; the same object is reused for both axes.
+    table = load_table(input_fp)
+
+    # Both output options are optional: an axis is exported only when its
+    # output path was supplied, so passing neither option writes nothing.
+    if sample_metadata_fp:
+        _export_metadata(table, 'sample', input_fp, sample_metadata_fp)
+    if observation_metadata_fp:
+        _export_metadata(table, 'observation', input_fp,
+                         observation_metadata_fp)
+
+
+def _export_metadata(table, axis, input_fp, output_fp):
+    """Write the metadata of one axis of `table` to `output_fp` as TSV.
+
+    Parameters
+    ----------
+    table : object
+        The loaded table; must provide ``metadata_to_dataframe(axis)``.
+    axis : str
+        The axis to export; callers in this module pass 'sample' or
+        'observation'.
+    input_fp : str
+        Path of the input table, used only in the user-facing message.
+    output_fp : str
+        Path of the tab-separated file to write.
+
+    Notes
+    -----
+    A ``KeyError`` raised while building or writing the frame is not
+    propagated; a message is echoed instead and no exception reaches the
+    caller. NOTE(review): presumably ``metadata_to_dataframe`` raises
+    ``KeyError`` when the axis has no metadata -- confirm.
+    """
+    try:
+        metadata = table.metadata_to_dataframe(axis)
+        # Tab-separated output, index included (pandas default).
+        metadata.to_csv(output_fp, sep='\t')
+    except KeyError:
+        click.echo('File {} does not contain {} metadata'.format(input_fp,
+                                                                 axis))
--- a/biom/cli/table_validator.py
+++ b/biom/cli/table_validator.py
@@ -29,9 +29,7 @@ from biom.util import HAVE_H5PY, biom_open, is_hdf5_file
                   ' specification')
 @click.option('-f', '--format-version', default=None,
              help='The specific format version to validate against')
-@click.option('--detailed-report', is_flag=True, default=False,
-              help='Include more details in the output report')
-def validate_table(input_fp, format_version, detailed_report):
+def validate_table(input_fp, format_version):
    """Validate a BIOM-formatted file.

    Test a file for adherence to the Biological Observation Matrix (BIOM)
@@ -46,7 +44,7 @@ def validate_table(input_fp, format_version, detailed_report):
    $ biom validate-table -i table.biom

    """
-    valid, report = _validate_table(input_fp, format_version, detailed_report)
+    valid, report = _validate_table(input_fp, format_version)
    click.echo("\n".join(report))
    if valid:
        # apparently silence is too quiet to be golden.
@@ -57,9 +55,8 @@ def validate_table(input_fp, format_version, detailed_report):
        sys.exit(1)


-def _validate_table(input_fp, format_version=None, detailed_report=False):
-    result = TableValidator()(table=input_fp, format_version=format_version,
-                              detailed_report=detailed_report)
+def _validate_table(input_fp, format_version=None):
+    result = TableValidator()(table=input_fp, format_version=format_version)
    return result['valid_table'], result['report_lines']


@@ -108,23 +105,15 @@ class TableValidator(object):
                raise IOError("h5py is not installed, can only validate JSON "
                              "tables")

-    def __call__(self, table, format_version=None, detailed_report=False):
-        return self.run(table=table, format_version=format_version,
-                        detailed_report=detailed_report)
+    def __call__(self, table, format_version=None):
+        return self.run(table=table, format_version=format_version)

    def _validate_hdf5(self, **kwargs):
        table = kwargs['table']

-        # Need to make this an attribute so that we have this info during
-        # validation.
-        detailed_report = kwargs['detailed_report']
-
        report_lines = []
        valid_table = True

-        if detailed_report:
-            report_lines.append("Validating BIOM table...")
-
        required_attrs = [
            ('format-url', self._valid_format_url),
            ('format-version', self._valid_hdf5_format_version),
@@ -154,9 +143,6 @@ class TableValidator(object):
                report_lines.append("Missing attribute: '%s'" % required_attr)
                continue

-            if detailed_report:
-                report_lines.append("Validating '%s'..." % required_attr)
-
            status_msg = attr_validator(table)

            if len(status_msg) > 0:
@@ -166,20 +152,12 @@ class TableValidator(object):
        for group in required_groups:
            if group not in table:
                valid_table = False
-                if detailed_report:
-                    report_lines.append("Missing group: %s" % group)

        for dataset in required_datasets:
            if dataset not in table:
                valid_table = False
-                if detailed_report:
-                    report_lines.append("Missing dataset: %s" % dataset)

        if 'shape' in table.attrs:
-            if detailed_report:
-                report_lines.append("Validating 'shape' versus number of "
-                                    "samples and observations...")
-
            n_obs, n_samp = table.attrs['shape']
            obs_ids = table.get('observation/ids', None)
            samp_ids = table.get('sample/ids', None)
@@ -270,14 +248,10 @@ class TableValidator(object):
        # Need to make this an attribute so that we have this info during
        # validation.
        self._format_version = kwargs['format_version']
-        detailed_report = kwargs['detailed_report']

        report_lines = []
        valid_table = True

-        if detailed_report:
-            report_lines.append("Validating BIOM table...")
-
        required_keys = [
            ('format', self._valid_format),
            ('format_url', self._valid_format_url),
@@ -299,9 +273,6 @@ class TableValidator(object):
                report_lines.append("Missing field: '%s'" % key)
                continue

-            if detailed_report:
-                report_lines.append("Validating '%s'..." % key)
-
            status_msg = method(table_json)

            if len(status_msg) > 0:
@@ -309,10 +280,6 @@ class TableValidator(object):
                report_lines.append(status_msg)

        if 'shape' in table_json:
-            if detailed_report:
-                report_lines.append("Validating 'shape' versus number of rows "
-                                    "and columns...")
-
            if ('rows' in table_json and
                    len(table_json['rows']) != table_json['shape'][0]):
                valid_table = False

--- a/biom/err.py
+++ b/biom/err.py
@@ -75,7 +75,8 @@ OBSMDSIZE = "Size of observation metadata differs from matrix size!"
 SAMPMDSIZE = "Size of sample metadata differs from matrix size!"


-def _test_empty(t):
+# NOTE(review): _zz_ aims to sort this last, but test() sorts by errtype key
+def _zz_test_empty(t):
    """Check if t is empty"""
    return t.is_empty()

@@ -250,8 +251,9 @@ class ErrorProfile(object):
        if not args:
            args = self._test.keys()

-        for errtype in args:
+        for errtype in sorted(args):
            test = self._test.get(errtype, lambda: None)
+
            if test(item):
                return self._handle_error(errtype, item)

@@ -318,7 +320,7 @@ class ErrorProfile(object):


 __errprof = ErrorProfile()
-__errprof.register('empty', EMPTY, 'ignore', _test_empty,
+__errprof.register('empty', EMPTY, 'ignore', _zz_test_empty,
                   exception=TableException)
 __errprof.register('obssize', OBSSIZE, 'raise', _test_obssize,
                   exception=TableException)

--- a/biom/parse.py
+++ b/biom/parse.py
@@ -12,6 +12,8 @@ from __future__ import division

 import numpy as np
 from future.utils import string_types
+import io
+import h5py

 from biom.exception import BiomParseException, UnknownAxisError
 from biom.table import Table
@@ -341,13 +343,14 @@ def parse_uc(fh):
    return Table(data, observation_ids=observation_ids, sample_ids=sample_ids)


-def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
-    r"""Parses the biom table stored in the filepath `fp`
+def parse_biom_table(file_obj, ids=None, axis='sample', input_is_dense=False):
+    r"""Parses the biom table stored in `file_obj`

    Parameters
    ----------
-    fp : file like
-        File alike object storing the BIOM table
+    file_obj : file-like object, or list
+        file-like object storing the BIOM table (tab-delimited or JSON), or
+        a list of lines of the BIOM table in tab-delimited or JSON format
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
@@ -360,7 +363,7 @@ def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    Returns
    -------
    Table
-        The BIOM table stored at fp
+        The BIOM table stored at file_obj

    Raises
    ------
@@ -391,34 +394,36 @@ def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
        UnknownAxisError(axis)

    try:
-        return Table.from_hdf5(fp, ids=ids, axis=axis)
+        return Table.from_hdf5(file_obj, ids=ids, axis=axis)
    except ValueError:
        pass
    except RuntimeError:
        pass
-    if hasattr(fp, 'read'):
-        old_pos = fp.tell()
+    if hasattr(file_obj, 'read'):
+        old_pos = file_obj.tell()
        # Read in characters until first non-whitespace
        # If it is a {, then this is (most likely) JSON
-        c = fp.read(1)
+        c = file_obj.read(1)
        while c.isspace():
-            c = fp.read(1)
+            c = file_obj.read(1)
        if c == '{':
-            fp.seek(old_pos)
-            t = Table.from_json(json.load(fp, object_pairs_hook=OrderedDict),
+            file_obj.seek(old_pos)
+            t = Table.from_json(json.load(file_obj,
+                                          object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        else:
-            fp.seek(old_pos)
-            t = Table.from_tsv(fp, None, None, lambda x: x)
-    elif isinstance(fp, list):
+            file_obj.seek(old_pos)
+            t = Table.from_tsv(file_obj, None, None, lambda x: x)
+    elif isinstance(file_obj, list):
        try:
-            t = Table.from_json(json.loads(''.join(fp),
+            t = Table.from_json(json.loads(''.join(file_obj),
                                           object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        except ValueError:
-            t = Table.from_tsv(fp, None, None, lambda x: x)
+            t = Table.from_tsv(file_obj, None, None, lambda x: x)
    else:
-        t = Table.from_json(json.loads(fp, object_pairs_hook=OrderedDict),
+        t = Table.from_json(json.loads(file_obj,
+                                       object_pairs_hook=OrderedDict),
                            input_is_dense=input_is_dense)

    def subset_ids(data, id_, md):
@@ -632,7 +637,8 @@ def load_table(f):

    Parameters
    ----------
-    f : str
+    f : str or file-like object
+        The entity to parse

    Returns
    -------
@@ -655,6 +661,12 @@ def load_table(f):
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    """
+    if isinstance(f, (io.IOBase, h5py.File)):
+        try:
+            table = parse_biom_table(f)
+        except (IndexError, TypeError):
+            raise TypeError("%s does not appear to be a BIOM file!" % f)
+    else:
        with biom_open(f) as fp:
            try:
                table = parse_biom_table(fp)

--- a/biom/table.py
+++ b/biom/table.py
@@ -178,7 +178,7 @@ import scipy.stats
 from copy import deepcopy
 from datetime import datetime
 from json import dumps
-from functools import reduce
+from functools import reduce, partial
 from operator import itemgetter
 from future.builtins import zip
 from future.utils import viewitems
@@ -2822,7 +2822,7 @@ class Table(object):
        Parameters
        ----------
        inplace : bool, optional
-            Defaults to ``False``
+            Defaults to ``True``

        Returns
        -------
@@ -3103,7 +3103,7 @@ class Table(object):
            for col_idx in indices[start:end]:
                yield (obs_id, samp_ids[col_idx])

-    def nonzero_counts(self, axis, binary=False):
+    def nonzero_counts(self, axis, binary=True):
        """Get nonzero summaries about an axis

        Parameters
@@ -3111,7 +3111,7 @@ class Table(object):
        axis : {'sample', 'observation', 'whole'}
            The axis on which to count nonzero entries
        binary : bool, optional
-            Defaults to ``False``. If ``True``, return number of nonzero
+            Defaults to ``True``. If ``True``, return number of nonzero
            entries. If ``False``, sum the values of the entries.

        Returns
@@ -3252,26 +3252,26 @@ class Table(object):
        alignable_o = self_o == other_o
        alignable_s = self_s == other_s

-        if axis is 'both' and not (alignable_o and alignable_s):
+        if axis == 'both' and not (alignable_o and alignable_s):
            raise DisjointIDError("Cannot align both axes")
-        elif axis is 'sample' and not alignable_s:
+        elif axis == 'sample' and not alignable_s:
            raise DisjointIDError("Cannot align samples")
-        elif axis is 'observation' and not alignable_o:
+        elif axis == 'observation' and not alignable_o:
            raise DisjointIDError("Cannot align observations")
-        elif axis is 'detect' and not (alignable_o or alignable_s):
+        elif axis == 'detect' and not (alignable_o or alignable_s):
            raise DisjointIDError("Neither axis appears alignable")

-        if axis is 'both':
+        if axis == 'both':
            order = ['observation', 'sample']
-        elif axis is 'detect':
+        elif axis == 'detect':
            order = []
            if alignable_s:
                order.append('sample')
            if alignable_o:
                order.append('observation')
-        elif axis is 'sample':
+        elif axis == 'sample':
            order = ['sample']
-        elif axis is 'observation':
+        elif axis == 'observation':
            order = ['observation']
        else:
            raise UnknownAxisError("Unrecognized axis: %s" % axis)
@@ -3506,18 +3506,18 @@ class Table(object):

        """
        # determine the sample order in the resulting table
-        if sample is 'union':
+        if sample == 'union':
            new_samp_order = self._union_id_order(self.ids(), other.ids())
-        elif sample is 'intersection':
+        elif sample == 'intersection':
            new_samp_order = self._intersect_id_order(self.ids(), other.ids())
        else:
            raise TableException("Unknown sample merge type: %s" % sample)

        # determine the observation order in the resulting table
-        if observation is 'union':
+        if observation == 'union':
            new_obs_order = self._union_id_order(
                self.ids(axis='observation'), other.ids(axis='observation'))
-        elif observation is 'intersection':
+        elif observation == 'intersection':
            new_obs_order = self._intersect_id_order(
                self.ids(axis='observation'), other.ids(axis='observation'))
        else:
@@ -4045,9 +4045,10 @@ html
            mat = self.matrix_data.toarray()
            constructor = pd.DataFrame
        else:
-            mat = [pd.SparseSeries(r.toarray().squeeze())
-                   for r in self.matrix_data.tocsr()]
-            constructor = pd.SparseDataFrame
+            mat = self.matrix_data
+            constructor = partial(pd.SparseDataFrame,
+                                  default_fill_value=0,
+                                  copy=True)

        return constructor(mat, index=index, columns=columns)

@@ -4688,6 +4689,14 @@ html

        .. shownumpydoc
        """
+        def isfloat(value):
+            # see https://stackoverflow.com/a/20929881
+            try:
+                float(value)
+                return True
+            except ValueError:
+                return False
+
        if not isinstance(lines, list):
            try:
                hasattr(lines, 'seek')
@@ -4706,37 +4715,28 @@ html
                # Covers the case where the first line is the header
                # and there is no indication of it (no comment character)
                if not header:
-                    header = line.strip().split(delim)[1:]
+                    header = line.rstrip().split(delim)[1:]
                    data_start = list_index + 1
                else:
                    data_start = list_index
                break
            list_index += 1
            header = line.strip().split(delim)[1:]
-        # If the first line is the header, then we need to get the next
+
+        # If the first line is the header, then we need to get the data lines
        # line for the "last column" check
        if isinstance(lines, list):
-            line = lines[data_start]
+            value_checks = lines[data_start:]
        else:
            lines.seek(0)
-            for index in range(0, data_start + 1):
-                line = lines.readline()
+            for index in range(0, data_start):
+                lines.readline()
+            value_checks = [line for line in lines]

        # attempt to determine if the last column is non-numeric, ie, metadata
-        first_values = line.strip().split(delim)
-        last_value = first_values[-1]
-        last_column_is_numeric = True
-
-        if '.' in last_value:
-            try:
-                float(last_value)
-            except ValueError:
-                last_column_is_numeric = False
-        else:
-            try:
-                int(last_value)
-            except ValueError:
-                last_column_is_numeric = False
+        last_values = [line.rsplit(delim, 1)[-1].strip()
+                       for line in value_checks]
+        last_column_is_numeric = all([isfloat(i) for i in last_values])

        # determine sample ids
        if last_column_is_numeric:
@@ -4761,13 +4761,13 @@ html
            lines = lines[data_start:]

        for lineno, line in enumerate(lines, data_start):
-            line = line.strip()
-            if not line:
+            if not line.strip():
                continue
            if line.startswith('#'):
                continue

-            fields = line.strip().split(delim)
+            fields = line.split(delim)
+            fields[-1] = fields[-1].strip()
            obs_ids.append(fields[0])

            if last_column_is_numeric:

--- a/biom/tests/test_cli/test_validate_table.py
+++ b/biom/tests/test_cli/test_validate_table.py
@@ -121,9 +121,8 @@ class TableValidatorTests(TestCase):
        f.close()
        self.to_remove.append('valid_test3')

-        obs = self.cmd(table='valid_test3', detailed_report=True)
+        obs = self.cmd(table='valid_test3')
        self.assertTrue(obs['valid_table'])
-        self.assertTrue(len(obs['report_lines']) > 0)

    def test_invalid(self):
        """Correctly invalidates a table that is... invalid."""

--- a/biom/tests/test_err.py
+++ b/biom/tests/test_err.py
@@ -11,12 +11,14 @@
 from unittest import TestCase, main
 from copy import deepcopy

+import numpy as np
+
 from biom import example_table, Table
 from biom.exception import TableException
-from biom.err import (_test_empty, _test_obssize, _test_sampsize, _test_obsdup,
-                      _test_sampdup, _test_obsmdsize, _test_sampmdsize,
-                      errstate, geterr, seterr, geterrcall, seterrcall,
-                      errcheck, __errprof)
+from biom.err import (_zz_test_empty, _test_obssize, _test_sampsize,
+                      _test_obsdup, _test_sampdup, _test_obsmdsize,
+                      _test_sampmdsize, errstate, geterr, seterr, geterrcall,
+                      seterrcall, errcheck, __errprof)


 runtime_ep = __errprof
@@ -30,8 +32,8 @@ class ErrModeTests(TestCase):
        self.ex_table = example_table.copy()

    def test_test_empty(self):
-        self.assertTrue(_test_empty(Table([], [], [])))
-        self.assertFalse(_test_empty(self.ex_table))
+        self.assertTrue(_zz_test_empty(Table([], [], [])))
+        self.assertFalse(_zz_test_empty(self.ex_table))

    def test_test_obssize(self):
        self.assertFalse(_test_obssize(self.ex_table))
@@ -87,6 +89,17 @@ class ErrorProfileTests(TestCase):
        self.assertTrue(isinstance(self.ep.test(self.ex_table, 'obssize'),
                                   TableException))

+    def test_test_evaluation_order(self):
+        # issue 813
+        tab = Table(np.array([[1, 2], [3, 4]]), ['A', 'B'], ['C', 'D'])
+        tab._observation_ids = np.array(['A', 'A'], dtype='object')
+        tab._sample_ids = np.array(['B', 'B'], dtype='object')
+
+        self.assertEqual(self.ep.test(tab, 'obsdup', 'sampdup').args[0],
+                         'Duplicate observation IDs')
+        self.assertEqual(self.ep.test(tab, 'sampdup', 'obsdup').args[0],
+                         'Duplicate observation IDs')
+
    def test_state(self):
        self.ep.state = {'all': 'ignore'}
        self.assertEqual(set(self.ep._state.values()), set(['ignore']))

--- a/biom/tests/test_parse.py
+++ b/biom/tests/test_parse.py
@@ -16,7 +16,8 @@ from unittest import TestCase, main
 import numpy as np
 import numpy.testing as npt

-from biom.parse import generatedby, MetadataMap, parse_biom_table, parse_uc
+from biom.parse import (generatedby, MetadataMap, parse_biom_table, parse_uc,
+                        load_table)
 from biom.table import Table
 from biom.util import HAVE_H5PY, __version__
 from biom.tests.long_lines import (uc_empty, uc_invalid_id, uc_minimal,
@@ -237,6 +238,32 @@ class ParseTests(TestCase):
        Table.from_hdf5(h5py.File('test_data/test.biom'))
        os.chdir(cwd)

+    @npt.dec.skipif(HAVE_H5PY is False, msg='H5PY is not installed')
+    def test_load_table_filepath(self):
+        cwd = os.getcwd()
+        if '/' in __file__[1:]:
+            os.chdir(__file__.rsplit('/', 1)[0])
+        load_table('test_data/test.biom')
+        os.chdir(cwd)
+
+    @npt.dec.skipif(HAVE_H5PY is False, msg='H5PY is not installed')
+    def test_load_table_inmemory(self):
+        cwd = os.getcwd()
+        if '/' in __file__[1:]:
+            os.chdir(__file__.rsplit('/', 1)[0])
+        load_table(h5py.File('test_data/test.biom'))
+        os.chdir(cwd)
+
+    def test_load_table_inmemory_json(self):
+        cwd = os.getcwd()
+        if '/' in __file__[1:]:
+            os.chdir(__file__.rsplit('/', 1)[0])
+        load_table(open('test_data/test.json'))
+        os.chdir(cwd)
+
+    def test_load_table_inmemory_stringio(self):
+        load_table(StringIO('\n'.join(self.classic_otu_table1_no_tax)))
+
    def test_parse_biom_table(self):
        """tests for parse_biom_table when we do not have h5py"""
        # This is a TSV as a list of lines

--- a/biom/tests/test_table.py
+++ b/biom/tests/test_table.py
@@ -1475,10 +1475,17 @@ class TableTests(TestCase):
    def test_to_dataframe(self):
        exp = pd.SparseDataFrame(np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]),
                                 index=['O1', 'O2'],
-                                 columns=['S1', 'S2', 'S3'])
+                                 columns=['S1', 'S2', 'S3'],
+                                 default_fill_value=0.0)
        obs = example_table.to_dataframe()
        pdt.assert_frame_equal(obs, exp)

+    def test_to_dataframe_is_sparse(self):
+        df = example_table.to_dataframe()
+        density = (float(example_table.matrix_data.getnnz()) /
+                   np.prod(example_table.shape))
+        assert np.allclose(df.density, density)
+
    def test_to_dataframe_dense(self):
        exp = pd.DataFrame(np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]),
                           index=['O1', 'O2'],
@@ -2228,9 +2235,9 @@ class SparseTableTests(TestCase):
        exp_obs = np.array([14, 15, 0])
        exp_whole = np.array([29])

-        obs_samp = st.nonzero_counts('sample')
-        obs_obs = st.nonzero_counts('observation')
-        obs_whole = st.nonzero_counts('whole')
+        obs_samp = st.nonzero_counts('sample', binary=False)
+        obs_obs = st.nonzero_counts('observation', binary=False)
+        obs_whole = st.nonzero_counts('whole', binary=False)

        npt.assert_equal(obs_samp, exp_samp)
        npt.assert_equal(obs_obs, exp_obs)
@@ -3771,6 +3778,47 @@ class SparseTableTests(TestCase):
        obs = Table._extract_data_from_tsv(input, dtype=int)
        npt.assert_equal(obs, exp)

+    def test_extract_data_from_tsv_bad_metadata(self):
+        input = legacy_otu_table_bad_metadata.splitlines()
+        samp_ids = ['Fing', 'Key', 'NA']
+        obs_ids = ['0', '1', '7', '3', '4']
+        metadata = [
+            '',
+            'Bacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Lactobacillal'
+            'es; Lactobacillales; Streptococcaceae; Streptococcus',
+            'Bacteria; Actinobacteria; Actinobacteridae; Gordoniaceae; Coryneb'
+            'acteriaceae',
+            'Bacteria; Firmicutes; Alicyclobacillaceae; Bacilli; Staphylococca'
+            'ceae',
+            'Bacteria; Cyanobacteria; Chloroplasts; vectors']
+        md_name = 'Consensus Lineage'
+        data = [[0, 0, 19111], [0, 1, 44536], [0, 2, 42],
+                [1, 0, 1216], [1, 1, 3500], [1, 2, 6],
+                [2, 0, 1803], [2, 1, 1184], [2, 2, 2],
+                [3, 0, 1722], [3, 1, 4903], [3, 2, 17],
+                [4, 0, 589], [4, 1, 2074], [4, 2, 34]]
+
+        exp = (samp_ids, obs_ids, data, metadata, md_name)
+        obs = Table._extract_data_from_tsv(input, dtype=int)
+        npt.assert_equal(obs, exp)
+
+        # and assert the exact identified bug in #827 is resolved
+        input = extract_tsv_bug.splitlines()
+        samp_ids = ['s1', 's2']
+        obs_ids = ['1', '2', '3']
+        metadata = [
+            '',
+            'k__test;p__test',
+            'k__test;p__test']
+        md_name = 'taxonomy'
+        data = [[0, 0, 123], [0, 1, 32],
+                [1, 0, 315], [1, 1, 3],
+                [2, 1, 22]]
+
+        exp = (samp_ids, obs_ids, data, metadata, md_name)
+        obs = Table._extract_data_from_tsv(input, dtype=int)
+        npt.assert_equal(obs, exp)
+
    def test_identify_bad_value(self):
        pos = [str(i) for i in range(10)]
        exp = (None, None)
@@ -4116,6 +4164,21 @@ ae; Corynebacteriaceae
 aphylococcaceae
 4\t589\t2074\t34\tBacteria; Cyanobacteria; Chloroplasts; vectors
 """
+legacy_otu_table_bad_metadata = u"""# some comment goes here
+#OTU id\tFing\tKey\tNA\tConsensus Lineage
+0\t19111\t44536\t42 \t
+1\t1216\t3500\t6\tBacteria; Firmicutes; Alicyclobacillaceae; Bacilli; La\
+ctobacillales; Lactobacillales; Streptococcaceae; Streptococcus
+7\t1803\t1184\t2\tBacteria; Actinobacteria; Actinobacteridae; Gordoniace\
+ae; Corynebacteriaceae
+3\t1722\t4903\t17\tBacteria; Firmicutes; Alicyclobacillaceae; Bacilli; St\
+aphylococcaceae
+4\t589\t2074\t34\tBacteria; Cyanobacteria; Chloroplasts; vectors
+"""
+extract_tsv_bug = """#OTU ID	s1	s2	taxonomy
+1	123	32\t
+2	315	3	k__test;p__test
+3	0	22	k__test;p__test"""
 otu_table1 = u"""# Some comment
 #OTU ID\tFing\tKey\tNA\tConsensus Lineage
 0\t19111\t44536\t42\tBacteria; Actinobacteria; Actinobacteridae; \

--- a/biom/util.py
+++ b/biom/util.py
@@ -9,7 +9,6 @@
 # ----------------------------------------------------------------------------

 import os
-import sys
 import inspect
 from contextlib import contextmanager
 import io
@@ -27,10 +26,6 @@ try:
    import h5py
    HAVE_H5PY = True

-    if sys.version_info.major == 2:
-        H5PY_VLEN_STR = h5py.special_dtype(vlen=unicode)  # noqa
-        H5PY_VLEN_UNICODE = h5py.special_dtype(vlen=unicode)  # noqa
-    else:
    H5PY_VLEN_STR = h5py.special_dtype(vlen=str)
    H5PY_VLEN_UNICODE = h5py.special_dtype(vlen=str)

@@ -50,7 +45,7 @@ __url__ = "http://biom-format.org"
 __maintainer__ = "Daniel McDonald"
 __email__ = "daniel.mcdonald@colorado.edu"
 __format_version__ = (2, 1)
-__version__ = "2.1.7"
+__version__ = "2.1.8"


 def generate_subsamples(table, n, axis='sample', by_id=False):
@@ -390,7 +385,8 @@ def is_gzip(fp):
    project, but we obtained permission from the authors of this function to
    port it to the BIOM Format project (and keep it under BIOM's BSD license).
    """
-    return open(fp, 'rb').read(2) == b'\x1f\x8b'
+    with open(fp, 'rb') as f:
+        return f.read(2) == b'\x1f\x8b'


 @contextmanager

--- a/debian/changelog
+++ b/debian/changelog
+python-biom-format (2.1.8+dfsg-1) unstable; urgency=medium
+
+  * Drop cython from Build-Depends
+    Closes: #937605
+  * New upstream version
+  * Set upstream metadata fields: Bug-Submit.
+
+ -- Andreas Tille <tille@debian.org>  Mon, 20 Jan 2020 11:35:04 +0100
+
 python-biom-format (2.1.7+dfsg-5) unstable; urgency=medium

  * Set upstream metadata fields: Bug-Database, Repository, Repository-

--- a/debian/control
+++ b/debian/control
@@ -6,7 +6,6 @@ Testsuite: autopkgtest-pkg-python
 Priority: optional
 Build-Depends: debhelper-compat (= 12),
               dh-python,
-               cython,
               help2man,
               bash-completion,
               cython3,

--- a/debian/upstream/metadata
+++ b/debian/upstream/metadata
+Bug-Submit: https://github.com/biocore/biom-format/issues/new
 Reference:
  Author: >
    Daniel McDonald and Jose C. Clemente and Justin Kuczynski and Jai

--- a/doc/conf.py
+++ b/doc/conf.py
@@ -66,8 +66,8 @@ copyright = u'2011-2018 The BIOM Format Development Team'
 # built documents.
 #
 # The full version, including alpha/beta/rc tags.
-version = "2.1.7"
-release = "2.1.7"
+version = "2.1.8"
+release = "2.1.8"

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,6 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------

-import os
 import sys

 from setuptools import setup, find_packages
@@ -21,6 +20,13 @@ try:
 except ImportError:
    raise ImportError("numpy must be installed prior to installing biom")

+
+try:
+    from Cython.Build import cythonize
+except ImportError:
+    raise ImportError("cython must be installed prior to installing biom")
+
+
 # Hack to prevent stupid "TypeError: 'NoneType' object is not callable" error
 # in multiprocessing/util.py _exit_function when running `python
 # setup.py test` (see
@@ -37,7 +43,7 @@ __copyright__ = "Copyright 2011-2017, The BIOM Format Development Team"
 __credits__ = ["Greg Caporaso", "Daniel McDonald", "Jose Clemente",
               "Jai Ram Rideout", "Jorge Cañardo Alastuey", "Michael Hall"]
 __license__ = "BSD"
-__version__ = "2.1.7"
+__version__ = "2.1.8"
 __maintainer__ = "Daniel McDonald"
 __email__ = "mcdonadt@colorado.edu"

@@ -92,10 +98,9 @@ classes = """
    Topic :: Software Development :: Libraries :: Application Frameworks
    Topic :: Software Development :: Libraries :: Python Modules
    Programming Language :: Python
-    Programming Language :: Python :: 2.7
-    Programming Language :: Python :: 3.4
-    Programming Language :: Python :: 3.5
    Programming Language :: Python :: 3.6
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
    Programming Language :: Python :: Implementation :: CPython
    Operating System :: OS Independent
    Operating System :: POSIX :: Linux
@@ -104,8 +109,7 @@ classes = """
 classifiers = [s.strip() for s in classes.split('\n') if s]

 # Dealing with Cython
-USE_CYTHON = os.environ.get('USE_CYTHON', False)
-ext = '.pyx' if USE_CYTHON else '.c'
+ext = '.pyx'
 extensions = [Extension("biom._filter",
                        ["biom/_filter" + ext],
                        include_dirs=[np.get_include()]),
@@ -115,22 +119,15 @@ extensions = [Extension("biom._filter",
              Extension("biom._subsample",
                        ["biom/_subsample" + ext],
                        include_dirs=[np.get_include()])]
-
-if USE_CYTHON:
-    from Cython.Build import cythonize
 extensions = cythonize(extensions)

 install_requires = ["click", "numpy >= 1.9.2", "future >= 0.16.0",
-                    "scipy >= 0.13.0", 'pandas >= 0.20.0',
-                    "six >= 1.10.0"]
+                    "scipy >= 1.3.1", 'pandas >= 0.20.0',
+                    "six >= 1.10.0", "cython >= 0.29"]

-# HACK: for backward-compatibility with QIIME 1.9.x, pyqi must be installed.
-# pyqi is not used anymore in this project.
 if sys.version_info[0] < 3:
-    install_requires.append("pyqi")
-    import warnings
-    warnings.warn("Python 2.7 support will be removed on the next release",
-                  DeprecationWarning)
+    raise SystemExit("Python 2.7 is no longer supported")
+

 setup(name='biom-format',
      version=__version__,