New upstream version 0.2.5

c4ee398e · Håvard F. Aasen · 7a315a1a · c4ee398e · c4ee398e · 7a315a1a
Commit c4ee398e authored 3 years ago by Håvard F. Aasen
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
+name: CI
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [macos-latest, windows-latest, ubuntu-latest]
+        python-version: [2.7, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      # https://github.com/actions/cache/blob/main/examples.md#using-a-script-to-get-cache-location
+      - id: pip-cache
+        run: python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)"
+      - uses: actions/cache@v1
+        with:
+          path: ${{ steps.pip-cache.outputs.dir }}
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+      - run: pip install --upgrade check-manifest flake8 isort setuptools
+      - run: check-manifest
+      - run: flake8 .
+      - run: isort . --check-only
+      - run: pip install .[test]
+      - run: nosetests
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 *.pyc
 *.swp
 *.swo
-.tox
 *.egg-info
 docs/_build
 dist

--- a/.travis.yml
+++ b/.travis.yml
-dist: xenial
-language: python
-python:
-  - "2.7"
-  - "3.4"
-  - "3.5"
-  - "3.6"
-  - "3.7"
-# command to install dependencies
-install:
-    - if [[ $TRAVIS_PYTHON_VERSION == 3* ]]; then pip install -r requirements-py3.txt; else pip install -r requirements-py2.txt; fi
-# command to run tests
-script: nosetests tests
-sudo: false
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
 The following individuals have contributed code to agate-excel:

 * `Christopher Groskopf <https://github.com/onyxfish>`_
-* `James McKinney <https://github.com/jpmckinney>`_
 * `Ben Welsh <https://github.com/palewire>`_
+* `James McKinney <https://github.com/jpmckinney>`_
 * `Peter M. Landwehr <https://github.com/pmlandwehr>`_
-* `Tim Freund <https://github.com/timfreund>`_
 * `Jani Mikkonen <https://github.com/rasjani>`_
+* `Tim Freund <https://github.com/timfreund>`_
 * `Loïc Corbasson <https://github.com/lcorbasson>`_
 * `Robert Schütz <https://github.com/dotlambda>`_
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
-0.2.3
-----
+0.2.5 - August 8, 2021
+----------------------
+
+* Add ``six`` to ``install_requires``.
+
+0.2.4 - July 13, 2021
+---------------------
+
+* Add ``row_limit`` keyword argument to ``from_xls`` and ``from_xlsx``. (#40)
+* Preserve column types from XLS files. (#36)
+* Add support for Compound File Binary File (CFBF) XLS files. (#44)
+* Close XLSX file before raising error for non-existent sheet. (#34)
+* Use less memory and close XLS files. (#39)
+* Drop support for Python 3.4 (end-of-life was March 18, 2019).
+
+0.2.3 - March 16, 2019
+----------------------

 * Fix bug in accepting ``column_names`` as keyword argument.
 * Add a ``reset_dimensions`` argument to :meth:`.Table.from_xlsx` to recalculate the data's dimensions, instead of trusting those in the file's properties.
@@ -24,8 +39,8 @@
 * Fix bug in handling an empty XLS.
 * Fix bug in handling non-string column names in XLSX.

-0.2.0
-----
+0.2.0 - December 19, 2016
+-------------------------

 * Fix bug in handling of ``None`` in boolean columns for XLS. (#11)
 * Removed usage of deprecated openpyxl method ``get_sheet_by_name``.
@@ -33,7 +48,7 @@
 * Upgrade required agate version to ``1.5.0``.
 * Ensure columns with numbers for names (e.g. years) are parsed as strings.

-0.1.0
-----
+0.1.0 - February 5, 2016
+------------------------

 * Initial version.
--- a/MANIFEST.in
+++ b/MANIFEST.in
+include *.py
+include *.rst
 include COPYING
-include AUTHORS.rst
-include README.rst
+recursive-include docs *.py
+recursive-include docs *.rst
+recursive-include docs Makefile
+recursive-include examples *.xls
+recursive-include examples *.xlsx
 recursive-include tests *.py
-graft examples
--- a/README.rst
+++ b/README.rst
-.. image:: https://travis-ci.org/wireservice/agate-excel.png
-    :target: https://travis-ci.org/wireservice/agate-excel
+.. image:: https://github.com/wireservice/agate-excel/workflows/CI/badge.svg
+    :target: https://github.com/wireservice/agate-excel/actions
    :alt: Build status

+.. image:: https://img.shields.io/pypi/dm/agate-excel.svg
+    :target: https://pypi.python.org/pypi/agate-excel
+    :alt: PyPI downloads
+
 .. image:: https://img.shields.io/pypi/v/agate-excel.svg
    :target: https://pypi.python.org/pypi/agate-excel
    :alt: Version

--- a/agateexcel/table_xls.py
+++ b/agateexcel/table_xls.py
@@ -8,11 +8,22 @@ import datetime
 from collections import OrderedDict

 import agate
+import olefile
 import six
 import xlrd

+EXCEL_TO_AGATE_TYPE = {
+    xlrd.biffh.XL_CELL_EMPTY: agate.Boolean(),
+    xlrd.biffh.XL_CELL_TEXT: agate.Text(),
+    xlrd.biffh.XL_CELL_NUMBER: agate.Number(),
+    xlrd.biffh.XL_CELL_DATE: agate.DateTime(),
+    xlrd.biffh.XL_CELL_BOOLEAN: agate.Boolean(),
+    xlrd.biffh.XL_CELL_ERROR: agate.Text(),
+    xlrd.biffh.XL_CELL_BLANK: agate.Boolean(),
+}

-def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, **kwargs):
+
+def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, row_limit=None, **kwargs):
    """
    Parse an XLS file.

@@ -25,70 +36,104 @@ def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
+    :param row_limit:
+        Limit how many rows of data will be read.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

+    def open_workbook(f):
+        # Read a workbook from file object f; fall back to the 'Workbook' stream of an OLE (CFBF) container.
+        try:
+            book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override, on_demand=True)
+        except xlrd.compdoc.CompDocError:
+            # Not a pure XLS file: try the Compound File Binary Format, keeping the caller's encoding_override.
+            ole = olefile.OleFileIO(f)
+            if ole.exists('Workbook'):
+                d = ole.openstream('Workbook')
+                book = xlrd.open_workbook(file_contents=d.read(), encoding_override=encoding_override, on_demand=True)
+            else:
+                raise IOError('No Workbook stream found in OLE file')
+        return book
+
    if hasattr(path, 'read'):
-        book = xlrd.open_workbook(file_contents=path.read(), encoding_override=encoding_override)
+        book = open_workbook(path)
    else:
        with open(path, 'rb') as f:
-            book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override)
-
-    multiple = agate.utils.issequence(sheet)
-    if multiple:
-        sheets = sheet
-    else:
-        sheets = [sheet]
+            book = open_workbook(f)

-    tables = OrderedDict()
-
-    for i, sheet in enumerate(sheets):
-        if isinstance(sheet, six.string_types):
-            sheet = book.sheet_by_name(sheet)
-        elif isinstance(sheet, int):
-            sheet = book.sheet_by_index(sheet)
-        else:
-            sheet = book.sheet_by_index(0)
-
-        if header:
-            offset = 1
-            column_names = []
+    try:
+        multiple = agate.utils.issequence(sheet)
+        if multiple:
+            sheets = sheet
        else:
-            offset = 0
-            column_names = None
-
-        columns = []
+            sheets = [sheet]

-        for i in range(sheet.ncols):
-            data = sheet.col_values(i)
-            values = data[skip_lines + offset:]
-            types = sheet.col_types(i)[skip_lines + offset:]
-            excel_type = determine_excel_type(types)
+        tables = OrderedDict()

-            if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
-                values = normalize_booleans(values)
-            elif excel_type == xlrd.biffh.XL_CELL_DATE:
-                values = normalize_dates(values, book.datemode)
+        for i, sheet in enumerate(sheets):
+            if isinstance(sheet, six.string_types):
+                sheet = book.sheet_by_name(sheet)
+            elif isinstance(sheet, int):
+                sheet = book.sheet_by_index(sheet)
+            else:
+                sheet = book.sheet_by_index(0)

            if header:
-                name = six.text_type(data[skip_lines]) or None
-                column_names.append(name)
-
-            columns.append(values)
-
-        rows = []
-
-        if columns:
-            for i in range(len(columns[0])):
-                rows.append([c[i] for c in columns])
-
-        if 'column_names' in kwargs:
-            if not header:
-                column_names = kwargs['column_names']
-            del kwargs['column_names']
-
-        tables[sheet.name] = agate.Table(rows, column_names, **kwargs)
+                offset = 1
+                column_names = []
+            else:
+                offset = 0
+                column_names = None
+
+            columns = []
+            column_types = []
+
+            for i in range(sheet.ncols):
+                if row_limit is None:
+                    values = sheet.col_values(i, skip_lines + offset)
+                    types = sheet.col_types(i, skip_lines + offset)
+                else:
+                    values = sheet.col_values(i, skip_lines + offset, skip_lines + offset + row_limit)
+                    types = sheet.col_types(i, skip_lines + offset, skip_lines + offset + row_limit)
+                excel_type = determine_excel_type(types)
+                agate_type = determine_agate_type(excel_type)
+
+                if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
+                    values = normalize_booleans(values)
+                elif excel_type == xlrd.biffh.XL_CELL_DATE:
+                    values, with_date, with_time = normalize_dates(values, book.datemode)
+                    if not with_date:
+                        agate_type = agate.TimeDelta()
+                    if not with_time:
+                        agate_type = agate.Date()
+
+                if header:
+                    name = six.text_type(sheet.cell_value(skip_lines, i)) or None
+                    column_names.append(name)
+
+                columns.append(values)
+                column_types.append(agate_type)
+
+            rows = []
+
+            if columns:
+                for i in range(len(columns[0])):
+                    rows.append([c[i] for c in columns])
+
+            if 'column_names' in kwargs:
+                if not header:
+                    column_names = kwargs['column_names']
+                del kwargs['column_names']
+
+            if 'column_types' in kwargs:
+                column_types = kwargs['column_types']
+                del kwargs['column_types']
+
+            tables[sheet.name] = agate.Table(rows, column_names, column_types, **kwargs)
+
+    finally:
+        book.release_resources()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
@@ -96,6 +141,13 @@ def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override
        return tables.popitem()[1]


+def determine_agate_type(excel_type):
+    try:
+        return EXCEL_TO_AGATE_TYPE[excel_type]
+    except KeyError:
+        return agate.Text()
+
+
 def determine_excel_type(types):
    """
    Determine the correct type for a column from a list of cell types.
@@ -130,6 +182,8 @@ def normalize_dates(values, datemode=0):
    Normalize a column of date cells.
    """
    normalized = []
+    with_date = False
+    with_time = False

    for v in values:
        if not v:
@@ -141,13 +195,18 @@ def normalize_dates(values, datemode=0):
        if v_tuple[3:6] == (0, 0, 0):
            # Date only
            normalized.append(datetime.date(*v_tuple[:3]))
+            with_date = True
        elif v_tuple[:3] == (0, 0, 0):
+            # Time only
            normalized.append(datetime.time(*v_tuple[3:6]))
+            with_time = True
        else:
            # Date and time
            normalized.append(datetime.datetime(*v_tuple[:6]))
+            with_date = True
+            with_time = True

-    return normalized
+    return (normalized, with_date, with_time)


 agate.Table.from_xls = classmethod(from_xls)
--- a/agateexcel/table_xlsx.py
+++ b/agateexcel/table_xlsx.py
@@ -14,8 +14,8 @@ import six
 NULL_TIME = datetime.time(0, 0, 0)


-def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True, 
-              reset_dimensions=False, **kwargs):
+def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True,
+              reset_dimensions=False, row_limit=None, **kwargs):
    """
    Parse an XLSX file.

@@ -29,8 +29,10 @@ def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True,
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param reset_dimensions:
-        If :code:`True`, do not trust the dimensions in the file's properties, 
+        If :code:`True`, do not trust the dimensions in the file's properties,
        and recalculate them based on the data in the file.
+    :param row_limit:
+        Limit how many rows of data will be read.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')
@@ -52,23 +54,38 @@ def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True,

    for i, sheet in enumerate(sheets):
        if isinstance(sheet, six.string_types):
-            sheet = book[sheet]
+            try:
+                sheet = book[sheet]
+            except KeyError:
+                f.close()
+                raise
        elif isinstance(sheet, int):
-            sheet = book.worksheets[sheet]
+            try:
+                sheet = book.worksheets[sheet]
+            except IndexError:
+                f.close()
+                raise
        else:
            sheet = book.active

        column_names = None
+        offset = 0
        rows = []

        if reset_dimensions:
            sheet.reset_dimensions()

-        for i, row in enumerate(sheet.iter_rows(min_row=skip_lines + 1)):
-            if i == 0 and header:
-                column_names = [None if c.value is None else six.text_type(c.value) for c in row]
-                continue
+        if header:
+            sheet_header = sheet.iter_rows(min_row=1 + skip_lines, max_row=1 + skip_lines)
+            column_names = [None if c.value is None else six.text_type(c.value) for row in sheet_header for c in row]
+            offset = 1

+        # openpyxl's max_row is inclusive, so the last data row is first_row + row_limit - 1 (None = no limit).
+        first_row = 1 + skip_lines + offset
+        last_row = None if row_limit is None else first_row + row_limit - 1
+        sheet_rows = sheet.iter_rows(min_row=first_row, max_row=last_row)
+
+        for i, row in enumerate(sheet_rows):
            values = []

            for c in row:

--- a/docs/conf.py
+++ b/docs/conf.py
 # -*- coding: utf-8 -*-
 #
-# flake8: noqa
-#
 # This file is execfile()d with the current directory set to its containing dir.
 #
 # Note that not all possible configuration values are present in this
@@ -54,9 +52,9 @@ copyright = u'2017, Christopher Groskopf'
 # built documents.
 #
 # The short X.Y version.
-version = '0.2.3'
+version = '0.2.5'
 # The full version, including alpha/beta/rc tags.
-release = '0.2.3'
+release = version

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

--- a/example.py
+++ b/example.py
 #!/usr/bin/env python

 import agate
-import agateexcel  # noqa
+
+import agateexcel

 table = agate.Table.from_xls('examples/test.xls')


--- a/examples/test_skip_lines.xls
+++ b/examples/test_skip_lines.xls
--- a/requirements-py2.txt
+++ b/requirements-py2.txt
-unittest2==0.5.1
-nose>=1.1.2
-tox>=1.3
-Sphinx>=1.2.2
-sphinx_rtd_theme>=0.1.6
-wheel>=0.24.0
-ordereddict>=1.1
-xlrd>=0.9.4
-openpyxl>=2.3.0
-agate>=1.2.2
--- a/requirements-py3.txt
+++ b/requirements-py3.txt
-nose>=1.1.2
-tox>=3.1.0
-Sphinx>=1.2.2
-sphinx_rtd_theme>=0.1.6
-wheel>=0.24.0
-xlrd>=0.9.4
-openpyxl>=2.3.0
-agate>=1.2.2
--- a/setup.cfg
+++ b/setup.cfg
+[flake8]
+max-line-length = 119
+per-file-ignores =
+    # imported but unused
+    agateexcel/__init__.py: F401
+    example.py: F401
+    # block comment should start with '# '
+    docs/conf.py: E265
+
+[isort]
+line_length = 119
+
 [bdist_wheel]
 universal = 1
--- a/setup.py
+++ b/setup.py
-#!/usr/bin/env python
+from setuptools import find_packages, setup

-from setuptools import setup
-
-install_requires = [
-    'agate>=1.5.0',
-    'xlrd>=0.9.4',
-    'openpyxl>=2.3.0'
-]
+with open('README.rst') as f:
+    long_description = f.read()

 setup(
    name='agate-excel',
-    version='0.2.3',
+    version='0.2.5',
    description='agate-excel adds read support for Excel files (xls and xlsx) to agate.',
-    long_description=open('README.rst').read(),
+    long_description=long_description,
+    long_description_content_type='text/x-rst',
    author='Christopher Groskopf',
    author_email='chrisgroskopf@gmail.com',
    url='http://agate-excel.readthedocs.org/',
@@ -26,19 +22,30 @@ setup(
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3.4',
-        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
-        'Topic :: Multimedia :: Graphics',
        'Topic :: Scientific/Engineering :: Information Analysis',
-        'Topic :: Scientific/Engineering :: Visualization',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
-    packages=[
-        'agateexcel'
+    packages=find_packages(exclude=['tests', 'tests.*']),
+    install_requires=[
+        'agate>=1.5.0',
+        'olefile',
+        'openpyxl>=2.3.0',
+        'six',
+        'xlrd>=0.9.4',
    ],
-    install_requires=install_requires
+    extras_require={
+        'test': [
+            'nose>=1.1.2',
+        ],
+        'docs': [
+            'Sphinx>=1.2.2',
+            'sphinx_rtd_theme>=0.1.6',
+        ],
+    }
 )
--- a/tests/test_table_xls.py
+++ b/tests/test_table_xls.py
@@ -4,7 +4,8 @@
 import datetime

 import agate
-import agateexcel  # noqa
+
+import agateexcel  # noqa: F401


 class TestXLS(agate.AgateTestCase):
@@ -31,7 +32,8 @@ class TestXLS(agate.AgateTestCase):
        self.table = agate.Table(self.rows, self.column_names, self.column_types)

    def test_from_xls_with_column_names(self):
-        table = agate.Table.from_xls('examples/test.xls', header=False, skip_lines=1, column_names=self.user_provided_column_names )
+        table = agate.Table.from_xls('examples/test.xls', header=False, skip_lines=1,
+                                     column_names=self.user_provided_column_names)

        self.assertColumnNames(table, self.user_provided_column_names)
        self.assertColumnTypes(table, [agate.Number, agate.Text, agate.Boolean, agate.Date, agate.DateTime])
@@ -135,3 +137,17 @@ class TestXLS(agate.AgateTestCase):
        self.assertRows(table, [
            ['Canada', 35160000, 'value'],
        ])
+
+    def test_row_limit(self):
+        table = agate.Table.from_xls('examples/test.xls', row_limit=2)
+
+        self.assertColumnNames(table, self.column_names)
+        self.assertColumnTypes(table, [agate.Number, agate.Text, agate.Boolean, agate.Date, agate.DateTime])
+        self.assertRows(table, [r.values() for r in self.table.rows][:2])
+
+    def test_row_limit_too_high(self):
+        table = agate.Table.from_xls('examples/test.xls', row_limit=200)
+
+        self.assertColumnNames(table, self.column_names)
+        self.assertColumnTypes(table, [agate.Number, agate.Text, agate.Boolean, agate.Date, agate.DateTime])
+        self.assertRows(table, [r.values() for r in self.table.rows])
--- a/tests/test_table_xlsx.py
+++ b/tests/test_table_xlsx.py
@@ -4,7 +4,9 @@
 import datetime

 import agate
-import agateexcel  # noqa
+import six
+
+import agateexcel  # noqa: F401


 class TestXLSX(agate.AgateTestCase):
@@ -31,7 +33,8 @@ class TestXLSX(agate.AgateTestCase):
        self.table = agate.Table(self.rows, self.column_names, self.column_types)

    def test_from_xlsx_with_column_names(self):
-        table = agate.Table.from_xlsx('examples/test.xlsx', header=False, skip_lines=1, column_names=self.user_provided_column_names)
+        table = agate.Table.from_xlsx('examples/test.xlsx', header=False, skip_lines=1,
+                                      column_names=self.user_provided_column_names)

        self.assertColumnNames(table, self.user_provided_column_names)
        self.assertColumnTypes(table, [agate.Number, agate.Text, agate.Boolean, agate.Date, agate.DateTime])
@@ -103,10 +106,16 @@ class TestXLSX(agate.AgateTestCase):
    def test_ambiguous_date(self):
        table = agate.Table.from_xlsx('examples/test_ambiguous_date.xlsx')

+        # openpyxl >= 3 fixes a bug, but Python 2 is constrained to openpyxl < 3.
+        if six.PY2:
+            expected = datetime.date(1899, 12, 31)
+        else:
+            expected = datetime.date(1900, 1, 1)
+
        self.assertColumnNames(table, ['s'])
        self.assertColumnTypes(table, [agate.Date])
        self.assertRows(table, [
-            [datetime.date(1899, 12, 31)],
+            [expected],
        ])

    def test_empty(self):
@@ -124,3 +133,17 @@ class TestXLSX(agate.AgateTestCase):
        self.assertRows(table, [
            ['Canada', 35160000, 'value'],
        ])
+
+    def test_row_limit(self):
+        table = agate.Table.from_xlsx('examples/test.xlsx', row_limit=2)
+
+        self.assertColumnNames(table, self.column_names)
+        self.assertColumnTypes(table, [agate.Number, agate.Text, agate.Boolean, agate.Date, agate.DateTime])
+        self.assertRows(table, [r.values() for r in self.table.rows][:2])
+
+    def test_row_limit_too_high(self):
+        table = agate.Table.from_xlsx('examples/test.xlsx', row_limit=200)
+
+        self.assertColumnNames(table, self.column_names)
+        self.assertColumnTypes(table, [agate.Number, agate.Text, agate.Boolean, agate.Date, agate.DateTime])
+        self.assertRows(table, [r.values() for r in self.table.rows])
--- a/tox.ini
+++ b/tox.ini
-[tox]
-envlist = py27,py34,py35,py36,py37,pypy
-
-[testenv]
-deps=
-    nose>=1.1.2
-    six>=1.6.1
-commands=nosetests
-
-[testenv:py27]
-deps=
-    {[testenv]deps}
-
-[testenv:py34]
-deps=
-    {[testenv]deps}
-
-[testenv:py35]
-deps=
-    {[testenv:py33]deps}
-
-[testenv:py36]
-deps=
-    {[testenv:py33]deps}
-
-[testenv:py37]
-deps=
-    {[testenv:py33]deps}
-
-[testenv:pypy]
-deps=
-    {[testenv:py33]deps}