Commit 5d96a92c authored by Chris Lamb 💬

Add support for .docx and .odt files via docx2txt & odt2txt. (Closes: #859056)

Signed-off-by: Chris Lamb <lamby@debian.org>
parent d4249b69
......@@ -17,6 +17,7 @@ Build-Depends:
debhelper (>= 10),
default-jdk-headless <!nocheck> | default-jdk <!nocheck>,
dh-python (>= 2.20160818~),
docx2txt <!nocheck>,
dpkg-dev (>= 1.17.14),
enjarify <!nocheck>,
fontforge-extras <!nocheck>,
......@@ -33,6 +34,7 @@ Build-Depends:
libjs-jquery-throttle-debounce <!nocheck>,
llvm <!nocheck>,
mono-utils <!nocheck>,
odt2txt <!nocheck>,
openssh-client <!nocheck>,
pdftk <!nocheck>,
pgpdump <!nocheck>,
......
......@@ -70,6 +70,8 @@ class ComparatorManager(object):
('tar.TarFile',),
('xz.XzFile',),
('apk.ApkFile',),
('odt.OdtFile',),
('docx.DocxFile',),
('zip.ZipFile',),
('zip.MozillaZipFile',),
('image.JPEGImageFile',),
......
# -*- coding: utf-8 -*-
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2017 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import re
from diffoscope.tools import tool_required
from diffoscope.difference import Difference
from .utils.file import File
from .utils.command import Command
class Docx2txt(Command):
    """Command wrapper that invokes the external ``docx2txt`` tool."""

    @tool_required('docx2txt')
    def cmdline(self):
        """Return the argument vector used to run ``docx2txt``.

        The vector is the tool name, the path held by this command and a
        trailing ``-``.
        """
        # NOTE(review): the trailing '-' presumably asks docx2txt to write
        # the extracted text to stdout -- confirm against the tool's manual.
        argv = ('docx2txt', self.path, '-')
        return argv
class DocxFile(File):
    """Comparator for ``.docx`` documents, diffed via ``docx2txt``."""

    # The '+' is part of the literal type description ("Microsoft Word
    # 2007+"), so it has to be escaped: unescaped it is a quantifier on the
    # preceding '7' and the pattern would also accept e.g. "... 20077".
    # No trailing \b here: there is no word boundary between a literal '+'
    # and a following space or end of string, so it could never match.
    # NOTE(review): presumably matched by the File base class against the
    # detected file-type description -- confirm where RE_FILE_TYPE is read.
    RE_FILE_TYPE = re.compile(r'^Microsoft Word 2007\+')

    def compare_details(self, other, source=None):
        """Return a one-element list with the ``docx2txt`` output difference.

        ``other`` is the file to compare against; its ``path`` is passed to
        the command alongside this file's ``path``. ``source`` is accepted
        for interface compatibility but is not used here; the difference is
        always labelled ``'docx2txt'``.
        """
        return [Difference.from_command(
            Docx2txt,
            self.path,
            other.path,
            source='docx2txt',
        )]
# -*- coding: utf-8 -*-
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2017 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import re
from diffoscope.tools import tool_required
from diffoscope.difference import Difference
from .utils.file import File
from .utils.command import Command
class Odt2txt(Command):
    """Command wrapper that invokes the external ``odt2txt`` tool."""

    @tool_required('odt2txt')
    def cmdline(self):
        """Return the argument vector used to run ``odt2txt``.

        The vector is the tool name, an explicit ``--encoding=UTF-8`` option
        and the path held by this command.
        """
        argv = ('odt2txt', '--encoding=UTF-8', self.path)
        return argv
class OdtFile(File):
    """Comparator for OpenDocument Text files, diffed via ``odt2txt``."""

    # NOTE(review): presumably matched by the File base class against the
    # detected file-type description -- confirm where RE_FILE_TYPE is read.
    RE_FILE_TYPE = re.compile(r'^OpenDocument Text\b')

    def compare_details(self, other, source=None):
        """Return a one-element list with the ``odt2txt`` output difference.

        ``other`` is the file to compare against. ``source`` is accepted but
        not used here; the difference is always labelled ``'odt2txt'``.
        """
        text_difference = Difference.from_command(
            Odt2txt, self.path, other.path, source='odt2txt',
        )
        return [text_difference]
......@@ -44,6 +44,9 @@ EXTERNAL_TOOLS = {
'debian': 'diffutils',
'arch': 'diffutils',
},
'docx2txt': {
'debian': 'docx2txt',
},
'enjarify': {
'debian': 'enjarify',
'arch': 'enjarify',
......@@ -136,6 +139,9 @@ EXTERNAL_TOOLS = {
'debian': 'binutils-multiarch',
'arch': 'binutils',
},
'odt2txt': {
'debian': 'odt2txt',
},
'pgpdump': {
'debian': 'pgpdump',
'arch': 'pgpdump',
......
# -*- coding: utf-8 -*-
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2017 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import pytest
from diffoscope.comparators.docx import DocxFile
from utils.data import load_fixture, get_data
from utils.tools import skip_unless_tools_exist
from utils.nonexisting import assert_non_existing
# Module-level fixtures for the two sample documents; the tests below request
# them by name as the ``docx1`` / ``docx2`` parameters.
docx1 = load_fixture('test1.docx')
docx2 = load_fixture('test2.docx')
def test_identification(docx1):
    """The first .docx fixture must be identified as a ``DocxFile``."""
    recognised = isinstance(docx1, DocxFile)
    assert recognised
def test_no_differences(docx1):
    """Comparing a document against itself must yield ``None``."""
    assert docx1.compare(docx1) is None
@pytest.fixture
def differences(docx1, docx2):
    """Fixture: the ``details`` of comparing the two .docx fixtures."""
    comparison = docx1.compare(docx2)
    return comparison.details
@skip_unless_tools_exist('docx2txt')
def test_diff(differences):
    """The first detail's unified diff must equal the stored expected diff."""
    wanted = get_data('docx_expected_diff')
    actual = differences[0].unified_diff
    assert actual == wanted
@skip_unless_tools_exist('docx2txt')
def test_compare_non_existing(monkeypatch, docx1):
    """Run the shared non-existing-file check against the .docx fixture."""
    # NOTE(review): the exact assertions live in utils.nonexisting; this test
    # only forwards the fixture with has_null_source disabled.
    assert_non_existing(
        monkeypatch,
        docx1,
        has_null_source=False,
    )
# -*- coding: utf-8 -*-
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2017 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import pytest
from diffoscope.comparators.odt import OdtFile
from utils.data import load_fixture, get_data
from utils.tools import skip_unless_tools_exist
from utils.nonexisting import assert_non_existing
# Module-level fixtures for the two sample documents; the tests below request
# them by name as the ``odt1`` / ``odt2`` parameters.
odt1 = load_fixture('test1.odt')
odt2 = load_fixture('test2.odt')
def test_identification(odt1):
    """The first .odt fixture must be identified as an ``OdtFile``."""
    recognised = isinstance(odt1, OdtFile)
    assert recognised
def test_no_differences(odt1):
    """Comparing a document against itself must yield ``None``."""
    assert odt1.compare(odt1) is None
@pytest.fixture
def differences(odt1, odt2):
    """Fixture: the ``details`` of comparing the two .odt fixtures."""
    comparison = odt1.compare(odt2)
    return comparison.details
@skip_unless_tools_exist('odt2txt')
def test_diff(differences):
    """The first detail's unified diff must equal the stored expected diff."""
    wanted = get_data('odt_expected_diff')
    actual = differences[0].unified_diff
    assert actual == wanted
@skip_unless_tools_exist('odt2txt')
def test_compare_non_existing(monkeypatch, odt1):
    """Run the shared non-existing-file check against the .odt fixture."""
    # NOTE(review): the exact assertions live in utils.nonexisting; this test
    # only forwards the fixture with has_null_source disabled.
    assert_non_existing(
        monkeypatch,
        odt1,
        has_null_source=False,
    )
@@ -1,3 +1,3 @@
-a
+b
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment