Commit 4e7ba71d authored by Chris Lamb

Add support for comparing PDF metadata using PyPDF2. (Closes: #911446)

parent f8fc0baa
Pipeline #22619 failed with stage
in 16 minutes and 42 seconds
......@@ -62,6 +62,7 @@ Build-Depends:
python3-libarchive-c,
python3-magic,
python3-progressbar <!nocheck>,
python3-pypdf2 <!nocheck>,
python3-pytest <!nocheck>,
python3-pytest-cov <!nocheck>,
python3-pyxattr <!nocheck>,
......
......@@ -7,7 +7,7 @@
# $ mv debian/tests/control.tmp debian/tests/control
Tests: pytest-with-recommends
Depends: diffoscope, python3-pytest, file, linux-image-amd64 [amd64] | linux-image-generic [amd64], abootimg, acl, binutils-multiarch, bzip2, caca-utils, colord, db-util, default-jdk-headless | default-jdk | java-sdk, device-tree-compiler, docx2txt, e2fsprogs, enjarify, fontforge-extras, fp-utils [!ppc64el !s390x], genisoimage, gettext, ghc, ghostscript, giflib-tools, gnumeric, gnupg, imagemagick, jsbeautifier, libarchive-tools, llvm, lz4 | liblz4-tool, mono-utils, ocaml-nox, odt2txt, openssh-client, pgpdump, poppler-utils, procyon-decompiler, r-base-core, rpm2cpio, sng, sqlite3, squashfs-tools, tcpdump, unzip, xmlbeans, xxd | vim-common, xz-utils, python3-distro, python3-argcomplete, python3-progressbar, python3-binwalk, python3-defusedxml, python3-guestfs, python3-jsondiff, python3-debian, python3-pyxattr, python3-rpm, python3-tlsh
Depends: diffoscope, python3-pytest, file, linux-image-amd64 [amd64] | linux-image-generic [amd64], abootimg, acl, binutils-multiarch, bzip2, caca-utils, colord, db-util, default-jdk-headless | default-jdk | java-sdk, device-tree-compiler, docx2txt, e2fsprogs, enjarify, fontforge-extras, fp-utils [!ppc64el !s390x], genisoimage, gettext, ghc, ghostscript, giflib-tools, gnumeric, gnupg, imagemagick, jsbeautifier, libarchive-tools, llvm, lz4 | liblz4-tool, mono-utils, ocaml-nox, odt2txt, openssh-client, pgpdump, poppler-utils, procyon-decompiler, r-base-core, rpm2cpio, sng, sqlite3, squashfs-tools, tcpdump, unzip, xmlbeans, xxd | vim-common, xz-utils, python3-distro, python3-argcomplete, python3-progressbar, python3-binwalk, python3-defusedxml, python3-guestfs, python3-jsondiff, python3-debian, python3-pypdf2, python3-pyxattr, python3-rpm, python3-tlsh
Tests: pytest
Depends: diffoscope, python3-pytest, file
......
......@@ -25,6 +25,11 @@ from diffoscope.difference import Difference
from .utils.file import File
from .utils.command import Command
try:
import PyPDF2
except ImportError: # noqa
PyPDF2 = None
class Pdftotext(Command):
@tool_required('pdftotext')
......@@ -37,4 +42,32 @@ class PdfFile(File):
FILE_TYPE_RE = re.compile(r'^PDF document\b')
def compare_details(self, other, source=None):
# Build the list of detail differences between this PDF and `other`:
# an optional metadata difference (only attempted when the optional PyPDF2
# import succeeded) and the difference produced by the Pdftotext command.
#
# NOTE(review): the next line looks like the pre-change `return` that the
# diff view shows alongside its replacement; read as plain code it would
# return before any of the metadata handling below — confirm against the
# actual file.
return [Difference.from_command(Pdftotext, self.path, other.path)]
xs = []
# PyPDF2 is set to None when its import fails, so metadata comparison is
# skipped rather than raising.
if PyPDF2 is not None:
# Text difference of the two metadata dumps, labelled with the file paths.
difference = Difference.from_text(
self.dump_pypdf2_metadata(self),
self.dump_pypdf2_metadata(other),
self.path,
other.path,
)
# NOTE(review): indentation is lost in this view; presumably both the
# add_comment() and the append() are guarded by this check — confirm.
if difference:
difference.add_comment("Document info")
xs.append(difference)
# The pdftotext-based comparison is appended after the metadata difference.
xs.append(Difference.from_command(Pdftotext, self.path, other.path))
return xs
@staticmethod
def dump_pypdf2_metadata(file):
    """Return the document-info metadata of *file* as sorted text.

    Each entry is rendered as ``Key: repr(value)`` with any leading ``/``
    stripped from the key, one entry per line, sorted by key.

    Returns the placeholder ``"(Could not extract metadata)"`` when PyPDF2
    raises ``PdfReadError``, and an empty string when the document carries
    no document-info dictionary.
    """
    try:
        pdf = PyPDF2.PdfFileReader(file.path)
        # Fetching the info dictionary can itself raise PdfReadError (for
        # example on a file that has not been decrypted), so it must be
        # covered by the same handler as opening the reader.
        info = pdf.getDocumentInfo()
    except PyPDF2.utils.PdfReadError:
        return "(Could not extract metadata)"

    # getDocumentInfo() returns None when the PDF has no info dictionary;
    # treat that the same as an empty one instead of failing on .items().
    if info is None:
        return ""

    return "\n".join(
        "{}: {!r}".format(k.lstrip('/'), v) for k, v in sorted(info.items())
    )
......@@ -61,6 +61,7 @@ setup(
'guestfs',
'jsondiff',
'python-debian',
'pypdf2',
'pyxattr',
'rpm-python',
'tlsh',
......
......@@ -22,7 +22,7 @@ import pytest
from diffoscope.comparators.pdf import PdfFile
from ..utils.data import load_fixture, get_data
from ..utils.tools import skip_unless_tools_exist
from ..utils.tools import skip_unless_tools_exist, skip_unless_module_exists
from ..utils.nonexisting import assert_non_existing
......@@ -61,3 +61,14 @@ def test_text_diff(differences):
@skip_unless_tools_exist('pdftotext')
def test_compare_non_existing(monkeypatch, pdf1):
    """Run the shared non-existing-file assertions against ``pdf1``."""
    assert_non_existing(
        monkeypatch,
        pdf1,
        has_null_source=False,
    )
@pytest.fixture
def differences_metadata(pdf1, pdf1a):
    """Detail differences obtained by comparing ``pdf1`` with ``pdf1a``."""
    comparison = pdf1.compare(pdf1a)
    return comparison.details
@skip_unless_module_exists('PyPDF2')
def test_metadata(differences_metadata):
    """The first detail's unified diff must match the stored fixture."""
    expected = get_data('pdf_metadata_expected_diff')
    first_detail = differences_metadata[0]
    assert first_detail.unified_diff == expected
@@ -1,2 +1,10 @@
-Creator: 'Prawn'
-Producer: 'Prawn'
+Author: ''
+CreationDate: 'D:20180428153751Z'
+Creator: 'LaTeX with hyperref package'
+Keywords: ''
+ModDate: 'D:20180428153751Z'
+PTEX.Fullbanner: 'This is pdfTeX, Version 3.14159265-2.6-1.40.19 (TeX Live 2018/Debian) kpathsea version 6.3.0'
+Producer: 'pdfTeX-1.40.19'
+Subject: ''
+Title: ''
+Trapped: '/False'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment