Commit dbeab9e3 authored by Chris Lamb 👀
Browse files

Don't crash if we can open a PDF file with PyPDF, but cannot parse the annotations within.

Don't crash if we can open a PDF file with PyPDF, but cannot parse the annotations within. (Closes: #311)
parent 4dc5b2ff
Loading
Loading
Loading
Loading
Loading
+35 −27
Original line number Diff line number Diff line
@@ -17,6 +17,8 @@
# You should have received a copy of the GNU General Public License
# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.

import logging
import os
import re

from diffoscope.tools import python_module_missing, tool_required
@@ -25,6 +27,8 @@ from diffoscope.difference import Difference
from .utils.file import File
from .utils.command import Command

logger = logging.getLogger(__name__)

try:
    import PyPDF2

@@ -95,13 +99,10 @@ class PdfFile(File):

        return xs

    @staticmethod
    def dump_pypdf2_metadata(file):
    def dump_pypdf2_metadata(self, file):
        try:
            pdf = PyPDF2.PdfFileReader(file.path)
            document_info = pdf.getDocumentInfo()
        except PdfReadError as e:
            return f"(Could not extract metadata: {e})"

            if document_info is None:
                return ""
@@ -111,13 +112,15 @@ class PdfFile(File):
                xs.append("{}: {!r}".format(k.lstrip("/"), v))

            return "\n".join(xs)
        except PdfReadError as e:
            msg = f"Could not extract PyPDF2 metadata from {os.path.basename(file.name)}: {e}"
            self.add_comment(msg)
            logger.error(msg)
            return ""

    @staticmethod
    def dump_pypdf2_annotations(file):
    def dump_pypdf2_annotations(self, file):
        try:
            pdf = PyPDF2.PdfFileReader(file.path)
        except PdfReadError as e:
            return f"(Could not open file: {e})"

            xs = []
            for x in range(pdf.getNumPages()):
@@ -132,3 +135,8 @@ class PdfFile(File):
                    pass

            return "\n".join(xs)
        except PdfReadError as e:
            msg = f"Could not extract PyPDF2 annotations from {os.path.basename(file.name)}: {e}"
            file.add_comment(msg)
            logger.error(msg)
            return ""