Commits on Source (3)
......@@ -9,7 +9,7 @@ bandit:
script: # TODO: remove B405 and B314
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-bandit
- bandit ./mat2 --format txt
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314
......@@ -42,9 +42,9 @@ tests:debian:
stage: test
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report -m --include 'libmat2/*'
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
tests:fedora:
image: fedora
......@@ -62,5 +62,5 @@ tests:archlinux:
tags:
- whitewhale
script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
- python3 setup.py test
......@@ -6,11 +6,12 @@ max-locals=20
disable=
fixme,
invalid-name,
duplicate-code,
missing-docstring,
protected-access,
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
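To illustrate the `invalid-sequence-index` comment above, here is a hedged, made-up example of the kind of annotation that used to trip pylint (the function itself is not part of mat2):
from typing import Tuple

def split_header(raw: bytes) -> Tuple[int, bytes]:  # annotation style pylint used to flag
    return raw[0], raw[1:]  # first byte as an int, remainder as bytes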
# 0.5.0 - 2018-10-23
- Video (.avi files for now) support, via FFmpeg, optionally
- Lightweight cleaning for png and tiff files
- Processing files starting with a dash is now quicker
- Metadata are now displayed sorted
- Recursive metadata support for FLAC files
- Unsupported extensions aren't displayed in `./mat2 -l` anymore
- Improve the display when no metadata are found
- Update the logo according to the GNOME guidelines
- The testsuite is now runnable on the installed version of mat2
- Various internal cleanup/improvements
# 0.4.0 - 2018-10-03
- There is now a policy, for advanced users, to deal with unknown embedded fileformats
......
......@@ -32,5 +32,6 @@ Since MAT2 is written in Python3, please conform as much as possible to the
9. Create the signed tarball with `git archive --format=tar.xz --prefix=mat-$VERSION/ $VERSION > mat-$VERSION.tar.xz`
10. Sign the tarball with `gpg --armor --detach-sign mat-$VERSION.tar.xz`
11. Upload the result on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
12. Tell the [downstreams](https://0xacab.org/jvoisin/mat2/blob/master/INSTALL.md) about it
13. Do the secret release dance
12. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
13. Tell the [downstreams](https://0xacab.org/jvoisin/mat2/blob/master/INSTALL.md) about it
14. Do the secret release dance
......@@ -30,10 +30,11 @@ metadata.
- `python3-mutagen` for audio support
- `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
- `gir1.2-gdkpixbuf-2.0` for images support
- `FFmpeg`, optionally, for video support
- `libimage-exiftool-perl` for everything else
Please note that MAT2 requires at least Python3.5, meaning that it
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3),
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3).
# Running the test suite
......
data/mat2.png: image replaced (235 KiB → 28.2 KiB)
doc/implementation_notes.md
doc/threat_model.md
.TH MAT2 "1" "October 2018" "MAT2 0.4.0" "User Commands"
.TH MAT2 "1" "October 2018" "MAT2 0.5.0" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
......
#!/bin/env python3
#!/usr/bin/env python3
import os
import collections
import enum
import importlib
from typing import Dict, Optional
from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
# A set of extensions that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = {
......@@ -36,24 +38,13 @@ DEPENDENCIES = {
'mutagen': 'Mutagen',
}
def _get_exiftool_path() -> Optional[str]: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
return None
def check_dependencies() -> dict:
def check_dependencies() -> Dict[str, bool]:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
ret['Exiftool'] = True if _get_exiftool_path() else False
ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False
for key, value in DEPENDENCIES.items():
ret[value] = True
......
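A minimal usage sketch of `check_dependencies()` as exposed by `libmat2` (the warning text below is illustrative, not mat2's actual output):
from libmat2 import check_dependencies

for tool, installed in check_dependencies().items():
    if not installed:
        print('Missing optional dependency: %s' % tool)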
import abc
import os
from typing import Set, Dict
import re
from typing import Set, Dict, Union
assert Set # make pyflakes happy
class AbstractParser(abc.ABC):
""" This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files.
It might raise `ValueError` on instantiation on invalid files,
and `RuntimeError` when something goes wrong in `remove_all`.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
......@@ -16,21 +18,23 @@ class AbstractParser(abc.ABC):
"""
:raises ValueError: Raised upon an invalid file
"""
if re.search('^[a-z0-9./]', filename) is None:
# Some parsers are calling external binaries,
# this prevents shell command injections
filename = os.path.join('.', filename)
self.filename = filename
fname, extension = os.path.splitext(filename)
self.output_filename = fname + '.cleaned' + extension
self.lightweight_cleaning = False
@abc.abstractmethod
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
pass # pragma: no cover
@abc.abstractmethod
def remove_all(self) -> bool:
pass # pragma: no cover
def remove_all_lightweight(self) -> bool:
""" This method removes _SOME_ metadata.
It might be useful to implement it for fileformats that do
not support non-destructive cleaning.
"""
return self.remove_all()
:raises RuntimeError: Raised if the cleaning process goes wrong.
"""
pass # pragma: no cover
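To make the contract above concrete, here is a hedged sketch of a minimal parser honouring it (the class name and mimetype are hypothetical, not part of mat2):
import shutil
from typing import Dict, Union
from libmat2 import abstract

class NullParser(abstract.AbstractParser):  # hypothetical example
    mimetypes = {'application/x-empty', }

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        return {}  # nothing to report

    def remove_all(self) -> bool:
        shutil.copy(self.filename, self.output_filename)  # nothing to strip
        return True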
......@@ -4,13 +4,14 @@ import tempfile
import os
import logging
import shutil
from typing import Dict, Set, Pattern
from typing import Dict, Set, Pattern, Union
from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
assert Union
class ArchiveBasedAbstractParser(abstract.AbstractParser):
......
import mimetypes
import os
import shutil
import tempfile
from typing import Dict, Union
import mutagen
from . import abstract
from . import abstract, parser_factory
class MutagenParser(abstract.AbstractParser):
......@@ -13,13 +17,13 @@ class MutagenParser(abstract.AbstractParser):
except mutagen.MutagenError:
raise ValueError
def get_meta(self):
def get_meta(self) -> Dict[str, Union[str, dict]]:
f = mutagen.File(self.filename)
if f.tags:
return {k:', '.join(v) for k, v in f.tags.items()}
return {}
def remove_all(self):
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.delete()
......@@ -30,8 +34,8 @@ class MutagenParser(abstract.AbstractParser):
class MP3Parser(MutagenParser):
mimetypes = {'audio/mpeg', }
def get_meta(self):
metadata = {}
def get_meta(self) -> Dict[str, Union[str, dict]]:
metadata = {} # type: Dict[str, Union[str, dict]]
meta = mutagen.File(self.filename).tags
for key in meta:
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
......@@ -44,3 +48,30 @@ class OGGParser(MutagenParser):
class FLACParser(MutagenParser):
mimetypes = {'audio/flac', 'audio/x-flac'}
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.clear_pictures()
f.delete()
f.save(deleteid3=True)
return True
def get_meta(self) -> Dict[str, Union[str, dict]]:
meta = super().get_meta()
for num, picture in enumerate(mutagen.File(self.filename).pictures):
name = picture.desc if picture.desc else 'Cover %d' % num
extension = mimetypes.guess_extension(picture.mime)
if extension is None: # pragma: no cover
meta[name] = 'harmful data'
continue
_, fname = tempfile.mkstemp()
fname = fname + extension
with open(fname, 'wb') as f:
f.write(picture.data)
p, _ = parser_factory.get_parser(fname) # type: ignore
# Mypy chokes on ternaries :/
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
os.remove(fname)
return meta
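A short usage sketch of the recursive FLAC support shown above (module path `libmat2.audio` assumed, file name made up):
from libmat2 import audio

p = audio.FLACParser('song.flac')
print(p.get_meta())  # embedded cover art appears as a nested dict
p.remove_all()       # writes song.cleaned.flac, without tags or pictures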
import json
import logging
import os
import subprocess
from typing import Dict, Union, Set
from . import abstract
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser):
""" Exiftool is often the easiest way to get all the metadata
from a file, which is why several parsers reuse its `get_meta`
method.
"""
meta_whitelist = set() # type: Set[str]
def get_meta(self) -> Dict[str, Union[str, dict]]:
out = subprocess.check_output([_get_exiftool_path(), '-json', self.filename])
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist:
meta.pop(key, None)
return meta
def _lightweight_cleanup(self) -> bool:
if os.path.exists(self.output_filename):
try:
# exiftool can't force output to existing files
os.remove(self.output_filename)
except OSError as e: # pragma: no cover
logging.error("The output file %s is already existing and \
can't be overwritten: %s.", self.filename, e)
return False
# Note: '-All=' must be followed by a known exiftool option.
# Also, '-CommonIFD0' is needed for .tiff files
cmd = [_get_exiftool_path(),
'-all=', # remove metadata
'-adobe=', # remove adobe-specific metadata
'-exif:all=', # remove all exif metadata
'-Time:All=', # remove all timestamps
'-quiet', # don't show useless logs
'-CommonIFD0=', # remove IFD0 metadata
'-o', self.output_filename,
self.filename]
try:
subprocess.check_call(cmd)
except subprocess.CalledProcessError as e: # pragma: no cover
logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
return False
return True
def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
raise RuntimeError("Unable to find exiftool")
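A hedged sketch of how a parser can build on `ExiftoolParser` (the WebP class below is purely illustrative and not part of mat2):
from libmat2 import exiftool

class WebPParser(exiftool.ExiftoolParser):  # illustrative only
    mimetypes = {'image/webp', }
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName'}

    def remove_all(self) -> bool:
        return self._lightweight_cleanup()  # delegate cleaning to exiftool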
import shutil
from typing import Dict
from typing import Dict, Union
from . import abstract
......@@ -7,7 +7,7 @@ class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'}
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
return dict()
def remove_all(self) -> bool:
......
import subprocess
import imghdr
import json
import os
import shutil
import tempfile
import re
from typing import Set
import cairo
......@@ -13,44 +8,12 @@ import gi
gi.require_version('GdkPixbuf', '2.0')
from gi.repository import GdkPixbuf
from . import abstract, _get_exiftool_path
from . import exiftool
# Make pyflakes happy
assert Set
class _ImageParser(abstract.AbstractParser):
""" Since we use `exiftool` to get metadata from
all images fileformat, `get_meta` is implemented in this class,
and all the image-handling ones are inheriting from it."""
meta_whitelist = set() # type: Set[str]
@staticmethod
def __handle_problematic_filename(filename: str, callback) -> str:
""" This method takes a filename with a problematic name,
and safely applies it a `callback`."""
tmpdirname = tempfile.mkdtemp()
fname = os.path.join(tmpdirname, "temp_file")
shutil.copy(filename, fname)
out = callback(fname)
shutil.rmtree(tmpdirname)
return out
def get_meta(self):
""" There is no way to escape the leading(s) dash(es) of the current
self.filename to prevent parameter injections, so we need to take care
of this.
"""
fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
if re.search('^[a-z0-9/]', self.filename) is None:
out = self.__handle_problematic_filename(self.filename, fun)
else:
out = fun(self.filename)
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist:
meta.pop(key, None)
return meta
class PNGParser(_ImageParser):
class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
......@@ -71,19 +34,26 @@ class PNGParser(_ImageParser):
except MemoryError: # pragma: no cover
raise ValueError
def remove_all(self):
def remove_all(self) -> bool:
if self.lightweight_cleaning:
return self._lightweight_cleanup()
surface = cairo.ImageSurface.create_from_png(self.filename)
surface.write_to_png(self.output_filename)
return True
class GdkPixbufAbstractParser(_ImageParser):
class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of completely removing metadata.
"""
_type = ''
def remove_all(self):
def __init__(self, filename):
super().__init__(filename)
if imghdr.what(filename) != self._type: # better safe than sorry
raise ValueError
def remove_all(self) -> bool:
_, extension = os.path.splitext(self.filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
if extension.lower() == '.jpg':
......@@ -91,11 +61,6 @@ class GdkPixbufAbstractParser(_ImageParser):
pixbuf.savev(self.output_filename, extension[1:], [], [])
return True
def __init__(self, filename):
super().__init__(filename)
if imghdr.what(filename) != self._type: # better safe than sorry
raise ValueError
class JPGParser(GdkPixbufAbstractParser):
_type = 'jpeg'
......
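A usage sketch of the `lightweight_cleaning` switch wired in above (file name made up):
from libmat2 import images

p = images.PNGParser('figure.png')
p.lightweight_cleaning = True  # strip metadata with exiftool instead of re-rendering via cairo
p.remove_all()                 # writes figure.cleaned.png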
......@@ -2,7 +2,7 @@ import logging
import os
import re
import zipfile
from typing import Dict, Set, Pattern
from typing import Dict, Set, Pattern, Tuple, Union
import xml.etree.ElementTree as ET # type: ignore
......@@ -14,9 +14,8 @@ from .archive import ArchiveBasedAbstractParser
assert Set
assert Pattern
def _parse_xml(full_path: str):
def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
""" This function parses XML, with namespace support. """
namespace_map = dict()
for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
# The ns[0-9]+ namespaces are reserved for internal usage, so
......@@ -88,6 +87,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
r'^docProps/custom\.xml$',
r'^word/printerSettings/',
r'^word/theme',
r'^word/people\.xml$',
# we have a whitelist in self.files_to_keep,
# so we can trash everything else
......@@ -182,20 +182,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
parent_map = {c:p for p in tree.iter() for c in p}
elements = list()
elements_del = list()
for element in tree.iterfind('.//w:del', namespace):
elements.append(element)
for element in elements:
elements_del.append(element)
for element in elements_del:
parent_map[element].remove(element)
elements = list()
elements_ins = list()
for element in tree.iterfind('.//w:ins', namespace):
for position, item in enumerate(tree.iter()): # pragma: no cover
if item == element:
for children in element.iterfind('./*'):
elements.append((element, position, children))
elements_ins.append((element, position, children))
break
for (element, position, children) in elements:
for (element, position, children) in elements_ins:
parent_map[element].insert(position, children)
parent_map[element].remove(element)
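The `parent_map` trick above is needed because ElementTree has no parent pointers; a toy illustration (not mat2 code):
import xml.etree.ElementTree as ET

tree = ET.ElementTree(ET.fromstring('<root><del>old</del><keep>new</keep></root>'))
parent_map = {c: p for p in tree.iter() for c in p}
for element in list(tree.iterfind('.//del')):
    parent_map[element].remove(element)
print(ET.tostring(tree.getroot()))  # b'<root><keep>new</keep></root>'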
......@@ -296,7 +296,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return True
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
......@@ -381,7 +381,7 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
return False
return True
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
......
......@@ -18,6 +18,8 @@ def __load_all_parsers():
continue
elif fname.endswith('__init__.py'):
continue
elif fname.endswith('exiftool.py'):
continue
basename = os.path.basename(fname)
name, _ = os.path.splitext(basename)
importlib.import_module('.' + name, package='libmat2')
......@@ -33,6 +35,7 @@ def _get_parsers() -> List[T]:
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
""" Return the appropriate parser for a giver filename. """
mtype, _ = mimetypes.guess_type(filename)
_, extension = os.path.splitext(filename)
......
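A minimal usage sketch of the factory (file name made up):
from libmat2 import parser_factory

parser, mimetype = parser_factory.get_parser('./report.odt')
if parser is None:
    print('Unsupported format: %s' % mimetype)
else:
    print(parser.get_meta())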
......@@ -7,6 +7,7 @@ import re
import logging
import tempfile
import io
from typing import Dict, Union
from distutils.version import LooseVersion
import cairo
......@@ -37,7 +38,12 @@ class PDFParser(abstract.AbstractParser):
except GLib.GError: # Invalid PDF
raise ValueError
def remove_all_lightweight(self):
def remove_all(self) -> bool:
if self.lightweight_cleaning is True:
return self.__remove_all_lightweight()
return self.__remove_all_thorough()
def __remove_all_lightweight(self) -> bool:
"""
Load the document into Poppler, render pages on a new PDFSurface.
"""
......@@ -64,7 +70,7 @@ class PDFParser(abstract.AbstractParser):
return True
def remove_all(self):
def __remove_all_thorough(self) -> bool:
"""
Load the document into Poppler, render pages on PNG,
and shove those PNG into a new PDF.
......@@ -119,13 +125,13 @@ class PDFParser(abstract.AbstractParser):
return True
@staticmethod
def __parse_metadata_field(data: str) -> dict:
def __parse_metadata_field(data: str) -> Dict[str, str]:
metadata = {}
for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
metadata[key] = value
return metadata
def get_meta(self):
def get_meta(self) -> Dict[str, Union[str, dict]]:
""" Return a dict with all the meta of the file
"""
metadata = {}
......
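To illustrate what `__parse_metadata_field` extracts, a small standalone example with a made-up XMP snippet:
import re

data = '<xmp:CreatorTool>LibreOffice 6.1</xmp:CreatorTool><pdf:Producer>cairo</pdf:Producer>'
metadata = {}
for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
    metadata[key] = value
print(metadata)  # {'CreatorTool': 'LibreOffice 6.1', 'Producer': 'cairo'}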
......@@ -14,7 +14,7 @@ class TorrentParser(abstract.AbstractParser):
if self.dict_repr is None:
raise ValueError
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
metadata = {}
for key, value in self.dict_repr.items():
if key not in self.whitelist:
......
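A hedged usage sketch of the torrent parser (module path `libmat2.torrent` assumed, file name made up):
from libmat2 import torrent

p = torrent.TorrentParser('debian.torrent')
print(p.get_meta())  # non-whitelisted keys, such as the creation date, show up here
p.remove_all()       # writes debian.cleaned.torrent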