.*
*.pyc
.coverage
.eggs
.mypy_cache/
build
dist
mat2.egg-info
tags
@@ -6,22 +6,57 @@ stages:
bandit:
stage: linting
script: # TODO: remove B405 and B314
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-bandit
- bandit ./mat2 --format txt
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314
pylint:
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends pylint3 python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0
- pylint3 --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
# Once nautilus-python is in Debian, uncomment the line below
- pylint3 --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
pyflakes:
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends pyflakes3
- pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus
mypy:
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-pip
- pip3 install mypy
- mypy mat2 libmat2/*.py --ignore-missing-imports
- mypy --ignore-missing-imports ./nautilus/mat2.py
tests:debian:
stage: test
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report -m --include 'libmat2/*'
tests:fedora:
image: fedora
stage: test
script:
- dnf install -y python3 python3-mutagen python3-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2-modules cairo-gobject cairo python3-cairo perl-Image-ExifTool mailcap
- gdk-pixbuf-query-loaders-64 > /usr/lib64/gdk-pixbuf-2.0/2.10.0/loaders.cache
- python3 setup.py test
tests:archlinux:
image: archlinux/base
stage: test
script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib python-cairo perl-image-exiftool python-setuptools mailcap
- python3 setup.py test
[FORMAT]
good-names=e,f,i,x,s
max-locals=20
[MESSAGES CONTROL]
disable=
fixme,
invalid-name,
missing-docstring,
protected-access,
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
# 0.3.1 - 2018-09-01
- Document how to install MAT2 for various distributions
- Fix various typos in the documentation/comments
- Add ArchLinux to the CI to ensure that MAT2 is running on it
- Fix the handling of files with a name ending in `.JPG`
- Improve the detection of unsupported extensions in upper-case
- Streamline MAT2's logging
# 0.3.0 - 2018-08-03
- Add a check for missing dependencies
- Add Nautilus extension
- Minor code simplifications
- Improve our linters' coverage
- Add a manpage
- Add folder/multiple files related tests
- Change the logo
# 0.2.0 - 2018-07-10
- Fix various crashes due to malformed files
- Simplify various code-paths
- Remove superfluous debug message
- Remove the `--check` option that was never implemented anyway
- Add a `-c` option to check for MAT2's dependencies
# 0.1.3 - 2018-07-06
- Improve MAT2 resilience against corrupted images
- Check that the minimal version of Poppler is available
- Simplify how MAT2 deals with office files
- Improve cleaning of office files
- Thumbnails are removed
- Revisions are removed
- Add support for plain text files
# 0.1.2 - 2018-06-21
- Rename some files to ease the packaging
- Add linters to the CI (mypy, bandit and pyflakes)
- Prevent exiftool-related parameter injections
- Improve MAT2's resilience against corrupted files
- Make MAT2 work on Fedora, thanks to @atenart
- Tighten the threat model
- Simplify and improve how office files are handled
# 0.1.1 - 2018-05-16
- Improve the cli usage
......
@@ -7,10 +7,27 @@ Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
and to send a pull-request. Please do check that everything is fine by running the
testsuite with `python3 -m unittest discover -v` before submitting one :)
If you're fixing a bug or adding a new feature, please add tests accordingly;
this will greatly improve the odds of your merge request getting merged.
If you're adding a new fileformat, please add tests for:
1. Getting metadata
2. Cleaning metadata
3. Raising `ValueError` upon a corrupted file
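For illustration, here is roughly what such a test class could look like. `FakeParser` is a made-up stand-in so the sketch is self-contained; a real test would exercise your new parser class against sample files in `tests/data/`:

```python
import unittest

# A stand-in parser, purely illustrative: a real one would subclass
# libmat2's AbstractParser and operate on actual test files.
class FakeParser:
    def __init__(self, filename):
        if filename.endswith('.corrupted'):
            raise ValueError  # invalid files must raise ValueError
        self.filename = filename
        self.output_filename = filename + '.cleaned'
        self._meta = {'Author': 'someone'}

    def get_meta(self):
        return dict(self._meta)

    def remove_all(self):
        self._meta = {}
        return True

class TestFakeParser(unittest.TestCase):
    def test_get_meta(self):  # 1. getting metadata
        self.assertIn('Author', FakeParser('x.ext').get_meta())

    def test_remove_all(self):  # 2. cleaning metadata
        p = FakeParser('x.ext')
        self.assertTrue(p.remove_all())
        self.assertEqual(p.get_meta(), {})

    def test_corrupted_file(self):  # 3. ValueError upon a corrupted file
        with self.assertRaises(ValueError):
            FakeParser('x.corrupted')
```

Running `python3 -m unittest discover -v` will pick such a class up automatically.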
Since MAT2 is written in Python3, please conform as much as possible to the
[pep8]( https://pep8.org/ ) style, except where it makes no sense, of course.
# Doing a release
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat.1)
5. Commit the changelog, man page, mat2 and setup.py files
6. Create a tag with `git tag -s $VERSION`
7. Push the commit with `git push origin master`
8. Push the tag with `git push --tags`
9. Tell the [downstreams](https://0xacab.org/jvoisin/mat2/blob/master/INSTALL.md) about it
10. Do the secret release dance
# GNU/Linux
## Fedora
Thanks to [atenart](https://ack.tf/), there is a package available on
[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
We use copr (Cool Other Package Repo) because the MAT2 Nautilus plugin depends on
python3-nautilus, which isn't available yet in Fedora (but is distributed
through this copr).
First you need to enable MAT2's copr:
```
dnf -y copr enable atenart/mat2
```
Then you can install both the MAT2 command and the Nautilus extension:
```
dnf -y install mat2 mat2-nautilus
```
## Debian
There is currently no package for Debian. If you want to help make this
happen, there is an [issue](https://0xacab.org/jvoisin/mat2/issues/16) open.
But fear not, there is a way to install it *manually*:
```
# apt install python3-mutagen python3-gi-cairo gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl gir1.2-glib-2.0 gir1.2-poppler-0.18
$ git clone https://0xacab.org/jvoisin/mat2.git
$ cd mat2
$ ./mat2
```
and if you want to install the über-fancy Nautilus extension:
```
# apt install python-gi-dev
$ git clone https://github.com/GNOME/nautilus-python
$ cd nautilus-python
$ PYTHON=/usr/bin/python3 ./autogen.sh
$ make
# make install
$ cp ./nautilus/mat2.py ~/.local/share/nautilus-python/extensions/
$ PYTHONPATH=/home/$USER/mat2 PYTHON=/usr/bin/python3 nautilus
```
## Arch Linux
Thanks to [Francois_B](https://www.sciunto.org/), there is a package available on
[Arch Linux's AUR](https://aur.archlinux.org/packages/mat2/).
```
_____ _____ _____ ___
| | _ |_ _|_ | Keep your data,
| | | | | | | | _| trash your meta!
|_|_|_|__|__| |_| |___|
```
[![pipeline status](https://0xacab.org/jvoisin/mat2/badges/master/pipeline.svg)](https://0xacab.org/jvoisin/mat2/commits/master)
[![coverage report](https://0xacab.org/jvoisin/mat2/badges/master/coverage.svg)](https://0xacab.org/jvoisin/mat2/commits/master)
This software is currently in **beta**, please don't use it for anything
critical.
# Metadata and privacy
Metadata consist of information that characterizes data.
Metadata are used to provide documentation for data products.
In essence, metadata answer who, what, when, where, why, and how about
every facet of the data that are being documented.
Metadata within a file can tell a lot about you.
Cameras record data about when a picture was taken and what
camera was used. Office documents like PDF or Office files automatically add
author and company information to documents and spreadsheets.
Maybe you don't want to disclose that information on the web.
This is precisely the job of MAT2: getting rid, as much as possible, of
metadata.
@@ -36,29 +33,59 @@ metadata.
- `libimage-exiftool-perl` for everything else
Please note that MAT2 requires at least Python3.5, meaning that it
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3),
# Running the test suite
```bash
$ python3 -m unittest discover -v
```
# Supported formats
# How to use MAT2
```bash
$ python3 ./mat2 -h
usage: mat2 [-h] [-v] [-l] [-s | -L] [files [files ...]]
Metadata anonymisation toolkit 2
positional arguments:
files
optional arguments:
-h, --help show this help message and exit
-v, --version show program's version number and exit
-l, --list list all supported fileformats
-s, --show list all the harmful metadata of a file without removing
them
-L, --lightweight remove SOME metadata
```
# Notes about detecting metadata
While MAT2 is doing its very best to display metadata when the `--show` flag is
passed, it doesn't mean that a file is clean from any metadata if MAT2 doesn't
show any. There is no reliable way to detect every single possible metadata for
complex file formats.
This is why you shouldn't rely on metadata's presence to decide if your file must
be cleaned or not.
# Related software
- The first iteration of [MAT](https://mat.boum.org)
- [Exiftool](https://sno.phy.queensu.ca/~phil/exiftool/)
- [pdf-redact-tools](https://github.com/firstlookmedia/pdf-redact-tools), that
tries to deal with *printer dots* too.
- [pdfparanoia](https://github.com/kanzure/pdfparanoia), that removes
watermarks from PDF.
# Contact
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues).
If you think that a more private contact is needed (e.g. for reporting security issues),
you can email Julien (jvoisin) Voisin at `julien.voisin+mat@dustri.org`,
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
# License
This program is free software: you can redistribute it and/or modify
@@ -75,13 +102,14 @@ You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
Copyright 2016 Marie Rose for MAT2's logo
# Thanks
MAT2 wouldn't exist without:
- the [Google Summer of Code](https://summerofcode.withgoogle.com/);
- the fine people from [Tails]( https://tails.boum.org);
- friends
Many thanks to them!
......
data/mat2.png: binary image changed (3.13 KiB → 235 KiB)
@@ -6,9 +6,19 @@ Lightweight cleaning mode
Due to *popular* request, MAT2 provides a *lightweight* cleaning mode
that only cleans the superficial metadata of your file, but not
the ones that might be in **embedded** resources, like
images in a PDF or an office document.
Revisions handling
------------------
Revisions are handled according to the principle of least astonishment: they are entirely removed.
- Either the users aren't aware of the revisions, and thus they should be deleted; for example, journalists editing a document to erase mentions of their sources.
- Or they are aware of them, and will likely not expect MAT2 to keep the revisions, which are basically traces of how, when and by whom the document was edited.
Race conditions
---------------
......
.TH MAT2 "1" "September 2018" "MAT2 0.3.1" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
.SH SYNOPSIS
mat2 [\-h] [\-v] [\-l] [\-c] [\-s | \-L]\fR [files [files ...]]
.SH DESCRIPTION
.B mat2
removes metadata from files. It supports a wide variety of file
formats: audio, office, images, …
.SH OPTIONS
.SS "positional arguments:"
.TP
\fBfiles\fR
the files to process
.SS "optional arguments:"
.TP
\fB\-h\fR, \fB\-\-help\fR
show this help message and exit
.TP
\fB\-v\fR, \fB\-\-version\fR
show program's version number and exit
.TP
\fB\-l\fR, \fB\-\-list\fR
list all supported fileformats
.TP
\fB\-c\fR, \fB\-\-check\-dependencies\fR
check if MAT2 has all the dependencies it needs
.TP
\fB\-s\fR, \fB\-\-show\fR
list harmful metadata detectable by MAT2 without
removing them
.TP
\fB\-L\fR, \fB\-\-lightweight\fR
remove SOME metadata
.SH EXAMPLES
To remove all the metadata from a PDF file:
.PP
.nf
.RS
mat2 ./myfile.pdf
.RE
.fi
.PP
.SH BUGS
While mat2 does its very best to remove every single piece of metadata,
it's still in beta, and \fBsome\fR might remain. Should you encounter
any issues, check the bugtracker: https://0xacab.org/jvoisin/mat2/issues
.PP
Please use accordingly and be careful.
.SH AUTHOR
This software was made by Julien (jvoisin) Voisin with the support of the Tails project.
.SH COPYRIGHT
This software is released under the LGPLv3.
.SH "SEE ALSO"
.BR exiftool (1p)
.BR pdf-redact-tools (1)
Threat Model
============
The Metadata Anonymisation Toolkit 2 adversary has a number
of goals, capabilities, and counter-attack types that can be
used to guide us towards a set of requirements for the MAT2.
@@ -13,17 +14,19 @@ Mat only removes standard metadata from your files, it does _not_:
- anonymise their content (the substance and the form)
- handle watermarking
- handle steganography nor homoglyphs
- handle stylometry
- handle any non-standard metadata field/system
- handle file-system related metadata
If you really want to be anonymous, use a format that does not contain any
metadata, or better: use plain-text ASCII without trailing spaces.
And as usual, think twice before clicking.
Adversary
---------
* Goals:
@@ -40,17 +43,18 @@ Adversary
to directly identify the author and/or source, his next
goal is to determine the source of the equipment used
to produce, copy, and transmit the document. This can
include the model of camera used to take a photo or a film,
which software was used to produce an office document, …
* Adversary Capabilities - Positioning
- The adversary created the document specifically for this
user. This is the strongest position for the adversary to
have. In this case, the adversary is capable of inserting
arbitrary, custom watermarks specifically for tracking
the user. In general, MAT2 cannot defend against this
adversary, but we list it for completeness' sake.
- The adversary created the document for a group of users.
In this case, the adversary knows that they attempted to
@@ -58,30 +62,39 @@ Adversary
or may not have watermarked the document for these
users, but they certainly know the format used.
- The adversary did not create the document, the weakest
position for the adversary to have. The file format is
(most of the time) standard, nothing custom is added:
MAT2 must be able to remove all metadata from the file.
Requirements
------------
* Processing
- MAT2 *should* avoid interactions with information.
Its goal is to remove metadata, and the user is solely
responsible for the information of the file.
- MAT2 *must* warn when encountering an unknown
format. For example, in a zipfile, if MAT2 encounters an
unknown format, it should warn the user, and ask if the
file should be added to the anonymised archive that is
produced.
- MAT2 *must* not add metadata, since its purpose is to
anonymise files: every added items of metadata decreases
anonymity.
- MAT2 *should* handle unknown/hidden metadata fields,
like proprietary extensions of open formats.
- MAT2 *must not* fail silently. Upon failure,
MAT2 *must not* modify the file in any way.
- MAT2 *might* leak the fact that it was used on a file: since
it might be uncommon for some file formats to come without any
kind of metadata, an adversary might suspect that the user
used MAT2 on certain files.
#!/usr/bin/env python3
import os
import collections
import importlib
from typing import Dict, Optional
# make pyflakes happy
assert Dict
# A set of extensions that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = {
'.asc',
'.bat',
'.brf',
'.c',
'.h',
'.ksh',
'.pl',
'.pot',
'.rdf',
'.srt',
'.wsdl',
'.xpdl',
'.xsd',
'.xsl',
}
DEPENDENCIES = {
'cairo': 'Cairo',
'gi': 'PyGobject',
'gi.repository.GdkPixbuf': 'GdkPixbuf from PyGobject',
'gi.repository.Poppler': 'Poppler from PyGobject',
'gi.repository.GLib': 'GLib from PyGobject',
'mutagen': 'Mutagen',
}
def _get_exiftool_path() -> Optional[str]:
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK): # pragma: no cover
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK): # pragma: no cover
return exiftool_path
return None
def check_dependencies() -> dict:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
ret['Exiftool'] = True if _get_exiftool_path() else False
for key, value in DEPENDENCIES.items():
ret[value] = True
try:
importlib.import_module(key)
except ImportError: # pragma: no cover
ret[value] = False # pragma: no cover
return ret
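The probe pattern above can be exercised standalone. The sketch below re-implements it with a made-up two-entry mapping (the module names are illustrative, not MAT2's real `DEPENDENCIES` dict):

```python
import collections
import importlib
from typing import Dict

# Illustrative mapping: importable module name -> human-readable label
FAKE_DEPENDENCIES = {
    'json': 'json (stdlib, always present)',
    'no_such_module_xyz': 'A module that is certainly missing',
}

def check_fake_dependencies() -> Dict[str, bool]:
    ret = collections.defaultdict(bool)  # type: Dict[str, bool]
    for key, value in FAKE_DEPENDENCIES.items():
        ret[value] = True  # assume present, flip to False on ImportError
        try:
            importlib.import_module(key)
        except ImportError:
            ret[value] = False
    return ret

for name, found in sorted(check_fake_dependencies().items()):
    print('%s: %s' % (name, 'ok' if found else 'MISSING'))
```

This mirrors how the `-c` option can report which dependencies are installed.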
import abc
import os
from typing import Set, Dict
assert Set # make pyflakes happy
class AbstractParser(abc.ABC):
""" This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
def __init__(self, filename: str) -> None:
"""
:raises ValueError: Raised upon an invalid file
"""
self.filename = filename
fname, extension = os.path.splitext(filename)
self.output_filename = fname + '.cleaned' + extension
@abc.abstractmethod
def get_meta(self) -> Dict[str, str]:
pass # pragma: no cover
@abc.abstractmethod
def remove_all(self) -> bool:
pass # pragma: no cover
def remove_all_lightweight(self) -> bool:
""" Remove _SOME_ metadata. """
""" This method removes _SOME_ metadata.
It might be useful to implement it for fileformats that do
not support non-destructive cleaning.
"""
return self.remove_all()
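To make the contract above concrete, here is a minimal concrete subclass. The `AbstractParser` mirror and the `NoopParser` are illustrative sketches so the example runs on its own; they are not code shipped with MAT2:

```python
import abc
import os
from typing import Dict

class AbstractParser(abc.ABC):
    """Mirror of the base class above, reproduced so the example runs."""
    def __init__(self, filename: str) -> None:
        self.filename = filename
        fname, extension = os.path.splitext(filename)
        self.output_filename = fname + '.cleaned' + extension

    @abc.abstractmethod
    def get_meta(self) -> Dict[str, str]:
        pass

    @abc.abstractmethod
    def remove_all(self) -> bool:
        pass

    def remove_all_lightweight(self) -> bool:
        return self.remove_all()

class NoopParser(AbstractParser):
    """A parser for a hypothetical metadata-free format."""
    def get_meta(self) -> Dict[str, str]:
        return {}

    def remove_all(self) -> bool:
        return True

p = NoopParser('/tmp/report.pdf')
print(p.output_filename)  # /tmp/report.cleaned.pdf
```

Note how `remove_all_lightweight` falls back to `remove_all` when a parser doesn't override it.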
@@ -6,6 +6,13 @@ from . import abstract
class MutagenParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
try:
mutagen.File(self.filename)
except mutagen.MutagenError:
raise ValueError
def get_meta(self):
f = mutagen.File(self.filename)
if f.tags:
@@ -36,4 +43,4 @@ class OGGParser(MutagenParser):
class FLACParser(MutagenParser):
mimetypes = {'audio/flac', 'audio/x-flac'}
import shutil
from typing import Dict
from . import abstract
class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'}
def get_meta(self) -> Dict[str, str]:
return dict()
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
return True
import subprocess
import imghdr
import json
import os
import shutil
import tempfile
import re
from typing import Set
import cairo
@@ -8,10 +13,44 @@ import gi
gi.require_version('GdkPixbuf', '2.0')
from gi.repository import GdkPixbuf
from . import abstract
from . import abstract, _get_exiftool_path
# Make pyflakes happy
assert Set
class PNGParser(abstract.AbstractParser):
class _ImageParser(abstract.AbstractParser):
""" Since we use `exiftool` to get metadata from
all images fileformat, `get_meta` is implemented in this class,
and all the image-handling ones are inheriting from it."""
meta_whitelist = set() # type: Set[str]
@staticmethod
def __handle_problematic_filename(filename: str, callback) -> str:
""" This method takes a filename with a problematic name,
and safely applies it a `callback`."""
tmpdirname = tempfile.mkdtemp()
fname = os.path.join(tmpdirname, "temp_file")
shutil.copy(filename, fname)
out = callback(fname)
shutil.rmtree(tmpdirname)
return out
def get_meta(self):
""" There is no way to escape the leading(s) dash(es) of the current
self.filename to prevent parameter injections, so we need to take care
of this.
"""
fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
if re.search('^[a-z0-9/]', self.filename) is None:
out = self.__handle_problematic_filename(self.filename, fun)
else:
out = fun(self.filename)
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist:
meta.pop(key, None)
return meta
class PNGParser(_ImageParser):
mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
@@ -28,40 +67,34 @@ class PNGParser(abstract.AbstractParser):
except MemoryError:
raise ValueError
def remove_all(self):
surface = cairo.ImageSurface.create_from_png(self.filename)
surface.write_to_png(self.output_filename)
return True
class GdkPixbufAbstractParser(abstract.AbstractParser):
class GdkPixbufAbstractParser(_ImageParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of removing metadata completely.
this has the side-effect of completely removing metadata.
"""
_type = ''
def remove_all(self):
_, extension = os.path.splitext(self.filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
if extension.lower() == '.jpg':
extension = '.jpeg' # gdk is picky
pixbuf.savev(self.output_filename, extension[1:], [], [])
return True
def __init__(self, filename):
super().__init__(filename)
if imghdr.what(filename) != self._type: # better safe than sorry
raise ValueError
class JPGParser(GdkPixbufAbstractParser):
_type = 'jpeg'
mimetypes = {'image/jpeg'}
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
@@ -74,6 +107,7 @@ class JPGParser(GdkPixbufAbstractParser):
class TiffParser(GdkPixbufAbstractParser):
_type = 'tiff'
mimetypes = {'image/tiff'}
meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
'FillOrder', 'PhotometricInterpretation',
@@ -84,18 +118,3 @@ class TiffParser(GdkPixbufAbstractParser):
'FilePermissions', 'FileSize', 'FileType',
'FileTypeExtension', 'ImageHeight', 'ImageSize',
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
class BMPParser(GdkPixbufAbstractParser):
mimetypes = {'image/x-ms-bmp'}
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
'FileSize', 'FileModifyDate', 'FileAccessDate',
'FileInodeChangeDate', 'FilePermissions', 'FileType',
'FileTypeExtension', 'MIMEType', 'BMPVersion',
'ImageWidth', 'ImageHeight', 'Planes', 'BitDepth',
'Compression', 'ImageLength', 'PixelsPerMeterX',
'PixelsPerMeterY', 'NumColors', 'NumImportantColors',
'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
'ColorSpace', 'RedEndpoint', 'GreenEndpoint',
'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue',
'ImageSize', 'Megapixels'}
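Two ideas from this file can be sketched in isolation: the whitelist approach (every key *not* explicitly whitelisted is treated as potentially harmful and reported) and the filename check guarding against exiftool parameter injection. The metadata dictionary below is made up for illustration; it only mimics the shape of exiftool's JSON output:

```python
import re

# Stand-in for exiftool's JSON output on a JPEG (illustrative values)
raw_meta = {
    'SourceFile': 'photo.jpg',
    'FileName': 'photo.jpg',
    'MIMEType': 'image/jpeg',
    'Model': 'ACME Cam 3000',      # harmful: identifies the camera
    'GPSPosition': '48.85, 2.35',  # harmful: identifies the location
}

meta_whitelist = {'SourceFile', 'FileName', 'MIMEType'}

# Drop every whitelisted (harmless) key; only harmful keys remain
meta = dict(raw_meta)
for key in meta_whitelist:
    meta.pop(key, None)
print(meta)

# The same regex as in `get_meta` above: filenames that don't start with a
# plain lowercase letter, digit, or slash are routed through a temporary
# copy, so a name like '-recurse' can't be parsed as an exiftool flag.
assert re.search('^[a-z0-9/]', '-recurse') is None
assert re.search('^[a-z0-9/]', '/tmp/photo.jpg') is not None
```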
import os
import re
import shutil
import tempfile
import datetime
import zipfile
import logging
from typing import Dict, Set, Pattern
try: # protect against DoS
from defusedxml import ElementTree as ET # type: ignore
except ImportError:
import xml.etree.ElementTree as ET # type: ignore
from . import abstract, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
def _parse_xml(full_path: str):
""" This function parse XML, with namespace support. """
namespace_map = dict()
for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
namespace_map[key] = value
ET.register_namespace(key, value)
return ET.parse(full_path), namespace_map
class ArchiveBasedAbstractParser(abstract.AbstractParser):
""" Office files (.docx, .odt, …) are zipped files. """
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
files_to_keep = set() # type: Set[str]
# Those are the files that we _do not_ want to keep,
# no matter if they are supported or not.
files_to_omit = set() # type: Set[Pattern]
def __init__(self, filename):
super().__init__(filename)
try: # better fail here than later
zipfile.ZipFile(self.filename)
except zipfile.BadZipFile:
raise ValueError
def _specific_cleanup(self, full_path: str) -> bool:
""" This method can be used to apply specific treatment
to files present in the archive."""
# pylint: disable=unused-argument,no-self-use
return True # pragma: no cover
@staticmethod
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
zipinfo.create_system = 3 # Linux
zipinfo.comment = b''
zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
return zipinfo
@staticmethod
def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
metadata = {}
if zipinfo.create_system == 3: # this is Linux
pass
elif zipinfo.create_system == 2:
metadata['create_system'] = 'Windows'
else:
metadata['create_system'] = 'Weird'
if zipinfo.comment:
metadata['comment'] = zipinfo.comment # type: ignore
if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
return metadata
def remove_all(self) -> bool:
with zipfile.ZipFile(self.filename) as zin,\
zipfile.ZipFile(self.output_filename, 'w') as zout:
temp_folder = tempfile.mkdtemp()
for item in zin.infolist():
if item.filename[-1] == '/': # `is_dir` is added in Python3.6
continue # don't keep empty folders
zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, item.filename)
if self._specific_cleanup(full_path) is False:
shutil.rmtree(temp_folder)
os.remove(self.output_filename)
logging.warning("Something went wrong during deep cleaning of %s",
item.filename)
return False
if item.filename in self.files_to_keep:
# those files aren't supported, but we want to add them anyway
pass
elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
continue
else:
# supported files that we want to clean then add
tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
if not tmp_parser:
shutil.rmtree(temp_folder)
os.remove(self.output_filename)
logging.error("In file %s, element %s's format (%s) " +
"isn't supported",
self.filename, item.filename, mtype)
return False
tmp_parser.remove_all()
os.rename(tmp_parser.output_filename, full_path)
zinfo = zipfile.ZipInfo(item.filename) # type: ignore
clean_zinfo = self._clean_zipinfo(zinfo)
with open(full_path, 'rb') as f:
zout.writestr(clean_zinfo, f.read())
shutil.rmtree(temp_folder)
return True
class MSOfficeParser(ArchiveBasedAbstractParser):
mimetypes = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'application/vnd.openxmlformats-officedocument.presentationml.presentation'
}
files_to_keep = {
'[Content_Types].xml',
'_rels/.rels',
'word/_rels/document.xml.rels',
'word/document.xml',
'word/fontTable.xml',
'word/settings.xml',
'word/styles.xml',
}
files_to_omit = set(map(re.compile, { # type: ignore
'^docProps/',
}))
@staticmethod
def __remove_revisions(full_path: str) -> bool:
""" In this function, we're changing the XML document in several
different times, since we don't want to change the tree we're currently
iterating on.
"""
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError:
return False
# Revisions are either deletions (`w:del`) or
# insertions (`w:ins`)
del_presence = tree.find('.//w:del', namespace)
ins_presence = tree.find('.//w:ins', namespace)
if del_presence is None and ins_presence is None:
return True # No revisions are present
parent_map = {c:p for p in tree.iter() for c in p}
elements = list()
for element in tree.iterfind('.//w:del', namespace):
elements.append(element)
for element in elements:
parent_map[element].remove(element)
elements = list()
for element in tree.iterfind('.//w:ins', namespace):
for position, item in enumerate(tree.iter()): #pragma: no cover
if item == element:
for children in element.iterfind('./*'):
elements.append((element, position, children))
break
for (element, position, children) in elements:
parent_map[element].insert(position, children)
parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True)
return True
def _specific_cleanup(self, full_path: str) -> bool:
if full_path.endswith('/word/document.xml'):
# this file contains the revisions
return self.__remove_revisions(full_path)
return True
def get_meta(self) -> Dict[str, str]:
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
"""
metadata = {}
zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
try:
content = zipin.read(item).decode('utf-8')
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results:
metadata[key] = value
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
metadata[item.filename] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value
zipin.close()
return metadata
class LibreOfficeParser(ArchiveBasedAbstractParser):
mimetypes = {
'application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.spreadsheet',
'application/vnd.oasis.opendocument.presentation',
'application/vnd.oasis.opendocument.graphics',
'application/vnd.oasis.opendocument.chart',
'application/vnd.oasis.opendocument.formula',
'application/vnd.oasis.opendocument.image',
}
files_to_keep = {
'META-INF/manifest.xml',
'content.xml',
'manifest.rdf',
'mimetype',
'settings.xml',
'styles.xml',
}
files_to_omit = set(map(re.compile, { # type: ignore
r'^meta\.xml$',
'^Configurations2/',
'^Thumbnails/',
}))
    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError:
            return False

        if 'office' not in namespace.keys():  # no revisions in the current file
            return True

        for text in tree.getroot().iterfind('.//office:text', namespace):
            for changes in text.iterfind('.//text:tracked-changes', namespace):
                text.remove(changes)

        tree.write(full_path, xml_declaration=True)
        return True
    def _specific_cleanup(self, full_path: str) -> bool:
        if os.path.basename(full_path) == 'content.xml':
            return self.__remove_revisions(full_path)
        return True
    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.
        """
        metadata = {}
        zipin = zipfile.ZipFile(self.filename)
        for item in zipin.infolist():
            if item.filename == 'meta.xml':
                try:
                    content = zipin.read(item).decode('utf-8')
                    results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
                    for (key, value) in results:
                        metadata[key] = value
                except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
                    metadata[item.filename] = 'harmful content'
            for key, value in self._get_zipinfo_meta(item).items():
                metadata[key] = value
        zipin.close()
        return metadata
import glob
import os
import mimetypes
import importlib
from typing import TypeVar, List, Tuple, Optional

from . import abstract, UNSUPPORTED_EXTENSIONS

assert Tuple  # make pyflakes happy

T = TypeVar('T', bound='abstract.AbstractParser')


def __load_all_parsers():
    """ Loads every parser in a dynamic way """
    current_dir = os.path.dirname(__file__)
    for fname in glob.glob(os.path.join(current_dir, '*.py')):
        if fname.endswith('abstract.py'):
            continue
        elif fname.endswith('__init__.py'):
            continue
        basename = os.path.basename(fname)
        name, _ = os.path.splitext(basename)
        importlib.import_module('.' + name, package='libmat2')

__load_all_parsers()


def _get_parsers() -> List[T]:
    """ Get all our parsers!"""
    def __get_parsers(cls):
        return cls.__subclasses__() + \
            [g for s in cls.__subclasses__() for g in __get_parsers(s)]
    return __get_parsers(abstract.AbstractParser)
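The recursive `__get_parsers` helper walks the whole subclass tree of `AbstractParser`, not just its direct children. A standalone sketch with toy classes standing in for the real parser hierarchy (`get_all` is a hypothetical rename of the inner helper):

```python
# Toy class hierarchy mimicking AbstractParser and its descendants.
class AbstractParser:
    pass

class ArchiveBased(AbstractParser):
    pass

class OfficeParser(ArchiveBased):
    pass

def get_all(cls):
    # Direct subclasses, plus the subclasses of each subclass, recursively.
    return cls.__subclasses__() + \
        [g for s in cls.__subclasses__() for g in get_all(s)]

print([c.__name__ for c in get_all(AbstractParser)])
```

This is why it is enough for `__load_all_parsers` to merely import each module: defining a subclass anywhere registers it automatically via `__subclasses__()`.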
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
    mtype, _ = mimetypes.guess_type(filename)

    _, extension = os.path.splitext(filename)
    if extension.lower() in UNSUPPORTED_EXTENSIONS:
        return None, mtype

    for parser_class in _get_parsers():  # type: ignore
        if mtype in parser_class.mimetypes:
            try:
                return parser_class(filename), mtype
            except ValueError:
                return None, mtype
    return None, mtype
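`get_parser` keys its dispatch entirely off the MIME type guessed from the file name. The two standard-library calls it builds on behave like this (`report.pdf` is a made-up input):

```python
# How the dispatch above derives its lookup keys, using only the stdlib.
import mimetypes
import os

filename = 'report.pdf'  # hypothetical input file name
mtype, _ = mimetypes.guess_type(filename)
_, extension = os.path.splitext(filename)
print(mtype, extension)  # the MIME type is matched against parser_class.mimetypes
```

Note that `guess_type` never opens the file: a misnamed file is dispatched by its extension, which is why the parsers still validate their input and may raise `ValueError`.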
import re
import logging
import tempfile
import io
from distutils.version import LooseVersion

import cairo
import gi
from gi.repository import Poppler, GLib

from . import abstract

logging.basicConfig(level=logging.DEBUG)

poppler_version = Poppler.get_version()
if LooseVersion(poppler_version) < LooseVersion('0.46'):  # pragma: no cover
    raise ValueError("MAT2 needs at least Poppler version 0.46 to work. \
The installed version is %s." % poppler_version)
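The `LooseVersion` guard exists because comparing version strings lexicographically gives wrong answers. A dependency-free sketch of the same idea (the `vtuple` helper is hypothetical, standing in for what `LooseVersion` does internally):

```python
# Why a numeric version comparison is needed for the Poppler check above.
def vtuple(version: str) -> tuple:
    # Split '0.46' into (0, 46) so components compare as integers.
    return tuple(int(part) for part in version.split('.'))

# Lexicographic string comparison mis-orders versions: '4' < '9' char-wise.
assert ('0.46' > '0.9') is False
# Component-wise numeric comparison gets it right: 46 > 9.
assert vtuple('0.46') > vtuple('0.9')
```

Poppler 0.46 is the first release whose rendering path works for this use case, hence the hard floor.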
class PDFParser(abstract.AbstractParser):

        pages_count = document.get_n_pages()

        tmp_path = tempfile.mkstemp()[1]
        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized later anyway
        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

        for pagenum in range(pages_count):
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
            width = int(page_width) * self.__scale
            height = int(page_height) * self.__scale
            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
            img_context = cairo.Context(img_surface)
            img_context.scale(self.__scale, self.__scale)
            pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()  # draw pdf_context on pdf_surface

        pdf_surface.finish()
            metadata[key] = document.get_property(key)
        if 'metadata' in metadata:
            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
            for key, value in parsed_meta.items():
                metadata[key] = value
        return metadata
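The explicit update loop at the end of `get_meta` is behavior-preserving with respect to a `{**metadata, **parsed_meta}` merge: in both cases parsed fields override raw ones. A toy check (all values made up):

```python
# The update loop above is equivalent to a right-biased dict merge.
metadata = {'format': 'PDF-1.5', 'producer': 'raw'}
parsed_meta = {'producer': 'cairo'}  # parsed value should win

merged = dict(metadata)
for key, value in parsed_meta.items():
    merged[key] = value

assert merged == {**metadata, **parsed_meta}
print(merged['producer'])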