Skip to content
Commits on Source (9)
......@@ -5,21 +5,35 @@ stages:
- linting
- test
.prepare_env: &prepare_env
before_script: # This is needed to not run the testsuite as root
- useradd --home-dir ${CI_PROJECT_DIR} mat2
- chown -R mat2 .
linting:bandit:
image: $CONTAINER_REGISTRY:linting
stage: linting
script: # TODO: remove B405 and B314
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
linting:codespell:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
# Run codespell to check for spelling errors; ignore errors about binary
# files, use a config with ignored words and exclude the git directory,
# which might contain false positives
- codespell -q 2 -I utils/ci/codespell/ignored_words.txt -S .git
linting:pylint:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- pylint3 --disable=no-else-return --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
# Once nautilus-python is in Debian, uncomment it from the line below
- pylint3 --disable=no-else-return --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
linting:pyflakes:
image: $CONTAINER_REGISTRY:linting
......@@ -44,16 +58,15 @@ tests:debian:
stage: test
script:
- apt-get -qqy purge bubblewrap
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report --fail-under=90 -m --include 'libmat2/*'
- python3 setup.py test
tests:debian_with_bubblewrap:
image: $CONTAINER_REGISTRY:debian
stage: test
allow_failure: true
<<: *prepare_env
script:
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
- su - mat2 -c "python3-coverage report --fail-under=100 -m --include 'libmat2/*'"
tests:fedora:
image: $CONTAINER_REGISTRY:fedora
......@@ -64,6 +77,6 @@ tests:fedora:
tests:gentoo:
image: $CONTAINER_REGISTRY:gentoo
stage: test
allow_failure: true
<<: *prepare_env
script:
- python3 -m unittest discover -v
- su - mat2 -c "python3 -m unittest discover -v"
# 0.10.0 - 2019-11-30
- Make mat2 work on Python3.8
- Minor improvement of ppt handling
- Minor improvement of odt handling
- Add an integration for KDE's file manager: Dolphin
- mat2 now copies file permissions on the cleaned files
- Add a flag to disable sandboxing
- Tighten a bit the sandboxing
- Improve handling of MSOffice documents
- Add support for inplace cleaning
- Better handling of mutually-exclusive arguments in the command line
- Add support for svg
- Add support for ppm
- Cleaned zip files are compressed by default
- Minor performance improvements when dealing with images/video files
- Better handling of optional dependencies
# 0.9.0 - 2019-05-10
- Add tar/tar.gz/tar.bz2/tar.xz archives support
......@@ -67,12 +85,12 @@
# 0.3.1 - 2018-09-01
- Document how to install MAT2 for various distributions
- Document how to install mat2 for various distributions
- Fix various typos in the documentation/comments
- Add ArchLinux to the CI to ensure that MAT2 is running on it
- Add ArchLinux to the CI to ensure that mat2 is running on it
- Fix the handling of files with a name ending in `.JPG`
- Improve the detection of unsupported extensions in upper-case
- Streamline MAT2's logging
- Streamline mat2's logging
# 0.3.0 - 2018-08-03
......@@ -92,14 +110,14 @@
- Simplify various code-paths
- Remove superfluous debug message
- Remove the `--check` option that never was implemented anyway
- Add a `-c` option to check for MAT2's dependencies
- Add a `-c` option to check for mat2's dependencies
# 0.1.3 - 2018-07-06
- Improve MAT2 resilience against corrupted images
- Improve mat2 resilience against corrupted images
- Check that the minimal version of Poppler is available
- Simplify how MAT2 deals with office files
- Simplify how mat2 deals with office files
- Improve cleaning of office files
- Thumbnails are removed
- Revisions are removed
......@@ -111,8 +129,8 @@
- Rename some files to ease the packaging
- Add linters to the CI (mypy, bandit and pyflakes)
- Prevent exiftool-related parameter injections
- Improve MAT2's resilience against corrupted files
- Make MAT2 work on fedora, thanks to @atenart
- Improve mat2's resilience against corrupted files
- Make mat2 work on fedora, thanks to @atenart
- Tighten the threat model
- Simplify and improve how office files are handled
......
# Contributing to MAT2
# Contributing to mat2
The main repository for MAT2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
......@@ -16,7 +16,7 @@ If you're adding a new fileformat, please add tests for:
2. Cleaning metadata
3. Raising `ValueError` upon a corrupted file
Since MAT2 is written in Python3, please conform as much as possible to the
Since mat2 is written in Python3, please conform as much as possible to the
[pep8]( https://pep8.org/ ) style; except where it makes no sense of course.
# Doing a release
......
......@@ -14,7 +14,16 @@ pip3 install mat2
## Optional dependencies
When [bubblewrap](https://github.com/projectatomic/bubblewrap) is
installed, MAT2 uses it to sandbox any external processes it invokes.
installed, mat2 uses it to sandbox any external processes it invokes.
## Arch Linux
Thanks to [Francois_B](https://www.sciunto.org/), there is a package available on
[Arch linux's AUR](https://aur.archlinux.org/packages/mat2/).
## Debian
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all).
## Fedora
......@@ -37,40 +46,6 @@ Then you can install both the Mat2 command and Nautilus extension:
dnf -y install mat2 mat2-nautilus
```
## Debian
There a package available in Debian *buster/sid*. The package [doesn't include
the Nautilus extension yet](https://bugs.debian.org/910491).
For Debian 9 *stretch*, there is a way to install it *manually*:
```
# apt install python3-mutagen python3-gi-cairo gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl gir1.2-glib-2.0 gir1.2-poppler-0.18 ffmpeg
# apt install bubblewrap # if you want sandboxing
$ git clone https://0xacab.org/jvoisin/mat2.git
$ cd mat2
$ ./mat2
```
and if you want to install the über-fancy Nautilus extension:
```
# apt install gnome-common gtk-doc-tools libnautilus-extension-dev python-gi-dev python3-dev build-essential
$ git clone https://github.com/GNOME/nautilus-python
$ cd nautilus-python
$ PYTHON=/usr/bin/python3 ./autogen.sh
$ make
# make install
$ mkdir -p ~/.local/share/nautilus-python/extensions/
$ cp ../nautilus/mat2.py ~/.local/share/nautilus-python/extensions/
$ PYTHONPATH=/home/$USER/mat2 PYTHON=/usr/bin/python3 nautilus
```
## Arch Linux
Thanks to [Francois_B](https://www.sciunto.org/), there is an package available on
[Arch linux's AUR](https://aur.archlinux.org/packages/mat2/).
## Gentoo
MAT2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
......@@ -20,22 +20,25 @@ Metadata within a file can tell a lot about you.
Cameras record data about when a picture was taken and what
camera was used. Office documents like PDF or Office automatically add
author and company information to documents and spreadsheets.
Maybe you don't want to disclose those information on the web.
Maybe you don't want to disclose that information.
This is precisely the job of MAT2: getting rid, as much as possible, of
This is precisely the job of mat2: getting rid, as much as possible, of
metadata.
mat2 provides both a command line tool, and a graphical user interface
via an extension for Nautilus, the default file manager of GNOME.
# Requirements
- `python3-mutagen` for audio support
- `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
- `gir1.2-gdkpixbuf-2.0` for images support
- `gir1.2-rsvg-2.0` for svg support
- `FFmpeg`, optionally, for video support
- `libimage-exiftool-perl` for everything else
- `bubblewrap`, optionally, for sandboxing
Please note that MAT2 requires at least Python3.5, meaning that it
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3).
Please note that mat2 requires at least Python3.5.
# Running the test suite
......@@ -50,7 +53,7 @@ $ python3-coverage run --branch -m unittest discover -s tests/
$ python3-coverage report --fail-under=100 -m --include 'libmat2/*'
```
# How to use MAT2
# How to use mat2
```bash
usage: mat2 [-h] [-v] [-l] [--check-dependencies] [-V]
......@@ -66,25 +69,25 @@ optional arguments:
-h, --help show this help message and exit
-v, --version show program's version number and exit
-l, --list list all supported fileformats
--check-dependencies check if MAT2 has all the dependencies it needs
--check-dependencies check if mat2 has all the dependencies it needs
-V, --verbose show more verbose status information
--unknown-members policy
how to handle unknown members of archive-style files
(policy should be one of: abort, omit, keep) [Default:
abort]
-s, --show list harmful metadata detectable by MAT2 without
-s, --show list harmful metadata detectable by mat2 without
removing them
-L, --lightweight remove SOME metadata
```
Note that MAT2 **will not** clean files in-place, but will produce, for
Note that mat2 **will not** clean files in-place, but will produce, for
example, with a file named "myfile.png" a cleaned version named
"myfile.cleaned.png".
# Notes about detecting metadata
While MAT2 is doing its very best to display metadata when the `--show` flag is
passed, it doesn't mean that a file is clean from any metadata if MAT2 doesn't
While mat2 is doing its very best to display metadata when the `--show` flag is
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
show any. There is no reliable way to detect every single possible metadata for
complex file formats.
......@@ -134,12 +137,15 @@ GNU General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
Copyright 2016 Marie Rose for MAT2's logo
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
Copyright 2016 Marie-Rose for mat2's logo
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
# Thanks
MAT2 wouldn't exist without:
mat2 wouldn't exist without:
- the [Google Summer of Code](https://summerofcode.withgoogle.com/);
- the fine people from [Tails]( https://tails.boum.org);
......
mat2 (0.10.0-1) unstable; urgency=medium
* New upstream release.
* d/control:
- Build-depend on python3-all, instead of python3, to provide Python 3.8.
- Depend on gir1.2-rsvg-2.0 to handle .svg files.
- Extend description: Mention new support for .ppm and .svg files, and add
a note about the new Dolphin, the KDE file manager, service menu.
- Improve description: Use consistent naming scheme (mat2 vs MAT2).
* d/mat2.install:
- Ship logo files and the KDE Dolphin service menu.
-- Georg Faerber <georg@debian.org> Sun, 01 Dec 2019 12:59:52 +0000
mat2 (0.9.0-2) unstable; urgency=medium
* d/control:
......
......@@ -10,8 +10,9 @@ Build-Depends: debhelper-compat (= 12),
ffmpeg,
gir1.2-gdkpixbuf-2.0,
gir1.2-poppler-0.18,
gir1.2-rsvg-2.0,
libimage-exiftool-perl,
python3,
python3-all,
python3-gi-cairo,
python3-mutagen,
python3-setuptools,
......@@ -27,6 +28,7 @@ Replaces: mat (<< 0.8.0-2~)
Architecture: all
Depends: gir1.2-gdkpixbuf-2.0,
gir1.2-poppler-0.18,
gir1.2-rsvg-2.0,
libimage-exiftool-perl,
python3-gi-cairo,
python3-mutagen,
......@@ -48,7 +50,7 @@ Description: Metadata anonymisation toolkit v2
.
Maybe you don't want to disclose those information.
.
MAT2 only removes metadata from your files, it does not anonymise their
mat2 only removes metadata from your files, it does not anonymise their
content, nor can it handle watermarking, steganography, or any too
custom metadata field/system.
.
......@@ -69,13 +71,16 @@ Description: Metadata anonymisation toolkit v2
- Ogg Vorbis (.ogg)
- Open Document (.odt, .odx, .ods, ...)
- Portable Document Fileformat (.pdf)
- Portable Pixmap Format (.ppm)
- Scalable Vector Graphics (.svg)
- Tape ARchive (.tar, .tar.bz2, .tar.gz, .tar.xz)
- Torrent (.torrent)
- Windows Media Video (.wmv)
- ZIP (.zip)
.
MAT2 provides both a command line tool, and a graphical user interface
via an extension for Nautilus, the default file manager of GNOME.
mat2 provides a command line tool, and graphical user interfaces
via a service menu for Dolphin, the default file manager of KDE, and
an extension for Nautilus, the default file manager of GNOME.
Package: mat
Depends: mat2,
......
#!/usr/bin/dh-exec
data/mat2.png => /usr/share/icons/hicolor/512x512/apps/mat2.png
data/mat2.svg => /usr/share/icons/hicolor/scalable/apps/mat2.svg
dolphin/mat2.desktop => /usr/share/kservices5/ServiceMenus/mat2.desktop
mat2 => /usr/bin/mat2
nautilus/mat2.py => /usr/share/nautilus-python/extensions/mat2.py
# Exiftool
mat2 is in fact using exiftool to extract metadata from files,
but not to remove them. The previous iteration of mat2, MAT,
was using exiftool to remove metadata, which led to several cases where
they weren't correctly removed, if at all.
For example, [Exiftool's documentation](https://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PDF.html)
states the following with regard to PDF:
> All metadata edits are reversible. While this would normally be considered an
advantage, it is a potential security problem because old information is never
actually deleted from the file.
To remove metadata, mat2 usually re-renders the file completely, eliminating
all possible original metadata. See the `implementation_notes.md` file for
details.
# jpegoptim, optipng, …
While designed to reduce as much as possible the size of pictures,
these programs can be used to remove metadata. They usually have very good
support for a single picture format, and can be used in place of mat2 for them.
# PDF Redact Tools
[PDF Redact Tools](https://github.com/firstlookmedia/pdf-redact-tools) is
a software developed by the people from [First Look
Media](https://firstlook.media/), the entity behind, amongst other things,
[The Intercept](https://theintercept.com/).
The tool uses roughly the same approach as mat2 to deal with PDF,
which is unfortunately the only fileformat that it does support.
It's interesting to note that it has counter-measures against
[yellow dots](https://en.wikipedia.org/wiki/Machine_Identification_Code),
a capacity that mat2 [doesn't possess yet](https://0xacab.org/jvoisin/mat2/issues/43).
# Exiv2
[Exiv2](https://www.exiv2.org/) was considered for mat2,
but it currently [misses a lot of metadata](https://0xacab.org/jvoisin/mat2/issues/85).
# Other non-open-source software/online services
There is a lot of closed-source software, and many online services, claiming to
remove metadata from your files, but since there is no way to actually verify
that they're effectively removing them, let alone that they aren't adding
unique markers, they shouldn't be used.
......@@ -4,7 +4,7 @@ Implementation notes
Lightweight cleaning mode
-------------------------
Due to *popular* request, MAT2 is providing a *lightweight* cleaning mode,
Due to *popular* request, mat2 is providing a *lightweight* cleaning mode,
that only cleans the superficial metadata of your file, but not
the ones that might be in **embedded** resources. Like for example,
images in a PDF or an office document.
......@@ -12,33 +12,49 @@ images in a PDF or an office document.
Revisions handling
------------------
Revisions are handled according to the principle of least astonishment: they are entirely removed.
Revisions are handled according to the principle of least astonishment: they
are entirely removed.
- Either the users aren't aware of the revisions, are thus they should be deleted. For example journalists that are editing a document to erase mentions sources mentions.
- Either the users aren't aware of the revisions, and thus they should be
deleted. For example, journalists who are editing a document to erase
mentions of their sources.
- Or they are aware of it, and will likely not expect MAT2 to be able to keep the revisions, that are basically traces about how, when and who edited the document.
- Or they are aware of it, and will likely not expect mat2 to be able to keep
the revisions, that are basically traces about how, when and who edited the
document.
Race conditions
---------------
MAT2 does its very best to avoid crashing at runtime. This is why it's checking
if the file is valid __at parser creation__. MAT2 doesn't take any measure to
mat2 does its very best to avoid crashing at runtime. This is why it's checking
if the file is valid __at parser creation__. mat2 doesn't take any measure to
ensure that the file is not changed between the time the parser is
instantiated, and the call to clean or show the metadata.
Symlink attacks
---------------
MAT2 output predictable filenames (like yourfile.jpg.cleaned).
mat2 outputs predictable filenames (like yourfile.cleaned.jpg).
This may lead to symlink attacks. Please check if your OS protects
against them.
Archives handling
-----------------
MAT2 doesn't support archives yet, because we haven't found an usable way to ask the user
what to do when a non-supported files are encountered.
By default, when cleaning a non-supported file format in an archive,
mat2 will abort with a detailed error message.
While strongly discouraged, it's possible to override this behaviour to force
the exclusion, or inclusion of unknown files into the cleaned archive.
While Python's [zipfile](https://docs.python.org/3/library/zipfile.html) module
provides a *safe* way to extract members of a zip archive, the
[tarfile](https://docs.python.org/3/library/tarfile.html) one doesn't,
meaning that it's up to mat2 to implement safety checks. Currently,
it defends against path-traversal, both relative and absolute,
symlink-related attacks, setuid/setgid attacks, duplicate members, block and
char devices, … but there might still be dragons lurking there.
PDF handling
------------
......@@ -49,10 +65,10 @@ didn't remove any *deep metadata*, like the ones in embedded pictures. This was
one of the reasons MAT was abandoned: the absence of a satisfying solution to
handle PDF. But apparently, people are ok with [pdf redact
tools](https://github.com/firstlookmedia/pdf-redact-tools), that simply
transform the PDF into images. So this is what's MAT2 is doing too.
transform the PDF into images. So this is what mat2 is doing too.
Of course, it would be possible to detect images in a PDF file, and process them
with MAT2, but since a PDF can contain a lot of things, like images, videos,
with mat2, but since a PDF can contain a lot of things, like images, videos,
javascript, pdf, blobs, … this is the easiest and safest way to clean them.
Images handling
......@@ -65,7 +81,7 @@ XML attacks
-----------
Since our threat model conveniently excludes files crafted to specifically
bypass MAT2, fileformats containing harmful XML are out of our scope.
But since MAT2 is using [etree](https://docs.python.org/3/library/xml.html#xml-vulnerabilities)
bypass mat2, fileformats containing harmful XML are out of our scope.
But since mat2 is using [etree](https://docs.python.org/3/library/xml.html#xml-vulnerabilities)
to process XML, it's "only" vulnerable to DoS, and not memory corruption:
odds are that the user will notice that the cleaning didn't succeed.
.TH MAT2 "1" "May 2019" "MAT2 0.9.0" "User Commands"
.TH mat2 "1" "November 2019" "mat2 0.10.0" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
......@@ -32,7 +32,7 @@ show program's version number and exit
list all supported fileformats
.TP
\fB\-\-check\-dependencies\fR
check if MAT2 has all the dependencies it needs
check if mat2 has all the dependencies it needs
.TP
\fB\-V\fR, \fB\-\-verbose\fR
show more verbose status information
......@@ -41,11 +41,16 @@ show more verbose status information
how to handle unknown members of archive-style files (policy should be one of: abort, omit, keep)
.TP
\fB\-s\fR, \fB\-\-show\fR
list harmful metadata detectable by MAT2 without
removing them
list harmful metadata detectable by mat2 without removing them
.TP
\fB\-L\fR, \fB\-\-lightweight\fR
remove SOME metadata
.TP
\fB\-\-no\-sandbox\fR
disable bubblewrap's sandboxing
.TP
\fB\-\-inplace\fR
clean in place, without backup
.SH EXAMPLES
To remove all the metadata from a PDF file:
......
......@@ -3,7 +3,7 @@ Threat Model
The Metadata Anonymisation Toolkit 2 adversary has a number
of goals, capabilities, and counter-attack types that can be
used to guide us towards a set of requirements for the MAT2.
used to guide us towards a set of requirements for the mat2.
This is an overhaul of MAT's (the first iteration of the software) one.
......@@ -53,7 +53,7 @@ Adversary
user. This is the strongest position for the adversary to
have. In this case, the adversary is capable of inserting
arbitrary, custom watermarks specifically for tracking
the user. In general, MAT2 cannot defend against this
the user. In general, mat2 cannot defend against this
adversary, but we list it for completeness' sake.
- The adversary created the document for a group of users.
......@@ -65,7 +65,7 @@ Adversary
- The adversary did not create the document, the weakest
position for the adversary to have. The file format is
(most of the time) standard, nothing custom is added:
MAT2 must be able to remove all metadata from the file.
mat2 must be able to remove all metadata from the file.
Requirements
......@@ -73,28 +73,28 @@ Requirements
* Processing
- MAT2 *should* avoid interactions with information.
- mat2 *should* avoid interactions with information.
Its goal is to remove metadata, and the user is solely
responsible for the information of the file.
- MAT2 *must* warn when encountering an unknown
format. For example, in a zipfile, if MAT encounters an
- mat2 *must* warn when encountering an unknown
format. For example, in a zipfile, if mat2 encounters an
unknown format, it should warn the user, and ask if the
file should be added to the anonymised archive that is
produced.
- MAT2 *must* not add metadata, since its purpose is to
- mat2 *must* not add metadata, since its purpose is to
anonymise files: every added item of metadata decreases
anonymity.
- MAT2 *should* handle unknown/hidden metadata fields,
- mat2 *should* handle unknown/hidden metadata fields,
like proprietary extensions of open formats.
- MAT2 *must not* fail silently. Upon failure,
MAT2 *must not* modify the file in any way.
- mat2 *must not* fail silently. Upon failure,
mat2 *must not* modify the file in any way.
- MAT2 *might* leak the fact that MAT2 was used on the file,
- mat2 *might* leak the fact that mat2 was used on the file,
since it might be uncommon for some file formats to come
without any kind of metadata, an adversary might suspect that
the user used MAT2 on certain files.
the user used mat2 on certain files.
Dolphin integration
===================
Thanks to [Miguel Marco](https://riemann.unizar.es/~mmarco/), here is a neat
integration for [Dolphin](https://kde.org/applications/system/org.kde.dolphin),
the KDE file manager:
1. Add the `mat2.desktop` file either in
- `/usr/share/kservices5/ServiceMenus/` to install it globally
- `~/.local/share/kservices5/ServiceMenus/` for a specific user
2. Run `kbuildsycoca5` to update the corresponding database
3. Enjoy your new contextual menu to remove metadata from your files!
[Desktop Entry]
X-KDE-ServiceTypes=KonqPopupMenu/Plugin
MimeType=application/pdf;application/vnd.oasis.opendocument.chart;application/vnd.oasis.opendocument.formula;application/vnd.oasis.opendocument.graphics;application/vnd.oasis.opendocument.image;application/vnd.oasis.opendocument.presentation;application/vnd.oasis.opendocument.spreadsheet;application/vnd.oasis.opendocument.text;application/vnd.openxmlformats-officedocument.presentationml.presentation;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;application/vnd.openxmlformats-officedocument.wordprocessingml.document;application/x-bittorrent;application/zip;audio/flac;audio/mpeg;audio/ogg;audio/x-flac;image/jpeg;image/png;image/tiff;image/x-ms-bmp;text/plain;video/mp4;video/x-msvideo;
Actions=cleanMetadata;
Type=Service
[Desktop Action cleanMetadata]
Name=Clean metadata
Name[es]=Limpiar metadatos
Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg
Exec=kdialog --yesno "$( mat2 -s %U )" --title "Clean Metadata?" && mat2 %U
#!/usr/bin/env python3
import collections
import enum
import importlib
from typing import Dict, Optional
from typing import Dict, Optional, Union
from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
assert Union
# A set of extension that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = {
......@@ -30,35 +30,65 @@ UNSUPPORTED_EXTENSIONS = {
}
DEPENDENCIES = {
'Cairo': 'cairo',
'PyGobject': 'gi',
'GdkPixbuf from PyGobject': 'gi.repository.GdkPixbuf',
'Poppler from PyGobject': 'gi.repository.Poppler',
'GLib from PyGobject': 'gi.repository.GLib',
'Mutagen': 'mutagen',
}
'Cairo': {
'module': 'cairo',
'required': True,
},
'PyGobject': {
'module': 'gi',
'required': True,
},
'GdkPixbuf from PyGobject': {
'module': 'gi.repository.GdkPixbuf',
'required': True,
},
'Poppler from PyGobject': {
'module': 'gi.repository.Poppler',
'required': True,
},
'GLib from PyGobject': {
'module': 'gi.repository.GLib',
'required': True,
},
'Mutagen': {
'module': 'mutagen',
'required': True,
},
}
CMD_DEPENDENCIES = {
'Exiftool': exiftool._get_exiftool_path,
'Ffmpeg': video._get_ffmpeg_path,
}
'Exiftool': {
'cmd': exiftool._get_exiftool_path,
'required': False,
},
'Ffmpeg': {
'cmd': video._get_ffmpeg_path,
'required': False,
},
}
def check_dependencies() -> Dict[str, bool]:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
def check_dependencies() -> Dict[str, Dict[str, bool]]:
ret = dict() # type: Dict[str, dict]
for key, value in DEPENDENCIES.items():
ret[key] = True
ret[key] = {
'found': True,
'required': value['required'],
}
try:
importlib.import_module(value)
importlib.import_module(value['module']) # type: ignore
except ImportError: # pragma: no cover
ret[key] = False # pragma: no cover
ret[key]['found'] = False
for k, v in CMD_DEPENDENCIES.items():
ret[k] = True
ret[k] = {
'found': True,
'required': v['required'],
}
try:
v()
v['cmd']() # type: ignore
except RuntimeError: # pragma: no cover
ret[k] = False
ret[k]['found'] = False
return ret
......
......@@ -32,6 +32,7 @@ class AbstractParser(abc.ABC):
self.output_filename = fname + '.cleaned' + extension
self.lightweight_cleaning = False
self.sandbox = True
@abc.abstractmethod
def get_meta(self) -> Dict[str, Union[str, dict]]:
......
......@@ -47,11 +47,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
self.archive_class = None # type: Optional[ArchiveClass]
self.member_class = None # type: Optional[ArchiveMember]
# We ignore typing here because mypy is too stupid
self.archive_class = None # type: ignore
self.member_class = None # type: ignore
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
# supported by mat2, but that we want to keep anyway.
self.files_to_keep = set() # type: Set[Pattern]
# Those are the files that we _do not_ want to keep,
......@@ -62,7 +63,9 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# the archive?
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
self.is_archive_valid()
# The LGTM comment is to mask a false-positive,
# see https://lgtm.com/projects/g/jvoisin/mat2/
self.is_archive_valid() # lgtm [py/init-calls-subclass]
def is_archive_valid(self):
"""Raise a ValueError is the current archive isn't a valid one."""
......@@ -80,28 +83,27 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return {} # pragma: no cover
@staticmethod
@abc.abstractstaticmethod
@abc.abstractmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
"""Return all the members of the archive."""
@staticmethod
@abc.abstractstaticmethod
@abc.abstractmethod
def _clean_member(member: ArchiveMember) -> ArchiveMember:
"""Remove all the metadata for a given member."""
@staticmethod
@abc.abstractstaticmethod
@abc.abstractmethod
def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
"""Return all the metadata of a given member."""
@staticmethod
@abc.abstractstaticmethod
@abc.abstractmethod
def _get_member_name(member: ArchiveMember) -> str:
"""Return the name of the given member."""
@staticmethod
@abc.abstractstaticmethod
def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
@abc.abstractmethod
def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember,
full_path: str):
"""Add the file at full_path to the archive, via the given member."""
......@@ -197,7 +199,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
logging.warning("In file %s, keeping unknown element %s (format: %s)",
self.filename, member_name, mtype)
else:
logging.error("In file %s, element %s's format (%s) " \
logging.error("In file %s, element %s's format (%s) "
"isn't supported",
self.filename, member_name, mtype)
abort = True
......@@ -313,8 +315,7 @@ class TarParser(ArchiveBasedAbstractParser):
metadata['gname'] = member.gname
return metadata
@staticmethod
def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember,
full_path: str):
assert isinstance(member, tarfile.TarInfo) # please mypy
assert isinstance(archive, tarfile.TarFile) # please mypy
......@@ -358,6 +359,7 @@ class ZipParser(ArchiveBasedAbstractParser):
super().__init__(filename)
self.archive_class = zipfile.ZipFile
self.member_class = zipfile.ZipInfo
self.zip_compression_type = zipfile.ZIP_DEFLATED
def is_archive_valid(self):
try:
......@@ -392,13 +394,13 @@ class ZipParser(ArchiveBasedAbstractParser):
return metadata
@staticmethod
def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember,
full_path: str):
assert isinstance(archive, zipfile.ZipFile) # please mypy
assert isinstance(member, zipfile.ZipInfo) # please mypy
with open(full_path, 'rb') as f:
archive.writestr(member, f.read())
archive.writestr(member, f.read(),
compress_type=self.zip_compression_type)
@staticmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
......
......@@ -18,6 +18,8 @@ __all__ = ['PIPE', 'run', 'CalledProcessError']
PIPE = subprocess.PIPE
CalledProcessError = subprocess.CalledProcessError
# pylint: disable=subprocess-run-check
def _get_bwrap_path() -> str:
bwrap_path = '/usr/bin/bwrap'
......@@ -49,9 +51,17 @@ def _get_bwrap_args(tempdir: str,
args = ro_bind_args + \
['--dev', '/dev',
'--proc', '/proc',
'--chdir', cwd,
'--unshare-all',
'--tmpfs', '/tmp',
'--unshare-user-try',
'--unshare-ipc',
'--unshare-pid',
'--unshare-net',
'--unshare-uts',
'--unshare-cgroup-try',
'--new-session',
'--cap-drop', 'all',
# XXX: enable --die-with-parent once all supported platforms have
# a bubblewrap recent enough to support it.
# '--die-with-parent',
......
import functools
import json
import logging
import os
import subprocess
from typing import Dict, Union, Set
from . import abstract
from . import subprocess
from . import bubblewrap
# Make pyflakes happy
assert Set
......@@ -18,9 +20,13 @@ class ExiftoolParser(abstract.AbstractParser):
meta_allowlist = set() # type: Set[str]
def get_meta(self) -> Dict[str, Union[str, dict]]:
out = subprocess.run([_get_exiftool_path(), '-json', self.filename],
input_filename=self.filename,
check=True, stdout=subprocess.PIPE).stdout
if self.sandbox:
out = bubblewrap.run([_get_exiftool_path(), '-json', self.filename],
input_filename=self.filename,
check=True, stdout=subprocess.PIPE).stdout
else:
out = subprocess.run([_get_exiftool_path(), '-json', self.filename],
check=True, stdout=subprocess.PIPE).stdout
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_allowlist:
meta.pop(key, None)
......@@ -28,8 +34,7 @@ class ExiftoolParser(abstract.AbstractParser):
def _lightweight_cleanup(self) -> bool:
if os.path.exists(self.output_filename):
try:
# exiftool can't force output to existing files
try: # exiftool can't force output to existing files
os.remove(self.output_filename)
except OSError as e: # pragma: no cover
logging.error("The output file %s is already existing and \
......@@ -48,14 +53,18 @@ class ExiftoolParser(abstract.AbstractParser):
'-o', self.output_filename,
self.filename]
try:
subprocess.run(cmd, check=True,
input_filename=self.filename,
output_filename=self.output_filename)
if self.sandbox:
bubblewrap.run(cmd, check=True,
input_filename=self.filename,
output_filename=self.output_filename)
else:
subprocess.run(cmd, check=True)
except subprocess.CalledProcessError as e: # pragma: no cover
logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
return False
return True
@functools.lru_cache()
def _get_exiftool_path() -> str: # pragma: no cover
possible_pathes = {
'/usr/bin/exiftool', # debian/fedora
......
import imghdr
import os
from typing import Set
import re
from typing import Set, Dict, Union, Any
import cairo
import gi
gi.require_version('GdkPixbuf', '2.0')
from gi.repository import GdkPixbuf, GLib
gi.require_version('Rsvg', '2.0')
from gi.repository import GdkPixbuf, GLib, Rsvg
from . import exiftool
from . import exiftool, abstract
# Make pyflakes happy
assert Set
assert Any
class SVGParser(exiftool.ExiftoolParser):
    """Parser for SVG images.

    Metadata is read through the exiftool-based parent class; the output
    file is produced by re-rendering the input with Rsvg onto a new cairo
    SVG surface.
    """
    mimetypes = {'image/svg+xml', }
    # Keys reported by exiftool that are handed to the parent class as its
    # allowlist.
    meta_allowlist = {'Directory', 'ExifToolVersion', 'FileAccessDate',
                      'FileInodeChangeDate', 'FileModifyDate', 'FileName',
                      'FilePermissions', 'FileSize', 'FileType',
                      'FileTypeExtension', 'ImageHeight', 'ImageWidth',
                      'MIMEType', 'SVGVersion', 'SourceFile', 'ViewBox'
                      }

    def remove_all(self) -> bool:
        """Render `self.filename` into a fresh SVG at `self.output_filename`.

        :return: always True; errors from Rsvg/cairo propagate as exceptions.
        """
        svg = Rsvg.Handle.new_from_file(self.filename)
        dimensions = svg.get_dimensions()
        # cairo.SVGSurface expects (target, width, height), in that order;
        # passing them swapped transposes the canvas of non-square images.
        surface = cairo.SVGSurface(self.output_filename,
                                   dimensions.width,
                                   dimensions.height)
        context = cairo.Context(surface)
        svg.render_cairo(context)
        surface.finish()
        return True

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """Return the parent's metadata, minus the standard SVG namespace.

        :return: the metadata dict from the parent class; an `Xmlns` entry is
                 kept only when it differs from the standard namespace.
        """
        meta = super().get_meta()

        # The namespace is mandatory, but only the …/2000/svg is valid.
        ns = 'http://www.w3.org/2000/svg'
        if meta.get('Xmlns', ns) == ns:
            # The key may be absent altogether, in which case the lookup
            # above falls back to `ns`: pop with a default so that this
            # doesn't raise a KeyError.
            meta.pop('Xmlns', None)
        return meta
class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', }
......@@ -108,3 +140,23 @@ class TiffParser(GdkPixbufAbstractParser):
'FilePermissions', 'FileSize', 'FileType',
'FileTypeExtension', 'ImageHeight', 'ImageSize',
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
class PPMParser(abstract.AbstractParser):
    """Parser for PPM images, treating `#` comment lines as metadata.

    NOTE(review): files are opened in text mode and only lines whose first
    non-blank character is `#` are considered; comments trailing a value on
    the same line, and binary rasters, are presumably not handled — confirm.
    """
    mimetypes = {'image/x-portable-pixmap'}

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """Collect every comment line of the file.

        :return: a dict mapping the zero-based line index (as a string) to
                 the comment line, stripped of surrounding whitespace.
        """
        meta = {}  # type: Dict[str, Union[str, Dict[Any, Any]]]
        with open(self.filename) as f:
            for idx, line in enumerate(f):
                stripped = line.strip()
                if stripped.startswith('#'):
                    meta[str(idx)] = stripped
        return meta

    def remove_all(self) -> bool:
        """Copy the file to `self.output_filename` without its comment lines.

        Runs of whitespace in the remaining lines are collapsed to a single
        space and each non-empty line is terminated by exactly one newline.

        :return: always True; I/O errors propagate as exceptions.
        """
        with open(self.filename) as fin:
            with open(self.output_filename, 'w') as fout:
                for line in fin:
                    if line.lstrip().startswith('#'):
                        continue
                    # Normalise the whitespace instead of deleting it:
                    # removing it entirely glues neighbouring values (and
                    # lines) together, e.g. "3 2" would become "32".
                    normalized = re.sub(r"\s+", " ", line.strip(),
                                        flags=re.UNICODE)
                    if normalized:
                        fout.write(normalized + "\n")
        return True