Commits on Source (3)
......@@ -9,7 +9,7 @@ bandit:
script: # TODO: remove B405 and B314
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-bandit
- bandit ./mat2 --format txt
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314
......@@ -42,9 +42,9 @@ tests:debian:
stage: test
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report -m --include 'libmat2/*'
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
tests:fedora:
image: fedora
......@@ -62,5 +62,5 @@ tests:archlinux:
tags:
- whitewhale
script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
- python3 setup.py test
......@@ -6,11 +6,12 @@ max-locals=20
disable=
fixme,
invalid-name,
duplicate-code,
missing-docstring,
protected-access,
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
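To illustrate the `invalid-sequence-index` comment above, here is a hedged, made-up example of the kind of annotation that used to trip pylint (the function itself is not part of mat2):
from typing import Tuple

def split_header(raw: bytes) -> Tuple[int, bytes]:  # annotation style pylint used to flag
    return raw[0], raw[1:]  # first byte as an int, remainder as bytes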
# 0.5.0 - 2018-10-23
- Video (.avi files for now) support, via FFmpeg, optionally
- Lightweight cleaning for png and tiff files
- Processing files starting with a dash is now quicker
- Metadata are now displayed sorted
- Recursive metadata support for FLAC files
- Unsupported extensions aren't displayed in `./mat2 -l` anymore
- Improve the display when no metadata are found
- Update the logo according to the GNOME guidelines
- The testsuite is now runnable on the installed version of mat2
- Various internal cleanup/improvements
# 0.4.0 - 2018-10-03
- There is now a policy, for advanced users, to deal with unknown embedded fileformats
......
......@@ -32,5 +32,6 @@ Since MAT2 is written in Python3, please conform as much as possible to the
9. Create the signed tarball with `git archive --format=tar.xz --prefix=mat-$VERSION/ $VERSION > mat-$VERSION.tar.xz`
10. Sign the tarball with `gpg --armor --detach-sign mat-$VERSION.tar.xz`
11. Upload the result on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
12. Tell the [downstreams](https://0xacab.org/jvoisin/mat2/blob/master/INSTALL.md) about it
13. Do the secret release dance
12. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
13. Tell the [downstreams](https://0xacab.org/jvoisin/mat2/blob/master/INSTALL.md) about it
14. Do the secret release dance
......@@ -30,10 +30,11 @@ metadata.
- `python3-mutagen` for audio support
- `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
- `gir1.2-gdkpixbuf-2.0` for images support
- `FFmpeg`, optionally, for video support
- `libimage-exiftool-perl` for everything else
Please note that MAT2 requires at least Python3.5, meaning that it
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3),
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3).
# Running the test suite
......
data/mat2.png: image replaced (235 KiB → 28.2 KiB)
doc/implementation_notes.md
doc/threat_model.md
.TH MAT2 "1" "October 2018" "MAT2 0.4.0" "User Commands"
.TH MAT2 "1" "October 2018" "MAT2 0.5.0" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
......
#!/bin/env python3
#!/usr/bin/env python3
import os
import collections
import enum
import importlib
from typing import Dict, Optional
from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
# A set of extensions that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = {
......@@ -36,24 +38,13 @@ DEPENDENCIES = {
'mutagen': 'Mutagen',
}
def _get_exiftool_path() -> Optional[str]: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
return None
def check_dependencies() -> dict:
def check_dependencies() -> Dict[str, bool]:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
ret['Exiftool'] = True if _get_exiftool_path() else False
ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False
for key, value in DEPENDENCIES.items():
ret[value] = True
......
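A minimal usage sketch of `check_dependencies()` as exposed by `libmat2` (the warning text below is illustrative, not mat2's actual output):
from libmat2 import check_dependencies

for tool, installed in check_dependencies().items():
    if not installed:
        print('Missing optional dependency: %s' % tool)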
import abc
import os
from typing import Set, Dict
import re
from typing import Set, Dict, Union
assert Set # make pyflakes happy
class AbstractParser(abc.ABC):
""" This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files.
It might raise `ValueError` on instantiation on invalid files,
and `RuntimeError` when something goes wrong in `remove_all`.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
......@@ -16,21 +18,23 @@ class AbstractParser(abc.ABC):
"""
:raises ValueError: Raised upon an invalid file
"""
if re.search('^[a-z0-9./]', filename) is None:
# Some parsers are calling external binaries,
# this prevents shell command injections
filename = os.path.join('.', filename)
self.filename = filename
fname, extension = os.path.splitext(filename)
self.output_filename = fname + '.cleaned' + extension
self.lightweight_cleaning = False
@abc.abstractmethod
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
pass # pragma: no cover
@abc.abstractmethod
def remove_all(self) -> bool:
pass # pragma: no cover
def remove_all_lightweight(self) -> bool:
""" This method removes _SOME_ metadata.
It might be useful to implement it for fileformats that do
not support non-destructive cleaning.
"""
return self.remove_all()
:raises RuntimeError: Raised if the cleaning process goes wrong.
"""
pass # pragma: no cover
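To make the contract above concrete, here is a hedged sketch of a minimal parser honouring it (the class name and mimetype are hypothetical, not part of mat2):
import shutil
from typing import Dict, Union
from libmat2 import abstract

class NullParser(abstract.AbstractParser):  # hypothetical example
    mimetypes = {'application/x-empty', }

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        return {}  # nothing to report

    def remove_all(self) -> bool:
        shutil.copy(self.filename, self.output_filename)  # nothing to strip
        return True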
......@@ -4,13 +4,14 @@ import tempfile
import os
import logging
import shutil
from typing import Dict, Set, Pattern
from typing import Dict, Set, Pattern, Union
from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
assert Union
class ArchiveBasedAbstractParser(abstract.AbstractParser):
......
import mimetypes
import os
import shutil
import tempfile
from typing import Dict, Union
import mutagen
from . import abstract
from . import abstract, parser_factory
class MutagenParser(abstract.AbstractParser):
......@@ -13,13 +17,13 @@ class MutagenParser(abstract.AbstractParser):
except mutagen.MutagenError:
raise ValueError
def get_meta(self):
def get_meta(self) -> Dict[str, Union[str, dict]]:
f = mutagen.File(self.filename)
if f.tags:
return {k:', '.join(v) for k, v in f.tags.items()}
return {}
def remove_all(self):
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.delete()
......@@ -30,8 +34,8 @@ class MutagenParser(abstract.AbstractParser):
class MP3Parser(MutagenParser):
mimetypes = {'audio/mpeg', }
def get_meta(self):
metadata = {}
def get_meta(self) -> Dict[str, Union[str, dict]]:
metadata = {} # type: Dict[str, Union[str, dict]]
meta = mutagen.File(self.filename).tags
for key in meta:
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
......@@ -44,3 +48,30 @@ class OGGParser(MutagenParser):
class FLACParser(MutagenParser):
mimetypes = {'audio/flac', 'audio/x-flac'}
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.clear_pictures()
f.delete()
f.save(deleteid3=True)
return True
def get_meta(self) -> Dict[str, Union[str, dict]]:
meta = super().get_meta()
for num, picture in enumerate(mutagen.File(self.filename).pictures):
name = picture.desc if picture.desc else 'Cover %d' % num
extension = mimetypes.guess_extension(picture.mime)
if extension is None: # pragma: no cover
meta[name] = 'harmful data'
continue
_, fname = tempfile.mkstemp()
fname = fname + extension
with open(fname, 'wb') as f:
f.write(picture.data)
p, _ = parser_factory.get_parser(fname) # type: ignore
# Mypy chokes on ternaries :/
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
os.remove(fname)
return meta
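A short usage sketch of the recursive FLAC support shown above (module path `libmat2.audio` assumed, file name made up):
from libmat2 import audio

p = audio.FLACParser('song.flac')
print(p.get_meta())  # embedded cover art appears as a nested dict
p.remove_all()       # writes song.cleaned.flac, without tags or pictures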
import json
import logging
import os
import subprocess
from typing import Dict, Union, Set
from . import abstract
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser):
""" Exiftool is often the easiest way to get all the metadata
from a file, which is why several parsers reuse its `get_meta`
method.
"""
meta_whitelist = set() # type: Set[str]
def get_meta(self) -> Dict[str, Union[str, dict]]:
out = subprocess.check_output([_get_exiftool_path(), '-json', self.filename])
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist:
meta.pop(key, None)
return meta
def _lightweight_cleanup(self) -> bool:
if os.path.exists(self.output_filename):
try:
# exiftool can't force output to existing files
os.remove(self.output_filename)
except OSError as e: # pragma: no cover
logging.error("The output file %s is already existing and \
can't be overwritten: %s.", self.filename, e)
return False
# Note: '-All=' must be followed by a known exiftool option.
# Also, '-CommonIFD0' is needed for .tiff files
cmd = [_get_exiftool_path(),
'-all=', # remove metadata
'-adobe=', # remove adobe-specific metadata
'-exif:all=', # remove all exif metadata
'-Time:All=', # remove all timestamps
'-quiet', # don't show useless logs
'-CommonIFD0=', # remove IFD0 metadata
'-o', self.output_filename,
self.filename]
try:
subprocess.check_call(cmd)
except subprocess.CalledProcessError as e: # pragma: no cover
logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
return False
return True
def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
raise RuntimeError("Unable to find exiftool")
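A hedged sketch of how a parser can build on `ExiftoolParser` (the WebP class below is purely illustrative and not part of mat2):
from libmat2 import exiftool

class WebPParser(exiftool.ExiftoolParser):  # illustrative only
    mimetypes = {'image/webp', }
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName'}

    def remove_all(self) -> bool:
        return self._lightweight_cleanup()  # delegate cleaning to exiftool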
import shutil
from typing import Dict
from typing import Dict, Union
from . import abstract
......@@ -7,7 +7,7 @@ class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'}
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
return dict()
def remove_all(self) -> bool:
......
import subprocess
import imghdr
import json
import os
import shutil
import tempfile
import re
from typing import Set
import cairo
......@@ -13,44 +8,12 @@ import gi
gi.require_version('GdkPixbuf', '2.0')
from gi.repository import GdkPixbuf
from . import abstract, _get_exiftool_path
from . import exiftool
# Make pyflakes happy
assert Set
class _ImageParser(abstract.AbstractParser):
""" Since we use `exiftool` to get metadata from
all images fileformat, `get_meta` is implemented in this class,
and all the image-handling ones are inheriting from it."""
meta_whitelist = set() # type: Set[str]
@staticmethod
def __handle_problematic_filename(filename: str, callback) -> str:
""" This method takes a filename with a problematic name,
and safely applies it a `callback`."""
tmpdirname = tempfile.mkdtemp()
fname = os.path.join(tmpdirname, "temp_file")
shutil.copy(filename, fname)
out = callback(fname)
shutil.rmtree(tmpdirname)
return out
def get_meta(self):
""" There is no way to escape the leading(s) dash(es) of the current
self.filename to prevent parameter injections, so we need to take care
of this.
"""
fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
if re.search('^[a-z0-9/]', self.filename) is None:
out = self.__handle_problematic_filename(self.filename, fun)
else:
out = fun(self.filename)
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist:
meta.pop(key, None)
return meta
class PNGParser(_ImageParser):
class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
......@@ -71,19 +34,26 @@ class PNGParser(_ImageParser):
except MemoryError: # pragma: no cover
raise ValueError
def remove_all(self):
def remove_all(self) -> bool:
if self.lightweight_cleaning:
return self._lightweight_cleanup()
surface = cairo.ImageSurface.create_from_png(self.filename)
surface.write_to_png(self.output_filename)
return True
class GdkPixbufAbstractParser(_ImageParser):
class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of completely removing metadata.
"""
_type = ''
def remove_all(self):
def __init__(self, filename):
super().__init__(filename)
if imghdr.what(filename) != self._type: # better safe than sorry
raise ValueError
def remove_all(self) -> bool:
_, extension = os.path.splitext(self.filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
if extension.lower() == '.jpg':
......@@ -91,11 +61,6 @@ class GdkPixbufAbstractParser(_ImageParser):
pixbuf.savev(self.output_filename, extension[1:], [], [])
return True
def __init__(self, filename):
super().__init__(filename)
if imghdr.what(filename) != self._type: # better safe than sorry
raise ValueError
class JPGParser(GdkPixbufAbstractParser):
_type = 'jpeg'
......
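A usage sketch of the `lightweight_cleaning` switch wired in above (file name made up):
from libmat2 import images

p = images.PNGParser('figure.png')
p.lightweight_cleaning = True  # strip metadata with exiftool instead of re-rendering via cairo
p.remove_all()                 # writes figure.cleaned.png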
......@@ -2,7 +2,7 @@ import logging
import os
import re
import zipfile
from typing import Dict, Set, Pattern
from typing import Dict, Set, Pattern, Tuple, Union
import xml.etree.ElementTree as ET # type: ignore
......@@ -14,9 +14,8 @@ from .archive import ArchiveBasedAbstractParser
assert Set
assert Pattern
def _parse_xml(full_path: str):
def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
""" This function parses XML, with namespace support. """
namespace_map = dict()
for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
# The ns[0-9]+ namespaces are reserved for internal usage, so
......@@ -88,6 +87,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
r'^docProps/custom\.xml$',
r'^word/printerSettings/',
r'^word/theme',
r'^word/people\.xml$',
# we have a whitelist in self.files_to_keep,
# so we can trash everything else
......@@ -182,20 +182,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
parent_map = {c:p for p in tree.iter() for c in p}
elements = list()
elements_del = list()
for element in tree.iterfind('.//w:del', namespace):
elements.append(element)
for element in elements:
elements_del.append(element)
for element in elements_del:
parent_map[element].remove(element)
elements = list()
elements_ins = list()
for element in tree.iterfind('.//w:ins', namespace):
for position, item in enumerate(tree.iter()): # pragma: no cover
if item == element:
for children in element.iterfind('./*'):
elements.append((element, position, children))
elements_ins.append((element, position, children))
break
for (element, position, children) in elements:
for (element, position, children) in elements_ins:
parent_map[element].insert(position, children)
parent_map[element].remove(element)
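The `parent_map` trick above is needed because ElementTree has no parent pointers; a toy illustration (not mat2 code):
import xml.etree.ElementTree as ET

tree = ET.ElementTree(ET.fromstring('<root><del>old</del><keep>new</keep></root>'))
parent_map = {c: p for p in tree.iter() for c in p}
for element in list(tree.iterfind('.//del')):
    parent_map[element].remove(element)
print(ET.tostring(tree.getroot()))  # b'<root><keep>new</keep></root>'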
......@@ -296,7 +296,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return True
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
......@@ -381,7 +381,7 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
return False
return True
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
......
......@@ -18,6 +18,8 @@ def __load_all_parsers():
continue
elif fname.endswith('__init__.py'):
continue
elif fname.endswith('exiftool.py'):
continue
basename = os.path.basename(fname)
name, _ = os.path.splitext(basename)
importlib.import_module('.' + name, package='libmat2')
......@@ -33,6 +35,7 @@ def _get_parsers() -> List[T]:
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
""" Return the appropriate parser for a giver filename. """
mtype, _ = mimetypes.guess_type(filename)
_, extension = os.path.splitext(filename)
......
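A minimal usage sketch of the factory (file name made up):
from libmat2 import parser_factory

parser, mimetype = parser_factory.get_parser('./report.odt')
if parser is None:
    print('Unsupported format: %s' % mimetype)
else:
    print(parser.get_meta())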
......@@ -7,6 +7,7 @@ import re
import logging
import tempfile
import io
from typing import Dict, Union
from distutils.version import LooseVersion
import cairo
......@@ -37,7 +38,12 @@ class PDFParser(abstract.AbstractParser):
except GLib.GError: # Invalid PDF
raise ValueError
def remove_all_lightweight(self):
def remove_all(self) -> bool:
if self.lightweight_cleaning is True:
return self.__remove_all_lightweight()
return self.__remove_all_thorough()
def __remove_all_lightweight(self) -> bool:
"""
Load the document into Poppler, render pages on a new PDFSurface.
"""
......@@ -64,7 +70,7 @@ class PDFParser(abstract.AbstractParser):
return True
def remove_all(self):
def __remove_all_thorough(self) -> bool:
"""
Load the document into Poppler, render pages on PNG,
and shove those PNG into a new PDF.
......@@ -119,13 +125,13 @@ class PDFParser(abstract.AbstractParser):
return True
@staticmethod
def __parse_metadata_field(data: str) -> dict:
def __parse_metadata_field(data: str) -> Dict[str, str]:
metadata = {}
for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
metadata[key] = value
return metadata
def get_meta(self):
def get_meta(self) -> Dict[str, Union[str, dict]]:
""" Return a dict with all the meta of the file
"""
metadata = {}
......
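To illustrate what `__parse_metadata_field` extracts, a small standalone example with a made-up XMP snippet:
import re

data = '<xmp:CreatorTool>LibreOffice 6.1</xmp:CreatorTool><pdf:Producer>cairo</pdf:Producer>'
metadata = {}
for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
    metadata[key] = value
print(metadata)  # {'CreatorTool': 'LibreOffice 6.1', 'Producer': 'cairo'}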
......@@ -14,7 +14,7 @@ class TorrentParser(abstract.AbstractParser):
if self.dict_repr is None:
raise ValueError
def get_meta(self) -> Dict[str, str]:
def get_meta(self) -> Dict[str, Union[str, dict]]:
metadata = {}
for key, value in self.dict_repr.items():
if key not in self.whitelist:
......
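A hedged usage sketch of the torrent parser (module path `libmat2.torrent` assumed, file name made up):
from libmat2 import torrent

p = torrent.TorrentParser('debian.torrent')
print(p.get_meta())  # non-whitelisted keys, such as the creation date, show up here
p.remove_all()       # writes debian.cleaned.torrent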