Skip to content
Commits on Source (9)
image: debian
variables:
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
stages:
- linting
- test
bandit:
linting:bandit:
image: $CONTAINER_REGISTRY:linting
stage: linting
script: # TODO: remove B405 and B314
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-bandit
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314
pylint:
linting:pylint:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends pylint3 python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0
- pylint3 --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
- pylint3 --disable=no-else-return --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
# Once nautilus-python is in Debian, uncomment the line below
- pylint3 --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
- pylint3 --disable=no-else-return --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
pyflakes:
linting:pyflakes:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends pyflakes3
- pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus
mypy:
linting:mypy:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-pip
- pip3 install mypy
- mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py
tests:archlinux:
image: $CONTAINER_REGISTRY:archlinux
stage: test
script:
- python3 setup.py test
# Debian test job: runs the unittest suite from tests/ under python3-coverage
# (branch coverage) and fails if coverage of libmat2/* is below 90%.
tests:debian:
image: $CONTAINER_REGISTRY:debian
stage: test
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
# bubblewrap is removed before the tests run.
# NOTE(review): presumably so this job covers the code path without the
# sandbox, in contrast to tests:debian_with_bubblewrap; confirm.
- apt-get -qqy purge bubblewrap
- python3-coverage run --branch -m unittest discover -s tests/
# Coverage gate: the report is limited to libmat2/* and must reach 90%.
- python3-coverage report --fail-under=90 -m --include 'libmat2/*'
# Debian test job with bubblewrap installed: runs the same unittest/coverage
# commands as tests:debian, but requires 100% coverage of libmat2/*.
tests:debian_with_bubblewrap:
image: $CONTAINER_REGISTRY:debian
stage: test
# Only runners carrying the `whitewhale` tag pick up this job.
tags:
- whitewhale
# A failure of this job does not fail the pipeline.
allow_failure: true
script:
- apt-get -qqy update
# Same package set as tests:debian, plus bubblewrap.
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg bubblewrap
- python3-coverage run --branch -m unittest discover -s tests/
# Coverage gate: the report is limited to libmat2/* and must reach 100%.
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
tests:fedora:
image: fedora
image: $CONTAINER_REGISTRY:fedora
stage: test
tags:
- whitewhale
script:
- dnf install -y python3 python3-mutagen python3-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 gdk-pixbuf2-modules cairo-gobject cairo python3-cairo perl-Image-ExifTool mailcap
- gdk-pixbuf-query-loaders-64 > /usr/lib64/gdk-pixbuf-2.0/2.10.0/loaders.cache
- python3 setup.py test
tests:archlinux:
image: archlinux/base
tests:gentoo:
image: $CONTAINER_REGISTRY:gentoo
stage: test
tags:
- whitewhale
allow_failure: true
script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
- python3 setup.py test
- python3 -m unittest discover -v
# 0.9.0 - 2019-05-10
- Add tar/tar.gz/tar.bz2/tar.xz archives support
- Add support for xhtml files
- Improve handling of read-only files
- Slightly improve the command line's documentation
- Fix a confusing error message
- Add even more tests
- Usual internal cleanups/refactorings
# 0.8.0 - 2019-02-28
- Add support for epub files
......
# Contributing to MAT2
The main repository for MAT2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
with a mirror on [gitlab.com]( https://gitlab.com/jvoisin/mat2 ).
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
and to send a pull-request. Please do check that everything is fine by running the
......@@ -29,9 +29,10 @@ Since MAT2 is written in Python3, please conform as much as possible to the
6. Create a tag with `git tag -s $VERSION`
7. Push the commit with `git push origin master`
8. Push the tag with `git push --tags`
9. Create the signed tarball with `git archive --format=tar.xz --prefix=mat-$VERSION/ $VERSION > mat-$VERSION.tar.xz`
10. Sign the tarball with `gpg --armor --detach-sign mat-$VERSION.tar.xz`
11. Upload the result on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
12. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
13. Upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
14. Do the secret release dance
9. Download the gitlab archive of the release
10. Diff it against the local copy
11. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
12. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
13. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
14. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
15. Do the secret release dance
......@@ -70,7 +70,8 @@ optional arguments:
-V, --verbose show more verbose status information
--unknown-members policy
how to handle unknown members of archive-style files
(policy should be one of: abort, omit, keep)
(policy should be one of: abort, omit, keep) [Default:
abort]
-s, --show list harmful metadata detectable by MAT2 without
removing them
-L, --lightweight remove SOME metadata
......
mat2 (0.9.0-1) unstable; urgency=medium
* New upstream release.
* d/control:
- Bump debhelper compat level to 12 and Standards-Version to 4.4.0, no
changes required.
- Drop ancient X-Python3-Version field.
- Extend description to mention new support for .xhtml and .tar.xz files.
- Use my debian.org mail address.
* d/gitlab-ci.yml:
- Pull in changes made upstream by the Salsa CI team to make the CI work
again.
-- Georg Faerber <georg@debian.org> Wed, 10 Jul 2019 17:51:24 +0000
mat2 (0.8.0-3) unstable; urgency=medium
* Upload to unstable. This adds a new binary package 'mat' which handles
......
......@@ -2,9 +2,9 @@ Source: mat2
Section: utils
Priority: optional
Maintainer: Debian Privacy Tools Maintainers <pkg-privacy-maintainers@lists.alioth.debian.org>
Uploaders: Georg Faerber <georg@riseup.net>,
Uploaders: Georg Faerber <georg@debian.org>,
Jonas Meurer <jonas@freesources.org>,
Build-Depends: debhelper-compat (= 11),
Build-Depends: debhelper-compat (= 12),
dh-exec,
dh-python,
ffmpeg,
......@@ -15,8 +15,7 @@ Build-Depends: debhelper-compat (= 11),
python3-gi-cairo,
python3-mutagen,
python3-setuptools,
Standards-Version: 4.3.0
X-Python3-Version: >= 3.5
Standards-Version: 4.4.0
Homepage: https://0xacab.org/jvoisin/mat2
Vcs-Git: https://salsa.debian.org/pkg-privacy-team/mat2.git
Vcs-Browser: https://salsa.debian.org/pkg-privacy-team/mat2
......@@ -61,7 +60,7 @@ Description: Metadata anonymisation toolkit v2
- Electronic Publication (.epub)
- Free Lossless Audio Codec (.flac)
- Graphics Interchange Format (.gif)
- Hypertext Markup Language (.html)
- Hypertext Markup Language (.html, .xhtml)
- Portable Network Graphics (PNG)
- JPEG (.jpeg, .jpg, ...)
- MPEG Audio (.mp3, .mp2, .mp1, .mpa)
......@@ -70,7 +69,7 @@ Description: Metadata anonymisation toolkit v2
- Ogg Vorbis (.ogg)
- Open Document (.odt, .odx, .ods, ...)
- Portable Document Format (.pdf)
- Tape ARchive (.tar, .tar.bz2, .tar.gz)
- Tape ARchive (.tar, .tar.bz2, .tar.gz, .tar.xz)
- Torrent (.torrent)
- Windows Media Video (.wmv)
- ZIP (.zip)
......
include: https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml
.limits: &limits
except:
- tags
only:
changes:
- debian/**/*
build:
extends: .build-unstable
<<: *limits
lintian:
extends: .test-lintian
<<: *limits
piuparts:
extends: .test-piuparts
<<: *limits
autopkgtest:
extends: .test-autopkgtest
<<: *limits
reprotest:
extends: .test-reprotest
<<: *limits
---
include:
- https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml
- https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml
.TH MAT2 "1" "February 2019" "MAT2 0.8.0" "User Commands"
.TH MAT2 "1" "May 2019" "MAT2 0.9.0" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
......
......@@ -30,27 +30,35 @@ UNSUPPORTED_EXTENSIONS = {
}
DEPENDENCIES = {
'cairo': 'Cairo',
'gi': 'PyGobject',
'gi.repository.GdkPixbuf': 'GdkPixbuf from PyGobject',
'gi.repository.Poppler': 'Poppler from PyGobject',
'gi.repository.GLib': 'GLib from PyGobject',
'mutagen': 'Mutagen',
'Cairo': 'cairo',
'PyGobject': 'gi',
'GdkPixbuf from PyGobject': 'gi.repository.GdkPixbuf',
'Poppler from PyGobject': 'gi.repository.Poppler',
'GLib from PyGobject': 'gi.repository.GLib',
'Mutagen': 'mutagen',
}
CMD_DEPENDENCIES = {
'Exiftool': exiftool._get_exiftool_path,
'Ffmpeg': video._get_ffmpeg_path,
}
def check_dependencies() -> Dict[str, bool]:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
ret['Exiftool'] = bool(exiftool._get_exiftool_path())
ret['Ffmpeg'] = bool(video._get_ffmpeg_path())
for key, value in DEPENDENCIES.items():
ret[value] = True
ret[key] = True
try:
importlib.import_module(key)
importlib.import_module(value)
except ImportError: # pragma: no cover
ret[value] = False # pragma: no cover
ret[key] = False # pragma: no cover
for k, v in CMD_DEPENDENCIES.items():
ret[k] = True
try:
v()
except RuntimeError: # pragma: no cover
ret[k] = False
return ret
......
......@@ -25,17 +25,22 @@ class AbstractParser(abc.ABC):
self.filename = filename
fname, extension = os.path.splitext(filename)
# Special case for tar.gz, tar.bz2, … files
if fname.endswith('.tar') and len(fname) > 4:
fname, extension = fname[:-4], '.tar' + extension
self.output_filename = fname + '.cleaned' + extension
self.lightweight_cleaning = False
@abc.abstractmethod
def get_meta(self) -> Dict[str, Union[str, dict]]:
pass # pragma: no cover
"""Return all the metadata of the current file"""
@abc.abstractmethod
def remove_all(self) -> bool:
"""
Remove all the metadata of the current file
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
# pylint: disable=unnecessary-pass
pass # pragma: no cover
This diff is collapsed.
......@@ -38,6 +38,8 @@ class MP3Parser(MutagenParser):
metadata = {} # type: Dict[str, Union[str, dict]]
meta = mutagen.File(self.filename).tags
for key in meta:
if not hasattr(meta[key], 'text'): # pragma: no cover
continue
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
return metadata
......
......@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
class EPUBParser(archive.ZipParser):
mimetypes = {'application/epub+zip', }
metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
......
......@@ -15,14 +15,14 @@ class ExiftoolParser(abstract.AbstractParser):
from a import file, hence why several parsers are re-using its `get_meta`
method.
"""
meta_whitelist = set() # type: Set[str]
meta_allowlist = set() # type: Set[str]
def get_meta(self) -> Dict[str, Union[str, dict]]:
out = subprocess.run([_get_exiftool_path(), '-json', self.filename],
input_filename=self.filename,
check=True, stdout=subprocess.PIPE).stdout
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist:
for key in self.meta_allowlist:
meta.pop(key, None)
return meta
......
......@@ -15,7 +15,7 @@ assert Set
class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
'FileAccessDate', 'FileInodeChangeDate',
'FilePermissions', 'FileType', 'FileTypeExtension',
......@@ -44,7 +44,7 @@ class PNGParser(exiftool.ExiftoolParser):
class GIFParser(exiftool.ExiftoolParser):
mimetypes = {'image/gif'}
meta_whitelist = {'AnimationIterations', 'BackgroundColor', 'BitsPerPixel',
meta_allowlist = {'AnimationIterations', 'BackgroundColor', 'BitsPerPixel',
'ColorResolutionDepth', 'Directory', 'Duration',
'ExifToolVersion', 'FileAccessDate',
'FileInodeChangeDate', 'FileModifyDate', 'FileName',
......@@ -86,7 +86,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
class JPGParser(GdkPixbufAbstractParser):
_type = 'jpeg'
mimetypes = {'image/jpeg'}
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
'FileAccessDate', "FileInodeChangeDate",
'FilePermissions', 'FileType', 'FileTypeExtension',
......@@ -99,7 +99,7 @@ class JPGParser(GdkPixbufAbstractParser):
class TiffParser(GdkPixbufAbstractParser):
_type = 'tiff'
mimetypes = {'image/tiff'}
meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
meta_allowlist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
'FillOrder', 'PhotometricInterpretation',
'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel',
'StripByteCounts', 'StripOffsets', 'BitsPerSample',
......
......@@ -6,7 +6,7 @@ from typing import Dict, Set, Pattern, Tuple, Any
import xml.etree.ElementTree as ET # type: ignore
from .archive import ArchiveBasedAbstractParser
from .archive import ZipParser
# pylint: disable=line-too-long
......@@ -43,7 +43,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
return True
class MSOfficeParser(ArchiveBasedAbstractParser):
class MSOfficeParser(ZipParser):
mimetypes = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
......@@ -89,7 +89,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
r'^word/theme',
r'^word/people\.xml$',
# we have a whitelist in self.files_to_keep,
# we have an allowlist in self.files_to_keep,
# so we can trash everything else
r'^word/_rels/',
}))
......@@ -100,7 +100,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
def __fill_files_to_keep_via_content_types(self) -> bool:
""" There is a suer-handy `[Content_Types].xml` file
in MS Office archives, describing what each other file contains.
The self.content_types_to_keep member contains a type whitelist,
The self.content_types_to_keep member contains a type allowlist,
so we're using it to fill the self.files_to_keep one.
"""
with zipfile.ZipFile(self.filename) as zin:
......@@ -220,7 +220,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
for file_to_omit in self.files_to_omit:
if file_to_omit.search(fname):
matches = map(lambda r: r.search(fname), self.files_to_keep)
if any(matches): # the file is whitelisted
if any(matches): # the file is in the allowlist
continue
removed_fnames.add(fname)
break
......@@ -312,7 +312,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return {file_path: 'harmful content', }
class LibreOfficeParser(ArchiveBasedAbstractParser):
class LibreOfficeParser(ZipParser):
mimetypes = {
'application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.spreadsheet',
......
......@@ -7,13 +7,10 @@ from typing import TypeVar, List, Tuple, Optional
from . import abstract, UNSUPPORTED_EXTENSIONS
assert Tuple # make pyflakes happy
T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub')
# EPUB Navigation Control XML File
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
mimetypes.add_type('application/x-dtbncx+xml', '.ncx') # EPUB Navigation Control XML File
def __load_all_parsers():
......@@ -43,13 +40,17 @@ def _get_parsers() -> List[T]:
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
""" Return the appropriate parser for a giver filename. """
""" Return the appropriate parser for a given filename. """
mtype, _ = mimetypes.guess_type(filename)
_, extension = os.path.splitext(filename)
if extension.lower() in UNSUPPORTED_EXTENSIONS:
return None, mtype
if mtype == 'application/x-tar':
if extension[1:] in ('bz2', 'gz', 'xz'):
mtype = mtype + '+' + extension[1:]
for parser_class in _get_parsers(): # type: ignore
if mtype in parser_class.mimetypes:
try:
......
......@@ -6,7 +6,7 @@ from . import abstract
class TorrentParser(abstract.AbstractParser):
mimetypes = {'application/x-bittorrent', }
whitelist = {b'announce', b'announce-list', b'info'}
allowlist = {b'announce', b'announce-list', b'info'}
def __init__(self, filename):
super().__init__(filename)
......@@ -18,14 +18,14 @@ class TorrentParser(abstract.AbstractParser):
def get_meta(self) -> Dict[str, Union[str, dict]]:
metadata = {}
for key, value in self.dict_repr.items():
if key not in self.whitelist:
if key not in self.allowlist:
metadata[key.decode('utf-8')] = value
return metadata
def remove_all(self) -> bool:
cleaned = dict()
for key, value in self.dict_repr.items():
if key in self.whitelist:
if key in self.allowlist:
cleaned[key] = value
with open(self.output_filename, 'wb') as f:
f.write(_BencodeHandler().bencode(cleaned))
......
......@@ -10,10 +10,10 @@ from . import subprocess
class AbstractFFmpegParser(exiftool.ExiftoolParser):
""" Abstract parser for all FFmpeg-based ones, mainly for video. """
# Some fileformats have mandatory metadata fields
meta_key_value_whitelist = {} # type: Dict[str, Union[str, int]]
meta_key_value_allowlist = {} # type: Dict[str, Union[str, int]]
def remove_all(self) -> bool:
if self.meta_key_value_whitelist:
if self.meta_key_value_allowlist:
logging.warning('The format of "%s" (%s) has some mandatory '
'metadata fields; mat2 filled them with standard '
'data.', self.filename, ', '.join(self.mimetypes))
......@@ -45,8 +45,8 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
ret = dict() # type: Dict[str, Union[str, dict]]
for key, value in meta.items():
if key in self.meta_key_value_whitelist.keys():
if value == self.meta_key_value_whitelist[key]:
if key in self.meta_key_value_allowlist.keys():
if value == self.meta_key_value_allowlist[key]:
continue
ret[key] = value
return ret
......@@ -54,7 +54,7 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
class WMVParser(AbstractFFmpegParser):
mimetypes = {'video/x-ms-wmv', }
meta_whitelist = {'AudioChannels', 'AudioCodecID', 'AudioCodecName',
meta_allowlist = {'AudioChannels', 'AudioCodecID', 'AudioCodecName',
'ErrorCorrectionType', 'AudioSampleRate', 'DataPackets',
'Directory', 'Duration', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate', 'FileLength',
......@@ -64,7 +64,7 @@ class WMVParser(AbstractFFmpegParser):
'ImageWidth', 'MIMEType', 'MaxBitrate', 'MaxPacketSize',
'Megapixels', 'MinPacketSize', 'Preroll', 'SendDuration',
'SourceFile', 'StreamNumber', 'VideoCodecName', }
meta_key_value_whitelist = { # some metadata are mandatory :/
meta_key_value_allowlist = { # some metadata are mandatory :/
'AudioCodecDescription': '',
'CreationDate': '0000:00:00 00:00:00Z',
'FileID': '00000000-0000-0000-0000-000000000000',
......@@ -78,7 +78,7 @@ class WMVParser(AbstractFFmpegParser):
class AVIParser(AbstractFFmpegParser):
mimetypes = {'video/x-msvideo', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
'FileSize', 'FileModifyDate', 'FileAccessDate',
'FileInodeChangeDate', 'FilePermissions', 'FileType',
'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
......@@ -98,7 +98,7 @@ class AVIParser(AbstractFFmpegParser):
class MP4Parser(AbstractFFmpegParser):
mimetypes = {'video/mp4', }
meta_whitelist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration',
meta_allowlist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration',
'XResolution', 'YResolution', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate',
'FileName', 'FilePermissions', 'MIMEType', 'FileType',
......@@ -109,7 +109,7 @@ class MP4Parser(AbstractFFmpegParser):
'MovieDataSize', 'VideoFrameRate', 'MediaTimeScale',
'SourceImageHeight', 'SourceImageWidth',
'MatrixStructure', 'MediaDuration'}
meta_key_value_whitelist = { # some metadata are mandatory :/
meta_key_value_allowlist = { # some metadata are mandatory :/
'CreateDate': '0000:00:00 00:00:00',
'CurrentTime': '0 s',
'MediaCreateDate': '0000:00:00 00:00:00',
......
......@@ -37,15 +37,15 @@ class CSSParser(abstract.AbstractParser):
class AbstractHTMLParser(abstract.AbstractParser):
tags_blacklist = set() # type: Set[str]
tags_blocklist = set() # type: Set[str]
# In some html/xml-based formats some tags are mandatory,
# so we're keeping them, but are discaring their content
tags_required_blacklist = set() # type: Set[str]
# so we're keeping them, but are discarding their content
tags_required_blocklist = set() # type: Set[str]
def __init__(self, filename):
super().__init__(filename)
self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
self.tags_required_blacklist)
self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
self.tags_required_blocklist)
with open(filename, encoding='utf-8') as f:
self.__parser.feed(f.read())
self.__parser.close()
......@@ -58,14 +58,14 @@ class AbstractHTMLParser(abstract.AbstractParser):
class HTMLParser(AbstractHTMLParser):
mimetypes = {'text/html', }
tags_blacklist = {'meta', }
tags_required_blacklist = {'title', }
mimetypes = {'text/html', 'application/xhtml+xml'}
tags_blocklist = {'meta', }
tags_required_blocklist = {'title', }
class DTBNCXParser(AbstractHTMLParser):
mimetypes = {'application/x-dtbncx+xml', }
tags_required_blacklist = {'title', 'doctitle', 'meta'}
tags_required_blocklist = {'title', 'doctitle', 'meta'}
class _HTMLParser(parser.HTMLParser):
......@@ -79,7 +79,7 @@ class _HTMLParser(parser.HTMLParser):
Also, gotcha: the `tag` parameters are always in lowercase.
"""
def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
super().__init__()
self.filename = filename
self.__textrepr = ''
......@@ -90,24 +90,24 @@ class _HTMLParser(parser.HTMLParser):
self.__in_dangerous_but_required_tag = 0
self.__in_dangerous_tag = 0
if required_blacklisted_tags & blacklisted_tags: # pragma: nocover
if required_blocklisted_tags & blocklisted_tags: # pragma: nocover
raise ValueError("There is an overlap between %s and %s" % (
required_blacklisted_tags, blacklisted_tags))
self.tag_required_blacklist = required_blacklisted_tags
self.tag_blacklist = blacklisted_tags
required_blocklisted_tags, blocklisted_tags))
self.tag_required_blocklist = required_blocklisted_tags
self.tag_blocklist = blocklisted_tags
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
original_tag = self.get_starttag_text()
self.__validation_queue.append(original_tag)
if tag in self.tag_blacklist:
if tag in self.tag_blocklist:
self.__in_dangerous_tag += 1
if self.__in_dangerous_tag == 0:
if self.__in_dangerous_but_required_tag == 0:
self.__textrepr += original_tag
if tag in self.tag_required_blacklist:
if tag in self.tag_required_blocklist:
self.__in_dangerous_but_required_tag += 1
def handle_endtag(self, tag: str):
......@@ -123,7 +123,7 @@ class _HTMLParser(parser.HTMLParser):
"tag %s in %s" %
(tag, previous_tag, self.filename))
if tag in self.tag_required_blacklist:
if tag in self.tag_required_blocklist:
self.__in_dangerous_but_required_tag -= 1
if self.__in_dangerous_tag == 0:
......@@ -131,7 +131,7 @@ class _HTMLParser(parser.HTMLParser):
# There is no `get_endtag_text()` method :/
self.__textrepr += '</' + previous_tag + '>'
if tag in self.tag_blacklist:
if tag in self.tag_blocklist:
self.__in_dangerous_tag -= 1
def handle_data(self, data: str):
......@@ -141,14 +141,14 @@ class _HTMLParser(parser.HTMLParser):
self.__textrepr += escape(data)
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
if tag in self.tag_required_blacklist | self.tag_blacklist:
if tag in self.tag_required_blocklist | self.tag_blocklist:
meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data')
self.__meta[name] = content
if self.__in_dangerous_tag == 0:
if tag in self.tag_required_blacklist:
if tag in self.tag_required_blocklist:
self.__textrepr += '<' + tag + ' />'
return
......