Georg Faerber · Georg Faerber · Georg Faerber · Georg Faerber · Georg Faerber · Georg Faerber
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
-image: debian
+variables:
+  CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images

 stages:
  - linting
  - test

-bandit:
+linting:bandit:
+  # Bandit scans the CLI script, the Nautilus extension and libmat2; B101 is skipped everywhere.
+  image: $CONTAINER_REGISTRY:linting
  stage: linting
  script:  # TODO: remove B405 and B314
-  - apt-get -qqy update
-  - apt-get -qqy install --no-install-recommends python3-bandit
    - bandit ./mat2 --format txt --skip B101
    - bandit -r ./nautilus/ --format txt --skip B101
    - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314

-pylint:
+linting:pylint:
+  image: $CONTAINER_REGISTRY:linting
  stage: linting
  script:
-  - apt-get -qqy update
-  - apt-get -qqy install --no-install-recommends pylint3 python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0
-  - pylint3 --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
+    - pylint3 --disable=no-else-return --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
    # Once nautilus-python is in Debian, decomment it form the line below
-  - pylint3 --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
+    - pylint3 --disable=no-else-return --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py

-pyflakes:
+linting:pyflakes:
+  image: $CONTAINER_REGISTRY:linting
  stage: linting
  script:
-  - apt-get -qqy update
-  - apt-get -qqy install --no-install-recommends pyflakes3
    - pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus

-mypy:
+linting:mypy:
+  image: $CONTAINER_REGISTRY:linting
  stage: linting
  script:
-  - apt-get -qqy update
-  - apt-get -qqy install --no-install-recommends python3-pip
-  - pip3 install mypy
    - mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py

+tests:archlinux:
+  # Runs the test suite through setup.py inside the archlinux CI image.
+  image: $CONTAINER_REGISTRY:archlinux
+  stage: test
+  script:
+    - python3 setup.py test
+
 tests:debian:
+  image: $CONTAINER_REGISTRY:debian
  stage: test
  script:
-  - apt-get -qqy update
-  - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
    - apt-get -qqy purge bubblewrap
    - python3-coverage run --branch -m unittest discover -s tests/
    - python3-coverage report --fail-under=90 -m --include 'libmat2/*'

 tests:debian_with_bubblewrap:
+  image: $CONTAINER_REGISTRY:debian
  stage: test
-  tags:
-    - whitewhale
+  allow_failure: true
  script:
-  - apt-get -qqy update
-  - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg bubblewrap
    - python3-coverage run --branch -m unittest discover -s tests/
    - python3-coverage report --fail-under=100 -m --include 'libmat2/*'

 tests:fedora:
-  image: fedora
+  image: $CONTAINER_REGISTRY:fedora
  stage: test
-  tags:
-    - whitewhale
  script:
-  - dnf install -y python3 python3-mutagen python3-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 gdk-pixbuf2-modules cairo-gobject cairo python3-cairo perl-Image-ExifTool mailcap
-  - gdk-pixbuf-query-loaders-64 > /usr/lib64/gdk-pixbuf-2.0/2.10.0/loaders.cache
    - python3 setup.py test

-tests:archlinux:
-  image: archlinux/base
+tests:gentoo:
+  image: $CONTAINER_REGISTRY:gentoo
  stage: test
-  tags:
-    - whitewhale
+  allow_failure: true
  script:
-  - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
-  - python3 setup.py test
+    - python3 -m unittest discover -v
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
+# 0.9.0 - 2019-05-10
+
+- Add tar/tar.gz/tar.bz2/tar.xz archives support
+- Add support for xhtml files
+- Improve handling of read-only files
+- Slightly improve the command line's documentation
+- Fix a confusing error message
+- Add even more tests
+- Usual internal cleanups/refactorings
+
 # 0.8.0 - 2019-02-28

 - Add support for epub files

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
 # Contributing to MAT2

 The main repository for MAT2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
-with a mirror on [gitlab.com]( https://gitlab.com/jvoisin/mat2 ).
+but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.

 Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
 and to send a pull-request. Please do check that everything is fine by running the
@@ -29,9 +29,10 @@ Since MAT2 is written in Python3, please conform as much as possible to the
 6. Create a tag with `git tag -s $VERSION`
 7. Push the commit with `git push origin master`
 8. Push the tag with `git push --tags`
-9. Create the signed tarball with `git archive --format=tar.xz --prefix=mat-$VERSION/ $VERSION > mat-$VERSION.tar.xz`
-10. Sign the tarball with `gpg --armor --detach-sign mat-$VERSION.tar.xz`
-11. Upload the result on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
-12. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
-13. Upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
-14. Do the secret release dance
+9. Download the gitlab archive of the release
+10. Diff it against the local copy
+11. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
+12. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
+13. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
+14. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
+15. Do the secret release dance
--- a/README.md
+++ b/README.md
@@ -70,7 +70,8 @@ optional arguments:
  -V, --verbose         show more verbose status information
  --unknown-members policy
                        how to handle unknown members of archive-style files
-                        (policy should be one of: abort, omit, keep)
+                        (policy should be one of: abort, omit, keep) [Default:
+                        abort]
  -s, --show            list harmful metadata detectable by MAT2 without
                        removing them
  -L, --lightweight     remove SOME metadata

--- a/debian/changelog
+++ b/debian/changelog
+mat2 (0.9.0-1) unstable; urgency=medium
+
+  * New upstream release.
+  * d/control:
+    - Bump debhelper compat level to 12 and Standards-Version to 4.4.0, no
+      changes required.
+    - Drop ancient X-Python3-Version field.
+    - Extend description to mention new support for .xhtml and .tar.xz files.
+    - Use my debian.org mail address.
+  * d/gitlab-ci.yml:
+    - Pull in changes made upstream by the Salsa CI team to make the CI work
+      again.
+
+ -- Georg Faerber <georg@debian.org>  Wed, 10 Jul 2019 17:51:24 +0000
+
 mat2 (0.8.0-3) unstable; urgency=medium

  * Upload to unstable. This adds a new binary package 'mat' which handles

--- a/debian/control
+++ b/debian/control
@@ -2,9 +2,9 @@ Source: mat2
 Section: utils
 Priority: optional
 Maintainer: Debian Privacy Tools Maintainers <pkg-privacy-maintainers@lists.alioth.debian.org>
-Uploaders: Georg Faerber <georg@riseup.net>,
+Uploaders: Georg Faerber <georg@debian.org>,
           Jonas Meurer <jonas@freesources.org>,
-Build-Depends: debhelper-compat (= 11),
+Build-Depends: debhelper-compat (= 12),
               dh-exec,
               dh-python,
               ffmpeg,
@@ -15,8 +15,7 @@ Build-Depends: debhelper-compat (= 11),
               python3-gi-cairo,
               python3-mutagen,
               python3-setuptools,
-Standards-Version: 4.3.0
-X-Python3-Version: >= 3.5
+Standards-Version: 4.4.0
 Homepage: https://0xacab.org/jvoisin/mat2
 Vcs-Git: https://salsa.debian.org/pkg-privacy-team/mat2.git
 Vcs-Browser: https://salsa.debian.org/pkg-privacy-team/mat2
@@ -61,7 +60,7 @@ Description: Metadata anonymisation toolkit v2
    - Electronic Publication (.epub)
    - Free Lossless Audio Codec (.flac)
    - Graphics Interchange Format (.gif)
-    - Hypertext Markup Language (.html)
+    - Hypertext Markup Language (.html, .xhtml)
    - Portable Network Graphics (PNG)
    - JPEG (.jpeg, .jpg, ...)
    - MPEG Audio (.mp3, .mp2, .mp1, .mpa)
@@ -70,7 +69,7 @@ Description: Metadata anonymisation toolkit v2
    - Ogg Vorbis (.ogg)
    - Open Document (.odt, .odx, .ods, ...)
    - Portable Document Fileformat (.pdf)
-    - Tape ARchive (.tar, .tar.bz2, .tar.gz)
+    - Tape ARchive (.tar, .tar.bz2, .tar.gz, .tar.xz)
    - Torrent (.torrent)
    - Windows Media Video (.wmv)
    - ZIP (.zip)

--- a/debian/gitlab-ci.yml
+++ b/debian/gitlab-ci.yml
-include: https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml
-
-.limits: &limits
-    except:
-      - tags
-    only:
-      changes:
-        - debian/**/*
-
-build:
-    extends: .build-unstable
-    <<: *limits
-
-lintian:
-    extends: .test-lintian
-    <<: *limits
-
-piuparts:
-    extends: .test-piuparts
-    <<: *limits
-
-autopkgtest:
-    extends: .test-autopkgtest
-    <<: *limits
-
-reprotest:
-    extends: .test-reprotest
-    <<: *limits
+---
+include:
+  - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml
+  - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml
--- a/doc/mat2.1
+++ b/doc/mat2.1
-.TH MAT2 "1" "February 2019" "MAT2 0.8.0" "User Commands"
+.TH MAT2 "1" "May 2019" "MAT2 0.9.0" "User Commands"

 .SH NAME
 mat2 \- the metadata anonymisation toolkit 2

--- a/libmat2/__init__.py
+++ b/libmat2/__init__.py
@@ -30,27 +30,35 @@ UNSUPPORTED_EXTENSIONS = {
    }

 DEPENDENCIES = {
-    'cairo': 'Cairo',
-    'gi': 'PyGobject',
-    'gi.repository.GdkPixbuf': 'GdkPixbuf from PyGobject',
-    'gi.repository.Poppler': 'Poppler from PyGobject',
-    'gi.repository.GLib': 'GLib from PyGobject',
-    'mutagen': 'Mutagen',
+    'Cairo': 'cairo',
+    'PyGobject': 'gi',
+    'GdkPixbuf from PyGobject': 'gi.repository.GdkPixbuf',
+    'Poppler from PyGobject': 'gi.repository.Poppler',
+    'GLib from PyGobject': 'gi.repository.GLib',
+    'Mutagen': 'mutagen',
    }

+CMD_DEPENDENCIES = {
+    'Exiftool': exiftool._get_exiftool_path,
+    'Ffmpeg': video._get_ffmpeg_path,
+    }

 def check_dependencies() -> Dict[str, bool]:
    ret = collections.defaultdict(bool)  # type: Dict[str, bool]

-    ret['Exiftool'] = bool(exiftool._get_exiftool_path())
-    ret['Ffmpeg'] = bool(video._get_ffmpeg_path())
-
    for key, value in DEPENDENCIES.items():
-        ret[value] = True
+        ret[key] = True
        try:
-            importlib.import_module(key)
+            importlib.import_module(value)
        except ImportError:  # pragma: no cover
-            ret[value] = False  # pragma: no cover
+            ret[key] = False  # pragma: no cover
+
+    for k, v in CMD_DEPENDENCIES.items():
+        ret[k] = True
+        try:
+            v()
+        except RuntimeError:  # pragma: no cover
+            ret[k] = False

    return ret


--- a/libmat2/abstract.py
+++ b/libmat2/abstract.py
@@ -25,17 +25,22 @@ class AbstractParser(abc.ABC):

        self.filename = filename
        fname, extension = os.path.splitext(filename)
+
+        # Special case for tar.gz, tar.bz2, … files
+        if fname.endswith('.tar') and len(fname) > 4:
+            fname, extension = fname[:-4], '.tar' + extension
+
        self.output_filename = fname + '.cleaned' + extension
        self.lightweight_cleaning = False

    @abc.abstractmethod
    def get_meta(self) -> Dict[str, Union[str, dict]]:
-        pass  # pragma: no cover
+        """Return all the metadata of the current file"""

    @abc.abstractmethod
    def remove_all(self) -> bool:
        """
+        Remove all the metadata of the current file
+
        :raises RuntimeError: Raised if the cleaning process went wrong.
        """
-        # pylint: disable=unnecessary-pass
-        pass  # pragma: no cover
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
--- a/libmat2/audio.py
+++ b/libmat2/audio.py
@@ -38,6 +38,8 @@ class MP3Parser(MutagenParser):
        metadata = {}  # type: Dict[str, Union[str, dict]]
        meta = mutagen.File(self.filename).tags
        for key in meta:
+            if not hasattr(meta[key], 'text'):  # pragma: no cover
+                continue
            metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
        return metadata


--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET  # type: ignore

 from . import archive, office

-class EPUBParser(archive.ArchiveBasedAbstractParser):
+class EPUBParser(archive.ZipParser):
    mimetypes = {'application/epub+zip', }
    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'


--- a/libmat2/exiftool.py
+++ b/libmat2/exiftool.py
@@ -15,14 +15,14 @@ class ExiftoolParser(abstract.AbstractParser):
    from a import file, hence why several parsers are re-using its `get_meta`
    method.
    """
-    meta_whitelist = set()  # type: Set[str]
+    meta_allowlist = set()  # type: Set[str]

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        out = subprocess.run([_get_exiftool_path(), '-json', self.filename],
                             input_filename=self.filename,
                             check=True, stdout=subprocess.PIPE).stdout
        meta = json.loads(out.decode('utf-8'))[0]
-        for key in self.meta_whitelist:
+        for key in self.meta_allowlist:
            meta.pop(key, None)
        return meta


--- a/libmat2/images.py
+++ b/libmat2/images.py
@@ -15,7 +15,7 @@ assert Set

 class PNGParser(exiftool.ExiftoolParser):
    mimetypes = {'image/png', }
-    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
+    meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
                      'Directory', 'FileSize', 'FileModifyDate',
                      'FileAccessDate', 'FileInodeChangeDate',
                      'FilePermissions', 'FileType', 'FileTypeExtension',
@@ -44,7 +44,7 @@ class PNGParser(exiftool.ExiftoolParser):

 class GIFParser(exiftool.ExiftoolParser):
    mimetypes = {'image/gif'}
-    meta_whitelist = {'AnimationIterations', 'BackgroundColor', 'BitsPerPixel',
+    meta_allowlist = {'AnimationIterations', 'BackgroundColor', 'BitsPerPixel',
                      'ColorResolutionDepth', 'Directory', 'Duration',
                      'ExifToolVersion', 'FileAccessDate',
                      'FileInodeChangeDate', 'FileModifyDate', 'FileName',
@@ -86,7 +86,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
 class JPGParser(GdkPixbufAbstractParser):
    _type = 'jpeg'
    mimetypes = {'image/jpeg'}
-    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
+    meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
                      'Directory', 'FileSize', 'FileModifyDate',
                      'FileAccessDate', "FileInodeChangeDate",
                      'FilePermissions', 'FileType', 'FileTypeExtension',
@@ -99,7 +99,7 @@ class JPGParser(GdkPixbufAbstractParser):
 class TiffParser(GdkPixbufAbstractParser):
    _type = 'tiff'
    mimetypes = {'image/tiff'}
-    meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
+    meta_allowlist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
                      'FillOrder', 'PhotometricInterpretation',
                      'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel',
                      'StripByteCounts', 'StripOffsets', 'BitsPerSample',

--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -6,7 +6,7 @@ from typing import Dict, Set, Pattern, Tuple, Any

 import xml.etree.ElementTree as ET  # type: ignore

-from .archive import ArchiveBasedAbstractParser
+from .archive import ZipParser

 # pylint: disable=line-too-long

@@ -43,7 +43,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
    return True


-class MSOfficeParser(ArchiveBasedAbstractParser):
+class MSOfficeParser(ZipParser):
    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
@@ -89,7 +89,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
            r'^word/theme',
            r'^word/people\.xml$',

-            # we have a whitelist in self.files_to_keep,
+            # we have an allowlist in self.files_to_keep,
            # so we can trash everything else
            r'^word/_rels/',
        }))
@@ -100,7 +100,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
    def __fill_files_to_keep_via_content_types(self) -> bool:
        """ There is a suer-handy `[Content_Types].xml` file
        in MS Office archives, describing what each other file contains.
-        The self.content_types_to_keep member contains a type whitelist,
+        The self.content_types_to_keep member contains a type allowlist,
        so we're using it to fill the self.files_to_keep one.
        """
        with zipfile.ZipFile(self.filename) as zin:
@@ -220,7 +220,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
                for file_to_omit in self.files_to_omit:
                    if file_to_omit.search(fname):
                        matches = map(lambda r: r.search(fname), self.files_to_keep)
-                        if any(matches):  # the file is whitelisted
+                        if any(matches):  # the file is in the allowlist
                            continue
                        removed_fnames.add(fname)
                        break
@@ -312,7 +312,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
                return {file_path: 'harmful content', }


-class LibreOfficeParser(ArchiveBasedAbstractParser):
+class LibreOfficeParser(ZipParser):
    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',

--- a/libmat2/parser_factory.py
+++ b/libmat2/parser_factory.py
@@ -7,13 +7,10 @@ from typing import TypeVar, List, Tuple, Optional

 from . import abstract, UNSUPPORTED_EXTENSIONS

-assert Tuple  # make pyflakes happy
-
 T = TypeVar('T', bound='abstract.AbstractParser')

 mimetypes.add_type('application/epub+zip', '.epub')
-# EPUB Navigation Control XML File
-mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
+mimetypes.add_type('application/x-dtbncx+xml', '.ncx')  # EPUB Navigation Control XML File


 def __load_all_parsers():
@@ -43,13 +40,17 @@ def _get_parsers() -> List[T]:


 def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
-    """ Return the appropriate parser for a giver filename. """
+    """ Return the appropriate parser for a given filename. """
    mtype, _ = mimetypes.guess_type(filename)

    _, extension = os.path.splitext(filename)
    if extension.lower() in UNSUPPORTED_EXTENSIONS:
        return None, mtype

+    if mtype == 'application/x-tar':
+        if extension[1:] in ('bz2', 'gz', 'xz'):
+            mtype = mtype + '+' + extension[1:]
+
    for parser_class in _get_parsers():  # type: ignore
        if mtype in parser_class.mimetypes:
            try:

--- a/libmat2/torrent.py
+++ b/libmat2/torrent.py
@@ -6,7 +6,7 @@ from . import abstract

 class TorrentParser(abstract.AbstractParser):
    mimetypes = {'application/x-bittorrent', }
-    whitelist = {b'announce', b'announce-list', b'info'}
+    allowlist = {b'announce', b'announce-list', b'info'}

    def __init__(self, filename):
        super().__init__(filename)
@@ -18,14 +18,14 @@ class TorrentParser(abstract.AbstractParser):
    def get_meta(self) -> Dict[str, Union[str, dict]]:
        metadata = {}
        for key, value in self.dict_repr.items():
-            if key not in self.whitelist:
+            if key not in self.allowlist:
                metadata[key.decode('utf-8')] = value
        return metadata

    def remove_all(self) -> bool:
        cleaned = dict()
        for key, value in self.dict_repr.items():
-            if key in self.whitelist:
+            if key in self.allowlist:
                cleaned[key] = value
        with open(self.output_filename, 'wb') as f:
            f.write(_BencodeHandler().bencode(cleaned))

--- a/libmat2/video.py
+++ b/libmat2/video.py
@@ -10,10 +10,10 @@ from . import subprocess
 class AbstractFFmpegParser(exiftool.ExiftoolParser):
    """ Abstract parser for all FFmpeg-based ones, mainly for video. """
    # Some fileformats have mandatory metadata fields
-    meta_key_value_whitelist = {}  # type: Dict[str, Union[str, int]]
+    meta_key_value_allowlist = {}  # type: Dict[str, Union[str, int]]

    def remove_all(self) -> bool:
-        if self.meta_key_value_whitelist:
+        if self.meta_key_value_allowlist:
            logging.warning('The format of "%s" (%s) has some mandatory '
                            'metadata fields; mat2 filled them with standard '
                            'data.', self.filename, ', '.join(self.mimetypes))
@@ -45,8 +45,8 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):

        ret = dict()  # type: Dict[str, Union[str, dict]]
        for key, value in meta.items():
-            if key in self.meta_key_value_whitelist.keys():
-                if value == self.meta_key_value_whitelist[key]:
+            if key in self.meta_key_value_allowlist.keys():
+                if value == self.meta_key_value_allowlist[key]:
                    continue
            ret[key] = value
        return ret
@@ -54,7 +54,7 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):

 class WMVParser(AbstractFFmpegParser):
    mimetypes = {'video/x-ms-wmv', }
-    meta_whitelist = {'AudioChannels', 'AudioCodecID', 'AudioCodecName',
+    meta_allowlist = {'AudioChannels', 'AudioCodecID', 'AudioCodecName',
                      'ErrorCorrectionType', 'AudioSampleRate', 'DataPackets',
                      'Directory', 'Duration', 'ExifToolVersion',
                      'FileAccessDate', 'FileInodeChangeDate', 'FileLength',
@@ -64,7 +64,7 @@ class WMVParser(AbstractFFmpegParser):
                      'ImageWidth', 'MIMEType', 'MaxBitrate', 'MaxPacketSize',
                      'Megapixels', 'MinPacketSize', 'Preroll', 'SendDuration',
                      'SourceFile', 'StreamNumber', 'VideoCodecName', }
-    meta_key_value_whitelist = {  # some metadata are mandatory :/
+    meta_key_value_allowlist = {  # some metadata are mandatory :/
        'AudioCodecDescription': '',
        'CreationDate': '0000:00:00 00:00:00Z',
        'FileID': '00000000-0000-0000-0000-000000000000',
@@ -78,7 +78,7 @@ class WMVParser(AbstractFFmpegParser):

 class AVIParser(AbstractFFmpegParser):
    mimetypes = {'video/x-msvideo', }
-    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
+    meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
                      'FileSize', 'FileModifyDate', 'FileAccessDate',
                      'FileInodeChangeDate', 'FilePermissions', 'FileType',
                      'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
@@ -98,7 +98,7 @@ class AVIParser(AbstractFFmpegParser):

 class MP4Parser(AbstractFFmpegParser):
    mimetypes = {'video/mp4', }
-    meta_whitelist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration',
+    meta_allowlist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration',
                      'XResolution', 'YResolution', 'ExifToolVersion',
                      'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate',
                      'FileName', 'FilePermissions', 'MIMEType', 'FileType',
@@ -109,7 +109,7 @@ class MP4Parser(AbstractFFmpegParser):
                      'MovieDataSize', 'VideoFrameRate', 'MediaTimeScale',
                      'SourceImageHeight', 'SourceImageWidth',
                      'MatrixStructure', 'MediaDuration'}
-    meta_key_value_whitelist = {  # some metadata are mandatory :/
+    meta_key_value_allowlist = {  # some metadata are mandatory :/
        'CreateDate': '0000:00:00 00:00:00',
        'CurrentTime': '0 s',
        'MediaCreateDate': '0000:00:00 00:00:00',

--- a/libmat2/web.py
+++ b/libmat2/web.py
@@ -37,15 +37,15 @@ class CSSParser(abstract.AbstractParser):


 class AbstractHTMLParser(abstract.AbstractParser):
-    tags_blacklist = set()  # type: Set[str]
+    tags_blocklist = set()  # type: Set[str]
    # In some html/xml-based formats some tags are mandatory,
-    # so we're keeping them, but are discaring their content
-    tags_required_blacklist = set()  # type: Set[str]
+    # so we're keeping them, but are discarding their content
+    tags_required_blocklist = set()  # type: Set[str]

    def __init__(self, filename):
        super().__init__(filename)
-        self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
-                                    self.tags_required_blacklist)
+        self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
+                                    self.tags_required_blocklist)
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()
@@ -58,14 +58,14 @@ class AbstractHTMLParser(abstract.AbstractParser):


 class HTMLParser(AbstractHTMLParser):
-    mimetypes = {'text/html', }
-    tags_blacklist = {'meta', }
-    tags_required_blacklist = {'title', }
+    mimetypes = {'text/html', 'application/xhtml+xml'}
+    tags_blocklist = {'meta', }
+    tags_required_blocklist = {'title', }


 class DTBNCXParser(AbstractHTMLParser):
    mimetypes = {'application/x-dtbncx+xml', }
-    tags_required_blacklist = {'title', 'doctitle', 'meta'}
+    tags_required_blocklist = {'title', 'doctitle', 'meta'}


 class _HTMLParser(parser.HTMLParser):
@@ -79,7 +79,7 @@ class _HTMLParser(parser.HTMLParser):

    Also, gotcha: the `tag` parameters are always in lowercase.
    """
-    def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
+    def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
        super().__init__()
        self.filename = filename
        self.__textrepr = ''
@@ -90,24 +90,24 @@ class _HTMLParser(parser.HTMLParser):
        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0

-        if required_blacklisted_tags & blacklisted_tags:  # pragma: nocover
+        if required_blocklisted_tags & blocklisted_tags:  # pragma: nocover
            raise ValueError("There is an overlap between %s and %s" % (
-                required_blacklisted_tags, blacklisted_tags))
-        self.tag_required_blacklist = required_blacklisted_tags
-        self.tag_blacklist = blacklisted_tags
+                required_blocklisted_tags, blocklisted_tags))
+        self.tag_required_blocklist = required_blocklisted_tags
+        self.tag_blocklist = blocklisted_tags

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        original_tag = self.get_starttag_text()
        self.__validation_queue.append(original_tag)

-        if tag in self.tag_blacklist:
+        if tag in self.tag_blocklist:
            self.__in_dangerous_tag += 1

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__textrepr += original_tag

-        if tag in self.tag_required_blacklist:
+        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag += 1

    def handle_endtag(self, tag: str):
@@ -123,7 +123,7 @@ class _HTMLParser(parser.HTMLParser):
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))

-        if tag in self.tag_required_blacklist:
+        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag -= 1

        if self.__in_dangerous_tag == 0:
@@ -131,7 +131,7 @@ class _HTMLParser(parser.HTMLParser):
                # There is no `get_endtag_text()` method :/
                self.__textrepr += '</' + previous_tag + '>'

-        if tag in self.tag_blacklist:
+        if tag in self.tag_blocklist:
            self.__in_dangerous_tag -= 1

    def handle_data(self, data: str):
@@ -141,14 +141,14 @@ class _HTMLParser(parser.HTMLParser):
                    self.__textrepr += escape(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
-        if tag in self.tag_required_blacklist | self.tag_blacklist:
+        if tag in self.tag_required_blocklist | self.tag_blocklist:
            meta = {k:v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content

            if self.__in_dangerous_tag == 0:
-                if tag in self.tag_required_blacklist:
+                if tag in self.tag_required_blocklist:
                    self.__textrepr += '<' + tag + ' />'
                return