Skip to content
Commits on Source (8)
# 0.8.0 - 2019-02-28
- Add support for epub files
- Fix the setup.py file crashing on non-utf8 platforms
- Improve css support
- Improve html support
# 0.7.0 - 2019-02-17
- Add support for wmv files
......
......@@ -32,6 +32,7 @@ metadata.
- `gir1.2-gdkpixbuf-2.0` for images support
- `FFmpeg`, optionally, for video support
- `libimage-exiftool-perl` for everything else
- `bubblewrap`, optionally, for sandboxing
Please note that MAT2 requires at least Python3.5, meaning that it
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3).
......
mat2 (0.8.0-1) unstable; urgency=medium
* New upstream release.
* d/control:
- Recommend python-nautilus to make use of the Nautilus extension.
- Sort supported file formats in the description alphabetically.
- Mention new support for .epub files in the description.
* d/mat2.install:
- Install the Nautilus extension. (Closes: #910491)
* d/patches:
- Add patch to make the Nautilus extension work with Python 2.7, as
python-nautilus still builds against it in Debian buster. This is an
important step to handle the transition from mat to mat2.
Thanks a lot for input, comments and review to intrigeri, Daniel Kahn
Gillmor, Julien Voisin and Jonas Meurer!
-- Georg Faerber <georg@riseup.net> Thu, 28 Feb 2019 11:22:10 +0000
mat2 (0.7.0-1) unstable; urgency=medium
* New upstream release.
......
......@@ -32,6 +32,7 @@ Depends: gir1.2-gdkpixbuf-2.0,
${misc:Depends},
${python3:Depends},
Recommends: bubblewrap,
python-nautilus,
Suggests: ffmpeg,
Description: Metadata anonymisation toolkit v2
Metadata consist of information that characterizes data. Metadata are
......@@ -54,19 +55,20 @@ Description: Metadata anonymisation toolkit v2
any metadata, or better: use plain-text.
.
Formats supported to some extent are:
- Audio Video Interleave (.avi)
- Electronic Publication (.epub)
- Free Lossless Audio Codec (.flac)
- Graphics Interchange Format (.gif)
- Hypertext Markup Language (.html)
- Portable Network Graphics (PNG)
- JPEG (.jpeg, .jpg, ...)
- Open Document (.odt, .odx, .ods, ...)
- MPEG Audio (.mp3, .mp2, .mp1, .mpa)
- MPEG-4 (.mp4)
- Office Openxml (.docx, .pptx, .xlsx, ...)
- Ogg Vorbis (.ogg)
- Open Document (.odt, .odx, .ods, ...)
- Portable Document Fileformat (.pdf)
- Tape ARchive (.tar, .tar.bz2, .tar.gz)
- ZIP (.zip)
- MPEG Audio (.mp3, .mp2, .mp1, .mpa)
- Ogg Vorbis (.ogg)
- Free Lossless Audio Codec (.flac)
- Torrent (.torrent)
- Audio Video Interleave (.avi)
- MPEG-4 (.mp4)
- Windows Media Video (.wmv)
- Graphics Interchange Format (.gif)
- Hypertext Markup Language (.html)
- ZIP (.zip)
#!/usr/bin/dh-exec
mat2 => /usr/bin/mat2
nautilus/mat2.py => /usr/share/nautilus-python/extensions/mat2.py
From: Jonas Meurer <jonas@freesources.org>
Date: Fri, 22 Feb 2019 22:59:18 +0100
Subject: Patch nautilus/mat2.py to be a python2 wrapper around mat2
* Since Debian Buster still ships only Python 2 version of
nautilus-python, the only option to use the mat2 nautilus
extension is to make it a python2 wrapper around mat2.
Author: Georg Faerber <georg@riseup.net>, Julien Voisin
<julien.voisin@dustri.org>
Forwarded: not-needed
---
nautilus/mat2.py | 51 +++++++++++++++++++++++++++------------------------
1 file changed, 27 insertions(+), 24 deletions(-)
diff --git a/nautilus/mat2.py b/nautilus/mat2.py
index 562f8a7..d0794d0 100644
--- a/nautilus/mat2.py
+++ b/nautilus/mat2.py
@@ -1,5 +1,5 @@
-#!/usr/bin/env python3
-
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
"""
Because writing GUI is non-trivial (cf. https://0xacab.org/jvoisin/mat2/issues/3),
we decided to write a Nautilus extensions instead
@@ -12,34 +12,37 @@ thread, so we'll have to resort to using a `queue` to pass "messages" around.
# pylint: disable=no-name-in-module,unused-argument,no-self-use,import-error
-import queue
+import Queue as queue
import threading
-from typing import Tuple, Optional, List
-from urllib.parse import unquote
+from urlparse import unquote
import gi
gi.require_version('Nautilus', '3.0')
gi.require_version('Gtk', '3.0')
gi.require_version('GdkPixbuf', '2.0')
from gi.repository import Nautilus, GObject, Gtk, Gio, GLib, GdkPixbuf
+import subprocess
+import mimetypes
+
-from libmat2 import parser_factory
+def _remove_metadata(fpath):
+ """ This is a simple wrapper around the mat2 cli. """
+ try:
+ return subprocess.check_output(['mat2', fpath])
+ except subprocess.CalledProcessError, e:
+ return e.output
-def _remove_metadata(fpath) -> Tuple[bool, Optional[str]]:
- """ This is a simple wrapper around libmat2, because it's
- easier and cleaner this way.
- """
- parser, mtype = parser_factory.get_parser(fpath)
- if parser is None:
- return False, mtype
- return parser.remove_all(), mtype
+def _guess_mtype(fpath):
+ """ Function to guess the mtype of a given file. """
+ mtype, _ = mimetypes.guess_type(fpath)
+ return mtype
class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWidgetProvider):
""" This class adds an item to the right-clic menu in Nautilus. """
def __init__(self):
- super().__init__()
+ super(Mat2Extension, self).__init__()
self.infobar_hbox = None
self.infobar = None
self.failed_items = list()
@@ -61,7 +64,7 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
self.infobar.get_content_area().pack_start(self.infobar_hbox, True, True, 0)
self.infobar.show_all()
- def get_widget(self, uri, window) -> Gtk.Widget:
+ def get_widget(self, uri, window):
""" This is the method that we have to implement (because we're
a LocationWidgetProvider) in order to show our infobar.
"""
@@ -103,7 +106,7 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
window.show_all()
@staticmethod
- def __validate(fileinfo) -> Tuple[bool, str]:
+ def __validate(fileinfo):
""" Validate if a given file FileInfo `fileinfo` can be processed.
Returns a boolean, and a textreason why"""
if fileinfo.get_uri_scheme() != "file" or fileinfo.is_directory():
@@ -112,7 +115,7 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
return False, "Not writeable"
return True, ""
- def __create_treeview(self) -> Gtk.TreeView:
+ def __create_treeview(self):
liststore = Gtk.ListStore(GdkPixbuf.Pixbuf, str, str)
treeview = Gtk.TreeView(model=liststore)
@@ -144,7 +147,7 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
treeview.show_all()
return treeview
- def __create_progressbar(self) -> Gtk.ProgressBar:
+ def __create_progressbar(self):
""" Create the progressbar used to notify that files are currently
being processed.
"""
@@ -161,7 +164,7 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
return progressbar
- def __update_progressbar(self, processing_queue, progressbar) -> bool:
+ def __update_progressbar(self, processing_queue, progressbar):
""" This method is run via `Glib.add_idle` to update the progressbar."""
try:
fname = processing_queue.get(block=False)
@@ -186,7 +189,7 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
self.infobar.show_all()
return True
- def __clean_files(self, files: list, processing_queue: queue.Queue) -> bool:
+ def __clean_files(self, files, processing_queue):
""" This method is threaded in order to avoid blocking the GUI
while cleaning up the files.
"""
@@ -200,8 +203,8 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
continue
fpath = unquote(fileinfo.get_uri()[7:]) # `len('file://') = 7`
- success, mtype = _remove_metadata(fpath)
- if not success:
+ if _remove_metadata(fpath):
+ mtype = _guess_mtype(fpath)
self.failed_items.append((fname, mtype, 'Unsupported/invalid'))
processing_queue.put(None) # signal that we processed all the files
return True
@@ -226,7 +229,7 @@ class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWid
""" https://bugzilla.gnome.org/show_bug.cgi?id=784278 """
return None
- def get_file_items(self, window, files) -> Optional[List[Nautilus.MenuItem]]:
+ def get_file_items(self, window, files):
""" This method is the one allowing us to create a menu item.
"""
# Do not show the menu item if not a single file has a chance to be
0001-nautilus-ext-python2.7.patch
.TH MAT2 "1" "February 2019" "MAT2 0.7.0" "User Commands"
.TH MAT2 "1" "February 2019" "MAT2 0.8.0" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
......
......@@ -4,13 +4,14 @@ import tempfile
import os
import logging
import shutil
from typing import Dict, Set, Pattern, Union, Any
from typing import Dict, Set, Pattern, Union, Any, List
from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
assert List
assert Union
......@@ -115,9 +116,18 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
temp_folder = tempfile.mkdtemp()
abort = False
items = list() # type: List[zipfile.ZipInfo]
for item in sorted(zin.infolist(), key=lambda z: z.filename):
# Some file formats require the `mimetype` file to be
# the first file in the archive.
if item.filename == 'mimetype':
items = [item] + items
else:
items.append(item)
# Since files order is a fingerprint factor,
# we're iterating (and thus inserting) them in lexicographic order.
for item in sorted(zin.infolist(), key=lambda z: z.filename):
for item in items:
if item.filename[-1] == '/': # `is_dir` is added in Python3.6
continue # don't keep empty folders
......
import logging
import re
import uuid
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """ Parser for EPUB files.

    The metadata are read from `OEBPS/content.opf`; on cleanup, the
    `<metadata>` block of `OEBPS/content.opf` and the `<head>` block of
    `OEBPS/toc.ncx` are replaced with minimal placeholders.
    """
    mimetypes = {'application/epub+zip', }
    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'

    def __init__(self, filename):
        super().__init__(filename)
        # NOTE(review): presumably consulted by the archive base class to
        # decide which members are kept as-is; confirm against archive.py
        self.files_to_keep = set(map(re.compile, {  # type: ignore
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
            }))
        # Random identifier written into the cleaned `content.opf`
        self.uniqid = uuid.uuid4()

    def _specific_get_meta(self, full_path, file_path):
        """ Return the metadata found in `OEBPS/content.opf`.

        Every other member yields an empty dict. If the file can't be read
        as utf-8 text, a single `{file_path: 'harmful content'}` entry is
        returned instead.
        """
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                     f.read(), re.I|re.M)
                return dict(results)
            except (TypeError, UnicodeDecodeError):
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        """ Clean the two metadata-bearing files, and accept everything else.

        Returns False when the file to clean can't be parsed.
        """
        if full_path.endswith('OEBPS/content.opf'):
            return self.__handle_contentopf(full_path)
        elif full_path.endswith('OEBPS/toc.ncx'):
            return self.__handle_tocncx(full_path)
        return True

    def __handle_tocncx(self, full_path: str):
        """ Replace the content of the `<head>` element of `toc.ncx` with a
        single empty `<meta>` element, and rewrite the file in place.
        """
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:  # pragma: nocover
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('head'):
                item.clear()
                ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
                break  # only the first matching element is processed
        tree.write(full_path, xml_declaration=True, encoding='utf-8',
                   short_empty_elements=False)
        return True

    def __handle_contentopf(self, full_path: str):
        """ Replace the content of the `<metadata>` element of `content.opf`
        with a random identifier and empty `language`/`title` elements, and
        rewrite the file in place.
        """
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('metadata'):
                item.clear()

                # item with mandatory content
                uniqid = ET.Element(self.metadata_namespace + 'identifier')
                uniqid.text = str(self.uniqid)
                uniqid.set('id', 'id')
                item.append(uniqid)

                # items without mandatory content
                # We're iterating over a tuple instead of a set: the
                # iteration order of a set of strings changes between runs,
                # and the order of the elements is a fingerprint factor.
                for name in ('language', 'title'):
                    placeholder = ET.Element(self.metadata_namespace + name)
                    item.append(placeholder)
                break  # there is only a single <metadata> block
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
from html import parser
from typing import Dict, Any, List, Tuple
from . import abstract
class HTMLParser(abstract.AbstractParser):
    """ Parser for html files: the whole document is fed to an internal
    `_HTMLParser` on instantiation, which does the actual work.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser()
        # Explicitly decode the file as utf-8, instead of relying on the
        # platform's default encoding, which isn't utf-8 everywhere.
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """ Return the metadata collected while parsing the file. """
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """ Write the cleaned document to `self.output_filename`. """
        return self.__parser.remove_all(self.output_filename)
class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.
    """
    def __init__(self):
        super().__init__()
        self.__textrepr = ''  # the cleaned document, built while parsing
        self.__meta = {}  # content of the `<meta/>` tags, indexed by name
        self.__validation_queue = []  # tags that are currently open

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """ Keep the opening tag as is, and remember that it's open. """
        self.__textrepr += self.get_starttag_text()
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """ Keep the closing tag.

        Raises ValueError if it doesn't close the most recently opened tag.
        """
        if not self.__validation_queue:
            raise ValueError
        elif tag != self.__validation_queue.pop():
            raise ValueError
        # There is no `get_endtag_text()` method :/
        self.__textrepr += '</' + tag + '>\n'

    def handle_data(self, data: str):
        """ Keep the text, unless it's only made of whitespace. """
        if data.strip():
            self.__textrepr += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """ Record and drop self-closing `meta` tags, keep the other ones. """
        if tag == 'meta':
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__textrepr += self.get_starttag_text()

    def remove_all(self, output_filename: str) -> bool:
        """ Write the cleaned document to `output_filename`.

        Raises ValueError if some tags were left unclosed.
        """
        if self.__validation_queue:
            raise ValueError
        # Explicitly encode the output as utf-8: relying on the platform's
        # default encoding makes the write fail on non-utf8 platforms as
        # soon as the document contains non-ascii characters.
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """ Return the metadata found in the document.

        Raises ValueError if some tags were left unclosed.
        """
        if self.__validation_queue:
            raise ValueError
        return self.__meta
import logging
import glob
import os
import mimetypes
......@@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy
T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub')
# EPUB Navigation Control XML File
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
def __load_all_parsers():
""" Loads every parser in a dynamic way """
......@@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
if mtype in parser_class.mimetypes:
try:
return parser_class(filename), mtype
except ValueError:
except ValueError as e:
logging.info("Got an exception when trying to instanciate "
"%s for %s: %s", parser_class, filename, e)
return None, mtype
return None, mtype
from html import parser, escape
from typing import Dict, Any, List, Tuple, Set
import re
import string
from . import abstract
assert Set
# pylint: disable=too-many-instance-attributes
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """ Write a copy of the file, without its comments, to
        `self.output_filename`.
        """
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.*?\*/', '', f.read(),
                             count=0, flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """ Return the content of the comments of the file.

        Lines of the form `key: value` are returned as such; anything else
        ends up under the `harmful data` key, the last line winning.
        """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                # Only split on the first colon, since the value itself
                # might contain some, like in `url: https://example.org`.
                key, colon, value = line.partition(':')
                if colon:
                    metadata[key.strip(string.whitespace + '*')] = value.strip()
                else:
                    metadata['harmful data'] = line.strip()
        return metadata
class AbstractHTMLParser(abstract.AbstractParser):
    """ Common base for the html/xml-based formats: subclasses only have
    to declare their mimetypes and which tags are to be blacklisted.
    """
    tags_blacklist = set()  # type: Set[str]
    # In some html/xml-based formats some tags are mandatory,
    # so we're keeping them, but are discarding their content
    tags_required_blacklist = set()  # type: Set[str]

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(
            self.filename,
            self.tags_blacklist,
            self.tags_required_blacklist,
        )
        # The whole document is parsed right away, on instantiation.
        with open(filename, encoding='utf-8') as source:
            markup = source.read()
        self.__parser.feed(markup)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """ Hand back the metadata gathered by the internal parser. """
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """ Have the internal parser write the cleaned document to
        `self.output_filename`.
        """
        return self.__parser.remove_all(self.output_filename)
class HTMLParser(AbstractHTMLParser):
    """ Parser for html files. """
    mimetypes = {'text/html', }
    # Those tags are dropped, along with everything nested inside them.
    tags_blacklist = {'meta', }
    # Those tags are kept, but their content is dropped.
    tags_required_blacklist = {'title', }
class DTBNCXParser(AbstractHTMLParser):
    """ Parser for `application/x-dtbncx+xml` files, like the `.ncx`
    navigation control files found in EPUBs.
    """
    mimetypes = {'application/x-dtbncx+xml', }
    # Those tags are kept, but their content is dropped.
    tags_required_blacklist = {'title', 'doctitle', 'meta'}
class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text
    method, so we have to use get_starttag_text instead, put its result in a
    LIFO, and transform it in a closing tag when needed.

    Also, gotcha: the `tag` parameters are always in lowercase.
    """
    def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
        """ `filename` is only used in error messages.

        Tags in `blacklisted_tags` are dropped along with their content,
        while tags in `required_blacklisted_tags` are kept, but emptied.
        Raises ValueError if a tag is present in both sets.
        """
        super().__init__()
        self.filename = filename
        self.__textrepr = ''  # the cleaned document, built while parsing
        self.__meta = {}
        # Raw text of the opening tags that are still waiting to be closed
        self.__validation_queue = []  # type: List[str]
        # We're using counters instead of booleans, to handle nested tags
        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0

        if required_blacklisted_tags & blacklisted_tags:  # pragma: nocover
            raise ValueError("There is an overlap between %s and %s" % (
                required_blacklisted_tags, blacklisted_tags))
        self.tag_required_blacklist = required_blacklisted_tags
        self.tag_blacklist = blacklisted_tags

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """ Keep the opening tag, unless it's blacklisted or nested in a
        blacklisted one, and remember that it's open.
        """
        original_tag = self.get_starttag_text()
        self.__validation_queue.append(original_tag)

        if tag in self.tag_blacklist:
            self.__in_dangerous_tag += 1

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__textrepr += original_tag

        # This is done after writing the tag, since a required tag has
        # to be kept: only its content is discarded.
        if tag in self.tag_required_blacklist:
            self.__in_dangerous_but_required_tag += 1

    def handle_endtag(self, tag: str):
        """ Keep the closing tag, under the same conditions as its
        opening counterpart.

        Raises ValueError if it doesn't close the most recently opened tag.
        """
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        previous_tag = self.__validation_queue.pop()
        previous_tag = previous_tag[1:-1]  # remove < and >
        # Remove attributes: the name of the tag can be followed by any
        # kind of whitespace (tab, newline, …), not only by a space.
        previous_tag = previous_tag.split(None, 1)[0]
        if tag != previous_tag.lower():
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))

        # This is done before writing the tag, to mirror `handle_starttag`
        if tag in self.tag_required_blacklist:
            self.__in_dangerous_but_required_tag -= 1

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                # There is no `get_endtag_text()` method :/
                self.__textrepr += '</' + previous_tag + '>'

        if tag in self.tag_blacklist:
            self.__in_dangerous_tag -= 1

    def handle_data(self, data: str):
        """ Keep the (escaped) text, unless it's only made of whitespace,
        or nested inside a blacklisted tag.
        """
        if self.__in_dangerous_but_required_tag == 0:
            if self.__in_dangerous_tag == 0:
                if data.strip():
                    self.__textrepr += escape(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """ Record the `name`/`content` attributes of blacklisted
        self-closing tags as metadata, and keep the other tags.

        Required tags are written back without their attributes.
        """
        if tag in self.tag_required_blacklist | self.tag_blacklist:
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content

            if self.__in_dangerous_tag == 0:
                if tag in self.tag_required_blacklist:
                    self.__textrepr += '<' + tag + ' />'
            return

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__textrepr += self.get_starttag_text()

    def __raise_if_unclosed(self):
        """ Raise a ValueError if some tags are still waiting to be closed. """
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))

    def remove_all(self, output_filename: str) -> bool:
        """ Write the cleaned document to `output_filename`.

        Raises ValueError if some tags were left unclosed.
        """
        self.__raise_if_unclosed()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """ Return the metadata found in the document.

        Raises ValueError if some tags were left unclosed.
        """
        self.__raise_if_unclosed()
        return self.__meta
......@@ -15,7 +15,7 @@ except ValueError as e:
print(e)
sys.exit(1)
__version__ = '0.7.0'
__version__ = '0.8.0'
# Make pyflakes happy
assert Tuple
......
import setuptools
with open("README.md", "r") as fh:
with open("README.md", encoding='utf-8') as fh:
long_description = fh.read()
setuptools.setup(
name="mat2",
version='0.7.0',
version='0.8.0',
author="Julien (jvoisin) Voisin",
author_email="julien.voisin+mat2@dustri.org",
description="A handy tool to trash your metadata",
......
/**
* This is my super css framework
* version: 1.0
* author : jvoisin
*/
body {
color: red;
background-color: blue;
}
.underline {
text-decoration: underline; /* underline is cool */
}
......@@ -7,7 +7,7 @@ import logging
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video, html
from libmat2 import harmless, video, web
# No need for logging messages: should something go wrong,
# the testsuite _will_ fail.
......@@ -220,52 +220,74 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/--output.avi')
def test_zip(self):
with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
zout.write('./tests/data/dirty.flac')
zout.write('./tests/data/dirty.docx')
zout.write('./tests/data/dirty.jpg')
zout.write('./tests/data/embedded_corrupted.docx')
p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
self.assertEqual(mimetype, 'application/zip')
meta = p.get_meta()
self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all())
os.remove('./tests/data/dirty.zip')
os.remove('./tests/data/clean.zip')
def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f:
f.write('<open>but not</closed>')
with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html')
web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
# Yes, we're able to deal with malformed html :/
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f:
f.write('<meta name=\'this" is="weird"/>')
p = html.HTMLParser('./tests/data/clean.html')
p = web.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all())
p = html.HTMLParser('./tests/data/clean.cleaned.html')
p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('</close>')
f.write('</meta>')
with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html')
web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<notclosed>')
p = html.HTMLParser('./tests/data/clean.html')
f.write('<meta><a>test</a><set/></meta><title></title><meta>')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
p = html.HTMLParser('./tests/data/clean.html')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<doctitle><br/></doctitle><br/><notclosed>')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
def test_epub(self):
    """ An epub whose `content.opf` is actually a jpeg file must be
    reported as harmful, and must fail to be cleaned.
    """
    epub_path = './tests/data/clean.epub'
    with zipfile.ZipFile(epub_path, 'w') as epub:
        epub.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')

    epub_parser, mtype = parser_factory.get_parser(epub_path)
    self.assertEqual(mtype, 'application/epub+zip')

    opf_meta = epub_parser.get_meta()['OEBPS/content.opf']
    self.assertEqual(opf_meta['OEBPS/content.opf'], 'harmful content')

    self.assertFalse(epub_parser.remove_all())
    os.remove(epub_path)
......@@ -83,6 +83,8 @@ class TestZipOrder(unittest.TestCase):
previous_name = ''
for item in zin.infolist():
if previous_name == '':
if item.filename == 'mimetype':
continue
previous_name = item.filename
continue
elif item.filename < previous_name:
......@@ -97,6 +99,8 @@ class TestZipOrder(unittest.TestCase):
previous_name = ''
for item in zin.infolist():
if previous_name == '':
if item.filename == 'mimetype':
continue
previous_name = item.filename
continue
self.assertGreaterEqual(item.filename, previous_name)
......