Commit 303aee94 authored by Jérémy Bobbio's avatar Jérémy Bobbio

Use lazy extraction instead of explicit bracketing

Previously, code requiring access to file content had to be explicitly
bracketed using get_content() for files to be extracted and then deleted.
Such construction is problematic for parallel processing as a file might be
processed be multiple operations currently (e.g. multiple files being extracted
from a unique archive at the same time).

We thus removes the get_content() context and @needs_content decorator to
prefer lazy path initialization: actual content will be made available through
the path property. The extraction will happen then if necessary.

The extracted file should normally be deleted when Python garbage collector
reclaims the object. As a safety net, we still have a global registry of all
temporary files and directories and remove them on exit.
parent 939b4b06
......@@ -21,6 +21,8 @@ from functools import wraps
import logging
from distutils.spawn import find_executable
import os
import shutil
import tempfile
VERSION = "42"
......@@ -115,4 +117,32 @@ def set_locale():
os.environ['TZ'] = 'UTC'
temp_files = []
temp_dirs = []
def get_named_temporary_file(*args, **kwargs):
f = tempfile.NamedTemporaryFile(*args, **kwargs)
temp_files.append(f.name)
return f
def get_temporary_directory(*args, **kwargs):
d = tempfile.TemporaryDirectory(*args, **kwargs)
temp_dirs.append(d)
return d
def clean_all_temp_files():
for temp_file in temp_files:
try:
os.unlink(temp_file)
except FileNotFoundError:
pass
except:
logger.exception('Unable to delete %s', temp_file)
for temp_dir in temp_dirs:
try:
temp_dir.cleanup()
except:
logger.exception('Unable to delete %s', temp_dir)
......@@ -30,7 +30,7 @@ try:
import tlsh
except ImportError:
tlsh = None
from diffoscope import logger, VERSION, set_locale
from diffoscope import logger, VERSION, set_locale, clean_all_temp_files
import diffoscope.comparators
from diffoscope.config import Config
from diffoscope.presenters.html import output_html
......@@ -153,6 +153,8 @@ def main(args=None):
import pdb
pdb.post_mortem()
sys.exit(2)
finally:
clean_all_temp_files()
if __name__ == '__main__':
main()
......@@ -94,19 +94,18 @@ def compare_root_paths(path1, path2):
def compare_files(file1, file2, source=None):
logger.debug('compare files %s and %s', file1, file2)
with file1.get_content(), file2.get_content():
if file1.has_same_content_as(file2):
logger.debug('same content, skipping')
return None
specialize(file1)
specialize(file2)
if isinstance(file1, NonExistingFile):
file1.other_file = file2
elif isinstance(file2, NonExistingFile):
file2.other_file = file1
elif file1.__class__.__name__ != file2.__class__.__name__:
return file1.compare_bytes(file2, source)
return file1.compare(file2, source)
if file1.has_same_content_as(file2):
logger.debug('same content, skipping')
return None
specialize(file1)
specialize(file2)
if isinstance(file1, NonExistingFile):
file1.other_file = file2
elif isinstance(file2, NonExistingFile):
file2.other_file = file1
elif file1.__class__.__name__ != file2.__class__.__name__:
return file1.compare_bytes(file2, source)
return file1.compare(file2, source)
def compare_commented_files(file1, file2, comment=None, source=None):
difference = compare_files(file1, file2, source=source)
......
......@@ -71,14 +71,6 @@ def compare_binary_files(file1, file2, source=None):
SMALL_FILE_THRESHOLD = 65536 # 64 kiB
# decorator for functions which needs to access the file content
# (and so requires a path to be set)
def needs_content(original_method):
@wraps(original_method)
def wrapper(self, other, *args, **kwargs):
with self.get_content(), other.get_content():
return original_method(self, other, *args, **kwargs)
return wrapper
class File(object, metaclass=ABCMeta):
if hasattr(magic, 'open'): # use Magic-file-extensions from file
......@@ -109,12 +101,22 @@ class File(object, metaclass=ABCMeta):
return self._mimedb_encoding.from_file(path).decode('utf-8')
def __repr__(self):
return '<%s %s %s>' % (self.__class__, self.name, self.path)
return '<%s %s>' % (self.__class__, self.name)
# Path should only be used when accessing the file content (through get_content())
# This should return a path that allows to access the file content
@property
@abstractmethod
def path(self):
return self._path
raise NotImplemented
# Remove any temporary data associated with the file. The function
# should be idempotent and work during the destructor. It does nothing by
# default.
def cleanup(self):
pass
def __del__(self):
self.cleanup()
# This might be different from path and is used to do file extension matching
@property
......@@ -124,32 +126,25 @@ class File(object, metaclass=ABCMeta):
@property
def magic_file_type(self):
if not hasattr(self, '_magic_file_type'):
with self.get_content():
self._magic_file_type = File.guess_file_type(self.path)
self._magic_file_type = File.guess_file_type(self.path)
return self._magic_file_type
if tlsh:
@property
def fuzzy_hash(self):
if not hasattr(self, '_fuzzy_hash'):
with self.get_content():
# tlsh is not meaningful with files smaller than 512 bytes
if os.stat(self.path).st_size >= 512:
h = tlsh.Tlsh()
with open(self.path, 'rb') as f:
for buf in iter(lambda: f.read(32768), b''):
h.update(buf)
h.final()
self._fuzzy_hash = h.hexdigest()
else:
self._fuzzy_hash = None
# tlsh is not meaningful with files smaller than 512 bytes
if os.stat(self.path).st_size >= 512:
h = tlsh.Tlsh()
with open(self.path, 'rb') as f:
for buf in iter(lambda: f.read(32768), b''):
h.update(buf)
h.final()
self._fuzzy_hash = h.hexdigest()
else:
self._fuzzy_hash = None
return self._fuzzy_hash
@abstractmethod
@contextmanager
def get_content(self):
raise NotImplemented
@abstractmethod
def is_directory():
raise NotImplemented
......@@ -162,7 +157,6 @@ class File(object, metaclass=ABCMeta):
def is_device():
raise NotImplemented
@needs_content
def compare_bytes(self, other, source=None):
return compare_binary_files(self, other, source)
......@@ -175,7 +169,6 @@ class File(object, metaclass=ABCMeta):
return difference
@tool_required('cmp')
@needs_content
def has_same_content_as(self, other):
logger.debug('%s has_same_content %s', self, other)
# try comparing small files directly first
......@@ -190,7 +183,6 @@ class File(object, metaclass=ABCMeta):
# To be specialized directly, or by implementing compare_details
@needs_content
def compare(self, other, source=None):
if hasattr(self, 'compare_details'):
try:
......@@ -227,17 +219,11 @@ class File(object, metaclass=ABCMeta):
class FilesystemFile(File):
def __init__(self, path):
self._path = None
self._name = path
@contextmanager
def get_content(self):
if self._path is not None:
yield
else:
self._path = self._name
yield
self._path = None
@property
def path(self):
return self._name
def is_directory(self):
return not os.path.islink(self._name) and os.path.isdir(self._name)
......@@ -261,10 +247,13 @@ class NonExistingFile(File):
return False
def __init__(self, path, other_file=None):
self._path = None
self._name = path
self._other_file = other_file
@property
def path(self):
return '/dev/null'
@property
def other_file(self):
return self._other_file
......@@ -276,12 +265,6 @@ class NonExistingFile(File):
def has_same_content_as(self, other):
return False
@contextmanager
def get_content(self):
self._path = '/dev/null'
yield
self._path = None
def is_directory(self):
return False
......
......@@ -21,7 +21,7 @@ import os.path
import re
import subprocess
import diffoscope.comparators
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import Archive, get_compressed_content_name, NO_COMMENT
from diffoscope import logger, tool_required
......@@ -62,7 +62,6 @@ class Bzip2File(File):
def recognizes(file):
return Bzip2File.RE_FILE_TYPE.match(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
with Bzip2Container(self).open() as my_container, \
Bzip2Container(other).open() as other_container:
......
......@@ -25,7 +25,7 @@ import subprocess
import stat
import struct
from diffoscope import logger, tool_required
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import Archive, Command
from diffoscope.difference import Difference
......@@ -94,40 +94,38 @@ def is_header_valid(buf, size, offset=0):
class CbfsFile(File):
@staticmethod
def recognizes(file):
with file.get_content():
size = os.stat(file.path).st_size
if size < CBFS_HEADER_SIZE:
return False
with open(file.path, 'rb') as f:
# pick at the latest byte as it should contain the relative offset of the header
f.seek(-4, io.SEEK_END)
# <pgeorgi> given the hardware we support so far, it looks like
# that field is now bound to be little endian
# -- #coreboot, 2015-10-14
rel_offset = struct.unpack('<i', f.read(4))[0]
if rel_offset < 0 and -rel_offset > CBFS_HEADER_SIZE and -rel_offset < size:
f.seek(rel_offset, io.SEEK_END)
logger.debug('looking for header at offset: %x', f.tell())
if is_header_valid(f.read(CBFS_HEADER_SIZE), size):
return True
elif not file.name.endswith('.rom'):
return False
else:
logger.debug('CBFS relative offset seems wrong, scanning whole image')
f.seek(0, io.SEEK_SET)
offset = 0
buf = f.read(CBFS_HEADER_SIZE)
while len(buf) >= CBFS_HEADER_SIZE:
if is_header_valid(buf, size, offset):
return True
if len(buf) - offset <= CBFS_HEADER_SIZE:
buf = f.read(32768)
offset = 0
else:
offset += 1
return False
@needs_content
size = os.stat(file.path).st_size
if size < CBFS_HEADER_SIZE:
return False
with open(file.path, 'rb') as f:
# pick at the latest byte as it should contain the relative offset of the header
f.seek(-4, io.SEEK_END)
# <pgeorgi> given the hardware we support so far, it looks like
# that field is now bound to be little endian
# -- #coreboot, 2015-10-14
rel_offset = struct.unpack('<i', f.read(4))[0]
if rel_offset < 0 and -rel_offset > CBFS_HEADER_SIZE and -rel_offset < size:
f.seek(rel_offset, io.SEEK_END)
logger.debug('looking for header at offset: %x', f.tell())
if is_header_valid(f.read(CBFS_HEADER_SIZE), size):
return True
elif not file.name.endswith('.rom'):
return False
else:
logger.debug('CBFS relative offset seems wrong, scanning whole image')
f.seek(0, io.SEEK_SET)
offset = 0
buf = f.read(CBFS_HEADER_SIZE)
while len(buf) >= CBFS_HEADER_SIZE:
if is_header_valid(buf, size, offset):
return True
if len(buf) - offset <= CBFS_HEADER_SIZE:
buf = f.read(32768)
offset = 0
else:
offset += 1
return False
def compare_details(self, other, source=None):
differences = []
differences.append(Difference.from_command(CbfsListing, self.path, other.path))
......
......@@ -20,7 +20,7 @@
import re
from diffoscope import tool_required
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.libarchive import LibarchiveContainer
from diffoscope.comparators.utils import Command
from diffoscope.difference import Difference
......@@ -39,7 +39,6 @@ class CpioFile(File):
def recognizes(file):
return CpioFile.RE_FILE_TYPE.search(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
differences = []
differences.append(Difference.from_command(
......
......@@ -21,7 +21,7 @@ import re
import os.path
from diffoscope import logger
from diffoscope.difference import Difference
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.libarchive import LibarchiveContainer
from diffoscope.comparators.utils import \
Archive, ArchiveMember, get_ar_content
......@@ -49,7 +49,6 @@ class DebFile(File):
def set_files_with_same_content_in_data(self, files):
self._files_with_same_content_in_data = files
@needs_content
def compare_details(self, other, source=None):
differences = []
my_content = get_ar_content(self.path)
......@@ -81,7 +80,6 @@ class Md5sumsFile(File):
d[path] = md5sum
return d
@needs_content
def compare(self, other, source=None):
if other.path is None:
return None
......@@ -122,7 +120,6 @@ class DebDataTarFile(File):
file.container.source.name.startswith('data.tar.') and \
isinstance(file.container.source.container.source, DebFile)
@needs_content
def compare_details(self, other, source=None):
differences = []
ignore_files = self.container.source.container.source.files_with_same_content_in_data
......
......@@ -25,7 +25,7 @@ import re
from debian.deb822 import Dsc
from diffoscope.changes import Changes
import diffoscope.comparators
from diffoscope.comparators.binary import File, NonExistingFile, needs_content
from diffoscope.comparators.binary import File, NonExistingFile
from diffoscope.comparators.utils import Container, NO_COMMENT
from diffoscope.config import Config
from diffoscope.difference import Difference
......@@ -53,15 +53,9 @@ class DebControlMember(File):
def name(self):
return self._name
@contextmanager
def get_content(self):
if self._path is not None:
yield
else:
with self.container.source.get_content():
self._path = os.path.join(os.path.dirname(self.container.source.path), self.name)
yield
self._path = None
@property
def path(self):
return os.path.join(os.path.dirname(self.container.source.path), self.name)
def is_directory(self):
return False
......@@ -109,7 +103,6 @@ class DebControlFile(File):
def deb822(self):
return self._deb822
@needs_content
def compare_details(self, other, source=None):
differences = []
......@@ -142,14 +135,13 @@ class DotChangesFile(DebControlFile):
def recognizes(file):
if not DotChangesFile.RE_FILE_EXTENSION.search(file.name):
return False
with file.get_content():
changes = Changes(filename=file.path)
try:
changes.validate(check_signature=False)
except FileNotFoundError:
return False
file._deb822 = changes
return True
changes = Changes(filename=file.path)
try:
changes.validate(check_signature=False)
except FileNotFoundError:
return False
file._deb822 = changes
return True
class DotDscFile(DebControlFile):
RE_FILE_EXTENSION = re.compile(r'\.dsc$')
......@@ -158,19 +150,18 @@ class DotDscFile(DebControlFile):
def recognizes(file):
if not DotDscFile.RE_FILE_EXTENSION.search(file.name):
return False
with file.get_content():
with open(file.path, 'rb') as f:
dsc = Dsc(f)
for d in dsc.get('Files'):
md5 = hashlib.md5()
# XXX: this will not work for containers
in_dsc_path = os.path.join(os.path.dirname(file.path), d['Name'])
if not os.path.exists(in_dsc_path):
return False
with open(in_dsc_path, 'rb') as f:
for buf in iter(partial(f.read, 32768), b''):
md5.update(buf)
if md5.hexdigest() != d['md5sum']:
return False
file._deb822 = dsc
return True
with open(file.path, 'rb') as f:
dsc = Dsc(f)
for d in dsc.get('Files'):
md5 = hashlib.md5()
# XXX: this will not work for containers
in_dsc_path = os.path.join(os.path.dirname(file.path), d['Name'])
if not os.path.exists(in_dsc_path):
return False
with open(in_dsc_path, 'rb') as f:
for buf in iter(partial(f.read, 32768), b''):
md5.update(buf)
if md5.hexdigest() != d['md5sum']:
return False
file._deb822 = dsc
return True
......@@ -20,10 +20,10 @@
from contextlib import contextmanager
import os
import tempfile
from diffoscope.comparators.binary import File, FilesystemFile, needs_content
from diffoscope.comparators.binary import File, FilesystemFile
from diffoscope.comparators.utils import format_device
from diffoscope.difference import Difference
from diffoscope import logger
from diffoscope import logger, get_named_temporary_file
class Device(File):
......@@ -39,16 +39,24 @@ class Device(File):
def has_same_content_as(self, other):
return self.get_device() == other.get_device()
@contextmanager
def get_content(self):
with tempfile.NamedTemporaryFile(mode='w+', suffix='diffoscope') as f:
def create_placeholder(self):
with get_named_temporary_file(mode='w+', suffix='diffoscope', delete=False) as f:
f.write(format_device(*self.get_device()))
f.flush()
self._path = f.name
yield
self._path = None
return f.name
@property
def path(self):
if not hasattr(self, '_placeholder'):
self._placeholder = self.create_placeholder()
return self._placeholder
def cleanup(self):
if hasattr(self, '_placeholder'):
os.remove(self._placeholder)
del self._placeholder
super().cleanup()
@needs_content
def compare(self, other, source=None):
with open(self.path) as my_content, \
open(other.path) as other_content:
......
......@@ -21,7 +21,7 @@ import re
import os.path
import subprocess
from diffoscope import logger, tool_required
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import Archive, get_compressed_content_name
from diffoscope.difference import Difference
......@@ -59,7 +59,6 @@ class DexFile(File):
def recognizes(file):
return DexFile.RE_FILE_TYPE.match(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
with DexContainer(self).open() as my_container, \
DexContainer(other).open() as other_container:
......
......@@ -113,10 +113,6 @@ class FilesystemDirectory(object):
def name(self):
return self._path
@contextmanager
def get_content(self):
yield
def is_directory(self):
return True
......@@ -140,15 +136,14 @@ class FilesystemDirectory(object):
for name in sorted(set(my_names).intersection(other_names)):
my_file = my_container.get_member(name)
other_file = other_container.get_member(name)
with my_file.get_content(), other_file.get_content():
inner_difference = diffoscope.comparators.compare_files(
my_file, other_file, source=name)
meta_differences = compare_meta(my_file.name, other_file.name)
if meta_differences and not inner_difference:
inner_difference = Difference(None, my_file.path, other_file.path)
if inner_difference:
inner_difference.add_details(meta_differences)
differences.append(inner_difference)
inner_difference = diffoscope.comparators.compare_files(
my_file, other_file, source=name)
meta_differences = compare_meta(my_file.name, other_file.name)
if meta_differences and not inner_difference:
inner_difference = Difference(None, my_file.path, other_file.path)
if inner_difference:
inner_difference.add_details(meta_differences)
differences.append(inner_difference)
if not differences:
return None
difference = Difference(None, self.path, other.path, source)
......@@ -159,10 +154,9 @@ class FilesystemDirectory(object):
class DirectoryContainer(Container):
@contextmanager
def open(self):
with self.source.get_content():
self._path = self.source.path.rstrip('/') or '/'
yield self
self._path = None
self._path = self.source.path.rstrip('/') or '/'
yield self
self._path = None
def get_member_names(self):
names = []
......
......@@ -20,7 +20,7 @@
import os.path
import re
from diffoscope import tool_required
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import get_ar_content, Command
from diffoscope.difference import Difference
......@@ -90,7 +90,6 @@ class ElfFile(File):
def recognizes(file):
return ElfFile.RE_FILE_TYE.match(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
return _compare_elf_data(self.path, other.path)
......@@ -102,7 +101,6 @@ class StaticLibFile(File):
def recognizes(file):
return StaticLibFile.RE_FILE_TYPE.search(file.magic_file_type) and StaticLibFile.RE_FILE_EXTENSION.search(file.name)
@needs_content
def compare_details(self, other, source=None):
differences = []
# look up differences in metadata
......
......@@ -19,7 +19,7 @@
import re
from diffoscope import tool_required
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import Command
from diffoscope.difference import Difference
......@@ -40,6 +40,5 @@ class TtfFile(File):
def recognizes(file):
return TtfFile.RE_FILE_TYPE.match(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
return [Difference.from_command(Showttf, self.path, other.path)]
......@@ -24,7 +24,7 @@ try:
except ImportError:
guestfs = None
from diffoscope import logger
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import Archive, get_compressed_content_name
from diffoscope.difference import Difference
......@@ -77,7 +77,6 @@ class FsImageFile(File):
def recognizes(file):
return FsImageFile.RE_FILE_TYPE.match(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
differences = []
with FsImageContainer(self).open() as my_container, \
......
......@@ -20,7 +20,7 @@
from io import BytesIO
import re
from diffoscope import tool_required
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import Command
from diffoscope.difference import Difference
from diffoscope import logger
......@@ -63,6 +63,5 @@ class MoFile(File):
def recognizes(file):
return MoFile.RE_FILE_TYPE.match(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
return [Difference.from_command(Msgunfmt, self.path, other.path)]
......@@ -22,7 +22,6 @@ import subprocess
import os.path
import diffoscope.comparators
from diffoscope import logger, tool_required
from diffoscope.comparators.binary import needs_content
from diffoscope.comparators.utils import Archive, get_compressed_content_name, NO_COMMENT
from diffoscope.difference import Difference
......@@ -63,7 +62,6 @@ class GzipFile(object):
def recognizes(file):
return GzipFile.RE_FILE_TYPE.match(file.magic_file_type)
@needs_content
def compare_details(self, other, source=None):
differences = []
differences.append(Difference.from_text(
......
......@@ -22,7 +22,7 @@ import platform
import struct
import subprocess
from diffoscope import tool_required
from diffoscope.comparators.binary import File, needs_content
from diffoscope.comparators.binary import File
from diffoscope.comparators.utils import Command
from diffoscope.difference import Difference
from diffoscope import logger
......@@ -60,29 +60,27 @@ class HiFile(File):
if HiFile.hi_version is None:
return False
with file.get_content():
with open(file.path, 'rb') as fp:
# read magic
buf = fp.read(4)
if buf != HI_MAGIC:
logger.debug('Haskell interface magic mismatch. Found %r instead of %r or %r', buf, HI_MAGIC_32, HI_MAGIC_64)
return False
# skip some old descriptor thingy that has varying size
if buf == HI_MAGIC_32: