Skip to content
Snippets Groups Projects
  • Dimitrios Apostolou's avatar
    1ee33f4e
    tempfiles: clean up temporary directories as you go · 1ee33f4e
    Dimitrios Apostolou authored and Chris Lamb's avatar Chris Lamb committed
    Previously the temporary directories where all left in place until the
    end of the process, when a massive cleanup was being run. The cause
    was the _DIRS variable that was holding references to the
    TemporaryDirectory objects, so their destructor was not running.
    
    We now remove the _DIRS list completely (replacing it with _BASEDIR for
    storing only the parent temporary directory). As a result, the temporary
    directories and their contents are being cleaned up automatically when
    the class destructor runs.
    
    This in turn requires modifications to preexisting code:
    
    1. Code that was directly using `get_temporary_directory().name`
       stopped working; the object from `get_temporary_directory()`
       needs to be referenced directly to keep it alive.
    2. `get_temporary_directory()` is now starting a `with` block
       (context manager) where it was straightforward.
    1ee33f4e
    History
    tempfiles: clean up temporary directories as you go
    Dimitrios Apostolou authored and Chris Lamb's avatar Chris Lamb committed
    Previously the temporary directories where all left in place until the
    end of the process, when a massive cleanup was being run. The cause
    was the _DIRS variable that was holding references to the
    TemporaryDirectory objects, so their destructor was not running.
    
    We now remove the _DIRS list completely (replacing it with _BASEDIR for
    storing only the parent temporary directory). As a result, the temporary
    directories and their contents are being cleaned up automatically when
    the class destructor runs.
    
    This in turn requires modifications to preexisting code:
    
    1. Code that was directly using `get_temporary_directory().name`
       stopped working; the object from `get_temporary_directory()`
       needs to be referenced directly to keep it alive.
    2. `get_temporary_directory()` is now starting a `with` block
       (context manager) where it was straightforward.
binwalk.py 3.69 KiB
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2017-2020 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.

import os
import re
import glob
import logging

from diffoscope.tools import python_module_missing
from diffoscope.tempfiles import get_temporary_directory

from .utils.file import File
from .utils.archive import Archive

try:
    from .rpm import RpmFile
except ImportError:
    from .rpm_fallback import RpmFile

try:
    import binwalk
except ImportError:
    python_module_missing("rpm")
    binwalk = None
else:
    # Disable binwalk's own user configuration for predictable results and to
    # ensure it does not create (!) unnecessary directories, etc. (re. #903444)
    def fn(self):
        if not hasattr(fn, "_temp_dir"):
            fn._temp_dir = get_temporary_directory("binwalk")
        return fn._temp_dir.name

    binwalk.core.settings.Settings._get_user_config_dir = fn

logger = logging.getLogger(__name__)


class BinwalkFileContainer(Archive):
    def open_archive(self):
        return self

    def close_archive(self):
        self.source._unpacked.cleanup()

    def get_member_names(self):
        return sorted(self.source._members.keys())

    def extract(self, member_name, dest_dir):
        return self.source._members[member_name]


class BinwalkFile(File):
    FILE_TYPE_RE = re.compile(r"\bcpio archive\b")
    CONTAINER_CLASSES = [BinwalkFileContainer]

    @classmethod
    def recognizes(cls, file):
        if binwalk is None:
            return False

        if not super().recognizes(file):
            return False

        # RPM files are .cpio, but let's always leave it to the RPM comparator.
        # This avoids the RpmFile > BinWalkFile > CpioFile > RpmFile circular
        # ordering dependency.
        if file.container and isinstance(file.container.source, RpmFile):
            return False

        # Don't recurse; binwalk has already found everything
        if isinstance(file.container, cls.CONTAINER_CLASSES[0]):
            return False

        unpacked = get_temporary_directory(prefix="binwalk")
        logger.debug("Extracting %s to %s", file.path, unpacked.name)

        binwalk.scan(
            file.path,
            dd="cpio:cpio",
            carve=True,
            quiet=True,
            signature=True,
            directory=unpacked.name,
        )

        members = {
            "{} file embedded at offset {}".format(
                os.path.splitext(x)[1],
                os.path.basename(os.path.splitext(x)[0]),
            ): x
            for x in glob.glob(os.path.join(unpacked.name, "*/*"))
        }

        logger.debug("Found %d embedded member(s)", len(members))

        if not members:
            unpacked.cleanup()
            return False

        file._members = members
        file._unpacked = unpacked

        return True

    def compare(self, other, source=None):
        difference = super().compare(other, source)

        if difference is not None:
            difference.add_comment(
                "comprises of {} embedded members".format(len(self._members))
            )

        return difference