# (web viewer residue, not part of the module: "Newer" / "Older")
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2020 Jean-Romain Garnier <salsa@jean-romain.com>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import abc
import logging
import re

from .utils.file import File
from .utils.operation import Operation
from .utils.container import Container

from diffoscope.difference import Difference
from diffoscope.excludes import operation_excluded
from diffoscope.tools import (
    tool_required,
    tool_check_installed,
    python_module_missing,
)
# tlsh is optional: the fuzzy hash of a function is only defined when the
# module could be imported
try:
    import tlsh
except ImportError:
    tlsh = None

# r2pipe is optional as well: record it through python_module_missing and
# leave the name set to None so the rest of the module can test for it
try:
    import r2pipe
except ImportError:
    python_module_missing("r2pipe")
    r2pipe = None

logger = logging.getLogger(__name__)

# Even when the Python module is importable, treat it as unavailable if the
# radare2 tool itself is not installed
if not tool_check_installed("radare2"):
    r2pipe = None
    logger.debug("radare2 not found, disabling decompiler")
class Decompile(Operation, metaclass=abc.ABCMeta):
    """
    Base class of the operations which decompile a single function

    Subclasses implement `_decompile`, which is expected to store the
    resulting text (a str) in `self._stdout`.
    """

    def __init__(self, file, *args, **kwargs):
        # The parent operation only receives the path; keep the file object
        # around as subclasses need its decompiler, offset and signature
        super().__init__(file.path, *args, **kwargs)
        self.file = file

    def start(self):
        """Run the decompiler, or produce empty output for other files"""
        logger.debug("Executing %s", self.full_name())
        if not isinstance(self.file, AsmFunction):
            # Nothing to decompile if the file isn't an asm function
            self._stdout = ""
            return

        self._decompile()

    @abc.abstractmethod
    def _decompile(self):
        """Decompile `self.file` and store the text in `self._stdout`"""
        raise NotImplementedError()

    def should_show_error(self):
        return False

    # NOTE(review): in the reviewed text this statement directly followed
    # `return False` above and was unreachable; its own header was missing.
    # The `output` property name is assumed from the Operation interface,
    # confirm against diffoscope.comparators.utils.operation
    @property
    def output(self):
        """The decompiler output as a list of UTF-8 encoded lines"""
        return self._stdout.encode("utf-8").splitlines(True)
class DecompileGhidra(Decompile):
    """
    Decompile a function using radare2's "pdgj" command
    """

    # Remove addresses from warnings as they can create a lot of irrelevant noise
    _JUMPTABLE_WARNING_RE = re.compile(rb"(^\s*// WARNING:.*)(0x[0-9a-f]+)")

    def _run_r2_command(self):
        """
        Seek to the function and return the parsed output of "pdgj"

        When the command yields nothing, a dict holding only an "errors" list
        is returned instead.
        """
        self.file.decompiler.jump(self.file.offset)
        output = self.file.decompiler.r2.cmdj("pdgj")

        if not output:
            # Output is None if the pdg command doesn't exist
            output = {
                "errors": [
                    'Missing r2ghidra-dec, install it with "r2pm install r2ghidra-dec"'
                ]
            }

        return output

    @tool_required("radare2")
    def _decompile(self):
        ghidra_output = self._run_r2_command()

        try:
            self._stdout = ghidra_output["code"]
        except KeyError:
            # Show errors on stdout so a failed decompilation for 1 function
            # doesn't stop the diff for the whole file
            self._stdout = "\n".join(ghidra_output.get("errors", []))
            logger.debug(
                "r2ghidra decompiler error for %s: %s",
                self.file.signature,
                self._stdout,
            )

    def name(self):
        return "r2ghidra"

    def full_name(self, *args, **kwargs):
        return "radare2 r2ghidra"

    # NOTE(review): the header of this method was missing from the reviewed
    # text, where the statement sat unreachable after full_name's return.
    # `filter(self, line)` is assumed from the free variable `line` and the
    # Operation interface, confirm against the parent class
    def filter(self, line):
        """Replace the address in a warning line (bytes) with "0xX" """
        return self._JUMPTABLE_WARNING_RE.sub(rb"\g<1>0xX", line)
class DecompileRadare2(Decompile):
    """
    Uses radare2's own "pdc" command: much quicker than the ghidra
    decompiler, though the result is still assembly code, only annotated
    with comments to make it more readable
    """

    def _run_r2_command(self):
        """Seek to the function and return the raw text printed by "pdc" """
        function = self.file
        function.decompiler.jump(function.offset)
        return function.decompiler.r2.cmd("pdc")

    @tool_required("radare2")
    def _decompile(self):
        # The command output is used as-is, without any post-processing
        pseudo_code = self._run_r2_command()
        self._stdout = pseudo_code

    def name(self):
        """Short name of this operation"""
        return "disass"

    def full_name(self, *args, **kwargs):
        """Full name of this operation; any argument is ignored"""
        return "radare2 disass"
class AsmFunction(File):
    """
    A single function of a binary, exposed as a file so it can be a member
    of a DecompilableContainer

    `data_dict` is the description of the function; the "name", "offset",
    "size" and "signature" keys are read from it.
    """

    DESCRIPTION = "ASM Function"

    # Mapping between the Config().decompiler option and the command class
    DECOMPILE_OPERATIONS = [
        DecompileGhidra,
        DecompileRadare2,
    ]

    def __init__(self, decompiler, data_dict):
        super().__init__(container=decompiler)
        self.data_dict = data_dict
        self.decompiler = decompiler
        self._name = self.func_name

    @property
    def name(self):
        # Multiple functions can have the same name but a different signature,
        # so use the signature as name for diffoscope
        return self.signature

    @property
    def progress_name(self):
        return "{} [{}]".format(
            self.container.source.progress_name, super().progress_name
        )

    @property
    def path(self):
        # A function has no file of its own, so use the path of the binary
        return self.container.source.path

    def is_directory(self):
        return False

    def is_symlink(self):
        return False

    def is_device(self):
        return False

    def is_socket_or_fifo(self):
        return False

    # The fuzzy hash is only defined when the optional tlsh module is present
    if tlsh:

        @property
        def fuzzy_hash(self):
            if not hasattr(self, "_fuzzy_hash"):
                try:
                    hex_digest = tlsh.hash(self.asm.encode())
                except ValueError:
                    # File must contain a certain amount of randomness
                    return None

                # For short files, the hex_digest is an empty string, so turn
                # it into None
                self._fuzzy_hash = hex_digest or None

            return self._fuzzy_hash

    def has_same_content_as(self, other):
        try:
            return self.hex_dump == other.hex_dump
        except AttributeError:
            # 'other' is not a function.
            logger.debug("has_same_content: Not an asm function: %s", other)
            return False

    @classmethod
    def recognizes(cls, file):
        # No file should be recognized as an asm function
        return False

    def compare(self, other, source=None):
        """
        Override file's compare method to get rid of the binary diff fallback,
        as it would be redundant with other outputs
        """
        details = self.compare_details(other, source)
        details = [x for x in details if x]
        if not details:
            return None

        difference = Difference(self.name, other.name, source=source)
        difference.add_details(details)
        return difference

    def compare_details(self, other, source=None):
        """Return one difference (possibly empty) per decompile operation"""
        return [
            Difference.from_operation(x, self, other)
            for x in list(self.DECOMPILE_OPERATIONS)
        ]

    @property
    def func_name(self):
        return self.data_dict["name"]

    @property
    def offset(self):
        return self.data_dict["offset"]

    @property
    def size(self):
        return self.data_dict["size"]

    @property
    def signature(self):
        return self.data_dict["signature"]

    @property
    def hex_dump(self):
        # Computed on first access, then cached on the instance
        if not hasattr(self, "_hex_dump"):
            self._hex_dump = self.decompiler.dump(self.offset, self.size)

        return self._hex_dump

    @property
    def asm(self):
        """The disassembly of the function, one instruction per line"""
        if not hasattr(self, "_asm"):
            ops = self.decompiler.disassemble(self.offset)

            # Entries without a "disasm" key are invalid instructions
            self._asm = "".join(
                instr.get("disasm", "invalid") + "\n" for instr in ops
            )

        return self._asm
def all_decompile_operations_are_excluded(file):
    """
    Tell whether every operation of AsmFunction.DECOMPILE_OPERATIONS is
    excluded for `file`

    Returns False as soon as one operation isn't excluded, True otherwise.
    """
    for klass in AsmFunction.DECOMPILE_OPERATIONS:
        full_name = klass(file).full_name()

        # The operations of this module return their full name as a plain
        # string: joining it would insert a space between every character
        # and the result could never match an exclusion pattern. Sequences
        # of words are still joined.
        if isinstance(full_name, str):
            name = full_name
        else:
            name = " ".join(full_name)

        if not operation_excluded(name):
            return False

    return True
class DecompilableContainer(Container):
    """
    Container whose members are the functions radare2 finds in the source
    file, keyed by their signature
    """

    auto_diff_metadata = False

    # Don't use @tool_required here so subclassing DecompilableContainer
    # doesn't block the new subclass from doing its work if radare2
    # isn't installed
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger.debug("Creating DecompileContainer for %s", self.source.path)

        self._functions = {}

        # Always define the attribute so cleanup() is safe even when the
        # radare2 session is never opened
        self.r2 = None

        # Skip disassembly (and decompilation) if a dependency is missing
        # or if radare2 commands are excluded
        if r2pipe is None or all_decompile_operations_are_excluded(
            self.source
        ):
            return

        # Use "-2" flag to silence radare2 warnings
        self.r2 = r2pipe.open(self.source.path, flags=["-2"])

        # Run radare2 command which finds the functions in the executable
        self.r2.cmd("aa")  # Analyse all

        # Hide offset in asm as it serves the same purpose as line numbers,
        # which shouldn't be diffed
        self.r2.cmd("e asm.offset = false")

        # In hex dump of function, hide everything but the hex values
        self.r2.cmd(
            "e hex.offset = false;e hex.header = false;e hex.ascii = false"
        )

        # Use radare2 to get the list of functions
        # If there aren't any, cmdj returns None
        # NOTE(review): the reviewed text issued "aj" here; "aflj" is the
        # radare2 command listing functions as JSON, confirm
        functions = self.r2.cmdj("aflj") or []
        for f in functions:
            func = AsmFunction(self, f)
            self._functions[func.signature] = func
            logger.debug("Adding function %s", func.signature)

    def cleanup(self):
        """Close the radare2 session, if one was opened"""
        if self.r2 is not None:
            self.r2.quit()
            self.r2 = None

    def get_member_names(self):
        return self._functions.keys()

    def get_member(self, member_name):
        return self._functions[member_name]

    def jump(self, offset):
        """Seek the radare2 session to `offset`"""
        self.r2.cmd("s {}".format(offset))

    def dump(self, offset, size):
        """Return the stripped "px" output for `size` bytes at `offset`"""
        self.jump(offset)
        return self.r2.cmd("px {}".format(size)).strip()

    def disassemble(self, offset):
        """
        Return the "ops" list of the function at `offset`, or an empty list
        if radare2 gave no usable output
        """
        self.jump(offset)
        return (self.r2.cmdj("pdfj") or {}).get("ops", [])