#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2016-2022, 2024-2025 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import abc
import itertools
import logging
import os.path
import uuid

from collections import OrderedDict

from diffoscope.config import Config
from diffoscope.difference import Difference
from diffoscope.excludes import filter_excludes
from diffoscope.progress import Progress

from ..missing_file import MissingFile, AbstractMissingType

from .fuzzy import perform_fuzzy_matching

# Comment value used when a compared pair carries no explanatory note.
NO_COMMENT = None
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

Chris Lamb
committed
class Container(metaclass=abc.ABCMeta):
    """
    Abstract base class for objects exposing named members that can be
    paired up and compared against the members of another container.

    Subclasses must implement ``get_member_names`` and ``get_member``.
    """

    # Not read anywhere inside this class.
    # NOTE(review): presumably consulted by callers to decide whether
    # metadata is compared automatically -- confirm against the call sites.
    auto_diff_metadata = True

    def __new__(cls, source):
        """
        Create the container for ``source``.

        When ``source`` is a ``MissingFile`` a ``MissingContainer`` is
        returned instead of an instance of ``cls``.
        """
        if isinstance(source, MissingFile):
            new = super(Container, MissingContainer).__new__(MissingContainer)
            # Python only runs __init__ automatically when __new__ returns
            # an instance of ``cls``; a MissingContainer need not be one, so
            # initialise it explicitly.
            new.__init__(source)
            return new

        return super(Container, cls).__new__(cls)

    def __init__(self, source):
        """Remember ``source`` and work out how deeply nested we are."""
        self._source = source

        # Keep a count of how "nested" we are
        self.depth = 0
        if hasattr(source, "container") and source.container is not None:
            self.depth = source.container.depth + 1

    @property
    def source(self):
        """The object this container was created from."""
        return self._source

    @abc.abstractmethod
    def get_member_names(self):
        """Return the names of the members of this container."""
        raise NotImplementedError()

    @abc.abstractmethod
    def get_member(self, member_name):
        """Return the member called ``member_name``."""
        raise NotImplementedError()

    def get_path_name(self, dest_dir):
        """Return a fresh, randomly named (UUID4) path below ``dest_dir``."""
        return os.path.join(dest_dir, str(uuid.uuid4()))

    def get_filtered_members(self):
        """
        Yield ``(name, member)`` pairs for every member name that survives
        ``filter_excludes``.
        """
        # If your get_member implementation is O(n) then this will be O(n^2)
        # cost. In such cases it is recommended to override this as well
        for name in filter_excludes(self.get_member_names()):
            yield name, self.get_member(name)

    def perform_fuzzy_matching(self, my_members, other_members):
        """
        Delegate to the module-level ``perform_fuzzy_matching``; a method so
        that subclasses can override the matching strategy.
        """
        return perform_fuzzy_matching(my_members, other_members)

    def get_adjusted_members(self):
        """
        Returns an iterable of pairs. The key is what is used to match when
        comparing containers. This may be used to e.g. strip off version
        numbers, hashes, etc, efficiently for known file formats, so that we
        don't need to use the expensive tlsh "fuzzy-hashing" logic.

        Note that containers with 1 element are already force-compared against
        other containers with 1 element, so you don't need to override this
        method for those cases.
        """
        return self.get_filtered_members()

    def lookup_file(self, *names):
        """
        Try to fetch a specific file by digging in containers.

        ``names`` is a path of member names: the first is looked up in this
        container, the rest recursively in the nested containers. Returns
        ``None`` when a name is unknown (``KeyError`` from ``get_member``)
        or when an intermediate file is not a container.
        """
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from .specialize import specialize

        name, remainings = names[0], names[1:]
        try:
            file = self.get_member(name)
        except KeyError:
            return None

        logger.debug("lookup_file(%s) -> %s", names, file)
        specialize(file)

        if not remainings:
            return file

        container = file.as_container
        if not container:
            return None

        return container.lookup_file(*remainings)

    def get_adjusted_members_sizes(self):
        """
        Yield ``(name, (member, size))`` for every adjusted member, where
        ``size`` is what ``path_apparent_size`` reports for the member's path.
        """
        for name, member in self.get_adjusted_members():
            # NOTE(review): the condition of this branch was lost in the
            # source this was recovered from; a missing member has nothing
            # on disk to measure, so it is given size 0 -- confirm.
            if isinstance(member, AbstractMissingType):
                size = 0
            else:
                size = path_apparent_size(member.path)
            yield name, (member, size)

    def comparisons(self, other):
        """
        Yield ``(my_member, other_member, comment)`` triples describing which
        members of ``self`` and ``other`` should be compared.

        Pairs are produced in this order:

        1. if both containers hold exactly one member, that single pair
           (and nothing else);
        2. members present under the same name on both sides, in the order
           of this container;
        3. pairs suggested by ``perform_fuzzy_matching``, with a comment
           stating how similar they are;
        4. if ``Config().new_file`` is set, every remaining member paired
           with a ``MissingFile`` placeholder.
        """
        my_members = OrderedDict(self.get_adjusted_members_sizes())
        other_members = OrderedDict(other.get_adjusted_members_sizes())
        total_size = sum(
            x[1]
            for x in itertools.chain(
                my_members.values(), other_members.values()
            )
        )
        # TODO: progress could be a bit more accurate here, give more weight to fuzzy-hashed files

        # NOTE(review): ``p`` was unbound in the recovered source; wrapping
        # the body in ``Progress(total_size)`` is a reconstruction -- confirm.
        with Progress(total_size) as p:

            def prep_yield(my_name, other_name, comment=NO_COMMENT):
                # Popping removes the pair from the pools, so whatever is
                # left at the end is exactly the set of unmatched members.
                my_member, my_size = my_members.pop(my_name)
                other_member, other_size = other_members.pop(other_name)
                p.begin_step(my_size + other_size, msg=my_member.progress_name)
                return my_member, other_member, comment

            # if both containers contain 1 element, compare these
            if len(my_members) == 1 and len(other_members) == 1:
                yield prep_yield(
                    next(iter(my_members)),
                    next(iter(other_members)),
                )
                return

            # keep it sorted like my_members; built up-front because
            # prep_yield mutates both dictionaries
            both_names = [name for name in my_members if name in other_members]
            for name in both_names:
                yield prep_yield(name, name)

            for my_name, other_name, score in self.perform_fuzzy_matching(
                my_members, other_members
            ):
                # A score of 0 maps to 100% and a score of 400 to 0%.
                percentage = (1 - (score / 400.0)) * 100
                # Cap at 99 so that rounding never displays "100%" for a
                # pair that is not reported as identical below.
                if percentage >= 99:
                    percentage = 99

                comment = (
                    f"Files {percentage:.0f}% similar despite different names"
                )
                if score == 0:
                    comment = "Files identical despite different names"

                yield prep_yield(my_name, other_name, comment)

            if Config().new_file:
                # NOTE(review): the MissingFile arguments were lost in the
                # recovered source; ("/dev/null", <present member>) is a
                # reconstruction -- confirm against MissingFile's signature.
                for my_member, my_size in my_members.values():
                    p.begin_step(my_size, msg=my_member.progress_name)
                    yield (
                        my_member,
                        MissingFile("/dev/null", my_member),
                        NO_COMMENT,
                    )

                for other_member, other_size in other_members.values():
                    p.begin_step(other_size, msg=other_member.progress_name)
                    yield (
                        MissingFile("/dev/null", other_member),
                        other_member,
                        NO_COMMENT,
                    )

    def compare(self, other, source=None, no_recurse=False):
        """
        Compare the members of this container with those of ``other``.

        Returns a lazy iterator over the resulting ``Difference`` objects;
        pairs that produce no difference are dropped. ``no_recurse`` is
        forwarded to ``compare_files`` as ``diff_content_only``. ``source``
        is accepted but not used here.
        """
        from .compare import compare_files
        from ..directory import compare_meta

        def compare_pair(file1, file2, comment):
            if Config().timeout_exceeded():
                difference = Difference(file1.name, file2.name)
                msg = "Timeout exceeded; details may be incomplete."
                difference.add_comment(msg)
                return difference

            try:
                difference = compare_files(
                    file1, file2, source=None, diff_content_only=no_recurse
                )
            except PermissionError as exc:
                # Best effort: skip unreadable files rather than abort.
                logger.warning(
                    "Skipping %s (%s)", exc.filename, exc.strerror
                )
                return

            if isinstance(file1, AbstractMissingType) or isinstance(
                file2, AbstractMissingType
            ):
                # There is no need to compare metadata with a missing file,
                # as it doesn't make much sense
                meta_differences = []
            else:
                meta_differences = compare_meta(file1.name, file2.name)

            # Metadata differences need a parent Difference to hang off.
            if meta_differences and not difference:
                difference = Difference(file1.path, file2.path)
            if difference:
                difference.add_details(meta_differences)

            if comment:
                if difference is None:
                    difference = Difference(file1.name, file2.name)
                difference.add_comment(comment)

            return difference

        return filter(
            None, itertools.starmap(compare_pair, self.comparisons(other))
        )

Chris Lamb
committed

Chris Lamb
committed
class MissingContainer(Container, AbstractMissingType):

Chris Lamb
committed
def get_member_names(self):
    """
    Report the member names of the container belonging to the counterpart
    file (``self.source.other_file``).
    """
    counterpart = self.source.other_file.as_container
    return counterpart.get_member_names()
def get_member(self, member_name):