Add a --diff-context option to control unified diff context size. (!88)

Signed-off-by: Chris Lamb <lamby@debian.org>
diff.py
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
# Copyright © 2017-2021 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import re
import io
import os
import errno
import fcntl
import hashlib
import logging
import threading
import subprocess
from difflib import Differ
from multiprocessing.dummy import Queue
from .tools import get_tool_name, tool_required
from .config import Config
from .profiling import profile
from .tempfiles import get_temporary_directory
DIFF_CHUNK = 4096
logger = logging.getLogger(__name__)
re_diff_change = re.compile(r"^([+-@]).*", re.MULTILINE)
class DiffParser:
RANGE_RE = re.compile(
# example: '@@ -26814,9 +26814,8 @@'
rb"""
^
@@\s+
-
(?P<start1>\d+)(,(?P<len1>\d+))?
\s+
\+
(?P<start2>\d+)(,(?P<len2>\d+))?
\s+@@
$
""",
re.VERBOSE,
)
def __init__(self, output, end_nl_q1, end_nl_q2):
self._output = output
self._end_nl_q1 = end_nl_q1
self._end_nl_q2 = end_nl_q2
self._action = self.read_headers
self._diff = io.BytesIO()
self._success = False
self._remaining_hunk_lines = None
self._block_len = None
self._direction = None
self._end_nl = None
self._max_lines = Config().max_diff_block_lines_saved
@property
def diff(self):
return self._diff.getvalue().decode("UTF-8", errors="replace")
@property
def success(self):
return self._success
def parse(self):
for line in self._output.split(b"\n"):
self._action = self._action(line)
self._action = None
self._success = True
def read_headers(self, line):
if not line:
return None
if line.startswith(b"---"):
return self.read_headers
if line.startswith(b"+++"):
return self.read_headers
found = DiffParser.RANGE_RE.match(line)
if not found:
raise ValueError("Unable to parse diff headers: %r" % line)
self._diff.write(line + b"\n")
if found.group("len1"):
self._remaining_hunk_lines = int(found.group("len1"))
else:
self._remaining_hunk_lines = 1
if found.group("len2"):
self._remaining_hunk_lines += int(found.group("len2"))
else:
self._remaining_hunk_lines += 1
self._direction = None
return self.read_hunk
def read_hunk(self, line):
if not line:
return None
if line[:1] == b" ":
self._remaining_hunk_lines -= 2
elif line[:1] == b"+":
self._remaining_hunk_lines -= 1
elif line[:1] == b"-":
self._remaining_hunk_lines -= 1
elif line[:1] == b"\\":
            # If neither file ends with a newline, do not show that as a difference
if self._end_nl is None:
end_nl1 = self._end_nl_q1.get()
end_nl2 = self._end_nl_q2.get()
self._end_nl = end_nl1 and end_nl2
if not self._end_nl:
return self.read_hunk
elif self._remaining_hunk_lines == 0:
return self.read_headers(line)
else:
raise ValueError("Unable to parse diff hunk: %r" % line)
self._diff.write(line + b"\n")
if line[:1] in (b"-", b"+"):
if line[:1] == self._direction:
self._block_len += 1
else:
self._block_len = 1
self._direction = line[:1]
if self._block_len >= self._max_lines:
return self.skip_block
else:
self._block_len = 1
self._direction = line[:1]
return self.read_hunk
def skip_block(self, line):
if self._remaining_hunk_lines == 0 or line[:1] != self._direction:
removed = self._block_len - Config().max_diff_block_lines_saved
if removed:
self._diff.write(
b"%s[ %d lines removed ]\n" % (self._direction, removed)
)
return self.read_hunk(line)
self._block_len += 1
self._remaining_hunk_lines -= 1
return self.skip_block
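
# Illustrative sketch (not part of upstream): DiffParser consumes raw
# `diff -au` output as bytes. The two queues are only read when diff emits a
# "\ No newline at end of file" marker; here both inputs are assumed to end
# with a newline.
#
#     q1, q2 = Queue(), Queue()
#     q1.put(True)
#     q2.put(True)
#     parser = DiffParser(b"@@ -1,1 +1,1 @@\n-a\n+b\n", q1, q2)
#     parser.parse()
#     parser.success  # True
#     parser.diff     # "@@ -1,1 +1,1 @@\n-a\n+b\n"
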
@tool_required("diff")
def run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2):
cmd = [
get_tool_name("diff"),
"-a",
"-U" + str(Config().diff_context),
fifo1,
fifo2,
]
logger.debug("Running %s", " ".join(cmd))
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
parser = DiffParser(p.stdout, end_nl_q1, end_nl_q2)
parser.parse()
logger.debug(
"%s: returncode %d, parsed %s",
" ".join(cmd),
p.returncode,
parser.success,
)
if not parser.success and p.returncode not in (0, 1):
raise subprocess.CalledProcessError(
p.returncode, cmd, output=parser.diff.encode("utf-8")
)
if p.returncode == 0:
return None
return parser.diff
class FIFOFeeder(threading.Thread):
def __init__(self, feeder, fifo_path, end_nl_q=None, daemon=True, *args):
os.mkfifo(fifo_path)
super().__init__(daemon=daemon)
self.feeder = feeder
self.fifo_path = fifo_path
self.end_nl_q = Queue() if end_nl_q is None else end_nl_q
self._exception = None
self._want_join = threading.Event()
def __enter__(self):
self.start()
return self
def __exit__(self, exc_type, exc_value, exc_tb):
os.remove(self.fifo_path)
self.join()
def run(self):
try:
# Try to open the FIFO nonblocking, so we can periodically check
# if the main thread wants us to wind down. If it does, there's no
# more need for the FIFO, so stop the thread.
while True:
try:
fd = os.open(self.fifo_path, os.O_WRONLY | os.O_NONBLOCK)
except OSError as e:
if e.errno != errno.ENXIO:
raise
if self._want_join.is_set():
return
else:
break
# Now clear the fd's nonblocking flag to let writes block normally.
fcntl.fcntl(fd, fcntl.F_SETFL, 0)
with open(fd, "wb") as fifo:
                # The queue works around a unified diff limitation: if neither
                # input ends with a newline, that must not be reported as a
                # difference.
end_nl = self.feeder(fifo)
self.end_nl_q.put(end_nl)
except Exception as error:
self._exception = error
def join(self):
self._want_join.set()
super().join()
if self._exception is not None:
raise self._exception
class _Feeder:
"""
A 'feeder' is a specialized writer.
A 'feeder' is a callable that takes as argument a writeable file, and
writes to it. Feeders can transform the written data, truncate it,
checksum it, and so on. The callable must return True to represent that
the data had a terminating newline, and False otherwise.
Feeders are created by the functions make_feeder_from_raw_reader() and
empty_file_feeder(). The returned objects are closures, and are not
(currently?) instances of any particular class.
"""
def empty_file_feeder():
"""
Returns a feeder that simulates an empty file.
See _Feeder for feeders.
"""
def feeder(f):
return False
return feeder
def make_feeder_from_raw_reader(in_file, filterfn=None):
"""
Create a feeder that checksums, truncates, and transcodes the data. The
optional argument filterfn is a callable that gets passed each line, and
returns the line that should be used in its stead. (There is no facility
for filterfn to discard a line entirely.)
See _Feeder for feeders.
"""
def feeder(out_file):
h = None
end_nl = False
max_lines = Config().max_diff_input_lines
line_count = 0
if max_lines < float("inf"):
h = hashlib.sha256()
for buf in in_file:
line_count += 1
out = filterfn(buf) if filterfn else buf
if h:
h.update(out)
if line_count < max_lines:
out_file.write(out)
end_nl = buf[-1] == "\n"
if h and line_count >= max_lines:
out_file.write(
"[ Too much input for diff (SHA: {}) ]\n".format(
h.hexdigest()
).encode("utf-8")
)
end_nl = True
return end_nl
return feeder
def diff(feeder1, feeder2):
with get_temporary_directory() as tmpdir:
fifo1_path = os.path.join(tmpdir, "fifo1")
fifo2_path = os.path.join(tmpdir, "fifo2")
with FIFOFeeder(feeder1, fifo1_path) as fifo1, FIFOFeeder(
feeder2, fifo2_path
) as fifo2:
return run_diff(
fifo1_path, fifo2_path, fifo1.end_nl_q, fifo2.end_nl_q
)
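
# Minimal usage sketch (the filenames are purely illustrative): wrap two
# binary readers in feeders and compare them. diff() returns the parsed
# unified diff as a string, or None when the inputs compare equal.
#
#     with open("a.bin", "rb") as f1, open("b.bin", "rb") as f2:
#         unified = diff(
#             make_feeder_from_raw_reader(f1),
#             make_feeder_from_raw_reader(f2),
#         )
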
def diff_split_lines(diff, keepends=True):
lines = diff.split("\n")
if not keepends:
return lines
return [line + "\n" for line in lines[:-1]] + (
[lines[-1]] if lines[-1] else []
)
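
# For example:
#
#     diff_split_lines("a\nb\n")                 == ["a\n", "b\n"]
#     diff_split_lines("a\nb\n", keepends=False) == ["a", "b", ""]
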
def reverse_unified_diff(diff):
assert isinstance(diff, str)
res = []
# XXX - should move to just use plain bytes nearly everywhere until ready
# to print instead of using strings internally as well.
for line in diff_split_lines(diff):
line = line.encode("utf-8", errors="replace")
assert isinstance(line, bytes)
found = DiffParser.RANGE_RE.match(line)
if found:
before = found.group("start2")
if found.group("len2") is not None:
before += b"," + found.group("len2")
after = found.group("start1")
if found.group("len1") is not None:
after += b"," + found.group("len1")
res.append(b"@@ -%s +%s @@\n" % (before, after))
elif line.startswith(b"-"):
res.append(b"+")
res.append(line[1:])
elif line.startswith(b"+"):
res.append(b"-")
res.append(line[1:])
else:
res.append(line)
return b"".join(res).decode("utf-8", errors="replace")
def color_unified_diff(diff):
RESET = "\033[0m"
RED, GREEN, CYAN = "\033[31m", "\033[32m", "\033[0;36m"
def repl(m):
return "{}{}{}".format(
{"-": RED, "@": CYAN, "+": GREEN}[m.group(1)], m.group(0), RESET
)
return re_diff_change.sub(repl, diff)
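
# For example, a line starting with "+" is wrapped in green:
#
#     color_unified_diff("+added\n")  # => "\x1b[32m+added\x1b[0m\n"
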
DIFFON = "\x01"
DIFFOFF = "\x02"
MAX_WAGNER_FISCHER_SIZE = (
256 # any higher, and linediff takes >1 second and >200MB RAM
)
def _linediff_sane(x):
# turn non-printable chars into "."
return "." if ord(x) < 32 and x not in "\t\n" else x
def diffinput_truncate(s, sz):
# Truncate, preserving uniqueness
if len(s) > sz:
s = s[
:sz
] + "[ ... truncated by diffoscope; len: {}, SHA: {} ... ]".format(
len(s[sz:]), hashlib.sha256(s[sz:].encode("utf-8")).hexdigest()
)
return s
def linediff(s, t, diffon, diffoff):
# calculate common prefix/suffix, easy optimisation for Wagner-Fischer
prefix = os.path.commonprefix((s, t))
if prefix:
s = s[len(prefix) :]
t = t[len(prefix) :]
suffix = os.path.commonprefix((s[::-1], t[::-1]))[::-1]
if suffix:
s = s[: -len(suffix)]
t = t[: -len(suffix)]
# truncate so Wagner-Fischer doesn't blow up RAM
s = diffinput_truncate(s, MAX_WAGNER_FISCHER_SIZE)
t = diffinput_truncate(t, MAX_WAGNER_FISCHER_SIZE)
l1, l2 = zip(*linediff_simplify(linediff_wagnerfischer(s, t)))
def to_string(k, v):
sanev = "".join(_linediff_sane(c) for c in v)
return (diffon + sanev + diffoff) if k else sanev
s1 = "".join(to_string(*p) for p in l1)
t1 = "".join(to_string(*p) for p in l2)
return f"{prefix}{s1}{suffix}", f"{prefix}{t1}{suffix}"
def linediff_wagnerfischer(s, t):
"""
Line diff algorithm, originally from diff2html.
https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm
    Finds the minimum (Levenshtein) edit distance between two strings, but
    runs in quadratic time, O(m*n).
"""
m, n = len(s), len(t)
d = [[(0, 0) for i in range(n + 1)] for i in range(m + 1)]
d[0][0] = (0, (0, 0))
for i in range(1, m + 1):
d[i][0] = (i, (i - 1, 0))
for j in range(1, n + 1):
d[0][j] = (j, (0, j - 1))
# NB. This loop is O(len(s) * len(t))
for i in range(1, m + 1):
for j in range(1, n + 1):
if s[i - 1] == t[j - 1]:
cost = 0
else:
cost = 1
d[i][j] = min(
(d[i - 1][j][0] + 1, (i - 1, j)),
(d[i][j - 1][0] + 1, (i, j - 1)),
(d[i - 1][j - 1][0] + cost, (i - 1, j - 1)),
)
coords = []
coord = (m, n)
while coord != (0, 0):
coords.insert(0, coord)
x, y = coord
coord = d[x][y][1]
for coord in coords:
cx, cy = coord
child_val = d[cx][cy][0]
father_coord = d[cx][cy][1]
fx, fy = father_coord
father_val = d[fx][fy][0]
diff = (cx - fx, cy - fy)
if diff == (0, 1):
yield (False, ""), (True, t[fy])
elif diff == (1, 0):
yield (True, s[fx]), (False, "")
elif child_val - father_val == 1:
yield (True, s[fx]), (True, t[fy])
else:
assert s[fx] == t[fy]
yield (False, s[fx]), (False, t[fy])
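
# Worked example: each yielded item is a pair of (changed, char) tuples, one
# per input string:
#
#     list(linediff_wagnerfischer("ab", "ac"))
#     # => [((False, "a"), (False, "a")), ((True, "b"), (True, "c"))]
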
def linediff_simplify(g):
"""Simplify the output of Wagner-Fischer."""
current = None
for l, r in g:
if not current:
current = l, r
elif current[0][0] == l[0] and current[1][0] == r[0]:
current = (
(l[0], current[0][1] + l[1]),
(r[0], current[1][1] + r[1]),
)
else:
yield current
current = l, r
if current:
yield current
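
# Worked example: consecutive items whose flags match on both sides are
# merged into a single item with the characters concatenated:
#
#     list(linediff_simplify([
#         ((True, "b"), (True, "c")),
#         ((True, "d"), (True, "e")),
#     ]))
#     # => [((True, "bd"), (True, "ce"))]
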
class SideBySideDiff:
"""Calculates a side-by-side diff from a unified diff."""
def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
self.unified_diff = unified_diff
self.diffon = diffon
self.diffoff = diffoff
self.reset()
def reset(self):
self.differ = Differ()
self.buf = []
self.add_cpt = 0
self.del_cpt = 0
self.line1 = 0
self.line2 = 0
self.hunk_off1 = 0
self.hunk_size1 = 0
self.hunk_off2 = 0
self.hunk_size2 = 0
self._bytes_processed = 0
@property
def bytes_processed(self):
return self._bytes_processed
def match_lines(self, l0, l1):
"""
        Given two lists of lines, use Python's difflib to make the diff a bit
        "smarter", as it better detects added lines (see !64)
"""
if len(l0) + len(l1) > 750:
# difflib.Differ.compare is at least O(n^2), so don't call it if
# our inputs are too large.
logger.debug(
"Not calling difflib.Differ.compare(x, y) with len(x) == %d and len(y) == %d",
len(l0),
len(l1),
)
return
saved_line = None
for line in self.differ.compare(l0, l1):
# Skip lines with details as they don't matter to us
if line.startswith("? "):
continue
# When we encounter a +, it may either be because a line was
# added, or because a line was modified (in that case,
# save_line will not be None)
if line.startswith("+ "):
yield from self.yield_line(saved_line or "", line[2:])
saved_line = None
continue
# If saved_line is not None, we need to yield from it before
# moving on
if saved_line is not None:
yield from self.yield_line(saved_line, "")
saved_line = None
# Handle removed and unchanged lines
if line.startswith("- "):
saved_line = line[2:]
elif line.startswith(" "):
line = line[2:]
yield from self.yield_line(line, line)
# Make sure we don't forget a line
if saved_line is not None:
yield from self.yield_line(saved_line, "")
def empty_buffer(self):
if self.del_cpt == 0 or self.add_cpt == 0:
# All lines are unchanged lines
for l in self.buf:
yield from self.yield_line(l[0], l[1])
elif self.del_cpt != 0 and self.add_cpt != 0:
l0 = [pair[0] for pair in self.buf if pair[0] is not None]
l1 = [pair[1] for pair in self.buf if pair[1] is not None]
yield from self.match_lines(l0, l1)
def yield_line(self, s1, s2):
orig1 = s1
orig2 = s2
if s1 is None and s2 is None:
type_name = "unmodified"
elif s1 == "" and s2 == "":
type_name = "unmodified"
elif s1 is None or s1 == "":
type_name = "added"
elif s2 is None or s2 == "":
type_name = "deleted"
elif (
orig1 == orig2
and not s1.endswith("lines removed ]")
and not s2.endswith("lines removed ]")
):
type_name = "unmodified"
else:
type_name = "changed"
with profile("diff", "linediff"):
s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)
yield "L", (type_name, s1, self.line1, s2, self.line2)
m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
if m:
self.line1 += int(m.group(1))
elif orig1:
self.line1 += 1
m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
if m:
self.line2 += int(m.group(1))
elif orig2:
self.line2 += 1
self.add_cpt = 0
self.del_cpt = 0
self.buf = []
def items(self):
"""Yield the items that form the side-by-side diff.
Each item is a (type, value) tuple, as follows:
type == "H", value is a tuple representing a hunk header
hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
all ints
type == "L", value is a tuple representing a line of a hunk
mode, line1, lineno1, line2, lineno2 = value
where mode is one of {"unmodified", "added", "deleted", "changed"}
line* are strings
lineno* are ints
type == "C", value is a comment
comment = value
a string
"""
self.reset()
for l in diff_split_lines(self.unified_diff, False):
self._bytes_processed += len(l) + 1
m = re.match(r"^--- ([^\s]*)", l)
if m:
yield from self.empty_buffer()
continue
m = re.match(r"^\+\+\+ ([^\s]*)", l)
if m:
yield from self.empty_buffer()
continue
m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
if m:
yield from self.empty_buffer()
hunk_data = map(lambda x: x == "" and 1 or int(x), m.groups())
(
self.hunk_off1,
self.hunk_size1,
self.hunk_off2,
self.hunk_size2,
) = hunk_data
self.line1, self.line2 = self.hunk_off1, self.hunk_off2
yield "H", (
self.hunk_off1,
self.hunk_size1,
self.hunk_off2,
self.hunk_size2,
)
continue
if re.match(r"^\[", l):
yield from self.empty_buffer()
yield "C", l
if re.match(r"^\\ No newline", l):
if self.hunk_size2 == 0:
self.buf[-1] = (
self.buf[-1][0],
self.buf[-1][1] + "\n" + l[2:],
)
else:
self.buf[-1] = (
self.buf[-1][0] + "\n" + l[2:],
self.buf[-1][1],
)
continue
if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
yield from self.empty_buffer()
continue
m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
if m:
self.add_cpt += int(m.group(1))
self.hunk_size2 -= int(m.group(1))
self.buf.append((None, l[1:]))
continue
if re.match(r"^\+", l):
self.add_cpt += 1
self.hunk_size2 -= 1
self.buf.append((None, l[1:]))
continue
m = re.match(r"^-\[ (\d+) lines removed \]$", l)
if m:
self.del_cpt += int(m.group(1))
self.hunk_size1 -= int(m.group(1))
self.buf.append((l[1:], None))
continue
if re.match(r"^-", l):
self.del_cpt += 1
self.hunk_size1 -= 1
self.buf.append((l[1:], None))
continue
if re.match(r"^ ", l) and self.hunk_size1 and self.hunk_size2:
yield from self.empty_buffer()
self.hunk_size1 -= 1
self.hunk_size2 -= 1
self.buf.append((l[1:], l[1:]))
continue
yield from self.empty_buffer()
yield from self.empty_buffer()
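
# Illustrative usage sketch (the unified diff below is made up): iterate over
# items() and dispatch on the item type documented in its docstring.
#
#     sbs = SideBySideDiff("@@ -1,2 +1,2 @@\n a\n-b\n+c\n")
#     for item_type, value in sbs.items():
#         if item_type == "H":
#             hunk_off1, hunk_size1, hunk_off2, hunk_size2 = value
#         elif item_type == "L":
#             mode, line1, lineno1, line2, lineno2 = value
#         elif item_type == "C":
#             comment = value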