diff.py 20 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
# -*- coding: utf-8 -*-
#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.

import re
import io
22
import os
23 24
import errno
import fcntl
25
import hashlib
26 27 28 29 30 31
import logging
import threading
import subprocess

from multiprocessing.dummy import Queue

32 33
from diffoscope.tempfiles import get_temporary_directory

34
from .tools import get_tool_name, tool_required
35 36 37 38 39 40 41 42 43 44
from .config import Config

DIFF_CHUNK = 4096

logger = logging.getLogger(__name__)
re_diff_change = re.compile(r'^([+-@]).*', re.MULTILINE)


class DiffParser(object):
    RANGE_RE = re.compile(
45
        # example: '@@ -26814,9 +26814,8 @@'
46
        rb'''
47 48 49 50 51 52 53 54 55 56 57
          ^
            @@\s+
              -
              (?P<start1>\d+)(,(?P<len1>\d+))?
            \s+
              \+
              (?P<start2>\d+)(,(?P<len2>\d+))?
            \s+@@
          $
        ''',
        re.VERBOSE,
58 59 60 61 62 63 64
    )

    def __init__(self, output, end_nl_q1, end_nl_q2):
        self._output = output
        self._end_nl_q1 = end_nl_q1
        self._end_nl_q2 = end_nl_q2
        self._action = self.read_headers
65
        self._diff = io.BytesIO()
66 67 68 69 70
        self._success = False
        self._remaining_hunk_lines = None
        self._block_len = None
        self._direction = None
        self._end_nl = None
71
        self._max_lines = Config().max_diff_block_lines_saved
72 73 74

    @property
    def diff(self):
75
        return self._diff.getvalue().decode('UTF-8', errors='replace')
76 77 78 79 80 81

    @property
    def success(self):
        return self._success

    def parse(self):
82 83
        for line in self._output.split(b'\n'):
            self._action = self._action(line)
84

85
        self._action = None
86 87 88 89 90 91
        self._success = True

    def read_headers(self, line):
        if not line:
            return None

92
        if line.startswith(b'---'):
93 94
            return self.read_headers

95
        if line.startswith(b'+++'):
96 97
            return self.read_headers

98 99
        found = DiffParser.RANGE_RE.match(line)

100
        if not found:
101
            raise ValueError('Unable to parse diff headers: %r' % line)
102

103
        self._diff.write(line + b'\n')
104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
        if found.group('len1'):
            self._remaining_hunk_lines = int(found.group('len1'))
        else:
            self._remaining_hunk_lines = 1
        if found.group('len2'):
            self._remaining_hunk_lines += int(found.group('len2'))
        else:
            self._remaining_hunk_lines += 1

        self._direction = None

        return self.read_hunk

    def read_hunk(self, line):
        if not line:
            return None

121
        if line[:1] == b' ':
122
            self._remaining_hunk_lines -= 2
123
        elif line[:1] == b'+':
124
            self._remaining_hunk_lines -= 1
125
        elif line[:1] == b'-':
126
            self._remaining_hunk_lines -= 1
127
        elif line[:1] == b'\\':
128 129 130 131 132 133 134 135 136 137
            # When both files don't end with \n, do not show it as a difference
            if self._end_nl is None:
                end_nl1 = self._end_nl_q1.get()
                end_nl2 = self._end_nl_q2.get()
                self._end_nl = end_nl1 and end_nl2
            if not self._end_nl:
                return self.read_hunk
        elif self._remaining_hunk_lines == 0:
            return self.read_headers(line)
        else:
138
            raise ValueError('Unable to parse diff hunk: %r' % line)
139

140
        self._diff.write(line + b'\n')
141

142 143
        if line[:1] in (b'-', b'+'):
            if line[:1] == self._direction:
144 145 146
                self._block_len += 1
            else:
                self._block_len = 1
147
                self._direction = line[:1]
148

149
            if self._block_len >= self._max_lines:
150 151 152
                return self.skip_block
        else:
            self._block_len = 1
153
            self._direction = line[:1]
154 155 156 157

        return self.read_hunk

    def skip_block(self, line):
158
        if self._remaining_hunk_lines == 0 or line[:1] != self._direction:
159 160
            removed = self._block_len - Config().max_diff_block_lines_saved
            if removed:
161
                self._diff.write(b'%s[ %d lines removed ]\n' % (
Chris Lamb's avatar
Chris Lamb committed
162 163 164
                    self._direction,
                    removed,
                ))
165 166 167 168 169 170 171
            return self.read_hunk(line)

        self._block_len += 1
        self._remaining_hunk_lines -= 1

        return self.skip_block

Chris Lamb's avatar
Chris Lamb committed
172

173 174
@tool_required('diff')
def run_diff(fifo1, fifo2, end_nl_q1, end_nl_q2):
175
    cmd = [get_tool_name('diff'), '-aU7', fifo1, fifo2]
176 177 178

    logger.debug("Running %s", ' '.join(cmd))

179
    p = subprocess.run(
180 181 182 183 184
        cmd,
        bufsize=1,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
185

186
    parser = DiffParser(p.stdout, end_nl_q1, end_nl_q2)
187
    parser.parse()
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203

    logger.debug(
        "%s: returncode %d, parsed %s",
        ' '.join(cmd),
        p.returncode,
        parser.success,
    )

    if not parser.success and p.returncode not in (0, 1):
        raise subprocess.CalledProcessError(p.returncode, cmd, output=diff)

    if p.returncode == 0:
        return None

    return parser.diff

Chris Lamb's avatar
Chris Lamb committed
204

205
class FIFOFeeder(threading.Thread):
206
    def __init__(self, feeder, fifo_path, end_nl_q=None, daemon=True, *args):
207 208 209 210 211 212 213
        os.mkfifo(fifo_path)
        super().__init__(daemon=daemon)
        self.feeder = feeder
        self.fifo_path = fifo_path
        self.end_nl_q = Queue() if end_nl_q is None else end_nl_q
        self._exception = None
        self._want_join = threading.Event()
214

215 216 217
    def __enter__(self):
        self.start()
        return self
218

219 220
    def __exit__(self, exc_type, exc_value, exc_tb):
        self.join()
221

222 223 224 225 226 227 228
    def run(self):
        try:
            # Try to open the FIFO nonblocking, so we can periodically check
            # if the main thread wants us to wind down.  If it does, there's no
            # more need for the FIFO, so stop the thread.
            while True:
                try:
Chris Lamb's avatar
Chris Lamb committed
229
                    fd = os.open(self.fifo_path, os.O_WRONLY | os.O_NONBLOCK)
230 231 232 233 234 235 236 237 238
                except OSError as error:
                    if error.errno != errno.ENXIO:
                        raise
                    elif self._want_join.is_set():
                        return
                else:
                    break

            # Now clear the fd's nonblocking flag to let writes block normally.
Chris Lamb's avatar
Chris Lamb committed
239 240 241
            fcntl.fcntl(fd, fcntl.F_SETFL, 0)

            with open(fd, 'wb') as fifo:
242 243
                # The queue works around a unified diff limitation: if there's
                # no newlines in both don't make it a difference
244
                end_nl = self.feeder(fifo)
245 246 247
                self.end_nl_q.put(end_nl)
        except Exception as error:
            self._exception = error
248

249 250 251 252 253
    def join(self):
        self._want_join.set()
        super().join()
        if self._exception is not None:
            raise self._exception
254

Chris Lamb's avatar
Chris Lamb committed
255

256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319
class _Feeder:
    """
    A 'feeder' is a specialized writer.

    A 'feeder' is a callable that takes as argument a writeable file, and
    writes to it.  Feeders can transform the written data, truncate it,
    checksum it, and so on.  The callable must return True to represent that
    the data had a terminating newline, and False otherwise.

    Feeders are created by the functions make_feeder_from_raw_reader() and
    empty_file_feeder().  The returned objects are closures, and are not
    (currently?) instances of any particular class.
    """
    pass


def empty_file_feeder():
    """
    Returns a feeder that simulates an empty file.

    See _Feeder for feeders.
    """
    def feeder(f):
        return False
    return feeder


def make_feeder_from_raw_reader(in_file, filter=None):
    """
    Create a feeder that checksums, truncates, and transcodes the data.  The
    optional argument FILTER is a callable that gets passed each line, and
    returns the line that should be used in its stead.  (There is no facility
    for FILTER to discard a line entirely.)

    See _Feeder for feeders.
    """
    def feeder(out_file):
        h = None
        end_nl = False
        max_lines = Config().max_diff_input_lines
        line_count = 0

        if max_lines < float("inf"):
            h = hashlib.sha1()

        for buf in in_file:
            line_count += 1
            out = filter(buf) if filter else buf
            if h:
                h.update(out)
            if line_count < max_lines:
                out_file.write(out)
            end_nl = buf[-1] == '\n'

        if h and line_count >= max_lines:
            out_file.write("[ Too much input for diff (SHA1: {}) ]\n".format(
                h.hexdigest(),
            ).encode('utf-8'))
            end_nl = True

        return end_nl
    return feeder


320
def diff(feeder1, feeder2):
321 322
    tmpdir = get_temporary_directory().name

323 324 325
    fifo1_path = os.path.join(tmpdir, 'fifo1')
    fifo2_path = os.path.join(tmpdir, 'fifo2')
    with FIFOFeeder(feeder1, fifo1_path) as fifo1, \
Chris Lamb's avatar
Chris Lamb committed
326
            FIFOFeeder(feeder2, fifo2_path) as fifo2:
327
        return run_diff(fifo1_path, fifo2_path, fifo1.end_nl_q, fifo2.end_nl_q)
328

Chris Lamb's avatar
Chris Lamb committed
329

330 331 332 333 334 335 336
def diff_split_lines(diff, keepends=True):
    lines = diff.split("\n")
    if not keepends:
        return lines
    return [line + "\n" for line in lines[:-1]] + ([lines[-1]] if lines[-1] else [])


337
def reverse_unified_diff(diff):
338
    assert isinstance(diff, str)
339
    res = []
340 341
    # XXX - should move to just use plain bytes nearly everywhere until ready
    # to print instead of using strings internally as well.
342
    for line in diff_split_lines(diff):
343 344
        line = line.encode('utf-8', errors='replace')
        assert isinstance(line, bytes)
345 346 347 348 349
        found = DiffParser.RANGE_RE.match(line)

        if found:
            before = found.group('start2')
            if found.group('len2') is not None:
350
                before += b',' + found.group('len2')
351 352 353

            after = found.group('start1')
            if found.group('len1') is not None:
354
                after += b',' + found.group('len1')
355

356 357 358
            res.append(b'@@ -%s +%s @@\n' % (before, after))
        elif line.startswith(b'-'):
            res.append(b'+')
359
            res.append(line[1:])
360 361
        elif line.startswith(b'+'):
            res.append(b'-')
362 363 364
            res.append(line[1:])
        else:
            res.append(line)
365
    return b''.join(res).decode('utf-8', errors='replace')
366

Chris Lamb's avatar
Chris Lamb committed
367

368 369 370 371 372 373 374 375 376 377 378 379
def color_unified_diff(diff):
    RESET = '\033[0m'
    RED, GREEN, CYAN = '\033[31m', '\033[32m', '\033[0;36m'

    def repl(m):
        return '{}{}{}'.format({
            '-': RED,
            '@': CYAN,
            '+': GREEN,
        }[m.group(1)], m.group(0), RESET)

    return re_diff_change.sub(repl, diff)
380

Mattia Rizzolo's avatar
Mattia Rizzolo committed
381

382 383
DIFFON = "\x01"
DIFFOFF = "\x02"
Mattia Rizzolo's avatar
Mattia Rizzolo committed
384
MAX_WF_SIZE = 1024  # any higher, and linediff takes >1 second and >200MB RAM
385

Mattia Rizzolo's avatar
Mattia Rizzolo committed
386

387
def _linediff_sane(x):
388 389 390
    # turn non-printable chars into "."
    return "." if ord(x) < 32 and x not in '\t\n' else x

Mattia Rizzolo's avatar
Mattia Rizzolo committed
391

392 393 394 395 396 397
def diffinput_truncate(s, sz):
    # Truncate, preserving uniqueness
    if len(s) > sz:
        s = s[:sz] + "[ ... truncated by diffoscope; len: {}, SHA1: {} ... ]".format(
            len(s[sz:]), hashlib.sha1(s[sz:].encode('utf-8')).hexdigest())
    return s
398

Mattia Rizzolo's avatar
Mattia Rizzolo committed
399

400
def linediff(s, t, diffon, diffoff):
401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
    # calculate common prefix/suffix, easy optimisation to WF
    prefix = os.path.commonprefix((s, t))
    if prefix:
        s = s[len(prefix):]
        t = t[len(prefix):]
    suffix = os.path.commonprefix((s[::-1], t[::-1]))[::-1]
    if suffix:
        s = s[:-len(suffix)]
        t = t[:-len(suffix)]

    # truncate so WF doesn't blow up RAM
    s = diffinput_truncate(s, MAX_WF_SIZE)
    t = diffinput_truncate(t, MAX_WF_SIZE)
    l1, l2 = zip(*linediff_simplify(linediff_wagnerfischer(s, t)))

    def to_string(k, v):
        sanev = "".join(_linediff_sane(c) for c in v)
        return (diffon + sanev + diffoff) if k else sanev
    s1 = ''.join(to_string(*p) for p in l1)
    t1 = ''.join(to_string(*p) for p in l2)
    return prefix + s1 + suffix, prefix + t1 + suffix

Mattia Rizzolo's avatar
Mattia Rizzolo committed
423

424
def linediff_wagnerfischer(s, t):
425
    '''
426
    Line diff algorithm, originally from diff2html.
427

428 429 430 431 432
    https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm

    Finds the minimum (levenshtein) edit distance between two strings, but has
    quadratic performance O(m*n).
    '''
433
    m, n = len(s), len(t)
434
    d = [[(0, 0) for i in range(n + 1)] for i in range(m + 1)]
435 436

    d[0][0] = (0, (0, 0))
437 438 439 440 441 442 443 444
    for i in range(1, m + 1):
        d[i][0] = (i, (i -1, 0))
    for j in range(1, n + 1):
        d[0][j] = (j, (0, j - 1))

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s[i - 1] == t[j - 1]:
445 446 447
                cost = 0
            else:
                cost = 1
448 449 450
            d[i][j] = min((d[i - 1][j][0] + 1, (i -1, j)),
                          (d[i][j - 1][0] + 1, (i, j - 1)),
                          (d[i - 1][j - 1][0] + cost, (i - 1, j - 1)))
451

452
    coords = []
453 454
    coord = (m, n)
    while coord != (0, 0):
455
        coords.insert(0, coord)
456 457 458
        x, y = coord
        coord = d[x][y][1]

459
    for coord in coords:
460 461 462 463 464 465 466
        cx, cy = coord
        child_val = d[cx][cy][0]

        father_coord = d[cx][cy][1]
        fx, fy = father_coord
        father_val = d[fx][fy][0]

467
        diff = (cx - fx, cy - fy)
468 469

        if diff == (0, 1):
470
            yield (False, ""), (True, t[fy])
471
        elif diff == (1, 0):
472
            yield (True, s[fx]), (False, "")
473
        elif child_val - father_val == 1:
474
            yield (True, s[fx]), (True, t[fy])
475
        else:
476 477 478
            assert s[fx] == t[fy]
            yield (False, s[fx]), (False, t[fy])

Mattia Rizzolo's avatar
Mattia Rizzolo committed
479

480 481 482 483 484 485 486
def linediff_simplify(g):
    """Simplify the output of WF."""
    current = None
    for l, r in g:
        if not current:
            current = l, r
        elif current[0][0] == l[0] and current[1][0] == r[0]:
487 488
            current = (l[0], current[0][1] + l[1]
                       ), (r[0], current[1][1] + r[1])
489 490 491 492 493
        else:
            yield current
            current = l, r
    if current:
        yield current
494

Mattia Rizzolo's avatar
Mattia Rizzolo committed
495

496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528
class SideBySideDiff(object):
    """Calculates a side-by-side diff from a unified diff."""

    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
        self.unified_diff = unified_diff
        self.diffon = diffon
        self.diffoff = diffoff
        self.reset()

    def reset(self):
        self.buf = []
        self.add_cpt = 0
        self.del_cpt = 0
        self.line1 = 0
        self.line2 = 0
        self.hunk_off1 = 0
        self.hunk_size1 = 0
        self.hunk_off2 = 0
        self.hunk_size2 = 0
        self._bytes_processed = 0

    @property
    def bytes_processed(self):
        return self._bytes_processed

    def empty_buffer(self):
        if self.del_cpt == 0 or self.add_cpt == 0:
            for l in self.buf:
                yield from self.yield_line(l[0], l[1])

        elif self.del_cpt != 0 and self.add_cpt != 0:
            l0, l1 = [], []
            for l in self.buf:
Mattia Rizzolo's avatar
Mattia Rizzolo committed
529
                if l[0] is not None:
530
                    l0.append(l[0])
Mattia Rizzolo's avatar
Mattia Rizzolo committed
531
                if l[1] is not None:
532 533 534 535 536 537 538 539 540 541 542 543 544 545
                    l1.append(l[1])
            max_len = (len(l0) > len(l1)) and len(l0) or len(l1)
            for i in range(max_len):
                s0, s1 = "", ""
                if i < len(l0):
                    s0 = l0[i]
                if i < len(l1):
                    s1 = l1[i]
                yield from self.yield_line(s0, s1)

    def yield_line(self, s1, s2):
        orig1 = s1
        orig2 = s2

546
        if s1 is None and s2 is None:
547 548 549
            type_name = "unmodified"
        elif s1 == "" and s2 == "":
            type_name = "unmodified"
550
        elif s1 is None or s1 == "":
551
            type_name = "added"
552
        elif s2 is None or s2 == "":
553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597
            type_name = "deleted"
        elif orig1 == orig2 and not s1.endswith('lines removed ]') and not s2.endswith('lines removed ]'):
            type_name = "unmodified"
        else:
            type_name = "changed"
            s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)

        yield "L", (type_name, s1, self.line1, s2, self.line2)

        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
        if m:
            self.line1 += int(m.group(1))
        elif orig1:
            self.line1 += 1
        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
        if m:
            self.line2 += int(m.group(1))
        elif orig2:
            self.line2 += 1

        self.add_cpt = 0
        self.del_cpt = 0
        self.buf = []

    def items(self):
        """Yield the items that form the side-by-side diff.

        Each item is a (type, value) tuple, as follows:

        type == "H", value is a tuple representing a hunk header
            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
            all ints

        type == "L", value is a tuple representing a line of a hunk
            mode, line1, lineno1, line2, lineno2 = value
            where mode is one of {"unmodified", "added", "deleted", "changed"}
            line* are strings
            lineno* are ints

        type == "C", value is a comment
            comment = value
            a string
        """
        self.reset()

598
        for l in diff_split_lines(self.unified_diff, False):
599 600 601 602 603 604 605 606 607 608 609 610 611
            self._bytes_processed += len(l) + 1
            m = re.match(r'^--- ([^\s]*)', l)
            if m:
                yield from self.empty_buffer()
                continue
            m = re.match(r'^\+\+\+ ([^\s]*)', l)
            if m:
                yield from self.empty_buffer()
                continue

            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
            if m:
                yield from self.empty_buffer()
Chris Lamb's avatar
Chris Lamb committed
612
                hunk_data = map(lambda x: x == "" and 1 or int(x), m.groups())
613 614 615 616 617 618 619 620 621 622 623
                self.hunk_off1, self.hunk_size1, self.hunk_off2, self.hunk_size2 = hunk_data
                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
                yield "H", (self.hunk_off1, self.hunk_size1, self.hunk_off2, self.hunk_size2)
                continue

            if re.match(r'^\[', l):
                yield from self.empty_buffer()
                yield "C", l

            if re.match(r"^\\ No newline", l):
                if self.hunk_size2 == 0:
624 625
                    self.buf[-1] = (self.buf[-1][0],
                                    self.buf[-1][1] + '\n' + l[2:])
626
                else:
627
                    self.buf[-1] = (self.buf[-1][0] + '\n' +
628
                                    l[2:], self.buf[-1][1])
629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670
                continue

            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
                yield from self.empty_buffer()
                continue

            m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
            if m:
                self.add_cpt += int(m.group(1))
                self.hunk_size2 -= int(m.group(1))
                self.buf.append((None, l[1:]))
                continue

            if re.match(r"^\+", l):
                self.add_cpt += 1
                self.hunk_size2 -= 1
                self.buf.append((None, l[1:]))
                continue

            m = re.match(r"^-\[ (\d+) lines removed \]$", l)
            if m:
                self.del_cpt += int(m.group(1))
                self.hunk_size1 -= int(m.group(1))
                self.buf.append((l[1:], None))
                continue

            if re.match(r"^-", l):
                self.del_cpt += 1
                self.hunk_size1 -= 1
                self.buf.append((l[1:], None))
                continue

            if re.match(r"^ ", l) and self.hunk_size1 and self.hunk_size2:
                yield from self.empty_buffer()
                self.hunk_size1 -= 1
                self.hunk_size2 -= 1
                self.buf.append((l[1:], l[1:]))
                continue

            yield from self.empty_buffer()

        yield from self.empty_buffer()