html.py 18.5 KB
Newer Older
1 2
# -*- coding: utf-8 -*-
#
Jérémy Bobbio's avatar
Jérémy Bobbio committed
3
# diffoscope: in-depth comparison of files, archives, and directories
4
#
Jérémy Bobbio's avatar
Jérémy Bobbio committed
5 6
# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
#           ©      2015 Reiner Herrmann <reiner@reiner-h.de>
7 8 9 10
#           © 2012-2013 Olivier Matz <zer0@droids-corp.org>
#           ©      2012 Alan De Smet <adesmet@cs.wisc.edu>
#           ©      2012 Sergey Satskiy <sergey.satskiy@gmail.com>
#           ©      2012 scito <info@scito.ch>
Jérémy Bobbio's avatar
Jérémy Bobbio committed
11
#
12
#
Jérémy Bobbio's avatar
Jérémy Bobbio committed
13
# diffoscope is free software: you can redistribute it and/or modify
14 15 16 17
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
Jérémy Bobbio's avatar
Jérémy Bobbio committed
18
# diffoscope is distributed in the hope that it will be useful,
19 20 21 22 23
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
Jérémy Bobbio's avatar
Jérémy Bobbio committed
24
# along with diffoscope.  If not, see <http://www.gnu.org/licenses/>.
25 26 27 28 29 30 31 32
#
#
# Most of the code is borrowed from diff2html.py available at:
# http://git.droids-corp.org/?p=diff2html.git
#
# Part of the code is inspired by diff2html.rb from
# Dave Burt <dave (at) burt.id.au> (mainly for html theme)
#
33

34
import cgi
35
from io import StringIO
36
import re
37
import sys
38 39 40 41 42
import os
import os.path
import codecs
import hashlib
from contextlib import contextmanager
43
from xml.sax.saxutils import escape
Jérémy Bobbio's avatar
Jérémy Bobbio committed
44
from diffoscope import logger, VERSION
45
from diffoscope.config import Config
46
from diffoscope.presenters.icon import FAVICON_BASE64
Jérémy Bobbio's avatar
Jérémy Bobbio committed
47
from functools import reduce
48

49 50 51
# Minimum line size: a zero-width breakable space is inserted once more
# than LINESIZE characters have been written without a break opportunity.
LINESIZE = 20
# Lines longer than this many characters are truncated before rendering.
MAX_LINE_SIZE = 1024
# Column distance between tab stops when tabs are rendered visibly.
TABSIZE = 8

# Characters we're willing to word wrap on
WORDBREAK = " \t;.,/):-"

# In-band control characters marking the start / end of a changed span
# inside a line; they are turned into opening / closing tags on output.
DIFFON = "\x01"
DIFFOFF = "\x02"

61
# Page prologue, used as a %-format template.  Expected keys: 'favicon'
# (base64 PNG data), 'title' and 'css_link' (markup inserted verbatim, may
# be empty).  Literal percent signs in the CSS are doubled.
# NOTE(review): the stylesheet targets `td.line` while the row renderer in
# this module emits `class="diffline"` -- confirm which name is intended.
HEADER = """<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8" />
  <meta name="generator" content="diffoscope" />
  <link rel="icon" type="image/png" href="data:image/png;base64,%(favicon)s" />
  <title>%(title)s</title>
  <style>
    body {
      background: white;
      color: black;
    }
    .footer {
      font-size: small;
    }
    .difference {
      border: outset #888 1px;
      background-color:rgba(0,0,0,.1);
      padding: 0.5em;
      margin: 0.5em 0;
    }
    .difference table {
      table-layout: fixed;
      width: 100%%;
      border: 0;
    }
    .difference th,
    .difference td {
      border: 0;
    }
    table.diff {
      border: 0px;
      border-collapse:collapse;
      font-size:0.75em;
      font-family: Lucida Console, monospace;
    }
    td.line {
      color:#8080a0
    }
    th {
      background: black;
      color: white
    }
    tr.diffunmodified td {
      background: #D0D0E0
    }
    tr.diffhunk td {
      background: #A0A0A0
    }
    tr.diffadded td {
      background: #CCFFCC
    }
    tr.diffdeleted td {
      background: #FFCCCC
    }
    tr.diffchanged td {
      background: #FFFFA0
    }
    ins, del {
      background: #E0C880;
      text-decoration: none
    }
    span.diffponct {
      color: #B08080
    }
    .comment {
      font-style: italic;
    }
    .source {
      font-weight: bold;
    }
    .error {
      border: solid black 1px;
      background: red;
      color: white;
      padding: 0.2em;
    }
    .anchor {
      margin-left: 0.5em;
      font-size: 80%%;
      color: #333;
      text-decoration: none;
      display: none;
    }
    .diffheader:hover .anchor {
      display: inline;
    }
    .ondemand {
      text-align: center;
    }
  </style>
  %(css_link)s
</head>
<body>
"""

# Page epilogue, used as a %-format template with the single key 'version'.
FOOTER = """
<div class="footer">Generated by diffoscope %(version)s</div>
</body>
</html>
"""

163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
# Script section, used as a %-format template with the single key
# 'jquery_url'.  The inline handler intercepts clicks on links inside
# `div.ondemand`, shows a loading notice, fetches the linked page and pulls
# its <table> into the placeholder div, then unwraps the table from that div.
SCRIPTS = """
<script src="%(jquery_url)s"></script>
<script type="text/javascript">
$(function() {
  $("div.ondemand a").on('click', function (){
    var filename = $(this).attr('href');
    var div = $(this).parent();
    div.text('... loading ...');
    div.load(filename + " table", function() {
        // http://stackoverflow.com/a/8452751/946226
        $(this).children(':first').unwrap();
    });
    return false;
  });
});
</script>
"""
180 181 182 183

class PrintLimitReached(Exception):
    """Raised by a size-limited print function once its output limit is hit."""

184

185
def create_limited_print_func(print_func, max_page_size):
    """Wrap *print_func* so that it stops accepting output past a size limit.

    The returned callable has the signature ``limited(s, force=False)``.
    Every call forwards ``s`` to *print_func* and adds ``len(s)`` to the
    running total exposed as the ``char_count`` attribute of the callable.

    :param print_func: callable taking one string argument.
    :param max_page_size: total number of characters after which
        non-forced writes raise.
    :raises PrintLimitReached: from the returned callable, *after* the
        string has been written, when the total has reached
        *max_page_size* and ``force`` is false.  Forced writes never raise,
        so closing markup can always be emitted.
    """
    def limited_print_func(s, force=False):
        print_func(s)
        limited_print_func.char_count += len(s)
        if not force and limited_print_func.char_count >= max_page_size:
            raise PrintLimitReached()

    # Initialise the counter once, instead of probing for it on every call.
    limited_print_func.char_count = 0
    return limited_print_func

195

196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
# Module-level state shared by the unified-diff rendering functions.
# Pending rows as (left_text, right_text) tuples; None marks an absent side.
buf = []
# Number of added / deleted lines accumulated in `buf` since the last flush.
add_cpt, del_cpt = 0, 0
# Line numbers to display for the next left / right line.
line1, line2 = 0, 0
# Offsets and remaining line counts of the hunk being processed, taken from
# its "@@ -off1,size1 +off2,size2" header; sizes are decremented as lines
# are consumed.
hunk_off1, hunk_size1, hunk_off2, hunk_size2 = 0, 0, 0, 0


def sane(x):
    """Return *x* with control characters replaced by dots.

    Every character whose code point is below 32 becomes ``"."``, except
    tab and newline which are kept as they are.
    """
    return "".join(
        ch if ch in ('\t', '\n') or ord(ch) >= 32 else "."
        for ch in x
    )


def linediff(s, t):
    '''
    Original line diff algorithm of diff2html. It's character based.

    Computes a character-level edit script between the strings *s* and *t*
    and returns the pair ``(s_marked, t_marked)`` in which every run of
    changed characters is wrapped between DIFFON and DIFFOFF.  Control
    characters are first neutralised with sane(), so the markers cannot
    clash with the input.

    Time and memory are proportional to ``len(s) * len(t)``.
    '''
    if len(s):
        s = ''.join(sane(c) for c in s)
    if len(t):
        t = ''.join(sane(c) for c in t)

    m, n = len(s), len(t)
    # d[i][j] = (edit distance between s[:i] and t[:j], predecessor cell).
    d = [[(0, 0)] * (n + 1) for _ in range(m + 1)]

    d[0][0] = (0, (0, 0))
    for i in range(1, m + 1):
        d[i][0] = (i, (i - 1, 0))
    for j in range(1, n + 1):
        d[0][j] = (j, (0, j - 1))

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s[i - 1] == t[j - 1] else 1
            # min() compares whole tuples, so ties on the distance are
            # broken by the smallest predecessor coordinates.
            d[i][j] = min((d[i - 1][j][0] + 1, (i - 1, j)),
                          (d[i][j - 1][0] + 1, (i, j - 1)),
                          (d[i - 1][j - 1][0] + cost, (i - 1, j - 1)))

    # Follow predecessor links from the end back to the origin, then put
    # the path in forward order (append + reverse avoids the quadratic
    # cost of inserting at the head of the list).
    path = []
    coord = (m, n)
    while coord != (0, 0):
        path.append(coord)
        coord = d[coord[0]][coord[1]][1]
    path.reverse()

    l1 = []
    l2 = []

    for cx, cy in path:
        child_val, (fx, fy) = d[cx][cy]
        father_val = d[fx][fy][0]

        step = (cx - fx, cy - fy)

        if step == (0, 1):
            # Character present only in t.
            l1.append("")
            l2.append(DIFFON + t[fy] + DIFFOFF)
        elif step == (1, 0):
            # Character present only in s.
            l1.append(DIFFON + s[fx] + DIFFOFF)
            l2.append("")
        elif child_val - father_val == 1:
            # Diagonal step that cost one edit: substitution.
            l1.append(DIFFON + s[fx] + DIFFOFF)
            l2.append(DIFFON + t[fy] + DIFFOFF)
        else:
            # Diagonal step at no cost: identical characters.
            l1.append(s[fx])
            l2.append(t[fy])

    # Dropping every DIFFOFF immediately followed by DIFFON merges adjacent
    # changed characters into a single marked run.
    return (''.join(l1).replace(DIFFOFF + DIFFON, ''),
            ''.join(l2).replace(DIFFOFF + DIFFON, ''))
276 277


278
def convert(s, ponct=0, tag=''):
    """Render the string *s* as an HTML fragment.

    * DIFFON / DIFFOFF markers become ``<tag>`` / ``</tag>``.
    * When ``ponct == 1``, tabs, spaces and newlines are drawn with visible
      markers wrapped in ``<span class="diffponct">``.
    * Remaining characters below code point 32 are shown as ``\\xNN``
      inside ``<em>``.
    * Everything else is HTML-escaped (``&``, ``<`` and ``>``).

    A zero-width space (U+200B) is written after every WORDBREAK character
    and whenever more than LINESIZE columns have been produced without one.

    :param s: text to render, possibly containing DIFFON / DIFFOFF.
    :param ponct: set to 1 to make whitespace visible.
    :param tag: element name used for marked runs (e.g. 'ins' or 'del').
    :returns: the HTML fragment as a string.
    """
    i = 0  # columns written since the last break opportunity
    t = StringIO()
    for c in s:
        # used by diffs
        if c == DIFFON:
            t.write('<%s>' % tag)
        elif c == DIFFOFF:
            t.write('</%s>' % tag)

        # special highlighted chars
        elif c == "\t" and ponct == 1:
            # i % TABSIZE lies in [0, TABSIZE), so n is always >= 1.
            n = TABSIZE - (i % TABSIZE)
            t.write('<span class="diffponct">\xbb</span>' + '\xa0' * (n - 1))
        elif c == " " and ponct == 1:
            t.write('<span class="diffponct">\xb7</span>')
        elif c == "\n" and ponct == 1:
            # A literal backslash marks the line break.
            t.write('<br/><span class="diffponct">\\</span>')
        elif ord(c) < 32:
            conv = u"\\x%x" % ord(c)
            t.write('<em>%s</em>' % conv)
            i += len(conv)
        else:
            # cgi.escape() is gone from Python 3.8+; saxutils.escape
            # replaces the same three characters.
            t.write(escape(c))
            i += 1

        if c in WORDBREAK:
            t.write('\u200b')
            i = 0
        if i > LINESIZE:
            i = 0
            t.write('\u200b')

    return t.getvalue()
314 315 316 317 318 319 320 321 322 323 324 325 326 327


def output_hunk(print_func):
    """Print the table row announcing a hunk.

    The left and right halves of the row report the offset and size stored
    in the module-level hunk_off1 / hunk_size1 and hunk_off2 / hunk_size2.
    """
    halves = (
        (u'<tr class="diffhunk"><td colspan="2">Offset %d, %d lines modified</td>',
         (hunk_off1, hunk_size1)),
        (u'<td colspan="2">Offset %d, %d lines modified</td></tr>\n',
         (hunk_off2, hunk_size2)),
    )
    for template, values in halves:
        print_func(template % values)


def _line_advance(text):
    """Return how many source lines the displayed *text* stands for."""
    if not text:
        return 0
    m = re.match(r"^\[ (\d+) lines removed \]$", text)
    return int(m.group(1)) if m else 1


def _output_side(print_func, text, lineno, tag):
    """Print the line-number and content cells for one side of a row."""
    if text:
        print_func(u'<td class="diffline">%d </td>' % lineno)
        print_func(u'<td class="diffpresent">')
        print_func(convert(text, ponct=1, tag=tag))
        print_func(u'</td>')
    else:
        # Nothing on this side: a single filler cell spanning both columns.
        print_func(u'<td colspan="2">\xa0</td>')


def output_line(print_func, s1, s2):
    """Print one table row comparing the left line *s1* with the right *s2*.

    Either argument may be None or "" when that side has no line.  Lines
    longer than MAX_LINE_SIZE are truncated for display.  The row class is
    one of diffunmodified / diffadded / diffdeleted / diffchanged; for
    changed rows the differing characters are highlighted via linediff().

    After the row is written, the module-level counters line1 and line2
    advance by one per displayed line, or by N for a "[ N lines removed ]"
    placeholder.

    :param print_func: limited print function; must accept ``force=True``.
    """
    global line1
    global line2

    orig1 = s1
    orig2 = s2

    if s1 and len(s1) > MAX_LINE_SIZE:
        s1 = s1[:MAX_LINE_SIZE] + u" ✂"
    if s2 and len(s2) > MAX_LINE_SIZE:
        s2 = s2[:MAX_LINE_SIZE] + u" ✂"

    if s1 is None and s2 is None:
        type_name = "unmodified"
    elif s1 == "" and s2 == "":
        type_name = "unmodified"
    elif s1 is None or s1 == "":
        type_name = "added"
    elif s2 is None or s2 == "":
        type_name = "deleted"
    elif (orig1 == orig2
          and not s1.endswith('lines removed ]')
          and not s2.endswith('lines removed ]')):
        # Identical placeholders on both sides still count as a change.
        type_name = "unmodified"
    else:
        type_name = "changed"
        s1, s2 = linediff(s1, s2)

    print_func(u'<tr class="diff%s">' % type_name)
    try:
        _output_side(print_func, s1, line1, 'del')
        _output_side(print_func, s2, line2, 'ins')
    finally:
        # Always close the row, even when the size limit interrupts it.
        print_func(u"</tr>\n", force=True)

    line1 += _line_advance(orig1)
    line2 += _line_advance(orig2)


def empty_buffer(print_func):
    """Flush the pending rows held in the module-level ``buf``.

    When the buffer mixes additions and deletions, the non-None left
    entries and the non-None right entries are paired up by position so
    they appear side by side, the shorter side being padded with "".
    Otherwise each buffered pair is printed as it is.  The buffer and the
    add / delete counters are reset afterwards.
    """
    global buf
    global add_cpt
    global del_cpt

    if add_cpt and del_cpt:
        removed = [left for left, _ in buf if left is not None]
        added = [right for _, right in buf if right is not None]
        for idx in range(max(len(removed), len(added))):
            left = removed[idx] if idx < len(removed) else ""
            right = added[idx] if idx < len(added) else ""
            output_line(print_func, left, right)
    else:
        for left, right in buf:
            output_line(print_func, left, right)

    add_cpt, del_cpt = 0, 0
    buf = []


408
def output_unified_diff_table(print_func, unified_diff):
    """Render *unified_diff* (text in unified diff format) as an HTML table.

    Lines are parsed one by one; additions and deletions are accumulated in
    the module-level buffer and flushed through empty_buffer() whenever a
    file header, hunk header, context line or unrecognised line is met.

    :param print_func: limited print function; must accept ``force=True``.
    :param unified_diff: the diff as a single string.
    :raises PrintLimitReached: propagated from *print_func*; the closing
        ``</table>`` is written regardless.
    """
    global add_cpt, del_cpt
    global line1, line2
    global hunk_off1, hunk_size1, hunk_off2, hunk_size2

    print_func(u'<table class="diff">\n')
    try:
        # The bare "99%" used previously is not a CSS declaration.
        print_func(u'<colgroup><col style="width: 3em;"/><col style="width: 99%;"/>\n')
        print_func(u'<col style="width: 3em;"/><col style="width: 99%;"/></colgroup>\n')

        for l in unified_diff.splitlines():
            # File headers carry nothing to display.
            if l.startswith(('--- ', '+++ ')):
                empty_buffer(print_func)
                continue

            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", l)
            if m:
                empty_buffer(print_func)
                # An omitted size in the hunk header means one line.
                hunk_off1, hunk_size1, hunk_off2, hunk_size2 = [
                    1 if x == "" else int(x) for x in m.groups()]
                line1, line2 = hunk_off1, hunk_off2
                output_hunk(print_func)
                continue

            if l.startswith('['):
                # Bracketed notice line: shown verbatim, so escape it.
                # NOTE(review): the cell is emitted without an enclosing
                # <tr>; kept as-is to leave the table structure unchanged.
                empty_buffer(print_func)
                print_func(u'<td colspan="2">%s</td>\n' % escape(l))
                continue

            if l.startswith('\\ No newline'):
                if buf:
                    left, right = buf[-1]
                    suffix = '\n' + l[2:]
                    # Attach the notice to the right side once the right
                    # half of the hunk is exhausted, to the left side
                    # otherwise -- falling back to whichever side actually
                    # holds text so a pure addition or deletion cannot
                    # end up concatenating None with a string.
                    if (hunk_size2 == 0 and right is not None) or left is None:
                        buf[-1] = (left, right + suffix)
                    else:
                        buf[-1] = (left + suffix, right)
                continue

            if hunk_size1 <= 0 and hunk_size2 <= 0:
                empty_buffer(print_func)
                continue

            if l.startswith('+'):
                m = re.match(r"^\+\[ (\d+) lines removed \]$", l)
                count = int(m.group(1)) if m else 1
                add_cpt += count
                hunk_size2 -= count
                buf.append((None, l[1:]))
                continue

            if l.startswith('-'):
                m = re.match(r"^-\[ (\d+) lines removed \]$", l)
                count = int(m.group(1)) if m else 1
                del_cpt += count
                hunk_size1 -= count
                buf.append((l[1:], None))
                continue

            if l.startswith(' ') and hunk_size1 and hunk_size2:
                # Context line: flush pending changes, then queue it.
                empty_buffer(print_func)
                hunk_size1 -= 1
                hunk_size2 -= 1
                buf.append((l[1:], l[1:]))
                continue

            empty_buffer(print_func)

        empty_buffer(print_func)
    finally:
        print_func(u"</table>", force=True)
490

491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
def output_unified_diff(print_func, css_url, directory, unified_diff):
    """Print *unified_diff* inline, or move it to a page of its own.

    When *directory* is set and the diff is longer than
    ``Config.general.separate_file_diff_size``, the table is written to
    ``<md5 of the diff>.html`` inside *directory* and only a "load diff"
    placeholder linking to that page is printed through *print_func*.
    In every other case the table is printed directly.
    """
    if not directory or len(unified_diff) <= Config.general.separate_file_diff_size:
        output_unified_diff_table(print_func, unified_diff)
        return

    # open a new file for this table
    digest = hashlib.md5(unified_diff.encode('utf-8')).hexdigest()
    filename = "%s.html" % digest
    logger.debug('separate html output for diff of size %d', len(unified_diff))
    with file_printer(directory, filename) as page_print:
        output_header(css_url, page_print)
        output_unified_diff_table(page_print, unified_diff)
        output_footer(page_print)

    placeholder = (
        "<div class='ondemand'>\n",
        "... <a href='%s'>load diff</a> ...\n" % escape(filename),
        "</div>\n",
    )
    for chunk in placeholder:
        print_func(chunk)
507

508
def output_difference(difference, print_func, css_url, directory, parents):
    """Print the HTML block for *difference* and, recursively, its details.

    :param difference: object providing ``source1``, ``source2``,
        ``comments``, ``unified_diff`` and ``details``.
    :param print_func: limited print function; must accept ``force=True``.
    :param css_url: forwarded to output_unified_diff().
    :param directory: output directory for separate diff pages, or None.
    :param parents: list of the ``source1`` names of the enclosing
        differences; used to build this block's anchor.
    :raises PrintLimitReached: re-raised after logging; the enclosing
        ``</div>`` is written regardless.
    """
    if Config.general.hide_profile is not None:
        if difference.source1 in Config.general.hide_profile:
            logger.debug('output for %s is hidden', difference.source1)
            return
    logger.debug('html output for %s', difference.source1)
    sources = parents + [difference.source1]
    print_func(u"<div class='difference'>")
    try:
        print_func(u"<div class='diffheader'>")
        if difference.source1 == difference.source2:
            # The span used to be "closed" with a second opening <span>.
            print_func(u"<div><span class='source'>%s</span>"
                       % escape(difference.source1))
        else:
            print_func(u"<div><span class='source'>%s</span> vs.</div>"
                       % escape(difference.source1))
            print_func(u"<div><span class='source'>%s</span>"
                       % escape(difference.source2))
        # Source names end up inside quoted attributes, so quotes have to
        # be escaped in addition to &, < and >.
        anchor = escape('/'.join(sources[1:]), {"'": "&#39;", '"': "&quot;"})
        print_func(u" <a class='anchor' href='#%s' name='%s'>\xb6</a>" % (anchor, anchor))
        print_func(u"</div>")
        if difference.comments:
            print_func(u"<div class='comment'>%s</div>"
                       % u'<br />'.join(map(escape, difference.comments)))
        print_func(u"</div>")
        if difference.unified_diff:
            output_unified_diff(print_func, css_url, directory, difference.unified_diff)
        for detail in difference.details:
            output_difference(detail, print_func, css_url, directory, sources)
    except PrintLimitReached:
        logger.debug('print limit reached')
        raise
    finally:
        print_func(u"</div>", force=True)
542

543

544 545 546 547 548 549
def output_header(css_url, print_func):
    """Print the page prologue built from the HEADER template.

    :param css_url: URL of an extra stylesheet to link, or a false value
        for none.
    :param print_func: callable taking the text to write.

    The page title is the command line (``sys.argv`` joined by spaces).
    """
    if css_url:
        # The URL sits inside a double-quoted attribute: escape the quote
        # as well as &, < and >.
        css_link = ('<link href="%s" type="text/css" rel="stylesheet" />'
                    % escape(css_url, {'"': '&quot;'}))
    else:
        css_link = ''
    print_func(HEADER % {'title': escape(' '.join(sys.argv)),
                         'favicon': FAVICON_BASE64,
                         'css_link': css_link,
                        })

554 555 556
def output_footer(print_func):
    """Print the page epilogue; forced so it is written past the size limit."""
    footer_html = FOOTER % {'version': VERSION}
    print_func(footer_html, force=True)

557

558
def output_html(difference, css_url=None, print_func=None):
    """
    Default presenter, all in one HTML file

    :param difference: top-level difference to render.
    :param css_url: optional URL of an extra stylesheet.
    :param print_func: callable taking one string; defaults to ``print``.

    Output is capped at ``Config.general.max_report_size`` characters.
    When the cap is hit an error notice is written instead of the rest of
    the report; the footer is written in every case.
    """
    if print_func is None:
        print_func = print
    print_func = create_limited_print_func(print_func, Config.general.max_report_size)
    try:
        output_header(css_url, print_func)
        output_difference(difference, print_func, css_url, None, [])
    except PrintLimitReached:
        logger.debug('print limit reached')
        print_func(u"<div class='error'>Max output size reached.</div>",
                   force=True)
    output_footer(print_func)

@contextmanager
def file_printer(directory, filename):
    """Context manager yielding a size-limited print function for a file.

    Opens ``directory/filename`` for writing as UTF-8 and yields the
    file's ``write`` method wrapped by create_limited_print_func() with
    ``Config.general.max_report_size`` as limit.  The file is closed when
    the context exits.
    """
    target = os.path.join(directory, filename)
    with codecs.open(target, 'w', encoding='utf-8') as stream:
        yield create_limited_print_func(stream.write, Config.general.max_report_size)

581 582
# Paths probed, in order, for a locally installed jQuery when no jQuery URL
# is supplied to output_html_directory().
JQUERY_SYSTEM_LOCATIONS = ['/usr/share/javascript/jquery/jquery.js']

583 584 585 586 587 588 589 590 591 592 593 594 595 596
def output_html_directory(directory, difference, css_url=None, jquery_url=None):
    """
    Multi-file presenter. Writes to a directory, and puts large diff tables
    into files of their own.

    This uses jQuery. By default it uses /usr/share/javascript/jquery/jquery.js
    (symlinked, so that you can still share the result over HTTP).
    You can also pass --jquery URL to diffoscope to use a central jQuery copy.

    :param directory: output directory; created when missing.
    :param difference: top-level difference to render into ``index.html``.
    :param css_url: optional URL of an extra stylesheet.
    :param jquery_url: URL of jQuery, ``'disable'`` to omit the script
        section, or a false value to look for a system copy.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    if not jquery_url:
        jquery_symlink = os.path.join(directory, "jquery.js")
        if os.path.exists(jquery_symlink):
            jquery_url = "./jquery.js"
        else:
            # lexists() but not exists(): a dangling symlink, replace it.
            if os.path.lexists(jquery_symlink):
                os.unlink(jquery_symlink)
            for path in JQUERY_SYSTEM_LOCATIONS:
                if os.path.exists(path):
                    # Link to the location that was actually found.
                    os.symlink(path, jquery_symlink)
                    jquery_url = "./jquery.js"
                    break
            if not jquery_url:
                logger.warning('--jquery was not specified and jQuery was not found in any known location. Disabling on-demand inline loading.')
                logger.debug('Locations searched: %s', ', '.join(JQUERY_SYSTEM_LOCATIONS))
    if jquery_url == 'disable':
        jquery_url = None

    # file_printer() already yields a size-limited print function.  Wrapping
    # it a second time made forced writes go through the inner limiter
    # without force, so they raised once the limit had been reached.
    with file_printer(directory, "index.html") as print_func:
        try:
            output_header(css_url, print_func)
            output_difference(difference, print_func, css_url, directory, [])
        except PrintLimitReached:
            logger.debug('print limit reached')
            print_func(u"<div class='error'>Max output size reached.</div>",
                       force=True)
        if jquery_url:
            print_func(SCRIPTS % {'jquery_url': escape(jquery_url)}, force=True)
        output_footer(print_func)