Commit 56fcdec5 authored by Ximin Luo's avatar Ximin Luo

Move ydiff/linediff from diffoscope.{difference => diff} to group unified_diff...

Move ydiff/linediff from diffoscope.{difference => diff} to group unified_diff related things together
parent 4afa1ccd
......@@ -289,3 +289,255 @@ def color_unified_diff(diff):
}[m.group(1)], m.group(0), RESET)
return re_diff_change.sub(repl, diff)
# Default marker strings that linediff() places before and after the
# differing part of a line.
DIFFON, DIFFOFF = "\x01", "\x02"
def _linediff_sane(x):
r = ""
for i in x:
j = ord(i)
if i not in ['\t', '\n'] and (j < 32):
r = r + "."
else:
r = r + i
return r
def linediff(s, t, diffon, diffoff):
    '''
    Character-based line diff, after the original diff2html algorithm.

    Both lines are first passed through ``_linediff_sane``.  An edit-distance
    table is then filled in, every cell holding ``(distance, predecessor)``,
    and the path from the origin to the last cell is replayed to wrap the
    differing characters in ``diffon`` / ``diffoff``.

    Returns a ``(marked_s, marked_t)`` tuple of strings.
    '''
    if s:
        s = ''.join(_linediff_sane(ch) for ch in s)
    if t:
        t = ''.join(_linediff_sane(ch) for ch in t)

    rows, cols = len(s), len(t)

    # table[i][j] == (edit distance between s[:i] and t[:j], predecessor cell)
    table = [[(0, 0)] * (cols + 1) for _ in range(rows + 1)]
    table[0][0] = (0, (0, 0))
    for i in range(1, rows + 1):
        table[i][0] = (i, (i - 1, 0))
    for j in range(1, cols + 1):
        table[0][j] = (j, (0, j - 1))

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            substitution = 0 if s[i - 1] == t[j - 1] else 1
            candidates = (
                (table[i - 1][j][0] + 1, (i - 1, j)),
                (table[i][j - 1][0] + 1, (i, j - 1)),
                (table[i - 1][j - 1][0] + substitution, (i - 1, j - 1)),
            )
            # Tuple ordering breaks distance ties on the predecessor coords.
            table[i][j] = min(candidates)

    # Walk the predecessor links back from the last cell, then put the
    # visited cells in origin-to-end order.
    path = []
    cell = (rows, cols)
    while cell != (0, 0):
        path.append(cell)
        cell = table[cell[0]][cell[1]][1]
    path.reverse()

    out_s = []
    out_t = []
    for cx, cy in path:
        distance, (fx, fy) = table[cx][cy]
        step = (cx - fx, cy - fy)
        if step == (0, 1):
            # Character present in t only.
            out_s.append("")
            out_t.append(diffon + t[fy] + diffoff)
        elif step == (1, 0):
            # Character present in s only.
            out_s.append(diffon + s[fx] + diffoff)
            out_t.append("")
        elif distance - table[fx][fy][0] == 1:
            # Diagonal step that cost one: a substituted character.
            out_s.append(diffon + s[fx] + diffoff)
            out_t.append(diffon + t[fy] + diffoff)
        else:
            out_s.append(s[fx])
            out_t.append(t[fy])

    # Adjacent marked characters are fused into a single marked run.
    seam = diffoff + diffon
    return ''.join(out_s).replace(seam, ''), ''.join(out_t).replace(seam, '')
class SideBySideDiff(object):
    """Calculates a side-by-side diff from a unified diff.

    Iterate over :meth:`items` to obtain the result.  ``diffon`` and
    ``diffoff`` are the marker strings handed to ``linediff`` to delimit the
    differing part of changed lines.
    """

    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
        self.unified_diff = unified_diff
        self.diffon = diffon
        self.diffoff = diffoff
        self.reset()

    def reset(self):
        """Reset all parsing state to its initial values."""
        # Pending (left, right) line pairs; None marks a side with no line.
        self.buf = []
        # Added / deleted lines counted since the buffer was last emptied.
        self.add_cpt = 0
        self.del_cpt = 0
        # Current line numbers on the left and on the right side.
        self.line1 = 0
        self.line2 = 0
        # Offsets taken from the current hunk header, and the number of
        # lines of that hunk still expected on each side.
        self.hunk_off1 = 0
        self.hunk_size1 = 0
        self.hunk_off2 = 0
        self.hunk_size2 = 0
        self._bytes_processed = 0

    @property
    def bytes_processed(self):
        """Running total of ``len(line) + 1`` over the lines consumed so far
        by :meth:`items`."""
        return self._bytes_processed

    def empty_buffer(self):
        """Yield the buffered lines as "L" items.

        When the buffer does not hold both additions and deletions, every
        buffered pair is emitted as is.  When it holds both, the left-hand
        and right-hand lines are paired up by position, the shorter side
        being padded with "".
        """
        if self.del_cpt == 0 or self.add_cpt == 0:
            # yield_line() rebinds self.buf; this loop keeps iterating over
            # the list object it started with.
            for left, right in self.buf:
                yield from self.yield_line(left, right)
            return

        lefts = [left for left, _ in self.buf if left is not None]
        rights = [right for _, right in self.buf if right is not None]
        for i in range(max(len(lefts), len(rights))):
            left = lefts[i] if i < len(lefts) else ""
            right = rights[i] if i < len(rights) else ""
            yield from self.yield_line(left, right)

    def yield_line(self, s1, s2):
        """Yield one "L" item for the pair (s1, s2), then advance the state.

        The pair is classified as "unmodified", "added", "deleted" or
        "changed"; changed pairs are run through ``linediff`` first.  After
        the item has been yielded the line counters are advanced and the
        buffer and the add/delete counters are cleared.
        """
        orig1 = s1
        orig2 = s2

        if s1 is None and s2 is None:
            type_name = "unmodified"
        elif s1 == "" and s2 == "":
            type_name = "unmodified"
        elif s1 is None or s1 == "":
            type_name = "added"
        elif s2 is None or s2 == "":
            type_name = "deleted"
        elif (orig1 == orig2
              and not s1.endswith('lines removed ]')
              and not s2.endswith('lines removed ]')):
            type_name = "unmodified"
        else:
            type_name = "changed"
            s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)

        yield "L", (type_name, s1, self.line1, s2, self.line2)

        # A "[ N lines removed ]" placeholder advances the counter by N,
        # any other non-empty line by one.
        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
        if m:
            self.line1 += int(m.group(1))
        elif orig1:
            self.line1 += 1

        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
        if m:
            self.line2 += int(m.group(1))
        elif orig2:
            self.line2 += 1

        self.add_cpt = 0
        self.del_cpt = 0
        self.buf = []

    def items(self):
        """Yield the items that form the side-by-side diff.

        Each item is a (type, value) tuple, as follows:

        type == "H", value is a tuple representing a hunk header
            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
            all ints

        type == "L", value is a tuple representing a line of a hunk
            mode, line1, lineno1, line2, lineno2 = value
            where mode is one of {"unmodified", "added", "deleted", "changed"}
            line* are strings
            lineno* are ints

        type == "C", value is a comment
            comment = value
            a string
        """
        self.reset()

        for line in self.unified_diff.splitlines():
            self._bytes_processed += len(line) + 1

            # File header lines only flush what is pending.
            if re.match(r'^--- ([^\s]*)', line):
                yield from self.empty_buffer()
                continue

            if re.match(r'^\+\+\+ ([^\s]*)', line):
                yield from self.empty_buffer()
                continue

            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", line)
            if m:
                yield from self.empty_buffer()
                # A size left out of the hunk header is taken to be 1.
                hunk_data = [1 if g == "" else int(g) for g in m.groups()]
                (self.hunk_off1, self.hunk_size1,
                 self.hunk_off2, self.hunk_size2) = hunk_data
                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
                yield "H", (self.hunk_off1, self.hunk_size1,
                            self.hunk_off2, self.hunk_size2)
                continue

            if re.match(r'^\[', line):
                yield from self.empty_buffer()
                yield "C", line
                # No "continue" here: the line is also checked below.

            if re.match(r"^\\ No newline", line):
                # Attach the marker to the most recently buffered line.
                left, right = self.buf[-1]
                if self.hunk_size2 == 0:
                    self.buf[-1] = (left, right + '\n' + line[2:])
                else:
                    # The original read a bare ``buf`` here and raised
                    # NameError; the pending pair lives in ``self.buf``.
                    self.buf[-1] = (left + '\n' + line[2:], right)
                continue

            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
                # Not inside a hunk.
                yield from self.empty_buffer()
                continue

            m = re.match(r"^\+\[ (\d+) lines removed \]$", line)
            if m:
                self.add_cpt += int(m.group(1))
                self.hunk_size2 -= int(m.group(1))
                self.buf.append((None, line[1:]))
                continue

            if re.match(r"^\+", line):
                self.add_cpt += 1
                self.hunk_size2 -= 1
                self.buf.append((None, line[1:]))
                continue

            m = re.match(r"^-\[ (\d+) lines removed \]$", line)
            if m:
                self.del_cpt += int(m.group(1))
                self.hunk_size1 -= int(m.group(1))
                self.buf.append((line[1:], None))
                continue

            if re.match(r"^-", line):
                self.del_cpt += 1
                self.hunk_size1 -= 1
                self.buf.append((line[1:], None))
                continue

            if re.match(r"^ ", line) and self.hunk_size1 and self.hunk_size2:
                yield from self.empty_buffer()
                self.hunk_size1 -= 1
                self.hunk_size2 -= 1
                self.buf.append((line[1:], line[1:]))
                continue

            yield from self.empty_buffer()

        yield from self.empty_buffer()
......@@ -17,9 +17,7 @@
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
import hashlib
import heapq
import re
import logging
from . import feeders
......@@ -325,256 +323,3 @@ class VisualDifference(object):
def size(self):
    """Return the combined length of the data type, content and source."""
    parts = (self.data_type, self.content, self.source)
    return sum(len(part) for part in parts)
# Default marker strings that linediff() places before and after the
# differing part of a line.
DIFFON, DIFFOFF = "\x01", "\x02"
def _linediff_sane(x):
r = ""
for i in x:
j = ord(i)
if i not in ['\t', '\n'] and (j < 32):
r = r + "."
else:
r = r + i
return r
def linediff(s, t, diffon, diffoff):
    '''
    Character-based line diff, after the original diff2html algorithm.

    Both lines are first passed through ``_linediff_sane``.  An edit-distance
    table is then filled in, every cell holding ``(distance, predecessor)``,
    and the path from the origin to the last cell is replayed to wrap the
    differing characters in ``diffon`` / ``diffoff``.

    Returns a ``(marked_s, marked_t)`` tuple of strings.
    '''
    if s:
        s = ''.join(_linediff_sane(ch) for ch in s)
    if t:
        t = ''.join(_linediff_sane(ch) for ch in t)

    rows, cols = len(s), len(t)

    # table[i][j] == (edit distance between s[:i] and t[:j], predecessor cell)
    table = [[(0, 0)] * (cols + 1) for _ in range(rows + 1)]
    table[0][0] = (0, (0, 0))
    for i in range(1, rows + 1):
        table[i][0] = (i, (i - 1, 0))
    for j in range(1, cols + 1):
        table[0][j] = (j, (0, j - 1))

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            substitution = 0 if s[i - 1] == t[j - 1] else 1
            candidates = (
                (table[i - 1][j][0] + 1, (i - 1, j)),
                (table[i][j - 1][0] + 1, (i, j - 1)),
                (table[i - 1][j - 1][0] + substitution, (i - 1, j - 1)),
            )
            # Tuple ordering breaks distance ties on the predecessor coords.
            table[i][j] = min(candidates)

    # Walk the predecessor links back from the last cell, then put the
    # visited cells in origin-to-end order.
    path = []
    cell = (rows, cols)
    while cell != (0, 0):
        path.append(cell)
        cell = table[cell[0]][cell[1]][1]
    path.reverse()

    out_s = []
    out_t = []
    for cx, cy in path:
        distance, (fx, fy) = table[cx][cy]
        step = (cx - fx, cy - fy)
        if step == (0, 1):
            # Character present in t only.
            out_s.append("")
            out_t.append(diffon + t[fy] + diffoff)
        elif step == (1, 0):
            # Character present in s only.
            out_s.append(diffon + s[fx] + diffoff)
            out_t.append("")
        elif distance - table[fx][fy][0] == 1:
            # Diagonal step that cost one: a substituted character.
            out_s.append(diffon + s[fx] + diffoff)
            out_t.append(diffon + t[fy] + diffoff)
        else:
            out_s.append(s[fx])
            out_t.append(t[fy])

    # Adjacent marked characters are fused into a single marked run.
    seam = diffoff + diffon
    return ''.join(out_s).replace(seam, ''), ''.join(out_t).replace(seam, '')
class SideBySideDiff(object):
    """Calculates a side-by-side diff from a unified diff.

    Iterate over :meth:`items` to obtain the result.  ``diffon`` and
    ``diffoff`` are the marker strings handed to ``linediff`` to delimit the
    differing part of changed lines.
    """

    def __init__(self, unified_diff, diffon=DIFFON, diffoff=DIFFOFF):
        self.unified_diff = unified_diff
        self.diffon = diffon
        self.diffoff = diffoff
        self.reset()

    def reset(self):
        """Reset all parsing state to its initial values."""
        # Pending (left, right) line pairs; None marks a side with no line.
        self.buf = []
        # Added / deleted lines counted since the buffer was last emptied.
        self.add_cpt = 0
        self.del_cpt = 0
        # Current line numbers on the left and on the right side.
        self.line1 = 0
        self.line2 = 0
        # Offsets taken from the current hunk header, and the number of
        # lines of that hunk still expected on each side.
        self.hunk_off1 = 0
        self.hunk_size1 = 0
        self.hunk_off2 = 0
        self.hunk_size2 = 0
        self._bytes_processed = 0

    @property
    def bytes_processed(self):
        """Running total of ``len(line) + 1`` over the lines consumed so far
        by :meth:`items`."""
        return self._bytes_processed

    def empty_buffer(self):
        """Yield the buffered lines as "L" items.

        When the buffer does not hold both additions and deletions, every
        buffered pair is emitted as is.  When it holds both, the left-hand
        and right-hand lines are paired up by position, the shorter side
        being padded with "".
        """
        if self.del_cpt == 0 or self.add_cpt == 0:
            # yield_line() rebinds self.buf; this loop keeps iterating over
            # the list object it started with.
            for left, right in self.buf:
                yield from self.yield_line(left, right)
            return

        lefts = [left for left, _ in self.buf if left is not None]
        rights = [right for _, right in self.buf if right is not None]
        for i in range(max(len(lefts), len(rights))):
            left = lefts[i] if i < len(lefts) else ""
            right = rights[i] if i < len(rights) else ""
            yield from self.yield_line(left, right)

    def yield_line(self, s1, s2):
        """Yield one "L" item for the pair (s1, s2), then advance the state.

        The pair is classified as "unmodified", "added", "deleted" or
        "changed"; changed pairs are run through ``linediff`` first.  After
        the item has been yielded the line counters are advanced and the
        buffer and the add/delete counters are cleared.
        """
        orig1 = s1
        orig2 = s2

        if s1 is None and s2 is None:
            type_name = "unmodified"
        elif s1 == "" and s2 == "":
            type_name = "unmodified"
        elif s1 is None or s1 == "":
            type_name = "added"
        elif s2 is None or s2 == "":
            type_name = "deleted"
        elif (orig1 == orig2
              and not s1.endswith('lines removed ]')
              and not s2.endswith('lines removed ]')):
            type_name = "unmodified"
        else:
            type_name = "changed"
            s1, s2 = linediff(s1, s2, self.diffon, self.diffoff)

        yield "L", (type_name, s1, self.line1, s2, self.line2)

        # A "[ N lines removed ]" placeholder advances the counter by N,
        # any other non-empty line by one.
        m = orig1 and re.match(r"^\[ (\d+) lines removed \]$", orig1)
        if m:
            self.line1 += int(m.group(1))
        elif orig1:
            self.line1 += 1

        m = orig2 and re.match(r"^\[ (\d+) lines removed \]$", orig2)
        if m:
            self.line2 += int(m.group(1))
        elif orig2:
            self.line2 += 1

        self.add_cpt = 0
        self.del_cpt = 0
        self.buf = []

    def items(self):
        """Yield the items that form the side-by-side diff.

        Each item is a (type, value) tuple, as follows:

        type == "H", value is a tuple representing a hunk header
            hunk_offset1, hunk_size1, hunk_offset2, hunk_size2 = value
            all ints

        type == "L", value is a tuple representing a line of a hunk
            mode, line1, lineno1, line2, lineno2 = value
            where mode is one of {"unmodified", "added", "deleted", "changed"}
            line* are strings
            lineno* are ints

        type == "C", value is a comment
            comment = value
            a string
        """
        self.reset()

        for line in self.unified_diff.splitlines():
            self._bytes_processed += len(line) + 1

            # File header lines only flush what is pending.
            if re.match(r'^--- ([^\s]*)', line):
                yield from self.empty_buffer()
                continue

            if re.match(r'^\+\+\+ ([^\s]*)', line):
                yield from self.empty_buffer()
                continue

            m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*)", line)
            if m:
                yield from self.empty_buffer()
                # A size left out of the hunk header is taken to be 1.
                hunk_data = [1 if g == "" else int(g) for g in m.groups()]
                (self.hunk_off1, self.hunk_size1,
                 self.hunk_off2, self.hunk_size2) = hunk_data
                self.line1, self.line2 = self.hunk_off1, self.hunk_off2
                yield "H", (self.hunk_off1, self.hunk_size1,
                            self.hunk_off2, self.hunk_size2)
                continue

            if re.match(r'^\[', line):
                yield from self.empty_buffer()
                yield "C", line
                # No "continue" here: the line is also checked below.

            if re.match(r"^\\ No newline", line):
                # Attach the marker to the most recently buffered line.
                left, right = self.buf[-1]
                if self.hunk_size2 == 0:
                    self.buf[-1] = (left, right + '\n' + line[2:])
                else:
                    # The original read a bare ``buf`` here and raised
                    # NameError; the pending pair lives in ``self.buf``.
                    self.buf[-1] = (left + '\n' + line[2:], right)
                continue

            if self.hunk_size1 <= 0 and self.hunk_size2 <= 0:
                # Not inside a hunk.
                yield from self.empty_buffer()
                continue

            m = re.match(r"^\+\[ (\d+) lines removed \]$", line)
            if m:
                self.add_cpt += int(m.group(1))
                self.hunk_size2 -= int(m.group(1))
                self.buf.append((None, line[1:]))
                continue

            if re.match(r"^\+", line):
                self.add_cpt += 1
                self.hunk_size2 -= 1
                self.buf.append((None, line[1:]))
                continue

            m = re.match(r"^-\[ (\d+) lines removed \]$", line)
            if m:
                self.del_cpt += int(m.group(1))
                self.hunk_size1 -= int(m.group(1))
                self.buf.append((line[1:], None))
                continue

            if re.match(r"^-", line):
                self.del_cpt += 1
                self.hunk_size1 -= 1
                self.buf.append((line[1:], None))
                continue

            if re.match(r"^ ", line) and self.hunk_size1 and self.hunk_size2:
                yield from self.empty_buffer()
                self.hunk_size1 -= 1
                self.hunk_size2 -= 1
                self.buf.append((line[1:], line[1:]))
                continue

            yield from self.empty_buffer()

        yield from self.empty_buffer()
......@@ -43,7 +43,7 @@ import contextlib
from diffoscope import VERSION
from diffoscope.config import Config
from diffoscope.difference import SideBySideDiff, DIFFON, DIFFOFF
from diffoscope.diff import SideBySideDiff, DIFFON, DIFFOFF
from ..icon import FAVICON_BASE64
from ..utils import PrintLimitReached, DiffBlockLimitReached, \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment