DiffMyerspublic class DiffMyers extends Object A class to compare vectors of objects. The result of comparison
is a list of change objects which form an
edit script. The objects compared are traditionally lines
of text from two files. Comparison options such as "ignore
whitespace" are implemented by modifying the equals
and hashcode methods for the objects compared.
The basic algorithm is described in:
"An O(ND) Difference Algorithm and its Variations", Eugene Myers,
Algorithmica Vol. 1 No. 2, 1986, p 251.
This class outputs different results from GNU diff 1.15 on some
inputs. Our results are actually better (smaller change list, smaller
total size of changes), but it would be nice to know why. Perhaps
there is a memory overwrite bug in GNU diff 1.15. |
Fields Summary |
---|
private int | equiv_max1 more than the maximum equivalence value used for this or its
sibling file. | public boolean | heuristicWhen set to true, the comparison uses a heuristic to speed it up.
With this heuristic, for files with a constant small density
of changes, the algorithm is linear in the file size. | public boolean | no_discardsWhen set to true, the algorithm returns a guarranteed minimal
set of changes. This makes things slower, sometimes much slower. | private int[] | xvec | private int[] | yvec | private int[] | fdiag | private int[] | bdiag | private int | fdiagoff | private int | bdiagoff | private final file_data[] | filevec | private int | cost | private boolean | inhibit |
Constructors Summary |
---|
public DiffMyers(Object[] a, Object[] b)Prepare to find differences between two arrays. Each element of
the arrays is translated to an "equivalence number" based on
the result of equals . The original Object arrays
are no longer needed for computing the differences. They will
be needed again later to print the results of the comparison as
an edit script, if desired.
Hashtable h = new Hashtable(a.length + b.length);
filevec[0] = new file_data(a,h);
filevec[1] = new file_data(b,h);
|
Methods Summary |
---|
private jdiff.DiffMyers$change | build_reverse_script()Scan the tables of which lines are inserted and deleted,
producing an edit script in reverse order.
change script = null;
final boolean[] changed0 = filevec[0].changed_flag;
final boolean[] changed1 = filevec[1].changed_flag;
final int len0 = filevec[0].buffered_lines;
final int len1 = filevec[1].buffered_lines;
/* Note that changedN[len0] does exist, and contains 0. */
int i0 = 0, i1 = 0;
while (i0 < len0 || i1 < len1)
{
if (changed0[1+i0] || changed1[1+i1])
{
int line0 = i0, line1 = i1;
/* Find # lines changed here in each file. */
while (changed0[1+i0]) ++i0;
while (changed1[1+i1]) ++i1;
/* Record this change. */
script = new change(line0, line1, i0 - line0, i1 - line1, script);
}
/* We have reached lines in the two files that match each other. */
i0++; i1++;
}
return script;
| private jdiff.DiffMyers$change | build_script()Scan the tables of which lines are inserted and deleted,
producing an edit script in forward order.
change script = null;
final boolean[] changed0 = filevec[0].changed_flag;
final boolean[] changed1 = filevec[1].changed_flag;
final int len0 = filevec[0].buffered_lines;
final int len1 = filevec[1].buffered_lines;
int i0 = len0, i1 = len1;
/* Note that changedN[-1] does exist, and contains 0. */
while (i0 >= 0 || i1 >= 0)
{
if (changed0[i0] || changed1[i1])
{
int line0 = i0, line1 = i1;
/* Find # lines changed here in each file. */
while (changed0[i0]) --i0;
while (changed1[i1]) --i1;
/* Record this change. */
script = new change(i0, i1, line0 - i0, line1 - i1, script);
}
/* We have reached lines in the two files that match each other. */
i0--; i1--;
}
return script;
| private void | compareseq(int xoff, int xlim, int yoff, int ylim)Compare in detail contiguous subsequences of the two files
which are known, as a whole, to match each other.
The results are recorded in the vectors filevec[N].changed_flag, by
storing a 1 in the element for each line that is an insertion or deletion.
The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
Note that XLIM, YLIM are exclusive bounds.
All line numbers are origin-0 and discarded lines are not counted.
/* Slide down the bottom initial diagonal. */
while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
++xoff; ++yoff;
}
/* Slide up the top initial diagonal. */
while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
--xlim; --ylim;
}
/* Handle simple cases. */
if (xoff == xlim)
while (yoff < ylim)
filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true;
else if (yoff == ylim)
while (xoff < xlim)
filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true;
else
{
/* Find a point of correspondence in the middle of the files. */
int d = diag (xoff, xlim, yoff, ylim);
int c = cost;
int f = fdiag[fdiagoff + d];
int b = bdiag[bdiagoff + d];
if (c == 1)
{
/* This should be impossible, because it implies that
one of the two subsequences is empty,
and that case was handled above without calling `diag'.
Let's verify that this is true. */
throw new IllegalArgumentException("Empty subsequence");
}
else
{
/* Use that point to split this problem into two subproblems. */
compareseq (xoff, b, yoff, b - d);
/* This used to use f instead of b,
but that is incorrect!
It is not necessarily the case that diagonal d
has a snake from b to f. */
compareseq (b, xlim, b - d, ylim);
}
}
| private int | diag(int xoff, int xlim, int yoff, int ylim)Find the midpoint of the shortest edit script for a specified
portion of the two files.
We scan from the beginnings of the files, and simultaneously from the ends,
doing a breadth-first search through the space of edit-sequence.
When the two searches meet, we have found the midpoint of the shortest
edit sequence.
The value returned is the number of the diagonal on which the midpoint lies.
The diagonal number equals the number of inserted lines minus the number
of deleted lines (counting only lines before the midpoint).
The edit cost is stored into COST; this is the total number of
lines inserted or deleted (counting only lines before the midpoint).
This function assumes that the first lines of the specified portions
of the two files do not match, and likewise that the last lines do not
match. The caller must trim matching lines from the beginning and end
of the portions it is going to specify.
Note that if we return the "wrong" diagonal value, or if
the value of bdiag at that diagonal is "wrong",
the worst this can do is cause suboptimal diff output.
It cannot cause incorrect diff output.
final int[] fd = fdiag; // Give the compiler a chance.
final int[] bd = bdiag; // Additional help for the compiler.
final int[] xv = xvec; // Still more help for the compiler.
final int[] yv = yvec; // And more and more . . .
final int dmin = xoff - ylim; // Minimum valid diagonal.
final int dmax = xlim - yoff; // Maximum valid diagonal.
final int fmid = xoff - yoff; // Center diagonal of top-down search.
final int bmid = xlim - ylim; // Center diagonal of bottom-up search.
int fmin = fmid, fmax = fmid; // Limits of top-down search.
int bmin = bmid, bmax = bmid; // Limits of bottom-up search.
/* True if southeast corner is on an odd
diagonal with respect to the northwest. */
final boolean odd = (fmid - bmid & 1) != 0;
fd[fdiagoff + fmid] = xoff;
bd[bdiagoff + bmid] = xlim;
for (int c = 1;; ++c)
{
int d; /* Active diagonal. */
boolean big_snake = false;
/* Extend the top-down search by an edit step in each diagonal. */
if (fmin > dmin)
fd[fdiagoff + --fmin - 1] = -1;
else
++fmin;
if (fmax < dmax)
fd[fdiagoff + ++fmax + 1] = -1;
else
--fmax;
for (d = fmax; d >= fmin; d -= 2)
{
int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
if (tlo >= thi)
x = tlo + 1;
else
x = thi;
oldx = x;
y = x - d;
while (x < xlim && y < ylim && xv[x] == yv[y]) {
++x; ++y;
}
if (x - oldx > 20)
big_snake = true;
fd[fdiagoff + d] = x;
if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
{
cost = 2 * c - 1;
return d;
}
}
/* Similar extend the bottom-up search. */
if (bmin > dmin)
bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
else
++bmin;
if (bmax < dmax)
bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
else
--bmax;
for (d = bmax; d >= bmin; d -= 2)
{
int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
if (tlo < thi)
x = tlo;
else
x = thi - 1;
oldx = x;
y = x - d;
while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
--x; --y;
}
if (oldx - x > 20)
big_snake = true;
bd[bdiagoff + d] = x;
if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
{
cost = 2 * c;
return d;
}
}
/* Heuristic: check occasionally for a diagonal that has made
lots of progress compared with the edit distance.
If we have any such, find the one that has made the most
progress and return it as if it had succeeded.
With this heuristic, for files with a constant small density
of changes, the algorithm is linear in the file size. */
if (c > 200 && big_snake && heuristic)
{
int best = 0;
int bestpos = -1;
for (d = fmax; d >= fmin; d -= 2)
{
int dd = d - fmid;
if ((fd[fdiagoff + d] - xoff)*2 - dd > 12 * (c + (dd > 0 ? dd : -dd)))
{
if (fd[fdiagoff + d] * 2 - dd > best
&& fd[fdiagoff + d] - xoff > 20
&& fd[fdiagoff + d] - d - yoff > 20)
{
int k;
int x = fd[fdiagoff + d];
/* We have a good enough best diagonal;
now insist that it end with a significant snake. */
for (k = 1; k <= 20; k++)
if (xvec[x - k] != yvec[x - d - k])
break;
if (k == 21)
{
best = fd[fdiagoff + d] * 2 - dd;
bestpos = d;
}
}
}
}
if (best > 0)
{
cost = 2 * c - 1;
return bestpos;
}
best = 0;
for (d = bmax; d >= bmin; d -= 2)
{
int dd = d - bmid;
if ((xlim - bd[bdiagoff + d])*2 + dd > 12 * (c + (dd > 0 ? dd : -dd)))
{
if ((xlim - bd[bdiagoff + d]) * 2 + dd > best
&& xlim - bd[bdiagoff + d] > 20
&& ylim - (bd[bdiagoff + d] - d) > 20)
{
/* We have a good enough best diagonal;
now insist that it end with a significant snake. */
int k;
int x = bd[bdiagoff + d];
for (k = 0; k < 20; k++)
if (xvec[x + k] != yvec[x - d + k])
break;
if (k == 20)
{
best = (xlim - bd[bdiagoff + d]) * 2 + dd;
bestpos = d;
}
}
}
}
if (best > 0)
{
cost = 2 * c - 1;
return bestpos;
}
}
}
| public jdiff.DiffMyers$change | diff_2(boolean reverse)
/* Some lines are obviously insertions or deletions
because they don't match anything. Detect them now,
and avoid even thinking about them in the main comparison algorithm. */
discard_confusing_lines ();
/* Now do the main comparison algorithm, considering just the
undiscarded lines. */
xvec = filevec[0].undiscarded;
yvec = filevec[1].undiscarded;
int diags =
filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
fdiag = new int[diags];
fdiagoff = filevec[1].nondiscarded_lines + 1;
bdiag = new int[diags];
bdiagoff = filevec[1].nondiscarded_lines + 1;
compareseq (0, filevec[0].nondiscarded_lines,
0, filevec[1].nondiscarded_lines);
fdiag = null;
bdiag = null;
/* Modify the results slightly to make them prettier
in cases where that can validly be done. */
shift_boundaries ();
/* Get the results of comparison in the form of a chain
of `struct change's -- an edit script. */
if (reverse)
return build_reverse_script();
else
return build_script();
| private void | discard_confusing_lines()Discard lines from one file that have no matches in the other file.
filevec[0].discard_confusing_lines(filevec[1]);
filevec[1].discard_confusing_lines(filevec[0]);
| private void | shift_boundaries()Adjust inserts/deletes of blank lines to join changes
as much as possible.
if (inhibit)
return;
filevec[0].shift_boundaries(filevec[1]);
filevec[1].shift_boundaries(filevec[0]);
|
|