MyersDiff.java
/*
* Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
* Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de>
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.diff;
import java.text.MessageFormat;
import org.eclipse.jgit.errors.DiffInterruptedException;
import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.util.IntList;
import org.eclipse.jgit.util.LongList;
/**
* Diff algorithm, based on "An O(ND) Difference Algorithm and its Variations",
* by Eugene Myers.
* <p>
* The basic idea is to put the line numbers of text A as columns ("x") and the
* lines of text B as rows ("y"). Now you try to find the shortest "edit path"
* from the upper left corner to the lower right corner, where you can always go
* horizontally or vertically, but diagonally from (x,y) to (x+1,y+1) only if
* line x in text A is identical to line y in text B.
* <p>
* Myers' fundamental concept is the "furthest reaching D-path on diagonal k": a
* D-path is an edit path starting at the upper left corner and containing
* exactly D non-diagonal elements ("differences"). The furthest reaching D-path
* on diagonal k is the one that contains the most (diagonal) elements which
* ends on diagonal k (where k = y - x).
* <p>
* Example:
*
* <pre>
* H E L L O W O R L D
* ____
* L \___
* O \___
* W \________
* </pre>
* <p>
* Since every D-path has exactly D horizontal or vertical elements, it can only
* end on the diagonals -D, -D+2, ..., D-2, D.
* <p>
* Since every furthest reaching D-path contains at least one furthest reaching
* (D-1)-path (except for D=0), we can construct them recursively.
* <p>
* Since we are really interested in the shortest edit path, we can start
* looking for a 0-path, then a 1-path, and so on, until we find a path that
* ends in the lower right corner.
* <p>
* To save space, we do not need to store all paths (which has quadratic space
* requirements), but generate the D-paths simultaneously from both sides. When
* the ends meet, we will have found "the middle" of the path. From the end
* points of that diagonal part, we can generate the rest recursively.
* <p>
* This only requires linear space.
* <p>
* The overall (runtime) complexity is:
*
* <pre>
* O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
* = O(N * D^2 * 5 / 4) = O(N * D^2),
* </pre>
* <p>
* (With each step, we have to find the middle parts of twice as many regions as
* before, but the regions (as well as the D) are halved.)
* <p>
* So the overall runtime complexity stays the same with linear space, albeit
* with a larger constant factor.
*
* @param <S>
* type of sequence.
*/
@SuppressWarnings("hiding")
public class MyersDiff<S extends Sequence> {
/** Singleton instance of MyersDiff. */
public static final DiffAlgorithm INSTANCE = new LowLevelDiffAlgorithm() {
@SuppressWarnings("unused")
@Override
public <S extends Sequence> void diffNonCommon(EditList edits,
HashedSequenceComparator<S> cmp, HashedSequence<S> a,
HashedSequence<S> b, Edit region) {
new MyersDiff<>(edits, cmp, a, b, region);
}
};
/**
* The list of edits found during the last call to
* {@link #calculateEdits(Edit)}
*/
protected EditList edits;
/** Comparison function for sequences. */
protected HashedSequenceComparator<S> cmp;
/**
* The first text to be compared. Referred to as "Text A" in the comments
*/
protected HashedSequence<S> a;
/**
* The second text to be compared. Referred to as "Text B" in the comments
*/
protected HashedSequence<S> b;
private MyersDiff(EditList edits, HashedSequenceComparator<S> cmp,
HashedSequence<S> a, HashedSequence<S> b, Edit region) {
this.edits = edits;
this.cmp = cmp;
this.a = a;
this.b = b;
calculateEdits(region);
}
// TODO: use ThreadLocal for future multi-threaded operations
MiddleEdit middle = new MiddleEdit();
/**
* Entrypoint into the algorithm this class is all about. This method triggers that the
* differences between A and B are calculated in form of a list of edits.
* @param r portion of the sequences to examine.
*/
private void calculateEdits(Edit r) {
middle.initialize(r.beginA, r.endA, r.beginB, r.endB);
if (middle.beginA >= middle.endA &&
middle.beginB >= middle.endB)
return;
calculateEdits(middle.beginA, middle.endA,
middle.beginB, middle.endB);
}
/**
* Calculates the differences between a given part of A against another
* given part of B
*
* @param beginA
* start of the part of A which should be compared
* (0<=beginA<sizeof(A))
* @param endA
* end of the part of A which should be compared
* (beginA<=endA<sizeof(A))
* @param beginB
* start of the part of B which should be compared
* (0<=beginB<sizeof(B))
* @param endB
* end of the part of B which should be compared
* (beginB<=endB<sizeof(B))
*/
protected void calculateEdits(int beginA, int endA,
int beginB, int endB) {
Edit edit = middle.calculate(beginA, endA, beginB, endB);
if (beginA < edit.beginA || beginB < edit.beginB) {
int k = edit.beginB - edit.beginA;
int x = middle.backward.snake(k, edit.beginA);
calculateEdits(beginA, x, beginB, k + x);
}
if (edit.getType() != Edit.Type.EMPTY)
edits.add(edits.size(), edit);
// after middle
if (endA > edit.endA || endB > edit.endB) {
int k = edit.endB - edit.endA;
int x = middle.forward.snake(k, edit.endA);
calculateEdits(x, endA, k + x, endB);
}
}
/**
* A class to help bisecting the sequences a and b to find minimal
* edit paths.
*
* As the arrays are reused for space efficiency, you will need one
* instance per thread.
*
* The entry function is the calculate() method.
*/
class MiddleEdit {
void initialize(int beginA, int endA, int beginB, int endB) {
this.beginA = beginA; this.endA = endA;
this.beginB = beginB; this.endB = endB;
// strip common parts on either end
int k = beginB - beginA;
this.beginA = forward.snake(k, beginA);
this.beginB = k + this.beginA;
k = endB - endA;
this.endA = backward.snake(k, endA);
this.endB = k + this.endA;
}
/*
* This function calculates the "middle" Edit of the shortest
* edit path between the given subsequences of a and b.
*
* Once a forward path and a backward path meet, we found the
* middle part. From the last snake end point on both of them,
* we construct the Edit.
*
* It is assumed that there is at least one edit in the range.
*/
// TODO: measure speed impact when this is synchronized
Edit calculate(int beginA, int endA, int beginB, int endB) {
if (beginA == endA || beginB == endB)
return new Edit(beginA, endA, beginB, endB);
this.beginA = beginA; this.endA = endA;
this.beginB = beginB; this.endB = endB;
/*
* Following the conventions in Myers' paper, "k" is
* the difference between the index into "b" and the
* index into "a".
*/
int minK = beginB - endA;
int maxK = endB - beginA;
forward.initialize(beginB - beginA, beginA, minK, maxK);
backward.initialize(endB - endA, endA, minK, maxK);
for (int d = 1; ; d++)
if (forward.calculate(d) ||
backward.calculate(d))
return edit;
}
/*
* For each d, we need to hold the d-paths for the diagonals
* k = -d, -d + 2, ..., d - 2, d. These are stored in the
* forward (and backward) array.
*
* As we allow subsequences, too, this needs some refinement:
* the forward paths start on the diagonal forwardK =
* beginB - beginA, and backward paths start on the diagonal
* backwardK = endB - endA.
*
* So, we need to hold the forward d-paths for the diagonals
* k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
* the analogue for the backward d-paths. This means that
* we can turn (k, d) into the forward array index using this
* formula:
*
* i = (d + k - forwardK) / 2
*
* There is a further complication: the edit paths should not
* leave the specified subsequences, so k is bounded by
* minK = beginB - endA and maxK = endB - beginA. However,
* (k - forwardK) _must_ be odd whenever d is odd, and it
* _must_ be even when d is even.
*
* The values in the "forward" and "backward" arrays are
* positions ("x") in the sequence a, to get the corresponding
* positions ("y") in the sequence b, you have to calculate
* the appropriate k and then y:
*
* k = forwardK - d + i * 2
* y = k + x
*
* (substitute backwardK for forwardK if you want to get the
* y position for an entry in the "backward" array.
*/
EditPaths forward = new ForwardEditPaths();
EditPaths backward = new BackwardEditPaths();
/* Some variables which are shared between methods */
protected int beginA, endA, beginB, endB;
protected Edit edit;
abstract class EditPaths {
private IntList x = new IntList();
private LongList snake = new LongList();
int beginK, endK, middleK;
int prevBeginK, prevEndK;
/* if we hit one end early, no need to look further */
int minK, maxK; // TODO: better explanation
final int getIndex(int d, int k) {
// TODO: remove
if (((d + k - middleK) % 2) != 0)
throw new RuntimeException(MessageFormat.format(JGitText.get().unexpectedOddResult, Integer.valueOf(d), Integer.valueOf(k), Integer.valueOf(middleK)));
return (d + k - middleK) / 2;
}
final int getX(int d, int k) {
// TODO: remove
if (k < beginK || k > endK)
throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
return x.get(getIndex(d, k));
}
final long getSnake(int d, int k) {
// TODO: remove
if (k < beginK || k > endK)
throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
return snake.get(getIndex(d, k));
}
private int forceKIntoRange(int k) {
/* if k is odd, so must be the result */
if (k < minK)
return minK + ((k ^ minK) & 1);
else if (k > maxK)
return maxK - ((k ^ maxK) & 1);
return k;
}
void initialize(int k, int x, int minK, int maxK) {
this.minK = minK;
this.maxK = maxK;
beginK = endK = middleK = k;
this.x.clear();
this.x.add(x);
snake.clear();
snake.add(newSnake(k, x));
}
abstract int snake(int k, int x);
abstract int getLeft(int x);
abstract int getRight(int x);
abstract boolean isBetter(int left, int right);
abstract void adjustMinMaxK(int k, int x);
abstract boolean meets(int d, int k, int x, long snake);
final long newSnake(int k, int x) {
long y = k + x;
long ret = ((long) x) << 32;
return ret | y;
}
final int snake2x(long snake) {
return (int) (snake >>> 32);
}
final int snake2y(long snake) {
return (int) snake;
}
final boolean makeEdit(long snake1, long snake2) {
int x1 = snake2x(snake1), x2 = snake2x(snake2);
int y1 = snake2y(snake1), y2 = snake2y(snake2);
/*
* Check for incompatible partial edit paths:
* when there are ambiguities, we might have
* hit incompatible (i.e. non-overlapping)
* forward/backward paths.
*
* In that case, just pretend that we have
* an empty edit at the end of one snake; this
* will force a decision which path to take
* in the next recursion step.
*/
if (x1 > x2 || y1 > y2) {
x1 = x2;
y1 = y2;
}
edit = new Edit(x1, x2, y1, y2);
return true;
}
boolean calculate(int d) {
prevBeginK = beginK;
prevEndK = endK;
beginK = forceKIntoRange(middleK - d);
endK = forceKIntoRange(middleK + d);
// TODO: handle i more efficiently
// TODO: walk snake(k, getX(d, k)) only once per (d, k)
// TODO: move end points out of the loop to avoid conditionals inside the loop
// go backwards so that we can avoid temp vars
for (int k = endK; k >= beginK; k -= 2) {
if (Thread.interrupted()) {
throw new DiffInterruptedException();
}
int left = -1, right = -1;
long leftSnake = -1L, rightSnake = -1L;
// TODO: refactor into its own function
if (k > prevBeginK) {
int i = getIndex(d - 1, k - 1);
left = x.get(i);
int end = snake(k - 1, left);
leftSnake = left != end ?
newSnake(k - 1, end) :
snake.get(i);
if (meets(d, k - 1, end, leftSnake))
return true;
left = getLeft(end);
}
if (k < prevEndK) {
int i = getIndex(d - 1, k + 1);
right = x.get(i);
int end = snake(k + 1, right);
rightSnake = right != end ?
newSnake(k + 1, end) :
snake.get(i);
if (meets(d, k + 1, end, rightSnake))
return true;
right = getRight(end);
}
int newX;
long newSnake;
if (k >= prevEndK ||
(k > prevBeginK &&
isBetter(left, right))) {
newX = left;
newSnake = leftSnake;
}
else {
newX = right;
newSnake = rightSnake;
}
if (meets(d, k, newX, newSnake))
return true;
adjustMinMaxK(k, newX);
int i = getIndex(d, k);
x.set(i, newX);
snake.set(i, newSnake);
}
return false;
}
}
class ForwardEditPaths extends EditPaths {
@Override
final int snake(int k, int x) {
for (; x < endA && k + x < endB; x++)
if (!cmp.equals(a, x, b, k + x))
break;
return x;
}
@Override
final int getLeft(int x) {
return x;
}
@Override
final int getRight(int x) {
return x + 1;
}
@Override
final boolean isBetter(int left, int right) {
return left > right;
}
@Override
final void adjustMinMaxK(int k, int x) {
if (x >= endA || k + x >= endB) {
if (k > backward.middleK)
maxK = k;
else
minK = k;
}
}
@Override
final boolean meets(int d, int k, int x, long snake) {
if (k < backward.beginK || k > backward.endK)
return false;
// TODO: move out of loop
if (((d - 1 + k - backward.middleK) % 2) != 0)
return false;
if (x < backward.getX(d - 1, k))
return false;
makeEdit(snake, backward.getSnake(d - 1, k));
return true;
}
}
class BackwardEditPaths extends EditPaths {
@Override
final int snake(int k, int x) {
for (; x > beginA && k + x > beginB; x--)
if (!cmp.equals(a, x - 1, b, k + x - 1))
break;
return x;
}
@Override
final int getLeft(int x) {
return x - 1;
}
@Override
final int getRight(int x) {
return x;
}
@Override
final boolean isBetter(int left, int right) {
return left < right;
}
@Override
final void adjustMinMaxK(int k, int x) {
if (x <= beginA || k + x <= beginB) {
if (k > forward.middleK)
maxK = k;
else
minK = k;
}
}
@Override
final boolean meets(int d, int k, int x, long snake) {
if (k < forward.beginK || k > forward.endK)
return false;
// TODO: move out of loop
if (((d + k - forward.middleK) % 2) != 0)
return false;
if (x > forward.getX(d, k))
return false;
makeEdit(forward.getSnake(d, k), snake);
return true;
}
}
}
/**
* Main method
*
* @param args
* two filenames specifying the contents to be diffed
*/
public static void main(String[] args) {
if (args.length != 2) {
System.err.println(JGitText.get().need2Arguments);
System.exit(1);
}
try {
RawText a = new RawText(new java.io.File(args[0]));
RawText b = new RawText(new java.io.File(args[1]));
EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
System.out.println(r.toString());
} catch (Exception e) {
e.printStackTrace();
}
}
}