1 /*
2 * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
3 * Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de>
4 * and other copyright owners as documented in the project's IP log.
5 *
6 * This program and the accompanying materials are made available
7 * under the terms of the Eclipse Distribution License v1.0 which
8 * accompanies this distribution, is reproduced below, and is
9 * available at http://www.eclipse.org/org/documents/edl-v10.php
10 *
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials provided
23 * with the distribution.
24 *
25 * - Neither the name of the Eclipse Foundation, Inc. nor the
26 * names of its contributors may be used to endorse or promote
27 * products derived from this software without specific prior
28 * written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 package org.eclipse.jgit.diff;
46
47 import java.text.MessageFormat;
48
49 import org.eclipse.jgit.errors.DiffInterruptedException;
50 import org.eclipse.jgit.internal.JGitText;
51 import org.eclipse.jgit.util.IntList;
52 import org.eclipse.jgit.util.LongList;
53
54 /**
55 * Diff algorithm, based on "An O(ND) Difference Algorithm and its Variations",
56 * by Eugene Myers.
57 * <p>
58 * The basic idea is to put the line numbers of text A as columns ("x") and the
59 * lines of text B as rows ("y"). Now you try to find the shortest "edit path"
60 * from the upper left corner to the lower right corner, where you can always go
61 * horizontally or vertically, but diagonally from (x,y) to (x+1,y+1) only if
62 * line x in text A is identical to line y in text B.
63 * <p>
64 * Myers' fundamental concept is the "furthest reaching D-path on diagonal k": a
65 * D-path is an edit path starting at the upper left corner and containing
66 * exactly D non-diagonal elements ("differences"). The furthest reaching D-path
67 * on diagonal k is the one that contains the most (diagonal) elements which
68 * ends on diagonal k (where k = y - x).
69 * <p>
70 * Example:
71 *
72 * <pre>
73 * H E L L O W O R L D
74 * ____
75 * L \___
76 * O \___
77 * W \________
78 * </pre>
79 * <p>
80 * Since every D-path has exactly D horizontal or vertical elements, it can only
81 * end on the diagonals -D, -D+2, ..., D-2, D.
82 * <p>
83 * Since every furthest reaching D-path contains at least one furthest reaching
84 * (D-1)-path (except for D=0), we can construct them recursively.
85 * <p>
86 * Since we are really interested in the shortest edit path, we can start
87 * looking for a 0-path, then a 1-path, and so on, until we find a path that
88 * ends in the lower right corner.
89 * <p>
90 * To save space, we do not need to store all paths (which has quadratic space
91 * requirements), but generate the D-paths simultaneously from both sides. When
92 * the ends meet, we will have found "the middle" of the path. From the end
93 * points of that diagonal part, we can generate the rest recursively.
94 * <p>
95 * This only requires linear space.
96 * <p>
97 * The overall (runtime) complexity is:
98 *
99 * <pre>
100 * O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
101 * = O(N * D^2 * 5 / 4) = O(N * D^2),
102 * </pre>
103 * <p>
104 * (With each step, we have to find the middle parts of twice as many regions as
105 * before, but the regions (as well as the D) are halved.)
106 * <p>
107 * So the overall runtime complexity stays the same with linear space, albeit
108 * with a larger constant factor.
109 *
110 * @param <S>
111 * type of sequence.
112 */
113 @SuppressWarnings("hiding")
114 public class MyersDiff<S extends Sequence> {
115 /** Singleton instance of MyersDiff. */
116 public static final DiffAlgorithm INSTANCE = new LowLevelDiffAlgorithm() {
117 @SuppressWarnings("unused")
118 @Override
119 public <S extends Sequence> void diffNonCommon(EditList edits,
120 HashedSequenceComparator<S> cmp, HashedSequence<S> a,
121 HashedSequence<S> b, Edit region) {
122 new MyersDiff<>(edits, cmp, a, b, region);
123 }
124 };
125
126 /**
127 * The list of edits found during the last call to
128 * {@link #calculateEdits(Edit)}
129 */
130 protected EditList edits;
131
132 /** Comparison function for sequences. */
133 protected HashedSequenceComparator<S> cmp;
134
135 /**
136 * The first text to be compared. Referred to as "Text A" in the comments
137 */
138 protected HashedSequence<S> a;
139
140 /**
141 * The second text to be compared. Referred to as "Text B" in the comments
142 */
143 protected HashedSequence<S> b;
144
145 private MyersDiff(EditList edits, HashedSequenceComparator<S> cmp,
146 HashedSequence<S> a, HashedSequence<S> b, Edit region) {
147 this.edits = edits;
148 this.cmp = cmp;
149 this.a = a;
150 this.b = b;
151 calculateEdits(region);
152 }
153
154 // TODO: use ThreadLocal for future multi-threaded operations
155 MiddleEdit middle = new MiddleEdit();
156
157 /**
158 * Entrypoint into the algorithm this class is all about. This method triggers that the
159 * differences between A and B are calculated in form of a list of edits.
160 * @param r portion of the sequences to examine.
161 */
162 private void calculateEdits(Edit r) {
163 middle.initialize(r.beginA, r.endA, r.beginB, r.endB);
164 if (middle.beginA >= middle.endA &&
165 middle.beginB >= middle.endB)
166 return;
167
168 calculateEdits(middle.beginA, middle.endA,
169 middle.beginB, middle.endB);
170 }
171
172 /**
173 * Calculates the differences between a given part of A against another
174 * given part of B
175 *
176 * @param beginA
177 * start of the part of A which should be compared
178 * (0<=beginA<sizeof(A))
179 * @param endA
180 * end of the part of A which should be compared
181 * (beginA<=endA<sizeof(A))
182 * @param beginB
183 * start of the part of B which should be compared
184 * (0<=beginB<sizeof(B))
185 * @param endB
186 * end of the part of B which should be compared
187 * (beginB<=endB<sizeof(B))
188 */
189 protected void calculateEdits(int beginA, int endA,
190 int beginB, int endB) {
191 Edit edit = middle.calculate(beginA, endA, beginB, endB);
192
193 if (beginA < edit.beginA || beginB < edit.beginB) {
194 int k = edit.beginB - edit.beginA;
195 int x = middle.backward.snake(k, edit.beginA);
196 calculateEdits(beginA, x, beginB, k + x);
197 }
198
199 if (edit.getType() != Edit.Type.EMPTY)
200 edits.add(edits.size(), edit);
201
202 // after middle
203 if (endA > edit.endA || endB > edit.endB) {
204 int k = edit.endB - edit.endA;
205 int x = middle.forward.snake(k, edit.endA);
206 calculateEdits(x, endA, k + x, endB);
207 }
208 }
209
210 /**
211 * A class to help bisecting the sequences a and b to find minimal
212 * edit paths.
213 *
214 * As the arrays are reused for space efficiency, you will need one
215 * instance per thread.
216 *
217 * The entry function is the calculate() method.
218 */
219 class MiddleEdit {
220 void initialize(int beginA, int endA, int beginB, int endB) {
221 this.beginA = beginA; this.endA = endA;
222 this.beginB = beginB; this.endB = endB;
223
224 // strip common parts on either end
225 int k = beginB - beginA;
226 this.beginA = forward.snake(k, beginA);
227 this.beginB = k + this.beginA;
228
229 k = endB - endA;
230 this.endA = backward.snake(k, endA);
231 this.endB = k + this.endA;
232 }
233
234 /*
235 * This function calculates the "middle" Edit of the shortest
236 * edit path between the given subsequences of a and b.
237 *
238 * Once a forward path and a backward path meet, we found the
239 * middle part. From the last snake end point on both of them,
240 * we construct the Edit.
241 *
242 * It is assumed that there is at least one edit in the range.
243 */
244 // TODO: measure speed impact when this is synchronized
245 Edit calculate(int beginA, int endA, int beginB, int endB) {
246 if (beginA == endA || beginB == endB)
247 return new Edit(beginA, endA, beginB, endB);
248 this.beginA = beginA; this.endA = endA;
249 this.beginB = beginB; this.endB = endB;
250
251 /*
252 * Following the conventions in Myers' paper, "k" is
253 * the difference between the index into "b" and the
254 * index into "a".
255 */
256 int minK = beginB - endA;
257 int maxK = endB - beginA;
258
259 forward.initialize(beginB - beginA, beginA, minK, maxK);
260 backward.initialize(endB - endA, endA, minK, maxK);
261
262 for (int d = 1; ; d++)
263 if (forward.calculate(d) ||
264 backward.calculate(d))
265 return edit;
266 }
267
268 /*
269 * For each d, we need to hold the d-paths for the diagonals
270 * k = -d, -d + 2, ..., d - 2, d. These are stored in the
271 * forward (and backward) array.
272 *
273 * As we allow subsequences, too, this needs some refinement:
274 * the forward paths start on the diagonal forwardK =
275 * beginB - beginA, and backward paths start on the diagonal
276 * backwardK = endB - endA.
277 *
278 * So, we need to hold the forward d-paths for the diagonals
279 * k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
280 * the analogue for the backward d-paths. This means that
281 * we can turn (k, d) into the forward array index using this
282 * formula:
283 *
284 * i = (d + k - forwardK) / 2
285 *
286 * There is a further complication: the edit paths should not
287 * leave the specified subsequences, so k is bounded by
288 * minK = beginB - endA and maxK = endB - beginA. However,
289 * (k - forwardK) _must_ be odd whenever d is odd, and it
290 * _must_ be even when d is even.
291 *
292 * The values in the "forward" and "backward" arrays are
293 * positions ("x") in the sequence a, to get the corresponding
294 * positions ("y") in the sequence b, you have to calculate
295 * the appropriate k and then y:
296 *
297 * k = forwardK - d + i * 2
298 * y = k + x
299 *
300 * (substitute backwardK for forwardK if you want to get the
301 * y position for an entry in the "backward" array.
302 */
303 EditPaths forward = new ForwardEditPaths();
304 EditPaths backward = new BackwardEditPaths();
305
306 /* Some variables which are shared between methods */
307 protected int beginA, endA, beginB, endB;
308 protected Edit edit;
309
310 abstract class EditPaths {
311 private IntList x = new IntList();
312 private LongList snake = new LongList();
313 int beginK, endK, middleK;
314 int prevBeginK, prevEndK;
315 /* if we hit one end early, no need to look further */
316 int minK, maxK; // TODO: better explanation
317
318 final int getIndex(int d, int k) {
319 // TODO: remove
320 if (((d + k - middleK) % 2) != 0)
321 throw new RuntimeException(MessageFormat.format(JGitText.get().unexpectedOddResult, Integer.valueOf(d), Integer.valueOf(k), Integer.valueOf(middleK)));
322 return (d + k - middleK) / 2;
323 }
324
325 final int getX(int d, int k) {
326 // TODO: remove
327 if (k < beginK || k > endK)
328 throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
329 return x.get(getIndex(d, k));
330 }
331
332 final long getSnake(int d, int k) {
333 // TODO: remove
334 if (k < beginK || k > endK)
335 throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
336 return snake.get(getIndex(d, k));
337 }
338
339 private int forceKIntoRange(int k) {
340 /* if k is odd, so must be the result */
341 if (k < minK)
342 return minK + ((k ^ minK) & 1);
343 else if (k > maxK)
344 return maxK - ((k ^ maxK) & 1);
345 return k;
346 }
347
348 void initialize(int k, int x, int minK, int maxK) {
349 this.minK = minK;
350 this.maxK = maxK;
351 beginK = endK = middleK = k;
352 this.x.clear();
353 this.x.add(x);
354 snake.clear();
355 snake.add(newSnake(k, x));
356 }
357
358 abstract int snake(int k, int x);
359 abstract int getLeft(int x);
360 abstract int getRight(int x);
361 abstract boolean isBetter(int left, int right);
362 abstract void adjustMinMaxK(int k, int x);
363 abstract boolean meets(int d, int k, int x, long snake);
364
365 final long newSnake(int k, int x) {
366 long y = k + x;
367 long ret = ((long) x) << 32;
368 return ret | y;
369 }
370
371 final int snake2x(long snake) {
372 return (int) (snake >>> 32);
373 }
374
375 final int snake2y(long snake) {
376 return (int) snake;
377 }
378
379 final boolean makeEdit(long snake1, long snake2) {
380 int x1 = snake2x(snake1), x2 = snake2x(snake2);
381 int y1 = snake2y(snake1), y2 = snake2y(snake2);
382 /*
383 * Check for incompatible partial edit paths:
384 * when there are ambiguities, we might have
385 * hit incompatible (i.e. non-overlapping)
386 * forward/backward paths.
387 *
388 * In that case, just pretend that we have
389 * an empty edit at the end of one snake; this
390 * will force a decision which path to take
391 * in the next recursion step.
392 */
393 if (x1 > x2 || y1 > y2) {
394 x1 = x2;
395 y1 = y2;
396 }
397 edit = new Edit(x1, x2, y1, y2);
398 return true;
399 }
400
401 boolean calculate(int d) {
402 prevBeginK = beginK;
403 prevEndK = endK;
404 beginK = forceKIntoRange(middleK - d);
405 endK = forceKIntoRange(middleK + d);
406 // TODO: handle i more efficiently
407 // TODO: walk snake(k, getX(d, k)) only once per (d, k)
408 // TODO: move end points out of the loop to avoid conditionals inside the loop
409 // go backwards so that we can avoid temp vars
410 for (int k = endK; k >= beginK; k -= 2) {
411 if (Thread.interrupted()) {
412 throw new DiffInterruptedException();
413 }
414 int left = -1, right = -1;
415 long leftSnake = -1L, rightSnake = -1L;
416 // TODO: refactor into its own function
417 if (k > prevBeginK) {
418 int i = getIndex(d - 1, k - 1);
419 left = x.get(i);
420 int end = snake(k - 1, left);
421 leftSnake = left != end ?
422 newSnake(k - 1, end) :
423 snake.get(i);
424 if (meets(d, k - 1, end, leftSnake))
425 return true;
426 left = getLeft(end);
427 }
428 if (k < prevEndK) {
429 int i = getIndex(d - 1, k + 1);
430 right = x.get(i);
431 int end = snake(k + 1, right);
432 rightSnake = right != end ?
433 newSnake(k + 1, end) :
434 snake.get(i);
435 if (meets(d, k + 1, end, rightSnake))
436 return true;
437 right = getRight(end);
438 }
439 int newX;
440 long newSnake;
441 if (k >= prevEndK ||
442 (k > prevBeginK &&
443 isBetter(left, right))) {
444 newX = left;
445 newSnake = leftSnake;
446 }
447 else {
448 newX = right;
449 newSnake = rightSnake;
450 }
451 if (meets(d, k, newX, newSnake))
452 return true;
453 adjustMinMaxK(k, newX);
454 int i = getIndex(d, k);
455 x.set(i, newX);
456 snake.set(i, newSnake);
457 }
458 return false;
459 }
460 }
461
462 class ForwardEditPaths extends EditPaths {
463 @Override
464 final int snake(int k, int x) {
465 for (; x < endA && k + x < endB; x++)
466 if (!cmp.equals(a, x, b, k + x))
467 break;
468 return x;
469 }
470
471 @Override
472 final int getLeft(int x) {
473 return x;
474 }
475
476 @Override
477 final int getRight(int x) {
478 return x + 1;
479 }
480
481 @Override
482 final boolean isBetter(int left, int right) {
483 return left > right;
484 }
485
486 @Override
487 final void adjustMinMaxK(int k, int x) {
488 if (x >= endA || k + x >= endB) {
489 if (k > backward.middleK)
490 maxK = k;
491 else
492 minK = k;
493 }
494 }
495
496 @Override
497 final boolean meets(int d, int k, int x, long snake) {
498 if (k < backward.beginK || k > backward.endK)
499 return false;
500 // TODO: move out of loop
501 if (((d - 1 + k - backward.middleK) % 2) != 0)
502 return false;
503 if (x < backward.getX(d - 1, k))
504 return false;
505 makeEdit(snake, backward.getSnake(d - 1, k));
506 return true;
507 }
508 }
509
510 class BackwardEditPaths extends EditPaths {
511 @Override
512 final int snake(int k, int x) {
513 for (; x > beginA && k + x > beginB; x--)
514 if (!cmp.equals(a, x - 1, b, k + x - 1))
515 break;
516 return x;
517 }
518
519 @Override
520 final int getLeft(int x) {
521 return x - 1;
522 }
523
524 @Override
525 final int getRight(int x) {
526 return x;
527 }
528
529 @Override
530 final boolean isBetter(int left, int right) {
531 return left < right;
532 }
533
534 @Override
535 final void adjustMinMaxK(int k, int x) {
536 if (x <= beginA || k + x <= beginB) {
537 if (k > forward.middleK)
538 maxK = k;
539 else
540 minK = k;
541 }
542 }
543
544 @Override
545 final boolean meets(int d, int k, int x, long snake) {
546 if (k < forward.beginK || k > forward.endK)
547 return false;
548 // TODO: move out of loop
549 if (((d + k - forward.middleK) % 2) != 0)
550 return false;
551 if (x > forward.getX(d, k))
552 return false;
553 makeEdit(forward.getSnake(d, k), snake);
554 return true;
555 }
556 }
557 }
558
559 /**
560 * Main method
561 *
562 * @param args
563 * two filenames specifying the contents to be diffed
564 */
565 public static void main(String[] args) {
566 if (args.length != 2) {
567 System.err.println(JGitText.get().need2Arguments);
568 System.exit(1);
569 }
570 try {
571 RawText a = new RawText(new java.io.File(args[0]));
572 RawText b = new RawText(new java.io.File(args[1]));
573 EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
574 System.out.println(r.toString());
575 } catch (Exception e) {
576 e.printStackTrace();
577 }
578 }
579 }