View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
3    * Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de>
4    * and other copyright owners as documented in the project's IP log.
5    *
6    * This program and the accompanying materials are made available
7    * under the terms of the Eclipse Distribution License v1.0 which
8    * accompanies this distribution, is reproduced below, and is
9    * available at http://www.eclipse.org/org/documents/edl-v10.php
10   *
11   * All rights reserved.
12   *
13   * Redistribution and use in source and binary forms, with or
14   * without modification, are permitted provided that the following
15   * conditions are met:
16   *
17   * - Redistributions of source code must retain the above copyright
18   *   notice, this list of conditions and the following disclaimer.
19   *
20   * - Redistributions in binary form must reproduce the above
21   *   copyright notice, this list of conditions and the following
22   *   disclaimer in the documentation and/or other materials provided
23   *   with the distribution.
24   *
25   * - Neither the name of the Eclipse Foundation, Inc. nor the
26   *   names of its contributors may be used to endorse or promote
27   *   products derived from this software without specific prior
28   *   written permission.
29   *
30   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43   */
44  
45  package org.eclipse.jgit.diff;
46  
47  import java.text.MessageFormat;
48  
49  import org.eclipse.jgit.errors.DiffInterruptedException;
50  import org.eclipse.jgit.internal.JGitText;
51  import org.eclipse.jgit.util.IntList;
52  import org.eclipse.jgit.util.LongList;
53  
54  /**
55   * Diff algorithm, based on "An O(ND) Difference Algorithm and its Variations",
56   * by Eugene Myers.
57   * <p>
58   * The basic idea is to put the line numbers of text A as columns ("x") and the
59   * lines of text B as rows ("y"). Now you try to find the shortest "edit path"
60   * from the upper left corner to the lower right corner, where you can always go
61   * horizontally or vertically, but diagonally from (x,y) to (x+1,y+1) only if
62   * line x in text A is identical to line y in text B.
63   * <p>
64   * Myers' fundamental concept is the "furthest reaching D-path on diagonal k": a
65   * D-path is an edit path starting at the upper left corner and containing
66   * exactly D non-diagonal elements ("differences"). The furthest reaching D-path
67   * on diagonal k is the one that contains the most (diagonal) elements which
68   * ends on diagonal k (where k = y - x).
69   * <p>
70   * Example:
71   *
72   * <pre>
73   *    H E L L O   W O R L D
74   *    ____
75   *  L     \___
76   *  O         \___
77   *  W             \________
78   * </pre>
79   * <p>
80   * Since every D-path has exactly D horizontal or vertical elements, it can only
81   * end on the diagonals -D, -D+2, ..., D-2, D.
82   * <p>
83   * Since every furthest reaching D-path contains at least one furthest reaching
84   * (D-1)-path (except for D=0), we can construct them recursively.
85   * <p>
86   * Since we are really interested in the shortest edit path, we can start
87   * looking for a 0-path, then a 1-path, and so on, until we find a path that
88   * ends in the lower right corner.
89   * <p>
90   * To save space, we do not need to store all paths (which has quadratic space
91   * requirements), but generate the D-paths simultaneously from both sides. When
92   * the ends meet, we will have found "the middle" of the path. From the end
93   * points of that diagonal part, we can generate the rest recursively.
94   * <p>
95   * This only requires linear space.
96   * <p>
97   * The overall (runtime) complexity is:
98   *
99   * <pre>
100  *     O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
101  *     = O(N * D^2 * 5 / 4) = O(N * D^2),
102  * </pre>
103  * <p>
104  * (With each step, we have to find the middle parts of twice as many regions as
105  * before, but the regions (as well as the D) are halved.)
106  * <p>
107  * So the overall runtime complexity stays the same with linear space, albeit
108  * with a larger constant factor.
109  *
110  * @param <S>
111  *            type of sequence.
112  */
113 @SuppressWarnings("hiding")
114 public class MyersDiff<S extends Sequence> {
115 	/** Singleton instance of MyersDiff. */
116 	public static final DiffAlgorithm INSTANCE = new LowLevelDiffAlgorithm() {
117 		@SuppressWarnings("unused")
118 		@Override
119 		public <S extends Sequence> void diffNonCommon(EditList edits,
120 				HashedSequenceComparator<S> cmp, HashedSequence<S> a,
121 				HashedSequence<S> b, Edit region) {
122 			new MyersDiff<>(edits, cmp, a, b, region);
123 		}
124 	};
125 
126 	/**
127 	 * The list of edits found during the last call to
128 	 * {@link #calculateEdits(Edit)}
129 	 */
130 	protected EditList edits;
131 
132 	/** Comparison function for sequences. */
133 	protected HashedSequenceComparator<S> cmp;
134 
135 	/**
136 	 * The first text to be compared. Referred to as "Text A" in the comments
137 	 */
138 	protected HashedSequence<S> a;
139 
140 	/**
141 	 * The second text to be compared. Referred to as "Text B" in the comments
142 	 */
143 	protected HashedSequence<S> b;
144 
145 	private MyersDiff(EditList edits, HashedSequenceComparator<S> cmp,
146 			HashedSequence<S> a, HashedSequence<S> b, Edit region) {
147 		this.edits = edits;
148 		this.cmp = cmp;
149 		this.a = a;
150 		this.b = b;
151 		calculateEdits(region);
152 	}
153 
154 	// TODO: use ThreadLocal for future multi-threaded operations
155 	MiddleEdit middle = new MiddleEdit();
156 
157 	/**
158 	 * Entrypoint into the algorithm this class is all about. This method triggers that the
159 	 * differences between A and B are calculated in form of a list of edits.
160 	 * @param r portion of the sequences to examine.
161 	 */
162 	private void calculateEdits(Edit r) {
163 		middle.initialize(r.beginA, r.endA, r.beginB, r.endB);
164 		if (middle.beginA >= middle.endA &&
165 				middle.beginB >= middle.endB)
166 			return;
167 
168 		calculateEdits(middle.beginA, middle.endA,
169 				middle.beginB, middle.endB);
170 	}
171 
172 	/**
173 	 * Calculates the differences between a given part of A against another
174 	 * given part of B
175 	 *
176 	 * @param beginA
177 	 *            start of the part of A which should be compared
178 	 *            (0&lt;=beginA&lt;sizeof(A))
179 	 * @param endA
180 	 *            end of the part of A which should be compared
181 	 *            (beginA&lt;=endA&lt;sizeof(A))
182 	 * @param beginB
183 	 *            start of the part of B which should be compared
184 	 *            (0&lt;=beginB&lt;sizeof(B))
185 	 * @param endB
186 	 *            end of the part of B which should be compared
187 	 *            (beginB&lt;=endB&lt;sizeof(B))
188 	 */
189 	protected void calculateEdits(int beginA, int endA,
190 			int beginB, int endB) {
191 		Edit edit = middle.calculate(beginA, endA, beginB, endB);
192 
193 		if (beginA < edit.beginA || beginB < edit.beginB) {
194 			int k = edit.beginB - edit.beginA;
195 			int x = middle.backward.snake(k, edit.beginA);
196 			calculateEdits(beginA, x, beginB, k + x);
197 		}
198 
199 		if (edit.getType() != Edit.Type.EMPTY)
200 			edits.add(edits.size(), edit);
201 
202 		// after middle
203 		if (endA > edit.endA || endB > edit.endB) {
204 			int k = edit.endB - edit.endA;
205 			int x = middle.forward.snake(k, edit.endA);
206 			calculateEdits(x, endA, k + x, endB);
207 		}
208 	}
209 
210 	/**
211 	 * A class to help bisecting the sequences a and b to find minimal
212 	 * edit paths.
213 	 *
214 	 * As the arrays are reused for space efficiency, you will need one
215 	 * instance per thread.
216 	 *
217 	 * The entry function is the calculate() method.
218 	 */
219 	class MiddleEdit {
220 		void initialize(int beginA, int endA, int beginB, int endB) {
221 			this.beginA = beginA; this.endA = endA;
222 			this.beginB = beginB; this.endB = endB;
223 
224 			// strip common parts on either end
225 			int k = beginB - beginA;
226 			this.beginA = forward.snake(k, beginA);
227 			this.beginB = k + this.beginA;
228 
229 			k = endB - endA;
230 			this.endA = backward.snake(k, endA);
231 			this.endB = k + this.endA;
232 		}
233 
234 		/*
235 		 * This function calculates the "middle" Edit of the shortest
236 		 * edit path between the given subsequences of a and b.
237 		 *
238 		 * Once a forward path and a backward path meet, we found the
239 		 * middle part.  From the last snake end point on both of them,
240 		 * we construct the Edit.
241 		 *
242 		 * It is assumed that there is at least one edit in the range.
243 		 */
244 		// TODO: measure speed impact when this is synchronized
245 		Edit calculate(int beginA, int endA, int beginB, int endB) {
246 			if (beginA == endA || beginB == endB)
247 				return new Edit(beginA, endA, beginB, endB);
248 			this.beginA = beginA; this.endA = endA;
249 			this.beginB = beginB; this.endB = endB;
250 
251 			/*
252 			 * Following the conventions in Myers' paper, "k" is
253 			 * the difference between the index into "b" and the
254 			 * index into "a".
255 			 */
256 			int minK = beginB - endA;
257 			int maxK = endB - beginA;
258 
259 			forward.initialize(beginB - beginA, beginA, minK, maxK);
260 			backward.initialize(endB - endA, endA, minK, maxK);
261 
262 			for (int d = 1; ; d++)
263 				if (forward.calculate(d) ||
264 						backward.calculate(d))
265 					return edit;
266 		}
267 
268 		/*
269 		 * For each d, we need to hold the d-paths for the diagonals
270 		 * k = -d, -d + 2, ..., d - 2, d.  These are stored in the
271 		 * forward (and backward) array.
272 		 *
273 		 * As we allow subsequences, too, this needs some refinement:
274 		 * the forward paths start on the diagonal forwardK =
275 		 * beginB - beginA, and backward paths start on the diagonal
276 		 * backwardK = endB - endA.
277 		 *
278 		 * So, we need to hold the forward d-paths for the diagonals
279 		 * k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
280 		 * the analogue for the backward d-paths.  This means that
281 		 * we can turn (k, d) into the forward array index using this
282 		 * formula:
283 		 *
284 		 *	i = (d + k - forwardK) / 2
285 		 *
286 		 * There is a further complication: the edit paths should not
287 		 * leave the specified subsequences, so k is bounded by
288 		 * minK = beginB - endA and maxK = endB - beginA.  However,
289 		 * (k - forwardK) _must_ be odd whenever d is odd, and it
290 		 * _must_ be even when d is even.
291 		 *
292 		 * The values in the "forward" and "backward" arrays are
293 		 * positions ("x") in the sequence a, to get the corresponding
294 		 * positions ("y") in the sequence b, you have to calculate
295 		 * the appropriate k and then y:
296 		 *
297 		 *	k = forwardK - d + i * 2
298 		 *	y = k + x
299 		 *
300 		 * (substitute backwardK for forwardK if you want to get the
301 		 * y position for an entry in the "backward" array.
302 		 */
303 		EditPaths forward = new ForwardEditPaths();
304 		EditPaths backward = new BackwardEditPaths();
305 
306 		/* Some variables which are shared between methods */
307 		protected int beginA, endA, beginB, endB;
308 		protected Edit edit;
309 
310 		abstract class EditPaths {
311 			private IntList x = new IntList();
312 			private LongList snake = new LongList();
313 			int beginK, endK, middleK;
314 			int prevBeginK, prevEndK;
315 			/* if we hit one end early, no need to look further */
316 			int minK, maxK; // TODO: better explanation
317 
318 			final int getIndex(int d, int k) {
319 // TODO: remove
320 if (((d + k - middleK) % 2) != 0)
321 	throw new RuntimeException(MessageFormat.format(JGitText.get().unexpectedOddResult, Integer.valueOf(d), Integer.valueOf(k), Integer.valueOf(middleK)));
322 				return (d + k - middleK) / 2;
323 			}
324 
325 			final int getX(int d, int k) {
326 // TODO: remove
327 if (k < beginK || k > endK)
328 	throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
329 				return x.get(getIndex(d, k));
330 			}
331 
332 			final long getSnake(int d, int k) {
333 // TODO: remove
334 if (k < beginK || k > endK)
335 	throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
336 				return snake.get(getIndex(d, k));
337 			}
338 
339 			private int forceKIntoRange(int k) {
340 				/* if k is odd, so must be the result */
341 				if (k < minK)
342 					return minK + ((k ^ minK) & 1);
343 				else if (k > maxK)
344 					return maxK - ((k ^ maxK) & 1);
345 				return k;
346 			}
347 
348 			void initialize(int k, int x, int minK, int maxK) {
349 				this.minK = minK;
350 				this.maxK = maxK;
351 				beginK = endK = middleK = k;
352 				this.x.clear();
353 				this.x.add(x);
354 				snake.clear();
355 				snake.add(newSnake(k, x));
356 			}
357 
358 			abstract int snake(int k, int x);
359 			abstract int getLeft(int x);
360 			abstract int getRight(int x);
361 			abstract boolean isBetter(int left, int right);
362 			abstract void adjustMinMaxK(final int k, final int x);
363 			abstract boolean meets(int d, int k, int x, long snake);
364 
365 			final long newSnake(int k, int x) {
366 				long y = k + x;
367 				long ret = ((long) x) << 32;
368 				return ret | y;
369 			}
370 
371 			final int snake2x(long snake) {
372 				return (int) (snake >>> 32);
373 			}
374 
375 			final int snake2y(long snake) {
376 				return (int) snake;
377 			}
378 
379 			final boolean makeEdit(long snake1, long snake2) {
380 				int x1 = snake2x(snake1), x2 = snake2x(snake2);
381 				int y1 = snake2y(snake1), y2 = snake2y(snake2);
382 				/*
383 				 * Check for incompatible partial edit paths:
384 				 * when there are ambiguities, we might have
385 				 * hit incompatible (i.e. non-overlapping)
386 				 * forward/backward paths.
387 				 *
388 				 * In that case, just pretend that we have
389 				 * an empty edit at the end of one snake; this
390 				 * will force a decision which path to take
391 				 * in the next recursion step.
392 				 */
393 				if (x1 > x2 || y1 > y2) {
394 					x1 = x2;
395 					y1 = y2;
396 				}
397 				edit = new Edit(x1, x2, y1, y2);
398 				return true;
399 			}
400 
401 			boolean calculate(int d) {
402 				prevBeginK = beginK;
403 				prevEndK = endK;
404 				beginK = forceKIntoRange(middleK - d);
405 				endK = forceKIntoRange(middleK + d);
406 				// TODO: handle i more efficiently
407 				// TODO: walk snake(k, getX(d, k)) only once per (d, k)
408 				// TODO: move end points out of the loop to avoid conditionals inside the loop
409 				// go backwards so that we can avoid temp vars
410 				for (int k = endK; k >= beginK; k -= 2) {
411 					if (Thread.interrupted()) {
412 						throw new DiffInterruptedException();
413 					}
414 					int left = -1, right = -1;
415 					long leftSnake = -1L, rightSnake = -1L;
416 					// TODO: refactor into its own function
417 					if (k > prevBeginK) {
418 						int i = getIndex(d - 1, k - 1);
419 						left = x.get(i);
420 						int end = snake(k - 1, left);
421 						leftSnake = left != end ?
422 							newSnake(k - 1, end) :
423 							snake.get(i);
424 						if (meets(d, k - 1, end, leftSnake))
425 							return true;
426 						left = getLeft(end);
427 					}
428 					if (k < prevEndK) {
429 						int i = getIndex(d - 1, k + 1);
430 						right = x.get(i);
431 						int end = snake(k + 1, right);
432 						rightSnake = right != end ?
433 							newSnake(k + 1, end) :
434 							snake.get(i);
435 						if (meets(d, k + 1, end, rightSnake))
436 							return true;
437 						right = getRight(end);
438 					}
439 					int newX;
440 					long newSnake;
441 					if (k >= prevEndK ||
442 							(k > prevBeginK &&
443 							 isBetter(left, right))) {
444 						newX = left;
445 						newSnake = leftSnake;
446 					}
447 					else {
448 						newX = right;
449 						newSnake = rightSnake;
450 					}
451 					if (meets(d, k, newX, newSnake))
452 						return true;
453 					adjustMinMaxK(k, newX);
454 					int i = getIndex(d, k);
455 					x.set(i, newX);
456 					snake.set(i, newSnake);
457 				}
458 				return false;
459 			}
460 		}
461 
462 		class ForwardEditPaths extends EditPaths {
463 			@Override
464 			final int snake(int k, int x) {
465 				for (; x < endA && k + x < endB; x++)
466 					if (!cmp.equals(a, x, b, k + x))
467 						break;
468 				return x;
469 			}
470 
471 			@Override
472 			final int getLeft(final int x) {
473 				return x;
474 			}
475 
476 			@Override
477 			final int getRight(final int x) {
478 				return x + 1;
479 			}
480 
481 			@Override
482 			final boolean isBetter(final int left, final int right) {
483 				return left > right;
484 			}
485 
486 			@Override
487 			final void adjustMinMaxK(final int k, final int x) {
488 				if (x >= endA || k + x >= endB) {
489 					if (k > backward.middleK)
490 						maxK = k;
491 					else
492 						minK = k;
493 				}
494 			}
495 
496 			@Override
497 			final boolean meets(int d, int k, int x, long snake) {
498 				if (k < backward.beginK || k > backward.endK)
499 					return false;
500 				// TODO: move out of loop
501 				if (((d - 1 + k - backward.middleK) % 2) != 0)
502 					return false;
503 				if (x < backward.getX(d - 1, k))
504 					return false;
505 				makeEdit(snake, backward.getSnake(d - 1, k));
506 				return true;
507 			}
508 		}
509 
510 		class BackwardEditPaths extends EditPaths {
511 			@Override
512 			final int snake(int k, int x) {
513 				for (; x > beginA && k + x > beginB; x--)
514 					if (!cmp.equals(a, x - 1, b, k + x - 1))
515 						break;
516 				return x;
517 			}
518 
519 			@Override
520 			final int getLeft(final int x) {
521 				return x - 1;
522 			}
523 
524 			@Override
525 			final int getRight(final int x) {
526 				return x;
527 			}
528 
529 			@Override
530 			final boolean isBetter(final int left, final int right) {
531 				return left < right;
532 			}
533 
534 			@Override
535 			final void adjustMinMaxK(final int k, final int x) {
536 				if (x <= beginA || k + x <= beginB) {
537 					if (k > forward.middleK)
538 						maxK = k;
539 					else
540 						minK = k;
541 				}
542 			}
543 
544 			@Override
545 			final boolean meets(int d, int k, int x, long snake) {
546 				if (k < forward.beginK || k > forward.endK)
547 					return false;
548 				// TODO: move out of loop
549 				if (((d + k - forward.middleK) % 2) != 0)
550 					return false;
551 				if (x > forward.getX(d, k))
552 					return false;
553 				makeEdit(forward.getSnake(d, k), snake);
554 				return true;
555 			}
556 		}
557 	}
558 
559 	/**
560 	 * @param args two filenames specifying the contents to be diffed
561 	 */
562 	public static void main(String[] args) {
563 		if (args.length != 2) {
564 			System.err.println(JGitText.get().need2Arguments);
565 			System.exit(1);
566 		}
567 		try {
568 			RawText a = new RawText(new java.io.File(args[0]));
569 			RawText b = new RawText(new java.io.File(args[1]));
570 			EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
571 			System.out.println(r.toString());
572 		} catch (Exception e) {
573 			e.printStackTrace();
574 		}
575 	}
576 }