View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
3    * Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de> and others
4    *
5    * This program and the accompanying materials are made available under the
6    * terms of the Eclipse Distribution License v. 1.0 which is available at
7    * https://www.eclipse.org/org/documents/edl-v10.php.
8    *
9    * SPDX-License-Identifier: BSD-3-Clause
10   */
11  
12  package org.eclipse.jgit.diff;
13  
14  import java.text.MessageFormat;
15  
16  import org.eclipse.jgit.errors.DiffInterruptedException;
17  import org.eclipse.jgit.internal.JGitText;
18  import org.eclipse.jgit.util.IntList;
19  import org.eclipse.jgit.util.LongList;
20  
21  /**
22   * Diff algorithm, based on "An O(ND) Difference Algorithm and its Variations",
23   * by Eugene Myers.
24   * <p>
25   * The basic idea is to put the line numbers of text A as columns ("x") and the
26   * lines of text B as rows ("y"). Now you try to find the shortest "edit path"
27   * from the upper left corner to the lower right corner, where you can always go
28   * horizontally or vertically, but diagonally from (x,y) to (x+1,y+1) only if
29   * line x in text A is identical to line y in text B.
30   * <p>
31   * Myers' fundamental concept is the "furthest reaching D-path on diagonal k": a
32   * D-path is an edit path starting at the upper left corner and containing
33   * exactly D non-diagonal elements ("differences"). The furthest reaching D-path
34   * on diagonal k is the one that contains the most (diagonal) elements which
35   * ends on diagonal k (where k = y - x).
36   * <p>
37   * Example:
38   *
39   * <pre>
40   *    H E L L O   W O R L D
41   *    ____
42   *  L     \___
43   *  O         \___
44   *  W             \________
45   * </pre>
46   * <p>
47   * Since every D-path has exactly D horizontal or vertical elements, it can only
48   * end on the diagonals -D, -D+2, ..., D-2, D.
49   * <p>
50   * Since every furthest reaching D-path contains at least one furthest reaching
51   * (D-1)-path (except for D=0), we can construct them recursively.
52   * <p>
53   * Since we are really interested in the shortest edit path, we can start
54   * looking for a 0-path, then a 1-path, and so on, until we find a path that
55   * ends in the lower right corner.
56   * <p>
57   * To save space, we do not need to store all paths (which has quadratic space
58   * requirements), but generate the D-paths simultaneously from both sides. When
59   * the ends meet, we will have found "the middle" of the path. From the end
60   * points of that diagonal part, we can generate the rest recursively.
61   * <p>
62   * This only requires linear space.
63   * <p>
64   * The overall (runtime) complexity is:
65   *
66   * <pre>
67   *     O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
68   *     = O(N * D^2 * 5 / 4) = O(N * D^2),
69   * </pre>
70   * <p>
71   * (With each step, we have to find the middle parts of twice as many regions as
72   * before, but the regions (as well as the D) are halved.)
73   * <p>
74   * So the overall runtime complexity stays the same with linear space, albeit
75   * with a larger constant factor.
76   *
77   * @param <S>
78   *            type of sequence.
79   */
80  @SuppressWarnings("hiding")
81  public class MyersDiff<S extends Sequence> {
82  	/** Singleton instance of MyersDiff. */
83  	public static final DiffAlgorithm INSTANCE = new LowLevelDiffAlgorithm() {
84  		@SuppressWarnings("unused")
85  		@Override
86  		public <S extends Sequence> void diffNonCommon(EditList edits,
87  				HashedSequenceComparator<S> cmp, HashedSequence<S> a,
88  				HashedSequence<S> b, Edit region) {
89  			new MyersDiff<>(edits, cmp, a, b, region);
90  		}
91  	};
92  
93  	/**
94  	 * The list of edits found during the last call to
95  	 * {@link #calculateEdits(Edit)}
96  	 */
97  	protected EditList edits;
98  
99  	/** Comparison function for sequences. */
100 	protected HashedSequenceComparator<S> cmp;
101 
102 	/**
103 	 * The first text to be compared. Referred to as "Text A" in the comments
104 	 */
105 	protected HashedSequence<S> a;
106 
107 	/**
108 	 * The second text to be compared. Referred to as "Text B" in the comments
109 	 */
110 	protected HashedSequence<S> b;
111 
112 	private MyersDiff(EditList edits, HashedSequenceComparator<S> cmp,
113 			HashedSequence<S> a, HashedSequence<S> b, Edit region) {
114 		this.edits = edits;
115 		this.cmp = cmp;
116 		this.a = a;
117 		this.b = b;
118 		calculateEdits(region);
119 	}
120 
121 	// TODO: use ThreadLocal for future multi-threaded operations
122 	MiddleEdit middle = new MiddleEdit();
123 
124 	/**
125 	 * Entrypoint into the algorithm this class is all about. This method triggers that the
126 	 * differences between A and B are calculated in form of a list of edits.
127 	 * @param r portion of the sequences to examine.
128 	 */
129 	private void calculateEdits(Edit r) {
130 		middle.initialize(r.beginA, r.endA, r.beginB, r.endB);
131 		if (middle.beginA >= middle.endA &&
132 				middle.beginB >= middle.endB)
133 			return;
134 
135 		calculateEdits(middle.beginA, middle.endA,
136 				middle.beginB, middle.endB);
137 	}
138 
139 	/**
140 	 * Calculates the differences between a given part of A against another
141 	 * given part of B
142 	 *
143 	 * @param beginA
144 	 *            start of the part of A which should be compared
145 	 *            (0&lt;=beginA&lt;sizeof(A))
146 	 * @param endA
147 	 *            end of the part of A which should be compared
148 	 *            (beginA&lt;=endA&lt;sizeof(A))
149 	 * @param beginB
150 	 *            start of the part of B which should be compared
151 	 *            (0&lt;=beginB&lt;sizeof(B))
152 	 * @param endB
153 	 *            end of the part of B which should be compared
154 	 *            (beginB&lt;=endB&lt;sizeof(B))
155 	 */
156 	protected void calculateEdits(int beginA, int endA,
157 			int beginB, int endB) {
158 		Edit edit = middle.calculate(beginA, endA, beginB, endB);
159 
160 		if (beginA < edit.beginA || beginB < edit.beginB) {
161 			int k = edit.beginB - edit.beginA;
162 			int x = middle.backward.snake(k, edit.beginA);
163 			calculateEdits(beginA, x, beginB, k + x);
164 		}
165 
166 		if (edit.getType() != Edit.Type.EMPTY)
167 			edits.add(edits.size(), edit);
168 
169 		// after middle
170 		if (endA > edit.endA || endB > edit.endB) {
171 			int k = edit.endB - edit.endA;
172 			int x = middle.forward.snake(k, edit.endA);
173 			calculateEdits(x, endA, k + x, endB);
174 		}
175 	}
176 
177 	/**
178 	 * A class to help bisecting the sequences a and b to find minimal
179 	 * edit paths.
180 	 *
181 	 * As the arrays are reused for space efficiency, you will need one
182 	 * instance per thread.
183 	 *
184 	 * The entry function is the calculate() method.
185 	 */
186 	class MiddleEdit {
187 		void initialize(int beginA, int endA, int beginB, int endB) {
188 			this.beginA = beginA; this.endA = endA;
189 			this.beginB = beginB; this.endB = endB;
190 
191 			// strip common parts on either end
192 			int k = beginB - beginA;
193 			this.beginA = forward.snake(k, beginA);
194 			this.beginB = k + this.beginA;
195 
196 			k = endB - endA;
197 			this.endA = backward.snake(k, endA);
198 			this.endB = k + this.endA;
199 		}
200 
201 		/*
202 		 * This function calculates the "middle" Edit of the shortest
203 		 * edit path between the given subsequences of a and b.
204 		 *
205 		 * Once a forward path and a backward path meet, we found the
206 		 * middle part.  From the last snake end point on both of them,
207 		 * we construct the Edit.
208 		 *
209 		 * It is assumed that there is at least one edit in the range.
210 		 */
211 		// TODO: measure speed impact when this is synchronized
212 		Edit calculate(int beginA, int endA, int beginB, int endB) {
213 			if (beginA == endA || beginB == endB)
214 				return new Edit(beginA, endA, beginB, endB);
215 			this.beginA = beginA; this.endA = endA;
216 			this.beginB = beginB; this.endB = endB;
217 
218 			/*
219 			 * Following the conventions in Myers' paper, "k" is
220 			 * the difference between the index into "b" and the
221 			 * index into "a".
222 			 */
223 			int minK = beginB - endA;
224 			int maxK = endB - beginA;
225 
226 			forward.initialize(beginB - beginA, beginA, minK, maxK);
227 			backward.initialize(endB - endA, endA, minK, maxK);
228 
229 			for (int d = 1; ; d++)
230 				if (forward.calculate(d) ||
231 						backward.calculate(d))
232 					return edit;
233 		}
234 
235 		/*
236 		 * For each d, we need to hold the d-paths for the diagonals
237 		 * k = -d, -d + 2, ..., d - 2, d.  These are stored in the
238 		 * forward (and backward) array.
239 		 *
240 		 * As we allow subsequences, too, this needs some refinement:
241 		 * the forward paths start on the diagonal forwardK =
242 		 * beginB - beginA, and backward paths start on the diagonal
243 		 * backwardK = endB - endA.
244 		 *
245 		 * So, we need to hold the forward d-paths for the diagonals
246 		 * k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
247 		 * the analogue for the backward d-paths.  This means that
248 		 * we can turn (k, d) into the forward array index using this
249 		 * formula:
250 		 *
251 		 *	i = (d + k - forwardK) / 2
252 		 *
253 		 * There is a further complication: the edit paths should not
254 		 * leave the specified subsequences, so k is bounded by
255 		 * minK = beginB - endA and maxK = endB - beginA.  However,
256 		 * (k - forwardK) _must_ be odd whenever d is odd, and it
257 		 * _must_ be even when d is even.
258 		 *
259 		 * The values in the "forward" and "backward" arrays are
260 		 * positions ("x") in the sequence a, to get the corresponding
261 		 * positions ("y") in the sequence b, you have to calculate
262 		 * the appropriate k and then y:
263 		 *
264 		 *	k = forwardK - d + i * 2
265 		 *	y = k + x
266 		 *
267 		 * (substitute backwardK for forwardK if you want to get the
268 		 * y position for an entry in the "backward" array.
269 		 */
270 		EditPaths forward = new ForwardEditPaths();
271 		EditPaths backward = new BackwardEditPaths();
272 
273 		/* Some variables which are shared between methods */
274 		protected int beginA, endA, beginB, endB;
275 		protected Edit edit;
276 
277 		abstract class EditPaths {
278 			private IntListtList.html#IntList">IntList x = new IntList();
279 			private LongListt.html#LongList">LongList snake = new LongList();
280 			int beginK, endK, middleK;
281 			int prevBeginK, prevEndK;
282 			/* if we hit one end early, no need to look further */
283 			int minK, maxK; // TODO: better explanation
284 
285 			final int getIndex(int d, int k) {
286 // TODO: remove
287 if (((d + k - middleK) % 2) != 0)
288 	throw new RuntimeException(MessageFormat.format(JGitText.get().unexpectedOddResult, Integer.valueOf(d), Integer.valueOf(k), Integer.valueOf(middleK)));
289 				return (d + k - middleK) / 2;
290 			}
291 
292 			final int getX(int d, int k) {
293 // TODO: remove
294 if (k < beginK || k > endK)
295 	throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
296 				return x.get(getIndex(d, k));
297 			}
298 
299 			final long getSnake(int d, int k) {
300 // TODO: remove
301 if (k < beginK || k > endK)
302 	throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
303 				return snake.get(getIndex(d, k));
304 			}
305 
306 			private int forceKIntoRange(int k) {
307 				/* if k is odd, so must be the result */
308 				if (k < minK)
309 					return minK + ((k ^ minK) & 1);
310 				else if (k > maxK)
311 					return maxK - ((k ^ maxK) & 1);
312 				return k;
313 			}
314 
315 			void initialize(int k, int x, int minK, int maxK) {
316 				this.minK = minK;
317 				this.maxK = maxK;
318 				beginK = endK = middleK = k;
319 				this.x.clear();
320 				this.x.add(x);
321 				snake.clear();
322 				snake.add(newSnake(k, x));
323 			}
324 
325 			abstract int snake(int k, int x);
326 			abstract int getLeft(int x);
327 			abstract int getRight(int x);
328 			abstract boolean isBetter(int left, int right);
329 			abstract void adjustMinMaxK(int k, int x);
330 			abstract boolean meets(int d, int k, int x, long snake);
331 
332 			final long newSnake(int k, int x) {
333 				long y = k + x;
334 				long ret = ((long) x) << 32;
335 				return ret | y;
336 			}
337 
338 			final int snake2x(long snake) {
339 				return (int) (snake >>> 32);
340 			}
341 
342 			final int snake2y(long snake) {
343 				return (int) snake;
344 			}
345 
346 			final boolean makeEdit(long snake1, long snake2) {
347 				int x1 = snake2x(snake1), x2 = snake2x(snake2);
348 				int y1 = snake2y(snake1), y2 = snake2y(snake2);
349 				/*
350 				 * Check for incompatible partial edit paths:
351 				 * when there are ambiguities, we might have
352 				 * hit incompatible (i.e. non-overlapping)
353 				 * forward/backward paths.
354 				 *
355 				 * In that case, just pretend that we have
356 				 * an empty edit at the end of one snake; this
357 				 * will force a decision which path to take
358 				 * in the next recursion step.
359 				 */
360 				if (x1 > x2 || y1 > y2) {
361 					x1 = x2;
362 					y1 = y2;
363 				}
364 				edit = new Edit(x1, x2, y1, y2);
365 				return true;
366 			}
367 
368 			boolean calculate(int d) {
369 				prevBeginK = beginK;
370 				prevEndK = endK;
371 				beginK = forceKIntoRange(middleK - d);
372 				endK = forceKIntoRange(middleK + d);
373 				// TODO: handle i more efficiently
374 				// TODO: walk snake(k, getX(d, k)) only once per (d, k)
375 				// TODO: move end points out of the loop to avoid conditionals inside the loop
376 				// go backwards so that we can avoid temp vars
377 				for (int k = endK; k >= beginK; k -= 2) {
378 					if (Thread.interrupted()) {
379 						throw new DiffInterruptedException();
380 					}
381 					int left = -1, right = -1;
382 					long leftSnake = -1L, rightSnake = -1L;
383 					// TODO: refactor into its own function
384 					if (k > prevBeginK) {
385 						int i = getIndex(d - 1, k - 1);
386 						left = x.get(i);
387 						int end = snake(k - 1, left);
388 						leftSnake = left != end ?
389 							newSnake(k - 1, end) :
390 							snake.get(i);
391 						if (meets(d, k - 1, end, leftSnake))
392 							return true;
393 						left = getLeft(end);
394 					}
395 					if (k < prevEndK) {
396 						int i = getIndex(d - 1, k + 1);
397 						right = x.get(i);
398 						int end = snake(k + 1, right);
399 						rightSnake = right != end ?
400 							newSnake(k + 1, end) :
401 							snake.get(i);
402 						if (meets(d, k + 1, end, rightSnake))
403 							return true;
404 						right = getRight(end);
405 					}
406 					int newX;
407 					long newSnake;
408 					if (k >= prevEndK ||
409 							(k > prevBeginK &&
410 							 isBetter(left, right))) {
411 						newX = left;
412 						newSnake = leftSnake;
413 					}
414 					else {
415 						newX = right;
416 						newSnake = rightSnake;
417 					}
418 					if (meets(d, k, newX, newSnake))
419 						return true;
420 					adjustMinMaxK(k, newX);
421 					int i = getIndex(d, k);
422 					x.set(i, newX);
423 					snake.set(i, newSnake);
424 				}
425 				return false;
426 			}
427 		}
428 
429 		class ForwardEditPaths extends EditPaths {
430 			@Override
431 			final int snake(int k, int x) {
432 				for (; x < endA && k + x < endB; x++)
433 					if (!cmp.equals(a, x, b, k + x))
434 						break;
435 				return x;
436 			}
437 
438 			@Override
439 			final int getLeft(int x) {
440 				return x;
441 			}
442 
443 			@Override
444 			final int getRight(int x) {
445 				return x + 1;
446 			}
447 
448 			@Override
449 			final boolean isBetter(int left, int right) {
450 				return left > right;
451 			}
452 
453 			@Override
454 			final void adjustMinMaxK(int k, int x) {
455 				if (x >= endA || k + x >= endB) {
456 					if (k > backward.middleK)
457 						maxK = k;
458 					else
459 						minK = k;
460 				}
461 			}
462 
463 			@Override
464 			final boolean meets(int d, int k, int x, long snake) {
465 				if (k < backward.beginK || k > backward.endK)
466 					return false;
467 				// TODO: move out of loop
468 				if (((d - 1 + k - backward.middleK) % 2) != 0)
469 					return false;
470 				if (x < backward.getX(d - 1, k))
471 					return false;
472 				makeEdit(snake, backward.getSnake(d - 1, k));
473 				return true;
474 			}
475 		}
476 
477 		class BackwardEditPaths extends EditPaths {
478 			@Override
479 			final int snake(int k, int x) {
480 				for (; x > beginA && k + x > beginB; x--)
481 					if (!cmp.equals(a, x - 1, b, k + x - 1))
482 						break;
483 				return x;
484 			}
485 
486 			@Override
487 			final int getLeft(int x) {
488 				return x - 1;
489 			}
490 
491 			@Override
492 			final int getRight(int x) {
493 				return x;
494 			}
495 
496 			@Override
497 			final boolean isBetter(int left, int right) {
498 				return left < right;
499 			}
500 
501 			@Override
502 			final void adjustMinMaxK(int k, int x) {
503 				if (x <= beginA || k + x <= beginB) {
504 					if (k > forward.middleK)
505 						maxK = k;
506 					else
507 						minK = k;
508 				}
509 			}
510 
511 			@Override
512 			final boolean meets(int d, int k, int x, long snake) {
513 				if (k < forward.beginK || k > forward.endK)
514 					return false;
515 				// TODO: move out of loop
516 				if (((d + k - forward.middleK) % 2) != 0)
517 					return false;
518 				if (x > forward.getX(d, k))
519 					return false;
520 				makeEdit(forward.getSnake(d, k), snake);
521 				return true;
522 			}
523 		}
524 	}
525 
526 	/**
527 	 * Main method
528 	 *
529 	 * @param args
530 	 *            two filenames specifying the contents to be diffed
531 	 */
532 	public static void main(String[] args) {
533 		if (args.length != 2) {
534 			System.err.println(JGitText.get().need2Arguments);
535 			System.exit(1);
536 		}
537 		try {
538 			RawText a = new RawText(new java.io.File(args[0]));
539 			RawText b = new RawText(new java.io.File(args[1]));
540 			EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
541 			System.out.println(r.toString());
542 		} catch (Exception e) {
543 			e.printStackTrace();
544 		}
545 	}
546 }