View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc.
3    * and other copyright owners as documented in the project's IP log.
4    *
5    * This program and the accompanying materials are made available
6    * under the terms of the Eclipse Distribution License v1.0 which
7    * accompanies this distribution, is reproduced below, and is
8    * available at http://www.eclipse.org/org/documents/edl-v10.php
9    *
10   * All rights reserved.
11   *
12   * Redistribution and use in source and binary forms, with or
13   * without modification, are permitted provided that the following
14   * conditions are met:
15   *
16   * - Redistributions of source code must retain the above copyright
17   *   notice, this list of conditions and the following disclaimer.
18   *
19   * - Redistributions in binary form must reproduce the above
20   *   copyright notice, this list of conditions and the following
21   *   disclaimer in the documentation and/or other materials provided
22   *   with the distribution.
23   *
24   * - Neither the name of the Eclipse Foundation, Inc. nor the
25   *   names of its contributors may be used to endorse or promote
26   *   products derived from this software without specific prior
27   *   written permission.
28   *
29   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42   */
43  
44  package org.eclipse.jgit.diff;
45  
46  import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
47  import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
48  
49  import java.io.IOException;
50  import java.util.ArrayList;
51  import java.util.Arrays;
52  import java.util.BitSet;
53  import java.util.List;
54  
55  import org.eclipse.jgit.diff.DiffEntry.ChangeType;
56  import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
57  import org.eclipse.jgit.errors.CancelledException;
58  import org.eclipse.jgit.internal.JGitText;
59  import org.eclipse.jgit.lib.FileMode;
60  import org.eclipse.jgit.lib.NullProgressMonitor;
61  import org.eclipse.jgit.lib.ProgressMonitor;
62  
63  class SimilarityRenameDetector {
64  	/**
65  	 * Number of bits we need to express an index into src or dst list.
66  	 * <p>
67  	 * This must be 28, giving us a limit of 2^28 entries in either list, which
68  	 * is an insane limit of 536,870,912 file names being considered in a single
69  	 * rename pass. The other 8 bits are used to store the score, while staying
70  	 * under 127 so the long doesn't go negative.
71  	 */
72  	private static final int BITS_PER_INDEX = 28;
73  
74  	private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
75  
76  	private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
77  
78  	private ContentSource.Pair reader;
79  
80  	/**
81  	 * All sources to consider for copies or renames.
82  	 * <p>
83  	 * A source is typically a {@link ChangeType#DELETE} change, but could be
84  	 * another type when trying to perform copy detection concurrently with
85  	 * rename detection.
86  	 */
87  	private List<DiffEntry> srcs;
88  
89  	/**
90  	 * All destinations to consider looking for a rename.
91  	 * <p>
92  	 * A destination is typically an {@link ChangeType#ADD}, as the name has
93  	 * just come into existence, and we want to discover where its initial
94  	 * content came from.
95  	 */
96  	private List<DiffEntry> dsts;
97  
98  	/**
99  	 * Matrix of all examined file pairs, and their scores.
100 	 * <p>
101 	 * The upper 8 bits of each long stores the score, but the score is bounded
102 	 * to be in the range (0, 128] so that the highest bit is never set, and all
103 	 * entries are therefore positive.
104 	 * <p>
105 	 * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
106 	 * as the lower two groups of 28 bits, respectively, but the encoding is
107 	 * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
108 	 * lower list indices later in the matrix, giving precedence to files whose
109 	 * names sort earlier in the tree.
110 	 */
111 	private long[] matrix;
112 
113 	/** Score a pair must exceed to be considered a rename. */
114 	private int renameScore = 60;
115 
116 	/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
117 	private boolean tableOverflow;
118 
119 	private List<DiffEntry> out;
120 
121 	SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
122 			List<DiffEntry> dsts) {
123 		this.reader = reader;
124 		this.srcs = srcs;
125 		this.dsts = dsts;
126 	}
127 
128 	void setRenameScore(int score) {
129 		renameScore = score;
130 	}
131 
132 	void compute(ProgressMonitor pm) throws IOException, CancelledException {
133 		if (pm == null)
134 			pm = NullProgressMonitor.INSTANCE;
135 
136 		pm.beginTask(JGitText.get().renamesFindingByContent, //
137 				2 * srcs.size() * dsts.size());
138 
139 		int mNext = buildMatrix(pm);
140 		out = new ArrayList<>(Math.min(mNext, dsts.size()));
141 
142 		// Match rename pairs on a first come, first serve basis until
143 		// we have looked at everything that is above our minimum score.
144 		//
145 		for (--mNext; mNext >= 0; mNext--) {
146 			if (pm.isCancelled()) {
147 				// TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
148 				// in next major version
149 				throw new CancelledException(JGitText.get().renameCancelled);
150 			}
151 			long ent = matrix[mNext];
152 			int sIdx = srcFile(ent);
153 			int dIdx = dstFile(ent);
154 			DiffEntry s = srcs.get(sIdx);
155 			DiffEntry d = dsts.get(dIdx);
156 
157 			if (d == null) {
158 				pm.update(1);
159 				continue; // was already matched earlier
160 			}
161 
162 			ChangeType type;
163 			if (s.changeType == ChangeType.DELETE) {
164 				// First use of this source file. Tag it as a rename so we
165 				// later know it is already been used as a rename, other
166 				// matches (if any) will claim themselves as copies instead.
167 				//
168 				s.changeType = ChangeType.RENAME;
169 				type = ChangeType.RENAME;
170 			} else {
171 				type = ChangeType.COPY;
172 			}
173 
174 			out.add(DiffEntry.pair(type, s, d, score(ent)));
175 			dsts.set(dIdx, null); // Claim the destination was matched.
176 			pm.update(1);
177 		}
178 
179 		srcs = compactSrcList(srcs);
180 		dsts = compactDstList(dsts);
181 		pm.endTask();
182 	}
183 
184 	List<DiffEntry> getMatches() {
185 		return out;
186 	}
187 
188 	List<DiffEntry> getLeftOverSources() {
189 		return srcs;
190 	}
191 
192 	List<DiffEntry> getLeftOverDestinations() {
193 		return dsts;
194 	}
195 
196 	boolean isTableOverflow() {
197 		return tableOverflow;
198 	}
199 
200 	private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
201 		ArrayList<DiffEntry> r = new ArrayList<>(in.size());
202 		for (DiffEntry e : in) {
203 			if (e.changeType == ChangeType.DELETE)
204 				r.add(e);
205 		}
206 		return r;
207 	}
208 
209 	private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
210 		ArrayList<DiffEntry> r = new ArrayList<>(in.size());
211 		for (DiffEntry e : in) {
212 			if (e != null)
213 				r.add(e);
214 		}
215 		return r;
216 	}
217 
218 	private int buildMatrix(ProgressMonitor pm)
219 			throws IOException, CancelledException {
220 		// Allocate for the worst-case scenario where every pair has a
221 		// score that we need to consider. We might not need that many.
222 		//
223 		matrix = new long[srcs.size() * dsts.size()];
224 
225 		long[] srcSizes = new long[srcs.size()];
226 		long[] dstSizes = new long[dsts.size()];
227 		BitSet dstTooLarge = null;
228 
229 		// Consider each pair of files, if the score is above the minimum
230 		// threshold we need record that scoring in the matrix so we can
231 		// later find the best matches.
232 		//
233 		int mNext = 0;
234 		SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
235 			DiffEntry srcEnt = srcs.get(srcIdx);
236 			if (!isFile(srcEnt.oldMode)) {
237 				pm.update(dsts.size());
238 				continue;
239 			}
240 
241 			SimilarityIndex s = null;
242 
243 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
244 				if (pm.isCancelled()) {
245 					// TODO(ms): use
246 					// org.eclipse.jgit.api.errors.CanceledException in next
247 					// major version
248 					throw new CancelledException(
249 							JGitText.get().renameCancelled);
250 				}
251 
252 				DiffEntry dstEnt = dsts.get(dstIdx);
253 
254 				if (!isFile(dstEnt.newMode)) {
255 					pm.update(1);
256 					continue;
257 				}
258 
259 				if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
260 					pm.update(1);
261 					continue;
262 				}
263 
264 				if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
265 					pm.update(1);
266 					continue;
267 				}
268 
269 				long srcSize = srcSizes[srcIdx];
270 				if (srcSize == 0) {
271 					srcSize = size(OLD, srcEnt) + 1;
272 					srcSizes[srcIdx] = srcSize;
273 				}
274 
275 				long dstSize = dstSizes[dstIdx];
276 				if (dstSize == 0) {
277 					dstSize = size(NEW, dstEnt) + 1;
278 					dstSizes[dstIdx] = dstSize;
279 				}
280 
281 				long max = Math.max(srcSize, dstSize);
282 				long min = Math.min(srcSize, dstSize);
283 				if (min * 100 / max < renameScore) {
284 					// Cannot possibly match, as the file sizes are so different
285 					pm.update(1);
286 					continue;
287 				}
288 
289 				if (s == null) {
290 					try {
291 						s = hash(OLD, srcEnt);
292 					} catch (TableFullException tableFull) {
293 						tableOverflow = true;
294 						continue SRC;
295 					}
296 				}
297 
298 				SimilarityIndex d;
299 				try {
300 					d = hash(NEW, dstEnt);
301 				} catch (TableFullException tableFull) {
302 					if (dstTooLarge == null)
303 						dstTooLarge = new BitSet(dsts.size());
304 					dstTooLarge.set(dstIdx);
305 					tableOverflow = true;
306 					pm.update(1);
307 					continue;
308 				}
309 
310 				int contentScore = s.score(d, 10000);
311 
312 				// nameScore returns a value between 0 and 100, but we want it
313 				// to be in the same range as the content score. This allows it
314 				// to be dropped into the pretty formula for the final score.
315 				int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
316 
317 				int score = (contentScore * 99 + nameScore * 1) / 10000;
318 
319 				if (score < renameScore) {
320 					pm.update(1);
321 					continue;
322 				}
323 
324 				matrix[mNext++] = encode(score, srcIdx, dstIdx);
325 				pm.update(1);
326 			}
327 		}
328 
329 		// Sort everything in the range we populated, which might be the
330 		// entire matrix, or just a smaller slice if we had some bad low
331 		// scoring pairs.
332 		//
333 		Arrays.sort(matrix, 0, mNext);
334 		return mNext;
335 	}
336 
337 	static int nameScore(String a, String b) {
338 	    int aDirLen = a.lastIndexOf("/") + 1; //$NON-NLS-1$
339 	    int bDirLen = b.lastIndexOf("/") + 1; //$NON-NLS-1$
340 
341 	    int dirMin = Math.min(aDirLen, bDirLen);
342 	    int dirMax = Math.max(aDirLen, bDirLen);
343 
344 	    final int dirScoreLtr;
345 	    final int dirScoreRtl;
346 
347 		if (dirMax == 0) {
348 			dirScoreLtr = 100;
349 			dirScoreRtl = 100;
350 		} else {
351 			int dirSim = 0;
352 			for (; dirSim < dirMin; dirSim++) {
353 				if (a.charAt(dirSim) != b.charAt(dirSim))
354 					break;
355 			}
356 			dirScoreLtr = (dirSim * 100) / dirMax;
357 
358 			if (dirScoreLtr == 100) {
359 				dirScoreRtl = 100;
360 			} else {
361 				for (dirSim = 0; dirSim < dirMin; dirSim++) {
362 					if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
363 							- dirSim))
364 						break;
365 				}
366 				dirScoreRtl = (dirSim * 100) / dirMax;
367 			}
368 		}
369 
370 		int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
371 		int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
372 
373 		int fileSim = 0;
374 		for (; fileSim < fileMin; fileSim++) {
375 			if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
376 					- fileSim))
377 				break;
378 		}
379 		int fileScore = (fileSim * 100) / fileMax;
380 
381 		return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
382 	}
383 
384 	private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
385 			throws IOException, TableFullException {
386 		SimilarityIndex r = new SimilarityIndex();
387 		r.hash(reader.open(side, ent));
388 		r.sort();
389 		return r;
390 	}
391 
392 	private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
393 		return reader.size(side, ent);
394 	}
395 
396 	private static int score(long value) {
397 		return (int) (value >>> SCORE_SHIFT);
398 	}
399 
400 	static int srcFile(long value) {
401 		return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
402 	}
403 
404 	static int dstFile(long value) {
405 		return decodeFile(((int) value) & INDEX_MASK);
406 	}
407 
408 	static long encode(int score, int srcIdx, int dstIdx) {
409 		return (((long) score) << SCORE_SHIFT) //
410 				| (encodeFile(srcIdx) << BITS_PER_INDEX) //
411 				| encodeFile(dstIdx);
412 	}
413 
414 	private static long encodeFile(int idx) {
415 		// We invert the index so that the first file in the list sorts
416 		// later in the table. This permits us to break ties favoring
417 		// earlier names over later ones.
418 		//
419 		return INDEX_MASK - idx;
420 	}
421 
422 	private static int decodeFile(int v) {
423 		return INDEX_MASK - v;
424 	}
425 
426 	private static boolean isFile(FileMode mode) {
427 		return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
428 	}
429 }