View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc. and others
3    *
4    * This program and the accompanying materials are made available under the
5    * terms of the Eclipse Distribution License v. 1.0 which is available at
6    * https://www.eclipse.org/org/documents/edl-v10.php.
7    *
8    * SPDX-License-Identifier: BSD-3-Clause
9    */
10  
11  package org.eclipse.jgit.diff;
12  
13  import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
14  import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
15  import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;
16  
17  import java.io.IOException;
18  import java.util.ArrayList;
19  import java.util.Arrays;
20  import java.util.BitSet;
21  import java.util.List;
22  
23  import org.eclipse.jgit.diff.DiffEntry.ChangeType;
24  import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
25  import org.eclipse.jgit.errors.CancelledException;
26  import org.eclipse.jgit.internal.JGitText;
27  import org.eclipse.jgit.lib.FileMode;
28  import org.eclipse.jgit.lib.NullProgressMonitor;
29  import org.eclipse.jgit.lib.ObjectLoader;
30  import org.eclipse.jgit.lib.ProgressMonitor;
31  
32  class SimilarityRenameDetector {
33  	/**
34  	 * Number of bits we need to express an index into src or dst list.
35  	 * <p>
36  	 * This must be 28, giving us a limit of 2^28 entries in either list, which
37  	 * is an insane limit of 536,870,912 file names being considered in a single
38  	 * rename pass. The other 8 bits are used to store the score, while staying
39  	 * under 127 so the long doesn't go negative.
40  	 */
41  	private static final int BITS_PER_INDEX = 28;
42  
43  	private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
44  
45  	private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
46  
47  	private ContentSource.Pair reader;
48  
49  	/**
50  	 * All sources to consider for copies or renames.
51  	 * <p>
52  	 * A source is typically a {@link ChangeType#DELETE} change, but could be
53  	 * another type when trying to perform copy detection concurrently with
54  	 * rename detection.
55  	 */
56  	private List<DiffEntry> srcs;
57  
58  	/**
59  	 * All destinations to consider looking for a rename.
60  	 * <p>
61  	 * A destination is typically an {@link ChangeType#ADD}, as the name has
62  	 * just come into existence, and we want to discover where its initial
63  	 * content came from.
64  	 */
65  	private List<DiffEntry> dsts;
66  
67  	/**
68  	 * Matrix of all examined file pairs, and their scores.
69  	 * <p>
70  	 * The upper 8 bits of each long stores the score, but the score is bounded
71  	 * to be in the range (0, 128] so that the highest bit is never set, and all
72  	 * entries are therefore positive.
73  	 * <p>
74  	 * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
75  	 * as the lower two groups of 28 bits, respectively, but the encoding is
76  	 * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
77  	 * lower list indices later in the matrix, giving precedence to files whose
78  	 * names sort earlier in the tree.
79  	 */
80  	private long[] matrix;
81  
82  	/** Score a pair must exceed to be considered a rename. */
83  	private int renameScore = 60;
84  
85  	/**
86  	 * File size threshold (in bytes) for detecting renames. Files larger
87  	 * than this size will not be processed for renames.
88  	 */
89  	private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;
90  
91  	/** Skip content renames for binary files. */
92  	private boolean skipBinaryFiles = false;
93  
94  	/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
95  	private boolean tableOverflow;
96  
97  	private List<DiffEntry> out;
98  
99  	SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
100 			List<DiffEntry> dsts) {
101 		this.reader = reader;
102 		this.srcs = srcs;
103 		this.dsts = dsts;
104 	}
105 
106 	void setRenameScore(int score) {
107 		renameScore = score;
108 	}
109 
110 	void setBigFileThreshold(int threshold) {
111 		bigFileThreshold = threshold;
112 	}
113 
114 	void setSkipBinaryFiles(boolean value) {
115 		skipBinaryFiles = value;
116 	}
117 
118 	void compute(ProgressMonitor pm) throws IOException, CancelledException {
119 		if (pm == null)
120 			pm = NullProgressMonitor.INSTANCE;
121 
122 		pm.beginTask(JGitText.get().renamesFindingByContent, //
123 				2 * srcs.size() * dsts.size());
124 
125 		int mNext = buildMatrix(pm);
126 		out = new ArrayList<>(Math.min(mNext, dsts.size()));
127 
128 		// Match rename pairs on a first come, first serve basis until
129 		// we have looked at everything that is above our minimum score.
130 		//
131 		for (--mNext; mNext >= 0; mNext--) {
132 			if (pm.isCancelled()) {
133 				// TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
134 				// in next major version
135 				throw new CancelledException(JGitText.get().renameCancelled);
136 			}
137 			long ent = matrix[mNext];
138 			int sIdx = srcFile(ent);
139 			int dIdx = dstFile(ent);
140 			DiffEntry s = srcs.get(sIdx);
141 			DiffEntry d = dsts.get(dIdx);
142 
143 			if (d == null) {
144 				pm.update(1);
145 				continue; // was already matched earlier
146 			}
147 
148 			ChangeType type;
149 			if (s.changeType == ChangeType.DELETE) {
150 				// First use of this source file. Tag it as a rename so we
151 				// later know it is already been used as a rename, other
152 				// matches (if any) will claim themselves as copies instead.
153 				//
154 				s.changeType = ChangeType.RENAME;
155 				type = ChangeType.RENAME;
156 			} else {
157 				type = ChangeType.COPY;
158 			}
159 
160 			out.add(DiffEntry.pair(type, s, d, score(ent)));
161 			dsts.set(dIdx, null); // Claim the destination was matched.
162 			pm.update(1);
163 		}
164 
165 		srcs = compactSrcList(srcs);
166 		dsts = compactDstList(dsts);
167 		pm.endTask();
168 	}
169 
170 	List<DiffEntry> getMatches() {
171 		return out;
172 	}
173 
174 	List<DiffEntry> getLeftOverSources() {
175 		return srcs;
176 	}
177 
178 	List<DiffEntry> getLeftOverDestinations() {
179 		return dsts;
180 	}
181 
182 	boolean isTableOverflow() {
183 		return tableOverflow;
184 	}
185 
186 	private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
187 		ArrayList<DiffEntry> r = new ArrayList<>(in.size());
188 		for (DiffEntry e : in) {
189 			if (e.changeType == ChangeType.DELETE)
190 				r.add(e);
191 		}
192 		return r;
193 	}
194 
195 	private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
196 		ArrayList<DiffEntry> r = new ArrayList<>(in.size());
197 		for (DiffEntry e : in) {
198 			if (e != null)
199 				r.add(e);
200 		}
201 		return r;
202 	}
203 
204 	private int buildMatrix(ProgressMonitor pm)
205 			throws IOException, CancelledException {
206 		// Allocate for the worst-case scenario where every pair has a
207 		// score that we need to consider. We might not need that many.
208 		//
209 		matrix = new long[srcs.size() * dsts.size()];
210 
211 		long[] srcSizes = new long[srcs.size()];
212 		long[] dstSizes = new long[dsts.size()];
213 		BitSet dstTooLarge = null;
214 
215 		// Consider each pair of files, if the score is above the minimum
216 		// threshold we need record that scoring in the matrix so we can
217 		// later find the best matches.
218 		//
219 		int mNext = 0;
220 		SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
221 			DiffEntry srcEnt = srcs.get(srcIdx);
222 			if (!isFile(srcEnt.oldMode)) {
223 				pm.update(dsts.size());
224 				continue;
225 			}
226 
227 			SimilarityIndex s = null;
228 
229 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
230 				if (pm.isCancelled()) {
231 					// TODO(ms): use
232 					// org.eclipse.jgit.api.errors.CanceledException in next
233 					// major version
234 					throw new CancelledException(
235 							JGitText.get().renameCancelled);
236 				}
237 
238 				DiffEntry dstEnt = dsts.get(dstIdx);
239 
240 				if (!isFile(dstEnt.newMode)) {
241 					pm.update(1);
242 					continue;
243 				}
244 
245 				if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
246 					pm.update(1);
247 					continue;
248 				}
249 
250 				if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
251 					pm.update(1);
252 					continue;
253 				}
254 
255 				long srcSize = srcSizes[srcIdx];
256 				if (srcSize == 0) {
257 					srcSize = size(OLD, srcEnt) + 1;
258 					srcSizes[srcIdx] = srcSize;
259 				}
260 
261 				long dstSize = dstSizes[dstIdx];
262 				if (dstSize == 0) {
263 					dstSize = size(NEW, dstEnt) + 1;
264 					dstSizes[dstIdx] = dstSize;
265 				}
266 
267 				long max = Math.max(srcSize, dstSize);
268 				long min = Math.min(srcSize, dstSize);
269 				if (min * 100 / max < renameScore) {
270 					// Cannot possibly match, as the file sizes are so different
271 					pm.update(1);
272 					continue;
273 				}
274 
275 				if (max > bigFileThreshold) {
276 					pm.update(1);
277 					continue;
278 				}
279 
280 				if (s == null) {
281 					try {
282 						ObjectLoader loader = reader.open(OLD, srcEnt);
283 						if (skipBinaryFiles && SimilarityIndex.isBinary(loader)) {
284 							pm.update(1);
285 							continue SRC;
286 						}
287 						s = hash(loader);
288 					} catch (TableFullException tableFull) {
289 						tableOverflow = true;
290 						continue SRC;
291 					}
292 				}
293 
294 				SimilarityIndex d;
295 				try {
296 					ObjectLoader loader = reader.open(NEW, dstEnt);
297 					if (skipBinaryFiles && SimilarityIndex.isBinary(loader)) {
298 						pm.update(1);
299 						continue;
300 					}
301 					d = hash(loader);
302 				} catch (TableFullException tableFull) {
303 					if (dstTooLarge == null)
304 						dstTooLarge = new BitSet(dsts.size());
305 					dstTooLarge.set(dstIdx);
306 					tableOverflow = true;
307 					pm.update(1);
308 					continue;
309 				}
310 
311 				int contentScore = s.score(d, 10000);
312 
313 				// nameScore returns a value between 0 and 100, but we want it
314 				// to be in the same range as the content score. This allows it
315 				// to be dropped into the pretty formula for the final score.
316 				int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
317 
318 				int score = (contentScore * 99 + nameScore * 1) / 10000;
319 
320 				if (score < renameScore) {
321 					pm.update(1);
322 					continue;
323 				}
324 
325 				matrix[mNext++] = encode(score, srcIdx, dstIdx);
326 				pm.update(1);
327 			}
328 		}
329 
330 		// Sort everything in the range we populated, which might be the
331 		// entire matrix, or just a smaller slice if we had some bad low
332 		// scoring pairs.
333 		//
334 		Arrays.sort(matrix, 0, mNext);
335 		return mNext;
336 	}
337 
338 	static int nameScore(String a, String b) {
339 		int aDirLen = a.lastIndexOf('/') + 1;
340 		int bDirLen = b.lastIndexOf('/') + 1;
341 
342 		int dirMin = Math.min(aDirLen, bDirLen);
343 		int dirMax = Math.max(aDirLen, bDirLen);
344 
345 		final int dirScoreLtr;
346 		final int dirScoreRtl;
347 
348 		if (dirMax == 0) {
349 			dirScoreLtr = 100;
350 			dirScoreRtl = 100;
351 		} else {
352 			int dirSim = 0;
353 			for (; dirSim < dirMin; dirSim++) {
354 				if (a.charAt(dirSim) != b.charAt(dirSim))
355 					break;
356 			}
357 			dirScoreLtr = (dirSim * 100) / dirMax;
358 
359 			if (dirScoreLtr == 100) {
360 				dirScoreRtl = 100;
361 			} else {
362 				for (dirSim = 0; dirSim < dirMin; dirSim++) {
363 					if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
364 							- dirSim))
365 						break;
366 				}
367 				dirScoreRtl = (dirSim * 100) / dirMax;
368 			}
369 		}
370 
371 		int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
372 		int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
373 
374 		int fileSim = 0;
375 		for (; fileSim < fileMin; fileSim++) {
376 			if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
377 					- fileSim))
378 				break;
379 		}
380 		int fileScore = (fileSim * 100) / fileMax;
381 
382 		return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
383 	}
384 
385 	private SimilarityIndex hash(ObjectLoader objectLoader)
386 			throws IOException, TableFullException {
387 		SimilarityIndex r = new SimilarityIndex();
388 		r.hash(objectLoader);
389 		r.sort();
390 		return r;
391 	}
392 
393 	private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
394 		return reader.size(side, ent);
395 	}
396 
397 	private static int score(long value) {
398 		return (int) (value >>> SCORE_SHIFT);
399 	}
400 
401 	static int srcFile(long value) {
402 		return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
403 	}
404 
405 	static int dstFile(long value) {
406 		return decodeFile(((int) value) & INDEX_MASK);
407 	}
408 
409 	static long encode(int score, int srcIdx, int dstIdx) {
410 		return (((long) score) << SCORE_SHIFT) //
411 				| (encodeFile(srcIdx) << BITS_PER_INDEX) //
412 				| encodeFile(dstIdx);
413 	}
414 
415 	private static long encodeFile(int idx) {
416 		// We invert the index so that the first file in the list sorts
417 		// later in the table. This permits us to break ties favoring
418 		// earlier names over later ones.
419 		//
420 		return INDEX_MASK - idx;
421 	}
422 
423 	private static int decodeFile(int v) {
424 		return INDEX_MASK - v;
425 	}
426 
427 	private static boolean isFile(FileMode mode) {
428 		return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
429 	}
430 }