View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc. and others
3    *
4    * This program and the accompanying materials are made available under the
5    * terms of the Eclipse Distribution License v. 1.0 which is available at
6    * https://www.eclipse.org/org/documents/edl-v10.php.
7    *
8    * SPDX-License-Identifier: BSD-3-Clause
9    */
10  
11  package org.eclipse.jgit.diff;
12  
13  import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
14  import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
15  
16  import java.io.IOException;
17  import java.util.ArrayList;
18  import java.util.Arrays;
19  import java.util.BitSet;
20  import java.util.List;
21  
22  import org.eclipse.jgit.diff.DiffEntry.ChangeType;
23  import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
24  import org.eclipse.jgit.errors.CancelledException;
25  import org.eclipse.jgit.internal.JGitText;
26  import org.eclipse.jgit.lib.FileMode;
27  import org.eclipse.jgit.lib.NullProgressMonitor;
28  import org.eclipse.jgit.lib.ProgressMonitor;
29  
30  class SimilarityRenameDetector {
31  	/**
32  	 * Number of bits we need to express an index into src or dst list.
33  	 * <p>
34  	 * This must be 28, giving us a limit of 2^28 entries in either list, which
35  	 * is an insane limit of 536,870,912 file names being considered in a single
36  	 * rename pass. The other 8 bits are used to store the score, while staying
37  	 * under 127 so the long doesn't go negative.
38  	 */
39  	private static final int BITS_PER_INDEX = 28;
40  
41  	private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
42  
43  	private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
44  
45  	private ContentSource.Pair reader;
46  
47  	/**
48  	 * All sources to consider for copies or renames.
49  	 * <p>
50  	 * A source is typically a {@link ChangeType#DELETE} change, but could be
51  	 * another type when trying to perform copy detection concurrently with
52  	 * rename detection.
53  	 */
54  	private List<DiffEntry> srcs;
55  
56  	/**
57  	 * All destinations to consider looking for a rename.
58  	 * <p>
59  	 * A destination is typically an {@link ChangeType#ADD}, as the name has
60  	 * just come into existence, and we want to discover where its initial
61  	 * content came from.
62  	 */
63  	private List<DiffEntry> dsts;
64  
65  	/**
66  	 * Matrix of all examined file pairs, and their scores.
67  	 * <p>
68  	 * The upper 8 bits of each long stores the score, but the score is bounded
69  	 * to be in the range (0, 128] so that the highest bit is never set, and all
70  	 * entries are therefore positive.
71  	 * <p>
72  	 * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
73  	 * as the lower two groups of 28 bits, respectively, but the encoding is
74  	 * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
75  	 * lower list indices later in the matrix, giving precedence to files whose
76  	 * names sort earlier in the tree.
77  	 */
78  	private long[] matrix;
79  
80  	/** Score a pair must exceed to be considered a rename. */
81  	private int renameScore = 60;
82  
83  	/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
84  	private boolean tableOverflow;
85  
86  	private List<DiffEntry> out;
87  
88  	SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
89  			List<DiffEntry> dsts) {
90  		this.reader = reader;
91  		this.srcs = srcs;
92  		this.dsts = dsts;
93  	}
94  
95  	void setRenameScore(int score) {
96  		renameScore = score;
97  	}
98  
99  	void compute(ProgressMonitor pm) throws IOException, CancelledException {
100 		if (pm == null)
101 			pm = NullProgressMonitor.INSTANCE;
102 
103 		pm.beginTask(JGitText.get().renamesFindingByContent, //
104 				2 * srcs.size() * dsts.size());
105 
106 		int mNext = buildMatrix(pm);
107 		out = new ArrayList<>(Math.min(mNext, dsts.size()));
108 
109 		// Match rename pairs on a first come, first serve basis until
110 		// we have looked at everything that is above our minimum score.
111 		//
112 		for (--mNext; mNext >= 0; mNext--) {
113 			if (pm.isCancelled()) {
114 				// TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
115 				// in next major version
116 				throw new CancelledException(JGitText.get().renameCancelled);
117 			}
118 			long ent = matrix[mNext];
119 			int sIdx = srcFile(ent);
120 			int dIdx = dstFile(ent);
121 			DiffEntry s = srcs.get(sIdx);
122 			DiffEntry d = dsts.get(dIdx);
123 
124 			if (d == null) {
125 				pm.update(1);
126 				continue; // was already matched earlier
127 			}
128 
129 			ChangeType type;
130 			if (s.changeType == ChangeType.DELETE) {
131 				// First use of this source file. Tag it as a rename so we
132 				// later know it is already been used as a rename, other
133 				// matches (if any) will claim themselves as copies instead.
134 				//
135 				s.changeType = ChangeType.RENAME;
136 				type = ChangeType.RENAME;
137 			} else {
138 				type = ChangeType.COPY;
139 			}
140 
141 			out.add(DiffEntry.pair(type, s, d, score(ent)));
142 			dsts.set(dIdx, null); // Claim the destination was matched.
143 			pm.update(1);
144 		}
145 
146 		srcs = compactSrcList(srcs);
147 		dsts = compactDstList(dsts);
148 		pm.endTask();
149 	}
150 
151 	List<DiffEntry> getMatches() {
152 		return out;
153 	}
154 
155 	List<DiffEntry> getLeftOverSources() {
156 		return srcs;
157 	}
158 
159 	List<DiffEntry> getLeftOverDestinations() {
160 		return dsts;
161 	}
162 
163 	boolean isTableOverflow() {
164 		return tableOverflow;
165 	}
166 
167 	private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
168 		ArrayList<DiffEntry> r = new ArrayList<>(in.size());
169 		for (DiffEntry e : in) {
170 			if (e.changeType == ChangeType.DELETE)
171 				r.add(e);
172 		}
173 		return r;
174 	}
175 
176 	private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
177 		ArrayList<DiffEntry> r = new ArrayList<>(in.size());
178 		for (DiffEntry e : in) {
179 			if (e != null)
180 				r.add(e);
181 		}
182 		return r;
183 	}
184 
185 	private int buildMatrix(ProgressMonitor pm)
186 			throws IOException, CancelledException {
187 		// Allocate for the worst-case scenario where every pair has a
188 		// score that we need to consider. We might not need that many.
189 		//
190 		matrix = new long[srcs.size() * dsts.size()];
191 
192 		long[] srcSizes = new long[srcs.size()];
193 		long[] dstSizes = new long[dsts.size()];
194 		BitSet dstTooLarge = null;
195 
196 		// Consider each pair of files, if the score is above the minimum
197 		// threshold we need record that scoring in the matrix so we can
198 		// later find the best matches.
199 		//
200 		int mNext = 0;
201 		SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
202 			DiffEntry srcEnt = srcs.get(srcIdx);
203 			if (!isFile(srcEnt.oldMode)) {
204 				pm.update(dsts.size());
205 				continue;
206 			}
207 
208 			SimilarityIndex s = null;
209 
210 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
211 				if (pm.isCancelled()) {
212 					// TODO(ms): use
213 					// org.eclipse.jgit.api.errors.CanceledException in next
214 					// major version
215 					throw new CancelledException(
216 							JGitText.get().renameCancelled);
217 				}
218 
219 				DiffEntry dstEnt = dsts.get(dstIdx);
220 
221 				if (!isFile(dstEnt.newMode)) {
222 					pm.update(1);
223 					continue;
224 				}
225 
226 				if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
227 					pm.update(1);
228 					continue;
229 				}
230 
231 				if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
232 					pm.update(1);
233 					continue;
234 				}
235 
236 				long srcSize = srcSizes[srcIdx];
237 				if (srcSize == 0) {
238 					srcSize = size(OLD, srcEnt) + 1;
239 					srcSizes[srcIdx] = srcSize;
240 				}
241 
242 				long dstSize = dstSizes[dstIdx];
243 				if (dstSize == 0) {
244 					dstSize = size(NEW, dstEnt) + 1;
245 					dstSizes[dstIdx] = dstSize;
246 				}
247 
248 				long max = Math.max(srcSize, dstSize);
249 				long min = Math.min(srcSize, dstSize);
250 				if (min * 100 / max < renameScore) {
251 					// Cannot possibly match, as the file sizes are so different
252 					pm.update(1);
253 					continue;
254 				}
255 
256 				if (s == null) {
257 					try {
258 						s = hash(OLD, srcEnt);
259 					} catch (TableFullException tableFull) {
260 						tableOverflow = true;
261 						continue SRC;
262 					}
263 				}
264 
265 				SimilarityIndex d;
266 				try {
267 					d = hash(NEW, dstEnt);
268 				} catch (TableFullException tableFull) {
269 					if (dstTooLarge == null)
270 						dstTooLarge = new BitSet(dsts.size());
271 					dstTooLarge.set(dstIdx);
272 					tableOverflow = true;
273 					pm.update(1);
274 					continue;
275 				}
276 
277 				int contentScore = s.score(d, 10000);
278 
279 				// nameScore returns a value between 0 and 100, but we want it
280 				// to be in the same range as the content score. This allows it
281 				// to be dropped into the pretty formula for the final score.
282 				int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
283 
284 				int score = (contentScore * 99 + nameScore * 1) / 10000;
285 
286 				if (score < renameScore) {
287 					pm.update(1);
288 					continue;
289 				}
290 
291 				matrix[mNext++] = encode(score, srcIdx, dstIdx);
292 				pm.update(1);
293 			}
294 		}
295 
296 		// Sort everything in the range we populated, which might be the
297 		// entire matrix, or just a smaller slice if we had some bad low
298 		// scoring pairs.
299 		//
300 		Arrays.sort(matrix, 0, mNext);
301 		return mNext;
302 	}
303 
304 	static int nameScore(String a, String b) {
305 		int aDirLen = a.lastIndexOf('/') + 1;
306 		int bDirLen = b.lastIndexOf('/') + 1;
307 
308 		int dirMin = Math.min(aDirLen, bDirLen);
309 		int dirMax = Math.max(aDirLen, bDirLen);
310 
311 		final int dirScoreLtr;
312 		final int dirScoreRtl;
313 
314 		if (dirMax == 0) {
315 			dirScoreLtr = 100;
316 			dirScoreRtl = 100;
317 		} else {
318 			int dirSim = 0;
319 			for (; dirSim < dirMin; dirSim++) {
320 				if (a.charAt(dirSim) != b.charAt(dirSim))
321 					break;
322 			}
323 			dirScoreLtr = (dirSim * 100) / dirMax;
324 
325 			if (dirScoreLtr == 100) {
326 				dirScoreRtl = 100;
327 			} else {
328 				for (dirSim = 0; dirSim < dirMin; dirSim++) {
329 					if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
330 							- dirSim))
331 						break;
332 				}
333 				dirScoreRtl = (dirSim * 100) / dirMax;
334 			}
335 		}
336 
337 		int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
338 		int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
339 
340 		int fileSim = 0;
341 		for (; fileSim < fileMin; fileSim++) {
342 			if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
343 					- fileSim))
344 				break;
345 		}
346 		int fileScore = (fileSim * 100) / fileMax;
347 
348 		return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
349 	}
350 
351 	private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
352 			throws IOException, TableFullException {
353 		SimilarityIndex r = new SimilarityIndex();
354 		r.hash(reader.open(side, ent));
355 		r.sort();
356 		return r;
357 	}
358 
359 	private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
360 		return reader.size(side, ent);
361 	}
362 
363 	private static int score(long value) {
364 		return (int) (value >>> SCORE_SHIFT);
365 	}
366 
367 	static int srcFile(long value) {
368 		return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
369 	}
370 
371 	static int dstFile(long value) {
372 		return decodeFile(((int) value) & INDEX_MASK);
373 	}
374 
375 	static long encode(int score, int srcIdx, int dstIdx) {
376 		return (((long) score) << SCORE_SHIFT) //
377 				| (encodeFile(srcIdx) << BITS_PER_INDEX) //
378 				| encodeFile(dstIdx);
379 	}
380 
381 	private static long encodeFile(int idx) {
382 		// We invert the index so that the first file in the list sorts
383 		// later in the table. This permits us to break ties favoring
384 		// earlier names over later ones.
385 		//
386 		return INDEX_MASK - idx;
387 	}
388 
389 	private static int decodeFile(int v) {
390 		return INDEX_MASK - v;
391 	}
392 
393 	private static boolean isFile(FileMode mode) {
394 		return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
395 	}
396 }