View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc.
3    * and other copyright owners as documented in the project's IP log.
4    *
5    * This program and the accompanying materials are made available
6    * under the terms of the Eclipse Distribution License v1.0 which
7    * accompanies this distribution, is reproduced below, and is
8    * available at http://www.eclipse.org/org/documents/edl-v10.php
9    *
10   * All rights reserved.
11   *
12   * Redistribution and use in source and binary forms, with or
13   * without modification, are permitted provided that the following
14   * conditions are met:
15   *
16   * - Redistributions of source code must retain the above copyright
17   *   notice, this list of conditions and the following disclaimer.
18   *
19   * - Redistributions in binary form must reproduce the above
20   *   copyright notice, this list of conditions and the following
21   *   disclaimer in the documentation and/or other materials provided
22   *   with the distribution.
23   *
24   * - Neither the name of the Eclipse Foundation, Inc. nor the
25   *   names of its contributors may be used to endorse or promote
26   *   products derived from this software without specific prior
27   *   written permission.
28   *
29   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42   */
43  
44  package org.eclipse.jgit.diff;
45  
46  import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
47  import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
48  
49  import java.io.IOException;
50  import java.util.ArrayList;
51  import java.util.Arrays;
52  import java.util.BitSet;
53  import java.util.List;
54  
55  import org.eclipse.jgit.diff.DiffEntry.ChangeType;
56  import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
57  import org.eclipse.jgit.internal.JGitText;
58  import org.eclipse.jgit.lib.FileMode;
59  import org.eclipse.jgit.lib.NullProgressMonitor;
60  import org.eclipse.jgit.lib.ProgressMonitor;
61  
62  class SimilarityRenameDetector {
63  	/**
64  	 * Number of bits we need to express an index into src or dst list.
65  	 * <p>
66  	 * This must be 28, giving us a limit of 2^28 entries in either list, which
67  	 * is an insane limit of 536,870,912 file names being considered in a single
68  	 * rename pass. The other 8 bits are used to store the score, while staying
69  	 * under 127 so the long doesn't go negative.
70  	 */
71  	private static final int BITS_PER_INDEX = 28;
72  
73  	private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
74  
75  	private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
76  
77  	private ContentSource.Pair reader;
78  
79  	/**
80  	 * All sources to consider for copies or renames.
81  	 * <p>
82  	 * A source is typically a {@link ChangeType#DELETE} change, but could be
83  	 * another type when trying to perform copy detection concurrently with
84  	 * rename detection.
85  	 */
86  	private List<DiffEntry> srcs;
87  
88  	/**
89  	 * All destinations to consider looking for a rename.
90  	 * <p>
91  	 * A destination is typically an {@link ChangeType#ADD}, as the name has
92  	 * just come into existence, and we want to discover where its initial
93  	 * content came from.
94  	 */
95  	private List<DiffEntry> dsts;
96  
97  	/**
98  	 * Matrix of all examined file pairs, and their scores.
99  	 * <p>
100 	 * The upper 8 bits of each long stores the score, but the score is bounded
101 	 * to be in the range (0, 128] so that the highest bit is never set, and all
102 	 * entries are therefore positive.
103 	 * <p>
104 	 * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
105 	 * as the lower two groups of 28 bits, respectively, but the encoding is
106 	 * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
107 	 * lower list indices later in the matrix, giving precedence to files whose
108 	 * names sort earlier in the tree.
109 	 */
110 	private long[] matrix;
111 
112 	/** Score a pair must exceed to be considered a rename. */
113 	private int renameScore = 60;
114 
115 	/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
116 	private boolean tableOverflow;
117 
118 	private List<DiffEntry> out;
119 
120 	SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
121 			List<DiffEntry> dsts) {
122 		this.reader = reader;
123 		this.srcs = srcs;
124 		this.dsts = dsts;
125 	}
126 
127 	void setRenameScore(int score) {
128 		renameScore = score;
129 	}
130 
131 	void compute(ProgressMonitor pm) throws IOException {
132 		if (pm == null)
133 			pm = NullProgressMonitor.INSTANCE;
134 
135 		pm.beginTask(JGitText.get().renamesFindingByContent, //
136 				2 * srcs.size() * dsts.size());
137 
138 		int mNext = buildMatrix(pm);
139 		out = new ArrayList<DiffEntry>(Math.min(mNext, dsts.size()));
140 
141 		// Match rename pairs on a first come, first serve basis until
142 		// we have looked at everything that is above our minimum score.
143 		//
144 		for (--mNext; mNext >= 0; mNext--) {
145 			long ent = matrix[mNext];
146 			int sIdx = srcFile(ent);
147 			int dIdx = dstFile(ent);
148 			DiffEntry s = srcs.get(sIdx);
149 			DiffEntry d = dsts.get(dIdx);
150 
151 			if (d == null) {
152 				pm.update(1);
153 				continue; // was already matched earlier
154 			}
155 
156 			ChangeType type;
157 			if (s.changeType == ChangeType.DELETE) {
158 				// First use of this source file. Tag it as a rename so we
159 				// later know it is already been used as a rename, other
160 				// matches (if any) will claim themselves as copies instead.
161 				//
162 				s.changeType = ChangeType.RENAME;
163 				type = ChangeType.RENAME;
164 			} else {
165 				type = ChangeType.COPY;
166 			}
167 
168 			out.add(DiffEntry.pair(type, s, d, score(ent)));
169 			dsts.set(dIdx, null); // Claim the destination was matched.
170 			pm.update(1);
171 		}
172 
173 		srcs = compactSrcList(srcs);
174 		dsts = compactDstList(dsts);
175 		pm.endTask();
176 	}
177 
178 	List<DiffEntry> getMatches() {
179 		return out;
180 	}
181 
182 	List<DiffEntry> getLeftOverSources() {
183 		return srcs;
184 	}
185 
186 	List<DiffEntry> getLeftOverDestinations() {
187 		return dsts;
188 	}
189 
190 	boolean isTableOverflow() {
191 		return tableOverflow;
192 	}
193 
194 	private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
195 		ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
196 		for (DiffEntry e : in) {
197 			if (e.changeType == ChangeType.DELETE)
198 				r.add(e);
199 		}
200 		return r;
201 	}
202 
203 	private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
204 		ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
205 		for (DiffEntry e : in) {
206 			if (e != null)
207 				r.add(e);
208 		}
209 		return r;
210 	}
211 
212 	private int buildMatrix(ProgressMonitor pm) throws IOException {
213 		// Allocate for the worst-case scenario where every pair has a
214 		// score that we need to consider. We might not need that many.
215 		//
216 		matrix = new long[srcs.size() * dsts.size()];
217 
218 		long[] srcSizes = new long[srcs.size()];
219 		long[] dstSizes = new long[dsts.size()];
220 		BitSet dstTooLarge = null;
221 
222 		// Consider each pair of files, if the score is above the minimum
223 		// threshold we need record that scoring in the matrix so we can
224 		// later find the best matches.
225 		//
226 		int mNext = 0;
227 		SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
228 			DiffEntry srcEnt = srcs.get(srcIdx);
229 			if (!isFile(srcEnt.oldMode)) {
230 				pm.update(dsts.size());
231 				continue;
232 			}
233 
234 			SimilarityIndex s = null;
235 
236 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
237 				DiffEntry dstEnt = dsts.get(dstIdx);
238 
239 				if (!isFile(dstEnt.newMode)) {
240 					pm.update(1);
241 					continue;
242 				}
243 
244 				if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
245 					pm.update(1);
246 					continue;
247 				}
248 
249 				if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
250 					pm.update(1);
251 					continue;
252 				}
253 
254 				long srcSize = srcSizes[srcIdx];
255 				if (srcSize == 0) {
256 					srcSize = size(OLD, srcEnt) + 1;
257 					srcSizes[srcIdx] = srcSize;
258 				}
259 
260 				long dstSize = dstSizes[dstIdx];
261 				if (dstSize == 0) {
262 					dstSize = size(NEW, dstEnt) + 1;
263 					dstSizes[dstIdx] = dstSize;
264 				}
265 
266 				long max = Math.max(srcSize, dstSize);
267 				long min = Math.min(srcSize, dstSize);
268 				if (min * 100 / max < renameScore) {
269 					// Cannot possibly match, as the file sizes are so different
270 					pm.update(1);
271 					continue;
272 				}
273 
274 				if (s == null) {
275 					try {
276 						s = hash(OLD, srcEnt);
277 					} catch (TableFullException tableFull) {
278 						tableOverflow = true;
279 						continue SRC;
280 					}
281 				}
282 
283 				SimilarityIndex d;
284 				try {
285 					d = hash(NEW, dstEnt);
286 				} catch (TableFullException tableFull) {
287 					if (dstTooLarge == null)
288 						dstTooLarge = new BitSet(dsts.size());
289 					dstTooLarge.set(dstIdx);
290 					tableOverflow = true;
291 					pm.update(1);
292 					continue;
293 				}
294 
295 				int contentScore = s.score(d, 10000);
296 
297 				// nameScore returns a value between 0 and 100, but we want it
298 				// to be in the same range as the content score. This allows it
299 				// to be dropped into the pretty formula for the final score.
300 				int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
301 
302 				int score = (contentScore * 99 + nameScore * 1) / 10000;
303 
304 				if (score < renameScore) {
305 					pm.update(1);
306 					continue;
307 				}
308 
309 				matrix[mNext++] = encode(score, srcIdx, dstIdx);
310 				pm.update(1);
311 			}
312 		}
313 
314 		// Sort everything in the range we populated, which might be the
315 		// entire matrix, or just a smaller slice if we had some bad low
316 		// scoring pairs.
317 		//
318 		Arrays.sort(matrix, 0, mNext);
319 		return mNext;
320 	}
321 
322 	static int nameScore(String a, String b) {
323 	    int aDirLen = a.lastIndexOf("/") + 1; //$NON-NLS-1$
324 	    int bDirLen = b.lastIndexOf("/") + 1; //$NON-NLS-1$
325 
326 	    int dirMin = Math.min(aDirLen, bDirLen);
327 	    int dirMax = Math.max(aDirLen, bDirLen);
328 
329 	    final int dirScoreLtr;
330 	    final int dirScoreRtl;
331 
332 		if (dirMax == 0) {
333 			dirScoreLtr = 100;
334 			dirScoreRtl = 100;
335 		} else {
336 			int dirSim = 0;
337 			for (; dirSim < dirMin; dirSim++) {
338 				if (a.charAt(dirSim) != b.charAt(dirSim))
339 					break;
340 			}
341 			dirScoreLtr = (dirSim * 100) / dirMax;
342 
343 			if (dirScoreLtr == 100) {
344 				dirScoreRtl = 100;
345 			} else {
346 				for (dirSim = 0; dirSim < dirMin; dirSim++) {
347 					if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
348 							- dirSim))
349 						break;
350 				}
351 				dirScoreRtl = (dirSim * 100) / dirMax;
352 			}
353 		}
354 
355 		int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
356 		int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
357 
358 		int fileSim = 0;
359 		for (; fileSim < fileMin; fileSim++) {
360 			if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
361 					- fileSim))
362 				break;
363 		}
364 		int fileScore = (fileSim * 100) / fileMax;
365 
366 		return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
367 	}
368 
369 	private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
370 			throws IOException, TableFullException {
371 		SimilarityIndex r = new SimilarityIndex();
372 		r.hash(reader.open(side, ent));
373 		r.sort();
374 		return r;
375 	}
376 
377 	private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
378 		return reader.size(side, ent);
379 	}
380 
381 	private static int score(long value) {
382 		return (int) (value >>> SCORE_SHIFT);
383 	}
384 
385 	static int srcFile(long value) {
386 		return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
387 	}
388 
389 	static int dstFile(long value) {
390 		return decodeFile(((int) value) & INDEX_MASK);
391 	}
392 
393 	static long encode(int score, int srcIdx, int dstIdx) {
394 		return (((long) score) << SCORE_SHIFT) //
395 				| (encodeFile(srcIdx) << BITS_PER_INDEX) //
396 				| encodeFile(dstIdx);
397 	}
398 
399 	private static long encodeFile(int idx) {
400 		// We invert the index so that the first file in the list sorts
401 		// later in the table. This permits us to break ties favoring
402 		// earlier names over later ones.
403 		//
404 		return INDEX_MASK - idx;
405 	}
406 
407 	private static int decodeFile(int v) {
408 		return INDEX_MASK - v;
409 	}
410 
411 	private static boolean isFile(FileMode mode) {
412 		return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
413 	}
414 }