View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc.
3    * and other copyright owners as documented in the project's IP log.
4    *
5    * This program and the accompanying materials are made available
6    * under the terms of the Eclipse Distribution License v1.0 which
7    * accompanies this distribution, is reproduced below, and is
8    * available at http://www.eclipse.org/org/documents/edl-v10.php
9    *
10   * All rights reserved.
11   *
12   * Redistribution and use in source and binary forms, with or
13   * without modification, are permitted provided that the following
14   * conditions are met:
15   *
16   * - Redistributions of source code must retain the above copyright
17   *   notice, this list of conditions and the following disclaimer.
18   *
19   * - Redistributions in binary form must reproduce the above
20   *   copyright notice, this list of conditions and the following
21   *   disclaimer in the documentation and/or other materials provided
22   *   with the distribution.
23   *
24   * - Neither the name of the Eclipse Foundation, Inc. nor the
25   *   names of its contributors may be used to endorse or promote
26   *   products derived from this software without specific prior
27   *   written permission.
28   *
29   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42   */
43  
44  package org.eclipse.jgit.diff;
45  
46  import java.io.EOFException;
47  import java.io.IOException;
48  import java.io.InputStream;
49  import java.util.Arrays;
50  
51  import org.eclipse.jgit.errors.MissingObjectException;
52  import org.eclipse.jgit.lib.ObjectLoader;
53  import org.eclipse.jgit.lib.ObjectStream;
54  
55  /**
56   * Index structure of lines/blocks in one file.
57   * <p>
58   * This structure can be used to compute an approximation of the similarity
59   * between two files. The index is used by
60   * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
61   * between files.
62   * <p>
63   * To save space in memory, this index uses a space efficient encoding which
64   * will not exceed 1 MiB per instance. The index starts out at a smaller size
65   * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
66   * file are discovered.
67   *
68   * @since 4.0
69   */
70  public class SimilarityIndex {
71  	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
72  	public static final TableFullException
73  			TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
74  
75  	/**
76  	 * Shift to apply before storing a key.
77  	 * <p>
78  	 * Within the 64 bit table record space, we leave the highest bit unset so
79  	 * all values are positive. The lower 32 bits to count bytes.
80  	 */
81  	private static final int KEY_SHIFT = 32;
82  
83  	/** Maximum value of the count field, also mask to extract the count. */
84  	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
85  
86  	/**
87  	 * Total amount of bytes hashed into the structure, including \n. This is
88  	 * usually the size of the file minus number of CRLF encounters.
89  	 */
90  	private long hashedCnt;
91  
92  	/** Number of non-zero entries in {@link #idHash}. */
93  	private int idSize;
94  
95  	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
96  	private int idGrowAt;
97  
98  	/**
99  	 * Pairings of content keys and counters.
100 	 * <p>
101 	 * Slots in the table are actually two ints wedged into a single long. The
102 	 * upper 32 bits stores the content key, and the remaining lower bits stores
103 	 * the number of bytes associated with that key. Empty slots are denoted by
104 	 * 0, which cannot occur because the count cannot be 0. Values can only be
105 	 * positive, which we enforce during key addition.
106 	 */
107 	private long[] idHash;
108 
109 	/** {@code idHash.length == 1 << idHashBits}. */
110 	private int idHashBits;
111 
112 	/**
113 	 * Create a new similarity index for the given object
114 	 *
115 	 * @param obj
116 	 *            the object to hash
117 	 * @return similarity index for this object
118 	 * @throws java.io.IOException
119 	 *             file contents cannot be read from the repository.
120 	 * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
121 	 *             object hashing overflowed the storage capacity of the
122 	 *             SimilarityIndex.
123 	 */
124 	public static SimilarityIndex create(ObjectLoader obj) throws IOException,
125 			TableFullException {
126 		SimilarityIndex idx = new SimilarityIndex();
127 		idx.hash(obj);
128 		idx.sort();
129 		return idx;
130 	}
131 
132 	SimilarityIndex() {
133 		idHashBits = 8;
134 		idHash = new long[1 << idHashBits];
135 		idGrowAt = growAt(idHashBits);
136 	}
137 
138 	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
139 			TableFullException {
140 		if (obj.isLarge()) {
141 			hashLargeObject(obj);
142 		} else {
143 			byte[] raw = obj.getCachedBytes();
144 			hash(raw, 0, raw.length);
145 		}
146 	}
147 
148 	private void hashLargeObject(ObjectLoader obj) throws IOException,
149 			TableFullException {
150 		ObjectStream in1 = obj.openStream();
151 		boolean text;
152 		try {
153 			text = !RawText.isBinary(in1);
154 		} finally {
155 			in1.close();
156 		}
157 
158 		ObjectStream in2 = obj.openStream();
159 		try {
160 			hash(in2, in2.getSize(), text);
161 		} finally {
162 			in2.close();
163 		}
164 	}
165 
166 	void hash(byte[] raw, int ptr, final int end) throws TableFullException {
167 		final boolean text = !RawText.isBinary(raw);
168 		hashedCnt = 0;
169 		while (ptr < end) {
170 			int hash = 5381;
171 			int blockHashedCnt = 0;
172 			int start = ptr;
173 
174 			// Hash one line, or one block, whichever occurs first.
175 			do {
176 				int c = raw[ptr++] & 0xff;
177 				// Ignore CR in CRLF sequence if text
178 				if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
179 					continue;
180 				blockHashedCnt++;
181 				if (c == '\n')
182 					break;
183 				hash = (hash << 5) + hash + c;
184 			} while (ptr < end && ptr - start < 64);
185 			hashedCnt += blockHashedCnt;
186 			add(hash, blockHashedCnt);
187 		}
188 	}
189 
190 	void hash(InputStream in, long remaining, boolean text) throws IOException,
191 			TableFullException {
192 		byte[] buf = new byte[4096];
193 		int ptr = 0;
194 		int cnt = 0;
195 
196 		while (0 < remaining) {
197 			int hash = 5381;
198 			int blockHashedCnt = 0;
199 
200 			// Hash one line, or one block, whichever occurs first.
201 			int n = 0;
202 			do {
203 				if (ptr == cnt) {
204 					ptr = 0;
205 					cnt = in.read(buf, 0, buf.length);
206 					if (cnt <= 0)
207 						throw new EOFException();
208 				}
209 
210 				n++;
211 				int c = buf[ptr++] & 0xff;
212 				// Ignore CR in CRLF sequence if text
213 				if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
214 					continue;
215 				blockHashedCnt++;
216 				if (c == '\n')
217 					break;
218 				hash = (hash << 5) + hash + c;
219 			} while (n < 64 && n < remaining);
220 			hashedCnt += blockHashedCnt;
221 			add(hash, blockHashedCnt);
222 			remaining -= n;
223 		}
224 	}
225 
226 	/**
227 	 * Sort the internal table so it can be used for efficient scoring.
228 	 * <p>
229 	 * Once sorted, additional lines/blocks cannot be added to the index.
230 	 */
231 	void sort() {
232 		// Sort the array. All of the empty space will wind up at the front,
233 		// because we forced all of the keys to always be positive. Later
234 		// we only work with the back half of the array.
235 		//
236 		Arrays.sort(idHash);
237 	}
238 
239 	/**
240 	 * Compute the similarity score between this index and another.
241 	 * <p>
242 	 * A region of a file is defined as a line in a text file or a fixed-size
243 	 * block in a binary file. To prepare an index, each region in the file is
244 	 * hashed; the values and counts of hashes are retained in a sorted table.
245 	 * Define the similarity fraction F as the the count of matching regions
246 	 * between the two files divided between the maximum count of regions in
247 	 * either file. The similarity score is F multiplied by the maxScore
248 	 * constant, yielding a range [0, maxScore]. It is defined as maxScore for
249 	 * the degenerate case of two empty files.
250 	 * <p>
251 	 * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
252 	 *
253 	 * @param dst
254 	 *            the other index
255 	 * @param maxScore
256 	 *            the score representing a 100% match
257 	 * @return the similarity score
258 	 */
259 	public int score(SimilarityIndex dst, int maxScore) {
260 		long max = Math.max(hashedCnt, dst.hashedCnt);
261 		if (max == 0)
262 			return maxScore;
263 		return (int) ((common(dst) * maxScore) / max);
264 	}
265 
266 	long common(SimilarityIndex dst) {
267 		return common(this, dst);
268 	}
269 
270 	private static long common(SimilarityIndex src, SimilarityIndex dst) {
271 		int srcIdx = src.packedIndex(0);
272 		int dstIdx = dst.packedIndex(0);
273 		long[] srcHash = src.idHash;
274 		long[] dstHash = dst.idHash;
275 		return common(srcHash, srcIdx, dstHash, dstIdx);
276 	}
277 
278 	private static long common(long[] srcHash, int srcIdx, //
279 			long[] dstHash, int dstIdx) {
280 		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
281 			return 0;
282 
283 		long common = 0;
284 		int srcKey = keyOf(srcHash[srcIdx]);
285 		int dstKey = keyOf(dstHash[dstIdx]);
286 
287 		for (;;) {
288 			if (srcKey == dstKey) {
289 				common += Math.min(countOf(srcHash[srcIdx]),
290 						countOf(dstHash[dstIdx]));
291 
292 				if (++srcIdx == srcHash.length)
293 					break;
294 				srcKey = keyOf(srcHash[srcIdx]);
295 
296 				if (++dstIdx == dstHash.length)
297 					break;
298 				dstKey = keyOf(dstHash[dstIdx]);
299 
300 			} else if (srcKey < dstKey) {
301 				// Regions of src which do not appear in dst.
302 				if (++srcIdx == srcHash.length)
303 					break;
304 				srcKey = keyOf(srcHash[srcIdx]);
305 
306 			} else /* if (dstKey < srcKey) */{
307 				// Regions of dst which do not appear in src.
308 				if (++dstIdx == dstHash.length)
309 					break;
310 				dstKey = keyOf(dstHash[dstIdx]);
311 			}
312 		}
313 
314 		return common;
315 	}
316 
317 	// Testing only
318 	int size() {
319 		return idSize;
320 	}
321 
322 	// Testing only
323 	int key(int idx) {
324 		return keyOf(idHash[packedIndex(idx)]);
325 	}
326 
327 	// Testing only
328 	long count(int idx) {
329 		return countOf(idHash[packedIndex(idx)]);
330 	}
331 
332 	// Brute force approach only for testing.
333 	int findIndex(int key) {
334 		for (int i = 0; i < idSize; i++)
335 			if (key(i) == key)
336 				return i;
337 		return -1;
338 	}
339 
340 	private int packedIndex(int idx) {
341 		return (idHash.length - idSize) + idx;
342 	}
343 
344 	void add(int key, int cnt) throws TableFullException {
345 		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
346 
347 		int j = slot(key);
348 		for (;;) {
349 			long v = idHash[j];
350 			if (v == 0) {
351 				// Empty slot in the table, store here.
352 				if (idGrowAt <= idSize) {
353 					grow();
354 					j = slot(key);
355 					continue;
356 				}
357 				idHash[j] = pair(key, cnt);
358 				idSize++;
359 				return;
360 
361 			} else if (keyOf(v) == key) {
362 				// Same key, increment the counter. If it overflows, fail
363 				// indexing to prevent the key from being impacted.
364 				//
365 				idHash[j] = pair(key, countOf(v) + cnt);
366 				return;
367 
368 			} else if (++j >= idHash.length) {
369 				j = 0;
370 			}
371 		}
372 	}
373 
374 	private static long pair(int key, long cnt) throws TableFullException {
375 		if (MAX_COUNT < cnt)
376 			throw new TableFullException();
377 		return (((long) key) << KEY_SHIFT) | cnt;
378 	}
379 
380 	private int slot(int key) {
381 		// We use 31 - idHashBits because the upper bit was already forced
382 		// to be 0 and we want the remaining high bits to be used as the
383 		// table slot.
384 		//
385 		return key >>> (31 - idHashBits);
386 	}
387 
388 	private static int growAt(int idHashBits) {
389 		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
390 	}
391 
392 	private void grow() throws TableFullException {
393 		if (idHashBits == 30)
394 			throw new TableFullException();
395 
396 		long[] oldHash = idHash;
397 		int oldSize = idHash.length;
398 
399 		idHashBits++;
400 		idGrowAt = growAt(idHashBits);
401 
402 		try {
403 			idHash = new long[1 << idHashBits];
404 		} catch (OutOfMemoryError noMemory) {
405 			throw TABLE_FULL_OUT_OF_MEMORY;
406 		}
407 
408 		for (int i = 0; i < oldSize; i++) {
409 			long v = oldHash[i];
410 			if (v != 0) {
411 				int j = slot(keyOf(v));
412 				while (idHash[j] != 0)
413 					if (++j >= idHash.length)
414 						j = 0;
415 				idHash[j] = v;
416 			}
417 		}
418 	}
419 
420 	private static int keyOf(long v) {
421 		return (int) (v >>> KEY_SHIFT);
422 	}
423 
424 	private static long countOf(long v) {
425 		return v & MAX_COUNT;
426 	}
427 
428 	/** Thrown by {@code create()} when file is too large. */
429 	public static class TableFullException extends Exception {
430 		private static final long serialVersionUID = 1L;
431 	}
432 }