/*
 * Copyright (C) 2010, Google Inc. and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.diff;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectStream;

/**
 * Index structure of lines/blocks in one file.
 * <p>
 * This structure can be used to compute an approximation of the similarity
 * between two files. The index is used by
 * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
 * between files.
 * <p>
 * To save space in memory, this index uses a space efficient encoding which
 * will not exceed 1 MiB per instance. The index starts out at a smaller size
 * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
 * file are discovered.
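 * <p>
 * A minimal usage sketch (illustrative only; {@code loaderA} and
 * {@code loaderB} stand for {@link org.eclipse.jgit.lib.ObjectLoader}s of the
 * two files being compared):
 *
 * <pre>{@code
 * SimilarityIndex a = SimilarityIndex.create(loaderA);
 * SimilarityIndex b = SimilarityIndex.create(loaderB);
 * int score = a.score(b, 100); // 0..100; 100 means all hashed regions match
 * }</pre>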
 *
 * @since 4.0
 */
public class SimilarityIndex {
	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
	public static final TableFullException
			TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

	/**
	 * Shift to apply before storing a key.
	 * <p>
	 * Within the 64 bit table record space, we leave the highest bit unset so
	 * all values are positive. The lower 32 bits are used to count bytes.
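	 * <p>
	 * Layout of one packed entry (an illustrative sketch):
	 *
	 * <pre>
	 * bit  63      : always zero (keys are kept non-negative)
	 * bits 62 .. 32: content key
	 * bits 31 ..  0: byte count
	 * </pre>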
	 */
	private static final int KEY_SHIFT = 32;

	/** Maximum value of the count field, also mask to extract the count. */
	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

	/**
	 * Total number of bytes hashed into the structure, including \n. This is
	 * usually the size of the file minus the number of CRLF sequences.
	 */
	private long hashedCnt;

	/** Number of non-zero entries in {@link #idHash}. */
	private int idSize;

	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
	private int idGrowAt;

	/**
	 * Pairings of content keys and counters.
	 * <p>
	 * Slots in the table are actually two ints wedged into a single long. The
	 * upper 32 bits store the content key, and the remaining lower bits store
	 * the number of bytes associated with that key. Empty slots are denoted by
	 * 0, a value no real entry can have because the count is never 0. Values
	 * can only be positive, which we enforce during key addition.
	 */
	private long[] idHash;

	/** {@code idHash.length == 1 << idHashBits}. */
	private int idHashBits;

	/**
	 * Create a new similarity index for the given object.
	 *
	 * @param obj
	 *            the object to hash
	 * @return similarity index for this object
	 * @throws java.io.IOException
	 *             file contents cannot be read from the repository.
	 * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
	 *             object hashing overflowed the storage capacity of the
	 *             SimilarityIndex.
	 */
	public static SimilarityIndex create(ObjectLoader obj) throws IOException,
			TableFullException {
		SimilarityIndex idx = new SimilarityIndex();
		idx.hash(obj);
		idx.sort();
		return idx;
	}

	SimilarityIndex() {
		idHashBits = 8;
		idHash = new long[1 << idHashBits];
		idGrowAt = growAt(idHashBits);
	}

	static boolean isBinary(ObjectLoader obj) throws IOException {
		if (obj.isLarge()) {
			try (ObjectStream in1 = obj.openStream()) {
				return RawText.isBinary(in1);
			}
		}
		byte[] raw = obj.getCachedBytes();
		return RawText.isBinary(raw, raw.length, true);
	}

	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
			TableFullException {
		if (obj.isLarge()) {
			hashLargeObject(obj);
		} else {
			byte[] raw = obj.getCachedBytes();
			hash(raw, 0, raw.length);
		}
	}

	private void hashLargeObject(ObjectLoader obj) throws IOException,
			TableFullException {
		boolean text = !isBinary(obj);

		try (ObjectStream in2 = obj.openStream()) {
			hash(in2, in2.getSize(), text);
		}
	}

	void hash(byte[] raw, int ptr, int end) throws TableFullException {
		final boolean text = !RawText.isBinary(raw, raw.length, true);
		hashedCnt = 0;
		while (ptr < end) {
			int hash = 5381;
			int blockHashedCnt = 0;
			int start = ptr;

			// Hash one line, or one block, whichever occurs first.
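			// The rolling hash is the djb2-style update h = h * 33 + c (seed
			// 5381); a block is capped at 64 bytes if no '\n' is seen sooner.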
			do {
				int c = raw[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (ptr < end && ptr - start < 64);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
		}
	}

	void hash(InputStream in, long remaining, boolean text) throws IOException,
			TableFullException {
		byte[] buf = new byte[4096];
		int ptr = 0;
		int cnt = 0;

		while (0 < remaining) {
			int hash = 5381;
			int blockHashedCnt = 0;

			// Hash one line, or one block, whichever occurs first.
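			// Same hashing as hash(byte[], int, int), but the content is
			// pulled through a 4 KiB read buffer instead of a cached array.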
			int n = 0;
			do {
				if (ptr == cnt) {
					ptr = 0;
					cnt = in.read(buf, 0, buf.length);
					if (cnt <= 0)
						throw new EOFException();
				}

				n++;
				int c = buf[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (n < 64 && n < remaining);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
			remaining -= n;
		}
	}

	/**
	 * Sort the internal table so it can be used for efficient scoring.
	 * <p>
	 * Once sorted, additional lines/blocks cannot be added to the index.
	 */
	void sort() {
		// Sort the array. All of the empty space will wind up at the front,
		// because we forced all of the keys to always be positive. Later
		// we only work with the back half of the array.
		//
		Arrays.sort(idHash);
	}

	/**
	 * Compute the similarity score between this index and another.
	 * <p>
	 * A region of a file is defined as a line in a text file or a fixed-size
	 * block in a binary file. To prepare an index, each region in the file is
	 * hashed; the values and counts of hashes are retained in a sorted table.
	 * Define the similarity fraction F as the count of matching regions
	 * between the two files divided by the maximum count of regions in
	 * either file. The similarity score is F multiplied by the maxScore
	 * constant, yielding a range [0, maxScore]. It is defined as maxScore for
	 * the degenerate case of two empty files.
	 * <p>
	 * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
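	 * <p>
	 * For example (illustrative numbers only): if one file hashes 100 bytes of
	 * regions, the other hashes 80 bytes, and 60 bytes' worth of regions are
	 * common to both, then F = 60 / 100 and score(dst, 100) returns 60.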
	 *
	 * @param dst
	 *            the other index
	 * @param maxScore
	 *            the score representing a 100% match
	 * @return the similarity score
	 */
	public int score(SimilarityIndex dst, int maxScore) {
		long max = Math.max(hashedCnt, dst.hashedCnt);
		if (max == 0)
			return maxScore;
		return (int) ((common(dst) * maxScore) / max);
	}

	long common(SimilarityIndex dst) {
		return common(this, dst);
	}

	private static long common(SimilarityIndex src, SimilarityIndex dst) {
		int srcIdx = src.packedIndex(0);
		int dstIdx = dst.packedIndex(0);
		long[] srcHash = src.idHash;
		long[] dstHash = dst.idHash;
		return common(srcHash, srcIdx, dstHash, dstIdx);
	}

	private static long common(long[] srcHash, int srcIdx, //
			long[] dstHash, int dstIdx) {
		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
			return 0;

		long common = 0;
		int srcKey = keyOf(srcHash[srcIdx]);
		int dstKey = keyOf(dstHash[dstIdx]);

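		// Both tables are sorted, so walk them in parallel (a merge join);
		// each key found in both contributes the smaller of its two counts.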
		for (;;) {
			if (srcKey == dstKey) {
				common += Math.min(countOf(srcHash[srcIdx]),
						countOf(dstHash[dstIdx]));

				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);

			} else if (srcKey < dstKey) {
				// Regions of src which do not appear in dst.
				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

			} else /* if (dstKey < srcKey) */{
				// Regions of dst which do not appear in src.
				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);
			}
		}

		return common;
	}

	// Testing only
	int size() {
		return idSize;
	}

	// Testing only
	int key(int idx) {
		return keyOf(idHash[packedIndex(idx)]);
	}

	// Testing only
	long count(int idx) {
		return countOf(idHash[packedIndex(idx)]);
	}

	// Brute force approach only for testing.
	int findIndex(int key) {
		for (int i = 0; i < idSize; i++)
			if (key(i) == key)
				return i;
		return -1;
	}

	private int packedIndex(int idx) {
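		// After sort() the idSize live entries occupy the tail of idHash
		// (empty slots, stored as 0, sort to the front), so logical index
		// idx maps to array slot (idHash.length - idSize) + idx.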
		return (idHash.length - idSize) + idx;
	}

	void add(int key, int cnt) throws TableFullException {
		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

		int j = slot(key);
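		// Open addressing with linear probing: scan forward from the home
		// slot, wrapping at the end of the table, until an empty slot or a
		// slot already holding this key is found.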
		for (;;) {
			long v = idHash[j];
			if (v == 0) {
				// Empty slot in the table, store here.
				if (idGrowAt <= idSize) {
					grow();
					j = slot(key);
					continue;
				}
				idHash[j] = pair(key, cnt);
				idSize++;
				return;

			} else if (keyOf(v) == key) {
				// Same key, increment the counter. If it overflows, fail
				// indexing to prevent the key from being impacted.
				//
				idHash[j] = pair(key, countOf(v) + cnt);
				return;

			} else if (++j >= idHash.length) {
				j = 0;
			}
		}
	}

	private static long pair(int key, long cnt) throws TableFullException {
		if (MAX_COUNT < cnt)
			throw new TableFullException();
		return (((long) key) << KEY_SHIFT) | cnt;
	}

	private int slot(int key) {
		// We use 31 - idHashBits because the upper bit was already forced
		// to be 0 and we want the remaining high bits to be used as the
		// table slot.
		//
		return key >>> (31 - idHashBits);
	}

	private static int growAt(int idHashBits) {
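		// Grow when the table is (idHashBits - 3) / idHashBits full; for
		// example, at 8 bits this is 256 * 5 / 8 = 160 occupied entries.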
		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
	}

	@SuppressWarnings("UnusedException")
	private void grow() throws TableFullException {
		if (idHashBits == 30)
			throw new TableFullException();

		long[] oldHash = idHash;
		int oldSize = idHash.length;

		idHashBits++;
		idGrowAt = growAt(idHashBits);

		try {
			idHash = new long[1 << idHashBits];
		} catch (OutOfMemoryError noMemory) {
			throw TABLE_FULL_OUT_OF_MEMORY;
		}

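		// Rehash every existing entry into the doubled table, re-probing
		// linearly from each entry's new home slot.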
		for (int i = 0; i < oldSize; i++) {
			long v = oldHash[i];
			if (v != 0) {
				int j = slot(keyOf(v));
				while (idHash[j] != 0)
					if (++j >= idHash.length)
						j = 0;
				idHash[j] = v;
			}
		}
	}

	private static int keyOf(long v) {
		return (int) (v >>> KEY_SHIFT);
	}

	private static long countOf(long v) {
		return v & MAX_COUNT;
	}

	/** Thrown by {@code create()} when the file is too large. */
	public static class TableFullException extends Exception {
		private static final long serialVersionUID = 1L;
	}
}