View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc. and others
3    *
4    * This program and the accompanying materials are made available under the
5    * terms of the Eclipse Distribution License v. 1.0 which is available at
6    * https://www.eclipse.org/org/documents/edl-v10.php.
7    *
8    * SPDX-License-Identifier: BSD-3-Clause
9    */
10  
11  package org.eclipse.jgit.diff;
12  
13  import java.io.EOFException;
14  import java.io.IOException;
15  import java.io.InputStream;
16  import java.util.Arrays;
17  
18  import org.eclipse.jgit.errors.MissingObjectException;
19  import org.eclipse.jgit.lib.ObjectLoader;
20  import org.eclipse.jgit.lib.ObjectStream;
21  
22  /**
23   * Index structure of lines/blocks in one file.
24   * <p>
25   * This structure can be used to compute an approximation of the similarity
26   * between two files. The index is used by
27   * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
28   * between files.
29   * <p>
30   * To save space in memory, this index uses a space efficient encoding which
31   * will not exceed 1 MiB per instance. The index starts out at a smaller size
32   * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
33   * file are discovered.
34   *
35   * @since 4.0
36   */
37  public class SimilarityIndex {
38  	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
39  	public static final TableFullException
40  			TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
41  
42  	/**
43  	 * Shift to apply before storing a key.
44  	 * <p>
45  	 * Within the 64 bit table record space, we leave the highest bit unset so
46  	 * all values are positive. The lower 32 bits to count bytes.
47  	 */
48  	private static final int KEY_SHIFT = 32;
49  
50  	/** Maximum value of the count field, also mask to extract the count. */
51  	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
52  
53  	/**
54  	 * Total amount of bytes hashed into the structure, including \n. This is
55  	 * usually the size of the file minus number of CRLF encounters.
56  	 */
57  	private long hashedCnt;
58  
59  	/** Number of non-zero entries in {@link #idHash}. */
60  	private int idSize;
61  
62  	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
63  	private int idGrowAt;
64  
65  	/**
66  	 * Pairings of content keys and counters.
67  	 * <p>
68  	 * Slots in the table are actually two ints wedged into a single long. The
69  	 * upper 32 bits stores the content key, and the remaining lower bits stores
70  	 * the number of bytes associated with that key. Empty slots are denoted by
71  	 * 0, which cannot occur because the count cannot be 0. Values can only be
72  	 * positive, which we enforce during key addition.
73  	 */
74  	private long[] idHash;
75  
76  	/** {@code idHash.length == 1 << idHashBits}. */
77  	private int idHashBits;
78  
79  	/**
80  	 * Create a new similarity index for the given object
81  	 *
82  	 * @param obj
83  	 *            the object to hash
84  	 * @return similarity index for this object
85  	 * @throws java.io.IOException
86  	 *             file contents cannot be read from the repository.
87  	 * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
88  	 *             object hashing overflowed the storage capacity of the
89  	 *             SimilarityIndex.
90  	 */
91  	public static SimilarityIndex create(ObjectLoader obj) throws IOException,
92  			TableFullException {
93  		SimilarityIndex idx = new SimilarityIndex();
94  		idx.hash(obj);
95  		idx.sort();
96  		return idx;
97  	}
98  
99  	SimilarityIndex() {
100 		idHashBits = 8;
101 		idHash = new long[1 << idHashBits];
102 		idGrowAt = growAt(idHashBits);
103 	}
104 
105 	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
106 			TableFullException {
107 		if (obj.isLarge()) {
108 			hashLargeObject(obj);
109 		} else {
110 			byte[] raw = obj.getCachedBytes();
111 			hash(raw, 0, raw.length);
112 		}
113 	}
114 
115 	private void hashLargeObject(ObjectLoader obj) throws IOException,
116 			TableFullException {
117 		boolean text;
118 		try (ObjectStream in1 = obj.openStream()) {
119 			text = !RawText.isBinary(in1);
120 		}
121 
122 		try (ObjectStream in2 = obj.openStream()) {
123 			hash(in2, in2.getSize(), text);
124 		}
125 	}
126 
127 	void hash(byte[] raw, int ptr, int end) throws TableFullException {
128 		final boolean text = !RawText.isBinary(raw);
129 		hashedCnt = 0;
130 		while (ptr < end) {
131 			int hash = 5381;
132 			int blockHashedCnt = 0;
133 			int start = ptr;
134 
135 			// Hash one line, or one block, whichever occurs first.
136 			do {
137 				int c = raw[ptr++] & 0xff;
138 				// Ignore CR in CRLF sequence if text
139 				if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
140 					continue;
141 				blockHashedCnt++;
142 				if (c == '\n')
143 					break;
144 				hash = (hash << 5) + hash + c;
145 			} while (ptr < end && ptr - start < 64);
146 			hashedCnt += blockHashedCnt;
147 			add(hash, blockHashedCnt);
148 		}
149 	}
150 
151 	void hash(InputStream in, long remaining, boolean text) throws IOException,
152 			TableFullException {
153 		byte[] buf = new byte[4096];
154 		int ptr = 0;
155 		int cnt = 0;
156 
157 		while (0 < remaining) {
158 			int hash = 5381;
159 			int blockHashedCnt = 0;
160 
161 			// Hash one line, or one block, whichever occurs first.
162 			int n = 0;
163 			do {
164 				if (ptr == cnt) {
165 					ptr = 0;
166 					cnt = in.read(buf, 0, buf.length);
167 					if (cnt <= 0)
168 						throw new EOFException();
169 				}
170 
171 				n++;
172 				int c = buf[ptr++] & 0xff;
173 				// Ignore CR in CRLF sequence if text
174 				if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
175 					continue;
176 				blockHashedCnt++;
177 				if (c == '\n')
178 					break;
179 				hash = (hash << 5) + hash + c;
180 			} while (n < 64 && n < remaining);
181 			hashedCnt += blockHashedCnt;
182 			add(hash, blockHashedCnt);
183 			remaining -= n;
184 		}
185 	}
186 
187 	/**
188 	 * Sort the internal table so it can be used for efficient scoring.
189 	 * <p>
190 	 * Once sorted, additional lines/blocks cannot be added to the index.
191 	 */
192 	void sort() {
193 		// Sort the array. All of the empty space will wind up at the front,
194 		// because we forced all of the keys to always be positive. Later
195 		// we only work with the back half of the array.
196 		//
197 		Arrays.sort(idHash);
198 	}
199 
200 	/**
201 	 * Compute the similarity score between this index and another.
202 	 * <p>
203 	 * A region of a file is defined as a line in a text file or a fixed-size
204 	 * block in a binary file. To prepare an index, each region in the file is
205 	 * hashed; the values and counts of hashes are retained in a sorted table.
206 	 * Define the similarity fraction F as the count of matching regions
207 	 * between the two files divided between the maximum count of regions in
208 	 * either file. The similarity score is F multiplied by the maxScore
209 	 * constant, yielding a range [0, maxScore]. It is defined as maxScore for
210 	 * the degenerate case of two empty files.
211 	 * <p>
212 	 * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
213 	 *
214 	 * @param dst
215 	 *            the other index
216 	 * @param maxScore
217 	 *            the score representing a 100% match
218 	 * @return the similarity score
219 	 */
220 	public int score(SimilarityIndex dst, int maxScore) {
221 		long max = Math.max(hashedCnt, dst.hashedCnt);
222 		if (max == 0)
223 			return maxScore;
224 		return (int) ((common(dst) * maxScore) / max);
225 	}
226 
227 	long common(SimilarityIndex dst) {
228 		return common(this, dst);
229 	}
230 
231 	private static long common(SimilarityIndex./../../org/eclipse/jgit/diff/SimilarityIndex.html#SimilarityIndex">SimilarityIndex src, SimilarityIndex dst) {
232 		int srcIdx = src.packedIndex(0);
233 		int dstIdx = dst.packedIndex(0);
234 		long[] srcHash = src.idHash;
235 		long[] dstHash = dst.idHash;
236 		return common(srcHash, srcIdx, dstHash, dstIdx);
237 	}
238 
239 	private static long common(long[] srcHash, int srcIdx, //
240 			long[] dstHash, int dstIdx) {
241 		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
242 			return 0;
243 
244 		long common = 0;
245 		int srcKey = keyOf(srcHash[srcIdx]);
246 		int dstKey = keyOf(dstHash[dstIdx]);
247 
248 		for (;;) {
249 			if (srcKey == dstKey) {
250 				common += Math.min(countOf(srcHash[srcIdx]),
251 						countOf(dstHash[dstIdx]));
252 
253 				if (++srcIdx == srcHash.length)
254 					break;
255 				srcKey = keyOf(srcHash[srcIdx]);
256 
257 				if (++dstIdx == dstHash.length)
258 					break;
259 				dstKey = keyOf(dstHash[dstIdx]);
260 
261 			} else if (srcKey < dstKey) {
262 				// Regions of src which do not appear in dst.
263 				if (++srcIdx == srcHash.length)
264 					break;
265 				srcKey = keyOf(srcHash[srcIdx]);
266 
267 			} else /* if (dstKey < srcKey) */{
268 				// Regions of dst which do not appear in src.
269 				if (++dstIdx == dstHash.length)
270 					break;
271 				dstKey = keyOf(dstHash[dstIdx]);
272 			}
273 		}
274 
275 		return common;
276 	}
277 
278 	// Testing only
279 	int size() {
280 		return idSize;
281 	}
282 
283 	// Testing only
284 	int key(int idx) {
285 		return keyOf(idHash[packedIndex(idx)]);
286 	}
287 
288 	// Testing only
289 	long count(int idx) {
290 		return countOf(idHash[packedIndex(idx)]);
291 	}
292 
293 	// Brute force approach only for testing.
294 	int findIndex(int key) {
295 		for (int i = 0; i < idSize; i++)
296 			if (key(i) == key)
297 				return i;
298 		return -1;
299 	}
300 
301 	private int packedIndex(int idx) {
302 		return (idHash.length - idSize) + idx;
303 	}
304 
305 	void add(int key, int cnt) throws TableFullException {
306 		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
307 
308 		int j = slot(key);
309 		for (;;) {
310 			long v = idHash[j];
311 			if (v == 0) {
312 				// Empty slot in the table, store here.
313 				if (idGrowAt <= idSize) {
314 					grow();
315 					j = slot(key);
316 					continue;
317 				}
318 				idHash[j] = pair(key, cnt);
319 				idSize++;
320 				return;
321 
322 			} else if (keyOf(v) == key) {
323 				// Same key, increment the counter. If it overflows, fail
324 				// indexing to prevent the key from being impacted.
325 				//
326 				idHash[j] = pair(key, countOf(v) + cnt);
327 				return;
328 
329 			} else if (++j >= idHash.length) {
330 				j = 0;
331 			}
332 		}
333 	}
334 
335 	private static long pair(int key, long cnt) throws TableFullException {
336 		if (MAX_COUNT < cnt)
337 			throw new TableFullException();
338 		return (((long) key) << KEY_SHIFT) | cnt;
339 	}
340 
341 	private int slot(int key) {
342 		// We use 31 - idHashBits because the upper bit was already forced
343 		// to be 0 and we want the remaining high bits to be used as the
344 		// table slot.
345 		//
346 		return key >>> (31 - idHashBits);
347 	}
348 
349 	private static int growAt(int idHashBits) {
350 		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
351 	}
352 
353 	@SuppressWarnings("UnusedException")
354 	private void grow() throws TableFullException {
355 		if (idHashBits == 30)
356 			throw new TableFullException();
357 
358 		long[] oldHash = idHash;
359 		int oldSize = idHash.length;
360 
361 		idHashBits++;
362 		idGrowAt = growAt(idHashBits);
363 
364 		try {
365 			idHash = new long[1 << idHashBits];
366 		} catch (OutOfMemoryError noMemory) {
367 			throw TABLE_FULL_OUT_OF_MEMORY;
368 		}
369 
370 		for (int i = 0; i < oldSize; i++) {
371 			long v = oldHash[i];
372 			if (v != 0) {
373 				int j = slot(keyOf(v));
374 				while (idHash[j] != 0)
375 					if (++j >= idHash.length)
376 						j = 0;
377 				idHash[j] = v;
378 			}
379 		}
380 	}
381 
382 	private static int keyOf(long v) {
383 		return (int) (v >>> KEY_SHIFT);
384 	}
385 
386 	private static long countOf(long v) {
387 		return v & MAX_COUNT;
388 	}
389 
390 	/** Thrown by {@code create()} when file is too large. */
391 	public static class TableFullException extends Exception {
392 		private static final long serialVersionUID = 1L;
393 	}
394 }