View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc. and others
3    *
4    * This program and the accompanying materials are made available under the
5    * terms of the Eclipse Distribution License v. 1.0 which is available at
6    * https://www.eclipse.org/org/documents/edl-v10.php.
7    *
8    * SPDX-License-Identifier: BSD-3-Clause
9    */
10  
11  package org.eclipse.jgit.pgm.debug;
12  
13  import static java.lang.Integer.valueOf;
14  import static java.lang.Long.valueOf;
15  
16  import java.io.File;
17  import java.lang.reflect.Field;
18  import java.security.MessageDigest;
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.HashSet;
22  import java.util.List;
23  
24  import org.eclipse.jgit.diff.RawText;
25  import org.eclipse.jgit.diff.RawTextComparator;
26  import org.eclipse.jgit.errors.LargeObjectException;
27  import org.eclipse.jgit.lib.Constants;
28  import org.eclipse.jgit.lib.FileMode;
29  import org.eclipse.jgit.lib.MutableObjectId;
30  import org.eclipse.jgit.lib.ObjectReader;
31  import org.eclipse.jgit.lib.Repository;
32  import org.eclipse.jgit.lib.RepositoryBuilder;
33  import org.eclipse.jgit.lib.RepositoryCache;
34  import org.eclipse.jgit.pgm.Command;
35  import org.eclipse.jgit.pgm.TextBuiltin;
36  import org.eclipse.jgit.pgm.internal.CLIText;
37  import org.eclipse.jgit.revwalk.RevWalk;
38  import org.eclipse.jgit.treewalk.TreeWalk;
39  import org.eclipse.jgit.util.FS;
40  import org.eclipse.jgit.util.NB;
41  import org.kohsuke.args4j.Option;
42  
43  /**
44   * Scan repository to compute maximum number of collisions for hash functions.
45   *
46   * This is a test suite to help benchmark the collision rate of hash functions
47   * when applied to file contents in a Git repository. The test scans all text
48   * files in the HEAD revision of the repository it is run within. For each file
49   * it finds the unique lines, and then inserts those lines into a hash table to
50   * determine collision rates under the selected hash functions.
51   *
52   * To add another hash function to the test suite, declare a new instance member
53   * field of type {@link Hash} and implement the hashRegion method. The test
54   * suite will automatically pick up the new function through reflection.
55   *
56   * To add another folding function (method of squashing a 32 bit hash code into
57   * the hash tables smaller array index space), declare a new instance field of
58   * type {@link Fold} and implement the logic. The test suite will automatically
59   * pick up the new function through reflection.
60   */
61  @Command(usage = "usage_TextHashFunctions")
62  class TextHashFunctions extends TextBuiltin {
63  
64  	/** Standard SHA-1 on the line, using the first 4 bytes as the hash code. */
65  	final Hash sha1 = new Hash() {
66  		private final MessageDigest md = Constants.newMessageDigest();
67  
68  		@Override
69  		protected int hashRegion(byte[] raw, int ptr, int end) {
70  			md.reset();
71  			md.update(raw, ptr, end - ptr);
72  			return NB.decodeInt32(md.digest(), 0);
73  		}
74  	};
75  
76  	/** Professor Daniel J. Bernstein's rather popular string hash. */
77  	final Hash djb = new Hash() {
78  		@Override
79  		protected int hashRegion(byte[] raw, int ptr, int end) {
80  			int hash = 5381;
81  			for (; ptr < end; ptr++)
82  				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
83  			return hash;
84  		}
85  	};
86  
87  	/** Hash function commonly used by java.lang.String. */
88  	final Hash string_hash31 = new Hash() {
89  		@Override
90  		protected int hashRegion(byte[] raw, int ptr, int end) {
91  			int hash = 0;
92  			for (; ptr < end; ptr++)
93  				hash = 31 * hash + (raw[ptr] & 0xff);
94  			return hash;
95  		}
96  	};
97  
98  	/** The Rabin polynomial hash that is used by our own DeltaIndex. */
99  	final Hash rabin_DeltaIndex = new Hash() {
100 		private final byte[] buf16 = new byte[16];
101 
102 		@Override
103 		protected int hashRegion(byte[] raw, int ptr, int end) {
104 			if (end - ptr < 16) {
105 				Arrays.fill(buf16, (byte) 0);
106 				System.arraycopy(raw, ptr, buf16, 0, end - ptr);
107 				return rabin(buf16, 0);
108 			}
109 			return rabin(raw, ptr);
110 		}
111 
112 		private int rabin(byte[] raw, int ptr) {
113 			int hash;
114 
115 			// The first 4 steps collapse out into a 4 byte big-endian decode,
116 			// with a larger right shift as we combined shift lefts together.
117 			//
118 			hash = ((raw[ptr] & 0xff) << 24) //
119 					| ((raw[ptr + 1] & 0xff) << 16) //
120 					| ((raw[ptr + 2] & 0xff) << 8) //
121 					| (raw[ptr + 3] & 0xff);
122 			hash ^= T[hash >>> 31];
123 
124 			hash = ((hash << 8) | (raw[ptr + 4] & 0xff)) ^ T[hash >>> 23];
125 			hash = ((hash << 8) | (raw[ptr + 5] & 0xff)) ^ T[hash >>> 23];
126 			hash = ((hash << 8) | (raw[ptr + 6] & 0xff)) ^ T[hash >>> 23];
127 			hash = ((hash << 8) | (raw[ptr + 7] & 0xff)) ^ T[hash >>> 23];
128 
129 			hash = ((hash << 8) | (raw[ptr + 8] & 0xff)) ^ T[hash >>> 23];
130 			hash = ((hash << 8) | (raw[ptr + 9] & 0xff)) ^ T[hash >>> 23];
131 			hash = ((hash << 8) | (raw[ptr + 10] & 0xff)) ^ T[hash >>> 23];
132 			hash = ((hash << 8) | (raw[ptr + 11] & 0xff)) ^ T[hash >>> 23];
133 
134 			hash = ((hash << 8) | (raw[ptr + 12] & 0xff)) ^ T[hash >>> 23];
135 			hash = ((hash << 8) | (raw[ptr + 13] & 0xff)) ^ T[hash >>> 23];
136 			hash = ((hash << 8) | (raw[ptr + 14] & 0xff)) ^ T[hash >>> 23];
137 			hash = ((hash << 8) | (raw[ptr + 15] & 0xff)) ^ T[hash >>> 23];
138 
139 			return hash;
140 		}
141 
142 		private final int[] T = { 0x00000000, 0xd4c6b32d, 0x7d4bd577,
143 				0xa98d665a, 0x2e5119c3, 0xfa97aaee, 0x531accb4, 0x87dc7f99,
144 				0x5ca23386, 0x886480ab, 0x21e9e6f1, 0xf52f55dc, 0x72f32a45,
145 				0xa6359968, 0x0fb8ff32, 0xdb7e4c1f, 0x6d82d421, 0xb944670c,
146 				0x10c90156, 0xc40fb27b, 0x43d3cde2, 0x97157ecf, 0x3e981895,
147 				0xea5eabb8, 0x3120e7a7, 0xe5e6548a, 0x4c6b32d0, 0x98ad81fd,
148 				0x1f71fe64, 0xcbb74d49, 0x623a2b13, 0xb6fc983e, 0x0fc31b6f,
149 				0xdb05a842, 0x7288ce18, 0xa64e7d35, 0x219202ac, 0xf554b181,
150 				0x5cd9d7db, 0x881f64f6, 0x536128e9, 0x87a79bc4, 0x2e2afd9e,
151 				0xfaec4eb3, 0x7d30312a, 0xa9f68207, 0x007be45d, 0xd4bd5770,
152 				0x6241cf4e, 0xb6877c63, 0x1f0a1a39, 0xcbcca914, 0x4c10d68d,
153 				0x98d665a0, 0x315b03fa, 0xe59db0d7, 0x3ee3fcc8, 0xea254fe5,
154 				0x43a829bf, 0x976e9a92, 0x10b2e50b, 0xc4745626, 0x6df9307c,
155 				0xb93f8351, 0x1f8636de, 0xcb4085f3, 0x62cde3a9, 0xb60b5084,
156 				0x31d72f1d, 0xe5119c30, 0x4c9cfa6a, 0x985a4947, 0x43240558,
157 				0x97e2b675, 0x3e6fd02f, 0xeaa96302, 0x6d751c9b, 0xb9b3afb6,
158 				0x103ec9ec, 0xc4f87ac1, 0x7204e2ff, 0xa6c251d2, 0x0f4f3788,
159 				0xdb8984a5, 0x5c55fb3c, 0x88934811, 0x211e2e4b, 0xf5d89d66,
160 				0x2ea6d179, 0xfa606254, 0x53ed040e, 0x872bb723, 0x00f7c8ba,
161 				0xd4317b97, 0x7dbc1dcd, 0xa97aaee0, 0x10452db1, 0xc4839e9c,
162 				0x6d0ef8c6, 0xb9c84beb, 0x3e143472, 0xead2875f, 0x435fe105,
163 				0x97995228, 0x4ce71e37, 0x9821ad1a, 0x31accb40, 0xe56a786d,
164 				0x62b607f4, 0xb670b4d9, 0x1ffdd283, 0xcb3b61ae, 0x7dc7f990,
165 				0xa9014abd, 0x008c2ce7, 0xd44a9fca, 0x5396e053, 0x8750537e,
166 				0x2edd3524, 0xfa1b8609, 0x2165ca16, 0xf5a3793b, 0x5c2e1f61,
167 				0x88e8ac4c, 0x0f34d3d5, 0xdbf260f8, 0x727f06a2, 0xa6b9b58f,
168 				0x3f0c6dbc, 0xebcade91, 0x4247b8cb, 0x96810be6, 0x115d747f,
169 				0xc59bc752, 0x6c16a108, 0xb8d01225, 0x63ae5e3a, 0xb768ed17,
170 				0x1ee58b4d, 0xca233860, 0x4dff47f9, 0x9939f4d4, 0x30b4928e,
171 				0xe47221a3, 0x528eb99d, 0x86480ab0, 0x2fc56cea, 0xfb03dfc7,
172 				0x7cdfa05e, 0xa8191373, 0x01947529, 0xd552c604, 0x0e2c8a1b,
173 				0xdaea3936, 0x73675f6c, 0xa7a1ec41, 0x207d93d8, 0xf4bb20f5,
174 				0x5d3646af, 0x89f0f582, 0x30cf76d3, 0xe409c5fe, 0x4d84a3a4,
175 				0x99421089, 0x1e9e6f10, 0xca58dc3d, 0x63d5ba67, 0xb713094a,
176 				0x6c6d4555, 0xb8abf678, 0x11269022, 0xc5e0230f, 0x423c5c96,
177 				0x96faefbb, 0x3f7789e1, 0xebb13acc, 0x5d4da2f2, 0x898b11df,
178 				0x20067785, 0xf4c0c4a8, 0x731cbb31, 0xa7da081c, 0x0e576e46,
179 				0xda91dd6b, 0x01ef9174, 0xd5292259, 0x7ca44403, 0xa862f72e,
180 				0x2fbe88b7, 0xfb783b9a, 0x52f55dc0, 0x8633eeed, 0x208a5b62,
181 				0xf44ce84f, 0x5dc18e15, 0x89073d38, 0x0edb42a1, 0xda1df18c,
182 				0x739097d6, 0xa75624fb, 0x7c2868e4, 0xa8eedbc9, 0x0163bd93,
183 				0xd5a50ebe, 0x52797127, 0x86bfc20a, 0x2f32a450, 0xfbf4177d,
184 				0x4d088f43, 0x99ce3c6e, 0x30435a34, 0xe485e919, 0x63599680,
185 				0xb79f25ad, 0x1e1243f7, 0xcad4f0da, 0x11aabcc5, 0xc56c0fe8,
186 				0x6ce169b2, 0xb827da9f, 0x3ffba506, 0xeb3d162b, 0x42b07071,
187 				0x9676c35c, 0x2f49400d, 0xfb8ff320, 0x5202957a, 0x86c42657,
188 				0x011859ce, 0xd5deeae3, 0x7c538cb9, 0xa8953f94, 0x73eb738b,
189 				0xa72dc0a6, 0x0ea0a6fc, 0xda6615d1, 0x5dba6a48, 0x897cd965,
190 				0x20f1bf3f, 0xf4370c12, 0x42cb942c, 0x960d2701, 0x3f80415b,
191 				0xeb46f276, 0x6c9a8def, 0xb85c3ec2, 0x11d15898, 0xc517ebb5,
192 				0x1e69a7aa, 0xcaaf1487, 0x632272dd, 0xb7e4c1f0, 0x3038be69,
193 				0xe4fe0d44, 0x4d736b1e, 0x99b5d833 };
194 	};
195 
196 	/** Bitwise-and to extract only the low bits. */
197 	final Fold truncate = new Fold() {
198 		@Override
199 		public int fold(int hash, int bits) {
200 			return hash & ((1 << bits) - 1);
201 		}
202 	};
203 
204 	/** Applies the golden ratio and takes the upper bits. */
205 	final Fold golden_ratio = new Fold() {
206 		@Override
207 		public int fold(int hash, int bits) {
208 			/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
209 			return (hash * 0x9e370001) >>> (32 - bits);
210 		}
211 	};
212 
213 	// -----------------------------------------------------------------------
214 	//
215 	// Implementation of the suite lives below this line.
216 	//
217 	//
218 
219 	@Option(name = "--hash", metaVar = "NAME", usage = "Enable hash function(s)")
220 	List<String> hashFunctions = new ArrayList<>();
221 
222 	@Option(name = "--fold", metaVar = "NAME", usage = "Enable fold function(s)")
223 	List<String> foldFunctions = new ArrayList<>();
224 
225 	@Option(name = "--text-limit", metaVar = "LIMIT", usage = "Maximum size in KiB to scan")
226 	int textLimit = 15 * 1024; // 15 MiB as later we do * 1024.
227 
228 	@Option(name = "--repository", aliases = { "-r" }, metaVar = "GIT_DIR", usage = "Repository to scan")
229 	List<File> gitDirs = new ArrayList<>();
230 
231 	/** {@inheritDoc} */
232 	@Override
233 	protected boolean requiresRepository() {
234 		return false;
235 	}
236 
237 	/** {@inheritDoc} */
238 	@Override
239 	protected void run() throws Exception {
240 		if (gitDirs.isEmpty()) {
241 			RepositoryBuilder rb = new RepositoryBuilder() //
242 					.setGitDir(new File(gitdir)) //
243 					.readEnvironment() //
244 					.findGitDir();
245 			if (rb.getGitDir() == null)
246 				throw die(CLIText.get().cantFindGitDirectory);
247 			gitDirs.add(rb.getGitDir());
248 		}
249 
250 		for (File dir : gitDirs) {
251 			RepositoryBuilder rb = new RepositoryBuilder();
252 			if (RepositoryCache.FileKey.isGitRepository(dir, FS.DETECTED))
253 				rb.setGitDir(dir);
254 			else
255 				rb.findGitDir(dir);
256 
257 			try (Repository repo = rb.build()) {
258 				run(repo);
259 			}
260 		}
261 	}
262 
263 	private void run(Repository repo) throws Exception {
264 		List<Function> all = init();
265 
266 		long fileCnt = 0;
267 		long lineCnt = 0;
268 		try (ObjectReader or = repo.newObjectReader();
269 			RevWalk rw = new RevWalk(or);
270 			TreeWalk tw = new TreeWalk(or)) {
271 			final MutableObjectId id = new MutableObjectId();
272 			tw.reset(rw.parseTree(repo.resolve(Constants.HEAD)));
273 			tw.setRecursive(true);
274 
275 			while (tw.next()) {
276 				FileMode fm = tw.getFileMode(0);
277 				if (!FileMode.REGULAR_FILE.equals(fm)
278 						&& !FileMode.EXECUTABLE_FILE.equals(fm))
279 					continue;
280 
281 				byte[] raw;
282 				try {
283 					tw.getObjectId(id, 0);
284 					raw = or.open(id).getCachedBytes(textLimit * 1024);
285 				} catch (LargeObjectException tooBig) {
286 					continue;
287 				}
288 
289 				if (RawText.isBinary(raw))
290 					continue;
291 
292 				RawText txt = new RawText(raw);
293 				int[] lines = new int[txt.size()];
294 				int cnt = 0;
295 				HashSet<Line> u = new HashSet<>();
296 				for (int i = 0; i < txt.size(); i++) {
297 					if (u.add(new Line(txt, i)))
298 						lines[cnt++] = i;
299 				}
300 
301 				fileCnt++;
302 				lineCnt += cnt;
303 
304 				for (Function fun : all)
305 					testOne(fun, txt, lines, cnt);
306 			}
307 		}
308 
309 		File directory = repo.getDirectory();
310 		if (directory != null) {
311 			String name = directory.getName();
312 			File parent = directory.getParentFile();
313 			if (name.equals(Constants.DOT_GIT) && parent != null)
314 				name = parent.getName();
315 			outw.println(name + ":"); //$NON-NLS-1$
316 		}
317 		outw.format("  %6d files; %5d avg. unique lines/file\n", //$NON-NLS-1$
318 				valueOf(fileCnt), //
319 				valueOf(lineCnt / fileCnt));
320 		outw.format("%-20s %-15s %9s\n", "Hash", "Fold", "Max Len"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
321 		outw.println("-----------------------------------------------"); //$NON-NLS-1$
322 		String lastHashName = null;
323 		for (Function fun : all) {
324 			String hashName = fun.hash.name;
325 			if (hashName.equals(lastHashName))
326 				hashName = ""; //$NON-NLS-1$
327 			outw.format("%-20s %-15s %9d\n", // //$NON-NLS-1$
328 					hashName, //
329 					fun.fold.name, //
330 					valueOf(fun.maxChainLength));
331 			lastHashName = fun.hash.name;
332 		}
333 		outw.println();
334 		outw.flush();
335 	}
336 
337 	private static void testOne(Function fun, RawText txt, int[] elements,
338 			int cnt) {
339 		final Hash cmp = fun.hash;
340 		final Fold fold = fun.fold;
341 
342 		final int bits = tableBits(cnt);
343 		final int[] buckets = new int[1 << bits];
344 		for (int i = 0; i < cnt; i++)
345 			buckets[fold.fold(cmp.hash(txt, elements[i]), bits)]++;
346 
347 		int maxChainLength = 0;
348 		for (int i = 0; i < buckets.length; i++)
349 			maxChainLength = Math.max(maxChainLength, buckets[i]);
350 		fun.maxChainLength = Math.max(fun.maxChainLength, maxChainLength);
351 	}
352 
353 	private List<Function> init() {
354 		List<Hash> hashes = new ArrayList<>();
355 		List<Fold> folds = new ArrayList<>();
356 
357 		try {
358 			for (Field f : TextHashFunctions.class.getDeclaredFields()) {
359 				if (f.getType() == Hash.class) {
360 					f.setAccessible(true);
361 					Hash cmp = (Hash) f.get(this);
362 					cmp.name = f.getName();
363 					hashes.add(cmp);
364 
365 				} else if (f.getType() == Fold.class) {
366 					f.setAccessible(true);
367 					Fold fold = (Fold) f.get(this);
368 					fold.name = f.getName();
369 					folds.add(fold);
370 				}
371 			}
372 		} catch (IllegalArgumentException | IllegalAccessException e) {
373 			throw new RuntimeException("Cannot determine names", e); //$NON-NLS-1$
374 		}
375 
376 		List<Function> all = new ArrayList<>();
377 		for (Hash cmp : hashes) {
378 			if (include(cmp.name, hashFunctions)) {
379 				for (Fold f : folds) {
380 					if (include(f.name, foldFunctions)) {
381 						all.add(new Function(cmp, f));
382 					}
383 				}
384 			}
385 		}
386 		return all;
387 	}
388 
389 	private static boolean include(String name, List<String> want) {
390 		if (want.isEmpty())
391 			return true;
392 		for (String s : want) {
393 			if (s.equalsIgnoreCase(name))
394 				return true;
395 		}
396 		return false;
397 	}
398 
399 	private static class Function {
400 		final Hash hash;
401 
402 		final Fold fold;
403 
404 		int maxChainLength;
405 
406 		Function(Hash cmp, Fold fold) {
407 			this.hash = cmp;
408 			this.fold = fold;
409 		}
410 	}
411 
412 	/** Base class for any hashCode function to be tested. */
413 	private abstract static class Hash extends RawTextComparator {
414 		String name;
415 
416 		@Override
417 		public boolean equals(RawText a, int ai, RawText b, int bi) {
418 			return RawTextComparator.DEFAULT.equals(a, ai, b, bi);
419 		}
420 	}
421 
422 	/** Base class for any hashCode folding function to be tested. */
423 	private abstract static class Fold {
424 		String name;
425 
426 		/**
427 		 * Fold the given 32-bit hash code into only {@code bits} of space.
428 		 *
429 		 * @param hash
430 		 *            the 32 bit hash code to be folded into a smaller value.
431 		 * @param bits
432 		 *            total number of bits that can appear in the output. The
433 		 *            output value must be in the range {@code [0, 1 << bits)}.
434 		 *            When bits = 2, valid outputs are 0, 1, 2, 3.
435 		 * @return the folded hash, squeezed into only {@code bits}.
436 		 */
437 		abstract int fold(int hash, int bits);
438 	}
439 
440 	/** Utility to help us identify unique lines in a file. */
441 	private static class Line {
442 		private final RawText txt;
443 
444 		private final int pos;
445 
446 		Line(RawText txt, int pos) {
447 			this.txt = txt;
448 			this.pos = pos;
449 		}
450 
451 		@Override
452 		public int hashCode() {
453 			return RawTextComparator.DEFAULT.hash(txt, pos);
454 		}
455 
456 		@Override
457 		public boolean equals(Object obj) {
458 			if (obj instanceof Line) {
459 				Line e = (Line) obj;
460 				return RawTextComparator.DEFAULT.equals(txt, pos, e.txt, e.pos);
461 			}
462 			return false;
463 		}
464 	}
465 
466 	private static int tableBits(int sz) {
467 		int bits = 31 - Integer.numberOfLeadingZeros(sz);
468 		if (bits == 0)
469 			bits = 1;
470 		if (1 << bits < sz)
471 			bits++;
472 		return bits;
473 	}
474 }