View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc.
3    * and other copyright owners as documented in the project's IP log.
4    *
5    * This program and the accompanying materials are made available
6    * under the terms of the Eclipse Distribution License v1.0 which
7    * accompanies this distribution, is reproduced below, and is
8    * available at http://www.eclipse.org/org/documents/edl-v10.php
9    *
10   * All rights reserved.
11   *
12   * Redistribution and use in source and binary forms, with or
13   * without modification, are permitted provided that the following
14   * conditions are met:
15   *
16   * - Redistributions of source code must retain the above copyright
17   *   notice, this list of conditions and the following disclaimer.
18   *
19   * - Redistributions in binary form must reproduce the above
20   *   copyright notice, this list of conditions and the following
21   *   disclaimer in the documentation and/or other materials provided
22   *   with the distribution.
23   *
24   * - Neither the name of the Eclipse Foundation, Inc. nor the
25   *   names of its contributors may be used to endorse or promote
26   *   products derived from this software without specific prior
27   *   written permission.
28   *
29   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42   */
43  
44  package org.eclipse.jgit.pgm.debug;
45  
46  import static java.lang.Integer.valueOf;
47  import static java.lang.Long.valueOf;
48  
49  import java.io.File;
50  import java.lang.reflect.Field;
51  import java.security.MessageDigest;
52  import java.util.ArrayList;
53  import java.util.Arrays;
54  import java.util.HashSet;
55  import java.util.List;
56  
57  import org.eclipse.jgit.diff.RawText;
58  import org.eclipse.jgit.diff.RawTextComparator;
59  import org.eclipse.jgit.errors.LargeObjectException;
60  import org.eclipse.jgit.lib.Constants;
61  import org.eclipse.jgit.lib.FileMode;
62  import org.eclipse.jgit.lib.MutableObjectId;
63  import org.eclipse.jgit.lib.ObjectReader;
64  import org.eclipse.jgit.lib.Repository;
65  import org.eclipse.jgit.lib.RepositoryBuilder;
66  import org.eclipse.jgit.lib.RepositoryCache;
67  import org.eclipse.jgit.pgm.Command;
68  import org.eclipse.jgit.pgm.TextBuiltin;
69  import org.eclipse.jgit.pgm.internal.CLIText;
70  import org.eclipse.jgit.revwalk.RevWalk;
71  import org.eclipse.jgit.treewalk.TreeWalk;
72  import org.eclipse.jgit.util.FS;
73  import org.eclipse.jgit.util.NB;
74  import org.kohsuke.args4j.Option;
75  
76  /**
77   * Scan repository to compute maximum number of collisions for hash functions.
78   *
79   * This is a test suite to help benchmark the collision rate of hash functions
80   * when applied to file contents in a Git repository. The test scans all text
81   * files in the HEAD revision of the repository it is run within. For each file
82   * it finds the unique lines, and then inserts those lines into a hash table to
83   * determine collision rates under the selected hash functions.
84   *
85   * To add another hash function to the test suite, declare a new instance member
86   * field of type {@link Hash} and implement the hashRegion method. The test
87   * suite will automatically pick up the new function through reflection.
88   *
89   * To add another folding function (method of squashing a 32 bit hash code into
90   * the hash tables smaller array index space), declare a new instance field of
91   * type {@link Fold} and implement the logic. The test suite will automatically
92   * pick up the new function through reflection.
93   */
94  @Command(usage = "usage_TextHashFunctions")
95  class TextHashFunctions extends TextBuiltin {
96  
97  	/** Standard SHA-1 on the line, using the first 4 bytes as the hash code. */
98  	final Hash sha1 = new Hash() {
99  		private final MessageDigest md = Constants.newMessageDigest();
100 
101 		@Override
102 		protected int hashRegion(byte[] raw, int ptr, int end) {
103 			md.reset();
104 			md.update(raw, ptr, end - ptr);
105 			return NB.decodeInt32(md.digest(), 0);
106 		}
107 	};
108 
109 	/** Professor Daniel J. Bernstein's rather popular string hash. */
110 	final Hash djb = new Hash() {
111 		@Override
112 		protected int hashRegion(byte[] raw, int ptr, int end) {
113 			int hash = 5381;
114 			for (; ptr < end; ptr++)
115 				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
116 			return hash;
117 		}
118 	};
119 
120 	/** Hash function commonly used by java.lang.String. */
121 	final Hash string_hash31 = new Hash() {
122 		@Override
123 		protected int hashRegion(byte[] raw, int ptr, int end) {
124 			int hash = 0;
125 			for (; ptr < end; ptr++)
126 				hash = 31 * hash + (raw[ptr] & 0xff);
127 			return hash;
128 		}
129 	};
130 
131 	/** The Rabin polynomial hash that is used by our own DeltaIndex. */
132 	final Hash rabin_DeltaIndex = new Hash() {
133 		private final byte[] buf16 = new byte[16];
134 
135 		@Override
136 		protected int hashRegion(byte[] raw, int ptr, int end) {
137 			if (end - ptr < 16) {
138 				Arrays.fill(buf16, (byte) 0);
139 				System.arraycopy(raw, ptr, buf16, 0, end - ptr);
140 				return rabin(buf16, 0);
141 			} else {
142 				return rabin(raw, ptr);
143 			}
144 		}
145 
146 		private int rabin(byte[] raw, int ptr) {
147 			int hash;
148 
149 			// The first 4 steps collapse out into a 4 byte big-endian decode,
150 			// with a larger right shift as we combined shift lefts together.
151 			//
152 			hash = ((raw[ptr] & 0xff) << 24) //
153 					| ((raw[ptr + 1] & 0xff) << 16) //
154 					| ((raw[ptr + 2] & 0xff) << 8) //
155 					| (raw[ptr + 3] & 0xff);
156 			hash ^= T[hash >>> 31];
157 
158 			hash = ((hash << 8) | (raw[ptr + 4] & 0xff)) ^ T[hash >>> 23];
159 			hash = ((hash << 8) | (raw[ptr + 5] & 0xff)) ^ T[hash >>> 23];
160 			hash = ((hash << 8) | (raw[ptr + 6] & 0xff)) ^ T[hash >>> 23];
161 			hash = ((hash << 8) | (raw[ptr + 7] & 0xff)) ^ T[hash >>> 23];
162 
163 			hash = ((hash << 8) | (raw[ptr + 8] & 0xff)) ^ T[hash >>> 23];
164 			hash = ((hash << 8) | (raw[ptr + 9] & 0xff)) ^ T[hash >>> 23];
165 			hash = ((hash << 8) | (raw[ptr + 10] & 0xff)) ^ T[hash >>> 23];
166 			hash = ((hash << 8) | (raw[ptr + 11] & 0xff)) ^ T[hash >>> 23];
167 
168 			hash = ((hash << 8) | (raw[ptr + 12] & 0xff)) ^ T[hash >>> 23];
169 			hash = ((hash << 8) | (raw[ptr + 13] & 0xff)) ^ T[hash >>> 23];
170 			hash = ((hash << 8) | (raw[ptr + 14] & 0xff)) ^ T[hash >>> 23];
171 			hash = ((hash << 8) | (raw[ptr + 15] & 0xff)) ^ T[hash >>> 23];
172 
173 			return hash;
174 		}
175 
176 		private final int[] T = { 0x00000000, 0xd4c6b32d, 0x7d4bd577,
177 				0xa98d665a, 0x2e5119c3, 0xfa97aaee, 0x531accb4, 0x87dc7f99,
178 				0x5ca23386, 0x886480ab, 0x21e9e6f1, 0xf52f55dc, 0x72f32a45,
179 				0xa6359968, 0x0fb8ff32, 0xdb7e4c1f, 0x6d82d421, 0xb944670c,
180 				0x10c90156, 0xc40fb27b, 0x43d3cde2, 0x97157ecf, 0x3e981895,
181 				0xea5eabb8, 0x3120e7a7, 0xe5e6548a, 0x4c6b32d0, 0x98ad81fd,
182 				0x1f71fe64, 0xcbb74d49, 0x623a2b13, 0xb6fc983e, 0x0fc31b6f,
183 				0xdb05a842, 0x7288ce18, 0xa64e7d35, 0x219202ac, 0xf554b181,
184 				0x5cd9d7db, 0x881f64f6, 0x536128e9, 0x87a79bc4, 0x2e2afd9e,
185 				0xfaec4eb3, 0x7d30312a, 0xa9f68207, 0x007be45d, 0xd4bd5770,
186 				0x6241cf4e, 0xb6877c63, 0x1f0a1a39, 0xcbcca914, 0x4c10d68d,
187 				0x98d665a0, 0x315b03fa, 0xe59db0d7, 0x3ee3fcc8, 0xea254fe5,
188 				0x43a829bf, 0x976e9a92, 0x10b2e50b, 0xc4745626, 0x6df9307c,
189 				0xb93f8351, 0x1f8636de, 0xcb4085f3, 0x62cde3a9, 0xb60b5084,
190 				0x31d72f1d, 0xe5119c30, 0x4c9cfa6a, 0x985a4947, 0x43240558,
191 				0x97e2b675, 0x3e6fd02f, 0xeaa96302, 0x6d751c9b, 0xb9b3afb6,
192 				0x103ec9ec, 0xc4f87ac1, 0x7204e2ff, 0xa6c251d2, 0x0f4f3788,
193 				0xdb8984a5, 0x5c55fb3c, 0x88934811, 0x211e2e4b, 0xf5d89d66,
194 				0x2ea6d179, 0xfa606254, 0x53ed040e, 0x872bb723, 0x00f7c8ba,
195 				0xd4317b97, 0x7dbc1dcd, 0xa97aaee0, 0x10452db1, 0xc4839e9c,
196 				0x6d0ef8c6, 0xb9c84beb, 0x3e143472, 0xead2875f, 0x435fe105,
197 				0x97995228, 0x4ce71e37, 0x9821ad1a, 0x31accb40, 0xe56a786d,
198 				0x62b607f4, 0xb670b4d9, 0x1ffdd283, 0xcb3b61ae, 0x7dc7f990,
199 				0xa9014abd, 0x008c2ce7, 0xd44a9fca, 0x5396e053, 0x8750537e,
200 				0x2edd3524, 0xfa1b8609, 0x2165ca16, 0xf5a3793b, 0x5c2e1f61,
201 				0x88e8ac4c, 0x0f34d3d5, 0xdbf260f8, 0x727f06a2, 0xa6b9b58f,
202 				0x3f0c6dbc, 0xebcade91, 0x4247b8cb, 0x96810be6, 0x115d747f,
203 				0xc59bc752, 0x6c16a108, 0xb8d01225, 0x63ae5e3a, 0xb768ed17,
204 				0x1ee58b4d, 0xca233860, 0x4dff47f9, 0x9939f4d4, 0x30b4928e,
205 				0xe47221a3, 0x528eb99d, 0x86480ab0, 0x2fc56cea, 0xfb03dfc7,
206 				0x7cdfa05e, 0xa8191373, 0x01947529, 0xd552c604, 0x0e2c8a1b,
207 				0xdaea3936, 0x73675f6c, 0xa7a1ec41, 0x207d93d8, 0xf4bb20f5,
208 				0x5d3646af, 0x89f0f582, 0x30cf76d3, 0xe409c5fe, 0x4d84a3a4,
209 				0x99421089, 0x1e9e6f10, 0xca58dc3d, 0x63d5ba67, 0xb713094a,
210 				0x6c6d4555, 0xb8abf678, 0x11269022, 0xc5e0230f, 0x423c5c96,
211 				0x96faefbb, 0x3f7789e1, 0xebb13acc, 0x5d4da2f2, 0x898b11df,
212 				0x20067785, 0xf4c0c4a8, 0x731cbb31, 0xa7da081c, 0x0e576e46,
213 				0xda91dd6b, 0x01ef9174, 0xd5292259, 0x7ca44403, 0xa862f72e,
214 				0x2fbe88b7, 0xfb783b9a, 0x52f55dc0, 0x8633eeed, 0x208a5b62,
215 				0xf44ce84f, 0x5dc18e15, 0x89073d38, 0x0edb42a1, 0xda1df18c,
216 				0x739097d6, 0xa75624fb, 0x7c2868e4, 0xa8eedbc9, 0x0163bd93,
217 				0xd5a50ebe, 0x52797127, 0x86bfc20a, 0x2f32a450, 0xfbf4177d,
218 				0x4d088f43, 0x99ce3c6e, 0x30435a34, 0xe485e919, 0x63599680,
219 				0xb79f25ad, 0x1e1243f7, 0xcad4f0da, 0x11aabcc5, 0xc56c0fe8,
220 				0x6ce169b2, 0xb827da9f, 0x3ffba506, 0xeb3d162b, 0x42b07071,
221 				0x9676c35c, 0x2f49400d, 0xfb8ff320, 0x5202957a, 0x86c42657,
222 				0x011859ce, 0xd5deeae3, 0x7c538cb9, 0xa8953f94, 0x73eb738b,
223 				0xa72dc0a6, 0x0ea0a6fc, 0xda6615d1, 0x5dba6a48, 0x897cd965,
224 				0x20f1bf3f, 0xf4370c12, 0x42cb942c, 0x960d2701, 0x3f80415b,
225 				0xeb46f276, 0x6c9a8def, 0xb85c3ec2, 0x11d15898, 0xc517ebb5,
226 				0x1e69a7aa, 0xcaaf1487, 0x632272dd, 0xb7e4c1f0, 0x3038be69,
227 				0xe4fe0d44, 0x4d736b1e, 0x99b5d833 };
228 	};
229 
230 	/** Bitwise-and to extract only the low bits. */
231 	final Fold truncate = new Fold() {
232 		@Override
233 		public int fold(int hash, int bits) {
234 			return hash & ((1 << bits) - 1);
235 		}
236 	};
237 
238 	/** Applies the golden ratio and takes the upper bits. */
239 	final Fold golden_ratio = new Fold() {
240 		@Override
241 		public int fold(int hash, int bits) {
242 			/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
243 			return (hash * 0x9e370001) >>> (32 - bits);
244 		}
245 	};
246 
247 	// -----------------------------------------------------------------------
248 	//
249 	// Implementation of the suite lives below this line.
250 	//
251 	//
252 
253 	@Option(name = "--hash", metaVar = "NAME", usage = "Enable hash function(s)")
254 	List<String> hashFunctions = new ArrayList<>();
255 
256 	@Option(name = "--fold", metaVar = "NAME", usage = "Enable fold function(s)")
257 	List<String> foldFunctions = new ArrayList<>();
258 
259 	@Option(name = "--text-limit", metaVar = "LIMIT", usage = "Maximum size in KiB to scan")
260 	int textLimit = 15 * 1024; // 15 MiB as later we do * 1024.
261 
262 	@Option(name = "--repository", aliases = { "-r" }, metaVar = "GIT_DIR", usage = "Repository to scan")
263 	List<File> gitDirs = new ArrayList<>();
264 
265 	/** {@inheritDoc} */
266 	@Override
267 	protected boolean requiresRepository() {
268 		return false;
269 	}
270 
271 	/** {@inheritDoc} */
272 	@Override
273 	protected void run() throws Exception {
274 		if (gitDirs.isEmpty()) {
275 			RepositoryBuilder rb = new RepositoryBuilder() //
276 					.setGitDir(new File(gitdir)) //
277 					.readEnvironment() //
278 					.findGitDir();
279 			if (rb.getGitDir() == null)
280 				throw die(CLIText.get().cantFindGitDirectory);
281 			gitDirs.add(rb.getGitDir());
282 		}
283 
284 		for (File dir : gitDirs) {
285 			RepositoryBuilder rb = new RepositoryBuilder();
286 			if (RepositoryCache.FileKey.isGitRepository(dir, FS.DETECTED))
287 				rb.setGitDir(dir);
288 			else
289 				rb.findGitDir(dir);
290 
291 			try (Repository repo = rb.build()) {
292 				run(repo);
293 			}
294 		}
295 	}
296 
297 	private void run(Repository repo) throws Exception {
298 		List<Function> all = init();
299 
300 		long fileCnt = 0;
301 		long lineCnt = 0;
302 		try (ObjectReader or = repo.newObjectReader();
303 			RevWalk rw = new RevWalk(or);
304 			TreeWalk tw = new TreeWalk(or)) {
305 			final MutableObjectId id = new MutableObjectId();
306 			tw.reset(rw.parseTree(repo.resolve(Constants.HEAD)));
307 			tw.setRecursive(true);
308 
309 			while (tw.next()) {
310 				FileMode fm = tw.getFileMode(0);
311 				if (!FileMode.REGULAR_FILE.equals(fm)
312 						&& !FileMode.EXECUTABLE_FILE.equals(fm))
313 					continue;
314 
315 				byte[] raw;
316 				try {
317 					tw.getObjectId(id, 0);
318 					raw = or.open(id).getCachedBytes(textLimit * 1024);
319 				} catch (LargeObjectException tooBig) {
320 					continue;
321 				}
322 
323 				if (RawText.isBinary(raw))
324 					continue;
325 
326 				RawText txt = new RawText(raw);
327 				int[] lines = new int[txt.size()];
328 				int cnt = 0;
329 				HashSet<Line> u = new HashSet<>();
330 				for (int i = 0; i < txt.size(); i++) {
331 					if (u.add(new Line(txt, i)))
332 						lines[cnt++] = i;
333 				}
334 
335 				fileCnt++;
336 				lineCnt += cnt;
337 
338 				for (Function fun : all)
339 					testOne(fun, txt, lines, cnt);
340 			}
341 		}
342 
343 		File directory = repo.getDirectory();
344 		if (directory != null) {
345 			String name = directory.getName();
346 			File parent = directory.getParentFile();
347 			if (name.equals(Constants.DOT_GIT) && parent != null)
348 				name = parent.getName();
349 			outw.println(name + ":"); //$NON-NLS-1$
350 		}
351 		outw.format("  %6d files; %5d avg. unique lines/file\n", //$NON-NLS-1$
352 				valueOf(fileCnt), //
353 				valueOf(lineCnt / fileCnt));
354 		outw.format("%-20s %-15s %9s\n", "Hash", "Fold", "Max Len"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
355 		outw.println("-----------------------------------------------"); //$NON-NLS-1$
356 		String lastHashName = null;
357 		for (Function fun : all) {
358 			String hashName = fun.hash.name;
359 			if (hashName.equals(lastHashName))
360 				hashName = ""; //$NON-NLS-1$
361 			outw.format("%-20s %-15s %9d\n", // //$NON-NLS-1$
362 					hashName, //
363 					fun.fold.name, //
364 					valueOf(fun.maxChainLength));
365 			lastHashName = fun.hash.name;
366 		}
367 		outw.println();
368 		outw.flush();
369 	}
370 
371 	private static void testOne(Function fun, RawText txt, int[] elements,
372 			int cnt) {
373 		final Hash cmp = fun.hash;
374 		final Fold fold = fun.fold;
375 
376 		final int bits = tableBits(cnt);
377 		final int[] buckets = new int[1 << bits];
378 		for (int i = 0; i < cnt; i++)
379 			buckets[fold.fold(cmp.hash(txt, elements[i]), bits)]++;
380 
381 		int maxChainLength = 0;
382 		for (int i = 0; i < buckets.length; i++)
383 			maxChainLength = Math.max(maxChainLength, buckets[i]);
384 		fun.maxChainLength = Math.max(fun.maxChainLength, maxChainLength);
385 	}
386 
387 	private List<Function> init() {
388 		List<Hash> hashes = new ArrayList<>();
389 		List<Fold> folds = new ArrayList<>();
390 
391 		try {
392 			for (Field f : TextHashFunctions.class.getDeclaredFields()) {
393 				if (f.getType() == Hash.class) {
394 					f.setAccessible(true);
395 					Hash cmp = (Hash) f.get(this);
396 					cmp.name = f.getName();
397 					hashes.add(cmp);
398 
399 				} else if (f.getType() == Fold.class) {
400 					f.setAccessible(true);
401 					Fold fold = (Fold) f.get(this);
402 					fold.name = f.getName();
403 					folds.add(fold);
404 				}
405 			}
406 		} catch (IllegalArgumentException e) {
407 			throw new RuntimeException("Cannot determine names", e); //$NON-NLS-1$
408 		} catch (IllegalAccessException e) {
409 			throw new RuntimeException("Cannot determine names", e); //$NON-NLS-1$
410 		}
411 
412 		List<Function> all = new ArrayList<>();
413 		for (Hash cmp : hashes) {
414 			if (include(cmp.name, hashFunctions)) {
415 				for (Fold f : folds) {
416 					if (include(f.name, foldFunctions)) {
417 						all.add(new Function(cmp, f));
418 					}
419 				}
420 			}
421 		}
422 		return all;
423 	}
424 
425 	private static boolean include(String name, List<String> want) {
426 		if (want.isEmpty())
427 			return true;
428 		for (String s : want) {
429 			if (s.equalsIgnoreCase(name))
430 				return true;
431 		}
432 		return false;
433 	}
434 
435 	private static class Function {
436 		final Hash hash;
437 
438 		final Fold fold;
439 
440 		int maxChainLength;
441 
442 		Function(Hash cmp, Fold fold) {
443 			this.hash = cmp;
444 			this.fold = fold;
445 		}
446 	}
447 
448 	/** Base class for any hashCode function to be tested. */
449 	private static abstract class Hash extends RawTextComparator {
450 		String name;
451 
452 		@Override
453 		public boolean equals(RawText a, int ai, RawText b, int bi) {
454 			return RawTextComparator.DEFAULT.equals(a, ai, b, bi);
455 		}
456 	}
457 
458 	/** Base class for any hashCode folding function to be tested. */
459 	private static abstract class Fold {
460 		String name;
461 
462 		/**
463 		 * Fold the given 32-bit hash code into only {@code bits} of space.
464 		 *
465 		 * @param hash
466 		 *            the 32 bit hash code to be folded into a smaller value.
467 		 * @param bits
468 		 *            total number of bits that can appear in the output. The
469 		 *            output value must be in the range {@code [0, 1 << bits)}.
470 		 *            When bits = 2, valid outputs are 0, 1, 2, 3.
471 		 * @return the folded hash, squeezed into only {@code bits}.
472 		 */
473 		abstract int fold(int hash, int bits);
474 	}
475 
476 	/** Utility to help us identify unique lines in a file. */
477 	private class Line {
478 		private final RawText txt;
479 
480 		private final int pos;
481 
482 		Line(RawText txt, int pos) {
483 			this.txt = txt;
484 			this.pos = pos;
485 		}
486 
487 		@Override
488 		public int hashCode() {
489 			return RawTextComparator.DEFAULT.hash(txt, pos);
490 		}
491 
492 		@Override
493 		public boolean equals(Object obj) {
494 			if (obj instanceof Line) {
495 				Line e = (Line) obj;
496 				return RawTextComparator.DEFAULT.equals(txt, pos, e.txt, e.pos);
497 			}
498 			return false;
499 		}
500 	}
501 
502 	private static int tableBits(int sz) {
503 		int bits = 31 - Integer.numberOfLeadingZeros(sz);
504 		if (bits == 0)
505 			bits = 1;
506 		if (1 << bits < sz)
507 			bits++;
508 		return bits;
509 	}
510 }