1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44 package org.eclipse.jgit.diff;
45
46 import java.io.EOFException;
47 import java.io.IOException;
48 import java.io.InputStream;
49 import java.util.Arrays;
50
51 import org.eclipse.jgit.errors.MissingObjectException;
52 import org.eclipse.jgit.lib.ObjectLoader;
53 import org.eclipse.jgit.lib.ObjectStream;
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70 public class SimilarityIndex {
71
72 public static final TableFullException
73 TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
74
75
76
77
78
79
80
81 private static final int KEY_SHIFT = 32;
82
83
84 private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
85
86
87
88
89
90 private long hashedCnt;
91
92
93 private int idSize;
94
95
96 private int idGrowAt;
97
98
99
100
101
102
103
104
105
106
107 private long[] idHash;
108
109
110 private int idHashBits;
111
112
113
114
115
116
117
118
119
120
121
122
123
124 public static SimilarityIndex create(ObjectLoader obj) throws IOException,
125 TableFullException {
126 SimilarityIndex idx = new SimilarityIndex();
127 idx.hash(obj);
128 idx.sort();
129 return idx;
130 }
131
132 SimilarityIndex() {
133 idHashBits = 8;
134 idHash = new long[1 << idHashBits];
135 idGrowAt = growAt(idHashBits);
136 }
137
138 void hash(ObjectLoader obj) throws MissingObjectException, IOException,
139 TableFullException {
140 if (obj.isLarge()) {
141 hashLargeObject(obj);
142 } else {
143 byte[] raw = obj.getCachedBytes();
144 hash(raw, 0, raw.length);
145 }
146 }
147
148 private void hashLargeObject(ObjectLoader obj) throws IOException,
149 TableFullException {
150 ObjectStream in1 = obj.openStream();
151 boolean text;
152 try {
153 text = !RawText.isBinary(in1);
154 } finally {
155 in1.close();
156 }
157
158 ObjectStream in2 = obj.openStream();
159 try {
160 hash(in2, in2.getSize(), text);
161 } finally {
162 in2.close();
163 }
164 }
165
166 void hash(byte[] raw, int ptr, final int end) throws TableFullException {
167 final boolean text = !RawText.isBinary(raw);
168 hashedCnt = 0;
169 while (ptr < end) {
170 int hash = 5381;
171 int blockHashedCnt = 0;
172 int start = ptr;
173
174
175 do {
176 int c = raw[ptr++] & 0xff;
177
178 if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
179 continue;
180 blockHashedCnt++;
181 if (c == '\n')
182 break;
183 hash = (hash << 5) + hash + c;
184 } while (ptr < end && ptr - start < 64);
185 hashedCnt += blockHashedCnt;
186 add(hash, blockHashedCnt);
187 }
188 }
189
190 void hash(InputStream in, long remaining, boolean text) throws IOException,
191 TableFullException {
192 byte[] buf = new byte[4096];
193 int ptr = 0;
194 int cnt = 0;
195
196 while (0 < remaining) {
197 int hash = 5381;
198 int blockHashedCnt = 0;
199
200
201 int n = 0;
202 do {
203 if (ptr == cnt) {
204 ptr = 0;
205 cnt = in.read(buf, 0, buf.length);
206 if (cnt <= 0)
207 throw new EOFException();
208 }
209
210 n++;
211 int c = buf[ptr++] & 0xff;
212
213 if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
214 continue;
215 blockHashedCnt++;
216 if (c == '\n')
217 break;
218 hash = (hash << 5) + hash + c;
219 } while (n < 64 && n < remaining);
220 hashedCnt += blockHashedCnt;
221 add(hash, blockHashedCnt);
222 remaining -= n;
223 }
224 }
225
226
227
228
229
230
231 void sort() {
232
233
234
235
236 Arrays.sort(idHash);
237 }
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259 public int score(SimilarityIndex dst, int maxScore) {
260 long max = Math.max(hashedCnt, dst.hashedCnt);
261 if (max == 0)
262 return maxScore;
263 return (int) ((common(dst) * maxScore) / max);
264 }
265
266 long common(SimilarityIndex dst) {
267 return common(this, dst);
268 }
269
270 private static long common(SimilarityIndex src, SimilarityIndex dst) {
271 int srcIdx = src.packedIndex(0);
272 int dstIdx = dst.packedIndex(0);
273 long[] srcHash = src.idHash;
274 long[] dstHash = dst.idHash;
275 return common(srcHash, srcIdx, dstHash, dstIdx);
276 }
277
278 private static long common(long[] srcHash, int srcIdx,
279 long[] dstHash, int dstIdx) {
280 if (srcIdx == srcHash.length || dstIdx == dstHash.length)
281 return 0;
282
283 long common = 0;
284 int srcKey = keyOf(srcHash[srcIdx]);
285 int dstKey = keyOf(dstHash[dstIdx]);
286
287 for (;;) {
288 if (srcKey == dstKey) {
289 common += Math.min(countOf(srcHash[srcIdx]),
290 countOf(dstHash[dstIdx]));
291
292 if (++srcIdx == srcHash.length)
293 break;
294 srcKey = keyOf(srcHash[srcIdx]);
295
296 if (++dstIdx == dstHash.length)
297 break;
298 dstKey = keyOf(dstHash[dstIdx]);
299
300 } else if (srcKey < dstKey) {
301
302 if (++srcIdx == srcHash.length)
303 break;
304 srcKey = keyOf(srcHash[srcIdx]);
305
306 } else {
307
308 if (++dstIdx == dstHash.length)
309 break;
310 dstKey = keyOf(dstHash[dstIdx]);
311 }
312 }
313
314 return common;
315 }
316
317
318 int size() {
319 return idSize;
320 }
321
322
323 int key(int idx) {
324 return keyOf(idHash[packedIndex(idx)]);
325 }
326
327
328 long count(int idx) {
329 return countOf(idHash[packedIndex(idx)]);
330 }
331
332
333 int findIndex(int key) {
334 for (int i = 0; i < idSize; i++)
335 if (key(i) == key)
336 return i;
337 return -1;
338 }
339
340 private int packedIndex(int idx) {
341 return (idHash.length - idSize) + idx;
342 }
343
344 void add(int key, int cnt) throws TableFullException {
345 key = (key * 0x9e370001) >>> 1;
346
347 int j = slot(key);
348 for (;;) {
349 long v = idHash[j];
350 if (v == 0) {
351
352 if (idGrowAt <= idSize) {
353 grow();
354 j = slot(key);
355 continue;
356 }
357 idHash[j] = pair(key, cnt);
358 idSize++;
359 return;
360
361 } else if (keyOf(v) == key) {
362
363
364
365 idHash[j] = pair(key, countOf(v) + cnt);
366 return;
367
368 } else if (++j >= idHash.length) {
369 j = 0;
370 }
371 }
372 }
373
374 private static long pair(int key, long cnt) throws TableFullException {
375 if (MAX_COUNT < cnt)
376 throw new TableFullException();
377 return (((long) key) << KEY_SHIFT) | cnt;
378 }
379
380 private int slot(int key) {
381
382
383
384
385 return key >>> (31 - idHashBits);
386 }
387
388 private static int growAt(int idHashBits) {
389 return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
390 }
391
392 private void grow() throws TableFullException {
393 if (idHashBits == 30)
394 throw new TableFullException();
395
396 long[] oldHash = idHash;
397 int oldSize = idHash.length;
398
399 idHashBits++;
400 idGrowAt = growAt(idHashBits);
401
402 try {
403 idHash = new long[1 << idHashBits];
404 } catch (OutOfMemoryError noMemory) {
405 throw TABLE_FULL_OUT_OF_MEMORY;
406 }
407
408 for (int i = 0; i < oldSize; i++) {
409 long v = oldHash[i];
410 if (v != 0) {
411 int j = slot(keyOf(v));
412 while (idHash[j] != 0)
413 if (++j >= idHash.length)
414 j = 0;
415 idHash[j] = v;
416 }
417 }
418 }
419
420 private static int keyOf(long v) {
421 return (int) (v >>> KEY_SHIFT);
422 }
423
424 private static long countOf(long v) {
425 return v & MAX_COUNT;
426 }
427
428
429 public static class TableFullException extends Exception {
430 private static final long serialVersionUID = 1L;
431 }
432 }