1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44 package org.eclipse.jgit.diff;
45
46 import java.io.EOFException;
47 import java.io.IOException;
48 import java.io.InputStream;
49 import java.util.Arrays;
50
51 import org.eclipse.jgit.errors.MissingObjectException;
52 import org.eclipse.jgit.lib.ObjectLoader;
53 import org.eclipse.jgit.lib.ObjectStream;
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 public class SimilarityIndex {
70
71 public static final TableFullException
72 TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
73
74
75
76
77
78
79
80 private static final int KEY_SHIFT = 32;
81
82
83 private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
84
85
86
87
88
89 private long hashedCnt;
90
91
92 private int idSize;
93
94
95 private int idGrowAt;
96
97
98
99
100
101
102
103
104
105
106 private long[] idHash;
107
108
109 private int idHashBits;
110
111
112
113
114
115
116
117
118
119
120
121
122
123 public static SimilarityIndex create(ObjectLoader obj) throws IOException,
124 TableFullException {
125 SimilarityIndex idx = new SimilarityIndex();
126 idx.hash(obj);
127 idx.sort();
128 return idx;
129 }
130
131 SimilarityIndex() {
132 idHashBits = 8;
133 idHash = new long[1 << idHashBits];
134 idGrowAt = growAt(idHashBits);
135 }
136
137 void hash(ObjectLoader obj) throws MissingObjectException, IOException,
138 TableFullException {
139 if (obj.isLarge()) {
140 hashLargeObject(obj);
141 } else {
142 byte[] raw = obj.getCachedBytes();
143 hash(raw, 0, raw.length);
144 }
145 }
146
147 private void hashLargeObject(ObjectLoader obj) throws IOException,
148 TableFullException {
149 ObjectStream in1 = obj.openStream();
150 boolean text;
151 try {
152 text = !RawText.isBinary(in1);
153 } finally {
154 in1.close();
155 }
156
157 ObjectStream in2 = obj.openStream();
158 try {
159 hash(in2, in2.getSize(), text);
160 } finally {
161 in2.close();
162 }
163 }
164
165 void hash(byte[] raw, int ptr, final int end) throws TableFullException {
166 final boolean text = !RawText.isBinary(raw);
167 hashedCnt = 0;
168 while (ptr < end) {
169 int hash = 5381;
170 int blockHashedCnt = 0;
171 int start = ptr;
172
173
174 do {
175 int c = raw[ptr++] & 0xff;
176
177 if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
178 continue;
179 blockHashedCnt++;
180 if (c == '\n')
181 break;
182 hash = (hash << 5) + hash + c;
183 } while (ptr < end && ptr - start < 64);
184 hashedCnt += blockHashedCnt;
185 add(hash, blockHashedCnt);
186 }
187 }
188
189 void hash(InputStream in, long remaining, boolean text) throws IOException,
190 TableFullException {
191 byte[] buf = new byte[4096];
192 int ptr = 0;
193 int cnt = 0;
194
195 while (0 < remaining) {
196 int hash = 5381;
197 int blockHashedCnt = 0;
198
199
200 int n = 0;
201 do {
202 if (ptr == cnt) {
203 ptr = 0;
204 cnt = in.read(buf, 0, buf.length);
205 if (cnt <= 0)
206 throw new EOFException();
207 }
208
209 n++;
210 int c = buf[ptr++] & 0xff;
211
212 if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
213 continue;
214 blockHashedCnt++;
215 if (c == '\n')
216 break;
217 hash = (hash << 5) + hash + c;
218 } while (n < 64 && n < remaining);
219 hashedCnt += blockHashedCnt;
220 add(hash, blockHashedCnt);
221 remaining -= n;
222 }
223 }
224
225
226
227
228
229
230 void sort() {
231
232
233
234
235 Arrays.sort(idHash);
236 }
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258 public int score(SimilarityIndex dst, int maxScore) {
259 long max = Math.max(hashedCnt, dst.hashedCnt);
260 if (max == 0)
261 return maxScore;
262 return (int) ((common(dst) * maxScore) / max);
263 }
264
265 long common(SimilarityIndex dst) {
266 return common(this, dst);
267 }
268
269 private static long common(SimilarityIndex src, SimilarityIndex dst) {
270 int srcIdx = src.packedIndex(0);
271 int dstIdx = dst.packedIndex(0);
272 long[] srcHash = src.idHash;
273 long[] dstHash = dst.idHash;
274 return common(srcHash, srcIdx, dstHash, dstIdx);
275 }
276
277 private static long common(long[] srcHash, int srcIdx,
278 long[] dstHash, int dstIdx) {
279 if (srcIdx == srcHash.length || dstIdx == dstHash.length)
280 return 0;
281
282 long common = 0;
283 int srcKey = keyOf(srcHash[srcIdx]);
284 int dstKey = keyOf(dstHash[dstIdx]);
285
286 for (;;) {
287 if (srcKey == dstKey) {
288 common += Math.min(countOf(srcHash[srcIdx]),
289 countOf(dstHash[dstIdx]));
290
291 if (++srcIdx == srcHash.length)
292 break;
293 srcKey = keyOf(srcHash[srcIdx]);
294
295 if (++dstIdx == dstHash.length)
296 break;
297 dstKey = keyOf(dstHash[dstIdx]);
298
299 } else if (srcKey < dstKey) {
300
301 if (++srcIdx == srcHash.length)
302 break;
303 srcKey = keyOf(srcHash[srcIdx]);
304
305 } else {
306
307 if (++dstIdx == dstHash.length)
308 break;
309 dstKey = keyOf(dstHash[dstIdx]);
310 }
311 }
312
313 return common;
314 }
315
316
317 int size() {
318 return idSize;
319 }
320
321
322 int key(int idx) {
323 return keyOf(idHash[packedIndex(idx)]);
324 }
325
326
327 long count(int idx) {
328 return countOf(idHash[packedIndex(idx)]);
329 }
330
331
332 int findIndex(int key) {
333 for (int i = 0; i < idSize; i++)
334 if (key(i) == key)
335 return i;
336 return -1;
337 }
338
339 private int packedIndex(int idx) {
340 return (idHash.length - idSize) + idx;
341 }
342
343 void add(int key, int cnt) throws TableFullException {
344 key = (key * 0x9e370001) >>> 1;
345
346 int j = slot(key);
347 for (;;) {
348 long v = idHash[j];
349 if (v == 0) {
350
351 if (idGrowAt <= idSize) {
352 grow();
353 j = slot(key);
354 continue;
355 }
356 idHash[j] = pair(key, cnt);
357 idSize++;
358 return;
359
360 } else if (keyOf(v) == key) {
361
362
363
364 idHash[j] = pair(key, countOf(v) + cnt);
365 return;
366
367 } else if (++j >= idHash.length) {
368 j = 0;
369 }
370 }
371 }
372
373 private static long pair(int key, long cnt) throws TableFullException {
374 if (MAX_COUNT < cnt)
375 throw new TableFullException();
376 return (((long) key) << KEY_SHIFT) | cnt;
377 }
378
379 private int slot(int key) {
380
381
382
383
384 return key >>> (31 - idHashBits);
385 }
386
387 private static int growAt(int idHashBits) {
388 return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
389 }
390
391 private void grow() throws TableFullException {
392 if (idHashBits == 30)
393 throw new TableFullException();
394
395 long[] oldHash = idHash;
396 int oldSize = idHash.length;
397
398 idHashBits++;
399 idGrowAt = growAt(idHashBits);
400
401 try {
402 idHash = new long[1 << idHashBits];
403 } catch (OutOfMemoryError noMemory) {
404 throw TABLE_FULL_OUT_OF_MEMORY;
405 }
406
407 for (int i = 0; i < oldSize; i++) {
408 long v = oldHash[i];
409 if (v != 0) {
410 int j = slot(keyOf(v));
411 while (idHash[j] != 0)
412 if (++j >= idHash.length)
413 j = 0;
414 idHash[j] = v;
415 }
416 }
417 }
418
419 private static int keyOf(long v) {
420 return (int) (v >>> KEY_SHIFT);
421 }
422
423 private static long countOf(long v) {
424 return v & MAX_COUNT;
425 }
426
427
428 public static class TableFullException extends Exception {
429 private static final long serialVersionUID = 1L;
430 }
431 }