1
2
3
4
5
6
7
8
9
10
11 package org.eclipse.jgit.diff;
12
13 import java.io.EOFException;
14 import java.io.IOException;
15 import java.io.InputStream;
16 import java.util.Arrays;
17
18 import org.eclipse.jgit.errors.MissingObjectException;
19 import org.eclipse.jgit.lib.ObjectLoader;
20 import org.eclipse.jgit.lib.ObjectStream;
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37 public class SimilarityIndex {
38
39 public static final TableFullException
40 TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
41
42
43
44
45
46
47
48 private static final int KEY_SHIFT = 32;
49
50
51 private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
52
53
54
55
56
57 private long hashedCnt;
58
59
60 private int idSize;
61
62
63 private int idGrowAt;
64
65
66
67
68
69
70
71
72
73
74 private long[] idHash;
75
76
77 private int idHashBits;
78
79
80
81
82
83
84
85
86
87
88
89
90
91 public static SimilarityIndex create(ObjectLoader obj) throws IOException,
92 TableFullException {
93 SimilarityIndex idx = new SimilarityIndex();
94 idx.hash(obj);
95 idx.sort();
96 return idx;
97 }
98
99 SimilarityIndex() {
100 idHashBits = 8;
101 idHash = new long[1 << idHashBits];
102 idGrowAt = growAt(idHashBits);
103 }
104
105 static boolean isBinary(ObjectLoader obj) throws IOException {
106 if (obj.isLarge()) {
107 try (ObjectStream in1 = obj.openStream()) {
108 return RawText.isBinary(in1);
109 }
110 }
111 byte[] raw = obj.getCachedBytes();
112 return RawText.isBinary(raw, raw.length, true);
113 }
114
115 void hash(ObjectLoader obj) throws MissingObjectException, IOException,
116 TableFullException {
117 if (obj.isLarge()) {
118 hashLargeObject(obj);
119 } else {
120 byte[] raw = obj.getCachedBytes();
121 hash(raw, 0, raw.length);
122 }
123 }
124
125 private void hashLargeObject(ObjectLoader obj) throws IOException,
126 TableFullException {
127 boolean text;
128 text = !isBinary(obj);
129
130 try (ObjectStream in2 = obj.openStream()) {
131 hash(in2, in2.getSize(), text);
132 }
133 }
134
135 void hash(byte[] raw, int ptr, int end) throws TableFullException {
136 final boolean text = !RawText.isBinary(raw, raw.length, true);
137 hashedCnt = 0;
138 while (ptr < end) {
139 int hash = 5381;
140 int blockHashedCnt = 0;
141 int start = ptr;
142
143
144 do {
145 int c = raw[ptr++] & 0xff;
146
147 if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
148 continue;
149 blockHashedCnt++;
150 if (c == '\n')
151 break;
152 hash = (hash << 5) + hash + c;
153 } while (ptr < end && ptr - start < 64);
154 hashedCnt += blockHashedCnt;
155 add(hash, blockHashedCnt);
156 }
157 }
158
159 void hash(InputStream in, long remaining, boolean text) throws IOException,
160 TableFullException {
161 byte[] buf = new byte[4096];
162 int ptr = 0;
163 int cnt = 0;
164
165 while (0 < remaining) {
166 int hash = 5381;
167 int blockHashedCnt = 0;
168
169
170 int n = 0;
171 do {
172 if (ptr == cnt) {
173 ptr = 0;
174 cnt = in.read(buf, 0, buf.length);
175 if (cnt <= 0)
176 throw new EOFException();
177 }
178
179 n++;
180 int c = buf[ptr++] & 0xff;
181
182 if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
183 continue;
184 blockHashedCnt++;
185 if (c == '\n')
186 break;
187 hash = (hash << 5) + hash + c;
188 } while (n < 64 && n < remaining);
189 hashedCnt += blockHashedCnt;
190 add(hash, blockHashedCnt);
191 remaining -= n;
192 }
193 }
194
195
196
197
198
199
200 void sort() {
201
202
203
204
205 Arrays.sort(idHash);
206 }
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228 public int score(SimilarityIndex dst, int maxScore) {
229 long max = Math.max(hashedCnt, dst.hashedCnt);
230 if (max == 0)
231 return maxScore;
232 return (int) ((common(dst) * maxScore) / max);
233 }
234
235 long common(SimilarityIndex dst) {
236 return common(this, dst);
237 }
238
239 private static long common(SimilarityIndex src, SimilarityIndex dst) {
240 int srcIdx = src.packedIndex(0);
241 int dstIdx = dst.packedIndex(0);
242 long[] srcHash = src.idHash;
243 long[] dstHash = dst.idHash;
244 return common(srcHash, srcIdx, dstHash, dstIdx);
245 }
246
247 private static long common(long[] srcHash, int srcIdx,
248 long[] dstHash, int dstIdx) {
249 if (srcIdx == srcHash.length || dstIdx == dstHash.length)
250 return 0;
251
252 long common = 0;
253 int srcKey = keyOf(srcHash[srcIdx]);
254 int dstKey = keyOf(dstHash[dstIdx]);
255
256 for (;;) {
257 if (srcKey == dstKey) {
258 common += Math.min(countOf(srcHash[srcIdx]),
259 countOf(dstHash[dstIdx]));
260
261 if (++srcIdx == srcHash.length)
262 break;
263 srcKey = keyOf(srcHash[srcIdx]);
264
265 if (++dstIdx == dstHash.length)
266 break;
267 dstKey = keyOf(dstHash[dstIdx]);
268
269 } else if (srcKey < dstKey) {
270
271 if (++srcIdx == srcHash.length)
272 break;
273 srcKey = keyOf(srcHash[srcIdx]);
274
275 } else {
276
277 if (++dstIdx == dstHash.length)
278 break;
279 dstKey = keyOf(dstHash[dstIdx]);
280 }
281 }
282
283 return common;
284 }
285
286
287 int size() {
288 return idSize;
289 }
290
291
292 int key(int idx) {
293 return keyOf(idHash[packedIndex(idx)]);
294 }
295
296
297 long count(int idx) {
298 return countOf(idHash[packedIndex(idx)]);
299 }
300
301
302 int findIndex(int key) {
303 for (int i = 0; i < idSize; i++)
304 if (key(i) == key)
305 return i;
306 return -1;
307 }
308
309 private int packedIndex(int idx) {
310 return (idHash.length - idSize) + idx;
311 }
312
313 void add(int key, int cnt) throws TableFullException {
314 key = (key * 0x9e370001) >>> 1;
315
316 int j = slot(key);
317 for (;;) {
318 long v = idHash[j];
319 if (v == 0) {
320
321 if (idGrowAt <= idSize) {
322 grow();
323 j = slot(key);
324 continue;
325 }
326 idHash[j] = pair(key, cnt);
327 idSize++;
328 return;
329
330 } else if (keyOf(v) == key) {
331
332
333
334 idHash[j] = pair(key, countOf(v) + cnt);
335 return;
336
337 } else if (++j >= idHash.length) {
338 j = 0;
339 }
340 }
341 }
342
343 private static long pair(int key, long cnt) throws TableFullException {
344 if (MAX_COUNT < cnt)
345 throw new TableFullException();
346 return (((long) key) << KEY_SHIFT) | cnt;
347 }
348
349 private int slot(int key) {
350
351
352
353
354 return key >>> (31 - idHashBits);
355 }
356
357 private static int growAt(int idHashBits) {
358 return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
359 }
360
361 @SuppressWarnings("UnusedException")
362 private void grow() throws TableFullException {
363 if (idHashBits == 30)
364 throw new TableFullException();
365
366 long[] oldHash = idHash;
367 int oldSize = idHash.length;
368
369 idHashBits++;
370 idGrowAt = growAt(idHashBits);
371
372 try {
373 idHash = new long[1 << idHashBits];
374 } catch (OutOfMemoryError noMemory) {
375 throw TABLE_FULL_OUT_OF_MEMORY;
376 }
377
378 for (int i = 0; i < oldSize; i++) {
379 long v = oldHash[i];
380 if (v != 0) {
381 int j = slot(keyOf(v));
382 while (idHash[j] != 0)
383 if (++j >= idHash.length)
384 j = 0;
385 idHash[j] = v;
386 }
387 }
388 }
389
390 private static int keyOf(long v) {
391 return (int) (v >>> KEY_SHIFT);
392 }
393
394 private static long countOf(long v) {
395 return v & MAX_COUNT;
396 }
397
398
399 public static class TableFullException extends Exception {
400 private static final long serialVersionUID = 1L;
401 }
402 }