1 /*
2 * Copyright (C) 2010, Google Inc.
3 * and other copyright owners as documented in the project's IP log.
4 *
5 * This program and the accompanying materials are made available
6 * under the terms of the Eclipse Distribution License v1.0 which
7 * accompanies this distribution, is reproduced below, and is
8 * available at http://www.eclipse.org/org/documents/edl-v10.php
9 *
10 * All rights reserved.
11 *
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
14 * conditions are met:
15 *
16 * - Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials provided
22 * with the distribution.
23 *
24 * - Neither the name of the Eclipse Foundation, Inc. nor the
25 * names of its contributors may be used to endorse or promote
26 * products derived from this software without specific prior
27 * written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 */
43
44 package org.eclipse.jgit.diff;
45
46 import org.eclipse.jgit.internal.JGitText;
47
48 /**
49 * Support {@link HistogramDiff} by computing occurrence counts of elements.
50 * <p>
51 * Each element in the range being considered is put into a hash table, tracking
52 * the number of times that distinct element appears in the sequence. Once all
53 * elements have been inserted from sequence A, each element of sequence B is
54 * probed in the hash table and the longest common subsequence with the lowest
55 * occurrence count in A is used as the result.
56 *
57 * @param <S>
58 * type of the base sequence.
59 */
60 final class HistogramDiffIndex<S extends Sequence> {
61 private static final int REC_NEXT_SHIFT = 28 + 8;
62
63 private static final int REC_PTR_SHIFT = 8;
64
65 private static final int REC_PTR_MASK = (1 << 28) - 1;
66
67 private static final int REC_CNT_MASK = (1 << 8) - 1;
68
69 private static final int MAX_PTR = REC_PTR_MASK;
70
71 private static final int MAX_CNT = (1 << 8) - 1;
72
73 private final int maxChainLength;
74
75 private final HashedSequenceComparator<S> cmp;
76
77 private final HashedSequence<S> a;
78
79 private final HashedSequence<S> b;
80
81 private final Edit region;
82
83 /** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
84 private final int[] table;
85
86 /** Number of low bits to discard from a key to index {@link #table}. */
87 private final int keyShift;
88
89 /**
90 * Describes a unique element in sequence A.
91 *
92 * The records in this table are actually 3-tuples of:
93 * <ul>
94 * <li>index of next record in this table that has same hash code</li>
95 * <li>index of first element in this occurrence chain</li>
96 * <li>occurrence count for this element (length of locs list)</li>
97 * </ul>
98 *
99 * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
100 * a few bits wide. Elements that occur more frequently will have their
101 * count capped.
102 */
103 private long[] recs;
104
105 /** Number of elements in {@link #recs}; also is the unique element count. */
106 private int recCnt;
107
108 /**
109 * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
110 *
111 * For the sequence element {@code ptr}, the value stored at location
112 * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
113 * element in the sequence.
114 *
115 * Chains always run from the lowest index to the largest index. Therefore
116 * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
117 * This allows a chain to terminate with {@code 0}, as {@code 0} would never
118 * be a valid next element.
119 *
120 * The array is sized to be {@code region.getLengthA()} and element indexes
121 * are converted to array indexes by subtracting {@link #ptrShift}, which is
122 * just a cached version of {@code region.beginA}.
123 */
124 private int[] next;
125
126 /**
127 * For element {@code ptr} in A, index of the record in {@link #recs} array.
128 *
129 * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
130 * describing all occurrences of the element appearing in sequence A at
131 * position {@code ptr}. The record is needed to get the occurrence count of
132 * the element, or to locate all other occurrences of that element within
133 * sequence A. This index provides constant-time access to the record, and
134 * avoids needing to scan the hash chain.
135 */
136 private int[] recIdx;
137
138 /** Value to subtract from element indexes to key {@link #next} array. */
139 private int ptrShift;
140
141 private Edit lcs;
142
143 private int cnt;
144
145 private boolean hasCommon;
146
147 HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
148 HashedSequence<S> a, HashedSequence<S> b, Edit r) {
149 this.maxChainLength = maxChainLength;
150 this.cmp = cmp;
151 this.a = a;
152 this.b = b;
153 this.region = r;
154
155 if (region.endA >= MAX_PTR)
156 throw new IllegalArgumentException(
157 JGitText.get().sequenceTooLargeForDiffAlgorithm);
158
159 final int sz = r.getLengthA();
160 final int tableBits = tableBits(sz);
161 table = new int[1 << tableBits];
162 keyShift = 32 - tableBits;
163 ptrShift = r.beginA;
164
165 recs = new long[Math.max(4, sz >>> 3)];
166 next = new int[sz];
167 recIdx = new int[sz];
168 }
169
170 Edit findLongestCommonSequence() {
171 if (!scanA())
172 return null;
173
174 lcs = new Edit(0, 0);
175 cnt = maxChainLength + 1;
176
177 for (int bPtr = region.beginB; bPtr < region.endB;)
178 bPtr = tryLongestCommonSequence(bPtr);
179
180 return hasCommon && maxChainLength < cnt ? null : lcs;
181 }
182
183 private boolean scanA() {
184 // Scan the elements backwards, inserting them into the hash table
185 // as we go. Going in reverse places the earliest occurrence of any
186 // element at the start of the chain, so we consider earlier matches
187 // before later matches.
188 //
189 SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
190 final int tIdx = hash(a, ptr);
191
192 int chainLen = 0;
193 for (int rIdx = table[tIdx]; rIdx != 0;) {
194 final long rec = recs[rIdx];
195 if (cmp.equals(a, recPtr(rec), a, ptr)) {
196 // ptr is identical to another element. Insert it onto
197 // the front of the existing element chain.
198 //
199 int newCnt = recCnt(rec) + 1;
200 if (MAX_CNT < newCnt)
201 newCnt = MAX_CNT;
202 recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
203 next[ptr - ptrShift] = recPtr(rec);
204 recIdx[ptr - ptrShift] = rIdx;
205 continue SCAN;
206 }
207
208 rIdx = recNext(rec);
209 chainLen++;
210 }
211
212 if (chainLen == maxChainLength)
213 return false;
214
215 // This is the first time we have ever seen this particular
216 // element in the sequence. Construct a new chain for it.
217 //
218 final int rIdx = ++recCnt;
219 if (rIdx == recs.length) {
220 int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
221 long[] n = new long[sz];
222 System.arraycopy(recs, 0, n, 0, recs.length);
223 recs = n;
224 }
225
226 recs[rIdx] = recCreate(table[tIdx], ptr, 1);
227 recIdx[ptr - ptrShift] = rIdx;
228 table[tIdx] = rIdx;
229 }
230 return true;
231 }
232
233 private int tryLongestCommonSequence(int bPtr) {
234 int bNext = bPtr + 1;
235 int rIdx = table[hash(b, bPtr)];
236 for (long rec; rIdx != 0; rIdx = recNext(rec)) {
237 rec = recs[rIdx];
238
239 // If there are more occurrences in A, don't use this chain.
240 if (recCnt(rec) > cnt) {
241 if (!hasCommon)
242 hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
243 continue;
244 }
245
246 int as = recPtr(rec);
247 if (!cmp.equals(a, as, b, bPtr))
248 continue;
249
250 hasCommon = true;
251 TRY_LOCATIONS: for (;;) {
252 int np = next[as - ptrShift];
253 int bs = bPtr;
254 int ae = as + 1;
255 int be = bs + 1;
256 int rc = recCnt(rec);
257
258 while (region.beginA < as && region.beginB < bs
259 && cmp.equals(a, as - 1, b, bs - 1)) {
260 as--;
261 bs--;
262 if (1 < rc)
263 rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
264 }
265 while (ae < region.endA && be < region.endB
266 && cmp.equals(a, ae, b, be)) {
267 if (1 < rc)
268 rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
269 ae++;
270 be++;
271 }
272
273 if (bNext < be)
274 bNext = be;
275 if (lcs.getLengthA() < ae - as || rc < cnt) {
276 // If this region is the longest, or there are less
277 // occurrences of it in A, its now our LCS.
278 //
279 lcs.beginA = as;
280 lcs.beginB = bs;
281 lcs.endA = ae;
282 lcs.endB = be;
283 cnt = rc;
284 }
285
286 // Because we added elements in reverse order index 0
287 // cannot possibly be the next position. Its the first
288 // element of the sequence and thus would have been the
289 // value of as at the start of the TRY_LOCATIONS loop.
290 //
291 if (np == 0)
292 break TRY_LOCATIONS;
293
294 while (np < ae) {
295 // The next location to consider was actually within
296 // the LCS we examined above. Don't reconsider it.
297 //
298 np = next[np - ptrShift];
299 if (np == 0)
300 break TRY_LOCATIONS;
301 }
302
303 as = np;
304 }
305 }
306 return bNext;
307 }
308
309 private int hash(HashedSequence<S> s, int idx) {
310 return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
311 }
312
313 private static long recCreate(int next, int ptr, int cnt) {
314 return ((long) next << REC_NEXT_SHIFT) //
315 | ((long) ptr << REC_PTR_SHIFT) //
316 | cnt;
317 }
318
319 private static int recNext(long rec) {
320 return (int) (rec >>> REC_NEXT_SHIFT);
321 }
322
323 private static int recPtr(long rec) {
324 return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
325 }
326
327 private static int recCnt(long rec) {
328 return ((int) rec) & REC_CNT_MASK;
329 }
330
331 private static int tableBits(int sz) {
332 int bits = 31 - Integer.numberOfLeadingZeros(sz);
333 if (bits == 0)
334 bits = 1;
335 if (1 << bits < sz)
336 bits++;
337 return bits;
338 }
339 }