View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc.
3    * and other copyright owners as documented in the project's IP log.
4    *
5    * This program and the accompanying materials are made available
6    * under the terms of the Eclipse Distribution License v1.0 which
7    * accompanies this distribution, is reproduced below, and is
8    * available at http://www.eclipse.org/org/documents/edl-v10.php
9    *
10   * All rights reserved.
11   *
12   * Redistribution and use in source and binary forms, with or
13   * without modification, are permitted provided that the following
14   * conditions are met:
15   *
16   * - Redistributions of source code must retain the above copyright
17   *   notice, this list of conditions and the following disclaimer.
18   *
19   * - Redistributions in binary form must reproduce the above
20   *   copyright notice, this list of conditions and the following
21   *   disclaimer in the documentation and/or other materials provided
22   *   with the distribution.
23   *
24   * - Neither the name of the Eclipse Foundation, Inc. nor the
25   *   names of its contributors may be used to endorse or promote
26   *   products derived from this software without specific prior
27   *   written permission.
28   *
29   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42   */
43  
44  package org.eclipse.jgit.diff;
45  
46  import org.eclipse.jgit.internal.JGitText;
47  
48  /**
49   * Support {@link HistogramDiff} by computing occurrence counts of elements.
50   * <p>
51   * Each element in the range being considered is put into a hash table, tracking
52   * the number of times that distinct element appears in the sequence. Once all
53   * elements have been inserted from sequence A, each element of sequence B is
54   * probed in the hash table and the longest common subsequence with the lowest
55   * occurrence count in A is used as the result.
56   *
57   * @param <S>
58   *            type of the base sequence.
59   */
60  final class HistogramDiffIndex<S extends Sequence> {
61  	private static final int REC_NEXT_SHIFT = 28 + 8;
62  
63  	private static final int REC_PTR_SHIFT = 8;
64  
65  	private static final int REC_PTR_MASK = (1 << 28) - 1;
66  
67  	private static final int REC_CNT_MASK = (1 << 8) - 1;
68  
69  	private static final int MAX_PTR = REC_PTR_MASK;
70  
71  	private static final int MAX_CNT = (1 << 8) - 1;
72  
73  	private final int maxChainLength;
74  
75  	private final HashedSequenceComparator<S> cmp;
76  
77  	private final HashedSequence<S> a;
78  
79  	private final HashedSequence<S> b;
80  
81  	private final Edit region;
82  
83  	/** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
84  	private final int[] table;
85  
86  	/** Number of low bits to discard from a key to index {@link #table}. */
87  	private final int keyShift;
88  
89  	/**
90  	 * Describes a unique element in sequence A.
91  	 *
92  	 * The records in this table are actually 3-tuples of:
93  	 * <ul>
94  	 * <li>index of next record in this table that has same hash code</li>
95  	 * <li>index of first element in this occurrence chain</li>
96  	 * <li>occurrence count for this element (length of locs list)</li>
97  	 * </ul>
98  	 *
99  	 * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
100 	 * a few bits wide. Elements that occur more frequently will have their
101 	 * count capped.
102 	 */
103 	private long[] recs;
104 
105 	/** Number of elements in {@link #recs}; also is the unique element count. */
106 	private int recCnt;
107 
108 	/**
109 	 * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
110 	 *
111 	 * For the sequence element {@code ptr}, the value stored at location
112 	 * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
113 	 * element in the sequence.
114 	 *
115 	 * Chains always run from the lowest index to the largest index. Therefore
116 	 * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
117 	 * This allows a chain to terminate with {@code 0}, as {@code 0} would never
118 	 * be a valid next element.
119 	 *
120 	 * The array is sized to be {@code region.getLengthA()} and element indexes
121 	 * are converted to array indexes by subtracting {@link #ptrShift}, which is
122 	 * just a cached version of {@code region.beginA}.
123 	 */
124 	private int[] next;
125 
126 	/**
127 	 * For element {@code ptr} in A, index of the record in {@link #recs} array.
128 	 *
129 	 * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
130 	 * describing all occurrences of the element appearing in sequence A at
131 	 * position {@code ptr}. The record is needed to get the occurrence count of
132 	 * the element, or to locate all other occurrences of that element within
133 	 * sequence A. This index provides constant-time access to the record, and
134 	 * avoids needing to scan the hash chain.
135 	 */
136 	private int[] recIdx;
137 
138 	/** Value to subtract from element indexes to key {@link #next} array. */
139 	private int ptrShift;
140 
141 	private Edit lcs;
142 
143 	private int cnt;
144 
145 	private boolean hasCommon;
146 
147 	HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
148 			HashedSequence<S> a, HashedSequence<S> b, Edit r) {
149 		this.maxChainLength = maxChainLength;
150 		this.cmp = cmp;
151 		this.a = a;
152 		this.b = b;
153 		this.region = r;
154 
155 		if (region.endA >= MAX_PTR)
156 			throw new IllegalArgumentException(
157 					JGitText.get().sequenceTooLargeForDiffAlgorithm);
158 
159 		final int sz = r.getLengthA();
160 		final int tableBits = tableBits(sz);
161 		table = new int[1 << tableBits];
162 		keyShift = 32 - tableBits;
163 		ptrShift = r.beginA;
164 
165 		recs = new long[Math.max(4, sz >>> 3)];
166 		next = new int[sz];
167 		recIdx = new int[sz];
168 	}
169 
170 	Edit findLongestCommonSequence() {
171 		if (!scanA())
172 			return null;
173 
174 		lcs = new Edit(0, 0);
175 		cnt = maxChainLength + 1;
176 
177 		for (int bPtr = region.beginB; bPtr < region.endB;)
178 			bPtr = tryLongestCommonSequence(bPtr);
179 
180 		return hasCommon && maxChainLength < cnt ? null : lcs;
181 	}
182 
183 	private boolean scanA() {
184 		// Scan the elements backwards, inserting them into the hash table
185 		// as we go. Going in reverse places the earliest occurrence of any
186 		// element at the start of the chain, so we consider earlier matches
187 		// before later matches.
188 		//
189 		SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
190 			final int tIdx = hash(a, ptr);
191 
192 			int chainLen = 0;
193 			for (int rIdx = table[tIdx]; rIdx != 0;) {
194 				final long rec = recs[rIdx];
195 				if (cmp.equals(a, recPtr(rec), a, ptr)) {
196 					// ptr is identical to another element. Insert it onto
197 					// the front of the existing element chain.
198 					//
199 					int newCnt = recCnt(rec) + 1;
200 					if (MAX_CNT < newCnt)
201 						newCnt = MAX_CNT;
202 					recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
203 					next[ptr - ptrShift] = recPtr(rec);
204 					recIdx[ptr - ptrShift] = rIdx;
205 					continue SCAN;
206 				}
207 
208 				rIdx = recNext(rec);
209 				chainLen++;
210 			}
211 
212 			if (chainLen == maxChainLength)
213 				return false;
214 
215 			// This is the first time we have ever seen this particular
216 			// element in the sequence. Construct a new chain for it.
217 			//
218 			final int rIdx = ++recCnt;
219 			if (rIdx == recs.length) {
220 				int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
221 				long[] n = new long[sz];
222 				System.arraycopy(recs, 0, n, 0, recs.length);
223 				recs = n;
224 			}
225 
226 			recs[rIdx] = recCreate(table[tIdx], ptr, 1);
227 			recIdx[ptr - ptrShift] = rIdx;
228 			table[tIdx] = rIdx;
229 		}
230 		return true;
231 	}
232 
233 	private int tryLongestCommonSequence(int bPtr) {
234 		int bNext = bPtr + 1;
235 		int rIdx = table[hash(b, bPtr)];
236 		for (long rec; rIdx != 0; rIdx = recNext(rec)) {
237 			rec = recs[rIdx];
238 
239 			// If there are more occurrences in A, don't use this chain.
240 			if (recCnt(rec) > cnt) {
241 				if (!hasCommon)
242 					hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
243 				continue;
244 			}
245 
246 			int as = recPtr(rec);
247 			if (!cmp.equals(a, as, b, bPtr))
248 				continue;
249 
250 			hasCommon = true;
251 			TRY_LOCATIONS: for (;;) {
252 				int np = next[as - ptrShift];
253 				int bs = bPtr;
254 				int ae = as + 1;
255 				int be = bs + 1;
256 				int rc = recCnt(rec);
257 
258 				while (region.beginA < as && region.beginB < bs
259 						&& cmp.equals(a, as - 1, b, bs - 1)) {
260 					as--;
261 					bs--;
262 					if (1 < rc)
263 						rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
264 				}
265 				while (ae < region.endA && be < region.endB
266 						&& cmp.equals(a, ae, b, be)) {
267 					if (1 < rc)
268 						rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
269 					ae++;
270 					be++;
271 				}
272 
273 				if (bNext < be)
274 					bNext = be;
275 				if (lcs.getLengthA() < ae - as || rc < cnt) {
276 					// If this region is the longest, or there are less
277 					// occurrences of it in A, its now our LCS.
278 					//
279 					lcs.beginA = as;
280 					lcs.beginB = bs;
281 					lcs.endA = ae;
282 					lcs.endB = be;
283 					cnt = rc;
284 				}
285 
286 				// Because we added elements in reverse order index 0
287 				// cannot possibly be the next position. Its the first
288 				// element of the sequence and thus would have been the
289 				// value of as at the start of the TRY_LOCATIONS loop.
290 				//
291 				if (np == 0)
292 					break TRY_LOCATIONS;
293 
294 				while (np < ae) {
295 					// The next location to consider was actually within
296 					// the LCS we examined above. Don't reconsider it.
297 					//
298 					np = next[np - ptrShift];
299 					if (np == 0)
300 						break TRY_LOCATIONS;
301 				}
302 
303 				as = np;
304 			}
305 		}
306 		return bNext;
307 	}
308 
309 	private int hash(HashedSequence<S> s, int idx) {
310 		return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
311 	}
312 
313 	private static long recCreate(int next, int ptr, int cnt) {
314 		return ((long) next << REC_NEXT_SHIFT) //
315 				| ((long) ptr << REC_PTR_SHIFT) //
316 				| cnt;
317 	}
318 
319 	private static int recNext(long rec) {
320 		return (int) (rec >>> REC_NEXT_SHIFT);
321 	}
322 
323 	private static int recPtr(long rec) {
324 		return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
325 	}
326 
327 	private static int recCnt(long rec) {
328 		return ((int) rec) & REC_CNT_MASK;
329 	}
330 
331 	private static int tableBits(int sz) {
332 		int bits = 31 - Integer.numberOfLeadingZeros(sz);
333 		if (bits == 0)
334 			bits = 1;
335 		if (1 << bits < sz)
336 			bits++;
337 		return bits;
338 	}
339 }