View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc. and others
3    *
4    * This program and the accompanying materials are made available under the
5    * terms of the Eclipse Distribution License v. 1.0 which is available at
6    * https://www.eclipse.org/org/documents/edl-v10.php.
7    *
8    * SPDX-License-Identifier: BSD-3-Clause
9    */
10  
11  package org.eclipse.jgit.diff;
12  
13  import org.eclipse.jgit.internal.JGitText;
14  
15  /**
16   * Support {@link HistogramDiff} by computing occurrence counts of elements.
17   * <p>
18   * Each element in the range being considered is put into a hash table, tracking
19   * the number of times that distinct element appears in the sequence. Once all
20   * elements have been inserted from sequence A, each element of sequence B is
21   * probed in the hash table and the longest common subsequence with the lowest
22   * occurrence count in A is used as the result.
23   *
24   * @param <S>
25   *            type of the base sequence.
26   */
27  final class HistogramDiffIndex<S extends Sequence> {
28  	private static final int REC_NEXT_SHIFT = 28 + 8;
29  
30  	private static final int REC_PTR_SHIFT = 8;
31  
32  	private static final int REC_PTR_MASK = (1 << 28) - 1;
33  
34  	private static final int REC_CNT_MASK = (1 << 8) - 1;
35  
36  	private static final int MAX_PTR = REC_PTR_MASK;
37  
38  	private static final int MAX_CNT = (1 << 8) - 1;
39  
40  	private final int maxChainLength;
41  
42  	private final HashedSequenceComparator<S> cmp;
43  
44  	private final HashedSequence<S> a;
45  
46  	private final HashedSequence<S> b;
47  
48  	private final Edit region;
49  
50  	/** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
51  	private final int[] table;
52  
53  	/** Number of low bits to discard from a key to index {@link #table}. */
54  	private final int keyShift;
55  
56  	/**
57  	 * Describes a unique element in sequence A.
58  	 *
59  	 * The records in this table are actually 3-tuples of:
60  	 * <ul>
61  	 * <li>index of next record in this table that has same hash code</li>
62  	 * <li>index of first element in this occurrence chain</li>
63  	 * <li>occurrence count for this element (length of locs list)</li>
64  	 * </ul>
65  	 *
66  	 * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
67  	 * a few bits wide. Elements that occur more frequently will have their
68  	 * count capped.
69  	 */
70  	private long[] recs;
71  
72  	/** Number of elements in {@link #recs}; also is the unique element count. */
73  	private int recCnt;
74  
75  	/**
76  	 * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
77  	 *
78  	 * For the sequence element {@code ptr}, the value stored at location
79  	 * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
80  	 * element in the sequence.
81  	 *
82  	 * Chains always run from the lowest index to the largest index. Therefore
83  	 * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
84  	 * This allows a chain to terminate with {@code 0}, as {@code 0} would never
85  	 * be a valid next element.
86  	 *
87  	 * The array is sized to be {@code region.getLengthA()} and element indexes
88  	 * are converted to array indexes by subtracting {@link #ptrShift}, which is
89  	 * just a cached version of {@code region.beginA}.
90  	 */
91  	private int[] next;
92  
93  	/**
94  	 * For element {@code ptr} in A, index of the record in {@link #recs} array.
95  	 *
96  	 * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
97  	 * describing all occurrences of the element appearing in sequence A at
98  	 * position {@code ptr}. The record is needed to get the occurrence count of
99  	 * the element, or to locate all other occurrences of that element within
100 	 * sequence A. This index provides constant-time access to the record, and
101 	 * avoids needing to scan the hash chain.
102 	 */
103 	private int[] recIdx;
104 
105 	/** Value to subtract from element indexes to key {@link #next} array. */
106 	private int ptrShift;
107 
108 	private Edit lcs;
109 
110 	private int cnt;
111 
112 	private boolean hasCommon;
113 
114 	HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
115 			HashedSequence<S> a, HashedSequence<S> b, Edit r) {
116 		this.maxChainLength = maxChainLength;
117 		this.cmp = cmp;
118 		this.a = a;
119 		this.b = b;
120 		this.region = r;
121 
122 		if (region.endA >= MAX_PTR)
123 			throw new IllegalArgumentException(
124 					JGitText.get().sequenceTooLargeForDiffAlgorithm);
125 
126 		final int sz = r.getLengthA();
127 		final int tableBits = tableBits(sz);
128 		table = new int[1 << tableBits];
129 		keyShift = 32 - tableBits;
130 		ptrShift = r.beginA;
131 
132 		recs = new long[Math.max(4, sz >>> 3)];
133 		next = new int[sz];
134 		recIdx = new int[sz];
135 	}
136 
137 	Edit findLongestCommonSequence() {
138 		if (!scanA())
139 			return null;
140 
141 		lcs = new Edit(0, 0);
142 		cnt = maxChainLength + 1;
143 
144 		for (int bPtr = region.beginB; bPtr < region.endB;)
145 			bPtr = tryLongestCommonSequence(bPtr);
146 
147 		return hasCommon && maxChainLength < cnt ? null : lcs;
148 	}
149 
150 	private boolean scanA() {
151 		// Scan the elements backwards, inserting them into the hash table
152 		// as we go. Going in reverse places the earliest occurrence of any
153 		// element at the start of the chain, so we consider earlier matches
154 		// before later matches.
155 		//
156 		SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
157 			final int tIdx = hash(a, ptr);
158 
159 			int chainLen = 0;
160 			for (int rIdx = table[tIdx]; rIdx != 0;) {
161 				final long rec = recs[rIdx];
162 				if (cmp.equals(a, recPtr(rec), a, ptr)) {
163 					// ptr is identical to another element. Insert it onto
164 					// the front of the existing element chain.
165 					//
166 					int newCnt = recCnt(rec) + 1;
167 					if (MAX_CNT < newCnt)
168 						newCnt = MAX_CNT;
169 					recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
170 					next[ptr - ptrShift] = recPtr(rec);
171 					recIdx[ptr - ptrShift] = rIdx;
172 					continue SCAN;
173 				}
174 
175 				rIdx = recNext(rec);
176 				chainLen++;
177 			}
178 
179 			if (chainLen == maxChainLength)
180 				return false;
181 
182 			// This is the first time we have ever seen this particular
183 			// element in the sequence. Construct a new chain for it.
184 			//
185 			final int rIdx = ++recCnt;
186 			if (rIdx == recs.length) {
187 				int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
188 				long[] n = new long[sz];
189 				System.arraycopy(recs, 0, n, 0, recs.length);
190 				recs = n;
191 			}
192 
193 			recs[rIdx] = recCreate(table[tIdx], ptr, 1);
194 			recIdx[ptr - ptrShift] = rIdx;
195 			table[tIdx] = rIdx;
196 		}
197 		return true;
198 	}
199 
200 	private int tryLongestCommonSequence(int bPtr) {
201 		int bNext = bPtr + 1;
202 		int rIdx = table[hash(b, bPtr)];
203 		for (long rec; rIdx != 0; rIdx = recNext(rec)) {
204 			rec = recs[rIdx];
205 
206 			// If there are more occurrences in A, don't use this chain.
207 			if (recCnt(rec) > cnt) {
208 				if (!hasCommon)
209 					hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
210 				continue;
211 			}
212 
213 			int as = recPtr(rec);
214 			if (!cmp.equals(a, as, b, bPtr))
215 				continue;
216 
217 			hasCommon = true;
218 			TRY_LOCATIONS: for (;;) {
219 				int np = next[as - ptrShift];
220 				int bs = bPtr;
221 				int ae = as + 1;
222 				int be = bs + 1;
223 				int rc = recCnt(rec);
224 
225 				while (region.beginA < as && region.beginB < bs
226 						&& cmp.equals(a, as - 1, b, bs - 1)) {
227 					as--;
228 					bs--;
229 					if (1 < rc)
230 						rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
231 				}
232 				while (ae < region.endA && be < region.endB
233 						&& cmp.equals(a, ae, b, be)) {
234 					if (1 < rc)
235 						rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
236 					ae++;
237 					be++;
238 				}
239 
240 				if (bNext < be)
241 					bNext = be;
242 				if (lcs.getLengthA() < ae - as || rc < cnt) {
243 					// If this region is the longest, or there are less
244 					// occurrences of it in A, its now our LCS.
245 					//
246 					lcs.beginA = as;
247 					lcs.beginB = bs;
248 					lcs.endA = ae;
249 					lcs.endB = be;
250 					cnt = rc;
251 				}
252 
253 				// Because we added elements in reverse order index 0
254 				// cannot possibly be the next position. Its the first
255 				// element of the sequence and thus would have been the
256 				// value of as at the start of the TRY_LOCATIONS loop.
257 				//
258 				if (np == 0)
259 					break TRY_LOCATIONS;
260 
261 				while (np < ae) {
262 					// The next location to consider was actually within
263 					// the LCS we examined above. Don't reconsider it.
264 					//
265 					np = next[np - ptrShift];
266 					if (np == 0)
267 						break TRY_LOCATIONS;
268 				}
269 
270 				as = np;
271 			}
272 		}
273 		return bNext;
274 	}
275 
276 	private int hash(HashedSequence<S> s, int idx) {
277 		return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
278 	}
279 
280 	private static long recCreate(int next, int ptr, int cnt) {
281 		return ((long) next << REC_NEXT_SHIFT) //
282 				| ((long) ptr << REC_PTR_SHIFT) //
283 				| cnt;
284 	}
285 
286 	private static int recNext(long rec) {
287 		return (int) (rec >>> REC_NEXT_SHIFT);
288 	}
289 
290 	private static int recPtr(long rec) {
291 		return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
292 	}
293 
294 	private static int recCnt(long rec) {
295 		return ((int) rec) & REC_CNT_MASK;
296 	}
297 
298 	private static int tableBits(int sz) {
299 		int bits = 31 - Integer.numberOfLeadingZeros(sz);
300 		if (bits == 0)
301 			bits = 1;
302 		if (1 << bits < sz)
303 			bits++;
304 		return bits;
305 	}
306 }