// HistogramDiffIndex.java
/*
* Copyright (C) 2010, Google Inc.
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.diff;
import org.eclipse.jgit.internal.JGitText;
/**
* Support {@link HistogramDiff} by computing occurrence counts of elements.
* <p>
* Each element in the range being considered is put into a hash table, tracking
* the number of times that distinct element appears in the sequence. Once all
* elements have been inserted from sequence A, each element of sequence B is
* probed in the hash table and the longest common subsequence with the lowest
* occurrence count in A is used as the result.
*
* @param <S>
* type of the base sequence.
*/
final class HistogramDiffIndex<S extends Sequence> {
    /** Bit position of the "next record" field inside a packed record. */
    private static final int REC_NEXT_SHIFT = 28 + 8;

    /** Bit position of the "first element" pointer inside a packed record. */
    private static final int REC_PTR_SHIFT = 8;

    /** Mask selecting the 28 bit element pointer after it is shifted down. */
    private static final int REC_PTR_MASK = (1 << 28) - 1;

    /** Mask selecting the 8 bit occurrence count of a packed record. */
    private static final int REC_CNT_MASK = (1 << 8) - 1;

    /** Largest element index that still fits into the pointer field. */
    private static final int MAX_PTR = REC_PTR_MASK;

    /** Largest occurrence count that still fits into the count field. */
    private static final int MAX_CNT = REC_CNT_MASK;

    /**
     * Limit on hash chain length while scanning A, and on the occurrence
     * count an element of A may have to be considered for the LCS.
     */
    private final int maxChainLength;

    private final HashedSequenceComparator<S> cmp;

    private final HashedSequence<S> a;

    private final HashedSequence<S> b;

    private final Edit region;

    /**
     * Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index.
     * A value of {@code 0} marks an empty bucket; record index 0 is never
     * allocated.
     */
    private final int[] table;

    /** Number of low bits to discard from a key to index {@link #table}. */
    private final int keyShift;

    /**
     * Describes a unique element in sequence A.
     *
     * The records in this table are actually 3-tuples of:
     * <ul>
     * <li>index of next record in this table that has same hash code</li>
     * <li>index of first element in this occurrence chain</li>
     * <li>occurrence count for this element (length of locs list)</li>
     * </ul>
     *
     * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
     * a few bits wide. Elements that occur more frequently will have their
     * count capped.
     */
    private long[] recs;

    /** Number of elements in {@link #recs}; also is the unique element count. */
    private int recCnt;

    /**
     * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
     *
     * For the sequence element {@code ptr}, the value stored at location
     * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
     * element in the sequence.
     *
     * Chains always run from the lowest index to the largest index. Therefore
     * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
     * This allows a chain to terminate with {@code 0}, as {@code 0} would never
     * be a valid next element.
     *
     * The array is sized to be {@code region.getLengthA()} and element indexes
     * are converted to array indexes by subtracting {@link #ptrShift}, which is
     * just a cached version of {@code region.beginA}.
     */
    private final int[] next;

    /**
     * For element {@code ptr} in A, index of the record in {@link #recs} array.
     *
     * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
     * describing all occurrences of the element appearing in sequence A at
     * position {@code ptr}. The record is needed to get the occurrence count of
     * the element, or to locate all other occurrences of that element within
     * sequence A. This index provides constant-time access to the record, and
     * avoids needing to scan the hash chain.
     */
    private final int[] recIdx;

    /** Value to subtract from element indexes to key {@link #next} array. */
    private final int ptrShift;

    /** Best common region found so far; created by the search entry point. */
    private Edit lcs;

    /** Lowest occurrence count (in A) of the region stored in {@link #lcs}. */
    private int cnt;

    /** True once any element of B was found to equal an element of A. */
    private boolean hasCommon;

    /**
     * Allocate the index structures for one region of the two sequences.
     *
     * @param maxChainLength
     *            limit applied to hash chain length and to occurrence counts.
     * @param cmp
     *            comparator used to hash and compare elements.
     * @param a
     *            first sequence; its elements within {@code r} are indexed.
     * @param b
     *            second sequence; its elements within {@code r} are probed.
     * @param r
     *            region of both sequences to examine.
     * @throws IllegalArgumentException
     *             if {@code r.endA} does not fit into the pointer field of a
     *             packed record.
     */
    HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
            HashedSequence<S> a, HashedSequence<S> b, Edit r) {
        this.maxChainLength = maxChainLength;
        this.cmp = cmp;
        this.a = a;
        this.b = b;
        this.region = r;

        if (region.endA >= MAX_PTR)
            throw new IllegalArgumentException(
                    JGitText.get().sequenceTooLargeForDiffAlgorithm);

        final int sz = r.getLengthA();
        final int tableBits = tableBits(sz);
        table = new int[1 << tableBits];
        keyShift = 32 - tableBits;
        ptrShift = r.beginA;

        recs = new long[Math.max(4, sz >>> 3)];
        next = new int[sz];
        recIdx = new int[sz];
    }

    /**
     * Index the A side of the region, then probe every element of the B side
     * to find the best common region.
     *
     * @return {@code null} if indexing A was abandoned because a hash chain
     *         reached {@code maxChainLength}, or if common elements exist but
     *         none of them was acceptable under {@code maxChainLength};
     *         otherwise the selected common region, which is an empty edit
     *         when the two sides share no element.
     */
    Edit findLongestCommonSequence() {
        if (!scanA())
            return null;

        lcs = new Edit(0, 0);

        // Start one above the highest admissible count. Stored counts never
        // exceed MAX_CNT, so clamping does not change which chains qualify,
        // but it keeps the sentinel from wrapping to a negative value when
        // maxChainLength is Integer.MAX_VALUE.
        cnt = Math.min(maxChainLength, MAX_CNT) + 1;

        for (int bPtr = region.beginB; bPtr < region.endB;)
            bPtr = tryLongestCommonSequence(bPtr);

        return hasCommon && maxChainLength < cnt ? null : lcs;
    }

    /**
     * Insert every element of A within the region into the hash table.
     *
     * @return {@code false} if a new element would have to be appended to a
     *         hash chain that already holds {@code maxChainLength} distinct
     *         elements; {@code true} once all elements are indexed.
     */
    private boolean scanA() {
        // Scan the elements backwards, inserting them into the hash table
        // as we go. Going in reverse places the earliest occurrence of any
        // element at the start of the chain, so we consider earlier matches
        // before later matches.
        //
        SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
            final int tIdx = hash(a, ptr);

            int chainLen = 0;
            for (int rIdx = table[tIdx]; rIdx != 0;) {
                final long rec = recs[rIdx];
                if (cmp.equals(a, recPtr(rec), a, ptr)) {
                    // ptr is identical to another element. Insert it onto
                    // the front of the existing element chain.
                    //
                    int newCnt = recCnt(rec) + 1;
                    if (MAX_CNT < newCnt)
                        newCnt = MAX_CNT;
                    recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
                    next[ptr - ptrShift] = recPtr(rec);
                    recIdx[ptr - ptrShift] = rIdx;
                    continue SCAN;
                }

                rIdx = recNext(rec);
                chainLen++;
            }

            if (chainLen == maxChainLength)
                return false;

            // This is the first time we have ever seen this particular
            // element in the sequence. Construct a new chain for it.
            // Record indexes start at 1 so that 0 can mean "no record".
            //
            final int rIdx = ++recCnt;
            if (rIdx == recs.length) {
                int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
                long[] n = new long[sz];
                System.arraycopy(recs, 0, n, 0, recs.length);
                recs = n;
            }

            recs[rIdx] = recCreate(table[tIdx], ptr, 1);
            recIdx[ptr - ptrShift] = rIdx;
            table[tIdx] = rIdx;
        }
        return true;
    }

    /**
     * Probe the index with one element of B and update {@link #lcs} if a
     * better common region starts from any matching position in A.
     *
     * @param bPtr
     *            index of the element in B to probe.
     * @return next index in B to probe; at least {@code bPtr + 1}, and past
     *         the end of any common region that was extended from here.
     */
    private int tryLongestCommonSequence(int bPtr) {
        int bNext = bPtr + 1;
        int rIdx = table[hash(b, bPtr)];
        for (long rec; rIdx != 0; rIdx = recNext(rec)) {
            rec = recs[rIdx];

            // If there are more occurrences in A, don't use this chain.
            if (recCnt(rec) > cnt) {
                if (!hasCommon)
                    hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
                continue;
            }

            int as = recPtr(rec);
            if (!cmp.equals(a, as, b, bPtr))
                continue;

            hasCommon = true;
            TRY_LOCATIONS: for (;;) {
                int np = next[as - ptrShift];
                int bs = bPtr;
                int ae = as + 1;
                int be = bs + 1;
                int rc = recCnt(rec);

                // Grow the match backwards, tracking the rarest element.
                while (region.beginA < as && region.beginB < bs
                        && cmp.equals(a, as - 1, b, bs - 1)) {
                    as--;
                    bs--;
                    if (1 < rc)
                        rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
                }
                // Grow the match forwards, tracking the rarest element.
                while (ae < region.endA && be < region.endB
                        && cmp.equals(a, ae, b, be)) {
                    if (1 < rc)
                        rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
                    ae++;
                    be++;
                }

                if (bNext < be)
                    bNext = be;
                if (lcs.getLengthA() < ae - as || rc < cnt) {
                    // If this region is the longest, or there are fewer
                    // occurrences of it in A, it's now our LCS.
                    //
                    lcs.beginA = as;
                    lcs.beginB = bs;
                    lcs.endA = ae;
                    lcs.endB = be;
                    cnt = rc;
                }

                // Because we added elements in reverse order index 0
                // cannot possibly be the next position. It's the first
                // element of the sequence and thus would have been the
                // value of as at the start of the TRY_LOCATIONS loop.
                //
                if (np == 0)
                    break TRY_LOCATIONS;

                while (np < ae) {
                    // The next location to consider was actually within
                    // the LCS we examined above. Don't reconsider it.
                    //
                    np = next[np - ptrShift];
                    if (np == 0)
                        break TRY_LOCATIONS;
                }

                as = np;
            }
        }
        return bNext;
    }

    /**
     * Compute the {@link #table} slot for an element.
     *
     * @param s
     *            sequence holding the element.
     * @param idx
     *            index of the element within {@code s}.
     * @return the comparator's hash, multiplied to mix its bits and reduced
     *         to its high bits by an unsigned shift of {@link #keyShift}.
     */
    private int hash(HashedSequence<S> s, int idx) {
        return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
    }

    /**
     * Pack the three fields of a record into a single {@code long}.
     *
     * @param next
     *            index of the next record in the same hash chain, or 0.
     * @param ptr
     *            index in A of the first occurrence of the element.
     * @param cnt
     *            occurrence count of the element.
     * @return the packed record.
     */
    private static long recCreate(int next, int ptr, int cnt) {
        return ((long) next << REC_NEXT_SHIFT)
                | ((long) ptr << REC_PTR_SHIFT)
                | cnt;
    }

    /**
     * @param rec
     *            a packed record.
     * @return index of the next record in the same hash chain, or 0.
     */
    private static int recNext(long rec) {
        return (int) (rec >>> REC_NEXT_SHIFT);
    }

    /**
     * @param rec
     *            a packed record.
     * @return index in A of the first occurrence of the element.
     */
    private static int recPtr(long rec) {
        return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
    }

    /**
     * @param rec
     *            a packed record.
     * @return occurrence count stored in the record.
     */
    private static int recCnt(long rec) {
        return ((int) rec) & REC_CNT_MASK;
    }

    /**
     * Choose how many bits of the hash are used to index {@link #table}.
     *
     * @param sz
     *            number of elements that will be indexed.
     * @return the smallest bit count, never below 1, whose table size
     *         {@code 1 << bits} is at least {@code sz}.
     */
    private static int tableBits(int sz) {
        int bits = 31 - Integer.numberOfLeadingZeros(sz);
        // numberOfLeadingZeros(0) is 32, which makes bits -1 for an empty
        // region. Returning 0 bits would yield a shift distance of 32, and
        // an int shift by 32 is a no-op, so hash() would no longer be
        // confined to the table. Always use at least one bit.
        if (bits <= 0)
            bits = 1;
        if (1 << bits < sz)
            bits++;
        return bits;
    }
}