HistogramDiffIndex.java

/*
 * Copyright (C) 2010, Google Inc.
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.diff;

import org.eclipse.jgit.internal.JGitText;

/**
 * Support {@link HistogramDiff} by computing occurrence counts of elements.
 * <p>
 * Each element in the range being considered is put into a hash table, tracking
 * the number of times that distinct element appears in the sequence. Once all
 * elements have been inserted from sequence A, each element of sequence B is
 * probed in the hash table and the longest common subsequence with the lowest
 * occurrence count in A is used as the result.
 *
 * @param <S>
 *            type of the base sequence.
 */
final class HistogramDiffIndex<S extends Sequence> {
	private static final int REC_NEXT_SHIFT = 28 + 8;

	private static final int REC_PTR_SHIFT = 8;

	private static final int REC_PTR_MASK = (1 << 28) - 1;

	private static final int REC_CNT_MASK = (1 << 8) - 1;

	private static final int MAX_PTR = REC_PTR_MASK;

	private static final int MAX_CNT = (1 << 8) - 1;

	private final int maxChainLength;

	private final HashedSequenceComparator<S> cmp;

	private final HashedSequence<S> a;

	private final HashedSequence<S> b;

	private final Edit region;

	/** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
	private final int[] table;

	/** Number of low bits to discard from a key to index {@link #table}. */
	private final int keyShift;

	/**
	 * Describes a unique element in sequence A.
	 *
	 * The records in this table are actually 3-tuples of:
	 * <ul>
	 * <li>index of next record in this table that has same hash code</li>
	 * <li>index of first element in this occurrence chain</li>
	 * <li>occurrence count for this element (length of locs list)</li>
	 * </ul>
	 *
	 * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
	 * a few bits wide. Elements that occur more frequently will have their
	 * count capped.
	 */
	private long[] recs;

	/** Number of elements in {@link #recs}; also is the unique element count. */
	private int recCnt;

	/**
	 * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
	 *
	 * For the sequence element {@code ptr}, the value stored at location
	 * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
	 * element in the sequence.
	 *
	 * Chains always run from the lowest index to the largest index. Therefore
	 * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
	 * This allows a chain to terminate with {@code 0}, as {@code 0} would never
	 * be a valid next element.
	 *
	 * The array is sized to be {@code region.getLengthA()} and element indexes
	 * are converted to array indexes by subtracting {@link #ptrShift}, which is
	 * just a cached version of {@code region.beginA}.
	 */
	private int[] next;

	/**
	 * For element {@code ptr} in A, index of the record in {@link #recs} array.
	 *
	 * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
	 * describing all occurrences of the element appearing in sequence A at
	 * position {@code ptr}. The record is needed to get the occurrence count of
	 * the element, or to locate all other occurrences of that element within
	 * sequence A. This index provides constant-time access to the record, and
	 * avoids needing to scan the hash chain.
	 */
	private int[] recIdx;

	/** Value to subtract from element indexes to key {@link #next} array. */
	private int ptrShift;

	private Edit lcs;

	private int cnt;

	private boolean hasCommon;

	HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
			HashedSequence<S> a, HashedSequence<S> b, Edit r) {
		this.maxChainLength = maxChainLength;
		this.cmp = cmp;
		this.a = a;
		this.b = b;
		this.region = r;

		if (region.endA >= MAX_PTR)
			throw new IllegalArgumentException(
					JGitText.get().sequenceTooLargeForDiffAlgorithm);

		final int sz = r.getLengthA();
		final int tableBits = tableBits(sz);
		table = new int[1 << tableBits];
		keyShift = 32 - tableBits;
		ptrShift = r.beginA;

		recs = new long[Math.max(4, sz >>> 3)];
		next = new int[sz];
		recIdx = new int[sz];
	}

	Edit findLongestCommonSequence() {
		if (!scanA())
			return null;

		lcs = new Edit(0, 0);
		cnt = maxChainLength + 1;

		for (int bPtr = region.beginB; bPtr < region.endB;)
			bPtr = tryLongestCommonSequence(bPtr);

		return hasCommon && maxChainLength < cnt ? null : lcs;
	}

	private boolean scanA() {
		// Scan the elements backwards, inserting them into the hash table
		// as we go. Going in reverse places the earliest occurrence of any
		// element at the start of the chain, so we consider earlier matches
		// before later matches.
		//
		SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
			final int tIdx = hash(a, ptr);

			int chainLen = 0;
			for (int rIdx = table[tIdx]; rIdx != 0;) {
				final long rec = recs[rIdx];
				if (cmp.equals(a, recPtr(rec), a, ptr)) {
					// ptr is identical to another element. Insert it onto
					// the front of the existing element chain.
					//
					int newCnt = recCnt(rec) + 1;
					if (MAX_CNT < newCnt)
						newCnt = MAX_CNT;
					recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
					next[ptr - ptrShift] = recPtr(rec);
					recIdx[ptr - ptrShift] = rIdx;
					continue SCAN;
				}

				rIdx = recNext(rec);
				chainLen++;
			}

			if (chainLen == maxChainLength)
				return false;

			// This is the first time we have ever seen this particular
			// element in the sequence. Construct a new chain for it.
			//
			final int rIdx = ++recCnt;
			if (rIdx == recs.length) {
				int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
				long[] n = new long[sz];
				System.arraycopy(recs, 0, n, 0, recs.length);
				recs = n;
			}

			recs[rIdx] = recCreate(table[tIdx], ptr, 1);
			recIdx[ptr - ptrShift] = rIdx;
			table[tIdx] = rIdx;
		}
		return true;
	}

	private int tryLongestCommonSequence(int bPtr) {
		int bNext = bPtr + 1;
		int rIdx = table[hash(b, bPtr)];
		for (long rec; rIdx != 0; rIdx = recNext(rec)) {
			rec = recs[rIdx];

			// If there are more occurrences in A, don't use this chain.
			if (recCnt(rec) > cnt) {
				if (!hasCommon)
					hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
				continue;
			}

			int as = recPtr(rec);
			if (!cmp.equals(a, as, b, bPtr))
				continue;

			hasCommon = true;
			TRY_LOCATIONS: for (;;) {
				int np = next[as - ptrShift];
				int bs = bPtr;
				int ae = as + 1;
				int be = bs + 1;
				int rc = recCnt(rec);

				while (region.beginA < as && region.beginB < bs
						&& cmp.equals(a, as - 1, b, bs - 1)) {
					as--;
					bs--;
					if (1 < rc)
						rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
				}
				while (ae < region.endA && be < region.endB
						&& cmp.equals(a, ae, b, be)) {
					if (1 < rc)
						rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
					ae++;
					be++;
				}

				if (bNext < be)
					bNext = be;
				if (lcs.getLengthA() < ae - as || rc < cnt) {
					// If this region is the longest, or there are less
					// occurrences of it in A, its now our LCS.
					//
					lcs.beginA = as;
					lcs.beginB = bs;
					lcs.endA = ae;
					lcs.endB = be;
					cnt = rc;
				}

				// Because we added elements in reverse order index 0
				// cannot possibly be the next position. Its the first
				// element of the sequence and thus would have been the
				// value of as at the start of the TRY_LOCATIONS loop.
				//
				if (np == 0)
					break TRY_LOCATIONS;

				while (np < ae) {
					// The next location to consider was actually within
					// the LCS we examined above. Don't reconsider it.
					//
					np = next[np - ptrShift];
					if (np == 0)
						break TRY_LOCATIONS;
				}

				as = np;
			}
		}
		return bNext;
	}

	private int hash(HashedSequence<S> s, int idx) {
		return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
	}

	private static long recCreate(int next, int ptr, int cnt) {
		return ((long) next << REC_NEXT_SHIFT) //
				| ((long) ptr << REC_PTR_SHIFT) //
				| cnt;
	}

	private static int recNext(long rec) {
		return (int) (rec >>> REC_NEXT_SHIFT);
	}

	private static int recPtr(long rec) {
		return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
	}

	private static int recCnt(long rec) {
		return ((int) rec) & REC_CNT_MASK;
	}

	private static int tableBits(int sz) {
		int bits = 31 - Integer.numberOfLeadingZeros(sz);
		if (bits == 0)
			bits = 1;
		if (1 << bits < sz)
			bits++;
		return bits;
	}
}