SimilarityIndex.java

/*
 * Copyright (C) 2010, Google Inc. and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.diff;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectStream;

/**
 * Index structure of lines/blocks in one file.
 * <p>
 * This structure can be used to compute an approximation of the similarity
 * between two files. The index is used by
 * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
 * between files.
 * <p>
 * To save space in memory, this index uses a space efficient encoding which
 * will not exceed 1 MiB per instance. The index starts out at a smaller size
 * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
 * file are discovered.
 *
 * @since 4.0
 */
public class SimilarityIndex {
    /** A special {@link TableFullException} used in place of OutOfMemoryError. */
    public static final TableFullException
            TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

    /**
     * Shift to apply before storing a key.
     * <p>
     * Within the 64 bit table record space, we leave the highest bit unset so
     * all values are positive. The lower 32 bits to count bytes.
     */
    private static final int KEY_SHIFT = 32;

    /** Maximum value of the count field, also mask to extract the count. */
    private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

    /**
     * Total amount of bytes hashed into the structure, including \n. This is
     * usually the size of the file minus number of CRLF encounters.
     */
    private long hashedCnt;

    /** Number of non-zero entries in {@link #idHash}. */
    private int idSize;

    /** {@link #idSize} that triggers {@link #idHash} to double in size. */
    private int idGrowAt;

    /**
     * Pairings of content keys and counters.
     * <p>
     * Slots in the table are actually two ints wedged into a single long. The
     * upper 32 bits stores the content key, and the remaining lower bits stores
     * the number of bytes associated with that key. Empty slots are denoted by
     * 0, which cannot occur because the count cannot be 0. Values can only be
     * positive, which we enforce during key addition.
     */
    private long[] idHash;

    /** {@code idHash.length == 1 << idHashBits}. */
    private int idHashBits;

    /**
     * Create a new similarity index for the given object
     *
     * @param obj
     *            the object to hash
     * @return similarity index for this object
     * @throws java.io.IOException
     *             file contents cannot be read from the repository.
     * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
     *             object hashing overflowed the storage capacity of the
     *             SimilarityIndex.
     */
    public static SimilarityIndex create(ObjectLoader obj) throws IOException,
            TableFullException {
        SimilarityIndex idx = new SimilarityIndex();
        idx.hash(obj);
        idx.sort();
        return idx;
    }

    SimilarityIndex() {
        idHashBits = 8;
        idHash = new long[1 << idHashBits];
        idGrowAt = growAt(idHashBits);
    }

    void hash(ObjectLoader obj) throws MissingObjectException, IOException,
            TableFullException {
        if (obj.isLarge()) {
            hashLargeObject(obj);
        } else {
            byte[] raw = obj.getCachedBytes();
            hash(raw, 0, raw.length);
        }
    }

    private void hashLargeObject(ObjectLoader obj) throws IOException,
            TableFullException {
        boolean text;
        try (ObjectStream in1 = obj.openStream()) {
            text = !RawText.isBinary(in1);
        }

        try (ObjectStream in2 = obj.openStream()) {
            hash(in2, in2.getSize(), text);
        }
    }

    void hash(byte[] raw, int ptr, int end) throws TableFullException {
        final boolean text = !RawText.isBinary(raw);
        hashedCnt = 0;
        while (ptr < end) {
            int hash = 5381;
            int blockHashedCnt = 0;
            int start = ptr;

            // Hash one line, or one block, whichever occurs first.
            do {
                int c = raw[ptr++] & 0xff;
                // Ignore CR in CRLF sequence if text
                if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
                    continue;
                blockHashedCnt++;
                if (c == '\n')
                    break;
                hash = (hash << 5) + hash + c;
            } while (ptr < end && ptr - start < 64);
            hashedCnt += blockHashedCnt;
            add(hash, blockHashedCnt);
        }
    }

    void hash(InputStream in, long remaining, boolean text) throws IOException,
            TableFullException {
        byte[] buf = new byte[4096];
        int ptr = 0;
        int cnt = 0;

        while (0 < remaining) {
            int hash = 5381;
            int blockHashedCnt = 0;

            // Hash one line, or one block, whichever occurs first.
            int n = 0;
            do {
                if (ptr == cnt) {
                    ptr = 0;
                    cnt = in.read(buf, 0, buf.length);
                    if (cnt <= 0)
                        throw new EOFException();
                }

                n++;
                int c = buf[ptr++] & 0xff;
                // Ignore CR in CRLF sequence if text
                if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
                    continue;
                blockHashedCnt++;
                if (c == '\n')
                    break;
                hash = (hash << 5) + hash + c;
            } while (n < 64 && n < remaining);
            hashedCnt += blockHashedCnt;
            add(hash, blockHashedCnt);
            remaining -= n;
        }
    }

    /**
     * Sort the internal table so it can be used for efficient scoring.
     * <p>
     * Once sorted, additional lines/blocks cannot be added to the index.
     */
    void sort() {
        // Sort the array. All of the empty space will wind up at the front,
        // because we forced all of the keys to always be positive. Later
        // we only work with the back half of the array.
        //
        Arrays.sort(idHash);
    }

    /**
     * Compute the similarity score between this index and another.
     * <p>
     * A region of a file is defined as a line in a text file or a fixed-size
     * block in a binary file. To prepare an index, each region in the file is
     * hashed; the values and counts of hashes are retained in a sorted table.
     * Define the similarity fraction F as the count of matching regions
     * between the two files divided between the maximum count of regions in
     * either file. The similarity score is F multiplied by the maxScore
     * constant, yielding a range [0, maxScore]. It is defined as maxScore for
     * the degenerate case of two empty files.
     * <p>
     * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
     *
     * @param dst
     *            the other index
     * @param maxScore
     *            the score representing a 100% match
     * @return the similarity score
     */
    public int score(SimilarityIndex dst, int maxScore) {
        long max = Math.max(hashedCnt, dst.hashedCnt);
        if (max == 0)
            return maxScore;
        return (int) ((common(dst) * maxScore) / max);
    }

    long common(SimilarityIndex dst) {
        return common(this, dst);
    }

    private static long common(SimilarityIndex src, SimilarityIndex dst) {
        int srcIdx = src.packedIndex(0);
        int dstIdx = dst.packedIndex(0);
        long[] srcHash = src.idHash;
        long[] dstHash = dst.idHash;
        return common(srcHash, srcIdx, dstHash, dstIdx);
    }

    private static long common(long[] srcHash, int srcIdx, //
            long[] dstHash, int dstIdx) {
        if (srcIdx == srcHash.length || dstIdx == dstHash.length)
            return 0;

        long common = 0;
        int srcKey = keyOf(srcHash[srcIdx]);
        int dstKey = keyOf(dstHash[dstIdx]);

        for (;;) {
            if (srcKey == dstKey) {
                common += Math.min(countOf(srcHash[srcIdx]),
                        countOf(dstHash[dstIdx]));

                if (++srcIdx == srcHash.length)
                    break;
                srcKey = keyOf(srcHash[srcIdx]);

                if (++dstIdx == dstHash.length)
                    break;
                dstKey = keyOf(dstHash[dstIdx]);

            } else if (srcKey < dstKey) {
                // Regions of src which do not appear in dst.
                if (++srcIdx == srcHash.length)
                    break;
                srcKey = keyOf(srcHash[srcIdx]);

            } else /* if (dstKey < srcKey) */{
                // Regions of dst which do not appear in src.
                if (++dstIdx == dstHash.length)
                    break;
                dstKey = keyOf(dstHash[dstIdx]);
            }
        }

        return common;
    }

    // Testing only
    int size() {
        return idSize;
    }

    // Testing only
    int key(int idx) {
        return keyOf(idHash[packedIndex(idx)]);
    }

    // Testing only
    long count(int idx) {
        return countOf(idHash[packedIndex(idx)]);
    }

    // Brute force approach only for testing.
    int findIndex(int key) {
        for (int i = 0; i < idSize; i++)
            if (key(i) == key)
                return i;
        return -1;
    }

    private int packedIndex(int idx) {
        return (idHash.length - idSize) + idx;
    }

    void add(int key, int cnt) throws TableFullException {
        key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

        int j = slot(key);
        for (;;) {
            long v = idHash[j];
            if (v == 0) {
                // Empty slot in the table, store here.
                if (idGrowAt <= idSize) {
                    grow();
                    j = slot(key);
                    continue;
                }
                idHash[j] = pair(key, cnt);
                idSize++;
                return;

            } else if (keyOf(v) == key) {
                // Same key, increment the counter. If it overflows, fail
                // indexing to prevent the key from being impacted.
                //
                idHash[j] = pair(key, countOf(v) + cnt);
                return;

            } else if (++j >= idHash.length) {
                j = 0;
            }
        }
    }

    private static long pair(int key, long cnt) throws TableFullException {
        if (MAX_COUNT < cnt)
            throw new TableFullException();
        return (((long) key) << KEY_SHIFT) | cnt;
    }

    private int slot(int key) {
        // We use 31 - idHashBits because the upper bit was already forced
        // to be 0 and we want the remaining high bits to be used as the
        // table slot.
        //
        return key >>> (31 - idHashBits);
    }

    private static int growAt(int idHashBits) {
        return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
    }

    @SuppressWarnings("UnusedException")
    private void grow() throws TableFullException {
        if (idHashBits == 30)
            throw new TableFullException();

        long[] oldHash = idHash;
        int oldSize = idHash.length;

        idHashBits++;
        idGrowAt = growAt(idHashBits);

        try {
            idHash = new long[1 << idHashBits];
        } catch (OutOfMemoryError noMemory) {
            throw TABLE_FULL_OUT_OF_MEMORY;
        }

        for (int i = 0; i < oldSize; i++) {
            long v = oldHash[i];
            if (v != 0) {
                int j = slot(keyOf(v));
                while (idHash[j] != 0)
                    if (++j >= idHash.length)
                        j = 0;
                idHash[j] = v;
            }
        }
    }

    private static int keyOf(long v) {
        return (int) (v >>> KEY_SHIFT);
    }

    private static long countOf(long v) {
        return v & MAX_COUNT;
    }

    /** Thrown by {@code create()} when file is too large. */
    public static class TableFullException extends Exception {
        private static final long serialVersionUID = 1L;
    }
}