SimilarityIndex.java

  1. /*
  2.  * Copyright (C) 2010, Google Inc. and others
  3.  *
  4.  * This program and the accompanying materials are made available under the
  5.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  6.  * https://www.eclipse.org/org/documents/edl-v10.php.
  7.  *
  8.  * SPDX-License-Identifier: BSD-3-Clause
  9.  */

  10. package org.eclipse.jgit.diff;

  11. import java.io.EOFException;
  12. import java.io.IOException;
  13. import java.io.InputStream;
  14. import java.util.Arrays;

  15. import org.eclipse.jgit.errors.MissingObjectException;
  16. import org.eclipse.jgit.lib.ObjectLoader;
  17. import org.eclipse.jgit.lib.ObjectStream;

  18. /**
  19.  * Index structure of lines/blocks in one file.
  20.  * <p>
  21.  * This structure can be used to compute an approximation of the similarity
  22.  * between two files. The index is used by
  23.  * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
  24.  * between files.
  25.  * <p>
  26.  * To save space in memory, this index uses a space efficient encoding which
  27.  * will not exceed 1 MiB per instance. The index starts out at a smaller size
  28.  * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
  29.  * file are discovered.
  30.  *
  31.  * @since 4.0
  32.  */
  33. public class SimilarityIndex {
  34.     /** A special {@link TableFullException} used in place of OutOfMemoryError. */
  35.     public static final TableFullException
  36.             TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

  37.     /**
  38.      * Shift to apply before storing a key.
  39.      * <p>
  40.      * Within the 64 bit table record space, we leave the highest bit unset so
  41.      * all values are positive. The lower 32 bits to count bytes.
  42.      */
  43.     private static final int KEY_SHIFT = 32;

  44.     /** Maximum value of the count field, also mask to extract the count. */
  45.     private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

  46.     /**
  47.      * Total amount of bytes hashed into the structure, including \n. This is
  48.      * usually the size of the file minus number of CRLF encounters.
  49.      */
  50.     private long hashedCnt;

  51.     /** Number of non-zero entries in {@link #idHash}. */
  52.     private int idSize;

  53.     /** {@link #idSize} that triggers {@link #idHash} to double in size. */
  54.     private int idGrowAt;

  55.     /**
  56.      * Pairings of content keys and counters.
  57.      * <p>
  58.      * Slots in the table are actually two ints wedged into a single long. The
  59.      * upper 32 bits stores the content key, and the remaining lower bits stores
  60.      * the number of bytes associated with that key. Empty slots are denoted by
  61.      * 0, which cannot occur because the count cannot be 0. Values can only be
  62.      * positive, which we enforce during key addition.
  63.      */
  64.     private long[] idHash;

  65.     /** {@code idHash.length == 1 << idHashBits}. */
  66.     private int idHashBits;

  67.     /**
  68.      * Create a new similarity index for the given object
  69.      *
  70.      * @param obj
  71.      *            the object to hash
  72.      * @return similarity index for this object
  73.      * @throws java.io.IOException
  74.      *             file contents cannot be read from the repository.
  75.      * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
  76.      *             object hashing overflowed the storage capacity of the
  77.      *             SimilarityIndex.
  78.      */
  79.     public static SimilarityIndex create(ObjectLoader obj) throws IOException,
  80.             TableFullException {
  81.         SimilarityIndex idx = new SimilarityIndex();
  82.         idx.hash(obj);
  83.         idx.sort();
  84.         return idx;
  85.     }

  86.     SimilarityIndex() {
  87.         idHashBits = 8;
  88.         idHash = new long[1 << idHashBits];
  89.         idGrowAt = growAt(idHashBits);
  90.     }

  91.     void hash(ObjectLoader obj) throws MissingObjectException, IOException,
  92.             TableFullException {
  93.         if (obj.isLarge()) {
  94.             hashLargeObject(obj);
  95.         } else {
  96.             byte[] raw = obj.getCachedBytes();
  97.             hash(raw, 0, raw.length);
  98.         }
  99.     }

  100.     private void hashLargeObject(ObjectLoader obj) throws IOException,
  101.             TableFullException {
  102.         boolean text;
  103.         try (ObjectStream in1 = obj.openStream()) {
  104.             text = !RawText.isBinary(in1);
  105.         }

  106.         try (ObjectStream in2 = obj.openStream()) {
  107.             hash(in2, in2.getSize(), text);
  108.         }
  109.     }

  110.     void hash(byte[] raw, int ptr, int end) throws TableFullException {
  111.         final boolean text = !RawText.isBinary(raw);
  112.         hashedCnt = 0;
  113.         while (ptr < end) {
  114.             int hash = 5381;
  115.             int blockHashedCnt = 0;
  116.             int start = ptr;

  117.             // Hash one line, or one block, whichever occurs first.
  118.             do {
  119.                 int c = raw[ptr++] & 0xff;
  120.                 // Ignore CR in CRLF sequence if text
  121.                 if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
  122.                     continue;
  123.                 blockHashedCnt++;
  124.                 if (c == '\n')
  125.                     break;
  126.                 hash = (hash << 5) + hash + c;
  127.             } while (ptr < end && ptr - start < 64);
  128.             hashedCnt += blockHashedCnt;
  129.             add(hash, blockHashedCnt);
  130.         }
  131.     }

  132.     void hash(InputStream in, long remaining, boolean text) throws IOException,
  133.             TableFullException {
  134.         byte[] buf = new byte[4096];
  135.         int ptr = 0;
  136.         int cnt = 0;

  137.         while (0 < remaining) {
  138.             int hash = 5381;
  139.             int blockHashedCnt = 0;

  140.             // Hash one line, or one block, whichever occurs first.
  141.             int n = 0;
  142.             do {
  143.                 if (ptr == cnt) {
  144.                     ptr = 0;
  145.                     cnt = in.read(buf, 0, buf.length);
  146.                     if (cnt <= 0)
  147.                         throw new EOFException();
  148.                 }

  149.                 n++;
  150.                 int c = buf[ptr++] & 0xff;
  151.                 // Ignore CR in CRLF sequence if text
  152.                 if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
  153.                     continue;
  154.                 blockHashedCnt++;
  155.                 if (c == '\n')
  156.                     break;
  157.                 hash = (hash << 5) + hash + c;
  158.             } while (n < 64 && n < remaining);
  159.             hashedCnt += blockHashedCnt;
  160.             add(hash, blockHashedCnt);
  161.             remaining -= n;
  162.         }
  163.     }

  164.     /**
  165.      * Sort the internal table so it can be used for efficient scoring.
  166.      * <p>
  167.      * Once sorted, additional lines/blocks cannot be added to the index.
  168.      */
  169.     void sort() {
  170.         // Sort the array. All of the empty space will wind up at the front,
  171.         // because we forced all of the keys to always be positive. Later
  172.         // we only work with the back half of the array.
  173.         //
  174.         Arrays.sort(idHash);
  175.     }

  176.     /**
  177.      * Compute the similarity score between this index and another.
  178.      * <p>
  179.      * A region of a file is defined as a line in a text file or a fixed-size
  180.      * block in a binary file. To prepare an index, each region in the file is
  181.      * hashed; the values and counts of hashes are retained in a sorted table.
  182.      * Define the similarity fraction F as the count of matching regions
  183.      * between the two files divided between the maximum count of regions in
  184.      * either file. The similarity score is F multiplied by the maxScore
  185.      * constant, yielding a range [0, maxScore]. It is defined as maxScore for
  186.      * the degenerate case of two empty files.
  187.      * <p>
  188.      * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
  189.      *
  190.      * @param dst
  191.      *            the other index
  192.      * @param maxScore
  193.      *            the score representing a 100% match
  194.      * @return the similarity score
  195.      */
  196.     public int score(SimilarityIndex dst, int maxScore) {
  197.         long max = Math.max(hashedCnt, dst.hashedCnt);
  198.         if (max == 0)
  199.             return maxScore;
  200.         return (int) ((common(dst) * maxScore) / max);
  201.     }

  202.     long common(SimilarityIndex dst) {
  203.         return common(this, dst);
  204.     }

  205.     private static long common(SimilarityIndex src, SimilarityIndex dst) {
  206.         int srcIdx = src.packedIndex(0);
  207.         int dstIdx = dst.packedIndex(0);
  208.         long[] srcHash = src.idHash;
  209.         long[] dstHash = dst.idHash;
  210.         return common(srcHash, srcIdx, dstHash, dstIdx);
  211.     }

  212.     private static long common(long[] srcHash, int srcIdx, //
  213.             long[] dstHash, int dstIdx) {
  214.         if (srcIdx == srcHash.length || dstIdx == dstHash.length)
  215.             return 0;

  216.         long common = 0;
  217.         int srcKey = keyOf(srcHash[srcIdx]);
  218.         int dstKey = keyOf(dstHash[dstIdx]);

  219.         for (;;) {
  220.             if (srcKey == dstKey) {
  221.                 common += Math.min(countOf(srcHash[srcIdx]),
  222.                         countOf(dstHash[dstIdx]));

  223.                 if (++srcIdx == srcHash.length)
  224.                     break;
  225.                 srcKey = keyOf(srcHash[srcIdx]);

  226.                 if (++dstIdx == dstHash.length)
  227.                     break;
  228.                 dstKey = keyOf(dstHash[dstIdx]);

  229.             } else if (srcKey < dstKey) {
  230.                 // Regions of src which do not appear in dst.
  231.                 if (++srcIdx == srcHash.length)
  232.                     break;
  233.                 srcKey = keyOf(srcHash[srcIdx]);

  234.             } else /* if (dstKey < srcKey) */{
  235.                 // Regions of dst which do not appear in src.
  236.                 if (++dstIdx == dstHash.length)
  237.                     break;
  238.                 dstKey = keyOf(dstHash[dstIdx]);
  239.             }
  240.         }

  241.         return common;
  242.     }

  243.     // Testing only
  244.     int size() {
  245.         return idSize;
  246.     }

  247.     // Testing only
  248.     int key(int idx) {
  249.         return keyOf(idHash[packedIndex(idx)]);
  250.     }

  251.     // Testing only
  252.     long count(int idx) {
  253.         return countOf(idHash[packedIndex(idx)]);
  254.     }

  255.     // Brute force approach only for testing.
  256.     int findIndex(int key) {
  257.         for (int i = 0; i < idSize; i++)
  258.             if (key(i) == key)
  259.                 return i;
  260.         return -1;
  261.     }

  262.     private int packedIndex(int idx) {
  263.         return (idHash.length - idSize) + idx;
  264.     }

  265.     void add(int key, int cnt) throws TableFullException {
  266.         key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

  267.         int j = slot(key);
  268.         for (;;) {
  269.             long v = idHash[j];
  270.             if (v == 0) {
  271.                 // Empty slot in the table, store here.
  272.                 if (idGrowAt <= idSize) {
  273.                     grow();
  274.                     j = slot(key);
  275.                     continue;
  276.                 }
  277.                 idHash[j] = pair(key, cnt);
  278.                 idSize++;
  279.                 return;

  280.             } else if (keyOf(v) == key) {
  281.                 // Same key, increment the counter. If it overflows, fail
  282.                 // indexing to prevent the key from being impacted.
  283.                 //
  284.                 idHash[j] = pair(key, countOf(v) + cnt);
  285.                 return;

  286.             } else if (++j >= idHash.length) {
  287.                 j = 0;
  288.             }
  289.         }
  290.     }

  291.     private static long pair(int key, long cnt) throws TableFullException {
  292.         if (MAX_COUNT < cnt)
  293.             throw new TableFullException();
  294.         return (((long) key) << KEY_SHIFT) | cnt;
  295.     }

  296.     private int slot(int key) {
  297.         // We use 31 - idHashBits because the upper bit was already forced
  298.         // to be 0 and we want the remaining high bits to be used as the
  299.         // table slot.
  300.         //
  301.         return key >>> (31 - idHashBits);
  302.     }

  303.     private static int growAt(int idHashBits) {
  304.         return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
  305.     }

  306.     @SuppressWarnings("UnusedException")
  307.     private void grow() throws TableFullException {
  308.         if (idHashBits == 30)
  309.             throw new TableFullException();

  310.         long[] oldHash = idHash;
  311.         int oldSize = idHash.length;

  312.         idHashBits++;
  313.         idGrowAt = growAt(idHashBits);

  314.         try {
  315.             idHash = new long[1 << idHashBits];
  316.         } catch (OutOfMemoryError noMemory) {
  317.             throw TABLE_FULL_OUT_OF_MEMORY;
  318.         }

  319.         for (int i = 0; i < oldSize; i++) {
  320.             long v = oldHash[i];
  321.             if (v != 0) {
  322.                 int j = slot(keyOf(v));
  323.                 while (idHash[j] != 0)
  324.                     if (++j >= idHash.length)
  325.                         j = 0;
  326.                 idHash[j] = v;
  327.             }
  328.         }
  329.     }

  330.     private static int keyOf(long v) {
  331.         return (int) (v >>> KEY_SHIFT);
  332.     }

  333.     private static long countOf(long v) {
  334.         return v & MAX_COUNT;
  335.     }

  336.     /** Thrown by {@code create()} when file is too large. */
  337.     public static class TableFullException extends Exception {
  338.         private static final long serialVersionUID = 1L;
  339.     }
  340. }