SimilarityRenameDetector.java

  1. /*
  2.  * Copyright (C) 2010, Google Inc. and others
  3.  *
  4.  * This program and the accompanying materials are made available under the
  5.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  6.  * https://www.eclipse.org/org/documents/edl-v10.php.
  7.  *
  8.  * SPDX-License-Identifier: BSD-3-Clause
  9.  */

  10. package org.eclipse.jgit.diff;

  11. import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
  12. import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;

  13. import java.io.IOException;
  14. import java.util.ArrayList;
  15. import java.util.Arrays;
  16. import java.util.BitSet;
  17. import java.util.List;

  18. import org.eclipse.jgit.diff.DiffEntry.ChangeType;
  19. import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
  20. import org.eclipse.jgit.errors.CancelledException;
  21. import org.eclipse.jgit.internal.JGitText;
  22. import org.eclipse.jgit.lib.FileMode;
  23. import org.eclipse.jgit.lib.NullProgressMonitor;
  24. import org.eclipse.jgit.lib.ProgressMonitor;

  25. class SimilarityRenameDetector {
  26.     /**
  27.      * Number of bits we need to express an index into src or dst list.
  28.      * <p>
  29.      * This must be 28, giving us a limit of 2^28 entries in either list, which
  30.      * is an insane limit of 536,870,912 file names being considered in a single
  31.      * rename pass. The other 8 bits are used to store the score, while staying
  32.      * under 127 so the long doesn't go negative.
  33.      */
  34.     private static final int BITS_PER_INDEX = 28;

  35.     private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;

  36.     private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;

  37.     private ContentSource.Pair reader;

  38.     /**
  39.      * All sources to consider for copies or renames.
  40.      * <p>
  41.      * A source is typically a {@link ChangeType#DELETE} change, but could be
  42.      * another type when trying to perform copy detection concurrently with
  43.      * rename detection.
  44.      */
  45.     private List<DiffEntry> srcs;

  46.     /**
  47.      * All destinations to consider looking for a rename.
  48.      * <p>
  49.      * A destination is typically an {@link ChangeType#ADD}, as the name has
  50.      * just come into existence, and we want to discover where its initial
  51.      * content came from.
  52.      */
  53.     private List<DiffEntry> dsts;

  54.     /**
  55.      * Matrix of all examined file pairs, and their scores.
  56.      * <p>
  57.      * The upper 8 bits of each long stores the score, but the score is bounded
  58.      * to be in the range (0, 128] so that the highest bit is never set, and all
  59.      * entries are therefore positive.
  60.      * <p>
  61.      * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
  62.      * as the lower two groups of 28 bits, respectively, but the encoding is
  63.      * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
  64.      * lower list indices later in the matrix, giving precedence to files whose
  65.      * names sort earlier in the tree.
  66.      */
  67.     private long[] matrix;

  68.     /** Score a pair must exceed to be considered a rename. */
  69.     private int renameScore = 60;

  70.     /** Set if any {@link SimilarityIndex.TableFullException} occurs. */
  71.     private boolean tableOverflow;

  72.     private List<DiffEntry> out;

  73.     SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
  74.             List<DiffEntry> dsts) {
  75.         this.reader = reader;
  76.         this.srcs = srcs;
  77.         this.dsts = dsts;
  78.     }

  79.     void setRenameScore(int score) {
  80.         renameScore = score;
  81.     }

  82.     void compute(ProgressMonitor pm) throws IOException, CancelledException {
  83.         if (pm == null)
  84.             pm = NullProgressMonitor.INSTANCE;

  85.         pm.beginTask(JGitText.get().renamesFindingByContent, //
  86.                 2 * srcs.size() * dsts.size());

  87.         int mNext = buildMatrix(pm);
  88.         out = new ArrayList<>(Math.min(mNext, dsts.size()));

  89.         // Match rename pairs on a first come, first serve basis until
  90.         // we have looked at everything that is above our minimum score.
  91.         //
  92.         for (--mNext; mNext >= 0; mNext--) {
  93.             if (pm.isCancelled()) {
  94.                 // TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
  95.                 // in next major version
  96.                 throw new CancelledException(JGitText.get().renameCancelled);
  97.             }
  98.             long ent = matrix[mNext];
  99.             int sIdx = srcFile(ent);
  100.             int dIdx = dstFile(ent);
  101.             DiffEntry s = srcs.get(sIdx);
  102.             DiffEntry d = dsts.get(dIdx);

  103.             if (d == null) {
  104.                 pm.update(1);
  105.                 continue; // was already matched earlier
  106.             }

  107.             ChangeType type;
  108.             if (s.changeType == ChangeType.DELETE) {
  109.                 // First use of this source file. Tag it as a rename so we
  110.                 // later know it is already been used as a rename, other
  111.                 // matches (if any) will claim themselves as copies instead.
  112.                 //
  113.                 s.changeType = ChangeType.RENAME;
  114.                 type = ChangeType.RENAME;
  115.             } else {
  116.                 type = ChangeType.COPY;
  117.             }

  118.             out.add(DiffEntry.pair(type, s, d, score(ent)));
  119.             dsts.set(dIdx, null); // Claim the destination was matched.
  120.             pm.update(1);
  121.         }

  122.         srcs = compactSrcList(srcs);
  123.         dsts = compactDstList(dsts);
  124.         pm.endTask();
  125.     }

  126.     List<DiffEntry> getMatches() {
  127.         return out;
  128.     }

  129.     List<DiffEntry> getLeftOverSources() {
  130.         return srcs;
  131.     }

  132.     List<DiffEntry> getLeftOverDestinations() {
  133.         return dsts;
  134.     }

  135.     boolean isTableOverflow() {
  136.         return tableOverflow;
  137.     }

  138.     private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
  139.         ArrayList<DiffEntry> r = new ArrayList<>(in.size());
  140.         for (DiffEntry e : in) {
  141.             if (e.changeType == ChangeType.DELETE)
  142.                 r.add(e);
  143.         }
  144.         return r;
  145.     }

  146.     private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
  147.         ArrayList<DiffEntry> r = new ArrayList<>(in.size());
  148.         for (DiffEntry e : in) {
  149.             if (e != null)
  150.                 r.add(e);
  151.         }
  152.         return r;
  153.     }

  154.     private int buildMatrix(ProgressMonitor pm)
  155.             throws IOException, CancelledException {
  156.         // Allocate for the worst-case scenario where every pair has a
  157.         // score that we need to consider. We might not need that many.
  158.         //
  159.         matrix = new long[srcs.size() * dsts.size()];

  160.         long[] srcSizes = new long[srcs.size()];
  161.         long[] dstSizes = new long[dsts.size()];
  162.         BitSet dstTooLarge = null;

  163.         // Consider each pair of files, if the score is above the minimum
  164.         // threshold we need record that scoring in the matrix so we can
  165.         // later find the best matches.
  166.         //
  167.         int mNext = 0;
  168.         SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
  169.             DiffEntry srcEnt = srcs.get(srcIdx);
  170.             if (!isFile(srcEnt.oldMode)) {
  171.                 pm.update(dsts.size());
  172.                 continue;
  173.             }

  174.             SimilarityIndex s = null;

  175.             for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
  176.                 if (pm.isCancelled()) {
  177.                     // TODO(ms): use
  178.                     // org.eclipse.jgit.api.errors.CanceledException in next
  179.                     // major version
  180.                     throw new CancelledException(
  181.                             JGitText.get().renameCancelled);
  182.                 }

  183.                 DiffEntry dstEnt = dsts.get(dstIdx);

  184.                 if (!isFile(dstEnt.newMode)) {
  185.                     pm.update(1);
  186.                     continue;
  187.                 }

  188.                 if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
  189.                     pm.update(1);
  190.                     continue;
  191.                 }

  192.                 if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
  193.                     pm.update(1);
  194.                     continue;
  195.                 }

  196.                 long srcSize = srcSizes[srcIdx];
  197.                 if (srcSize == 0) {
  198.                     srcSize = size(OLD, srcEnt) + 1;
  199.                     srcSizes[srcIdx] = srcSize;
  200.                 }

  201.                 long dstSize = dstSizes[dstIdx];
  202.                 if (dstSize == 0) {
  203.                     dstSize = size(NEW, dstEnt) + 1;
  204.                     dstSizes[dstIdx] = dstSize;
  205.                 }

  206.                 long max = Math.max(srcSize, dstSize);
  207.                 long min = Math.min(srcSize, dstSize);
  208.                 if (min * 100 / max < renameScore) {
  209.                     // Cannot possibly match, as the file sizes are so different
  210.                     pm.update(1);
  211.                     continue;
  212.                 }

  213.                 if (s == null) {
  214.                     try {
  215.                         s = hash(OLD, srcEnt);
  216.                     } catch (TableFullException tableFull) {
  217.                         tableOverflow = true;
  218.                         continue SRC;
  219.                     }
  220.                 }

  221.                 SimilarityIndex d;
  222.                 try {
  223.                     d = hash(NEW, dstEnt);
  224.                 } catch (TableFullException tableFull) {
  225.                     if (dstTooLarge == null)
  226.                         dstTooLarge = new BitSet(dsts.size());
  227.                     dstTooLarge.set(dstIdx);
  228.                     tableOverflow = true;
  229.                     pm.update(1);
  230.                     continue;
  231.                 }

  232.                 int contentScore = s.score(d, 10000);

  233.                 // nameScore returns a value between 0 and 100, but we want it
  234.                 // to be in the same range as the content score. This allows it
  235.                 // to be dropped into the pretty formula for the final score.
  236.                 int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;

  237.                 int score = (contentScore * 99 + nameScore * 1) / 10000;

  238.                 if (score < renameScore) {
  239.                     pm.update(1);
  240.                     continue;
  241.                 }

  242.                 matrix[mNext++] = encode(score, srcIdx, dstIdx);
  243.                 pm.update(1);
  244.             }
  245.         }

  246.         // Sort everything in the range we populated, which might be the
  247.         // entire matrix, or just a smaller slice if we had some bad low
  248.         // scoring pairs.
  249.         //
  250.         Arrays.sort(matrix, 0, mNext);
  251.         return mNext;
  252.     }

  253.     static int nameScore(String a, String b) {
  254.         int aDirLen = a.lastIndexOf('/') + 1;
  255.         int bDirLen = b.lastIndexOf('/') + 1;

  256.         int dirMin = Math.min(aDirLen, bDirLen);
  257.         int dirMax = Math.max(aDirLen, bDirLen);

  258.         final int dirScoreLtr;
  259.         final int dirScoreRtl;

  260.         if (dirMax == 0) {
  261.             dirScoreLtr = 100;
  262.             dirScoreRtl = 100;
  263.         } else {
  264.             int dirSim = 0;
  265.             for (; dirSim < dirMin; dirSim++) {
  266.                 if (a.charAt(dirSim) != b.charAt(dirSim))
  267.                     break;
  268.             }
  269.             dirScoreLtr = (dirSim * 100) / dirMax;

  270.             if (dirScoreLtr == 100) {
  271.                 dirScoreRtl = 100;
  272.             } else {
  273.                 for (dirSim = 0; dirSim < dirMin; dirSim++) {
  274.                     if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
  275.                             - dirSim))
  276.                         break;
  277.                 }
  278.                 dirScoreRtl = (dirSim * 100) / dirMax;
  279.             }
  280.         }

  281.         int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
  282.         int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);

  283.         int fileSim = 0;
  284.         for (; fileSim < fileMin; fileSim++) {
  285.             if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
  286.                     - fileSim))
  287.                 break;
  288.         }
  289.         int fileScore = (fileSim * 100) / fileMax;

  290.         return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
  291.     }

  292.     private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
  293.             throws IOException, TableFullException {
  294.         SimilarityIndex r = new SimilarityIndex();
  295.         r.hash(reader.open(side, ent));
  296.         r.sort();
  297.         return r;
  298.     }

  299.     private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
  300.         return reader.size(side, ent);
  301.     }

  302.     private static int score(long value) {
  303.         return (int) (value >>> SCORE_SHIFT);
  304.     }

  305.     static int srcFile(long value) {
  306.         return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
  307.     }

  308.     static int dstFile(long value) {
  309.         return decodeFile(((int) value) & INDEX_MASK);
  310.     }

  311.     static long encode(int score, int srcIdx, int dstIdx) {
  312.         return (((long) score) << SCORE_SHIFT) //
  313.                 | (encodeFile(srcIdx) << BITS_PER_INDEX) //
  314.                 | encodeFile(dstIdx);
  315.     }

  316.     private static long encodeFile(int idx) {
  317.         // We invert the index so that the first file in the list sorts
  318.         // later in the table. This permits us to break ties favoring
  319.         // earlier names over later ones.
  320.         //
  321.         return INDEX_MASK - idx;
  322.     }

  323.     private static int decodeFile(int v) {
  324.         return INDEX_MASK - v;
  325.     }

  326.     private static boolean isFile(FileMode mode) {
  327.         return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
  328.     }
  329. }