SimilarityRenameDetector.java

/*
 * Copyright (C) 2010, Google Inc. and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.diff;

import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;

import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
import org.eclipse.jgit.errors.CancelledException;
import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.NullProgressMonitor;
import org.eclipse.jgit.lib.ProgressMonitor;

class SimilarityRenameDetector {
    /**
     * Number of bits we need to express an index into src or dst list.
     * <p>
     * This must be 28, giving us a limit of 2^28 entries in either list, which
     * is an insane limit of 536,870,912 file names being considered in a single
     * rename pass. The other 8 bits are used to store the score, while staying
     * under 127 so the long doesn't go negative.
     */
    private static final int BITS_PER_INDEX = 28;

    private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;

    private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;

    private ContentSource.Pair reader;

    /**
     * All sources to consider for copies or renames.
     * <p>
     * A source is typically a {@link ChangeType#DELETE} change, but could be
     * another type when trying to perform copy detection concurrently with
     * rename detection.
     */
    private List<DiffEntry> srcs;

    /**
     * All destinations to consider looking for a rename.
     * <p>
     * A destination is typically an {@link ChangeType#ADD}, as the name has
     * just come into existence, and we want to discover where its initial
     * content came from.
     */
    private List<DiffEntry> dsts;

    /**
     * Matrix of all examined file pairs, and their scores.
     * <p>
     * The upper 8 bits of each long stores the score, but the score is bounded
     * to be in the range (0, 128] so that the highest bit is never set, and all
     * entries are therefore positive.
     * <p>
     * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
     * as the lower two groups of 28 bits, respectively, but the encoding is
     * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
     * lower list indices later in the matrix, giving precedence to files whose
     * names sort earlier in the tree.
     */
    private long[] matrix;

    /** Score a pair must exceed to be considered a rename. */
    private int renameScore = 60;

    /** Set if any {@link SimilarityIndex.TableFullException} occurs. */
    private boolean tableOverflow;

    private List<DiffEntry> out;

    SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
            List<DiffEntry> dsts) {
        this.reader = reader;
        this.srcs = srcs;
        this.dsts = dsts;
    }

    void setRenameScore(int score) {
        renameScore = score;
    }

    void compute(ProgressMonitor pm) throws IOException, CancelledException {
        if (pm == null)
            pm = NullProgressMonitor.INSTANCE;

        pm.beginTask(JGitText.get().renamesFindingByContent, //
                2 * srcs.size() * dsts.size());

        int mNext = buildMatrix(pm);
        out = new ArrayList<>(Math.min(mNext, dsts.size()));

        // Match rename pairs on a first come, first serve basis until
        // we have looked at everything that is above our minimum score.
        //
        for (--mNext; mNext >= 0; mNext--) {
            if (pm.isCancelled()) {
                // TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
                // in next major version
                throw new CancelledException(JGitText.get().renameCancelled);
            }
            long ent = matrix[mNext];
            int sIdx = srcFile(ent);
            int dIdx = dstFile(ent);
            DiffEntry s = srcs.get(sIdx);
            DiffEntry d = dsts.get(dIdx);

            if (d == null) {
                pm.update(1);
                continue; // was already matched earlier
            }

            ChangeType type;
            if (s.changeType == ChangeType.DELETE) {
                // First use of this source file. Tag it as a rename so we
                // later know it is already been used as a rename, other
                // matches (if any) will claim themselves as copies instead.
                //
                s.changeType = ChangeType.RENAME;
                type = ChangeType.RENAME;
            } else {
                type = ChangeType.COPY;
            }

            out.add(DiffEntry.pair(type, s, d, score(ent)));
            dsts.set(dIdx, null); // Claim the destination was matched.
            pm.update(1);
        }

        srcs = compactSrcList(srcs);
        dsts = compactDstList(dsts);
        pm.endTask();
    }

    List<DiffEntry> getMatches() {
        return out;
    }

    List<DiffEntry> getLeftOverSources() {
        return srcs;
    }

    List<DiffEntry> getLeftOverDestinations() {
        return dsts;
    }

    boolean isTableOverflow() {
        return tableOverflow;
    }

    private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
        ArrayList<DiffEntry> r = new ArrayList<>(in.size());
        for (DiffEntry e : in) {
            if (e.changeType == ChangeType.DELETE)
                r.add(e);
        }
        return r;
    }

    private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
        ArrayList<DiffEntry> r = new ArrayList<>(in.size());
        for (DiffEntry e : in) {
            if (e != null)
                r.add(e);
        }
        return r;
    }

    private int buildMatrix(ProgressMonitor pm)
            throws IOException, CancelledException {
        // Allocate for the worst-case scenario where every pair has a
        // score that we need to consider. We might not need that many.
        //
        matrix = new long[srcs.size() * dsts.size()];

        long[] srcSizes = new long[srcs.size()];
        long[] dstSizes = new long[dsts.size()];
        BitSet dstTooLarge = null;

        // Consider each pair of files, if the score is above the minimum
        // threshold we need record that scoring in the matrix so we can
        // later find the best matches.
        //
        int mNext = 0;
        SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
            DiffEntry srcEnt = srcs.get(srcIdx);
            if (!isFile(srcEnt.oldMode)) {
                pm.update(dsts.size());
                continue;
            }

            SimilarityIndex s = null;

            for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
                if (pm.isCancelled()) {
                    // TODO(ms): use
                    // org.eclipse.jgit.api.errors.CanceledException in next
                    // major version
                    throw new CancelledException(
                            JGitText.get().renameCancelled);
                }

                DiffEntry dstEnt = dsts.get(dstIdx);

                if (!isFile(dstEnt.newMode)) {
                    pm.update(1);
                    continue;
                }

                if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
                    pm.update(1);
                    continue;
                }

                if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
                    pm.update(1);
                    continue;
                }

                long srcSize = srcSizes[srcIdx];
                if (srcSize == 0) {
                    srcSize = size(OLD, srcEnt) + 1;
                    srcSizes[srcIdx] = srcSize;
                }

                long dstSize = dstSizes[dstIdx];
                if (dstSize == 0) {
                    dstSize = size(NEW, dstEnt) + 1;
                    dstSizes[dstIdx] = dstSize;
                }

                long max = Math.max(srcSize, dstSize);
                long min = Math.min(srcSize, dstSize);
                if (min * 100 / max < renameScore) {
                    // Cannot possibly match, as the file sizes are so different
                    pm.update(1);
                    continue;
                }

                if (s == null) {
                    try {
                        s = hash(OLD, srcEnt);
                    } catch (TableFullException tableFull) {
                        tableOverflow = true;
                        continue SRC;
                    }
                }

                SimilarityIndex d;
                try {
                    d = hash(NEW, dstEnt);
                } catch (TableFullException tableFull) {
                    if (dstTooLarge == null)
                        dstTooLarge = new BitSet(dsts.size());
                    dstTooLarge.set(dstIdx);
                    tableOverflow = true;
                    pm.update(1);
                    continue;
                }

                int contentScore = s.score(d, 10000);

                // nameScore returns a value between 0 and 100, but we want it
                // to be in the same range as the content score. This allows it
                // to be dropped into the pretty formula for the final score.
                int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;

                int score = (contentScore * 99 + nameScore * 1) / 10000;

                if (score < renameScore) {
                    pm.update(1);
                    continue;
                }

                matrix[mNext++] = encode(score, srcIdx, dstIdx);
                pm.update(1);
            }
        }

        // Sort everything in the range we populated, which might be the
        // entire matrix, or just a smaller slice if we had some bad low
        // scoring pairs.
        //
        Arrays.sort(matrix, 0, mNext);
        return mNext;
    }

    static int nameScore(String a, String b) {
        int aDirLen = a.lastIndexOf('/') + 1;
        int bDirLen = b.lastIndexOf('/') + 1;

        int dirMin = Math.min(aDirLen, bDirLen);
        int dirMax = Math.max(aDirLen, bDirLen);

        final int dirScoreLtr;
        final int dirScoreRtl;

        if (dirMax == 0) {
            dirScoreLtr = 100;
            dirScoreRtl = 100;
        } else {
            int dirSim = 0;
            for (; dirSim < dirMin; dirSim++) {
                if (a.charAt(dirSim) != b.charAt(dirSim))
                    break;
            }
            dirScoreLtr = (dirSim * 100) / dirMax;

            if (dirScoreLtr == 100) {
                dirScoreRtl = 100;
            } else {
                for (dirSim = 0; dirSim < dirMin; dirSim++) {
                    if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
                            - dirSim))
                        break;
                }
                dirScoreRtl = (dirSim * 100) / dirMax;
            }
        }

        int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
        int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);

        int fileSim = 0;
        for (; fileSim < fileMin; fileSim++) {
            if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
                    - fileSim))
                break;
        }
        int fileScore = (fileSim * 100) / fileMax;

        return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
    }

    private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
            throws IOException, TableFullException {
        SimilarityIndex r = new SimilarityIndex();
        r.hash(reader.open(side, ent));
        r.sort();
        return r;
    }

    private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
        return reader.size(side, ent);
    }

    private static int score(long value) {
        return (int) (value >>> SCORE_SHIFT);
    }

    static int srcFile(long value) {
        return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
    }

    static int dstFile(long value) {
        return decodeFile(((int) value) & INDEX_MASK);
    }

    static long encode(int score, int srcIdx, int dstIdx) {
        return (((long) score) << SCORE_SHIFT) //
                | (encodeFile(srcIdx) << BITS_PER_INDEX) //
                | encodeFile(dstIdx);
    }

    private static long encodeFile(int idx) {
        // We invert the index so that the first file in the list sorts
        // later in the table. This permits us to break ties favoring
        // earlier names over later ones.
        //
        return INDEX_MASK - idx;
    }

    private static int decodeFile(int v) {
        return INDEX_MASK - v;
    }

    private static boolean isFile(FileMode mode) {
        return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
    }
}