SimilarityRenameDetector.java
- /*
- * Copyright (C) 2010, Google Inc. and others
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Distribution License v. 1.0 which is available at
- * https://www.eclipse.org/org/documents/edl-v10.php.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- */
- package org.eclipse.jgit.diff;
- import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
- import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.BitSet;
- import java.util.List;
- import org.eclipse.jgit.diff.DiffEntry.ChangeType;
- import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
- import org.eclipse.jgit.errors.CancelledException;
- import org.eclipse.jgit.internal.JGitText;
- import org.eclipse.jgit.lib.FileMode;
- import org.eclipse.jgit.lib.NullProgressMonitor;
- import org.eclipse.jgit.lib.ProgressMonitor;
- class SimilarityRenameDetector {
- /**
- * Number of bits we need to express an index into src or dst list.
- * <p>
- * This must be 28, giving us a limit of 2^28 entries in either list, which
- * is an insane limit of 536,870,912 file names being considered in a single
- * rename pass. The other 8 bits are used to store the score, while staying
- * under 127 so the long doesn't go negative.
- */
- private static final int BITS_PER_INDEX = 28;
- private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
- private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
- private ContentSource.Pair reader;
- /**
- * All sources to consider for copies or renames.
- * <p>
- * A source is typically a {@link ChangeType#DELETE} change, but could be
- * another type when trying to perform copy detection concurrently with
- * rename detection.
- */
- private List<DiffEntry> srcs;
- /**
- * All destinations to consider looking for a rename.
- * <p>
- * A destination is typically an {@link ChangeType#ADD}, as the name has
- * just come into existence, and we want to discover where its initial
- * content came from.
- */
- private List<DiffEntry> dsts;
- /**
- * Matrix of all examined file pairs, and their scores.
- * <p>
- * The upper 8 bits of each long stores the score, but the score is bounded
- * to be in the range (0, 128] so that the highest bit is never set, and all
- * entries are therefore positive.
- * <p>
- * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
- * as the lower two groups of 28 bits, respectively, but the encoding is
- * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
- * lower list indices later in the matrix, giving precedence to files whose
- * names sort earlier in the tree.
- */
- private long[] matrix;
- /** Score a pair must exceed to be considered a rename. */
- private int renameScore = 60;
- /** Set if any {@link SimilarityIndex.TableFullException} occurs. */
- private boolean tableOverflow;
- private List<DiffEntry> out;
- SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
- List<DiffEntry> dsts) {
- this.reader = reader;
- this.srcs = srcs;
- this.dsts = dsts;
- }
- void setRenameScore(int score) {
- renameScore = score;
- }
- void compute(ProgressMonitor pm) throws IOException, CancelledException {
- if (pm == null)
- pm = NullProgressMonitor.INSTANCE;
- pm.beginTask(JGitText.get().renamesFindingByContent, //
- 2 * srcs.size() * dsts.size());
- int mNext = buildMatrix(pm);
- out = new ArrayList<>(Math.min(mNext, dsts.size()));
- // Match rename pairs on a first come, first serve basis until
- // we have looked at everything that is above our minimum score.
- //
- for (--mNext; mNext >= 0; mNext--) {
- if (pm.isCancelled()) {
- // TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
- // in next major version
- throw new CancelledException(JGitText.get().renameCancelled);
- }
- long ent = matrix[mNext];
- int sIdx = srcFile(ent);
- int dIdx = dstFile(ent);
- DiffEntry s = srcs.get(sIdx);
- DiffEntry d = dsts.get(dIdx);
- if (d == null) {
- pm.update(1);
- continue; // was already matched earlier
- }
- ChangeType type;
- if (s.changeType == ChangeType.DELETE) {
- // First use of this source file. Tag it as a rename so we
- // later know it is already been used as a rename, other
- // matches (if any) will claim themselves as copies instead.
- //
- s.changeType = ChangeType.RENAME;
- type = ChangeType.RENAME;
- } else {
- type = ChangeType.COPY;
- }
- out.add(DiffEntry.pair(type, s, d, score(ent)));
- dsts.set(dIdx, null); // Claim the destination was matched.
- pm.update(1);
- }
- srcs = compactSrcList(srcs);
- dsts = compactDstList(dsts);
- pm.endTask();
- }
- List<DiffEntry> getMatches() {
- return out;
- }
- List<DiffEntry> getLeftOverSources() {
- return srcs;
- }
- List<DiffEntry> getLeftOverDestinations() {
- return dsts;
- }
- boolean isTableOverflow() {
- return tableOverflow;
- }
- private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
- ArrayList<DiffEntry> r = new ArrayList<>(in.size());
- for (DiffEntry e : in) {
- if (e.changeType == ChangeType.DELETE)
- r.add(e);
- }
- return r;
- }
- private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
- ArrayList<DiffEntry> r = new ArrayList<>(in.size());
- for (DiffEntry e : in) {
- if (e != null)
- r.add(e);
- }
- return r;
- }
- private int buildMatrix(ProgressMonitor pm)
- throws IOException, CancelledException {
- // Allocate for the worst-case scenario where every pair has a
- // score that we need to consider. We might not need that many.
- //
- matrix = new long[srcs.size() * dsts.size()];
- long[] srcSizes = new long[srcs.size()];
- long[] dstSizes = new long[dsts.size()];
- BitSet dstTooLarge = null;
- // Consider each pair of files, if the score is above the minimum
- // threshold we need record that scoring in the matrix so we can
- // later find the best matches.
- //
- int mNext = 0;
- SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
- DiffEntry srcEnt = srcs.get(srcIdx);
- if (!isFile(srcEnt.oldMode)) {
- pm.update(dsts.size());
- continue;
- }
- SimilarityIndex s = null;
- for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
- if (pm.isCancelled()) {
- // TODO(ms): use
- // org.eclipse.jgit.api.errors.CanceledException in next
- // major version
- throw new CancelledException(
- JGitText.get().renameCancelled);
- }
- DiffEntry dstEnt = dsts.get(dstIdx);
- if (!isFile(dstEnt.newMode)) {
- pm.update(1);
- continue;
- }
- if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
- pm.update(1);
- continue;
- }
- if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
- pm.update(1);
- continue;
- }
- long srcSize = srcSizes[srcIdx];
- if (srcSize == 0) {
- srcSize = size(OLD, srcEnt) + 1;
- srcSizes[srcIdx] = srcSize;
- }
- long dstSize = dstSizes[dstIdx];
- if (dstSize == 0) {
- dstSize = size(NEW, dstEnt) + 1;
- dstSizes[dstIdx] = dstSize;
- }
- long max = Math.max(srcSize, dstSize);
- long min = Math.min(srcSize, dstSize);
- if (min * 100 / max < renameScore) {
- // Cannot possibly match, as the file sizes are so different
- pm.update(1);
- continue;
- }
- if (s == null) {
- try {
- s = hash(OLD, srcEnt);
- } catch (TableFullException tableFull) {
- tableOverflow = true;
- continue SRC;
- }
- }
- SimilarityIndex d;
- try {
- d = hash(NEW, dstEnt);
- } catch (TableFullException tableFull) {
- if (dstTooLarge == null)
- dstTooLarge = new BitSet(dsts.size());
- dstTooLarge.set(dstIdx);
- tableOverflow = true;
- pm.update(1);
- continue;
- }
- int contentScore = s.score(d, 10000);
- // nameScore returns a value between 0 and 100, but we want it
- // to be in the same range as the content score. This allows it
- // to be dropped into the pretty formula for the final score.
- int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
- int score = (contentScore * 99 + nameScore * 1) / 10000;
- if (score < renameScore) {
- pm.update(1);
- continue;
- }
- matrix[mNext++] = encode(score, srcIdx, dstIdx);
- pm.update(1);
- }
- }
- // Sort everything in the range we populated, which might be the
- // entire matrix, or just a smaller slice if we had some bad low
- // scoring pairs.
- //
- Arrays.sort(matrix, 0, mNext);
- return mNext;
- }
- static int nameScore(String a, String b) {
- int aDirLen = a.lastIndexOf('/') + 1;
- int bDirLen = b.lastIndexOf('/') + 1;
- int dirMin = Math.min(aDirLen, bDirLen);
- int dirMax = Math.max(aDirLen, bDirLen);
- final int dirScoreLtr;
- final int dirScoreRtl;
- if (dirMax == 0) {
- dirScoreLtr = 100;
- dirScoreRtl = 100;
- } else {
- int dirSim = 0;
- for (; dirSim < dirMin; dirSim++) {
- if (a.charAt(dirSim) != b.charAt(dirSim))
- break;
- }
- dirScoreLtr = (dirSim * 100) / dirMax;
- if (dirScoreLtr == 100) {
- dirScoreRtl = 100;
- } else {
- for (dirSim = 0; dirSim < dirMin; dirSim++) {
- if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
- - dirSim))
- break;
- }
- dirScoreRtl = (dirSim * 100) / dirMax;
- }
- }
- int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
- int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
- int fileSim = 0;
- for (; fileSim < fileMin; fileSim++) {
- if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
- - fileSim))
- break;
- }
- int fileScore = (fileSim * 100) / fileMax;
- return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
- }
- private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
- throws IOException, TableFullException {
- SimilarityIndex r = new SimilarityIndex();
- r.hash(reader.open(side, ent));
- r.sort();
- return r;
- }
- private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
- return reader.size(side, ent);
- }
- private static int score(long value) {
- return (int) (value >>> SCORE_SHIFT);
- }
- static int srcFile(long value) {
- return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
- }
- static int dstFile(long value) {
- return decodeFile(((int) value) & INDEX_MASK);
- }
- static long encode(int score, int srcIdx, int dstIdx) {
- return (((long) score) << SCORE_SHIFT) //
- | (encodeFile(srcIdx) << BITS_PER_INDEX) //
- | encodeFile(dstIdx);
- }
- private static long encodeFile(int idx) {
- // We invert the index so that the first file in the list sorts
- // later in the table. This permits us to break ties favoring
- // earlier names over later ones.
- //
- return INDEX_MASK - idx;
- }
- private static int decodeFile(int v) {
- return INDEX_MASK - v;
- }
- private static boolean isFile(FileMode mode) {
- return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
- }
- }