HistogramDiff.java

  1. /*
  2.  * Copyright (C) 2010, Google Inc. and others
  3.  *
  4.  * This program and the accompanying materials are made available under the
  5.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  6.  * https://www.eclipse.org/org/documents/edl-v10.php.
  7.  *
  8.  * SPDX-License-Identifier: BSD-3-Clause
  9.  */

  10. package org.eclipse.jgit.diff;

  11. import java.util.ArrayList;
  12. import java.util.List;

  13. /**
  14.  * An extended form of Bram Cohen's patience diff algorithm.
  15.  * <p>
  16.  * This implementation was derived by using the 4 rules that are outlined in
  17.  * Bram Cohen's <a href="http://bramcohen.livejournal.com/73318.html">blog</a>,
  18.  * and then was further extended to support low-occurrence common elements.
  19.  * <p>
  20.  * The basic idea of the algorithm is to create a histogram of occurrences for
  21.  * each element of sequence A. Each element of sequence B is then considered in
  22.  * turn. If the element also exists in sequence A, and has a lower occurrence
  23.  * count, the positions are considered as a candidate for the longest common
  24.  * subsequence (LCS). After scanning of B is complete the LCS that has the
  25.  * lowest number of occurrences is chosen as a split point. The region is split
  26.  * around the LCS, and the algorithm is recursively applied to the sections
  27.  * before and after the LCS.
  28.  * <p>
  29.  * By always selecting a LCS position with the lowest occurrence count, this
  30.  * algorithm behaves exactly like Bram Cohen's patience diff whenever there is a
  31.  * unique common element available between the two sequences. When no unique
  32.  * elements exist, the lowest occurrence element is chosen instead. This offers
  33.  * more readable diffs than simply falling back on the standard Myers' O(ND)
  34.  * algorithm would produce.
  35.  * <p>
  36.  * To prevent the algorithm from having an O(N^2) running time, an upper limit
  37.  * on the number of unique elements in a histogram bucket is configured by
  38.  * {@link #setMaxChainLength(int)}. If sequence A has more than this many
  39.  * elements that hash into the same hash bucket, the algorithm passes the region
  40.  * to {@link #setFallbackAlgorithm(DiffAlgorithm)}. If no fallback algorithm is
  41.  * configured, the region is emitted as a replace edit.
  42.  * <p>
  43.  * During scanning of sequence B, any element of A that occurs more than
  44.  * {@link #setMaxChainLength(int)} times is never considered for an LCS match
  45.  * position, even if it is common between the two sequences. This limits the
  46.  * number of locations in sequence A that must be considered to find the LCS,
  47.  * and helps maintain a lower running time bound.
  48.  * <p>
  49.  * So long as {@link #setMaxChainLength(int)} is a small constant (such as 64),
  50.  * the algorithm runs in O(N * D) time, where N is the sum of the input lengths
  51.  * and D is the number of edits in the resulting EditList. If the supplied
  52.  * {@link org.eclipse.jgit.diff.SequenceComparator} has a good hash function,
  53.  * this implementation typically out-performs
  54.  * {@link org.eclipse.jgit.diff.MyersDiff}, even though its theoretical running
  55.  * time is the same.
  56.  * <p>
  57.  * This implementation has an internal limitation that prevents it from handling
  58.  * sequences with more than 268,435,456 (2^28) elements.
  59.  */
  60. public class HistogramDiff extends LowLevelDiffAlgorithm {
  61.     /** Algorithm to use when there are too many element occurrences. */
  62.     DiffAlgorithm fallback = MyersDiff.INSTANCE;

  63.     /**
  64.      * Maximum number of positions to consider for a given element hash.
  65.      *
  66.      * All elements with the same hash are stored into a single chain. The chain
  67.      * size is capped to ensure search is linear time at O(len_A + len_B) rather
  68.      * than quadratic at O(len_A * len_B).
  69.      */
  70.     int maxChainLength = 64;

  71.     /**
  72.      * Set the algorithm used when there are too many element occurrences.
  73.      *
  74.      * @param alg
  75.      *            the secondary algorithm. If null the region will be denoted as
  76.      *            a single REPLACE block.
  77.      */
  78.     public void setFallbackAlgorithm(DiffAlgorithm alg) {
  79.         fallback = alg;
  80.     }

  81.     /**
  82.      * Maximum number of positions to consider for a given element hash.
  83.      *
  84.      * All elements with the same hash are stored into a single chain. The chain
  85.      * size is capped to ensure search is linear time at O(len_A + len_B) rather
  86.      * than quadratic at O(len_A * len_B).
  87.      *
  88.      * @param maxLen
  89.      *            new maximum length.
  90.      */
  91.     public void setMaxChainLength(int maxLen) {
  92.         maxChainLength = maxLen;
  93.     }

  94.     /** {@inheritDoc} */
  95.     @Override
  96.     public <S extends Sequence> void diffNonCommon(EditList edits,
  97.             HashedSequenceComparator<S> cmp, HashedSequence<S> a,
  98.             HashedSequence<S> b, Edit region) {
  99.         new State<>(edits, cmp, a, b).diffRegion(region);
  100.     }

  101.     private class State<S extends Sequence> {
  102.         private final HashedSequenceComparator<S> cmp;
  103.         private final HashedSequence<S> a;
  104.         private final HashedSequence<S> b;
  105.         private final List<Edit> queue = new ArrayList<>();

  106.         /** Result edits we have determined that must be made to convert a to b. */
  107.         final EditList edits;

  108.         State(EditList edits, HashedSequenceComparator<S> cmp,
  109.                 HashedSequence<S> a, HashedSequence<S> b) {
  110.             this.cmp = cmp;
  111.             this.a = a;
  112.             this.b = b;
  113.             this.edits = edits;
  114.         }

  115.         void diffRegion(Edit r) {
  116.             diffReplace(r);
  117.             while (!queue.isEmpty())
  118.                 diff(queue.remove(queue.size() - 1));
  119.         }

  120.         private void diffReplace(Edit r) {
  121.             Edit lcs = new HistogramDiffIndex<>(maxChainLength, cmp, a, b, r)
  122.                     .findLongestCommonSequence();
  123.             if (lcs != null) {
  124.                 // If we were given an edit, we can prove a result here.
  125.                 //
  126.                 if (lcs.isEmpty()) {
  127.                     // An empty edit indicates there is nothing in common.
  128.                     // Replace the entire region.
  129.                     //
  130.                     edits.add(r);
  131.                 } else {
  132.                     queue.add(r.after(lcs));
  133.                     queue.add(r.before(lcs));
  134.                 }

  135.             } else if (fallback instanceof LowLevelDiffAlgorithm) {
  136.                 LowLevelDiffAlgorithm fb = (LowLevelDiffAlgorithm) fallback;
  137.                 fb.diffNonCommon(edits, cmp, a, b, r);

  138.             } else if (fallback != null) {
  139.                 SubsequenceComparator<HashedSequence<S>> cs = subcmp();
  140.                 Subsequence<HashedSequence<S>> as = Subsequence.a(a, r);
  141.                 Subsequence<HashedSequence<S>> bs = Subsequence.b(b, r);

  142.                 EditList res = fallback.diffNonCommon(cs, as, bs);
  143.                 edits.addAll(Subsequence.toBase(res, as, bs));

  144.             } else {
  145.                 edits.add(r);
  146.             }
  147.         }

  148.         private void diff(Edit r) {
  149.             switch (r.getType()) {
  150.             case INSERT:
  151.             case DELETE:
  152.                 edits.add(r);
  153.                 break;

  154.             case REPLACE:
  155.                 if (r.getLengthA() == 1 && r.getLengthB() == 1)
  156.                     edits.add(r);
  157.                 else
  158.                     diffReplace(r);
  159.                 break;

  160.             case EMPTY:
  161.             default:
  162.                 throw new IllegalStateException();
  163.             }
  164.         }

  165.         private SubsequenceComparator<HashedSequence<S>> subcmp() {
  166.             return new SubsequenceComparator<>(cmp);
  167.         }
  168.     }
  169. }