FileHeader.java

  1. /*
  2.  * Copyright (C) 2008-2009, Google Inc. and others
  3.  *
  4.  * This program and the accompanying materials are made available under the
  5.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  6.  * https://www.eclipse.org/org/documents/edl-v10.php.
  7.  *
  8.  * SPDX-License-Identifier: BSD-3-Clause
  9.  */

  10. package org.eclipse.jgit.patch;

  11. import static java.nio.charset.StandardCharsets.UTF_8;
  12. import static org.eclipse.jgit.lib.Constants.encodeASCII;
  13. import static org.eclipse.jgit.util.RawParseUtils.decode;
  14. import static org.eclipse.jgit.util.RawParseUtils.decodeNoFallback;
  15. import static org.eclipse.jgit.util.RawParseUtils.extractBinaryString;
  16. import static org.eclipse.jgit.util.RawParseUtils.match;
  17. import static org.eclipse.jgit.util.RawParseUtils.nextLF;
  18. import static org.eclipse.jgit.util.RawParseUtils.parseBase10;

  19. import java.io.IOException;
  20. import java.nio.charset.CharacterCodingException;
  21. import java.nio.charset.Charset;
  22. import java.text.MessageFormat;
  23. import java.util.ArrayList;
  24. import java.util.Collections;
  25. import java.util.List;

  26. import org.eclipse.jgit.diff.DiffEntry;
  27. import org.eclipse.jgit.diff.EditList;
  28. import org.eclipse.jgit.internal.JGitText;
  29. import org.eclipse.jgit.lib.AbbreviatedObjectId;
  30. import org.eclipse.jgit.lib.FileMode;
  31. import org.eclipse.jgit.util.QuotedString;
  32. import org.eclipse.jgit.util.RawParseUtils;
  33. import org.eclipse.jgit.util.TemporaryBuffer;

  34. /**
  35.  * Patch header describing an action for a single file path.
  36.  */
  37. public class FileHeader extends DiffEntry {
  38.     private static final byte[] OLD_MODE = encodeASCII("old mode "); //$NON-NLS-1$

  39.     private static final byte[] NEW_MODE = encodeASCII("new mode "); //$NON-NLS-1$

  40.     static final byte[] DELETED_FILE_MODE = encodeASCII("deleted file mode "); //$NON-NLS-1$

  41.     static final byte[] NEW_FILE_MODE = encodeASCII("new file mode "); //$NON-NLS-1$

  42.     private static final byte[] COPY_FROM = encodeASCII("copy from "); //$NON-NLS-1$

  43.     private static final byte[] COPY_TO = encodeASCII("copy to "); //$NON-NLS-1$

  44.     private static final byte[] RENAME_OLD = encodeASCII("rename old "); //$NON-NLS-1$

  45.     private static final byte[] RENAME_NEW = encodeASCII("rename new "); //$NON-NLS-1$

  46.     private static final byte[] RENAME_FROM = encodeASCII("rename from "); //$NON-NLS-1$

  47.     private static final byte[] RENAME_TO = encodeASCII("rename to "); //$NON-NLS-1$

  48.     private static final byte[] SIMILARITY_INDEX = encodeASCII("similarity index "); //$NON-NLS-1$

  49.     private static final byte[] DISSIMILARITY_INDEX = encodeASCII("dissimilarity index "); //$NON-NLS-1$

  50.     static final byte[] INDEX = encodeASCII("index "); //$NON-NLS-1$

  51.     static final byte[] OLD_NAME = encodeASCII("--- "); //$NON-NLS-1$

  52.     static final byte[] NEW_NAME = encodeASCII("+++ "); //$NON-NLS-1$

  /** Style of patch script that describes the change made to this file. */
  public enum PatchType {
    /** Patch of a text file in the traditional unified diff style. */
    UNIFIED,

    /** Patch without content, only the message "Binary files ... differ". */
    BINARY,

    /** Git binary patch that holds deltas for the pre and post image. */
    GIT_BINARY
  }

  62.     /** Buffer holding the patch data for this file. */
  63.     final byte[] buf;

  64.     /** Offset within {@link #buf} to the "diff ..." line. */
  65.     final int startOffset;

  66.     /** Position 1 past the end of this file within {@link #buf}. */
  67.     int endOffset;

  68.     /** Type of patch used to modify this file */
  69.     PatchType patchType;

  70.     /** The hunks of this file */
  71.     private List<HunkHeader> hunks;

  72.     /** If {@link #patchType} is {@link PatchType#GIT_BINARY}, the new image */
  73.     BinaryHunk forwardBinaryHunk;

  74.     /** If {@link #patchType} is {@link PatchType#GIT_BINARY}, the old image */
  75.     BinaryHunk reverseBinaryHunk;

  76.     /**
  77.      * Constructs a new FileHeader
  78.      *
  79.      * @param headerLines
  80.      *            buffer holding the diff header for this file
  81.      * @param edits
  82.      *            the edits for this file
  83.      * @param type
  84.      *            the type of patch used to modify this file
  85.      */
  86.     public FileHeader(byte[] headerLines, EditList edits, PatchType type) {
  87.         this(headerLines, 0);
  88.         endOffset = headerLines.length;
  89.         int ptr = parseGitFileName(Patch.DIFF_GIT.length, headerLines.length);
  90.         parseGitHeaders(ptr, headerLines.length);
  91.         this.patchType = type;
  92.         addHunk(new HunkHeader(this, edits));
  93.     }

  /**
   * Creates an unparsed header over a shared patch buffer.
   *
   * @param b
   *            buffer holding the patch data; kept by reference.
   * @param offset
   *            position in {@code b} where this file's script starts.
   */
  FileHeader(byte[] b, int offset) {
    startOffset = offset;
    buf = b;
    // Defaults that stay in place unless the header says otherwise.
    patchType = PatchType.UNIFIED;
    changeType = ChangeType.MODIFY;
  }

  100.     int getParentCount() {
  101.         return 1;
  102.     }

  103.     /**
  104.      * Get the byte array holding this file's patch script.
  105.      *
  106.      * @return the byte array holding this file's patch script.
  107.      */
  108.     public byte[] getBuffer() {
  109.         return buf;
  110.     }

  111.     /**
  112.      * Get offset of the start of this file's script in {@link #getBuffer()}.
  113.      *
  114.      * @return offset of the start of this file's script in
  115.      *         {@link #getBuffer()}.
  116.      */
  117.     public int getStartOffset() {
  118.         return startOffset;
  119.     }

  120.     /**
  121.      * Get offset one past the end of the file script.
  122.      *
  123.      * @return offset one past the end of the file script.
  124.      */
  125.     public int getEndOffset() {
  126.         return endOffset;
  127.     }

  128.     /**
  129.      * Convert the patch script for this file into a string.
  130.      * <p>
  131.      * The default character encoding
  132.      * ({@link java.nio.charset.StandardCharsets#UTF_8}) is assumed for both the
  133.      * old and new files.
  134.      *
  135.      * @return the patch script, as a Unicode string.
  136.      */
  137.     public String getScriptText() {
  138.         return getScriptText(null, null);
  139.     }

  140.     /**
  141.      * Convert the patch script for this file into a string.
  142.      *
  143.      * @param oldCharset
  144.      *            hint character set to decode the old lines with.
  145.      * @param newCharset
  146.      *            hint character set to decode the new lines with.
  147.      * @return the patch script, as a Unicode string.
  148.      */
  149.     public String getScriptText(Charset oldCharset, Charset newCharset) {
  150.         return getScriptText(new Charset[] { oldCharset, newCharset });
  151.     }

  152.     String getScriptText(Charset[] charsetGuess) {
  153.         if (getHunks().isEmpty()) {
  154.             // If we have no hunks then we can safely assume the entire
  155.             // patch is a binary style patch, or a meta-data only style
  156.             // patch. Either way the encoding of the headers should be
  157.             // strictly 7-bit US-ASCII and the body is either 7-bit ASCII
  158.             // (due to the base 85 encoding used for a BinaryHunk) or is
  159.             // arbitrary noise we have chosen to ignore and not understand
  160.             // (e.g. the message "Binary files ... differ").
  161.             //
  162.             return extractBinaryString(buf, startOffset, endOffset);
  163.         }

  164.         if (charsetGuess != null && charsetGuess.length != getParentCount() + 1)
  165.             throw new IllegalArgumentException(MessageFormat.format(
  166.                     JGitText.get().expectedCharacterEncodingGuesses,
  167.                     Integer.valueOf(getParentCount() + 1)));

  168.         if (trySimpleConversion(charsetGuess)) {
  169.             Charset cs = charsetGuess != null ? charsetGuess[0] : null;
  170.             if (cs == null) {
  171.                 cs = UTF_8;
  172.             }
  173.             try {
  174.                 return decodeNoFallback(cs, buf, startOffset, endOffset);
  175.             } catch (CharacterCodingException cee) {
  176.                 // Try the much slower, more-memory intensive version which
  177.                 // can handle a character set conversion patch.
  178.             }
  179.         }

  180.         final StringBuilder r = new StringBuilder(endOffset - startOffset);

  181.         // Always treat the headers as US-ASCII; Git file names are encoded
  182.         // in a C style escape if any character has the high-bit set.
  183.         //
  184.         final int hdrEnd = getHunks().get(0).getStartOffset();
  185.         for (int ptr = startOffset; ptr < hdrEnd;) {
  186.             final int eol = Math.min(hdrEnd, nextLF(buf, ptr));
  187.             r.append(extractBinaryString(buf, ptr, eol));
  188.             ptr = eol;
  189.         }

  190.         final String[] files = extractFileLines(charsetGuess);
  191.         final int[] offsets = new int[files.length];
  192.         for (HunkHeader h : getHunks())
  193.             h.extractFileLines(r, files, offsets);
  194.         return r.toString();
  195.     }

  196.     private static boolean trySimpleConversion(Charset[] charsetGuess) {
  197.         if (charsetGuess == null)
  198.             return true;
  199.         for (int i = 1; i < charsetGuess.length; i++) {
  200.             if (charsetGuess[i] != charsetGuess[0])
  201.                 return false;
  202.         }
  203.         return true;
  204.     }

  205.     private String[] extractFileLines(Charset[] csGuess) {
  206.         final TemporaryBuffer[] tmp = new TemporaryBuffer[getParentCount() + 1];
  207.         try {
  208.             for (int i = 0; i < tmp.length; i++)
  209.                 tmp[i] = new TemporaryBuffer.Heap(Integer.MAX_VALUE);
  210.             for (HunkHeader h : getHunks())
  211.                 h.extractFileLines(tmp);

  212.             final String[] r = new String[tmp.length];
  213.             for (int i = 0; i < tmp.length; i++) {
  214.                 Charset cs = csGuess != null ? csGuess[i] : null;
  215.                 if (cs == null) {
  216.                     cs = UTF_8;
  217.                 }
  218.                 r[i] = RawParseUtils.decode(cs, tmp[i].toByteArray());
  219.             }
  220.             return r;
  221.         } catch (IOException ioe) {
  222.             throw new RuntimeException(JGitText.get().cannotConvertScriptToText, ioe);
  223.         }
  224.     }

  225.     /**
  226.      * Get style of patch used to modify this file.
  227.      *
  228.      * @return style of patch used to modify this file.
  229.      */
  230.     public PatchType getPatchType() {
  231.         return patchType;
  232.     }

  233.     /**
  234.      * Whether this patch modifies metadata about a file
  235.      *
  236.      * @return {@code true} if this patch modifies metadata about a file .
  237.      */
  238.     public boolean hasMetaDataChanges() {
  239.         return changeType != ChangeType.MODIFY || newMode != oldMode;
  240.     }

  241.     /**
  242.      * Get hunks altering this file; in order of appearance in patch
  243.      *
  244.      * @return hunks altering this file; in order of appearance in patch.
  245.      */
  246.     public List<? extends HunkHeader> getHunks() {
  247.         if (hunks == null)
  248.             return Collections.emptyList();
  249.         return hunks;
  250.     }

  251.     void addHunk(HunkHeader h) {
  252.         if (h.getFileHeader() != this)
  253.             throw new IllegalArgumentException(JGitText.get().hunkBelongsToAnotherFile);
  254.         if (hunks == null)
  255.             hunks = new ArrayList<>();
  256.         hunks.add(h);
  257.     }

  258.     HunkHeader newHunkHeader(int offset) {
  259.         return new HunkHeader(this, offset);
  260.     }

  261.     /**
  262.      * Get the new-image delta/literal if this is a
  263.      * {@link PatchType#GIT_BINARY}.
  264.      *
  265.      * @return the new-image delta/literal if this is a
  266.      *         {@link PatchType#GIT_BINARY}.
  267.      */
  268.     public BinaryHunk getForwardBinaryHunk() {
  269.         return forwardBinaryHunk;
  270.     }

  271.     /**
  272.      * Get the old-image delta/literal if this is a
  273.      * {@link PatchType#GIT_BINARY}.
  274.      *
  275.      * @return the old-image delta/literal if this is a
  276.      *         {@link PatchType#GIT_BINARY}.
  277.      */
  278.     public BinaryHunk getReverseBinaryHunk() {
  279.         return reverseBinaryHunk;
  280.     }

  281.     /**
  282.      * Convert to a list describing the content edits performed on this file.
  283.      *
  284.      * @return a list describing the content edits performed on this file.
  285.      */
  286.     public EditList toEditList() {
  287.         final EditList r = new EditList();
  288.         for (HunkHeader hunk : hunks)
  289.             r.addAll(hunk.toEditList());
  290.         return r;
  291.     }

  292.     /**
  293.      * Parse a "diff --git" or "diff --cc" line.
  294.      *
  295.      * @param ptr
  296.      *            first character after the "diff --git " or "diff --cc " part.
  297.      * @param end
  298.      *            one past the last position to parse.
  299.      * @return first character after the LF at the end of the line; -1 on error.
  300.      */
  301.     int parseGitFileName(int ptr, int end) {
  302.         final int eol = nextLF(buf, ptr);
  303.         final int bol = ptr;
  304.         if (eol >= end) {
  305.             return -1;
  306.         }

  307.         // buffer[ptr..eol] looks like "a/foo b/foo\n". After the first
  308.         // A regex to match this is "^[^/]+/(.*?) [^/+]+/\1\n$". There
  309.         // is only one way to split the line such that text to the left
  310.         // of the space matches the text to the right, excluding the part
  311.         // before the first slash.
  312.         //

  313.         final int aStart = nextLF(buf, ptr, '/');
  314.         if (aStart >= eol)
  315.             return eol;

  316.         while (ptr < eol) {
  317.             final int sp = nextLF(buf, ptr, ' ');
  318.             if (sp >= eol) {
  319.                 // We can't split the header, it isn't valid.
  320.                 // This may be OK if this is a rename patch.
  321.                 //
  322.                 return eol;
  323.             }
  324.             final int bStart = nextLF(buf, sp, '/');
  325.             if (bStart >= eol)
  326.                 return eol;

  327.             // If buffer[aStart..sp - 1] = buffer[bStart..eol - 1]
  328.             // we have a valid split.
  329.             //
  330.             if (eq(aStart, sp - 1, bStart, eol - 1)) {
  331.                 if (buf[bol] == '"') {
  332.                     // We're a double quoted name. The region better end
  333.                     // in a double quote too, and we need to decode the
  334.                     // characters before reading the name.
  335.                     //
  336.                     if (buf[sp - 2] != '"') {
  337.                         return eol;
  338.                     }
  339.                     oldPath = QuotedString.GIT_PATH.dequote(buf, bol, sp - 1);
  340.                     oldPath = p1(oldPath);
  341.                 } else {
  342.                     oldPath = decode(UTF_8, buf, aStart, sp - 1);
  343.                 }
  344.                 newPath = oldPath;
  345.                 return eol;
  346.             }

  347.             // This split wasn't correct. Move past the space and try
  348.             // another split as the space must be part of the file name.
  349.             //
  350.             ptr = sp;
  351.         }

  352.         return eol;
  353.     }

  354.     int parseGitHeaders(int ptr, int end) {
  355.         while (ptr < end) {
  356.             final int eol = nextLF(buf, ptr);
  357.             if (isHunkHdr(buf, ptr, eol) >= 1) {
  358.                 // First hunk header; break out and parse them later.
  359.                 break;

  360.             } else if (match(buf, ptr, OLD_NAME) >= 0) {
  361.                 parseOldName(ptr, eol);

  362.             } else if (match(buf, ptr, NEW_NAME) >= 0) {
  363.                 parseNewName(ptr, eol);

  364.             } else if (match(buf, ptr, OLD_MODE) >= 0) {
  365.                 oldMode = parseFileMode(ptr + OLD_MODE.length, eol);

  366.             } else if (match(buf, ptr, NEW_MODE) >= 0) {
  367.                 newMode = parseFileMode(ptr + NEW_MODE.length, eol);

  368.             } else if (match(buf, ptr, DELETED_FILE_MODE) >= 0) {
  369.                 oldMode = parseFileMode(ptr + DELETED_FILE_MODE.length, eol);
  370.                 newMode = FileMode.MISSING;
  371.                 changeType = ChangeType.DELETE;

  372.             } else if (match(buf, ptr, NEW_FILE_MODE) >= 0) {
  373.                 parseNewFileMode(ptr, eol);

  374.             } else if (match(buf, ptr, COPY_FROM) >= 0) {
  375.                 oldPath = parseName(oldPath, ptr + COPY_FROM.length, eol);
  376.                 changeType = ChangeType.COPY;

  377.             } else if (match(buf, ptr, COPY_TO) >= 0) {
  378.                 newPath = parseName(newPath, ptr + COPY_TO.length, eol);
  379.                 changeType = ChangeType.COPY;

  380.             } else if (match(buf, ptr, RENAME_OLD) >= 0) {
  381.                 oldPath = parseName(oldPath, ptr + RENAME_OLD.length, eol);
  382.                 changeType = ChangeType.RENAME;

  383.             } else if (match(buf, ptr, RENAME_NEW) >= 0) {
  384.                 newPath = parseName(newPath, ptr + RENAME_NEW.length, eol);
  385.                 changeType = ChangeType.RENAME;

  386.             } else if (match(buf, ptr, RENAME_FROM) >= 0) {
  387.                 oldPath = parseName(oldPath, ptr + RENAME_FROM.length, eol);
  388.                 changeType = ChangeType.RENAME;

  389.             } else if (match(buf, ptr, RENAME_TO) >= 0) {
  390.                 newPath = parseName(newPath, ptr + RENAME_TO.length, eol);
  391.                 changeType = ChangeType.RENAME;

  392.             } else if (match(buf, ptr, SIMILARITY_INDEX) >= 0) {
  393.                 score = parseBase10(buf, ptr + SIMILARITY_INDEX.length, null);

  394.             } else if (match(buf, ptr, DISSIMILARITY_INDEX) >= 0) {
  395.                 score = parseBase10(buf, ptr + DISSIMILARITY_INDEX.length, null);

  396.             } else if (match(buf, ptr, INDEX) >= 0) {
  397.                 parseIndexLine(ptr + INDEX.length, eol);

  398.             } else {
  399.                 // Probably an empty patch (stat dirty).
  400.                 break;
  401.             }

  402.             ptr = eol;
  403.         }
  404.         return ptr;
  405.     }

  406.     void parseOldName(int ptr, int eol) {
  407.         oldPath = p1(parseName(oldPath, ptr + OLD_NAME.length, eol));
  408.         if (oldPath == DEV_NULL)
  409.             changeType = ChangeType.ADD;
  410.     }

  411.     void parseNewName(int ptr, int eol) {
  412.         newPath = p1(parseName(newPath, ptr + NEW_NAME.length, eol));
  413.         if (newPath == DEV_NULL)
  414.             changeType = ChangeType.DELETE;
  415.     }

  416.     void parseNewFileMode(int ptr, int eol) {
  417.         oldMode = FileMode.MISSING;
  418.         newMode = parseFileMode(ptr + NEW_FILE_MODE.length, eol);
  419.         changeType = ChangeType.ADD;
  420.     }

  421.     int parseTraditionalHeaders(int ptr, int end) {
  422.         while (ptr < end) {
  423.             final int eol = nextLF(buf, ptr);
  424.             if (isHunkHdr(buf, ptr, eol) >= 1) {
  425.                 // First hunk header; break out and parse them later.
  426.                 break;

  427.             } else if (match(buf, ptr, OLD_NAME) >= 0) {
  428.                 parseOldName(ptr, eol);

  429.             } else if (match(buf, ptr, NEW_NAME) >= 0) {
  430.                 parseNewName(ptr, eol);

  431.             } else {
  432.                 // Possibly an empty patch.
  433.                 break;
  434.             }

  435.             ptr = eol;
  436.         }
  437.         return ptr;
  438.     }

  439.     private String parseName(String expect, int ptr, int end) {
  440.         if (ptr == end)
  441.             return expect;

  442.         String r;
  443.         if (buf[ptr] == '"') {
  444.             // New style GNU diff format
  445.             //
  446.             r = QuotedString.GIT_PATH.dequote(buf, ptr, end - 1);
  447.         } else {
  448.             // Older style GNU diff format, an optional tab ends the name.
  449.             //
  450.             int tab = end;
  451.             while (ptr < tab && buf[tab - 1] != '\t')
  452.                 tab--;
  453.             if (ptr == tab)
  454.                 tab = end;
  455.             r = decode(UTF_8, buf, ptr, tab - 1);
  456.         }

  457.         if (r.equals(DEV_NULL))
  458.             r = DEV_NULL;
  459.         return r;
  460.     }

  461.     private static String p1(final String r) {
  462.         final int s = r.indexOf('/');
  463.         return s > 0 ? r.substring(s + 1) : r;
  464.     }

  465.     FileMode parseFileMode(int ptr, int end) {
  466.         int tmp = 0;
  467.         while (ptr < end - 1) {
  468.             tmp <<= 3;
  469.             tmp += buf[ptr++] - '0';
  470.         }
  471.         return FileMode.fromBits(tmp);
  472.     }

  473.     void parseIndexLine(int ptr, int end) {
  474.         // "index $asha1..$bsha1[ $mode]" where $asha1 and $bsha1
  475.         // can be unique abbreviations
  476.         //
  477.         final int dot2 = nextLF(buf, ptr, '.');
  478.         final int mode = nextLF(buf, dot2, ' ');

  479.         oldId = AbbreviatedObjectId.fromString(buf, ptr, dot2 - 1);
  480.         newId = AbbreviatedObjectId.fromString(buf, dot2 + 1, mode - 1);

  481.         if (mode < end)
  482.             newMode = oldMode = parseFileMode(mode, end);
  483.     }

  484.     private boolean eq(int aPtr, int aEnd, int bPtr, int bEnd) {
  485.         if (aEnd - aPtr != bEnd - bPtr) {
  486.             return false;
  487.         }
  488.         while (aPtr < aEnd) {
  489.             if (buf[aPtr++] != buf[bPtr++])
  490.                 return false;
  491.         }
  492.         return true;
  493.     }

  494.     /**
  495.      * Determine if this is a patch hunk header.
  496.      *
  497.      * @param buf
  498.      *            the buffer to scan
  499.      * @param start
  500.      *            first position in the buffer to evaluate
  501.      * @param end
  502.      *            last position to consider; usually the end of the buffer (
  503.      *            <code>buf.length</code>) or the first position on the next
  504.      *            line. This is only used to avoid very long runs of '@' from
  505.      *            killing the scan loop.
  506.      * @return the number of "ancestor revisions" in the hunk header. A
  507.      *         traditional two-way diff ("@@ -...") returns 1; a combined diff
  508.      *         for a 3 way-merge returns 3. If this is not a hunk header, 0 is
  509.      *         returned instead.
  510.      */
  511.     static int isHunkHdr(byte[] buf, int start, int end) {
  512.         int ptr = start;
  513.         while (ptr < end && buf[ptr] == '@')
  514.             ptr++;
  515.         if (ptr - start < 2)
  516.             return 0;
  517.         if (ptr == end || buf[ptr++] != ' ')
  518.             return 0;
  519.         if (ptr == end || buf[ptr++] != '-')
  520.             return 0;
  521.         return (ptr - 3) - start;
  522.     }
  523. }