RawParseUtils.java

  1. /*
  2.  * Copyright (C) 2008-2009, Google Inc.
  3.  * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others
  4.  *
  5.  * This program and the accompanying materials are made available under the
  6.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  7.  * https://www.eclipse.org/org/documents/edl-v10.php.
  8.  *
  9.  * SPDX-License-Identifier: BSD-3-Clause
  10.  */

  11. package org.eclipse.jgit.util;

  12. import static java.nio.charset.StandardCharsets.ISO_8859_1;
  13. import static java.nio.charset.StandardCharsets.UTF_8;
  14. import static org.eclipse.jgit.lib.ObjectChecker.author;
  15. import static org.eclipse.jgit.lib.ObjectChecker.committer;
  16. import static org.eclipse.jgit.lib.ObjectChecker.encoding;
  17. import static org.eclipse.jgit.lib.ObjectChecker.tagger;

  18. import java.nio.ByteBuffer;
  19. import java.nio.charset.CharacterCodingException;
  20. import java.nio.charset.Charset;
  21. import java.nio.charset.CharsetDecoder;
  22. import java.nio.charset.CodingErrorAction;
  23. import java.nio.charset.IllegalCharsetNameException;
  24. import java.nio.charset.UnsupportedCharsetException;
  25. import java.util.Arrays;
  26. import java.util.HashMap;
  27. import java.util.Map;

  28. import org.eclipse.jgit.annotations.Nullable;
  29. import org.eclipse.jgit.diff.RawText;
  30. import org.eclipse.jgit.errors.BinaryBlobException;
  31. import org.eclipse.jgit.lib.Constants;
  32. import org.eclipse.jgit.lib.PersonIdent;

  33. /**
  34.  * Handy utility functions to parse raw object contents.
  35.  */
  36. public final class RawParseUtils {
    /**
     * UTF-8 charset constant.
     * <p>
     * This field is an alias of
     * {@link java.nio.charset.StandardCharsets#UTF_8}; both refer to the same
     * {@link java.nio.charset.Charset} instance.
     *
     * @since 2.2
     * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
     */
    @Deprecated
    public static final Charset UTF8_CHARSET = UTF_8;

  45.     private static final byte[] digits10;

  46.     private static final byte[] digits16;

  47.     private static final byte[] footerLineKeyChars;

  48.     private static final Map<String, Charset> encodingAliases;

  49.     static {
  50.         encodingAliases = new HashMap<>();
  51.         encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
  52.         encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$

  53.         digits10 = new byte['9' + 1];
  54.         Arrays.fill(digits10, (byte) -1);
  55.         for (char i = '0'; i <= '9'; i++)
  56.             digits10[i] = (byte) (i - '0');

  57.         digits16 = new byte['f' + 1];
  58.         Arrays.fill(digits16, (byte) -1);
  59.         for (char i = '0'; i <= '9'; i++)
  60.             digits16[i] = (byte) (i - '0');
  61.         for (char i = 'a'; i <= 'f'; i++)
  62.             digits16[i] = (byte) ((i - 'a') + 10);
  63.         for (char i = 'A'; i <= 'F'; i++)
  64.             digits16[i] = (byte) ((i - 'A') + 10);

  65.         footerLineKeyChars = new byte['z' + 1];
  66.         footerLineKeyChars['-'] = 1;
  67.         for (char i = '0'; i <= '9'; i++)
  68.             footerLineKeyChars[i] = 1;
  69.         for (char i = 'A'; i <= 'Z'; i++)
  70.             footerLineKeyChars[i] = 1;
  71.         for (char i = 'a'; i <= 'z'; i++)
  72.             footerLineKeyChars[i] = 1;
  73.     }

  74.     /**
  75.      * Determine if b[ptr] matches src.
  76.      *
  77.      * @param b
  78.      *            the buffer to scan.
  79.      * @param ptr
  80.      *            first position within b, this should match src[0].
  81.      * @param src
  82.      *            the buffer to test for equality with b.
  83.      * @return ptr + src.length if b[ptr..src.length] == src; else -1.
  84.      */
  85.     public static final int match(byte[] b, int ptr, byte[] src) {
  86.         if (ptr + src.length > b.length)
  87.             return -1;
  88.         for (int i = 0; i < src.length; i++, ptr++)
  89.             if (b[ptr] != src[i])
  90.                 return -1;
  91.         return ptr;
  92.     }

  93.     private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  94.             '6', '7', '8', '9' };

  95.     /**
  96.      * Format a base 10 numeric into a temporary buffer.
  97.      * <p>
  98.      * Formatting is performed backwards. The method starts at offset
  99.      * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  100.      * <code>digits</code> is the number of positions necessary to store the
  101.      * base 10 value.
  102.      * <p>
  103.      * The argument and return values from this method make it easy to chain
  104.      * writing, for example:
  105.      * </p>
  106.      *
  107.      * <pre>
  108.      * final byte[] tmp = new byte[64];
  109.      * int ptr = tmp.length;
  110.      * tmp[--ptr] = '\n';
  111.      * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  112.      * tmp[--ptr] = ' ';
  113.      * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  114.      * tmp[--ptr] = 0;
  115.      * final String str = new String(tmp, ptr, tmp.length - ptr);
  116.      * </pre>
  117.      *
  118.      * @param b
  119.      *            buffer to write into.
  120.      * @param o
  121.      *            one offset past the location where writing will begin; writing
  122.      *            proceeds towards lower index values.
  123.      * @param value
  124.      *            the value to store.
  125.      * @return the new offset value <code>o</code>. This is the position of
  126.      *         the last byte written. Additional writing should start at one
  127.      *         position earlier.
  128.      */
  129.     public static int formatBase10(final byte[] b, int o, int value) {
  130.         if (value == 0) {
  131.             b[--o] = '0';
  132.             return o;
  133.         }
  134.         final boolean isneg = value < 0;
  135.         if (isneg)
  136.             value = -value;
  137.         while (value != 0) {
  138.             b[--o] = base10byte[value % 10];
  139.             value /= 10;
  140.         }
  141.         if (isneg)
  142.             b[--o] = '-';
  143.         return o;
  144.     }

  145.     /**
  146.      * Parse a base 10 numeric from a sequence of ASCII digits into an int.
  147.      * <p>
  148.      * Digit sequences can begin with an optional run of spaces before the
  149.      * sequence, and may start with a '+' or a '-' to indicate sign position.
  150.      * Any other characters will cause the method to stop and return the current
  151.      * result to the caller.
  152.      *
  153.      * @param b
  154.      *            buffer to scan.
  155.      * @param ptr
  156.      *            position within buffer to start parsing digits at.
  157.      * @param ptrResult
  158.      *            optional location to return the new ptr value through. If null
  159.      *            the ptr value will be discarded.
  160.      * @return the value at this location; 0 if the location is not a valid
  161.      *         numeric.
  162.      */
  163.     public static final int parseBase10(final byte[] b, int ptr,
  164.             final MutableInteger ptrResult) {
  165.         int r = 0;
  166.         int sign = 0;
  167.         try {
  168.             final int sz = b.length;
  169.             while (ptr < sz && b[ptr] == ' ')
  170.                 ptr++;
  171.             if (ptr >= sz)
  172.                 return 0;

  173.             switch (b[ptr]) {
  174.             case '-':
  175.                 sign = -1;
  176.                 ptr++;
  177.                 break;
  178.             case '+':
  179.                 ptr++;
  180.                 break;
  181.             }

  182.             while (ptr < sz) {
  183.                 final byte v = digits10[b[ptr]];
  184.                 if (v < 0)
  185.                     break;
  186.                 r = (r * 10) + v;
  187.                 ptr++;
  188.             }
  189.         } catch (ArrayIndexOutOfBoundsException e) {
  190.             // Not a valid digit.
  191.         }
  192.         if (ptrResult != null)
  193.             ptrResult.value = ptr;
  194.         return sign < 0 ? -r : r;
  195.     }

  196.     /**
  197.      * Parse a base 10 numeric from a sequence of ASCII digits into a long.
  198.      * <p>
  199.      * Digit sequences can begin with an optional run of spaces before the
  200.      * sequence, and may start with a '+' or a '-' to indicate sign position.
  201.      * Any other characters will cause the method to stop and return the current
  202.      * result to the caller.
  203.      *
  204.      * @param b
  205.      *            buffer to scan.
  206.      * @param ptr
  207.      *            position within buffer to start parsing digits at.
  208.      * @param ptrResult
  209.      *            optional location to return the new ptr value through. If null
  210.      *            the ptr value will be discarded.
  211.      * @return the value at this location; 0 if the location is not a valid
  212.      *         numeric.
  213.      */
  214.     public static final long parseLongBase10(final byte[] b, int ptr,
  215.             final MutableInteger ptrResult) {
  216.         long r = 0;
  217.         int sign = 0;
  218.         try {
  219.             final int sz = b.length;
  220.             while (ptr < sz && b[ptr] == ' ')
  221.                 ptr++;
  222.             if (ptr >= sz)
  223.                 return 0;

  224.             switch (b[ptr]) {
  225.             case '-':
  226.                 sign = -1;
  227.                 ptr++;
  228.                 break;
  229.             case '+':
  230.                 ptr++;
  231.                 break;
  232.             }

  233.             while (ptr < sz) {
  234.                 final byte v = digits10[b[ptr]];
  235.                 if (v < 0)
  236.                     break;
  237.                 r = (r * 10) + v;
  238.                 ptr++;
  239.             }
  240.         } catch (ArrayIndexOutOfBoundsException e) {
  241.             // Not a valid digit.
  242.         }
  243.         if (ptrResult != null)
  244.             ptrResult.value = ptr;
  245.         return sign < 0 ? -r : r;
  246.     }

  247.     /**
  248.      * Parse 4 character base 16 (hex) formatted string to unsigned integer.
  249.      * <p>
  250.      * The number is read in network byte order, that is, most significant
  251.      * nybble first.
  252.      *
  253.      * @param bs
  254.      *            buffer to parse digits from; positions {@code [p, p+4)} will
  255.      *            be parsed.
  256.      * @param p
  257.      *            first position within the buffer to parse.
  258.      * @return the integer value.
  259.      * @throws java.lang.ArrayIndexOutOfBoundsException
  260.      *             if the string is not hex formatted.
  261.      */
  262.     public static final int parseHexInt16(final byte[] bs, final int p) {
  263.         int r = digits16[bs[p]] << 4;

  264.         r |= digits16[bs[p + 1]];
  265.         r <<= 4;

  266.         r |= digits16[bs[p + 2]];
  267.         r <<= 4;

  268.         r |= digits16[bs[p + 3]];
  269.         if (r < 0)
  270.             throw new ArrayIndexOutOfBoundsException();
  271.         return r;
  272.     }

  273.     /**
  274.      * Parse 8 character base 16 (hex) formatted string to unsigned integer.
  275.      * <p>
  276.      * The number is read in network byte order, that is, most significant
  277.      * nybble first.
  278.      *
  279.      * @param bs
  280.      *            buffer to parse digits from; positions {@code [p, p+8)} will
  281.      *            be parsed.
  282.      * @param p
  283.      *            first position within the buffer to parse.
  284.      * @return the integer value.
  285.      * @throws java.lang.ArrayIndexOutOfBoundsException
  286.      *             if the string is not hex formatted.
  287.      */
  288.     public static final int parseHexInt32(final byte[] bs, final int p) {
  289.         int r = digits16[bs[p]] << 4;

  290.         r |= digits16[bs[p + 1]];
  291.         r <<= 4;

  292.         r |= digits16[bs[p + 2]];
  293.         r <<= 4;

  294.         r |= digits16[bs[p + 3]];
  295.         r <<= 4;

  296.         r |= digits16[bs[p + 4]];
  297.         r <<= 4;

  298.         r |= digits16[bs[p + 5]];
  299.         r <<= 4;

  300.         r |= digits16[bs[p + 6]];

  301.         final int last = digits16[bs[p + 7]];
  302.         if (r < 0 || last < 0)
  303.             throw new ArrayIndexOutOfBoundsException();
  304.         return (r << 4) | last;
  305.     }

  306.     /**
  307.      * Parse 16 character base 16 (hex) formatted string to unsigned long.
  308.      * <p>
  309.      * The number is read in network byte order, that is, most significant
  310.      * nibble first.
  311.      *
  312.      * @param bs
  313.      *            buffer to parse digits from; positions {@code [p, p+16)} will
  314.      *            be parsed.
  315.      * @param p
  316.      *            first position within the buffer to parse.
  317.      * @return the integer value.
  318.      * @throws java.lang.ArrayIndexOutOfBoundsException
  319.      *             if the string is not hex formatted.
  320.      * @since 4.3
  321.      */
  322.     public static final long parseHexInt64(final byte[] bs, final int p) {
  323.         long r = digits16[bs[p]] << 4;

  324.         r |= digits16[bs[p + 1]];
  325.         r <<= 4;

  326.         r |= digits16[bs[p + 2]];
  327.         r <<= 4;

  328.         r |= digits16[bs[p + 3]];
  329.         r <<= 4;

  330.         r |= digits16[bs[p + 4]];
  331.         r <<= 4;

  332.         r |= digits16[bs[p + 5]];
  333.         r <<= 4;

  334.         r |= digits16[bs[p + 6]];
  335.         r <<= 4;

  336.         r |= digits16[bs[p + 7]];
  337.         r <<= 4;

  338.         r |= digits16[bs[p + 8]];
  339.         r <<= 4;

  340.         r |= digits16[bs[p + 9]];
  341.         r <<= 4;

  342.         r |= digits16[bs[p + 10]];
  343.         r <<= 4;

  344.         r |= digits16[bs[p + 11]];
  345.         r <<= 4;

  346.         r |= digits16[bs[p + 12]];
  347.         r <<= 4;

  348.         r |= digits16[bs[p + 13]];
  349.         r <<= 4;

  350.         r |= digits16[bs[p + 14]];

  351.         final int last = digits16[bs[p + 15]];
  352.         if (r < 0 || last < 0)
  353.             throw new ArrayIndexOutOfBoundsException();
  354.         return (r << 4) | last;
  355.     }

  356.     /**
  357.      * Parse a single hex digit to its numeric value (0-15).
  358.      *
  359.      * @param digit
  360.      *            hex character to parse.
  361.      * @return numeric value, in the range 0-15.
  362.      * @throws java.lang.ArrayIndexOutOfBoundsException
  363.      *             if the input digit is not a valid hex digit.
  364.      */
  365.     public static final int parseHexInt4(final byte digit) {
  366.         final byte r = digits16[digit];
  367.         if (r < 0)
  368.             throw new ArrayIndexOutOfBoundsException();
  369.         return r;
  370.     }

    /**
     * Parse a Git style timezone string.
     * <p>
     * The sequence "-0315" will be parsed as the numeric value -195, as the
     * lower two positions count minutes, not 100ths of an hour.
     * <p>
     * This is a convenience overload of
     * {@link #parseTimeZoneOffset(byte[], int, MutableInteger)} that does not
     * report the position at which parsing stopped.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @return the timezone at this location, expressed in minutes.
     */
    public static final int parseTimeZoneOffset(byte[] b, int ptr) {
        // null: the caller is not interested in the end position.
        return parseTimeZoneOffset(b, ptr, null);
    }

  386.     /**
  387.      * Parse a Git style timezone string.
  388.      * <p>
  389.      * The sequence "-0315" will be parsed as the numeric value -195, as the
  390.      * lower two positions count minutes, not 100ths of an hour.
  391.      *
  392.      * @param b
  393.      *            buffer to scan.
  394.      * @param ptr
  395.      *            position within buffer to start parsing digits at.
  396.      * @param ptrResult
  397.      *            optional location to return the new ptr value through. If null
  398.      *            the ptr value will be discarded.
  399.      * @return the timezone at this location, expressed in minutes.
  400.      * @since 4.1
  401.      */
  402.     public static final int parseTimeZoneOffset(final byte[] b, int ptr,
  403.             MutableInteger ptrResult) {
  404.         final int v = parseBase10(b, ptr, ptrResult);
  405.         final int tzMins = v % 100;
  406.         final int tzHours = v / 100;
  407.         return tzHours * 60 + tzMins;
  408.     }

  409.     /**
  410.      * Locate the first position after a given character.
  411.      *
  412.      * @param b
  413.      *            buffer to scan.
  414.      * @param ptr
  415.      *            position within buffer to start looking for chrA at.
  416.      * @param chrA
  417.      *            character to find.
  418.      * @return new position just after chrA.
  419.      */
  420.     public static final int next(byte[] b, int ptr, char chrA) {
  421.         final int sz = b.length;
  422.         while (ptr < sz) {
  423.             if (b[ptr++] == chrA)
  424.                 return ptr;
  425.         }
  426.         return ptr;
  427.     }

    /**
     * Locate the first position after the next LF.
     * <p>
     * This method stops on the first '\n' it finds. It is equivalent to
     * {@link #next(byte[], int, char)} with {@code '\n'} as the character to
     * find.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for LF at.
     * @return new position just after the first LF found.
     */
    public static final int nextLF(byte[] b, int ptr) {
        return next(b, ptr, '\n');
    }

  442.     /**
  443.      * Locate the first position after either the given character or LF.
  444.      * <p>
  445.      * This method stops on the first match it finds from either chrA or '\n'.
  446.      *
  447.      * @param b
  448.      *            buffer to scan.
  449.      * @param ptr
  450.      *            position within buffer to start looking for chrA or LF at.
  451.      * @param chrA
  452.      *            character to find.
  453.      * @return new position just after the first chrA or LF to be found.
  454.      */
  455.     public static final int nextLF(byte[] b, int ptr, char chrA) {
  456.         final int sz = b.length;
  457.         while (ptr < sz) {
  458.             final byte c = b[ptr++];
  459.             if (c == chrA || c == '\n')
  460.                 return ptr;
  461.         }
  462.         return ptr;
  463.     }

  464.     /**
  465.      * Locate the end of the header.  Note that headers may be
  466.      * more than one line long.
  467.      * @param b
  468.      *            buffer to scan.
  469.      * @param ptr
  470.      *            position within buffer to start looking for the end-of-header.
  471.      * @return new position just after the header.  This is either
  472.      * b.length, or the index of the header's terminating newline.
  473.      * @since 5.1
  474.      */
  475.     public static final int headerEnd(final byte[] b, int ptr) {
  476.         final int sz = b.length;
  477.         while (ptr < sz) {
  478.             final byte c = b[ptr++];
  479.             if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
  480.                 return ptr - 1;
  481.             }
  482.         }
  483.         return ptr - 1;
  484.     }

  485.     /**
  486.      * Find the start of the contents of a given header.
  487.      *
  488.      * @param b
  489.      *            buffer to scan.
  490.      * @param headerName
  491.      *            header to search for
  492.      * @param ptr
  493.      *            position within buffer to start looking for header at.
  494.      * @return new position at the start of the header's contents, -1 for
  495.      *         not found
  496.      * @since 5.1
  497.      */
  498.     public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
  499.         // Start by advancing to just past a LF or buffer start
  500.         if (ptr != 0) {
  501.             ptr = nextLF(b, ptr - 1);
  502.         }
  503.         while (ptr < b.length - (headerName.length + 1)) {
  504.             boolean found = true;
  505.             for (byte element : headerName) {
  506.                 if (element != b[ptr++]) {
  507.                     found = false;
  508.                     break;
  509.                 }
  510.             }
  511.             if (found && b[ptr++] == ' ') {
  512.                 return ptr;
  513.             }
  514.             ptr = nextLF(b, ptr);
  515.         }
  516.         return -1;
  517.     }

  518.     /**
  519.      * Locate the first position before a given character.
  520.      *
  521.      * @param b
  522.      *            buffer to scan.
  523.      * @param ptr
  524.      *            position within buffer to start looking for chrA at.
  525.      * @param chrA
  526.      *            character to find.
  527.      * @return new position just before chrA, -1 for not found
  528.      */
  529.     public static final int prev(byte[] b, int ptr, char chrA) {
  530.         if (ptr == b.length)
  531.             --ptr;
  532.         while (ptr >= 0) {
  533.             if (b[ptr--] == chrA)
  534.                 return ptr;
  535.         }
  536.         return ptr;
  537.     }

    /**
     * Locate the first position before the previous LF.
     * <p>
     * This method stops on the first '\n' it finds. It is equivalent to
     * {@link #prev(byte[], int, char)} with {@code '\n'} as the character to
     * find.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for LF at.
     * @return new position just before the first LF found, -1 for not found
     */
    public static final int prevLF(byte[] b, int ptr) {
        return prev(b, ptr, '\n');
    }

  552.     /**
  553.      * Locate the previous position before either the given character or LF.
  554.      * <p>
  555.      * This method stops on the first match it finds from either chrA or '\n'.
  556.      *
  557.      * @param b
  558.      *            buffer to scan.
  559.      * @param ptr
  560.      *            position within buffer to start looking for chrA or LF at.
  561.      * @param chrA
  562.      *            character to find.
  563.      * @return new position just before the first chrA or LF to be found, -1 for
  564.      *         not found
  565.      */
  566.     public static final int prevLF(byte[] b, int ptr, char chrA) {
  567.         if (ptr == b.length)
  568.             --ptr;
  569.         while (ptr >= 0) {
  570.             final byte c = b[ptr--];
  571.             if (c == chrA || c == '\n')
  572.                 return ptr;
  573.         }
  574.         return ptr;
  575.     }

  576.     /**
  577.      * Index the region between <code>[ptr, end)</code> to find line starts.
  578.      * <p>
  579.      * The returned list is 1 indexed. Index 0 contains
  580.      * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
  581.      * <p>
  582.      * Using a 1 indexed list means that line numbers can be directly accessed
  583.      * from the list, so <code>list.get(1)</code> (aka get line 1) returns
  584.      * <code>ptr</code>.
  585.      * <p>
  586.      * The last element (index <code>map.size()-1</code>) always contains
  587.      * <code>end</code>.
  588.      *
  589.      * @param buf
  590.      *            buffer to scan.
  591.      * @param ptr
  592.      *            position within the buffer corresponding to the first byte of
  593.      *            line 1.
  594.      * @param end
  595.      *            1 past the end of the content within <code>buf</code>.
  596.      * @return a line map indicating the starting position of each line.
  597.      */
  598.     public static final IntList lineMap(byte[] buf, int ptr, int end) {
  599.         IntList map = new IntList((end - ptr) / 36);
  600.         map.fillTo(1, Integer.MIN_VALUE);
  601.         for (; ptr < end; ptr = nextLF(buf, ptr)) {
  602.             map.add(ptr);
  603.         }
  604.         map.add(end);
  605.         return map;
  606.     }

  607.     /**
  608.      * Like {@link #lineMap(byte[], int, int)} but throw
  609.      * {@link BinaryBlobException} if a NUL byte is encountered.
  610.      *
  611.      * @param buf
  612.      *            buffer to scan.
  613.      * @param ptr
  614.      *            position within the buffer corresponding to the first byte of
  615.      *            line 1.
  616.      * @param end
  617.      *            1 past the end of the content within <code>buf</code>.
  618.      * @return a line map indicating the starting position of each line.
  619.      * @throws BinaryBlobException
  620.      *             if a NUL byte or a lone CR is found.
  621.      * @since 5.0
  622.      */
  623.     public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
  624.             throws BinaryBlobException {
  625.         // Experimentally derived from multiple source repositories
  626.         // the average number of bytes/line is 36. Its a rough guess
  627.         // to initially size our map close to the target.
  628.         IntList map = new IntList((end - ptr) / 36);
  629.         map.add(Integer.MIN_VALUE);
  630.         byte last = '\n'; // Must be \n to add the initial ptr
  631.         for (; ptr < end; ptr++) {
  632.             if (last == '\n') {
  633.                 map.add(ptr);
  634.             }
  635.             byte curr = buf[ptr];
  636.             if (RawText.isBinary(curr, last)) {
  637.                 throw new BinaryBlobException();
  638.             }
  639.             last = curr;
  640.         }
  641.         if (last == '\r') {
  642.             // Counts as binary
  643.             throw new BinaryBlobException();
  644.         }
  645.         map.add(end);
  646.         return map;
  647.     }

  648.     /**
  649.      * Locate the "author " header line data.
  650.      *
  651.      * @param b
  652.      *            buffer to scan.
  653.      * @param ptr
  654.      *            position in buffer to start the scan at. Most callers should
  655.      *            pass 0 to ensure the scan starts from the beginning of the
  656.      *            commit buffer and does not accidentally look at message body.
  657.      * @return position just after the space in "author ", so the first
  658.      *         character of the author's name. If no author header can be
  659.      *         located -1 is returned.
  660.      */
  661.     public static final int author(byte[] b, int ptr) {
  662.         final int sz = b.length;
  663.         if (ptr == 0)
  664.             ptr += 46; // skip the "tree ..." line.
  665.         while (ptr < sz && b[ptr] == 'p')
  666.             ptr += 48; // skip this parent.
  667.         return match(b, ptr, author);
  668.     }

  669.     /**
  670.      * Locate the "committer " header line data.
  671.      *
  672.      * @param b
  673.      *            buffer to scan.
  674.      * @param ptr
  675.      *            position in buffer to start the scan at. Most callers should
  676.      *            pass 0 to ensure the scan starts from the beginning of the
  677.      *            commit buffer and does not accidentally look at message body.
  678.      * @return position just after the space in "committer ", so the first
  679.      *         character of the committer's name. If no committer header can be
  680.      *         located -1 is returned.
  681.      */
  682.     public static final int committer(byte[] b, int ptr) {
  683.         final int sz = b.length;
  684.         if (ptr == 0)
  685.             ptr += 46; // skip the "tree ..." line.
  686.         while (ptr < sz && b[ptr] == 'p')
  687.             ptr += 48; // skip this parent.
  688.         if (ptr < sz && b[ptr] == 'a')
  689.             ptr = nextLF(b, ptr);
  690.         return match(b, ptr, committer);
  691.     }

  692.     /**
  693.      * Locate the "tagger " header line data.
  694.      *
  695.      * @param b
  696.      *            buffer to scan.
  697.      * @param ptr
  698.      *            position in buffer to start the scan at. Most callers should
  699.      *            pass 0 to ensure the scan starts from the beginning of the tag
  700.      *            buffer and does not accidentally look at message body.
  701.      * @return position just after the space in "tagger ", so the first
  702.      *         character of the tagger's name. If no tagger header can be
  703.      *         located -1 is returned.
  704.      */
  705.     public static final int tagger(byte[] b, int ptr) {
  706.         final int sz = b.length;
  707.         if (ptr == 0)
  708.             ptr += 48; // skip the "object ..." line.
  709.         while (ptr < sz) {
  710.             if (b[ptr] == '\n')
  711.                 return -1;
  712.             final int m = match(b, ptr, tagger);
  713.             if (m >= 0)
  714.                 return m;
  715.             ptr = nextLF(b, ptr);
  716.         }
  717.         return -1;
  718.     }

  719.     /**
  720.      * Locate the "encoding " header line.
  721.      *
  722.      * @param b
  723.      *            buffer to scan.
  724.      * @param ptr
  725.      *            position in buffer to start the scan at. Most callers should
  726.      *            pass 0 to ensure the scan starts from the beginning of the
  727.      *            buffer and does not accidentally look at the message body.
  728.      * @return position just after the space in "encoding ", so the first
  729.      *         character of the encoding's name. If no encoding header can be
  730.      *         located -1 is returned (and UTF-8 should be assumed).
  731.      */
  732.     public static final int encoding(byte[] b, int ptr) {
  733.         final int sz = b.length;
  734.         while (ptr < sz) {
  735.             if (b[ptr] == '\n')
  736.                 return -1;
  737.             if (b[ptr] == 'e')
  738.                 break;
  739.             ptr = nextLF(b, ptr);
  740.         }
  741.         return match(b, ptr, encoding);
  742.     }

  743.     /**
  744.      * Parse the "encoding " header as a string.
  745.      * <p>
  746.      * Locates the "encoding " header (if present) and returns its value.
  747.      *
  748.      * @param b
  749.      *            buffer to scan.
  750.      * @return the encoding header as specified in the commit; null if the
  751.      *         header was not present and should be assumed.
  752.      * @since 4.2
  753.      */
  754.     @Nullable
  755.     public static String parseEncodingName(byte[] b) {
  756.         int enc = encoding(b, 0);
  757.         if (enc < 0) {
  758.             return null;
  759.         }
  760.         int lf = nextLF(b, enc);
  761.         return decode(UTF_8, b, enc, lf - 1);
  762.     }

  763.     /**
  764.      * Parse the "encoding " header into a character set reference.
  765.      * <p>
  766.      * Locates the "encoding " header (if present) by first calling
  767.      * {@link #encoding(byte[], int)} and then returns the proper character set
  768.      * to apply to this buffer to evaluate its contents as character data.
  769.      * <p>
  770.      * If no encoding header is present {@code UTF-8} is assumed.
  771.      *
  772.      * @param b
  773.      *            buffer to scan.
  774.      * @return the Java character set representation. Never null.
  775.      * @throws IllegalCharsetNameException
  776.      *             if the character set requested by the encoding header is
  777.      *             malformed and unsupportable.
  778.      * @throws UnsupportedCharsetException
  779.      *             if the JRE does not support the character set requested by
  780.      *             the encoding header.
  781.      */
  782.     public static Charset parseEncoding(byte[] b) {
  783.         String enc = parseEncodingName(b);
  784.         if (enc == null) {
  785.             return UTF_8;
  786.         }

  787.         String name = enc.trim();
  788.         try {
  789.             return Charset.forName(name);
  790.         } catch (IllegalCharsetNameException
  791.                 | UnsupportedCharsetException badName) {
  792.             Charset aliased = charsetForAlias(name);
  793.             if (aliased != null) {
  794.                 return aliased;
  795.             }
  796.             throw badName;
  797.         }
  798.     }

  799.     /**
  800.      * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
  801.      * <p>
  802.      * Leading spaces won't be trimmed from the string, i.e. will show up in the
  803.      * parsed name afterwards.
  804.      *
  805.      * @param in
  806.      *            the string to parse a name from.
  807.      * @return the parsed identity or null in case the identity could not be
  808.      *         parsed.
  809.      */
  810.     public static PersonIdent parsePersonIdent(String in) {
  811.         return parsePersonIdent(Constants.encode(in), 0);
  812.     }

  813.     /**
  814.      * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
  815.      * <p>
  816.      * When passing in a value for <code>nameB</code> callers should use the
  817.      * return value of {@link #author(byte[], int)} or
  818.      * {@link #committer(byte[], int)}, as these methods provide the proper
  819.      * position within the buffer.
  820.      *
  821.      * @param raw
  822.      *            the buffer to parse character data from.
  823.      * @param nameB
  824.      *            first position of the identity information. This should be the
  825.      *            first position after the space which delimits the header field
  826.      *            name (e.g. "author" or "committer") from the rest of the
  827.      *            identity line.
  828.      * @return the parsed identity or null in case the identity could not be
  829.      *         parsed.
  830.      */
  831.     public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
  832.         Charset cs;
  833.         try {
  834.             cs = parseEncoding(raw);
  835.         } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
  836.             // Assume UTF-8 for person identities, usually this is correct.
  837.             // If not decode() will fall back to the ISO-8859-1 encoding.
  838.             cs = UTF_8;
  839.         }

  840.         final int emailB = nextLF(raw, nameB, '<');
  841.         final int emailE = nextLF(raw, emailB, '>');
  842.         if (emailB >= raw.length || raw[emailB] == '\n' ||
  843.                 (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
  844.             return null;

  845.         final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
  846.                 emailB - 2 : emailB - 1;
  847.         final String name = decode(cs, raw, nameB, nameEnd);
  848.         final String email = decode(cs, raw, emailB, emailE - 1);

  849.         // Start searching from end of line, as after first name-email pair,
  850.         // another name-email pair may occur. We will ignore all kinds of
  851.         // "junk" following the first email.
  852.         //
  853.         // We've to use (emailE - 1) for the case that raw[email] is LF,
  854.         // otherwise we would run too far. "-2" is necessary to position
  855.         // before the LF in case of LF termination resp. the penultimate
  856.         // character if there is no trailing LF.
  857.         final int tzBegin = lastIndexOfTrim(raw, ' ',
  858.                 nextLF(raw, emailE - 1) - 2) + 1;
  859.         if (tzBegin <= emailE) // No time/zone, still valid
  860.             return new PersonIdent(name, email, 0, 0);

  861.         final int whenBegin = Math.max(emailE,
  862.                 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
  863.         if (whenBegin >= tzBegin - 1) // No time/zone, still valid
  864.             return new PersonIdent(name, email, 0, 0);

  865.         final long when = parseLongBase10(raw, whenBegin, null);
  866.         final int tz = parseTimeZoneOffset(raw, tzBegin);
  867.         return new PersonIdent(name, email, when * 1000L, tz);
  868.     }

  869.     /**
  870.      * Parse a name data (e.g. as within a reflog) into a PersonIdent.
  871.      * <p>
  872.      * When passing in a value for <code>nameB</code> callers should use the
  873.      * return value of {@link #author(byte[], int)} or
  874.      * {@link #committer(byte[], int)}, as these methods provide the proper
  875.      * position within the buffer.
  876.      *
  877.      * @param raw
  878.      *            the buffer to parse character data from.
  879.      * @param nameB
  880.      *            first position of the identity information. This should be the
  881.      *            first position after the space which delimits the header field
  882.      *            name (e.g. "author" or "committer") from the rest of the
  883.      *            identity line.
  884.      * @return the parsed identity. Never null.
  885.      */
  886.     public static PersonIdent parsePersonIdentOnly(final byte[] raw,
  887.             final int nameB) {
  888.         int stop = nextLF(raw, nameB);
  889.         int emailB = nextLF(raw, nameB, '<');
  890.         int emailE = nextLF(raw, emailB, '>');
  891.         final String name;
  892.         final String email;
  893.         if (emailE < stop) {
  894.             email = decode(raw, emailB, emailE - 1);
  895.         } else {
  896.             email = "invalid"; //$NON-NLS-1$
  897.         }
  898.         if (emailB < stop)
  899.             name = decode(raw, nameB, emailB - 2);
  900.         else
  901.             name = decode(raw, nameB, stop);

  902.         final MutableInteger ptrout = new MutableInteger();
  903.         long when;
  904.         int tz;
  905.         if (emailE < stop) {
  906.             when = parseLongBase10(raw, emailE + 1, ptrout);
  907.             tz = parseTimeZoneOffset(raw, ptrout.value);
  908.         } else {
  909.             when = 0;
  910.             tz = 0;
  911.         }
  912.         return new PersonIdent(name, email, when * 1000L, tz);
  913.     }

  914.     /**
  915.      * Locate the end of a footer line key string.
  916.      * <p>
  917.      * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
  918.      * "Signed-off-by: A. U. Thor\n") then this method returns the position of
  919.      * the first ':'.
  920.      * <p>
  921.      * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
  922.      * then this method returns -1.
  923.      *
  924.      * @param raw
  925.      *            buffer to scan.
  926.      * @param ptr
  927.      *            first position within raw to consider as a footer line key.
  928.      * @return position of the ':' which terminates the footer line key if this
  929.      *         is otherwise a valid footer line key; otherwise -1.
  930.      */
  931.     public static int endOfFooterLineKey(byte[] raw, int ptr) {
  932.         try {
  933.             for (;;) {
  934.                 final byte c = raw[ptr];
  935.                 if (footerLineKeyChars[c] == 0) {
  936.                     if (c == ':')
  937.                         return ptr;
  938.                     return -1;
  939.                 }
  940.                 ptr++;
  941.             }
  942.         } catch (ArrayIndexOutOfBoundsException e) {
  943.             return -1;
  944.         }
  945.     }

  946.     /**
  947.      * Decode a buffer under UTF-8, if possible.
  948.      *
  949.      * If the byte stream cannot be decoded that way, the platform default is tried
  950.      * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  951.      *
  952.      * @param buffer
  953.      *            buffer to pull raw bytes from.
  954.      * @return a string representation of the range <code>[start,end)</code>,
  955.      *         after decoding the region through the specified character set.
  956.      */
  957.     public static String decode(byte[] buffer) {
  958.         return decode(buffer, 0, buffer.length);
  959.     }

  960.     /**
  961.      * Decode a buffer under UTF-8, if possible.
  962.      *
  963.      * If the byte stream cannot be decoded that way, the platform default is
  964.      * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  965.      *
  966.      * @param buffer
  967.      *            buffer to pull raw bytes from.
  968.      * @param start
  969.      *            start position in buffer
  970.      * @param end
  971.      *            one position past the last location within the buffer to take
  972.      *            data from.
  973.      * @return a string representation of the range <code>[start,end)</code>,
  974.      *         after decoding the region through the specified character set.
  975.      */
  976.     public static String decode(final byte[] buffer, final int start,
  977.             final int end) {
  978.         return decode(UTF_8, buffer, start, end);
  979.     }

  980.     /**
  981.      * Decode a buffer under the specified character set if possible.
  982.      *
  983.      * If the byte stream cannot be decoded that way, the platform default is tried
  984.      * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  985.      *
  986.      * @param cs
  987.      *            character set to use when decoding the buffer.
  988.      * @param buffer
  989.      *            buffer to pull raw bytes from.
  990.      * @return a string representation of the range <code>[start,end)</code>,
  991.      *         after decoding the region through the specified character set.
  992.      */
  993.     public static String decode(Charset cs, byte[] buffer) {
  994.         return decode(cs, buffer, 0, buffer.length);
  995.     }

  996.     /**
  997.      * Decode a region of the buffer under the specified character set if possible.
  998.      *
  999.      * If the byte stream cannot be decoded that way, the platform default is tried
  1000.      * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  1001.      *
  1002.      * @param cs
  1003.      *            character set to use when decoding the buffer.
  1004.      * @param buffer
  1005.      *            buffer to pull raw bytes from.
  1006.      * @param start
  1007.      *            first position within the buffer to take data from.
  1008.      * @param end
  1009.      *            one position past the last location within the buffer to take
  1010.      *            data from.
  1011.      * @return a string representation of the range <code>[start,end)</code>,
  1012.      *         after decoding the region through the specified character set.
  1013.      */
  1014.     public static String decode(final Charset cs, final byte[] buffer,
  1015.             final int start, final int end) {
  1016.         try {
  1017.             return decodeNoFallback(cs, buffer, start, end);
  1018.         } catch (CharacterCodingException e) {
  1019.             // Fall back to an ISO-8859-1 style encoding. At least all of
  1020.             // the bytes will be present in the output.
  1021.             //
  1022.             return extractBinaryString(buffer, start, end);
  1023.         }
  1024.     }

  1025.     /**
  1026.      * Decode a region of the buffer under the specified character set if
  1027.      * possible.
  1028.      *
  1029.      * If the byte stream cannot be decoded that way, the platform default is
  1030.      * tried and if that too fails, an exception is thrown.
  1031.      *
  1032.      * @param cs
  1033.      *            character set to use when decoding the buffer.
  1034.      * @param buffer
  1035.      *            buffer to pull raw bytes from.
  1036.      * @param start
  1037.      *            first position within the buffer to take data from.
  1038.      * @param end
  1039.      *            one position past the last location within the buffer to take
  1040.      *            data from.
  1041.      * @return a string representation of the range <code>[start,end)</code>,
  1042.      *         after decoding the region through the specified character set.
  1043.      * @throws java.nio.charset.CharacterCodingException
  1044.      *             the input is not in any of the tested character sets.
  1045.      */
  1046.     public static String decodeNoFallback(final Charset cs,
  1047.             final byte[] buffer, final int start, final int end)
  1048.             throws CharacterCodingException {
  1049.         ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
  1050.         b.mark();

  1051.         // Try our built-in favorite. The assumption here is that
  1052.         // decoding will fail if the data is not actually encoded
  1053.         // using that encoder.
  1054.         try {
  1055.             return decode(b, UTF_8);
  1056.         } catch (CharacterCodingException e) {
  1057.             b.reset();
  1058.         }

  1059.         if (!cs.equals(UTF_8)) {
  1060.             // Try the suggested encoding, it might be right since it was
  1061.             // provided by the caller.
  1062.             try {
  1063.                 return decode(b, cs);
  1064.             } catch (CharacterCodingException e) {
  1065.                 b.reset();
  1066.             }
  1067.         }

  1068.         // Try the default character set. A small group of people
  1069.         // might actually use the same (or very similar) locale.
  1070.         Charset defcs = SystemReader.getInstance().getDefaultCharset();
  1071.         if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
  1072.             try {
  1073.                 return decode(b, defcs);
  1074.             } catch (CharacterCodingException e) {
  1075.                 b.reset();
  1076.             }
  1077.         }

  1078.         throw new CharacterCodingException();
  1079.     }

  1080.     /**
  1081.      * Decode a region of the buffer under the ISO-8859-1 encoding.
  1082.      *
  1083.      * Each byte is treated as a single character in the 8859-1 character
  1084.      * encoding, performing a raw binary-&gt;char conversion.
  1085.      *
  1086.      * @param buffer
  1087.      *            buffer to pull raw bytes from.
  1088.      * @param start
  1089.      *            first position within the buffer to take data from.
  1090.      * @param end
  1091.      *            one position past the last location within the buffer to take
  1092.      *            data from.
  1093.      * @return a string representation of the range <code>[start,end)</code>.
  1094.      */
  1095.     public static String extractBinaryString(final byte[] buffer,
  1096.             final int start, final int end) {
  1097.         final StringBuilder r = new StringBuilder(end - start);
  1098.         for (int i = start; i < end; i++)
  1099.             r.append((char) (buffer[i] & 0xff));
  1100.         return r.toString();
  1101.     }

  1102.     private static String decode(ByteBuffer b, Charset charset)
  1103.             throws CharacterCodingException {
  1104.         final CharsetDecoder d = charset.newDecoder();
  1105.         d.onMalformedInput(CodingErrorAction.REPORT);
  1106.         d.onUnmappableCharacter(CodingErrorAction.REPORT);
  1107.         return d.decode(b).toString();
  1108.     }

  1109.     /**
  1110.      * Locate the position of the commit message body.
  1111.      *
  1112.      * @param b
  1113.      *            buffer to scan.
  1114.      * @param ptr
  1115.      *            position in buffer to start the scan at. Most callers should
  1116.      *            pass 0 to ensure the scan starts from the beginning of the
  1117.      *            commit buffer.
  1118.      * @return position of the user's message buffer.
  1119.      */
  1120.     public static final int commitMessage(byte[] b, int ptr) {
  1121.         final int sz = b.length;
  1122.         if (ptr == 0)
  1123.             ptr += 46; // skip the "tree ..." line.
  1124.         while (ptr < sz && b[ptr] == 'p')
  1125.             ptr += 48; // skip this parent.

  1126.         // Skip any remaining header lines, ignoring what their actual
  1127.         // header line type is. This is identical to the logic for a tag.
  1128.         //
  1129.         return tagMessage(b, ptr);
  1130.     }

  1131.     /**
  1132.      * Locate the position of the tag message body.
  1133.      *
  1134.      * @param b
  1135.      *            buffer to scan.
  1136.      * @param ptr
  1137.      *            position in buffer to start the scan at. Most callers should
  1138.      *            pass 0 to ensure the scan starts from the beginning of the tag
  1139.      *            buffer.
  1140.      * @return position of the user's message buffer.
  1141.      */
  1142.     public static final int tagMessage(byte[] b, int ptr) {
  1143.         final int sz = b.length;
  1144.         if (ptr == 0)
  1145.             ptr += 48; // skip the "object ..." line.
  1146.         while (ptr < sz && b[ptr] != '\n')
  1147.             ptr = nextLF(b, ptr);
  1148.         if (ptr < sz && b[ptr] == '\n')
  1149.             return ptr + 1;
  1150.         return -1;
  1151.     }

  1152.     /**
  1153.      * Locate the end of a paragraph.
  1154.      * <p>
  1155.      * A paragraph is ended by two consecutive LF bytes or CRLF pairs
  1156.      *
  1157.      * @param b
  1158.      *            buffer to scan.
  1159.      * @param start
  1160.      *            position in buffer to start the scan at. Most callers will
  1161.      *            want to pass the first position of the commit message (as
  1162.      *            found by {@link #commitMessage(byte[], int)}.
  1163.      * @return position of the LF at the end of the paragraph;
  1164.      *         <code>b.length</code> if no paragraph end could be located.
  1165.      */
  1166.     public static final int endOfParagraph(byte[] b, int start) {
  1167.         int ptr = start;
  1168.         final int sz = b.length;
  1169.         while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
  1170.             ptr = nextLF(b, ptr);
  1171.         if (ptr > start && b[ptr - 1] == '\n')
  1172.             ptr--;
  1173.         if (ptr > start && b[ptr - 1] == '\r')
  1174.             ptr--;
  1175.         return ptr;
  1176.     }

  1177.     /**
  1178.      * Get last index of {@code ch} in raw, trimming spaces.
  1179.      *
  1180.      * @param raw
  1181.      *            buffer to scan.
  1182.      * @param ch
  1183.      *            character to find.
  1184.      * @param pos
  1185.      *            starting position.
  1186.      * @return last index of {@code ch} in raw, trimming spaces.
  1187.      * @since 4.1
  1188.      */
  1189.     public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
  1190.         while (pos >= 0 && raw[pos] == ' ')
  1191.             pos--;

  1192.         while (pos >= 0 && raw[pos] != ch)
  1193.             pos--;

  1194.         return pos;
  1195.     }

  1196.     private static Charset charsetForAlias(String name) {
  1197.         return encodingAliases.get(StringUtils.toLowerCase(name));
  1198.     }

  1199.     private RawParseUtils() {
  1200.         // Don't create instances of a static only utility.
  1201.     }
  1202. }