RawParseUtils.java

  1. /*
  2.  * Copyright (C) 2008-2009, Google Inc.
  3.  * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others
  4.  *
  5.  * This program and the accompanying materials are made available under the
  6.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  7.  * https://www.eclipse.org/org/documents/edl-v10.php.
  8.  *
  9.  * SPDX-License-Identifier: BSD-3-Clause
  10.  */

  11. package org.eclipse.jgit.util;

  12. import static java.nio.charset.StandardCharsets.ISO_8859_1;
  13. import static java.nio.charset.StandardCharsets.UTF_8;
  14. import static org.eclipse.jgit.lib.ObjectChecker.author;
  15. import static org.eclipse.jgit.lib.ObjectChecker.committer;
  16. import static org.eclipse.jgit.lib.ObjectChecker.encoding;
  17. import static org.eclipse.jgit.lib.ObjectChecker.tagger;

  18. import java.nio.ByteBuffer;
  19. import java.nio.charset.CharacterCodingException;
  20. import java.nio.charset.Charset;
  21. import java.nio.charset.CharsetDecoder;
  22. import java.nio.charset.CodingErrorAction;
  23. import java.nio.charset.IllegalCharsetNameException;
  24. import java.nio.charset.UnsupportedCharsetException;
  25. import java.util.Arrays;
  26. import java.util.HashMap;
  27. import java.util.Map;

  28. import org.eclipse.jgit.annotations.Nullable;
  29. import org.eclipse.jgit.errors.BinaryBlobException;
  30. import org.eclipse.jgit.lib.Constants;
  31. import org.eclipse.jgit.lib.PersonIdent;

  32. /**
  33.  * Handy utility functions to parse raw object contents.
  34.  */
  35. public final class RawParseUtils {
  /**
   * Charset constant for UTF-8.
   *
   * @since 2.2
   * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
   */
  @Deprecated
  public static final Charset UTF8_CHARSET = UTF_8;

  /**
   * Decimal digit lookup, indexed by ASCII code up to and including '9'.
   * Holds the digit value 0-9, or -1 where the character is not a digit.
   */
  private static final byte[] digits10;

  /**
   * Hexadecimal digit lookup, indexed by ASCII code up to and including 'f'.
   * Holds the digit value 0-15 for '0'-'9', 'a'-'f' and 'A'-'F', and -1
   * everywhere else.
   */
  private static final byte[] digits16;

  /**
   * Character class table, indexed by ASCII code up to and including 'z'.
   * Holds 1 for '-', ASCII digits and ASCII letters, and 0 everywhere else.
   */
  private static final byte[] footerLineKeyChars;

  /** Additional encoding names, each mapped to the charset it stands for. */
  private static final Map<String, Charset> encodingAliases;

  static {
      final Map<String, Charset> aliases = new HashMap<>();
      aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
      aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
      encodingAliases = aliases;

      final byte[] decimal = new byte['9' + 1];
      Arrays.fill(decimal, (byte) -1);
      for (int d = 0; d < 10; d++) {
          decimal['0' + d] = (byte) d;
      }
      digits10 = decimal;

      final byte[] hex = new byte['f' + 1];
      Arrays.fill(hex, (byte) -1);
      for (int d = 0; d < 10; d++) {
          hex['0' + d] = (byte) d;
      }
      for (int d = 10; d < 16; d++) {
          // Upper and lower case letters carry the same value.
          hex['a' + (d - 10)] = (byte) d;
          hex['A' + (d - 10)] = (byte) d;
      }
      digits16 = hex;

      final byte[] keyChars = new byte['z' + 1];
      keyChars['-'] = 1;
      Arrays.fill(keyChars, '0', '9' + 1, (byte) 1);
      Arrays.fill(keyChars, 'A', 'Z' + 1, (byte) 1);
      Arrays.fill(keyChars, 'a', 'z' + 1, (byte) 1);
      footerLineKeyChars = keyChars;
  }

  73.     /**
  74.      * Determine if b[ptr] matches src.
  75.      *
  76.      * @param b
  77.      *            the buffer to scan.
  78.      * @param ptr
  79.      *            first position within b, this should match src[0].
  80.      * @param src
  81.      *            the buffer to test for equality with b.
  82.      * @return ptr + src.length if b[ptr..src.length] == src; else -1.
  83.      */
  84.     public static final int match(byte[] b, int ptr, byte[] src) {
  85.         if (ptr + src.length > b.length)
  86.             return -1;
  87.         for (int i = 0; i < src.length; i++, ptr++)
  88.             if (b[ptr] != src[i])
  89.                 return -1;
  90.         return ptr;
  91.     }

  92.     private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  93.             '6', '7', '8', '9' };

  94.     /**
  95.      * Format a base 10 numeric into a temporary buffer.
  96.      * <p>
  97.      * Formatting is performed backwards. The method starts at offset
  98.      * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  99.      * <code>digits</code> is the number of positions necessary to store the
  100.      * base 10 value.
  101.      * <p>
  102.      * The argument and return values from this method make it easy to chain
  103.      * writing, for example:
  104.      * </p>
  105.      *
  106.      * <pre>
  107.      * final byte[] tmp = new byte[64];
  108.      * int ptr = tmp.length;
  109.      * tmp[--ptr] = '\n';
  110.      * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  111.      * tmp[--ptr] = ' ';
  112.      * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  113.      * tmp[--ptr] = 0;
  114.      * final String str = new String(tmp, ptr, tmp.length - ptr);
  115.      * </pre>
  116.      *
  117.      * @param b
  118.      *            buffer to write into.
  119.      * @param o
  120.      *            one offset past the location where writing will begin; writing
  121.      *            proceeds towards lower index values.
  122.      * @param value
  123.      *            the value to store.
  124.      * @return the new offset value <code>o</code>. This is the position of
  125.      *         the last byte written. Additional writing should start at one
  126.      *         position earlier.
  127.      */
  128.     public static int formatBase10(final byte[] b, int o, int value) {
  129.         if (value == 0) {
  130.             b[--o] = '0';
  131.             return o;
  132.         }
  133.         final boolean isneg = value < 0;
  134.         if (isneg)
  135.             value = -value;
  136.         while (value != 0) {
  137.             b[--o] = base10byte[value % 10];
  138.             value /= 10;
  139.         }
  140.         if (isneg)
  141.             b[--o] = '-';
  142.         return o;
  143.     }

  144.     /**
  145.      * Parse a base 10 numeric from a sequence of ASCII digits into an int.
  146.      * <p>
  147.      * Digit sequences can begin with an optional run of spaces before the
  148.      * sequence, and may start with a '+' or a '-' to indicate sign position.
  149.      * Any other characters will cause the method to stop and return the current
  150.      * result to the caller.
  151.      *
  152.      * @param b
  153.      *            buffer to scan.
  154.      * @param ptr
  155.      *            position within buffer to start parsing digits at.
  156.      * @param ptrResult
  157.      *            optional location to return the new ptr value through. If null
  158.      *            the ptr value will be discarded.
  159.      * @return the value at this location; 0 if the location is not a valid
  160.      *         numeric.
  161.      */
  162.     public static final int parseBase10(final byte[] b, int ptr,
  163.             final MutableInteger ptrResult) {
  164.         int r = 0;
  165.         int sign = 0;
  166.         try {
  167.             final int sz = b.length;
  168.             while (ptr < sz && b[ptr] == ' ')
  169.                 ptr++;
  170.             if (ptr >= sz)
  171.                 return 0;

  172.             switch (b[ptr]) {
  173.             case '-':
  174.                 sign = -1;
  175.                 ptr++;
  176.                 break;
  177.             case '+':
  178.                 ptr++;
  179.                 break;
  180.             }

  181.             while (ptr < sz) {
  182.                 final byte v = digits10[b[ptr]];
  183.                 if (v < 0)
  184.                     break;
  185.                 r = (r * 10) + v;
  186.                 ptr++;
  187.             }
  188.         } catch (ArrayIndexOutOfBoundsException e) {
  189.             // Not a valid digit.
  190.         }
  191.         if (ptrResult != null)
  192.             ptrResult.value = ptr;
  193.         return sign < 0 ? -r : r;
  194.     }

  195.     /**
  196.      * Parse a base 10 numeric from a sequence of ASCII digits into a long.
  197.      * <p>
  198.      * Digit sequences can begin with an optional run of spaces before the
  199.      * sequence, and may start with a '+' or a '-' to indicate sign position.
  200.      * Any other characters will cause the method to stop and return the current
  201.      * result to the caller.
  202.      *
  203.      * @param b
  204.      *            buffer to scan.
  205.      * @param ptr
  206.      *            position within buffer to start parsing digits at.
  207.      * @param ptrResult
  208.      *            optional location to return the new ptr value through. If null
  209.      *            the ptr value will be discarded.
  210.      * @return the value at this location; 0 if the location is not a valid
  211.      *         numeric.
  212.      */
  213.     public static final long parseLongBase10(final byte[] b, int ptr,
  214.             final MutableInteger ptrResult) {
  215.         long r = 0;
  216.         int sign = 0;
  217.         try {
  218.             final int sz = b.length;
  219.             while (ptr < sz && b[ptr] == ' ')
  220.                 ptr++;
  221.             if (ptr >= sz)
  222.                 return 0;

  223.             switch (b[ptr]) {
  224.             case '-':
  225.                 sign = -1;
  226.                 ptr++;
  227.                 break;
  228.             case '+':
  229.                 ptr++;
  230.                 break;
  231.             }

  232.             while (ptr < sz) {
  233.                 final byte v = digits10[b[ptr]];
  234.                 if (v < 0)
  235.                     break;
  236.                 r = (r * 10) + v;
  237.                 ptr++;
  238.             }
  239.         } catch (ArrayIndexOutOfBoundsException e) {
  240.             // Not a valid digit.
  241.         }
  242.         if (ptrResult != null)
  243.             ptrResult.value = ptr;
  244.         return sign < 0 ? -r : r;
  245.     }

  246.     /**
  247.      * Parse 4 character base 16 (hex) formatted string to unsigned integer.
  248.      * <p>
  249.      * The number is read in network byte order, that is, most significant
  250.      * nybble first.
  251.      *
  252.      * @param bs
  253.      *            buffer to parse digits from; positions {@code [p, p+4)} will
  254.      *            be parsed.
  255.      * @param p
  256.      *            first position within the buffer to parse.
  257.      * @return the integer value.
  258.      * @throws java.lang.ArrayIndexOutOfBoundsException
  259.      *             if the string is not hex formatted.
  260.      */
  261.     public static final int parseHexInt16(final byte[] bs, final int p) {
  262.         int r = digits16[bs[p]] << 4;

  263.         r |= digits16[bs[p + 1]];
  264.         r <<= 4;

  265.         r |= digits16[bs[p + 2]];
  266.         r <<= 4;

  267.         r |= digits16[bs[p + 3]];
  268.         if (r < 0)
  269.             throw new ArrayIndexOutOfBoundsException();
  270.         return r;
  271.     }

  272.     /**
  273.      * Parse 8 character base 16 (hex) formatted string to unsigned integer.
  274.      * <p>
  275.      * The number is read in network byte order, that is, most significant
  276.      * nybble first.
  277.      *
  278.      * @param bs
  279.      *            buffer to parse digits from; positions {@code [p, p+8)} will
  280.      *            be parsed.
  281.      * @param p
  282.      *            first position within the buffer to parse.
  283.      * @return the integer value.
  284.      * @throws java.lang.ArrayIndexOutOfBoundsException
  285.      *             if the string is not hex formatted.
  286.      */
  287.     public static final int parseHexInt32(final byte[] bs, final int p) {
  288.         int r = digits16[bs[p]] << 4;

  289.         r |= digits16[bs[p + 1]];
  290.         r <<= 4;

  291.         r |= digits16[bs[p + 2]];
  292.         r <<= 4;

  293.         r |= digits16[bs[p + 3]];
  294.         r <<= 4;

  295.         r |= digits16[bs[p + 4]];
  296.         r <<= 4;

  297.         r |= digits16[bs[p + 5]];
  298.         r <<= 4;

  299.         r |= digits16[bs[p + 6]];

  300.         final int last = digits16[bs[p + 7]];
  301.         if (r < 0 || last < 0)
  302.             throw new ArrayIndexOutOfBoundsException();
  303.         return (r << 4) | last;
  304.     }

  305.     /**
  306.      * Parse 16 character base 16 (hex) formatted string to unsigned long.
  307.      * <p>
  308.      * The number is read in network byte order, that is, most significant
  309.      * nibble first.
  310.      *
  311.      * @param bs
  312.      *            buffer to parse digits from; positions {@code [p, p+16)} will
  313.      *            be parsed.
  314.      * @param p
  315.      *            first position within the buffer to parse.
  316.      * @return the integer value.
  317.      * @throws java.lang.ArrayIndexOutOfBoundsException
  318.      *             if the string is not hex formatted.
  319.      * @since 4.3
  320.      */
  321.     public static final long parseHexInt64(final byte[] bs, final int p) {
  322.         long r = digits16[bs[p]] << 4;

  323.         r |= digits16[bs[p + 1]];
  324.         r <<= 4;

  325.         r |= digits16[bs[p + 2]];
  326.         r <<= 4;

  327.         r |= digits16[bs[p + 3]];
  328.         r <<= 4;

  329.         r |= digits16[bs[p + 4]];
  330.         r <<= 4;

  331.         r |= digits16[bs[p + 5]];
  332.         r <<= 4;

  333.         r |= digits16[bs[p + 6]];
  334.         r <<= 4;

  335.         r |= digits16[bs[p + 7]];
  336.         r <<= 4;

  337.         r |= digits16[bs[p + 8]];
  338.         r <<= 4;

  339.         r |= digits16[bs[p + 9]];
  340.         r <<= 4;

  341.         r |= digits16[bs[p + 10]];
  342.         r <<= 4;

  343.         r |= digits16[bs[p + 11]];
  344.         r <<= 4;

  345.         r |= digits16[bs[p + 12]];
  346.         r <<= 4;

  347.         r |= digits16[bs[p + 13]];
  348.         r <<= 4;

  349.         r |= digits16[bs[p + 14]];

  350.         final int last = digits16[bs[p + 15]];
  351.         if (r < 0 || last < 0)
  352.             throw new ArrayIndexOutOfBoundsException();
  353.         return (r << 4) | last;
  354.     }

  355.     /**
  356.      * Parse a single hex digit to its numeric value (0-15).
  357.      *
  358.      * @param digit
  359.      *            hex character to parse.
  360.      * @return numeric value, in the range 0-15.
  361.      * @throws java.lang.ArrayIndexOutOfBoundsException
  362.      *             if the input digit is not a valid hex digit.
  363.      */
  364.     public static final int parseHexInt4(final byte digit) {
  365.         final byte r = digits16[digit];
  366.         if (r < 0)
  367.             throw new ArrayIndexOutOfBoundsException();
  368.         return r;
  369.     }

  370.     /**
  371.      * Parse a Git style timezone string.
  372.      * <p>
  373.      * The sequence "-0315" will be parsed as the numeric value -195, as the
  374.      * lower two positions count minutes, not 100ths of an hour.
  375.      *
  376.      * @param b
  377.      *            buffer to scan.
  378.      * @param ptr
  379.      *            position within buffer to start parsing digits at.
  380.      * @return the timezone at this location, expressed in minutes.
  381.      */
  382.     public static final int parseTimeZoneOffset(byte[] b, int ptr) {
  383.         return parseTimeZoneOffset(b, ptr, null);
  384.     }

  385.     /**
  386.      * Parse a Git style timezone string.
  387.      * <p>
  388.      * The sequence "-0315" will be parsed as the numeric value -195, as the
  389.      * lower two positions count minutes, not 100ths of an hour.
  390.      *
  391.      * @param b
  392.      *            buffer to scan.
  393.      * @param ptr
  394.      *            position within buffer to start parsing digits at.
  395.      * @param ptrResult
  396.      *            optional location to return the new ptr value through. If null
  397.      *            the ptr value will be discarded.
  398.      * @return the timezone at this location, expressed in minutes.
  399.      * @since 4.1
  400.      */
  401.     public static final int parseTimeZoneOffset(final byte[] b, int ptr,
  402.             MutableInteger ptrResult) {
  403.         final int v = parseBase10(b, ptr, ptrResult);
  404.         final int tzMins = v % 100;
  405.         final int tzHours = v / 100;
  406.         return tzHours * 60 + tzMins;
  407.     }

  408.     /**
  409.      * Locate the first position after a given character.
  410.      *
  411.      * @param b
  412.      *            buffer to scan.
  413.      * @param ptr
  414.      *            position within buffer to start looking for chrA at.
  415.      * @param chrA
  416.      *            character to find.
  417.      * @return new position just after chrA.
  418.      */
  419.     public static final int next(byte[] b, int ptr, char chrA) {
  420.         final int sz = b.length;
  421.         while (ptr < sz) {
  422.             if (b[ptr++] == chrA)
  423.                 return ptr;
  424.         }
  425.         return ptr;
  426.     }

  427.     /**
  428.      * Locate the first position after the next LF.
  429.      * <p>
  430.      * This method stops on the first '\n' it finds.
  431.      *
  432.      * @param b
  433.      *            buffer to scan.
  434.      * @param ptr
  435.      *            position within buffer to start looking for LF at.
  436.      * @return new position just after the first LF found.
  437.      */
  438.     public static final int nextLF(byte[] b, int ptr) {
  439.         return next(b, ptr, '\n');
  440.     }

  441.     /**
  442.      * Locate the first position after either the given character or LF.
  443.      * <p>
  444.      * This method stops on the first match it finds from either chrA or '\n'.
  445.      *
  446.      * @param b
  447.      *            buffer to scan.
  448.      * @param ptr
  449.      *            position within buffer to start looking for chrA or LF at.
  450.      * @param chrA
  451.      *            character to find.
  452.      * @return new position just after the first chrA or LF to be found.
  453.      */
  454.     public static final int nextLF(byte[] b, int ptr, char chrA) {
  455.         final int sz = b.length;
  456.         while (ptr < sz) {
  457.             final byte c = b[ptr++];
  458.             if (c == chrA || c == '\n')
  459.                 return ptr;
  460.         }
  461.         return ptr;
  462.     }

  463.     /**
  464.      * Locate the end of the header.  Note that headers may be
  465.      * more than one line long.
  466.      * @param b
  467.      *            buffer to scan.
  468.      * @param ptr
  469.      *            position within buffer to start looking for the end-of-header.
  470.      * @return new position just after the header.  This is either
  471.      * b.length, or the index of the header's terminating newline.
  472.      * @since 5.1
  473.      */
  474.     public static final int headerEnd(final byte[] b, int ptr) {
  475.         final int sz = b.length;
  476.         while (ptr < sz) {
  477.             final byte c = b[ptr++];
  478.             if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
  479.                 return ptr - 1;
  480.             }
  481.         }
  482.         return ptr - 1;
  483.     }

  484.     /**
  485.      * Find the start of the contents of a given header.
  486.      *
  487.      * @param b
  488.      *            buffer to scan.
  489.      * @param headerName
  490.      *            header to search for
  491.      * @param ptr
  492.      *            position within buffer to start looking for header at.
  493.      * @return new position at the start of the header's contents, -1 for
  494.      *         not found
  495.      * @since 5.1
  496.      */
  497.     public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
  498.         // Start by advancing to just past a LF or buffer start
  499.         if (ptr != 0) {
  500.             ptr = nextLF(b, ptr - 1);
  501.         }
  502.         while (ptr < b.length - (headerName.length + 1)) {
  503.             boolean found = true;
  504.             for (byte element : headerName) {
  505.                 if (element != b[ptr++]) {
  506.                     found = false;
  507.                     break;
  508.                 }
  509.             }
  510.             if (found && b[ptr++] == ' ') {
  511.                 return ptr;
  512.             }
  513.             ptr = nextLF(b, ptr);
  514.         }
  515.         return -1;
  516.     }

  517.     /**
  518.      * Locate the first position before a given character.
  519.      *
  520.      * @param b
  521.      *            buffer to scan.
  522.      * @param ptr
  523.      *            position within buffer to start looking for chrA at.
  524.      * @param chrA
  525.      *            character to find.
  526.      * @return new position just before chrA, -1 for not found
  527.      */
  528.     public static final int prev(byte[] b, int ptr, char chrA) {
  529.         if (ptr == b.length)
  530.             --ptr;
  531.         while (ptr >= 0) {
  532.             if (b[ptr--] == chrA)
  533.                 return ptr;
  534.         }
  535.         return ptr;
  536.     }

  537.     /**
  538.      * Locate the first position before the previous LF.
  539.      * <p>
  540.      * This method stops on the first '\n' it finds.
  541.      *
  542.      * @param b
  543.      *            buffer to scan.
  544.      * @param ptr
  545.      *            position within buffer to start looking for LF at.
  546.      * @return new position just before the first LF found, -1 for not found
  547.      */
  548.     public static final int prevLF(byte[] b, int ptr) {
  549.         return prev(b, ptr, '\n');
  550.     }

  551.     /**
  552.      * Locate the previous position before either the given character or LF.
  553.      * <p>
  554.      * This method stops on the first match it finds from either chrA or '\n'.
  555.      *
  556.      * @param b
  557.      *            buffer to scan.
  558.      * @param ptr
  559.      *            position within buffer to start looking for chrA or LF at.
  560.      * @param chrA
  561.      *            character to find.
  562.      * @return new position just before the first chrA or LF to be found, -1 for
  563.      *         not found
  564.      */
  565.     public static final int prevLF(byte[] b, int ptr, char chrA) {
  566.         if (ptr == b.length)
  567.             --ptr;
  568.         while (ptr >= 0) {
  569.             final byte c = b[ptr--];
  570.             if (c == chrA || c == '\n')
  571.                 return ptr;
  572.         }
  573.         return ptr;
  574.     }

  575.     /**
  576.      * Index the region between <code>[ptr, end)</code> to find line starts.
  577.      * <p>
  578.      * The returned list is 1 indexed. Index 0 contains
  579.      * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
  580.      * <p>
  581.      * Using a 1 indexed list means that line numbers can be directly accessed
  582.      * from the list, so <code>list.get(1)</code> (aka get line 1) returns
  583.      * <code>ptr</code>.
  584.      * <p>
  585.      * The last element (index <code>map.size()-1</code>) always contains
  586.      * <code>end</code>.
  587.      *
  588.      * @param buf
  589.      *            buffer to scan.
  590.      * @param ptr
  591.      *            position within the buffer corresponding to the first byte of
  592.      *            line 1.
  593.      * @param end
  594.      *            1 past the end of the content within <code>buf</code>.
  595.      * @return a line map indicating the starting position of each line.
  596.      */
  597.     public static final IntList lineMap(byte[] buf, int ptr, int end) {
  598.         IntList map = new IntList((end - ptr) / 36);
  599.         map.fillTo(1, Integer.MIN_VALUE);
  600.         for (; ptr < end; ptr = nextLF(buf, ptr)) {
  601.             map.add(ptr);
  602.         }
  603.         map.add(end);
  604.         return map;
  605.     }

  606.     /**
  607.      * Like {@link #lineMap(byte[], int, int)} but throw
  608.      * {@link BinaryBlobException} if a NUL byte is encountered.
  609.      *
  610.      * @param buf
  611.      *            buffer to scan.
  612.      * @param ptr
  613.      *            position within the buffer corresponding to the first byte of
  614.      *            line 1.
  615.      * @param end
  616.      *            1 past the end of the content within <code>buf</code>.
  617.      * @return a line map indicating the starting position of each line.
  618.      * @throws BinaryBlobException
  619.      *            if a NUL byte is found.
  620.      * @since 5.0
  621.      */
  622.     public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
  623.             throws BinaryBlobException {
  624.         IntList map = lineMapOrNull(buf, ptr, end);
  625.         if (map == null) {
  626.             throw new BinaryBlobException();
  627.         }
  628.         return map;
  629.     }

  630.     @Nullable
  631.     private static IntList lineMapOrNull(byte[] buf, int ptr, int end) {
  632.         // Experimentally derived from multiple source repositories
  633.         // the average number of bytes/line is 36. Its a rough guess
  634.         // to initially size our map close to the target.
  635.         IntList map = new IntList((end - ptr) / 36);
  636.         map.add(Integer.MIN_VALUE);
  637.         boolean foundLF = true;
  638.         for (; ptr < end; ptr++) {
  639.             if (foundLF) {
  640.                 map.add(ptr);
  641.             }

  642.             if (buf[ptr] == '\0') {
  643.                 return null;
  644.             }

  645.             foundLF = (buf[ptr] == '\n');
  646.         }
  647.         map.add(end);
  648.         return map;
  649.     }

  650.     /**
  651.      * Locate the "author " header line data.
  652.      *
  653.      * @param b
  654.      *            buffer to scan.
  655.      * @param ptr
  656.      *            position in buffer to start the scan at. Most callers should
  657.      *            pass 0 to ensure the scan starts from the beginning of the
  658.      *            commit buffer and does not accidentally look at message body.
  659.      * @return position just after the space in "author ", so the first
  660.      *         character of the author's name. If no author header can be
  661.      *         located -1 is returned.
  662.      */
  663.     public static final int author(byte[] b, int ptr) {
  664.         final int sz = b.length;
  665.         if (ptr == 0)
  666.             ptr += 46; // skip the "tree ..." line.
  667.         while (ptr < sz && b[ptr] == 'p')
  668.             ptr += 48; // skip this parent.
  669.         return match(b, ptr, author);
  670.     }

  671.     /**
  672.      * Locate the "committer " header line data.
  673.      *
  674.      * @param b
  675.      *            buffer to scan.
  676.      * @param ptr
  677.      *            position in buffer to start the scan at. Most callers should
  678.      *            pass 0 to ensure the scan starts from the beginning of the
  679.      *            commit buffer and does not accidentally look at message body.
  680.      * @return position just after the space in "committer ", so the first
  681.      *         character of the committer's name. If no committer header can be
  682.      *         located -1 is returned.
  683.      */
  684.     public static final int committer(byte[] b, int ptr) {
  685.         final int sz = b.length;
  686.         if (ptr == 0)
  687.             ptr += 46; // skip the "tree ..." line.
  688.         while (ptr < sz && b[ptr] == 'p')
  689.             ptr += 48; // skip this parent.
  690.         if (ptr < sz && b[ptr] == 'a')
  691.             ptr = nextLF(b, ptr);
  692.         return match(b, ptr, committer);
  693.     }

  694.     /**
  695.      * Locate the "tagger " header line data.
  696.      *
  697.      * @param b
  698.      *            buffer to scan.
  699.      * @param ptr
  700.      *            position in buffer to start the scan at. Most callers should
  701.      *            pass 0 to ensure the scan starts from the beginning of the tag
  702.      *            buffer and does not accidentally look at message body.
  703.      * @return position just after the space in "tagger ", so the first
  704.      *         character of the tagger's name. If no tagger header can be
  705.      *         located -1 is returned.
  706.      */
  707.     public static final int tagger(byte[] b, int ptr) {
  708.         final int sz = b.length;
  709.         if (ptr == 0)
  710.             ptr += 48; // skip the "object ..." line.
  711.         while (ptr < sz) {
  712.             if (b[ptr] == '\n')
  713.                 return -1;
  714.             final int m = match(b, ptr, tagger);
  715.             if (m >= 0)
  716.                 return m;
  717.             ptr = nextLF(b, ptr);
  718.         }
  719.         return -1;
  720.     }

  721.     /**
  722.      * Locate the "encoding " header line.
  723.      *
  724.      * @param b
  725.      *            buffer to scan.
  726.      * @param ptr
  727.      *            position in buffer to start the scan at. Most callers should
  728.      *            pass 0 to ensure the scan starts from the beginning of the
  729.      *            buffer and does not accidentally look at the message body.
  730.      * @return position just after the space in "encoding ", so the first
  731.      *         character of the encoding's name. If no encoding header can be
  732.      *         located -1 is returned (and UTF-8 should be assumed).
  733.      */
  734.     public static final int encoding(byte[] b, int ptr) {
  735.         final int sz = b.length;
  736.         while (ptr < sz) {
  737.             if (b[ptr] == '\n')
  738.                 return -1;
  739.             if (b[ptr] == 'e')
  740.                 break;
  741.             ptr = nextLF(b, ptr);
  742.         }
  743.         return match(b, ptr, encoding);
  744.     }

  745.     /**
  746.      * Parse the "encoding " header as a string.
  747.      * <p>
  748.      * Locates the "encoding " header (if present) and returns its value.
  749.      *
  750.      * @param b
  751.      *            buffer to scan.
  752.      * @return the encoding header as specified in the commit; null if the
  753.      *         header was not present and should be assumed.
  754.      * @since 4.2
  755.      */
  756.     @Nullable
  757.     public static String parseEncodingName(byte[] b) {
  758.         int enc = encoding(b, 0);
  759.         if (enc < 0) {
  760.             return null;
  761.         }
  762.         int lf = nextLF(b, enc);
  763.         return decode(UTF_8, b, enc, lf - 1);
  764.     }

  765.     /**
  766.      * Parse the "encoding " header into a character set reference.
  767.      * <p>
  768.      * Locates the "encoding " header (if present) by first calling
  769.      * {@link #encoding(byte[], int)} and then returns the proper character set
  770.      * to apply to this buffer to evaluate its contents as character data.
  771.      * <p>
  772.      * If no encoding header is present {@code UTF-8} is assumed.
  773.      *
  774.      * @param b
  775.      *            buffer to scan.
  776.      * @return the Java character set representation. Never null.
  777.      * @throws IllegalCharsetNameException
  778.      *             if the character set requested by the encoding header is
  779.      *             malformed and unsupportable.
  780.      * @throws UnsupportedCharsetException
  781.      *             if the JRE does not support the character set requested by
  782.      *             the encoding header.
  783.      */
  784.     public static Charset parseEncoding(byte[] b) {
  785.         String enc = parseEncodingName(b);
  786.         if (enc == null) {
  787.             return UTF_8;
  788.         }

  789.         String name = enc.trim();
  790.         try {
  791.             return Charset.forName(name);
  792.         } catch (IllegalCharsetNameException
  793.                 | UnsupportedCharsetException badName) {
  794.             Charset aliased = charsetForAlias(name);
  795.             if (aliased != null) {
  796.                 return aliased;
  797.             }
  798.             throw badName;
  799.         }
  800.     }

  801.     /**
  802.      * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
  803.      * <p>
  804.      * Leading spaces won't be trimmed from the string, i.e. will show up in the
  805.      * parsed name afterwards.
  806.      *
  807.      * @param in
  808.      *            the string to parse a name from.
  809.      * @return the parsed identity or null in case the identity could not be
  810.      *         parsed.
  811.      */
  812.     public static PersonIdent parsePersonIdent(String in) {
  813.         return parsePersonIdent(Constants.encode(in), 0);
  814.     }

  815.     /**
  816.      * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
  817.      * <p>
  818.      * When passing in a value for <code>nameB</code> callers should use the
  819.      * return value of {@link #author(byte[], int)} or
  820.      * {@link #committer(byte[], int)}, as these methods provide the proper
  821.      * position within the buffer.
  822.      *
  823.      * @param raw
  824.      *            the buffer to parse character data from.
  825.      * @param nameB
  826.      *            first position of the identity information. This should be the
  827.      *            first position after the space which delimits the header field
  828.      *            name (e.g. "author" or "committer") from the rest of the
  829.      *            identity line.
  830.      * @return the parsed identity or null in case the identity could not be
  831.      *         parsed.
  832.      */
  833.     public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
  834.         Charset cs;
  835.         try {
  836.             cs = parseEncoding(raw);
  837.         } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
  838.             // Assume UTF-8 for person identities, usually this is correct.
  839.             // If not decode() will fall back to the ISO-8859-1 encoding.
  840.             cs = UTF_8;
  841.         }

  842.         final int emailB = nextLF(raw, nameB, '<');
  843.         final int emailE = nextLF(raw, emailB, '>');
  844.         if (emailB >= raw.length || raw[emailB] == '\n' ||
  845.                 (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
  846.             return null;

  847.         final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
  848.                 emailB - 2 : emailB - 1;
  849.         final String name = decode(cs, raw, nameB, nameEnd);
  850.         final String email = decode(cs, raw, emailB, emailE - 1);

  851.         // Start searching from end of line, as after first name-email pair,
  852.         // another name-email pair may occur. We will ignore all kinds of
  853.         // "junk" following the first email.
  854.         //
  855.         // We've to use (emailE - 1) for the case that raw[email] is LF,
  856.         // otherwise we would run too far. "-2" is necessary to position
  857.         // before the LF in case of LF termination resp. the penultimate
  858.         // character if there is no trailing LF.
  859.         final int tzBegin = lastIndexOfTrim(raw, ' ',
  860.                 nextLF(raw, emailE - 1) - 2) + 1;
  861.         if (tzBegin <= emailE) // No time/zone, still valid
  862.             return new PersonIdent(name, email, 0, 0);

  863.         final int whenBegin = Math.max(emailE,
  864.                 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
  865.         if (whenBegin >= tzBegin - 1) // No time/zone, still valid
  866.             return new PersonIdent(name, email, 0, 0);

  867.         final long when = parseLongBase10(raw, whenBegin, null);
  868.         final int tz = parseTimeZoneOffset(raw, tzBegin);
  869.         return new PersonIdent(name, email, when * 1000L, tz);
  870.     }

  871.     /**
  872.      * Parse a name data (e.g. as within a reflog) into a PersonIdent.
  873.      * <p>
  874.      * When passing in a value for <code>nameB</code> callers should use the
  875.      * return value of {@link #author(byte[], int)} or
  876.      * {@link #committer(byte[], int)}, as these methods provide the proper
  877.      * position within the buffer.
  878.      *
  879.      * @param raw
  880.      *            the buffer to parse character data from.
  881.      * @param nameB
  882.      *            first position of the identity information. This should be the
  883.      *            first position after the space which delimits the header field
  884.      *            name (e.g. "author" or "committer") from the rest of the
  885.      *            identity line.
  886.      * @return the parsed identity. Never null.
  887.      */
  888.     public static PersonIdent parsePersonIdentOnly(final byte[] raw,
  889.             final int nameB) {
  890.         int stop = nextLF(raw, nameB);
  891.         int emailB = nextLF(raw, nameB, '<');
  892.         int emailE = nextLF(raw, emailB, '>');
  893.         final String name;
  894.         final String email;
  895.         if (emailE < stop) {
  896.             email = decode(raw, emailB, emailE - 1);
  897.         } else {
  898.             email = "invalid"; //$NON-NLS-1$
  899.         }
  900.         if (emailB < stop)
  901.             name = decode(raw, nameB, emailB - 2);
  902.         else
  903.             name = decode(raw, nameB, stop);

  904.         final MutableInteger ptrout = new MutableInteger();
  905.         long when;
  906.         int tz;
  907.         if (emailE < stop) {
  908.             when = parseLongBase10(raw, emailE + 1, ptrout);
  909.             tz = parseTimeZoneOffset(raw, ptrout.value);
  910.         } else {
  911.             when = 0;
  912.             tz = 0;
  913.         }
  914.         return new PersonIdent(name, email, when * 1000L, tz);
  915.     }

  916.     /**
  917.      * Locate the end of a footer line key string.
  918.      * <p>
  919.      * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
  920.      * "Signed-off-by: A. U. Thor\n") then this method returns the position of
  921.      * the first ':'.
  922.      * <p>
  923.      * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
  924.      * then this method returns -1.
  925.      *
  926.      * @param raw
  927.      *            buffer to scan.
  928.      * @param ptr
  929.      *            first position within raw to consider as a footer line key.
  930.      * @return position of the ':' which terminates the footer line key if this
  931.      *         is otherwise a valid footer line key; otherwise -1.
  932.      */
  933.     public static int endOfFooterLineKey(byte[] raw, int ptr) {
  934.         try {
  935.             for (;;) {
  936.                 final byte c = raw[ptr];
  937.                 if (footerLineKeyChars[c] == 0) {
  938.                     if (c == ':')
  939.                         return ptr;
  940.                     return -1;
  941.                 }
  942.                 ptr++;
  943.             }
  944.         } catch (ArrayIndexOutOfBoundsException e) {
  945.             return -1;
  946.         }
  947.     }

  948.     /**
  949.      * Decode a buffer under UTF-8, if possible.
  950.      *
  951.      * If the byte stream cannot be decoded that way, the platform default is tried
  952.      * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  953.      *
  954.      * @param buffer
  955.      *            buffer to pull raw bytes from.
  956.      * @return a string representation of the range <code>[start,end)</code>,
  957.      *         after decoding the region through the specified character set.
  958.      */
  959.     public static String decode(byte[] buffer) {
  960.         return decode(buffer, 0, buffer.length);
  961.     }

  962.     /**
  963.      * Decode a buffer under UTF-8, if possible.
  964.      *
  965.      * If the byte stream cannot be decoded that way, the platform default is
  966.      * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  967.      *
  968.      * @param buffer
  969.      *            buffer to pull raw bytes from.
  970.      * @param start
  971.      *            start position in buffer
  972.      * @param end
  973.      *            one position past the last location within the buffer to take
  974.      *            data from.
  975.      * @return a string representation of the range <code>[start,end)</code>,
  976.      *         after decoding the region through the specified character set.
  977.      */
  978.     public static String decode(final byte[] buffer, final int start,
  979.             final int end) {
  980.         return decode(UTF_8, buffer, start, end);
  981.     }

  982.     /**
  983.      * Decode a buffer under the specified character set if possible.
  984.      *
  985.      * If the byte stream cannot be decoded that way, the platform default is tried
  986.      * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  987.      *
  988.      * @param cs
  989.      *            character set to use when decoding the buffer.
  990.      * @param buffer
  991.      *            buffer to pull raw bytes from.
  992.      * @return a string representation of the range <code>[start,end)</code>,
  993.      *         after decoding the region through the specified character set.
  994.      */
  995.     public static String decode(Charset cs, byte[] buffer) {
  996.         return decode(cs, buffer, 0, buffer.length);
  997.     }

  998.     /**
  999.      * Decode a region of the buffer under the specified character set if possible.
  1000.      *
  1001.      * If the byte stream cannot be decoded that way, the platform default is tried
  1002.      * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  1003.      *
  1004.      * @param cs
  1005.      *            character set to use when decoding the buffer.
  1006.      * @param buffer
  1007.      *            buffer to pull raw bytes from.
  1008.      * @param start
  1009.      *            first position within the buffer to take data from.
  1010.      * @param end
  1011.      *            one position past the last location within the buffer to take
  1012.      *            data from.
  1013.      * @return a string representation of the range <code>[start,end)</code>,
  1014.      *         after decoding the region through the specified character set.
  1015.      */
  1016.     public static String decode(final Charset cs, final byte[] buffer,
  1017.             final int start, final int end) {
  1018.         try {
  1019.             return decodeNoFallback(cs, buffer, start, end);
  1020.         } catch (CharacterCodingException e) {
  1021.             // Fall back to an ISO-8859-1 style encoding. At least all of
  1022.             // the bytes will be present in the output.
  1023.             //
  1024.             return extractBinaryString(buffer, start, end);
  1025.         }
  1026.     }

  1027.     /**
  1028.      * Decode a region of the buffer under the specified character set if
  1029.      * possible.
  1030.      *
  1031.      * If the byte stream cannot be decoded that way, the platform default is
  1032.      * tried and if that too fails, an exception is thrown.
  1033.      *
  1034.      * @param cs
  1035.      *            character set to use when decoding the buffer.
  1036.      * @param buffer
  1037.      *            buffer to pull raw bytes from.
  1038.      * @param start
  1039.      *            first position within the buffer to take data from.
  1040.      * @param end
  1041.      *            one position past the last location within the buffer to take
  1042.      *            data from.
  1043.      * @return a string representation of the range <code>[start,end)</code>,
  1044.      *         after decoding the region through the specified character set.
  1045.      * @throws java.nio.charset.CharacterCodingException
  1046.      *             the input is not in any of the tested character sets.
  1047.      */
  1048.     public static String decodeNoFallback(final Charset cs,
  1049.             final byte[] buffer, final int start, final int end)
  1050.             throws CharacterCodingException {
  1051.         ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
  1052.         b.mark();

  1053.         // Try our built-in favorite. The assumption here is that
  1054.         // decoding will fail if the data is not actually encoded
  1055.         // using that encoder.
  1056.         try {
  1057.             return decode(b, UTF_8);
  1058.         } catch (CharacterCodingException e) {
  1059.             b.reset();
  1060.         }

  1061.         if (!cs.equals(UTF_8)) {
  1062.             // Try the suggested encoding, it might be right since it was
  1063.             // provided by the caller.
  1064.             try {
  1065.                 return decode(b, cs);
  1066.             } catch (CharacterCodingException e) {
  1067.                 b.reset();
  1068.             }
  1069.         }

  1070.         // Try the default character set. A small group of people
  1071.         // might actually use the same (or very similar) locale.
  1072.         Charset defcs = Charset.defaultCharset();
  1073.         if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
  1074.             try {
  1075.                 return decode(b, defcs);
  1076.             } catch (CharacterCodingException e) {
  1077.                 b.reset();
  1078.             }
  1079.         }

  1080.         throw new CharacterCodingException();
  1081.     }

  1082.     /**
  1083.      * Decode a region of the buffer under the ISO-8859-1 encoding.
  1084.      *
  1085.      * Each byte is treated as a single character in the 8859-1 character
  1086.      * encoding, performing a raw binary-&gt;char conversion.
  1087.      *
  1088.      * @param buffer
  1089.      *            buffer to pull raw bytes from.
  1090.      * @param start
  1091.      *            first position within the buffer to take data from.
  1092.      * @param end
  1093.      *            one position past the last location within the buffer to take
  1094.      *            data from.
  1095.      * @return a string representation of the range <code>[start,end)</code>.
  1096.      */
  1097.     public static String extractBinaryString(final byte[] buffer,
  1098.             final int start, final int end) {
  1099.         final StringBuilder r = new StringBuilder(end - start);
  1100.         for (int i = start; i < end; i++)
  1101.             r.append((char) (buffer[i] & 0xff));
  1102.         return r.toString();
  1103.     }

  1104.     private static String decode(ByteBuffer b, Charset charset)
  1105.             throws CharacterCodingException {
  1106.         final CharsetDecoder d = charset.newDecoder();
  1107.         d.onMalformedInput(CodingErrorAction.REPORT);
  1108.         d.onUnmappableCharacter(CodingErrorAction.REPORT);
  1109.         return d.decode(b).toString();
  1110.     }

  1111.     /**
  1112.      * Locate the position of the commit message body.
  1113.      *
  1114.      * @param b
  1115.      *            buffer to scan.
  1116.      * @param ptr
  1117.      *            position in buffer to start the scan at. Most callers should
  1118.      *            pass 0 to ensure the scan starts from the beginning of the
  1119.      *            commit buffer.
  1120.      * @return position of the user's message buffer.
  1121.      */
  1122.     public static final int commitMessage(byte[] b, int ptr) {
  1123.         final int sz = b.length;
  1124.         if (ptr == 0)
  1125.             ptr += 46; // skip the "tree ..." line.
  1126.         while (ptr < sz && b[ptr] == 'p')
  1127.             ptr += 48; // skip this parent.

  1128.         // Skip any remaining header lines, ignoring what their actual
  1129.         // header line type is. This is identical to the logic for a tag.
  1130.         //
  1131.         return tagMessage(b, ptr);
  1132.     }

  1133.     /**
  1134.      * Locate the position of the tag message body.
  1135.      *
  1136.      * @param b
  1137.      *            buffer to scan.
  1138.      * @param ptr
  1139.      *            position in buffer to start the scan at. Most callers should
  1140.      *            pass 0 to ensure the scan starts from the beginning of the tag
  1141.      *            buffer.
  1142.      * @return position of the user's message buffer.
  1143.      */
  1144.     public static final int tagMessage(byte[] b, int ptr) {
  1145.         final int sz = b.length;
  1146.         if (ptr == 0)
  1147.             ptr += 48; // skip the "object ..." line.
  1148.         while (ptr < sz && b[ptr] != '\n')
  1149.             ptr = nextLF(b, ptr);
  1150.         if (ptr < sz && b[ptr] == '\n')
  1151.             return ptr + 1;
  1152.         return -1;
  1153.     }

  1154.     /**
  1155.      * Locate the end of a paragraph.
  1156.      * <p>
  1157.      * A paragraph is ended by two consecutive LF bytes or CRLF pairs
  1158.      *
  1159.      * @param b
  1160.      *            buffer to scan.
  1161.      * @param start
  1162.      *            position in buffer to start the scan at. Most callers will
  1163.      *            want to pass the first position of the commit message (as
  1164.      *            found by {@link #commitMessage(byte[], int)}.
  1165.      * @return position of the LF at the end of the paragraph;
  1166.      *         <code>b.length</code> if no paragraph end could be located.
  1167.      */
  1168.     public static final int endOfParagraph(byte[] b, int start) {
  1169.         int ptr = start;
  1170.         final int sz = b.length;
  1171.         while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
  1172.             ptr = nextLF(b, ptr);
  1173.         if (ptr > start && b[ptr - 1] == '\n')
  1174.             ptr--;
  1175.         if (ptr > start && b[ptr - 1] == '\r')
  1176.             ptr--;
  1177.         return ptr;
  1178.     }

  1179.     /**
  1180.      * Get last index of {@code ch} in raw, trimming spaces.
  1181.      *
  1182.      * @param raw
  1183.      *            buffer to scan.
  1184.      * @param ch
  1185.      *            character to find.
  1186.      * @param pos
  1187.      *            starting position.
  1188.      * @return last index of {@code ch} in raw, trimming spaces.
  1189.      * @since 4.1
  1190.      */
  1191.     public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
  1192.         while (pos >= 0 && raw[pos] == ' ')
  1193.             pos--;

  1194.         while (pos >= 0 && raw[pos] != ch)
  1195.             pos--;

  1196.         return pos;
  1197.     }

  1198.     private static Charset charsetForAlias(String name) {
  1199.         return encodingAliases.get(StringUtils.toLowerCase(name));
  1200.     }

    /**
     * Private constructor, so that code outside this class cannot create
     * instances of this final, static-only utility class.
     */
    private RawParseUtils() {
        // Don't create instances of a static only utility.
    }
  1204. }