View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4    * and other copyright owners as documented in the project's IP log.
5    *
6    * This program and the accompanying materials are made available
7    * under the terms of the Eclipse Distribution License v1.0 which
8    * accompanies this distribution, is reproduced below, and is
9    * available at http://www.eclipse.org/org/documents/edl-v10.php
10   *
11   * All rights reserved.
12   *
13   * Redistribution and use in source and binary forms, with or
14   * without modification, are permitted provided that the following
15   * conditions are met:
16   *
17   * - Redistributions of source code must retain the above copyright
18   *   notice, this list of conditions and the following disclaimer.
19   *
20   * - Redistributions in binary form must reproduce the above
21   *   copyright notice, this list of conditions and the following
22   *   disclaimer in the documentation and/or other materials provided
23   *   with the distribution.
24   *
25   * - Neither the name of the Eclipse Foundation, Inc. nor the
26   *   names of its contributors may be used to endorse or promote
27   *   products derived from this software without specific prior
28   *   written permission.
29   *
30   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43   */
44  
45  package org.eclipse.jgit.util;
46  
47  import static java.nio.charset.StandardCharsets.ISO_8859_1;
48  import static java.nio.charset.StandardCharsets.UTF_8;
49  import static org.eclipse.jgit.lib.ObjectChecker.author;
50  import static org.eclipse.jgit.lib.ObjectChecker.committer;
51  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53  
54  import java.nio.ByteBuffer;
55  import java.nio.charset.CharacterCodingException;
56  import java.nio.charset.Charset;
57  import java.nio.charset.CharsetDecoder;
58  import java.nio.charset.CodingErrorAction;
59  import java.nio.charset.IllegalCharsetNameException;
60  import java.nio.charset.UnsupportedCharsetException;
61  import java.util.Arrays;
62  import java.util.HashMap;
63  import java.util.Map;
64  
65  import org.eclipse.jgit.annotations.Nullable;
66  import org.eclipse.jgit.errors.BinaryBlobException;
67  import org.eclipse.jgit.lib.Constants;
68  import org.eclipse.jgit.lib.PersonIdent;
69  
70  /**
71   * Handy utility functions to parse raw object contents.
72   */
73  public final class RawParseUtils {
74  	/**
75  	 * UTF-8 charset constant.
76  	 *
77  	 * @since 2.2
78  	 */
79  	public static final Charset UTF8_CHARSET = UTF_8;
80  
81  	private static final byte[] digits10;
82  
83  	private static final byte[] digits16;
84  
85  	private static final byte[] footerLineKeyChars;
86  
87  	private static final Map<String, Charset> encodingAliases;
88  
89  	static {
90  		encodingAliases = new HashMap<>();
91  		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
92  		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
93  
94  		digits10 = new byte['9' + 1];
95  		Arrays.fill(digits10, (byte) -1);
96  		for (char i = '0'; i <= '9'; i++)
97  			digits10[i] = (byte) (i - '0');
98  
99  		digits16 = new byte['f' + 1];
100 		Arrays.fill(digits16, (byte) -1);
101 		for (char i = '0'; i <= '9'; i++)
102 			digits16[i] = (byte) (i - '0');
103 		for (char i = 'a'; i <= 'f'; i++)
104 			digits16[i] = (byte) ((i - 'a') + 10);
105 		for (char i = 'A'; i <= 'F'; i++)
106 			digits16[i] = (byte) ((i - 'A') + 10);
107 
108 		footerLineKeyChars = new byte['z' + 1];
109 		footerLineKeyChars['-'] = 1;
110 		for (char i = '0'; i <= '9'; i++)
111 			footerLineKeyChars[i] = 1;
112 		for (char i = 'A'; i <= 'Z'; i++)
113 			footerLineKeyChars[i] = 1;
114 		for (char i = 'a'; i <= 'z'; i++)
115 			footerLineKeyChars[i] = 1;
116 	}
117 
118 	/**
119 	 * Determine if b[ptr] matches src.
120 	 *
121 	 * @param b
122 	 *            the buffer to scan.
123 	 * @param ptr
124 	 *            first position within b, this should match src[0].
125 	 * @param src
126 	 *            the buffer to test for equality with b.
127 	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
128 	 */
129 	public static final int match(byte[] b, int ptr, byte[] src) {
130 		if (ptr + src.length > b.length)
131 			return -1;
132 		for (int i = 0; i < src.length; i++, ptr++)
133 			if (b[ptr] != src[i])
134 				return -1;
135 		return ptr;
136 	}
137 
138 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
139 			'6', '7', '8', '9' };
140 
141 	/**
142 	 * Format a base 10 numeric into a temporary buffer.
143 	 * <p>
144 	 * Formatting is performed backwards. The method starts at offset
145 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
146 	 * <code>digits</code> is the number of positions necessary to store the
147 	 * base 10 value.
148 	 * <p>
149 	 * The argument and return values from this method make it easy to chain
150 	 * writing, for example:
151 	 * </p>
152 	 *
153 	 * <pre>
154 	 * final byte[] tmp = new byte[64];
155 	 * int ptr = tmp.length;
156 	 * tmp[--ptr] = '\n';
157 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
158 	 * tmp[--ptr] = ' ';
159 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
160 	 * tmp[--ptr] = 0;
161 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
162 	 * </pre>
163 	 *
164 	 * @param b
165 	 *            buffer to write into.
166 	 * @param o
167 	 *            one offset past the location where writing will begin; writing
168 	 *            proceeds towards lower index values.
169 	 * @param value
170 	 *            the value to store.
171 	 * @return the new offset value <code>o</code>. This is the position of
172 	 *         the last byte written. Additional writing should start at one
173 	 *         position earlier.
174 	 */
175 	public static int formatBase10(final byte[] b, int o, int value) {
176 		if (value == 0) {
177 			b[--o] = '0';
178 			return o;
179 		}
180 		final boolean isneg = value < 0;
181 		if (isneg)
182 			value = -value;
183 		while (value != 0) {
184 			b[--o] = base10byte[value % 10];
185 			value /= 10;
186 		}
187 		if (isneg)
188 			b[--o] = '-';
189 		return o;
190 	}
191 
192 	/**
193 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
194 	 * <p>
195 	 * Digit sequences can begin with an optional run of spaces before the
196 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
197 	 * Any other characters will cause the method to stop and return the current
198 	 * result to the caller.
199 	 *
200 	 * @param b
201 	 *            buffer to scan.
202 	 * @param ptr
203 	 *            position within buffer to start parsing digits at.
204 	 * @param ptrResult
205 	 *            optional location to return the new ptr value through. If null
206 	 *            the ptr value will be discarded.
207 	 * @return the value at this location; 0 if the location is not a valid
208 	 *         numeric.
209 	 */
210 	public static final int parseBase10(final byte[] b, int ptr,
211 			final MutableInteger ptrResult) {
212 		int r = 0;
213 		int sign = 0;
214 		try {
215 			final int sz = b.length;
216 			while (ptr < sz && b[ptr] == ' ')
217 				ptr++;
218 			if (ptr >= sz)
219 				return 0;
220 
221 			switch (b[ptr]) {
222 			case '-':
223 				sign = -1;
224 				ptr++;
225 				break;
226 			case '+':
227 				ptr++;
228 				break;
229 			}
230 
231 			while (ptr < sz) {
232 				final byte v = digits10[b[ptr]];
233 				if (v < 0)
234 					break;
235 				r = (r * 10) + v;
236 				ptr++;
237 			}
238 		} catch (ArrayIndexOutOfBoundsException e) {
239 			// Not a valid digit.
240 		}
241 		if (ptrResult != null)
242 			ptrResult.value = ptr;
243 		return sign < 0 ? -r : r;
244 	}
245 
246 	/**
247 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
248 	 * <p>
249 	 * Digit sequences can begin with an optional run of spaces before the
250 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
251 	 * Any other characters will cause the method to stop and return the current
252 	 * result to the caller.
253 	 *
254 	 * @param b
255 	 *            buffer to scan.
256 	 * @param ptr
257 	 *            position within buffer to start parsing digits at.
258 	 * @param ptrResult
259 	 *            optional location to return the new ptr value through. If null
260 	 *            the ptr value will be discarded.
261 	 * @return the value at this location; 0 if the location is not a valid
262 	 *         numeric.
263 	 */
264 	public static final long parseLongBase10(final byte[] b, int ptr,
265 			final MutableInteger ptrResult) {
266 		long r = 0;
267 		int sign = 0;
268 		try {
269 			final int sz = b.length;
270 			while (ptr < sz && b[ptr] == ' ')
271 				ptr++;
272 			if (ptr >= sz)
273 				return 0;
274 
275 			switch (b[ptr]) {
276 			case '-':
277 				sign = -1;
278 				ptr++;
279 				break;
280 			case '+':
281 				ptr++;
282 				break;
283 			}
284 
285 			while (ptr < sz) {
286 				final byte v = digits10[b[ptr]];
287 				if (v < 0)
288 					break;
289 				r = (r * 10) + v;
290 				ptr++;
291 			}
292 		} catch (ArrayIndexOutOfBoundsException e) {
293 			// Not a valid digit.
294 		}
295 		if (ptrResult != null)
296 			ptrResult.value = ptr;
297 		return sign < 0 ? -r : r;
298 	}
299 
300 	/**
301 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
302 	 * <p>
303 	 * The number is read in network byte order, that is, most significant
304 	 * nybble first.
305 	 *
306 	 * @param bs
307 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
308 	 *            be parsed.
309 	 * @param p
310 	 *            first position within the buffer to parse.
311 	 * @return the integer value.
312 	 * @throws java.lang.ArrayIndexOutOfBoundsException
313 	 *             if the string is not hex formatted.
314 	 */
315 	public static final int parseHexInt16(final byte[] bs, final int p) {
316 		int r = digits16[bs[p]] << 4;
317 
318 		r |= digits16[bs[p + 1]];
319 		r <<= 4;
320 
321 		r |= digits16[bs[p + 2]];
322 		r <<= 4;
323 
324 		r |= digits16[bs[p + 3]];
325 		if (r < 0)
326 			throw new ArrayIndexOutOfBoundsException();
327 		return r;
328 	}
329 
330 	/**
331 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
332 	 * <p>
333 	 * The number is read in network byte order, that is, most significant
334 	 * nybble first.
335 	 *
336 	 * @param bs
337 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
338 	 *            be parsed.
339 	 * @param p
340 	 *            first position within the buffer to parse.
341 	 * @return the integer value.
342 	 * @throws java.lang.ArrayIndexOutOfBoundsException
343 	 *             if the string is not hex formatted.
344 	 */
345 	public static final int parseHexInt32(final byte[] bs, final int p) {
346 		int r = digits16[bs[p]] << 4;
347 
348 		r |= digits16[bs[p + 1]];
349 		r <<= 4;
350 
351 		r |= digits16[bs[p + 2]];
352 		r <<= 4;
353 
354 		r |= digits16[bs[p + 3]];
355 		r <<= 4;
356 
357 		r |= digits16[bs[p + 4]];
358 		r <<= 4;
359 
360 		r |= digits16[bs[p + 5]];
361 		r <<= 4;
362 
363 		r |= digits16[bs[p + 6]];
364 
365 		final int last = digits16[bs[p + 7]];
366 		if (r < 0 || last < 0)
367 			throw new ArrayIndexOutOfBoundsException();
368 		return (r << 4) | last;
369 	}
370 
371 	/**
372 	 * Parse 16 character base 16 (hex) formatted string to unsigned long.
373 	 * <p>
374 	 * The number is read in network byte order, that is, most significant
375 	 * nibble first.
376 	 *
377 	 * @param bs
378 	 *            buffer to parse digits from; positions {@code [p, p+16)} will
379 	 *            be parsed.
380 	 * @param p
381 	 *            first position within the buffer to parse.
382 	 * @return the integer value.
383 	 * @throws java.lang.ArrayIndexOutOfBoundsException
384 	 *             if the string is not hex formatted.
385 	 * @since 4.3
386 	 */
387 	public static final long parseHexInt64(final byte[] bs, final int p) {
388 		long r = digits16[bs[p]] << 4;
389 
390 		r |= digits16[bs[p + 1]];
391 		r <<= 4;
392 
393 		r |= digits16[bs[p + 2]];
394 		r <<= 4;
395 
396 		r |= digits16[bs[p + 3]];
397 		r <<= 4;
398 
399 		r |= digits16[bs[p + 4]];
400 		r <<= 4;
401 
402 		r |= digits16[bs[p + 5]];
403 		r <<= 4;
404 
405 		r |= digits16[bs[p + 6]];
406 		r <<= 4;
407 
408 		r |= digits16[bs[p + 7]];
409 		r <<= 4;
410 
411 		r |= digits16[bs[p + 8]];
412 		r <<= 4;
413 
414 		r |= digits16[bs[p + 9]];
415 		r <<= 4;
416 
417 		r |= digits16[bs[p + 10]];
418 		r <<= 4;
419 
420 		r |= digits16[bs[p + 11]];
421 		r <<= 4;
422 
423 		r |= digits16[bs[p + 12]];
424 		r <<= 4;
425 
426 		r |= digits16[bs[p + 13]];
427 		r <<= 4;
428 
429 		r |= digits16[bs[p + 14]];
430 
431 		final int last = digits16[bs[p + 15]];
432 		if (r < 0 || last < 0)
433 			throw new ArrayIndexOutOfBoundsException();
434 		return (r << 4) | last;
435 	}
436 
437 	/**
438 	 * Parse a single hex digit to its numeric value (0-15).
439 	 *
440 	 * @param digit
441 	 *            hex character to parse.
442 	 * @return numeric value, in the range 0-15.
443 	 * @throws java.lang.ArrayIndexOutOfBoundsException
444 	 *             if the input digit is not a valid hex digit.
445 	 */
446 	public static final int parseHexInt4(final byte digit) {
447 		final byte r = digits16[digit];
448 		if (r < 0)
449 			throw new ArrayIndexOutOfBoundsException();
450 		return r;
451 	}
452 
453 	/**
454 	 * Parse a Git style timezone string.
455 	 * <p>
456 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
457 	 * lower two positions count minutes, not 100ths of an hour.
458 	 *
459 	 * @param b
460 	 *            buffer to scan.
461 	 * @param ptr
462 	 *            position within buffer to start parsing digits at.
463 	 * @return the timezone at this location, expressed in minutes.
464 	 */
465 	public static final int parseTimeZoneOffset(byte[] b, int ptr) {
466 		return parseTimeZoneOffset(b, ptr, null);
467 	}
468 
469 	/**
470 	 * Parse a Git style timezone string.
471 	 * <p>
472 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
473 	 * lower two positions count minutes, not 100ths of an hour.
474 	 *
475 	 * @param b
476 	 *            buffer to scan.
477 	 * @param ptr
478 	 *            position within buffer to start parsing digits at.
479 	 * @param ptrResult
480 	 *            optional location to return the new ptr value through. If null
481 	 *            the ptr value will be discarded.
482 	 * @return the timezone at this location, expressed in minutes.
483 	 * @since 4.1
484 	 */
485 	public static final int parseTimeZoneOffset(final byte[] b, int ptr,
486 			MutableInteger ptrResult) {
487 		final int v = parseBase10(b, ptr, ptrResult);
488 		final int tzMins = v % 100;
489 		final int tzHours = v / 100;
490 		return tzHours * 60 + tzMins;
491 	}
492 
493 	/**
494 	 * Locate the first position after a given character.
495 	 *
496 	 * @param b
497 	 *            buffer to scan.
498 	 * @param ptr
499 	 *            position within buffer to start looking for chrA at.
500 	 * @param chrA
501 	 *            character to find.
502 	 * @return new position just after chrA.
503 	 */
504 	public static final int next(byte[] b, int ptr, char chrA) {
505 		final int sz = b.length;
506 		while (ptr < sz) {
507 			if (b[ptr++] == chrA)
508 				return ptr;
509 		}
510 		return ptr;
511 	}
512 
513 	/**
514 	 * Locate the first position after the next LF.
515 	 * <p>
516 	 * This method stops on the first '\n' it finds.
517 	 *
518 	 * @param b
519 	 *            buffer to scan.
520 	 * @param ptr
521 	 *            position within buffer to start looking for LF at.
522 	 * @return new position just after the first LF found.
523 	 */
524 	public static final int nextLF(byte[] b, int ptr) {
525 		return next(b, ptr, '\n');
526 	}
527 
528 	/**
529 	 * Locate the first position after either the given character or LF.
530 	 * <p>
531 	 * This method stops on the first match it finds from either chrA or '\n'.
532 	 *
533 	 * @param b
534 	 *            buffer to scan.
535 	 * @param ptr
536 	 *            position within buffer to start looking for chrA or LF at.
537 	 * @param chrA
538 	 *            character to find.
539 	 * @return new position just after the first chrA or LF to be found.
540 	 */
541 	public static final int nextLF(byte[] b, int ptr, char chrA) {
542 		final int sz = b.length;
543 		while (ptr < sz) {
544 			final byte c = b[ptr++];
545 			if (c == chrA || c == '\n')
546 				return ptr;
547 		}
548 		return ptr;
549 	}
550 
551 	/**
552 	 * Locate the end of the header.  Note that headers may be
553 	 * more than one line long.
554 	 * @param b
555 	 *            buffer to scan.
556 	 * @param ptr
557 	 *            position within buffer to start looking for the end-of-header.
558 	 * @return new position just after the header.  This is either
559 	 * b.length, or the index of the header's terminating newline.
560 	 * @since 5.1
561 	 */
562 	public static final int headerEnd(final byte[] b, int ptr) {
563 		final int sz = b.length;
564 		while (ptr < sz) {
565 			final byte c = b[ptr++];
566 			if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
567 				return ptr - 1;
568 			}
569 		}
570 		return ptr - 1;
571 	}
572 
573 	/**
574 	 * Find the start of the contents of a given header.
575 	 *
576 	 * @param b
577 	 *            buffer to scan.
578 	 * @param headerName
579 	 *            header to search for
580 	 * @param ptr
581 	 *            position within buffer to start looking for header at.
582 	 * @return new position at the start of the header's contents, -1 for
583 	 *         not found
584 	 * @since 5.1
585 	 */
586 	public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
587 		// Start by advancing to just past a LF or buffer start
588 		if (ptr != 0) {
589 			ptr = nextLF(b, ptr - 1);
590 		}
591 		while (ptr < b.length - (headerName.length + 1)) {
592 			boolean found = true;
593 			for (int i = 0; i < headerName.length; i++) {
594 				if (headerName[i] != b[ptr++]) {
595 					found = false;
596 					break;
597 				}
598 			}
599 			if (found && b[ptr++] == ' ') {
600 				return ptr;
601 			}
602 			ptr = nextLF(b, ptr);
603 		}
604 		return -1;
605 	}
606 
607 	/**
608 	 * Locate the first position before a given character.
609 	 *
610 	 * @param b
611 	 *            buffer to scan.
612 	 * @param ptr
613 	 *            position within buffer to start looking for chrA at.
614 	 * @param chrA
615 	 *            character to find.
616 	 * @return new position just before chrA, -1 for not found
617 	 */
618 	public static final int prev(byte[] b, int ptr, char chrA) {
619 		if (ptr == b.length)
620 			--ptr;
621 		while (ptr >= 0) {
622 			if (b[ptr--] == chrA)
623 				return ptr;
624 		}
625 		return ptr;
626 	}
627 
628 	/**
629 	 * Locate the first position before the previous LF.
630 	 * <p>
631 	 * This method stops on the first '\n' it finds.
632 	 *
633 	 * @param b
634 	 *            buffer to scan.
635 	 * @param ptr
636 	 *            position within buffer to start looking for LF at.
637 	 * @return new position just before the first LF found, -1 for not found
638 	 */
639 	public static final int prevLF(byte[] b, int ptr) {
640 		return prev(b, ptr, '\n');
641 	}
642 
643 	/**
644 	 * Locate the previous position before either the given character or LF.
645 	 * <p>
646 	 * This method stops on the first match it finds from either chrA or '\n'.
647 	 *
648 	 * @param b
649 	 *            buffer to scan.
650 	 * @param ptr
651 	 *            position within buffer to start looking for chrA or LF at.
652 	 * @param chrA
653 	 *            character to find.
654 	 * @return new position just before the first chrA or LF to be found, -1 for
655 	 *         not found
656 	 */
657 	public static final int prevLF(byte[] b, int ptr, char chrA) {
658 		if (ptr == b.length)
659 			--ptr;
660 		while (ptr >= 0) {
661 			final byte c = b[ptr--];
662 			if (c == chrA || c == '\n')
663 				return ptr;
664 		}
665 		return ptr;
666 	}
667 
668 	/**
669 	 * Index the region between <code>[ptr, end)</code> to find line starts.
670 	 * <p>
671 	 * The returned list is 1 indexed. Index 0 contains
672 	 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
673 	 * <p>
674 	 * Using a 1 indexed list means that line numbers can be directly accessed
675 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
676 	 * <code>ptr</code>.
677 	 * <p>
678 	 * The last element (index <code>map.size()-1</code>) always contains
679 	 * <code>end</code>.
680 	 * <p>
681 	 * If the data contains a '\0' anywhere, the whole region is considered
682 	 * binary and a LineMap corresponding to a single line is returned.
683 	 * </p>
684 	 *
685 	 * @param buf
686 	 *            buffer to scan.
687 	 * @param ptr
688 	 *            position within the buffer corresponding to the first byte of
689 	 *            line 1.
690 	 * @param end
691 	 *            1 past the end of the content within <code>buf</code>.
692 	 * @return a line map indicating the starting position of each line, or a
693 	 *         map representing the entire buffer as a single line if
694 	 *         <code>buf</code> contains a NUL byte.
695 	 */
696 	public static final IntList lineMap(byte[] buf, int ptr, int end) {
697 		IntList map = lineMapOrNull(buf, ptr, end);
698 		if (map == null) {
699 			map = new IntList(3);
700 			map.add(Integer.MIN_VALUE);
701 			map.add(ptr);
702 			map.add(end);
703 		}
704 		return map;
705 	}
706 
707 	/**
708 	 * Like {@link #lineMap(byte[], int, int)} but throw
709 	 * {@link BinaryBlobException} if a NUL byte is encountered.
710 	 *
711 	 * @param buf
712 	 *            buffer to scan.
713 	 * @param ptr
714 	 *            position within the buffer corresponding to the first byte of
715 	 *            line 1.
716 	 * @param end
717 	 *            1 past the end of the content within <code>buf</code>.
718 	 * @return a line map indicating the starting position of each line.
719 	 * @throws BinaryBlobException
720 	 *            if a NUL byte is found.
721 	 * @since 5.0
722 	 */
723 	public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
724 			throws BinaryBlobException {
725 		IntList map = lineMapOrNull(buf, ptr, end);
726 		if (map == null) {
727 			throw new BinaryBlobException();
728 		}
729 		return map;
730 	}
731 
732 	private static @Nullable IntList lineMapOrNull(byte[] buf, int ptr, int end) {
733 		// Experimentally derived from multiple source repositories
734 		// the average number of bytes/line is 36. Its a rough guess
735 		// to initially size our map close to the target.
736 		IntList map = new IntList((end - ptr) / 36);
737 		map.add(Integer.MIN_VALUE);
738 		boolean foundLF = true;
739 		for (; ptr < end; ptr++) {
740 			if (foundLF) {
741 				map.add(ptr);
742 			}
743 
744 			if (buf[ptr] == '\0') {
745 				return null;
746 			}
747 
748 			foundLF = (buf[ptr] == '\n');
749 		}
750 		map.add(end);
751 		return map;
752 	}
753 
754 	/**
755 	 * Locate the "author " header line data.
756 	 *
757 	 * @param b
758 	 *            buffer to scan.
759 	 * @param ptr
760 	 *            position in buffer to start the scan at. Most callers should
761 	 *            pass 0 to ensure the scan starts from the beginning of the
762 	 *            commit buffer and does not accidentally look at message body.
763 	 * @return position just after the space in "author ", so the first
764 	 *         character of the author's name. If no author header can be
765 	 *         located -1 is returned.
766 	 */
767 	public static final int author(byte[] b, int ptr) {
768 		final int sz = b.length;
769 		if (ptr == 0)
770 			ptr += 46; // skip the "tree ..." line.
771 		while (ptr < sz && b[ptr] == 'p')
772 			ptr += 48; // skip this parent.
773 		return match(b, ptr, author);
774 	}
775 
776 	/**
777 	 * Locate the "committer " header line data.
778 	 *
779 	 * @param b
780 	 *            buffer to scan.
781 	 * @param ptr
782 	 *            position in buffer to start the scan at. Most callers should
783 	 *            pass 0 to ensure the scan starts from the beginning of the
784 	 *            commit buffer and does not accidentally look at message body.
785 	 * @return position just after the space in "committer ", so the first
786 	 *         character of the committer's name. If no committer header can be
787 	 *         located -1 is returned.
788 	 */
789 	public static final int committer(byte[] b, int ptr) {
790 		final int sz = b.length;
791 		if (ptr == 0)
792 			ptr += 46; // skip the "tree ..." line.
793 		while (ptr < sz && b[ptr] == 'p')
794 			ptr += 48; // skip this parent.
795 		if (ptr < sz && b[ptr] == 'a')
796 			ptr = nextLF(b, ptr);
797 		return match(b, ptr, committer);
798 	}
799 
800 	/**
801 	 * Locate the "tagger " header line data.
802 	 *
803 	 * @param b
804 	 *            buffer to scan.
805 	 * @param ptr
806 	 *            position in buffer to start the scan at. Most callers should
807 	 *            pass 0 to ensure the scan starts from the beginning of the tag
808 	 *            buffer and does not accidentally look at message body.
809 	 * @return position just after the space in "tagger ", so the first
810 	 *         character of the tagger's name. If no tagger header can be
811 	 *         located -1 is returned.
812 	 */
813 	public static final int tagger(byte[] b, int ptr) {
814 		final int sz = b.length;
815 		if (ptr == 0)
816 			ptr += 48; // skip the "object ..." line.
817 		while (ptr < sz) {
818 			if (b[ptr] == '\n')
819 				return -1;
820 			final int m = match(b, ptr, tagger);
821 			if (m >= 0)
822 				return m;
823 			ptr = nextLF(b, ptr);
824 		}
825 		return -1;
826 	}
827 
828 	/**
829 	 * Locate the "encoding " header line.
830 	 *
831 	 * @param b
832 	 *            buffer to scan.
833 	 * @param ptr
834 	 *            position in buffer to start the scan at. Most callers should
835 	 *            pass 0 to ensure the scan starts from the beginning of the
836 	 *            buffer and does not accidentally look at the message body.
837 	 * @return position just after the space in "encoding ", so the first
838 	 *         character of the encoding's name. If no encoding header can be
839 	 *         located -1 is returned (and UTF-8 should be assumed).
840 	 */
841 	public static final int encoding(byte[] b, int ptr) {
842 		final int sz = b.length;
843 		while (ptr < sz) {
844 			if (b[ptr] == '\n')
845 				return -1;
846 			if (b[ptr] == 'e')
847 				break;
848 			ptr = nextLF(b, ptr);
849 		}
850 		return match(b, ptr, encoding);
851 	}
852 
853 	/**
854 	 * Parse the "encoding " header as a string.
855 	 * <p>
856 	 * Locates the "encoding " header (if present) and returns its value.
857 	 *
858 	 * @param b
859 	 *            buffer to scan.
860 	 * @return the encoding header as specified in the commit; null if the
861 	 *         header was not present and should be assumed.
862 	 * @since 4.2
863 	 */
864 	@Nullable
865 	public static String parseEncodingName(byte[] b) {
866 		int enc = encoding(b, 0);
867 		if (enc < 0) {
868 			return null;
869 		}
870 		int lf = nextLF(b, enc);
871 		return decode(UTF_8, b, enc, lf - 1);
872 	}
873 
874 	/**
875 	 * Parse the "encoding " header into a character set reference.
876 	 * <p>
877 	 * Locates the "encoding " header (if present) by first calling
878 	 * {@link #encoding(byte[], int)} and then returns the proper character set
879 	 * to apply to this buffer to evaluate its contents as character data.
880 	 * <p>
881 	 * If no encoding header is present {@code UTF-8} is assumed.
882 	 *
883 	 * @param b
884 	 *            buffer to scan.
885 	 * @return the Java character set representation. Never null.
886 	 * @throws IllegalCharsetNameException
887 	 *             if the character set requested by the encoding header is
888 	 *             malformed and unsupportable.
889 	 * @throws UnsupportedCharsetException
890 	 *             if the JRE does not support the character set requested by
891 	 *             the encoding header.
892 	 */
893 	public static Charset parseEncoding(byte[] b) {
894 		String enc = parseEncodingName(b);
895 		if (enc == null) {
896 			return UTF_8;
897 		}
898 
899 		String name = enc.trim();
900 		try {
901 			return Charset.forName(name);
902 		} catch (IllegalCharsetNameException
903 				| UnsupportedCharsetException badName) {
904 			Charset aliased = charsetForAlias(name);
905 			if (aliased != null) {
906 				return aliased;
907 			}
908 			throw badName;
909 		}
910 	}
911 
912 	/**
913 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
914 	 * <p>
915 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
916 	 * parsed name afterwards.
917 	 *
918 	 * @param in
919 	 *            the string to parse a name from.
920 	 * @return the parsed identity or null in case the identity could not be
921 	 *         parsed.
922 	 */
923 	public static PersonIdent parsePersonIdent(String in) {
924 		return parsePersonIdent(Constants.encode(in), 0);
925 	}
926 
927 	/**
928 	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
929 	 * <p>
930 	 * When passing in a value for <code>nameB</code> callers should use the
931 	 * return value of {@link #author(byte[], int)} or
932 	 * {@link #committer(byte[], int)}, as these methods provide the proper
933 	 * position within the buffer.
934 	 *
935 	 * @param raw
936 	 *            the buffer to parse character data from.
937 	 * @param nameB
938 	 *            first position of the identity information. This should be the
939 	 *            first position after the space which delimits the header field
940 	 *            name (e.g. "author" or "committer") from the rest of the
941 	 *            identity line.
942 	 * @return the parsed identity or null in case the identity could not be
943 	 *         parsed.
944 	 */
945 	public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
946 		Charset cs;
947 		try {
948 			cs = parseEncoding(raw);
949 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
950 			// Assume UTF-8 for person identities, usually this is correct.
951 			// If not decode() will fall back to the ISO-8859-1 encoding.
952 			cs = UTF_8;
953 		}
954 
955 		final int emailB = nextLF(raw, nameB, '<');
956 		final int emailE = nextLF(raw, emailB, '>');
957 		if (emailB >= raw.length || raw[emailB] == '\n' ||
958 				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
959 			return null;
960 
961 		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
962 				emailB - 2 : emailB - 1;
963 		final String name = decode(cs, raw, nameB, nameEnd);
964 		final String email = decode(cs, raw, emailB, emailE - 1);
965 
966 		// Start searching from end of line, as after first name-email pair,
967 		// another name-email pair may occur. We will ignore all kinds of
968 		// "junk" following the first email.
969 		//
970 		// We've to use (emailE - 1) for the case that raw[email] is LF,
971 		// otherwise we would run too far. "-2" is necessary to position
972 		// before the LF in case of LF termination resp. the penultimate
973 		// character if there is no trailing LF.
974 		final int tzBegin = lastIndexOfTrim(raw, ' ',
975 				nextLF(raw, emailE - 1) - 2) + 1;
976 		if (tzBegin <= emailE) // No time/zone, still valid
977 			return new PersonIdent(name, email, 0, 0);
978 
979 		final int whenBegin = Math.max(emailE,
980 				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
981 		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
982 			return new PersonIdent(name, email, 0, 0);
983 
984 		final long when = parseLongBase10(raw, whenBegin, null);
985 		final int tz = parseTimeZoneOffset(raw, tzBegin);
986 		return new PersonIdent(name, email, when * 1000L, tz);
987 	}
988 
989 	/**
990 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
991 	 * <p>
992 	 * When passing in a value for <code>nameB</code> callers should use the
993 	 * return value of {@link #author(byte[], int)} or
994 	 * {@link #committer(byte[], int)}, as these methods provide the proper
995 	 * position within the buffer.
996 	 *
997 	 * @param raw
998 	 *            the buffer to parse character data from.
999 	 * @param nameB
1000 	 *            first position of the identity information. This should be the
1001 	 *            first position after the space which delimits the header field
1002 	 *            name (e.g. "author" or "committer") from the rest of the
1003 	 *            identity line.
1004 	 * @return the parsed identity. Never null.
1005 	 */
1006 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
1007 			final int nameB) {
1008 		int stop = nextLF(raw, nameB);
1009 		int emailB = nextLF(raw, nameB, '<');
1010 		int emailE = nextLF(raw, emailB, '>');
1011 		final String name;
1012 		final String email;
1013 		if (emailE < stop) {
1014 			email = decode(raw, emailB, emailE - 1);
1015 		} else {
1016 			email = "invalid"; //$NON-NLS-1$
1017 		}
1018 		if (emailB < stop)
1019 			name = decode(raw, nameB, emailB - 2);
1020 		else
1021 			name = decode(raw, nameB, stop);
1022 
1023 		final MutableInteger ptrout = new MutableInteger();
1024 		long when;
1025 		int tz;
1026 		if (emailE < stop) {
1027 			when = parseLongBase10(raw, emailE + 1, ptrout);
1028 			tz = parseTimeZoneOffset(raw, ptrout.value);
1029 		} else {
1030 			when = 0;
1031 			tz = 0;
1032 		}
1033 		return new PersonIdent(name, email, when * 1000L, tz);
1034 	}
1035 
1036 	/**
1037 	 * Locate the end of a footer line key string.
1038 	 * <p>
1039 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
1040 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
1041 	 * the first ':'.
1042 	 * <p>
1043 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1044 	 * then this method returns -1.
1045 	 *
1046 	 * @param raw
1047 	 *            buffer to scan.
1048 	 * @param ptr
1049 	 *            first position within raw to consider as a footer line key.
1050 	 * @return position of the ':' which terminates the footer line key if this
1051 	 *         is otherwise a valid footer line key; otherwise -1.
1052 	 */
1053 	public static int endOfFooterLineKey(byte[] raw, int ptr) {
1054 		try {
1055 			for (;;) {
1056 				final byte c = raw[ptr];
1057 				if (footerLineKeyChars[c] == 0) {
1058 					if (c == ':')
1059 						return ptr;
1060 					return -1;
1061 				}
1062 				ptr++;
1063 			}
1064 		} catch (ArrayIndexOutOfBoundsException e) {
1065 			return -1;
1066 		}
1067 	}
1068 
1069 	/**
1070 	 * Decode a buffer under UTF-8, if possible.
1071 	 *
1072 	 * If the byte stream cannot be decoded that way, the platform default is tried
1073 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1074 	 *
1075 	 * @param buffer
1076 	 *            buffer to pull raw bytes from.
1077 	 * @return a string representation of the range <code>[start,end)</code>,
1078 	 *         after decoding the region through the specified character set.
1079 	 */
1080 	public static String decode(byte[] buffer) {
1081 		return decode(buffer, 0, buffer.length);
1082 	}
1083 
1084 	/**
1085 	 * Decode a buffer under UTF-8, if possible.
1086 	 *
1087 	 * If the byte stream cannot be decoded that way, the platform default is
1088 	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1089 	 *
1090 	 * @param buffer
1091 	 *            buffer to pull raw bytes from.
1092 	 * @param start
1093 	 *            start position in buffer
1094 	 * @param end
1095 	 *            one position past the last location within the buffer to take
1096 	 *            data from.
1097 	 * @return a string representation of the range <code>[start,end)</code>,
1098 	 *         after decoding the region through the specified character set.
1099 	 */
1100 	public static String decode(final byte[] buffer, final int start,
1101 			final int end) {
1102 		return decode(UTF_8, buffer, start, end);
1103 	}
1104 
1105 	/**
1106 	 * Decode a buffer under the specified character set if possible.
1107 	 *
1108 	 * If the byte stream cannot be decoded that way, the platform default is tried
1109 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1110 	 *
1111 	 * @param cs
1112 	 *            character set to use when decoding the buffer.
1113 	 * @param buffer
1114 	 *            buffer to pull raw bytes from.
1115 	 * @return a string representation of the range <code>[start,end)</code>,
1116 	 *         after decoding the region through the specified character set.
1117 	 */
1118 	public static String decode(Charset cs, byte[] buffer) {
1119 		return decode(cs, buffer, 0, buffer.length);
1120 	}
1121 
1122 	/**
1123 	 * Decode a region of the buffer under the specified character set if possible.
1124 	 *
1125 	 * If the byte stream cannot be decoded that way, the platform default is tried
1126 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1127 	 *
1128 	 * @param cs
1129 	 *            character set to use when decoding the buffer.
1130 	 * @param buffer
1131 	 *            buffer to pull raw bytes from.
1132 	 * @param start
1133 	 *            first position within the buffer to take data from.
1134 	 * @param end
1135 	 *            one position past the last location within the buffer to take
1136 	 *            data from.
1137 	 * @return a string representation of the range <code>[start,end)</code>,
1138 	 *         after decoding the region through the specified character set.
1139 	 */
1140 	public static String decode(final Charset cs, final byte[] buffer,
1141 			final int start, final int end) {
1142 		try {
1143 			return decodeNoFallback(cs, buffer, start, end);
1144 		} catch (CharacterCodingException e) {
1145 			// Fall back to an ISO-8859-1 style encoding. At least all of
1146 			// the bytes will be present in the output.
1147 			//
1148 			return extractBinaryString(buffer, start, end);
1149 		}
1150 	}
1151 
1152 	/**
1153 	 * Decode a region of the buffer under the specified character set if
1154 	 * possible.
1155 	 *
1156 	 * If the byte stream cannot be decoded that way, the platform default is
1157 	 * tried and if that too fails, an exception is thrown.
1158 	 *
1159 	 * @param cs
1160 	 *            character set to use when decoding the buffer.
1161 	 * @param buffer
1162 	 *            buffer to pull raw bytes from.
1163 	 * @param start
1164 	 *            first position within the buffer to take data from.
1165 	 * @param end
1166 	 *            one position past the last location within the buffer to take
1167 	 *            data from.
1168 	 * @return a string representation of the range <code>[start,end)</code>,
1169 	 *         after decoding the region through the specified character set.
1170 	 * @throws java.nio.charset.CharacterCodingException
1171 	 *             the input is not in any of the tested character sets.
1172 	 */
1173 	public static String decodeNoFallback(final Charset cs,
1174 			final byte[] buffer, final int start, final int end)
1175 			throws CharacterCodingException {
1176 		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1177 		b.mark();
1178 
1179 		// Try our built-in favorite. The assumption here is that
1180 		// decoding will fail if the data is not actually encoded
1181 		// using that encoder.
1182 		try {
1183 			return decode(b, UTF_8);
1184 		} catch (CharacterCodingException e) {
1185 			b.reset();
1186 		}
1187 
1188 		if (!cs.equals(UTF_8)) {
1189 			// Try the suggested encoding, it might be right since it was
1190 			// provided by the caller.
1191 			try {
1192 				return decode(b, cs);
1193 			} catch (CharacterCodingException e) {
1194 				b.reset();
1195 			}
1196 		}
1197 
1198 		// Try the default character set. A small group of people
1199 		// might actually use the same (or very similar) locale.
1200 		Charset defcs = Charset.defaultCharset();
1201 		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1202 			try {
1203 				return decode(b, defcs);
1204 			} catch (CharacterCodingException e) {
1205 				b.reset();
1206 			}
1207 		}
1208 
1209 		throw new CharacterCodingException();
1210 	}
1211 
1212 	/**
1213 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
1214 	 *
1215 	 * Each byte is treated as a single character in the 8859-1 character
1216 	 * encoding, performing a raw binary-&gt;char conversion.
1217 	 *
1218 	 * @param buffer
1219 	 *            buffer to pull raw bytes from.
1220 	 * @param start
1221 	 *            first position within the buffer to take data from.
1222 	 * @param end
1223 	 *            one position past the last location within the buffer to take
1224 	 *            data from.
1225 	 * @return a string representation of the range <code>[start,end)</code>.
1226 	 */
1227 	public static String extractBinaryString(final byte[] buffer,
1228 			final int start, final int end) {
1229 		final StringBuilder r = new StringBuilder(end - start);
1230 		for (int i = start; i < end; i++)
1231 			r.append((char) (buffer[i] & 0xff));
1232 		return r.toString();
1233 	}
1234 
1235 	private static String decode(ByteBuffer b, Charset charset)
1236 			throws CharacterCodingException {
1237 		final CharsetDecoder d = charset.newDecoder();
1238 		d.onMalformedInput(CodingErrorAction.REPORT);
1239 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1240 		return d.decode(b).toString();
1241 	}
1242 
1243 	/**
1244 	 * Locate the position of the commit message body.
1245 	 *
1246 	 * @param b
1247 	 *            buffer to scan.
1248 	 * @param ptr
1249 	 *            position in buffer to start the scan at. Most callers should
1250 	 *            pass 0 to ensure the scan starts from the beginning of the
1251 	 *            commit buffer.
1252 	 * @return position of the user's message buffer.
1253 	 */
1254 	public static final int commitMessage(byte[] b, int ptr) {
1255 		final int sz = b.length;
1256 		if (ptr == 0)
1257 			ptr += 46; // skip the "tree ..." line.
1258 		while (ptr < sz && b[ptr] == 'p')
1259 			ptr += 48; // skip this parent.
1260 
1261 		// Skip any remaining header lines, ignoring what their actual
1262 		// header line type is. This is identical to the logic for a tag.
1263 		//
1264 		return tagMessage(b, ptr);
1265 	}
1266 
1267 	/**
1268 	 * Locate the position of the tag message body.
1269 	 *
1270 	 * @param b
1271 	 *            buffer to scan.
1272 	 * @param ptr
1273 	 *            position in buffer to start the scan at. Most callers should
1274 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1275 	 *            buffer.
1276 	 * @return position of the user's message buffer.
1277 	 */
1278 	public static final int tagMessage(byte[] b, int ptr) {
1279 		final int sz = b.length;
1280 		if (ptr == 0)
1281 			ptr += 48; // skip the "object ..." line.
1282 		while (ptr < sz && b[ptr] != '\n')
1283 			ptr = nextLF(b, ptr);
1284 		if (ptr < sz && b[ptr] == '\n')
1285 			return ptr + 1;
1286 		return -1;
1287 	}
1288 
1289 	/**
1290 	 * Locate the end of a paragraph.
1291 	 * <p>
1292 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1293 	 *
1294 	 * @param b
1295 	 *            buffer to scan.
1296 	 * @param start
1297 	 *            position in buffer to start the scan at. Most callers will
1298 	 *            want to pass the first position of the commit message (as
1299 	 *            found by {@link #commitMessage(byte[], int)}.
1300 	 * @return position of the LF at the end of the paragraph;
1301 	 *         <code>b.length</code> if no paragraph end could be located.
1302 	 */
1303 	public static final int endOfParagraph(byte[] b, int start) {
1304 		int ptr = start;
1305 		final int sz = b.length;
1306 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1307 			ptr = nextLF(b, ptr);
1308 		if (ptr > start && b[ptr - 1] == '\n')
1309 			ptr--;
1310 		if (ptr > start && b[ptr - 1] == '\r')
1311 			ptr--;
1312 		return ptr;
1313 	}
1314 
1315 	/**
1316 	 * Get last index of {@code ch} in raw, trimming spaces.
1317 	 *
1318 	 * @param raw
1319 	 *            buffer to scan.
1320 	 * @param ch
1321 	 *            character to find.
1322 	 * @param pos
1323 	 *            starting position.
1324 	 * @return last index of {@code ch} in raw, trimming spaces.
1325 	 * @since 4.1
1326 	 */
1327 	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1328 		while (pos >= 0 && raw[pos] == ' ')
1329 			pos--;
1330 
1331 		while (pos >= 0 && raw[pos] != ch)
1332 			pos--;
1333 
1334 		return pos;
1335 	}
1336 
1337 	private static Charset charsetForAlias(String name) {
1338 		return encodingAliases.get(StringUtils.toLowerCase(name));
1339 	}
1340 
1341 	private RawParseUtils() {
1342 		// Don't create instances of a static only utility.
1343 	}
1344 }