View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4    * and other copyright owners as documented in the project's IP log.
5    *
6    * This program and the accompanying materials are made available
7    * under the terms of the Eclipse Distribution License v1.0 which
8    * accompanies this distribution, is reproduced below, and is
9    * available at http://www.eclipse.org/org/documents/edl-v10.php
10   *
11   * All rights reserved.
12   *
13   * Redistribution and use in source and binary forms, with or
14   * without modification, are permitted provided that the following
15   * conditions are met:
16   *
17   * - Redistributions of source code must retain the above copyright
18   *   notice, this list of conditions and the following disclaimer.
19   *
20   * - Redistributions in binary form must reproduce the above
21   *   copyright notice, this list of conditions and the following
22   *   disclaimer in the documentation and/or other materials provided
23   *   with the distribution.
24   *
25   * - Neither the name of the Eclipse Foundation, Inc. nor the
26   *   names of its contributors may be used to endorse or promote
27   *   products derived from this software without specific prior
28   *   written permission.
29   *
30   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43   */
44  
45  package org.eclipse.jgit.util;
46  
47  import static java.nio.charset.StandardCharsets.ISO_8859_1;
48  import static java.nio.charset.StandardCharsets.UTF_8;
49  import static org.eclipse.jgit.lib.ObjectChecker.author;
50  import static org.eclipse.jgit.lib.ObjectChecker.committer;
51  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53  
54  import java.nio.ByteBuffer;
55  import java.nio.charset.CharacterCodingException;
56  import java.nio.charset.Charset;
57  import java.nio.charset.CharsetDecoder;
58  import java.nio.charset.CodingErrorAction;
59  import java.nio.charset.IllegalCharsetNameException;
60  import java.nio.charset.UnsupportedCharsetException;
61  import java.util.Arrays;
62  import java.util.HashMap;
63  import java.util.Map;
64  
65  import org.eclipse.jgit.annotations.Nullable;
66  import org.eclipse.jgit.lib.Constants;
67  import org.eclipse.jgit.lib.PersonIdent;
68  
69  /** Handy utility functions to parse raw object contents. */
70  public final class RawParseUtils {
71  	/**
72  	 * UTF-8 charset constant.
73  	 *
74  	 * @since 2.2
75  	 */
76  	public static final Charset UTF8_CHARSET = UTF_8;
77  
78  	private static final byte[] digits10;
79  
80  	private static final byte[] digits16;
81  
82  	private static final byte[] footerLineKeyChars;
83  
84  	private static final Map<String, Charset> encodingAliases;
85  
86  	static {
87  		encodingAliases = new HashMap<>();
88  		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
89  		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
90  
91  		digits10 = new byte['9' + 1];
92  		Arrays.fill(digits10, (byte) -1);
93  		for (char i = '0'; i <= '9'; i++)
94  			digits10[i] = (byte) (i - '0');
95  
96  		digits16 = new byte['f' + 1];
97  		Arrays.fill(digits16, (byte) -1);
98  		for (char i = '0'; i <= '9'; i++)
99  			digits16[i] = (byte) (i - '0');
100 		for (char i = 'a'; i <= 'f'; i++)
101 			digits16[i] = (byte) ((i - 'a') + 10);
102 		for (char i = 'A'; i <= 'F'; i++)
103 			digits16[i] = (byte) ((i - 'A') + 10);
104 
105 		footerLineKeyChars = new byte['z' + 1];
106 		footerLineKeyChars['-'] = 1;
107 		for (char i = '0'; i <= '9'; i++)
108 			footerLineKeyChars[i] = 1;
109 		for (char i = 'A'; i <= 'Z'; i++)
110 			footerLineKeyChars[i] = 1;
111 		for (char i = 'a'; i <= 'z'; i++)
112 			footerLineKeyChars[i] = 1;
113 	}
114 
115 	/**
116 	 * Determine if b[ptr] matches src.
117 	 *
118 	 * @param b
119 	 *            the buffer to scan.
120 	 * @param ptr
121 	 *            first position within b, this should match src[0].
122 	 * @param src
123 	 *            the buffer to test for equality with b.
124 	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
125 	 */
126 	public static final int match(final byte[] b, int ptr, final byte[] src) {
127 		if (ptr + src.length > b.length)
128 			return -1;
129 		for (int i = 0; i < src.length; i++, ptr++)
130 			if (b[ptr] != src[i])
131 				return -1;
132 		return ptr;
133 	}
134 
135 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
136 			'6', '7', '8', '9' };
137 
138 	/**
139 	 * Format a base 10 numeric into a temporary buffer.
140 	 * <p>
141 	 * Formatting is performed backwards. The method starts at offset
142 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
143 	 * <code>digits</code> is the number of positions necessary to store the
144 	 * base 10 value.
145 	 * <p>
146 	 * The argument and return values from this method make it easy to chain
147 	 * writing, for example:
148 	 * </p>
149 	 *
150 	 * <pre>
151 	 * final byte[] tmp = new byte[64];
152 	 * int ptr = tmp.length;
153 	 * tmp[--ptr] = '\n';
154 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
155 	 * tmp[--ptr] = ' ';
156 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
157 	 * tmp[--ptr] = 0;
158 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
159 	 * </pre>
160 	 *
161 	 * @param b
162 	 *            buffer to write into.
163 	 * @param o
164 	 *            one offset past the location where writing will begin; writing
165 	 *            proceeds towards lower index values.
166 	 * @param value
167 	 *            the value to store.
168 	 * @return the new offset value <code>o</code>. This is the position of
169 	 *         the last byte written. Additional writing should start at one
170 	 *         position earlier.
171 	 */
172 	public static int formatBase10(final byte[] b, int o, int value) {
173 		if (value == 0) {
174 			b[--o] = '0';
175 			return o;
176 		}
177 		final boolean isneg = value < 0;
178 		if (isneg)
179 			value = -value;
180 		while (value != 0) {
181 			b[--o] = base10byte[value % 10];
182 			value /= 10;
183 		}
184 		if (isneg)
185 			b[--o] = '-';
186 		return o;
187 	}
188 
189 	/**
190 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
191 	 * <p>
192 	 * Digit sequences can begin with an optional run of spaces before the
193 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
194 	 * Any other characters will cause the method to stop and return the current
195 	 * result to the caller.
196 	 *
197 	 * @param b
198 	 *            buffer to scan.
199 	 * @param ptr
200 	 *            position within buffer to start parsing digits at.
201 	 * @param ptrResult
202 	 *            optional location to return the new ptr value through. If null
203 	 *            the ptr value will be discarded.
204 	 * @return the value at this location; 0 if the location is not a valid
205 	 *         numeric.
206 	 */
207 	public static final int parseBase10(final byte[] b, int ptr,
208 			final MutableInteger ptrResult) {
209 		int r = 0;
210 		int sign = 0;
211 		try {
212 			final int sz = b.length;
213 			while (ptr < sz && b[ptr] == ' ')
214 				ptr++;
215 			if (ptr >= sz)
216 				return 0;
217 
218 			switch (b[ptr]) {
219 			case '-':
220 				sign = -1;
221 				ptr++;
222 				break;
223 			case '+':
224 				ptr++;
225 				break;
226 			}
227 
228 			while (ptr < sz) {
229 				final byte v = digits10[b[ptr]];
230 				if (v < 0)
231 					break;
232 				r = (r * 10) + v;
233 				ptr++;
234 			}
235 		} catch (ArrayIndexOutOfBoundsException e) {
236 			// Not a valid digit.
237 		}
238 		if (ptrResult != null)
239 			ptrResult.value = ptr;
240 		return sign < 0 ? -r : r;
241 	}
242 
243 	/**
244 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
245 	 * <p>
246 	 * Digit sequences can begin with an optional run of spaces before the
247 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
248 	 * Any other characters will cause the method to stop and return the current
249 	 * result to the caller.
250 	 *
251 	 * @param b
252 	 *            buffer to scan.
253 	 * @param ptr
254 	 *            position within buffer to start parsing digits at.
255 	 * @param ptrResult
256 	 *            optional location to return the new ptr value through. If null
257 	 *            the ptr value will be discarded.
258 	 * @return the value at this location; 0 if the location is not a valid
259 	 *         numeric.
260 	 */
261 	public static final long parseLongBase10(final byte[] b, int ptr,
262 			final MutableInteger ptrResult) {
263 		long r = 0;
264 		int sign = 0;
265 		try {
266 			final int sz = b.length;
267 			while (ptr < sz && b[ptr] == ' ')
268 				ptr++;
269 			if (ptr >= sz)
270 				return 0;
271 
272 			switch (b[ptr]) {
273 			case '-':
274 				sign = -1;
275 				ptr++;
276 				break;
277 			case '+':
278 				ptr++;
279 				break;
280 			}
281 
282 			while (ptr < sz) {
283 				final byte v = digits10[b[ptr]];
284 				if (v < 0)
285 					break;
286 				r = (r * 10) + v;
287 				ptr++;
288 			}
289 		} catch (ArrayIndexOutOfBoundsException e) {
290 			// Not a valid digit.
291 		}
292 		if (ptrResult != null)
293 			ptrResult.value = ptr;
294 		return sign < 0 ? -r : r;
295 	}
296 
297 	/**
298 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
299 	 * <p>
300 	 * The number is read in network byte order, that is, most significant
301 	 * nybble first.
302 	 *
303 	 * @param bs
304 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
305 	 *            be parsed.
306 	 * @param p
307 	 *            first position within the buffer to parse.
308 	 * @return the integer value.
309 	 * @throws ArrayIndexOutOfBoundsException
310 	 *             if the string is not hex formatted.
311 	 */
312 	public static final int parseHexInt16(final byte[] bs, final int p) {
313 		int r = digits16[bs[p]] << 4;
314 
315 		r |= digits16[bs[p + 1]];
316 		r <<= 4;
317 
318 		r |= digits16[bs[p + 2]];
319 		r <<= 4;
320 
321 		r |= digits16[bs[p + 3]];
322 		if (r < 0)
323 			throw new ArrayIndexOutOfBoundsException();
324 		return r;
325 	}
326 
327 	/**
328 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
329 	 * <p>
330 	 * The number is read in network byte order, that is, most significant
331 	 * nybble first.
332 	 *
333 	 * @param bs
334 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
335 	 *            be parsed.
336 	 * @param p
337 	 *            first position within the buffer to parse.
338 	 * @return the integer value.
339 	 * @throws ArrayIndexOutOfBoundsException
340 	 *             if the string is not hex formatted.
341 	 */
342 	public static final int parseHexInt32(final byte[] bs, final int p) {
343 		int r = digits16[bs[p]] << 4;
344 
345 		r |= digits16[bs[p + 1]];
346 		r <<= 4;
347 
348 		r |= digits16[bs[p + 2]];
349 		r <<= 4;
350 
351 		r |= digits16[bs[p + 3]];
352 		r <<= 4;
353 
354 		r |= digits16[bs[p + 4]];
355 		r <<= 4;
356 
357 		r |= digits16[bs[p + 5]];
358 		r <<= 4;
359 
360 		r |= digits16[bs[p + 6]];
361 
362 		final int last = digits16[bs[p + 7]];
363 		if (r < 0 || last < 0)
364 			throw new ArrayIndexOutOfBoundsException();
365 		return (r << 4) | last;
366 	}
367 
368 	/**
369 	 * Parse 16 character base 16 (hex) formatted string to unsigned long.
370 	 * <p>
371 	 * The number is read in network byte order, that is, most significant
372 	 * nibble first.
373 	 *
374 	 * @param bs
375 	 *            buffer to parse digits from; positions {@code [p, p+16)} will
376 	 *            be parsed.
377 	 * @param p
378 	 *            first position within the buffer to parse.
379 	 * @return the integer value.
380 	 * @throws ArrayIndexOutOfBoundsException
381 	 *             if the string is not hex formatted.
382 	 * @since 4.3
383 	 */
384 	public static final long parseHexInt64(final byte[] bs, final int p) {
385 		long r = digits16[bs[p]] << 4;
386 
387 		r |= digits16[bs[p + 1]];
388 		r <<= 4;
389 
390 		r |= digits16[bs[p + 2]];
391 		r <<= 4;
392 
393 		r |= digits16[bs[p + 3]];
394 		r <<= 4;
395 
396 		r |= digits16[bs[p + 4]];
397 		r <<= 4;
398 
399 		r |= digits16[bs[p + 5]];
400 		r <<= 4;
401 
402 		r |= digits16[bs[p + 6]];
403 		r <<= 4;
404 
405 		r |= digits16[bs[p + 7]];
406 		r <<= 4;
407 
408 		r |= digits16[bs[p + 8]];
409 		r <<= 4;
410 
411 		r |= digits16[bs[p + 9]];
412 		r <<= 4;
413 
414 		r |= digits16[bs[p + 10]];
415 		r <<= 4;
416 
417 		r |= digits16[bs[p + 11]];
418 		r <<= 4;
419 
420 		r |= digits16[bs[p + 12]];
421 		r <<= 4;
422 
423 		r |= digits16[bs[p + 13]];
424 		r <<= 4;
425 
426 		r |= digits16[bs[p + 14]];
427 
428 		final int last = digits16[bs[p + 15]];
429 		if (r < 0 || last < 0)
430 			throw new ArrayIndexOutOfBoundsException();
431 		return (r << 4) | last;
432 	}
433 
434 	/**
435 	 * Parse a single hex digit to its numeric value (0-15).
436 	 *
437 	 * @param digit
438 	 *            hex character to parse.
439 	 * @return numeric value, in the range 0-15.
440 	 * @throws ArrayIndexOutOfBoundsException
441 	 *             if the input digit is not a valid hex digit.
442 	 */
443 	public static final int parseHexInt4(final byte digit) {
444 		final byte r = digits16[digit];
445 		if (r < 0)
446 			throw new ArrayIndexOutOfBoundsException();
447 		return r;
448 	}
449 
450 	/**
451 	 * Parse a Git style timezone string.
452 	 * <p>
453 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
454 	 * lower two positions count minutes, not 100ths of an hour.
455 	 *
456 	 * @param b
457 	 *            buffer to scan.
458 	 * @param ptr
459 	 *            position within buffer to start parsing digits at.
460 	 * @return the timezone at this location, expressed in minutes.
461 	 */
462 	public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
463 		return parseTimeZoneOffset(b, ptr, null);
464 	}
465 
466 	/**
467 	 * Parse a Git style timezone string.
468 	 * <p>
469 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
470 	 * lower two positions count minutes, not 100ths of an hour.
471 	 *
472 	 * @param b
473 	 *            buffer to scan.
474 	 * @param ptr
475 	 *            position within buffer to start parsing digits at.
476 	 * @param ptrResult
477 	 *            optional location to return the new ptr value through. If null
478 	 *            the ptr value will be discarded.
479 	 * @return the timezone at this location, expressed in minutes.
480 	 * @since 4.1
481 	 */
482 	public static final int parseTimeZoneOffset(final byte[] b, int ptr,
483 			MutableInteger ptrResult) {
484 		final int v = parseBase10(b, ptr, ptrResult);
485 		final int tzMins = v % 100;
486 		final int tzHours = v / 100;
487 		return tzHours * 60 + tzMins;
488 	}
489 
490 	/**
491 	 * Locate the first position after a given character.
492 	 *
493 	 * @param b
494 	 *            buffer to scan.
495 	 * @param ptr
496 	 *            position within buffer to start looking for chrA at.
497 	 * @param chrA
498 	 *            character to find.
499 	 * @return new position just after chrA.
500 	 */
501 	public static final int next(final byte[] b, int ptr, final char chrA) {
502 		final int sz = b.length;
503 		while (ptr < sz) {
504 			if (b[ptr++] == chrA)
505 				return ptr;
506 		}
507 		return ptr;
508 	}
509 
510 	/**
511 	 * Locate the first position after the next LF.
512 	 * <p>
513 	 * This method stops on the first '\n' it finds.
514 	 *
515 	 * @param b
516 	 *            buffer to scan.
517 	 * @param ptr
518 	 *            position within buffer to start looking for LF at.
519 	 * @return new position just after the first LF found.
520 	 */
521 	public static final int nextLF(final byte[] b, int ptr) {
522 		return next(b, ptr, '\n');
523 	}
524 
525 	/**
526 	 * Locate the first position after either the given character or LF.
527 	 * <p>
528 	 * This method stops on the first match it finds from either chrA or '\n'.
529 	 *
530 	 * @param b
531 	 *            buffer to scan.
532 	 * @param ptr
533 	 *            position within buffer to start looking for chrA or LF at.
534 	 * @param chrA
535 	 *            character to find.
536 	 * @return new position just after the first chrA or LF to be found.
537 	 */
538 	public static final int nextLF(final byte[] b, int ptr, final char chrA) {
539 		final int sz = b.length;
540 		while (ptr < sz) {
541 			final byte c = b[ptr++];
542 			if (c == chrA || c == '\n')
543 				return ptr;
544 		}
545 		return ptr;
546 	}
547 
548 	/**
549 	 * Locate the first position before a given character.
550 	 *
551 	 * @param b
552 	 *            buffer to scan.
553 	 * @param ptr
554 	 *            position within buffer to start looking for chrA at.
555 	 * @param chrA
556 	 *            character to find.
557 	 * @return new position just before chrA, -1 for not found
558 	 */
559 	public static final int prev(final byte[] b, int ptr, final char chrA) {
560 		if (ptr == b.length)
561 			--ptr;
562 		while (ptr >= 0) {
563 			if (b[ptr--] == chrA)
564 				return ptr;
565 		}
566 		return ptr;
567 	}
568 
569 	/**
570 	 * Locate the first position before the previous LF.
571 	 * <p>
572 	 * This method stops on the first '\n' it finds.
573 	 *
574 	 * @param b
575 	 *            buffer to scan.
576 	 * @param ptr
577 	 *            position within buffer to start looking for LF at.
578 	 * @return new position just before the first LF found, -1 for not found
579 	 */
580 	public static final int prevLF(final byte[] b, int ptr) {
581 		return prev(b, ptr, '\n');
582 	}
583 
584 	/**
585 	 * Locate the previous position before either the given character or LF.
586 	 * <p>
587 	 * This method stops on the first match it finds from either chrA or '\n'.
588 	 *
589 	 * @param b
590 	 *            buffer to scan.
591 	 * @param ptr
592 	 *            position within buffer to start looking for chrA or LF at.
593 	 * @param chrA
594 	 *            character to find.
595 	 * @return new position just before the first chrA or LF to be found, -1 for
596 	 *         not found
597 	 */
598 	public static final int prevLF(final byte[] b, int ptr, final char chrA) {
599 		if (ptr == b.length)
600 			--ptr;
601 		while (ptr >= 0) {
602 			final byte c = b[ptr--];
603 			if (c == chrA || c == '\n')
604 				return ptr;
605 		}
606 		return ptr;
607 	}
608 
609 	/**
610 	 * Index the region between <code>[ptr, end)</code> to find line starts.
611 	 * <p>
612 	 * The returned list is 1 indexed. Index 0 contains
613 	 * {@link Integer#MIN_VALUE} to pad the list out.
614 	 * <p>
615 	 * Using a 1 indexed list means that line numbers can be directly accessed
616 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
617 	 * <code>ptr</code>.
618 	 * <p>
619 	 * The last element (index <code>map.size()-1</code>) always contains
620 	 * <code>end</code>.
621 	 *
622 	 * @param buf
623 	 *            buffer to scan.
624 	 * @param ptr
625 	 *            position within the buffer corresponding to the first byte of
626 	 *            line 1.
627 	 * @param end
628 	 *            1 past the end of the content within <code>buf</code>.
629 	 * @return a line map indexing the start position of each line.
630 	 */
631 	public static final IntList lineMap(final byte[] buf, int ptr, int end) {
632 		// Experimentally derived from multiple source repositories
633 		// the average number of bytes/line is 36. Its a rough guess
634 		// to initially size our map close to the target.
635 		//
636 		final IntList map = new IntList((end - ptr) / 36);
637 		map.fillTo(1, Integer.MIN_VALUE);
638 		for (; ptr < end; ptr = nextLF(buf, ptr))
639 			map.add(ptr);
640 		map.add(end);
641 		return map;
642 	}
643 
644 	/**
645 	 * Locate the "author " header line data.
646 	 *
647 	 * @param b
648 	 *            buffer to scan.
649 	 * @param ptr
650 	 *            position in buffer to start the scan at. Most callers should
651 	 *            pass 0 to ensure the scan starts from the beginning of the
652 	 *            commit buffer and does not accidentally look at message body.
653 	 * @return position just after the space in "author ", so the first
654 	 *         character of the author's name. If no author header can be
655 	 *         located -1 is returned.
656 	 */
657 	public static final int author(final byte[] b, int ptr) {
658 		final int sz = b.length;
659 		if (ptr == 0)
660 			ptr += 46; // skip the "tree ..." line.
661 		while (ptr < sz && b[ptr] == 'p')
662 			ptr += 48; // skip this parent.
663 		return match(b, ptr, author);
664 	}
665 
666 	/**
667 	 * Locate the "committer " header line data.
668 	 *
669 	 * @param b
670 	 *            buffer to scan.
671 	 * @param ptr
672 	 *            position in buffer to start the scan at. Most callers should
673 	 *            pass 0 to ensure the scan starts from the beginning of the
674 	 *            commit buffer and does not accidentally look at message body.
675 	 * @return position just after the space in "committer ", so the first
676 	 *         character of the committer's name. If no committer header can be
677 	 *         located -1 is returned.
678 	 */
679 	public static final int committer(final byte[] b, int ptr) {
680 		final int sz = b.length;
681 		if (ptr == 0)
682 			ptr += 46; // skip the "tree ..." line.
683 		while (ptr < sz && b[ptr] == 'p')
684 			ptr += 48; // skip this parent.
685 		if (ptr < sz && b[ptr] == 'a')
686 			ptr = nextLF(b, ptr);
687 		return match(b, ptr, committer);
688 	}
689 
690 	/**
691 	 * Locate the "tagger " header line data.
692 	 *
693 	 * @param b
694 	 *            buffer to scan.
695 	 * @param ptr
696 	 *            position in buffer to start the scan at. Most callers should
697 	 *            pass 0 to ensure the scan starts from the beginning of the tag
698 	 *            buffer and does not accidentally look at message body.
699 	 * @return position just after the space in "tagger ", so the first
700 	 *         character of the tagger's name. If no tagger header can be
701 	 *         located -1 is returned.
702 	 */
703 	public static final int tagger(final byte[] b, int ptr) {
704 		final int sz = b.length;
705 		if (ptr == 0)
706 			ptr += 48; // skip the "object ..." line.
707 		while (ptr < sz) {
708 			if (b[ptr] == '\n')
709 				return -1;
710 			final int m = match(b, ptr, tagger);
711 			if (m >= 0)
712 				return m;
713 			ptr = nextLF(b, ptr);
714 		}
715 		return -1;
716 	}
717 
718 	/**
719 	 * Locate the "encoding " header line.
720 	 *
721 	 * @param b
722 	 *            buffer to scan.
723 	 * @param ptr
724 	 *            position in buffer to start the scan at. Most callers should
725 	 *            pass 0 to ensure the scan starts from the beginning of the
726 	 *            buffer and does not accidentally look at the message body.
727 	 * @return position just after the space in "encoding ", so the first
728 	 *         character of the encoding's name. If no encoding header can be
729 	 *         located -1 is returned (and UTF-8 should be assumed).
730 	 */
731 	public static final int encoding(final byte[] b, int ptr) {
732 		final int sz = b.length;
733 		while (ptr < sz) {
734 			if (b[ptr] == '\n')
735 				return -1;
736 			if (b[ptr] == 'e')
737 				break;
738 			ptr = nextLF(b, ptr);
739 		}
740 		return match(b, ptr, encoding);
741 	}
742 
743 	/**
744 	 * Parse the "encoding " header as a string.
745 	 * <p>
746 	 * Locates the "encoding " header (if present) and returns its value.
747 	 *
748 	 * @param b
749 	 *            buffer to scan.
750 	 * @return the encoding header as specified in the commit; null if the
751 	 *         header was not present and should be assumed.
752 	 * @since 4.2
753 	 */
754 	@Nullable
755 	public static String parseEncodingName(final byte[] b) {
756 		int enc = encoding(b, 0);
757 		if (enc < 0) {
758 			return null;
759 		}
760 		int lf = nextLF(b, enc);
761 		return decode(UTF_8, b, enc, lf - 1);
762 	}
763 
764 	/**
765 	 * Parse the "encoding " header into a character set reference.
766 	 * <p>
767 	 * Locates the "encoding " header (if present) by first calling
768 	 * {@link #encoding(byte[], int)} and then returns the proper character set
769 	 * to apply to this buffer to evaluate its contents as character data.
770 	 * <p>
771 	 * If no encoding header is present {@code UTF-8} is assumed.
772 	 *
773 	 * @param b
774 	 *            buffer to scan.
775 	 * @return the Java character set representation. Never null.
776 	 * @throws IllegalCharsetNameException
777 	 *             if the character set requested by the encoding header is
778 	 *             malformed and unsupportable.
779 	 * @throws UnsupportedCharsetException
780 	 *             if the JRE does not support the character set requested by
781 	 *             the encoding header.
782 	 */
783 	public static Charset parseEncoding(final byte[] b) {
784 		String enc = parseEncodingName(b);
785 		if (enc == null) {
786 			return UTF_8;
787 		}
788 
789 		String name = enc.trim();
790 		try {
791 			return Charset.forName(name);
792 		} catch (IllegalCharsetNameException
793 				| UnsupportedCharsetException badName) {
794 			Charset aliased = charsetForAlias(name);
795 			if (aliased != null) {
796 				return aliased;
797 			}
798 			throw badName;
799 		}
800 	}
801 
802 	/**
803 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
804 	 * <p>
805 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
806 	 * parsed name afterwards.
807 	 *
808 	 * @param in
809 	 *            the string to parse a name from.
810 	 * @return the parsed identity or null in case the identity could not be
811 	 *         parsed.
812 	 */
813 	public static PersonIdent parsePersonIdent(final String in) {
814 		return parsePersonIdent(Constants.encode(in), 0);
815 	}
816 
817 	/**
818 	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
819 	 * <p>
820 	 * When passing in a value for <code>nameB</code> callers should use the
821 	 * return value of {@link #author(byte[], int)} or
822 	 * {@link #committer(byte[], int)}, as these methods provide the proper
823 	 * position within the buffer.
824 	 *
825 	 * @param raw
826 	 *            the buffer to parse character data from.
827 	 * @param nameB
828 	 *            first position of the identity information. This should be the
829 	 *            first position after the space which delimits the header field
830 	 *            name (e.g. "author" or "committer") from the rest of the
831 	 *            identity line.
832 	 * @return the parsed identity or null in case the identity could not be
833 	 *         parsed.
834 	 */
835 	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
836 		Charset cs;
837 		try {
838 			cs = parseEncoding(raw);
839 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
840 			// Assume UTF-8 for person identities, usually this is correct.
841 			// If not decode() will fall back to the ISO-8859-1 encoding.
842 			cs = UTF_8;
843 		}
844 
845 		final int emailB = nextLF(raw, nameB, '<');
846 		final int emailE = nextLF(raw, emailB, '>');
847 		if (emailB >= raw.length || raw[emailB] == '\n' ||
848 				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
849 			return null;
850 
851 		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
852 				emailB - 2 : emailB - 1;
853 		final String name = decode(cs, raw, nameB, nameEnd);
854 		final String email = decode(cs, raw, emailB, emailE - 1);
855 
856 		// Start searching from end of line, as after first name-email pair,
857 		// another name-email pair may occur. We will ignore all kinds of
858 		// "junk" following the first email.
859 		//
860 		// We've to use (emailE - 1) for the case that raw[email] is LF,
861 		// otherwise we would run too far. "-2" is necessary to position
862 		// before the LF in case of LF termination resp. the penultimate
863 		// character if there is no trailing LF.
864 		final int tzBegin = lastIndexOfTrim(raw, ' ',
865 				nextLF(raw, emailE - 1) - 2) + 1;
866 		if (tzBegin <= emailE) // No time/zone, still valid
867 			return new PersonIdent(name, email, 0, 0);
868 
869 		final int whenBegin = Math.max(emailE,
870 				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
871 		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
872 			return new PersonIdent(name, email, 0, 0);
873 
874 		final long when = parseLongBase10(raw, whenBegin, null);
875 		final int tz = parseTimeZoneOffset(raw, tzBegin);
876 		return new PersonIdent(name, email, when * 1000L, tz);
877 	}
878 
879 	/**
880 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
881 	 * <p>
882 	 * When passing in a value for <code>nameB</code> callers should use the
883 	 * return value of {@link #author(byte[], int)} or
884 	 * {@link #committer(byte[], int)}, as these methods provide the proper
885 	 * position within the buffer.
886 	 *
887 	 * @param raw
888 	 *            the buffer to parse character data from.
889 	 * @param nameB
890 	 *            first position of the identity information. This should be the
891 	 *            first position after the space which delimits the header field
892 	 *            name (e.g. "author" or "committer") from the rest of the
893 	 *            identity line.
894 	 * @return the parsed identity. Never null.
895 	 */
896 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
897 			final int nameB) {
898 		int stop = nextLF(raw, nameB);
899 		int emailB = nextLF(raw, nameB, '<');
900 		int emailE = nextLF(raw, emailB, '>');
901 		final String name;
902 		final String email;
903 		if (emailE < stop) {
904 			email = decode(raw, emailB, emailE - 1);
905 		} else {
906 			email = "invalid"; //$NON-NLS-1$
907 		}
908 		if (emailB < stop)
909 			name = decode(raw, nameB, emailB - 2);
910 		else
911 			name = decode(raw, nameB, stop);
912 
913 		final MutableInteger ptrout = new MutableInteger();
914 		long when;
915 		int tz;
916 		if (emailE < stop) {
917 			when = parseLongBase10(raw, emailE + 1, ptrout);
918 			tz = parseTimeZoneOffset(raw, ptrout.value);
919 		} else {
920 			when = 0;
921 			tz = 0;
922 		}
923 		return new PersonIdent(name, email, when * 1000L, tz);
924 	}
925 
926 	/**
927 	 * Locate the end of a footer line key string.
928 	 * <p>
929 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
930 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
931 	 * the first ':'.
932 	 * <p>
933 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
934 	 * then this method returns -1.
935 	 *
936 	 * @param raw
937 	 *            buffer to scan.
938 	 * @param ptr
939 	 *            first position within raw to consider as a footer line key.
940 	 * @return position of the ':' which terminates the footer line key if this
941 	 *         is otherwise a valid footer line key; otherwise -1.
942 	 */
943 	public static int endOfFooterLineKey(final byte[] raw, int ptr) {
944 		try {
945 			for (;;) {
946 				final byte c = raw[ptr];
947 				if (footerLineKeyChars[c] == 0) {
948 					if (c == ':')
949 						return ptr;
950 					return -1;
951 				}
952 				ptr++;
953 			}
954 		} catch (ArrayIndexOutOfBoundsException e) {
955 			return -1;
956 		}
957 	}
958 
959 	/**
960 	 * Decode a buffer under UTF-8, if possible.
961 	 *
962 	 * If the byte stream cannot be decoded that way, the platform default is tried
963 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
964 	 *
965 	 * @param buffer
966 	 *            buffer to pull raw bytes from.
967 	 * @return a string representation of the range <code>[start,end)</code>,
968 	 *         after decoding the region through the specified character set.
969 	 */
970 	public static String decode(final byte[] buffer) {
971 		return decode(buffer, 0, buffer.length);
972 	}
973 
974 	/**
975 	 * Decode a buffer under UTF-8, if possible.
976 	 *
977 	 * If the byte stream cannot be decoded that way, the platform default is
978 	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
979 	 *
980 	 * @param buffer
981 	 *            buffer to pull raw bytes from.
982 	 * @param start
983 	 *            start position in buffer
984 	 * @param end
985 	 *            one position past the last location within the buffer to take
986 	 *            data from.
987 	 * @return a string representation of the range <code>[start,end)</code>,
988 	 *         after decoding the region through the specified character set.
989 	 */
990 	public static String decode(final byte[] buffer, final int start,
991 			final int end) {
992 		return decode(UTF_8, buffer, start, end);
993 	}
994 
995 	/**
996 	 * Decode a buffer under the specified character set if possible.
997 	 *
998 	 * If the byte stream cannot be decoded that way, the platform default is tried
999 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1000 	 *
1001 	 * @param cs
1002 	 *            character set to use when decoding the buffer.
1003 	 * @param buffer
1004 	 *            buffer to pull raw bytes from.
1005 	 * @return a string representation of the range <code>[start,end)</code>,
1006 	 *         after decoding the region through the specified character set.
1007 	 */
1008 	public static String decode(final Charset cs, final byte[] buffer) {
1009 		return decode(cs, buffer, 0, buffer.length);
1010 	}
1011 
1012 	/**
1013 	 * Decode a region of the buffer under the specified character set if possible.
1014 	 *
1015 	 * If the byte stream cannot be decoded that way, the platform default is tried
1016 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1017 	 *
1018 	 * @param cs
1019 	 *            character set to use when decoding the buffer.
1020 	 * @param buffer
1021 	 *            buffer to pull raw bytes from.
1022 	 * @param start
1023 	 *            first position within the buffer to take data from.
1024 	 * @param end
1025 	 *            one position past the last location within the buffer to take
1026 	 *            data from.
1027 	 * @return a string representation of the range <code>[start,end)</code>,
1028 	 *         after decoding the region through the specified character set.
1029 	 */
1030 	public static String decode(final Charset cs, final byte[] buffer,
1031 			final int start, final int end) {
1032 		try {
1033 			return decodeNoFallback(cs, buffer, start, end);
1034 		} catch (CharacterCodingException e) {
1035 			// Fall back to an ISO-8859-1 style encoding. At least all of
1036 			// the bytes will be present in the output.
1037 			//
1038 			return extractBinaryString(buffer, start, end);
1039 		}
1040 	}
1041 
1042 	/**
1043 	 * Decode a region of the buffer under the specified character set if
1044 	 * possible.
1045 	 *
1046 	 * If the byte stream cannot be decoded that way, the platform default is
1047 	 * tried and if that too fails, an exception is thrown.
1048 	 *
1049 	 * @param cs
1050 	 *            character set to use when decoding the buffer.
1051 	 * @param buffer
1052 	 *            buffer to pull raw bytes from.
1053 	 * @param start
1054 	 *            first position within the buffer to take data from.
1055 	 * @param end
1056 	 *            one position past the last location within the buffer to take
1057 	 *            data from.
1058 	 * @return a string representation of the range <code>[start,end)</code>,
1059 	 *         after decoding the region through the specified character set.
1060 	 * @throws CharacterCodingException
1061 	 *             the input is not in any of the tested character sets.
1062 	 */
1063 	public static String decodeNoFallback(final Charset cs,
1064 			final byte[] buffer, final int start, final int end)
1065 			throws CharacterCodingException {
1066 		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1067 		b.mark();
1068 
1069 		// Try our built-in favorite. The assumption here is that
1070 		// decoding will fail if the data is not actually encoded
1071 		// using that encoder.
1072 		try {
1073 			return decode(b, UTF_8);
1074 		} catch (CharacterCodingException e) {
1075 			b.reset();
1076 		}
1077 
1078 		if (!cs.equals(UTF_8)) {
1079 			// Try the suggested encoding, it might be right since it was
1080 			// provided by the caller.
1081 			try {
1082 				return decode(b, cs);
1083 			} catch (CharacterCodingException e) {
1084 				b.reset();
1085 			}
1086 		}
1087 
1088 		// Try the default character set. A small group of people
1089 		// might actually use the same (or very similar) locale.
1090 		Charset defcs = Charset.defaultCharset();
1091 		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1092 			try {
1093 				return decode(b, defcs);
1094 			} catch (CharacterCodingException e) {
1095 				b.reset();
1096 			}
1097 		}
1098 
1099 		throw new CharacterCodingException();
1100 	}
1101 
1102 	/**
1103 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
1104 	 *
1105 	 * Each byte is treated as a single character in the 8859-1 character
1106 	 * encoding, performing a raw binary-&gt;char conversion.
1107 	 *
1108 	 * @param buffer
1109 	 *            buffer to pull raw bytes from.
1110 	 * @param start
1111 	 *            first position within the buffer to take data from.
1112 	 * @param end
1113 	 *            one position past the last location within the buffer to take
1114 	 *            data from.
1115 	 * @return a string representation of the range <code>[start,end)</code>.
1116 	 */
1117 	public static String extractBinaryString(final byte[] buffer,
1118 			final int start, final int end) {
1119 		final StringBuilder r = new StringBuilder(end - start);
1120 		for (int i = start; i < end; i++)
1121 			r.append((char) (buffer[i] & 0xff));
1122 		return r.toString();
1123 	}
1124 
1125 	private static String decode(final ByteBuffer b, final Charset charset)
1126 			throws CharacterCodingException {
1127 		final CharsetDecoder d = charset.newDecoder();
1128 		d.onMalformedInput(CodingErrorAction.REPORT);
1129 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1130 		return d.decode(b).toString();
1131 	}
1132 
1133 	/**
1134 	 * Locate the position of the commit message body.
1135 	 *
1136 	 * @param b
1137 	 *            buffer to scan.
1138 	 * @param ptr
1139 	 *            position in buffer to start the scan at. Most callers should
1140 	 *            pass 0 to ensure the scan starts from the beginning of the
1141 	 *            commit buffer.
1142 	 * @return position of the user's message buffer.
1143 	 */
1144 	public static final int commitMessage(final byte[] b, int ptr) {
1145 		final int sz = b.length;
1146 		if (ptr == 0)
1147 			ptr += 46; // skip the "tree ..." line.
1148 		while (ptr < sz && b[ptr] == 'p')
1149 			ptr += 48; // skip this parent.
1150 
1151 		// Skip any remaining header lines, ignoring what their actual
1152 		// header line type is. This is identical to the logic for a tag.
1153 		//
1154 		return tagMessage(b, ptr);
1155 	}
1156 
1157 	/**
1158 	 * Locate the position of the tag message body.
1159 	 *
1160 	 * @param b
1161 	 *            buffer to scan.
1162 	 * @param ptr
1163 	 *            position in buffer to start the scan at. Most callers should
1164 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1165 	 *            buffer.
1166 	 * @return position of the user's message buffer.
1167 	 */
1168 	public static final int tagMessage(final byte[] b, int ptr) {
1169 		final int sz = b.length;
1170 		if (ptr == 0)
1171 			ptr += 48; // skip the "object ..." line.
1172 		while (ptr < sz && b[ptr] != '\n')
1173 			ptr = nextLF(b, ptr);
1174 		if (ptr < sz && b[ptr] == '\n')
1175 			return ptr + 1;
1176 		return -1;
1177 	}
1178 
1179 	/**
1180 	 * Locate the end of a paragraph.
1181 	 * <p>
1182 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1183 	 *
1184 	 * @param b
1185 	 *            buffer to scan.
1186 	 * @param start
1187 	 *            position in buffer to start the scan at. Most callers will
1188 	 *            want to pass the first position of the commit message (as
1189 	 *            found by {@link #commitMessage(byte[], int)}.
1190 	 * @return position of the LF at the end of the paragraph;
1191 	 *         <code>b.length</code> if no paragraph end could be located.
1192 	 */
1193 	public static final int endOfParagraph(final byte[] b, final int start) {
1194 		int ptr = start;
1195 		final int sz = b.length;
1196 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1197 			ptr = nextLF(b, ptr);
1198 		if (ptr > start && b[ptr - 1] == '\n')
1199 			ptr--;
1200 		if (ptr > start && b[ptr - 1] == '\r')
1201 			ptr--;
1202 		return ptr;
1203 	}
1204 
1205 	/**
1206 	 * @param raw
1207 	 *            buffer to scan.
1208 	 * @param ch
1209 	 *            character to find.
1210 	 * @param pos
1211 	 *            starting position.
1212 	 * @return last index of ch in raw, trimming spaces.
1213 	 * @since 4.1
1214 	 */
1215 	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1216 		while (pos >= 0 && raw[pos] == ' ')
1217 			pos--;
1218 
1219 		while (pos >= 0 && raw[pos] != ch)
1220 			pos--;
1221 
1222 		return pos;
1223 	}
1224 
1225 	private static Charset charsetForAlias(String name) {
1226 		return encodingAliases.get(StringUtils.toLowerCase(name));
1227 	}
1228 
1229 	private RawParseUtils() {
1230 		// Don't create instances of a static only utility.
1231 	}
1232 }