View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4    * and other copyright owners as documented in the project's IP log.
5    *
6    * This program and the accompanying materials are made available
7    * under the terms of the Eclipse Distribution License v1.0 which
8    * accompanies this distribution, is reproduced below, and is
9    * available at http://www.eclipse.org/org/documents/edl-v10.php
10   *
11   * All rights reserved.
12   *
13   * Redistribution and use in source and binary forms, with or
14   * without modification, are permitted provided that the following
15   * conditions are met:
16   *
17   * - Redistributions of source code must retain the above copyright
18   *   notice, this list of conditions and the following disclaimer.
19   *
20   * - Redistributions in binary form must reproduce the above
21   *   copyright notice, this list of conditions and the following
22   *   disclaimer in the documentation and/or other materials provided
23   *   with the distribution.
24   *
25   * - Neither the name of the Eclipse Foundation, Inc. nor the
26   *   names of its contributors may be used to endorse or promote
27   *   products derived from this software without specific prior
28   *   written permission.
29   *
30   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43   */
44  
45  package org.eclipse.jgit.util;
46  
47  import static org.eclipse.jgit.lib.ObjectChecker.author;
48  import static org.eclipse.jgit.lib.ObjectChecker.committer;
49  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
50  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
51  
52  import java.nio.ByteBuffer;
53  import java.nio.charset.CharacterCodingException;
54  import java.nio.charset.Charset;
55  import java.nio.charset.CharsetDecoder;
56  import java.nio.charset.CodingErrorAction;
57  import java.nio.charset.IllegalCharsetNameException;
58  import java.nio.charset.UnsupportedCharsetException;
59  import java.util.Arrays;
60  import java.util.HashMap;
61  import java.util.Map;
62  
63  import org.eclipse.jgit.lib.Constants;
64  import org.eclipse.jgit.lib.PersonIdent;
65  
66  /** Handy utility functions to parse raw object contents. */
67  public final class RawParseUtils {
68  	/**
69  	 * UTF-8 charset constant.
70  	 *
71  	 * @since 2.2
72  	 */
73  	public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$
74  
75  	private static final byte[] digits10;
76  
77  	private static final byte[] digits16;
78  
79  	private static final byte[] footerLineKeyChars;
80  
81  	private static final Map<String, Charset> encodingAliases;
82  
83  	static {
84  		encodingAliases = new HashMap<String, Charset>();
85  		encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
86  
87  		digits10 = new byte['9' + 1];
88  		Arrays.fill(digits10, (byte) -1);
89  		for (char i = '0'; i <= '9'; i++)
90  			digits10[i] = (byte) (i - '0');
91  
92  		digits16 = new byte['f' + 1];
93  		Arrays.fill(digits16, (byte) -1);
94  		for (char i = '0'; i <= '9'; i++)
95  			digits16[i] = (byte) (i - '0');
96  		for (char i = 'a'; i <= 'f'; i++)
97  			digits16[i] = (byte) ((i - 'a') + 10);
98  		for (char i = 'A'; i <= 'F'; i++)
99  			digits16[i] = (byte) ((i - 'A') + 10);
100 
101 		footerLineKeyChars = new byte['z' + 1];
102 		footerLineKeyChars['-'] = 1;
103 		for (char i = '0'; i <= '9'; i++)
104 			footerLineKeyChars[i] = 1;
105 		for (char i = 'A'; i <= 'Z'; i++)
106 			footerLineKeyChars[i] = 1;
107 		for (char i = 'a'; i <= 'z'; i++)
108 			footerLineKeyChars[i] = 1;
109 	}
110 
111 	/**
112 	 * Determine if b[ptr] matches src.
113 	 *
114 	 * @param b
115 	 *            the buffer to scan.
116 	 * @param ptr
117 	 *            first position within b, this should match src[0].
118 	 * @param src
119 	 *            the buffer to test for equality with b.
120 	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
121 	 */
122 	public static final int match(final byte[] b, int ptr, final byte[] src) {
123 		if (ptr + src.length > b.length)
124 			return -1;
125 		for (int i = 0; i < src.length; i++, ptr++)
126 			if (b[ptr] != src[i])
127 				return -1;
128 		return ptr;
129 	}
130 
131 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
132 			'6', '7', '8', '9' };
133 
134 	/**
135 	 * Format a base 10 numeric into a temporary buffer.
136 	 * <p>
137 	 * Formatting is performed backwards. The method starts at offset
138 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
139 	 * <code>digits</code> is the number of positions necessary to store the
140 	 * base 10 value.
141 	 * <p>
142 	 * The argument and return values from this method make it easy to chain
143 	 * writing, for example:
144 	 * </p>
145 	 *
146 	 * <pre>
147 	 * final byte[] tmp = new byte[64];
148 	 * int ptr = tmp.length;
149 	 * tmp[--ptr] = '\n';
150 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
151 	 * tmp[--ptr] = ' ';
152 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
153 	 * tmp[--ptr] = 0;
154 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
155 	 * </pre>
156 	 *
157 	 * @param b
158 	 *            buffer to write into.
159 	 * @param o
160 	 *            one offset past the location where writing will begin; writing
161 	 *            proceeds towards lower index values.
162 	 * @param value
163 	 *            the value to store.
164 	 * @return the new offset value <code>o</code>. This is the position of
165 	 *         the last byte written. Additional writing should start at one
166 	 *         position earlier.
167 	 */
168 	public static int formatBase10(final byte[] b, int o, int value) {
169 		if (value == 0) {
170 			b[--o] = '0';
171 			return o;
172 		}
173 		final boolean isneg = value < 0;
174 		if (isneg)
175 			value = -value;
176 		while (value != 0) {
177 			b[--o] = base10byte[value % 10];
178 			value /= 10;
179 		}
180 		if (isneg)
181 			b[--o] = '-';
182 		return o;
183 	}
184 
185 	/**
186 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
187 	 * <p>
188 	 * Digit sequences can begin with an optional run of spaces before the
189 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
190 	 * Any other characters will cause the method to stop and return the current
191 	 * result to the caller.
192 	 *
193 	 * @param b
194 	 *            buffer to scan.
195 	 * @param ptr
196 	 *            position within buffer to start parsing digits at.
197 	 * @param ptrResult
198 	 *            optional location to return the new ptr value through. If null
199 	 *            the ptr value will be discarded.
200 	 * @return the value at this location; 0 if the location is not a valid
201 	 *         numeric.
202 	 */
203 	public static final int parseBase10(final byte[] b, int ptr,
204 			final MutableInteger ptrResult) {
205 		int r = 0;
206 		int sign = 0;
207 		try {
208 			final int sz = b.length;
209 			while (ptr < sz && b[ptr] == ' ')
210 				ptr++;
211 			if (ptr >= sz)
212 				return 0;
213 
214 			switch (b[ptr]) {
215 			case '-':
216 				sign = -1;
217 				ptr++;
218 				break;
219 			case '+':
220 				ptr++;
221 				break;
222 			}
223 
224 			while (ptr < sz) {
225 				final byte v = digits10[b[ptr]];
226 				if (v < 0)
227 					break;
228 				r = (r * 10) + v;
229 				ptr++;
230 			}
231 		} catch (ArrayIndexOutOfBoundsException e) {
232 			// Not a valid digit.
233 		}
234 		if (ptrResult != null)
235 			ptrResult.value = ptr;
236 		return sign < 0 ? -r : r;
237 	}
238 
239 	/**
240 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
241 	 * <p>
242 	 * Digit sequences can begin with an optional run of spaces before the
243 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
244 	 * Any other characters will cause the method to stop and return the current
245 	 * result to the caller.
246 	 *
247 	 * @param b
248 	 *            buffer to scan.
249 	 * @param ptr
250 	 *            position within buffer to start parsing digits at.
251 	 * @param ptrResult
252 	 *            optional location to return the new ptr value through. If null
253 	 *            the ptr value will be discarded.
254 	 * @return the value at this location; 0 if the location is not a valid
255 	 *         numeric.
256 	 */
257 	public static final long parseLongBase10(final byte[] b, int ptr,
258 			final MutableInteger ptrResult) {
259 		long r = 0;
260 		int sign = 0;
261 		try {
262 			final int sz = b.length;
263 			while (ptr < sz && b[ptr] == ' ')
264 				ptr++;
265 			if (ptr >= sz)
266 				return 0;
267 
268 			switch (b[ptr]) {
269 			case '-':
270 				sign = -1;
271 				ptr++;
272 				break;
273 			case '+':
274 				ptr++;
275 				break;
276 			}
277 
278 			while (ptr < sz) {
279 				final byte v = digits10[b[ptr]];
280 				if (v < 0)
281 					break;
282 				r = (r * 10) + v;
283 				ptr++;
284 			}
285 		} catch (ArrayIndexOutOfBoundsException e) {
286 			// Not a valid digit.
287 		}
288 		if (ptrResult != null)
289 			ptrResult.value = ptr;
290 		return sign < 0 ? -r : r;
291 	}
292 
293 	/**
294 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
295 	 * <p>
296 	 * The number is read in network byte order, that is, most significant
297 	 * nybble first.
298 	 *
299 	 * @param bs
300 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
301 	 *            be parsed.
302 	 * @param p
303 	 *            first position within the buffer to parse.
304 	 * @return the integer value.
305 	 * @throws ArrayIndexOutOfBoundsException
306 	 *             if the string is not hex formatted.
307 	 */
308 	public static final int parseHexInt16(final byte[] bs, final int p) {
309 		int r = digits16[bs[p]] << 4;
310 
311 		r |= digits16[bs[p + 1]];
312 		r <<= 4;
313 
314 		r |= digits16[bs[p + 2]];
315 		r <<= 4;
316 
317 		r |= digits16[bs[p + 3]];
318 		if (r < 0)
319 			throw new ArrayIndexOutOfBoundsException();
320 		return r;
321 	}
322 
323 	/**
324 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
325 	 * <p>
326 	 * The number is read in network byte order, that is, most significant
327 	 * nybble first.
328 	 *
329 	 * @param bs
330 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
331 	 *            be parsed.
332 	 * @param p
333 	 *            first position within the buffer to parse.
334 	 * @return the integer value.
335 	 * @throws ArrayIndexOutOfBoundsException
336 	 *             if the string is not hex formatted.
337 	 */
338 	public static final int parseHexInt32(final byte[] bs, final int p) {
339 		int r = digits16[bs[p]] << 4;
340 
341 		r |= digits16[bs[p + 1]];
342 		r <<= 4;
343 
344 		r |= digits16[bs[p + 2]];
345 		r <<= 4;
346 
347 		r |= digits16[bs[p + 3]];
348 		r <<= 4;
349 
350 		r |= digits16[bs[p + 4]];
351 		r <<= 4;
352 
353 		r |= digits16[bs[p + 5]];
354 		r <<= 4;
355 
356 		r |= digits16[bs[p + 6]];
357 
358 		final int last = digits16[bs[p + 7]];
359 		if (r < 0 || last < 0)
360 			throw new ArrayIndexOutOfBoundsException();
361 		return (r << 4) | last;
362 	}
363 
364 	/**
365 	 * Parse a single hex digit to its numeric value (0-15).
366 	 *
367 	 * @param digit
368 	 *            hex character to parse.
369 	 * @return numeric value, in the range 0-15.
370 	 * @throws ArrayIndexOutOfBoundsException
371 	 *             if the input digit is not a valid hex digit.
372 	 */
373 	public static final int parseHexInt4(final byte digit) {
374 		final byte r = digits16[digit];
375 		if (r < 0)
376 			throw new ArrayIndexOutOfBoundsException();
377 		return r;
378 	}
379 
380 	/**
381 	 * Parse a Git style timezone string.
382 	 * <p>
383 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
384 	 * lower two positions count minutes, not 100ths of an hour.
385 	 *
386 	 * @param b
387 	 *            buffer to scan.
388 	 * @param ptr
389 	 *            position within buffer to start parsing digits at.
390 	 * @return the timezone at this location, expressed in minutes.
391 	 */
392 	public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
393 		final int v = parseBase10(b, ptr, null);
394 		final int tzMins = v % 100;
395 		final int tzHours = v / 100;
396 		return tzHours * 60 + tzMins;
397 	}
398 
399 	/**
400 	 * Locate the first position after a given character.
401 	 *
402 	 * @param b
403 	 *            buffer to scan.
404 	 * @param ptr
405 	 *            position within buffer to start looking for chrA at.
406 	 * @param chrA
407 	 *            character to find.
408 	 * @return new position just after chrA.
409 	 */
410 	public static final int next(final byte[] b, int ptr, final char chrA) {
411 		final int sz = b.length;
412 		while (ptr < sz) {
413 			if (b[ptr++] == chrA)
414 				return ptr;
415 		}
416 		return ptr;
417 	}
418 
419 	/**
420 	 * Locate the first position after the next LF.
421 	 * <p>
422 	 * This method stops on the first '\n' it finds.
423 	 *
424 	 * @param b
425 	 *            buffer to scan.
426 	 * @param ptr
427 	 *            position within buffer to start looking for LF at.
428 	 * @return new position just after the first LF found.
429 	 */
430 	public static final int nextLF(final byte[] b, int ptr) {
431 		return next(b, ptr, '\n');
432 	}
433 
434 	/**
435 	 * Locate the first position after either the given character or LF.
436 	 * <p>
437 	 * This method stops on the first match it finds from either chrA or '\n'.
438 	 *
439 	 * @param b
440 	 *            buffer to scan.
441 	 * @param ptr
442 	 *            position within buffer to start looking for chrA or LF at.
443 	 * @param chrA
444 	 *            character to find.
445 	 * @return new position just after the first chrA or LF to be found.
446 	 */
447 	public static final int nextLF(final byte[] b, int ptr, final char chrA) {
448 		final int sz = b.length;
449 		while (ptr < sz) {
450 			final byte c = b[ptr++];
451 			if (c == chrA || c == '\n')
452 				return ptr;
453 		}
454 		return ptr;
455 	}
456 
457 	/**
458 	 * Locate the first position before a given character.
459 	 *
460 	 * @param b
461 	 *            buffer to scan.
462 	 * @param ptr
463 	 *            position within buffer to start looking for chrA at.
464 	 * @param chrA
465 	 *            character to find.
466 	 * @return new position just before chrA, -1 for not found
467 	 */
468 	public static final int prev(final byte[] b, int ptr, final char chrA) {
469 		if (ptr == b.length)
470 			--ptr;
471 		while (ptr >= 0) {
472 			if (b[ptr--] == chrA)
473 				return ptr;
474 		}
475 		return ptr;
476 	}
477 
478 	/**
479 	 * Locate the first position before the previous LF.
480 	 * <p>
481 	 * This method stops on the first '\n' it finds.
482 	 *
483 	 * @param b
484 	 *            buffer to scan.
485 	 * @param ptr
486 	 *            position within buffer to start looking for LF at.
487 	 * @return new position just before the first LF found, -1 for not found
488 	 */
489 	public static final int prevLF(final byte[] b, int ptr) {
490 		return prev(b, ptr, '\n');
491 	}
492 
493 	/**
494 	 * Locate the previous position before either the given character or LF.
495 	 * <p>
496 	 * This method stops on the first match it finds from either chrA or '\n'.
497 	 *
498 	 * @param b
499 	 *            buffer to scan.
500 	 * @param ptr
501 	 *            position within buffer to start looking for chrA or LF at.
502 	 * @param chrA
503 	 *            character to find.
504 	 * @return new position just before the first chrA or LF to be found, -1 for
505 	 *         not found
506 	 */
507 	public static final int prevLF(final byte[] b, int ptr, final char chrA) {
508 		if (ptr == b.length)
509 			--ptr;
510 		while (ptr >= 0) {
511 			final byte c = b[ptr--];
512 			if (c == chrA || c == '\n')
513 				return ptr;
514 		}
515 		return ptr;
516 	}
517 
518 	/**
519 	 * Index the region between <code>[ptr, end)</code> to find line starts.
520 	 * <p>
521 	 * The returned list is 1 indexed. Index 0 contains
522 	 * {@link Integer#MIN_VALUE} to pad the list out.
523 	 * <p>
524 	 * Using a 1 indexed list means that line numbers can be directly accessed
525 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
526 	 * <code>ptr</code>.
527 	 * <p>
528 	 * The last element (index <code>map.size()-1</code>) always contains
529 	 * <code>end</code>.
530 	 *
531 	 * @param buf
532 	 *            buffer to scan.
533 	 * @param ptr
534 	 *            position within the buffer corresponding to the first byte of
535 	 *            line 1.
536 	 * @param end
537 	 *            1 past the end of the content within <code>buf</code>.
538 	 * @return a line map indexing the start position of each line.
539 	 */
540 	public static final IntList lineMap(final byte[] buf, int ptr, int end) {
541 		// Experimentally derived from multiple source repositories
542 		// the average number of bytes/line is 36. Its a rough guess
543 		// to initially size our map close to the target.
544 		//
545 		final IntList map = new IntList((end - ptr) / 36);
546 		map.fillTo(1, Integer.MIN_VALUE);
547 		for (; ptr < end; ptr = nextLF(buf, ptr))
548 			map.add(ptr);
549 		map.add(end);
550 		return map;
551 	}
552 
553 	/**
554 	 * Locate the "author " header line data.
555 	 *
556 	 * @param b
557 	 *            buffer to scan.
558 	 * @param ptr
559 	 *            position in buffer to start the scan at. Most callers should
560 	 *            pass 0 to ensure the scan starts from the beginning of the
561 	 *            commit buffer and does not accidentally look at message body.
562 	 * @return position just after the space in "author ", so the first
563 	 *         character of the author's name. If no author header can be
564 	 *         located -1 is returned.
565 	 */
566 	public static final int author(final byte[] b, int ptr) {
567 		final int sz = b.length;
568 		if (ptr == 0)
569 			ptr += 46; // skip the "tree ..." line.
570 		while (ptr < sz && b[ptr] == 'p')
571 			ptr += 48; // skip this parent.
572 		return match(b, ptr, author);
573 	}
574 
575 	/**
576 	 * Locate the "committer " header line data.
577 	 *
578 	 * @param b
579 	 *            buffer to scan.
580 	 * @param ptr
581 	 *            position in buffer to start the scan at. Most callers should
582 	 *            pass 0 to ensure the scan starts from the beginning of the
583 	 *            commit buffer and does not accidentally look at message body.
584 	 * @return position just after the space in "committer ", so the first
585 	 *         character of the committer's name. If no committer header can be
586 	 *         located -1 is returned.
587 	 */
588 	public static final int committer(final byte[] b, int ptr) {
589 		final int sz = b.length;
590 		if (ptr == 0)
591 			ptr += 46; // skip the "tree ..." line.
592 		while (ptr < sz && b[ptr] == 'p')
593 			ptr += 48; // skip this parent.
594 		if (ptr < sz && b[ptr] == 'a')
595 			ptr = nextLF(b, ptr);
596 		return match(b, ptr, committer);
597 	}
598 
599 	/**
600 	 * Locate the "tagger " header line data.
601 	 *
602 	 * @param b
603 	 *            buffer to scan.
604 	 * @param ptr
605 	 *            position in buffer to start the scan at. Most callers should
606 	 *            pass 0 to ensure the scan starts from the beginning of the tag
607 	 *            buffer and does not accidentally look at message body.
608 	 * @return position just after the space in "tagger ", so the first
609 	 *         character of the tagger's name. If no tagger header can be
610 	 *         located -1 is returned.
611 	 */
612 	public static final int tagger(final byte[] b, int ptr) {
613 		final int sz = b.length;
614 		if (ptr == 0)
615 			ptr += 48; // skip the "object ..." line.
616 		while (ptr < sz) {
617 			if (b[ptr] == '\n')
618 				return -1;
619 			final int m = match(b, ptr, tagger);
620 			if (m >= 0)
621 				return m;
622 			ptr = nextLF(b, ptr);
623 		}
624 		return -1;
625 	}
626 
627 	/**
628 	 * Locate the "encoding " header line.
629 	 *
630 	 * @param b
631 	 *            buffer to scan.
632 	 * @param ptr
633 	 *            position in buffer to start the scan at. Most callers should
634 	 *            pass 0 to ensure the scan starts from the beginning of the
635 	 *            buffer and does not accidentally look at the message body.
636 	 * @return position just after the space in "encoding ", so the first
637 	 *         character of the encoding's name. If no encoding header can be
638 	 *         located -1 is returned (and UTF-8 should be assumed).
639 	 */
640 	public static final int encoding(final byte[] b, int ptr) {
641 		final int sz = b.length;
642 		while (ptr < sz) {
643 			if (b[ptr] == '\n')
644 				return -1;
645 			if (b[ptr] == 'e')
646 				break;
647 			ptr = nextLF(b, ptr);
648 		}
649 		return match(b, ptr, encoding);
650 	}
651 
652 	/**
653 	 * Parse the "encoding " header into a character set reference.
654 	 * <p>
655 	 * Locates the "encoding " header (if present) by first calling
656 	 * {@link #encoding(byte[], int)} and then returns the proper character set
657 	 * to apply to this buffer to evaluate its contents as character data.
658 	 * <p>
659 	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
660 	 *
661 	 * @param b
662 	 *            buffer to scan.
663 	 * @return the Java character set representation. Never null.
664 	 */
665 	public static Charset parseEncoding(final byte[] b) {
666 		final int enc = encoding(b, 0);
667 		if (enc < 0)
668 			return Constants.CHARSET;
669 		final int lf = nextLF(b, enc);
670 		String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
671 		try {
672 			return Charset.forName(decoded);
673 		} catch (IllegalCharsetNameException badName) {
674 			Charset aliased = charsetForAlias(decoded);
675 			if (aliased != null)
676 				return aliased;
677 			throw badName;
678 		} catch (UnsupportedCharsetException badName) {
679 			Charset aliased = charsetForAlias(decoded);
680 			if (aliased != null)
681 				return aliased;
682 			throw badName;
683 		}
684 	}
685 
686 	/**
687 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
688 	 * <p>
689 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
690 	 * parsed name afterwards.
691 	 *
692 	 * @param in
693 	 *            the string to parse a name from.
694 	 * @return the parsed identity or null in case the identity could not be
695 	 *         parsed.
696 	 */
697 	public static PersonIdent parsePersonIdent(final String in) {
698 		return parsePersonIdent(Constants.encode(in), 0);
699 	}
700 
701 	/**
702 	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
703 	 * <p>
704 	 * When passing in a value for <code>nameB</code> callers should use the
705 	 * return value of {@link #author(byte[], int)} or
706 	 * {@link #committer(byte[], int)}, as these methods provide the proper
707 	 * position within the buffer.
708 	 *
709 	 * @param raw
710 	 *            the buffer to parse character data from.
711 	 * @param nameB
712 	 *            first position of the identity information. This should be the
713 	 *            first position after the space which delimits the header field
714 	 *            name (e.g. "author" or "committer") from the rest of the
715 	 *            identity line.
716 	 * @return the parsed identity or null in case the identity could not be
717 	 *         parsed.
718 	 */
719 	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
720 		final Charset cs = parseEncoding(raw);
721 		final int emailB = nextLF(raw, nameB, '<');
722 		final int emailE = nextLF(raw, emailB, '>');
723 		if (emailB >= raw.length || raw[emailB] == '\n' ||
724 				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
725 			return null;
726 
727 		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
728 				emailB - 2 : emailB - 1;
729 		final String name = decode(cs, raw, nameB, nameEnd);
730 		final String email = decode(cs, raw, emailB, emailE - 1);
731 
732 		// Start searching from end of line, as after first name-email pair,
733 		// another name-email pair may occur. We will ignore all kinds of
734 		// "junk" following the first email.
735 		//
736 		// We've to use (emailE - 1) for the case that raw[email] is LF,
737 		// otherwise we would run too far. "-2" is necessary to position
738 		// before the LF in case of LF termination resp. the penultimate
739 		// character if there is no trailing LF.
740 		final int tzBegin = lastIndexOfTrim(raw, ' ',
741 				nextLF(raw, emailE - 1) - 2) + 1;
742 		if (tzBegin <= emailE) // No time/zone, still valid
743 			return new PersonIdent(name, email, 0, 0);
744 
745 		final int whenBegin = Math.max(emailE,
746 				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
747 		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
748 			return new PersonIdent(name, email, 0, 0);
749 
750 		final long when = parseLongBase10(raw, whenBegin, null);
751 		final int tz = parseTimeZoneOffset(raw, tzBegin);
752 		return new PersonIdent(name, email, when * 1000L, tz);
753 	}
754 
755 	/**
756 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
757 	 * <p>
758 	 * When passing in a value for <code>nameB</code> callers should use the
759 	 * return value of {@link #author(byte[], int)} or
760 	 * {@link #committer(byte[], int)}, as these methods provide the proper
761 	 * position within the buffer.
762 	 *
763 	 * @param raw
764 	 *            the buffer to parse character data from.
765 	 * @param nameB
766 	 *            first position of the identity information. This should be the
767 	 *            first position after the space which delimits the header field
768 	 *            name (e.g. "author" or "committer") from the rest of the
769 	 *            identity line.
770 	 * @return the parsed identity. Never null.
771 	 */
772 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
773 			final int nameB) {
774 		int stop = nextLF(raw, nameB);
775 		int emailB = nextLF(raw, nameB, '<');
776 		int emailE = nextLF(raw, emailB, '>');
777 		final String name;
778 		final String email;
779 		if (emailE < stop) {
780 			email = decode(raw, emailB, emailE - 1);
781 		} else {
782 			email = "invalid"; //$NON-NLS-1$
783 		}
784 		if (emailB < stop)
785 			name = decode(raw, nameB, emailB - 2);
786 		else
787 			name = decode(raw, nameB, stop);
788 
789 		final MutableInteger ptrout = new MutableInteger();
790 		long when;
791 		int tz;
792 		if (emailE < stop) {
793 			when = parseLongBase10(raw, emailE + 1, ptrout);
794 			tz = parseTimeZoneOffset(raw, ptrout.value);
795 		} else {
796 			when = 0;
797 			tz = 0;
798 		}
799 		return new PersonIdent(name, email, when * 1000L, tz);
800 	}
801 
802 	/**
803 	 * Locate the end of a footer line key string.
804 	 * <p>
805 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
806 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
807 	 * the first ':'.
808 	 * <p>
809 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
810 	 * then this method returns -1.
811 	 *
812 	 * @param raw
813 	 *            buffer to scan.
814 	 * @param ptr
815 	 *            first position within raw to consider as a footer line key.
816 	 * @return position of the ':' which terminates the footer line key if this
817 	 *         is otherwise a valid footer line key; otherwise -1.
818 	 */
819 	public static int endOfFooterLineKey(final byte[] raw, int ptr) {
820 		try {
821 			for (;;) {
822 				final byte c = raw[ptr];
823 				if (footerLineKeyChars[c] == 0) {
824 					if (c == ':')
825 						return ptr;
826 					return -1;
827 				}
828 				ptr++;
829 			}
830 		} catch (ArrayIndexOutOfBoundsException e) {
831 			return -1;
832 		}
833 	}
834 
835 	/**
836 	 * Decode a buffer under UTF-8, if possible.
837 	 *
838 	 * If the byte stream cannot be decoded that way, the platform default is tried
839 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
840 	 *
841 	 * @param buffer
842 	 *            buffer to pull raw bytes from.
843 	 * @return a string representation of the range <code>[start,end)</code>,
844 	 *         after decoding the region through the specified character set.
845 	 */
846 	public static String decode(final byte[] buffer) {
847 		return decode(buffer, 0, buffer.length);
848 	}
849 
850 	/**
851 	 * Decode a buffer under UTF-8, if possible.
852 	 *
853 	 * If the byte stream cannot be decoded that way, the platform default is
854 	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
855 	 *
856 	 * @param buffer
857 	 *            buffer to pull raw bytes from.
858 	 * @param start
859 	 *            start position in buffer
860 	 * @param end
861 	 *            one position past the last location within the buffer to take
862 	 *            data from.
863 	 * @return a string representation of the range <code>[start,end)</code>,
864 	 *         after decoding the region through the specified character set.
865 	 */
866 	public static String decode(final byte[] buffer, final int start,
867 			final int end) {
868 		return decode(Constants.CHARSET, buffer, start, end);
869 	}
870 
871 	/**
872 	 * Decode a buffer under the specified character set if possible.
873 	 *
874 	 * If the byte stream cannot be decoded that way, the platform default is tried
875 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
876 	 *
877 	 * @param cs
878 	 *            character set to use when decoding the buffer.
879 	 * @param buffer
880 	 *            buffer to pull raw bytes from.
881 	 * @return a string representation of the range <code>[start,end)</code>,
882 	 *         after decoding the region through the specified character set.
883 	 */
884 	public static String decode(final Charset cs, final byte[] buffer) {
885 		return decode(cs, buffer, 0, buffer.length);
886 	}
887 
888 	/**
889 	 * Decode a region of the buffer under the specified character set if possible.
890 	 *
891 	 * If the byte stream cannot be decoded that way, the platform default is tried
892 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
893 	 *
894 	 * @param cs
895 	 *            character set to use when decoding the buffer.
896 	 * @param buffer
897 	 *            buffer to pull raw bytes from.
898 	 * @param start
899 	 *            first position within the buffer to take data from.
900 	 * @param end
901 	 *            one position past the last location within the buffer to take
902 	 *            data from.
903 	 * @return a string representation of the range <code>[start,end)</code>,
904 	 *         after decoding the region through the specified character set.
905 	 */
906 	public static String decode(final Charset cs, final byte[] buffer,
907 			final int start, final int end) {
908 		try {
909 			return decodeNoFallback(cs, buffer, start, end);
910 		} catch (CharacterCodingException e) {
911 			// Fall back to an ISO-8859-1 style encoding. At least all of
912 			// the bytes will be present in the output.
913 			//
914 			return extractBinaryString(buffer, start, end);
915 		}
916 	}
917 
918 	/**
919 	 * Decode a region of the buffer under the specified character set if
920 	 * possible.
921 	 *
922 	 * If the byte stream cannot be decoded that way, the platform default is
923 	 * tried and if that too fails, an exception is thrown.
924 	 *
925 	 * @param cs
926 	 *            character set to use when decoding the buffer.
927 	 * @param buffer
928 	 *            buffer to pull raw bytes from.
929 	 * @param start
930 	 *            first position within the buffer to take data from.
931 	 * @param end
932 	 *            one position past the last location within the buffer to take
933 	 *            data from.
934 	 * @return a string representation of the range <code>[start,end)</code>,
935 	 *         after decoding the region through the specified character set.
936 	 * @throws CharacterCodingException
937 	 *             the input is not in any of the tested character sets.
938 	 */
939 	public static String decodeNoFallback(final Charset cs,
940 			final byte[] buffer, final int start, final int end)
941 			throws CharacterCodingException {
942 		final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
943 		b.mark();
944 
945 		// Try our built-in favorite. The assumption here is that
946 		// decoding will fail if the data is not actually encoded
947 		// using that encoder.
948 		//
949 		try {
950 			return decode(b, Constants.CHARSET);
951 		} catch (CharacterCodingException e) {
952 			b.reset();
953 		}
954 
955 		if (!cs.equals(Constants.CHARSET)) {
956 			// Try the suggested encoding, it might be right since it was
957 			// provided by the caller.
958 			//
959 			try {
960 				return decode(b, cs);
961 			} catch (CharacterCodingException e) {
962 				b.reset();
963 			}
964 		}
965 
966 		// Try the default character set. A small group of people
967 		// might actually use the same (or very similar) locale.
968 		//
969 		final Charset defcs = Charset.defaultCharset();
970 		if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
971 			try {
972 				return decode(b, defcs);
973 			} catch (CharacterCodingException e) {
974 				b.reset();
975 			}
976 		}
977 
978 		throw new CharacterCodingException();
979 	}
980 
981 	/**
982 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
983 	 *
984 	 * Each byte is treated as a single character in the 8859-1 character
985 	 * encoding, performing a raw binary-&gt;char conversion.
986 	 *
987 	 * @param buffer
988 	 *            buffer to pull raw bytes from.
989 	 * @param start
990 	 *            first position within the buffer to take data from.
991 	 * @param end
992 	 *            one position past the last location within the buffer to take
993 	 *            data from.
994 	 * @return a string representation of the range <code>[start,end)</code>.
995 	 */
996 	public static String extractBinaryString(final byte[] buffer,
997 			final int start, final int end) {
998 		final StringBuilder r = new StringBuilder(end - start);
999 		for (int i = start; i < end; i++)
1000 			r.append((char) (buffer[i] & 0xff));
1001 		return r.toString();
1002 	}
1003 
1004 	private static String decode(final ByteBuffer b, final Charset charset)
1005 			throws CharacterCodingException {
1006 		final CharsetDecoder d = charset.newDecoder();
1007 		d.onMalformedInput(CodingErrorAction.REPORT);
1008 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1009 		return d.decode(b).toString();
1010 	}
1011 
1012 	/**
1013 	 * Locate the position of the commit message body.
1014 	 *
1015 	 * @param b
1016 	 *            buffer to scan.
1017 	 * @param ptr
1018 	 *            position in buffer to start the scan at. Most callers should
1019 	 *            pass 0 to ensure the scan starts from the beginning of the
1020 	 *            commit buffer.
1021 	 * @return position of the user's message buffer.
1022 	 */
1023 	public static final int commitMessage(final byte[] b, int ptr) {
1024 		final int sz = b.length;
1025 		if (ptr == 0)
1026 			ptr += 46; // skip the "tree ..." line.
1027 		while (ptr < sz && b[ptr] == 'p')
1028 			ptr += 48; // skip this parent.
1029 
1030 		// Skip any remaining header lines, ignoring what their actual
1031 		// header line type is. This is identical to the logic for a tag.
1032 		//
1033 		return tagMessage(b, ptr);
1034 	}
1035 
1036 	/**
1037 	 * Locate the position of the tag message body.
1038 	 *
1039 	 * @param b
1040 	 *            buffer to scan.
1041 	 * @param ptr
1042 	 *            position in buffer to start the scan at. Most callers should
1043 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1044 	 *            buffer.
1045 	 * @return position of the user's message buffer.
1046 	 */
1047 	public static final int tagMessage(final byte[] b, int ptr) {
1048 		final int sz = b.length;
1049 		if (ptr == 0)
1050 			ptr += 48; // skip the "object ..." line.
1051 		while (ptr < sz && b[ptr] != '\n')
1052 			ptr = nextLF(b, ptr);
1053 		if (ptr < sz && b[ptr] == '\n')
1054 			return ptr + 1;
1055 		return -1;
1056 	}
1057 
1058 	/**
1059 	 * Locate the end of a paragraph.
1060 	 * <p>
1061 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1062 	 *
1063 	 * @param b
1064 	 *            buffer to scan.
1065 	 * @param start
1066 	 *            position in buffer to start the scan at. Most callers will
1067 	 *            want to pass the first position of the commit message (as
1068 	 *            found by {@link #commitMessage(byte[], int)}.
1069 	 * @return position of the LF at the end of the paragraph;
1070 	 *         <code>b.length</code> if no paragraph end could be located.
1071 	 */
1072 	public static final int endOfParagraph(final byte[] b, final int start) {
1073 		int ptr = start;
1074 		final int sz = b.length;
1075 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1076 			ptr = nextLF(b, ptr);
1077 		if (ptr > start && b[ptr - 1] == '\n')
1078 			ptr--;
1079 		if (ptr > start && b[ptr - 1] == '\r')
1080 			ptr--;
1081 		return ptr;
1082 	}
1083 
1084 	private static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1085 		while (pos >= 0 && raw[pos] == ' ')
1086 			pos--;
1087 
1088 		while (pos >= 0 && raw[pos] != ch)
1089 			pos--;
1090 
1091 		return pos;
1092 	}
1093 
1094 	private static Charset charsetForAlias(String name) {
1095 		return encodingAliases.get(StringUtils.toLowerCase(name));
1096 	}
1097 
1098 	private RawParseUtils() {
1099 		// Don't create instances of a static only utility.
1100 	}
1101 }