View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4    * and other copyright owners as documented in the project's IP log.
5    *
6    * This program and the accompanying materials are made available
7    * under the terms of the Eclipse Distribution License v1.0 which
8    * accompanies this distribution, is reproduced below, and is
9    * available at http://www.eclipse.org/org/documents/edl-v10.php
10   *
11   * All rights reserved.
12   *
13   * Redistribution and use in source and binary forms, with or
14   * without modification, are permitted provided that the following
15   * conditions are met:
16   *
17   * - Redistributions of source code must retain the above copyright
18   *   notice, this list of conditions and the following disclaimer.
19   *
20   * - Redistributions in binary form must reproduce the above
21   *   copyright notice, this list of conditions and the following
22   *   disclaimer in the documentation and/or other materials provided
23   *   with the distribution.
24   *
25   * - Neither the name of the Eclipse Foundation, Inc. nor the
26   *   names of its contributors may be used to endorse or promote
27   *   products derived from this software without specific prior
28   *   written permission.
29   *
30   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43   */
44  
45  package org.eclipse.jgit.util;
46  
47  import static java.nio.charset.StandardCharsets.ISO_8859_1;
48  import static java.nio.charset.StandardCharsets.UTF_8;
49  import static org.eclipse.jgit.lib.ObjectChecker.author;
50  import static org.eclipse.jgit.lib.ObjectChecker.committer;
51  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53  
54  import java.nio.ByteBuffer;
55  import java.nio.charset.CharacterCodingException;
56  import java.nio.charset.Charset;
57  import java.nio.charset.CharsetDecoder;
58  import java.nio.charset.CodingErrorAction;
59  import java.nio.charset.IllegalCharsetNameException;
60  import java.nio.charset.UnsupportedCharsetException;
61  import java.util.Arrays;
62  import java.util.HashMap;
63  import java.util.Map;
64  
65  import org.eclipse.jgit.annotations.Nullable;
66  import org.eclipse.jgit.errors.BinaryBlobException;
67  import org.eclipse.jgit.lib.Constants;
68  import org.eclipse.jgit.lib.PersonIdent;
69  
70  /**
71   * Handy utility functions to parse raw object contents.
72   */
73  public final class RawParseUtils {
74  	/**
75  	 * UTF-8 charset constant.
76  	 *
77  	 * @since 2.2
78  	 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
79  	 */
80  	@Deprecated
81  	public static final Charset UTF8_CHARSET = UTF_8;
82  
83  	private static final byte[] digits10;
84  
85  	private static final byte[] digits16;
86  
87  	private static final byte[] footerLineKeyChars;
88  
89  	private static final Map<String, Charset> encodingAliases;
90  
91  	static {
92  		encodingAliases = new HashMap<>();
93  		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
94  		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
95  
96  		digits10 = new byte['9' + 1];
97  		Arrays.fill(digits10, (byte) -1);
98  		for (char i = '0'; i <= '9'; i++)
99  			digits10[i] = (byte) (i - '0');
100 
101 		digits16 = new byte['f' + 1];
102 		Arrays.fill(digits16, (byte) -1);
103 		for (char i = '0'; i <= '9'; i++)
104 			digits16[i] = (byte) (i - '0');
105 		for (char i = 'a'; i <= 'f'; i++)
106 			digits16[i] = (byte) ((i - 'a') + 10);
107 		for (char i = 'A'; i <= 'F'; i++)
108 			digits16[i] = (byte) ((i - 'A') + 10);
109 
110 		footerLineKeyChars = new byte['z' + 1];
111 		footerLineKeyChars['-'] = 1;
112 		for (char i = '0'; i <= '9'; i++)
113 			footerLineKeyChars[i] = 1;
114 		for (char i = 'A'; i <= 'Z'; i++)
115 			footerLineKeyChars[i] = 1;
116 		for (char i = 'a'; i <= 'z'; i++)
117 			footerLineKeyChars[i] = 1;
118 	}
119 
120 	/**
121 	 * Determine if b[ptr] matches src.
122 	 *
123 	 * @param b
124 	 *            the buffer to scan.
125 	 * @param ptr
126 	 *            first position within b, this should match src[0].
127 	 * @param src
128 	 *            the buffer to test for equality with b.
129 	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
130 	 */
131 	public static final int match(byte[] b, int ptr, byte[] src) {
132 		if (ptr + src.length > b.length)
133 			return -1;
134 		for (int i = 0; i < src.length; i++, ptr++)
135 			if (b[ptr] != src[i])
136 				return -1;
137 		return ptr;
138 	}
139 
140 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
141 			'6', '7', '8', '9' };
142 
143 	/**
144 	 * Format a base 10 numeric into a temporary buffer.
145 	 * <p>
146 	 * Formatting is performed backwards. The method starts at offset
147 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
148 	 * <code>digits</code> is the number of positions necessary to store the
149 	 * base 10 value.
150 	 * <p>
151 	 * The argument and return values from this method make it easy to chain
152 	 * writing, for example:
153 	 * </p>
154 	 *
155 	 * <pre>
156 	 * final byte[] tmp = new byte[64];
157 	 * int ptr = tmp.length;
158 	 * tmp[--ptr] = '\n';
159 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
160 	 * tmp[--ptr] = ' ';
161 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
162 	 * tmp[--ptr] = 0;
163 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
164 	 * </pre>
165 	 *
166 	 * @param b
167 	 *            buffer to write into.
168 	 * @param o
169 	 *            one offset past the location where writing will begin; writing
170 	 *            proceeds towards lower index values.
171 	 * @param value
172 	 *            the value to store.
173 	 * @return the new offset value <code>o</code>. This is the position of
174 	 *         the last byte written. Additional writing should start at one
175 	 *         position earlier.
176 	 */
177 	public static int formatBase10(final byte[] b, int o, int value) {
178 		if (value == 0) {
179 			b[--o] = '0';
180 			return o;
181 		}
182 		final boolean isneg = value < 0;
183 		if (isneg)
184 			value = -value;
185 		while (value != 0) {
186 			b[--o] = base10byte[value % 10];
187 			value /= 10;
188 		}
189 		if (isneg)
190 			b[--o] = '-';
191 		return o;
192 	}
193 
194 	/**
195 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
196 	 * <p>
197 	 * Digit sequences can begin with an optional run of spaces before the
198 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
199 	 * Any other characters will cause the method to stop and return the current
200 	 * result to the caller.
201 	 *
202 	 * @param b
203 	 *            buffer to scan.
204 	 * @param ptr
205 	 *            position within buffer to start parsing digits at.
206 	 * @param ptrResult
207 	 *            optional location to return the new ptr value through. If null
208 	 *            the ptr value will be discarded.
209 	 * @return the value at this location; 0 if the location is not a valid
210 	 *         numeric.
211 	 */
212 	public static final int parseBase10(final byte[] b, int ptr,
213 			final MutableInteger ptrResult) {
214 		int r = 0;
215 		int sign = 0;
216 		try {
217 			final int sz = b.length;
218 			while (ptr < sz && b[ptr] == ' ')
219 				ptr++;
220 			if (ptr >= sz)
221 				return 0;
222 
223 			switch (b[ptr]) {
224 			case '-':
225 				sign = -1;
226 				ptr++;
227 				break;
228 			case '+':
229 				ptr++;
230 				break;
231 			}
232 
233 			while (ptr < sz) {
234 				final byte v = digits10[b[ptr]];
235 				if (v < 0)
236 					break;
237 				r = (r * 10) + v;
238 				ptr++;
239 			}
240 		} catch (ArrayIndexOutOfBoundsException e) {
241 			// Not a valid digit.
242 		}
243 		if (ptrResult != null)
244 			ptrResult.value = ptr;
245 		return sign < 0 ? -r : r;
246 	}
247 
248 	/**
249 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
250 	 * <p>
251 	 * Digit sequences can begin with an optional run of spaces before the
252 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
253 	 * Any other characters will cause the method to stop and return the current
254 	 * result to the caller.
255 	 *
256 	 * @param b
257 	 *            buffer to scan.
258 	 * @param ptr
259 	 *            position within buffer to start parsing digits at.
260 	 * @param ptrResult
261 	 *            optional location to return the new ptr value through. If null
262 	 *            the ptr value will be discarded.
263 	 * @return the value at this location; 0 if the location is not a valid
264 	 *         numeric.
265 	 */
266 	public static final long parseLongBase10(final byte[] b, int ptr,
267 			final MutableInteger ptrResult) {
268 		long r = 0;
269 		int sign = 0;
270 		try {
271 			final int sz = b.length;
272 			while (ptr < sz && b[ptr] == ' ')
273 				ptr++;
274 			if (ptr >= sz)
275 				return 0;
276 
277 			switch (b[ptr]) {
278 			case '-':
279 				sign = -1;
280 				ptr++;
281 				break;
282 			case '+':
283 				ptr++;
284 				break;
285 			}
286 
287 			while (ptr < sz) {
288 				final byte v = digits10[b[ptr]];
289 				if (v < 0)
290 					break;
291 				r = (r * 10) + v;
292 				ptr++;
293 			}
294 		} catch (ArrayIndexOutOfBoundsException e) {
295 			// Not a valid digit.
296 		}
297 		if (ptrResult != null)
298 			ptrResult.value = ptr;
299 		return sign < 0 ? -r : r;
300 	}
301 
302 	/**
303 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
304 	 * <p>
305 	 * The number is read in network byte order, that is, most significant
306 	 * nybble first.
307 	 *
308 	 * @param bs
309 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
310 	 *            be parsed.
311 	 * @param p
312 	 *            first position within the buffer to parse.
313 	 * @return the integer value.
314 	 * @throws java.lang.ArrayIndexOutOfBoundsException
315 	 *             if the string is not hex formatted.
316 	 */
317 	public static final int parseHexInt16(final byte[] bs, final int p) {
318 		int r = digits16[bs[p]] << 4;
319 
320 		r |= digits16[bs[p + 1]];
321 		r <<= 4;
322 
323 		r |= digits16[bs[p + 2]];
324 		r <<= 4;
325 
326 		r |= digits16[bs[p + 3]];
327 		if (r < 0)
328 			throw new ArrayIndexOutOfBoundsException();
329 		return r;
330 	}
331 
332 	/**
333 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
334 	 * <p>
335 	 * The number is read in network byte order, that is, most significant
336 	 * nybble first.
337 	 *
338 	 * @param bs
339 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
340 	 *            be parsed.
341 	 * @param p
342 	 *            first position within the buffer to parse.
343 	 * @return the integer value.
344 	 * @throws java.lang.ArrayIndexOutOfBoundsException
345 	 *             if the string is not hex formatted.
346 	 */
347 	public static final int parseHexInt32(final byte[] bs, final int p) {
348 		int r = digits16[bs[p]] << 4;
349 
350 		r |= digits16[bs[p + 1]];
351 		r <<= 4;
352 
353 		r |= digits16[bs[p + 2]];
354 		r <<= 4;
355 
356 		r |= digits16[bs[p + 3]];
357 		r <<= 4;
358 
359 		r |= digits16[bs[p + 4]];
360 		r <<= 4;
361 
362 		r |= digits16[bs[p + 5]];
363 		r <<= 4;
364 
365 		r |= digits16[bs[p + 6]];
366 
367 		final int last = digits16[bs[p + 7]];
368 		if (r < 0 || last < 0)
369 			throw new ArrayIndexOutOfBoundsException();
370 		return (r << 4) | last;
371 	}
372 
373 	/**
374 	 * Parse 16 character base 16 (hex) formatted string to unsigned long.
375 	 * <p>
376 	 * The number is read in network byte order, that is, most significant
377 	 * nibble first.
378 	 *
379 	 * @param bs
380 	 *            buffer to parse digits from; positions {@code [p, p+16)} will
381 	 *            be parsed.
382 	 * @param p
383 	 *            first position within the buffer to parse.
384 	 * @return the integer value.
385 	 * @throws java.lang.ArrayIndexOutOfBoundsException
386 	 *             if the string is not hex formatted.
387 	 * @since 4.3
388 	 */
389 	public static final long parseHexInt64(final byte[] bs, final int p) {
390 		long r = digits16[bs[p]] << 4;
391 
392 		r |= digits16[bs[p + 1]];
393 		r <<= 4;
394 
395 		r |= digits16[bs[p + 2]];
396 		r <<= 4;
397 
398 		r |= digits16[bs[p + 3]];
399 		r <<= 4;
400 
401 		r |= digits16[bs[p + 4]];
402 		r <<= 4;
403 
404 		r |= digits16[bs[p + 5]];
405 		r <<= 4;
406 
407 		r |= digits16[bs[p + 6]];
408 		r <<= 4;
409 
410 		r |= digits16[bs[p + 7]];
411 		r <<= 4;
412 
413 		r |= digits16[bs[p + 8]];
414 		r <<= 4;
415 
416 		r |= digits16[bs[p + 9]];
417 		r <<= 4;
418 
419 		r |= digits16[bs[p + 10]];
420 		r <<= 4;
421 
422 		r |= digits16[bs[p + 11]];
423 		r <<= 4;
424 
425 		r |= digits16[bs[p + 12]];
426 		r <<= 4;
427 
428 		r |= digits16[bs[p + 13]];
429 		r <<= 4;
430 
431 		r |= digits16[bs[p + 14]];
432 
433 		final int last = digits16[bs[p + 15]];
434 		if (r < 0 || last < 0)
435 			throw new ArrayIndexOutOfBoundsException();
436 		return (r << 4) | last;
437 	}
438 
439 	/**
440 	 * Parse a single hex digit to its numeric value (0-15).
441 	 *
442 	 * @param digit
443 	 *            hex character to parse.
444 	 * @return numeric value, in the range 0-15.
445 	 * @throws java.lang.ArrayIndexOutOfBoundsException
446 	 *             if the input digit is not a valid hex digit.
447 	 */
448 	public static final int parseHexInt4(final byte digit) {
449 		final byte r = digits16[digit];
450 		if (r < 0)
451 			throw new ArrayIndexOutOfBoundsException();
452 		return r;
453 	}
454 
455 	/**
456 	 * Parse a Git style timezone string.
457 	 * <p>
458 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
459 	 * lower two positions count minutes, not 100ths of an hour.
460 	 *
461 	 * @param b
462 	 *            buffer to scan.
463 	 * @param ptr
464 	 *            position within buffer to start parsing digits at.
465 	 * @return the timezone at this location, expressed in minutes.
466 	 */
467 	public static final int parseTimeZoneOffset(byte[] b, int ptr) {
468 		return parseTimeZoneOffset(b, ptr, null);
469 	}
470 
471 	/**
472 	 * Parse a Git style timezone string.
473 	 * <p>
474 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
475 	 * lower two positions count minutes, not 100ths of an hour.
476 	 *
477 	 * @param b
478 	 *            buffer to scan.
479 	 * @param ptr
480 	 *            position within buffer to start parsing digits at.
481 	 * @param ptrResult
482 	 *            optional location to return the new ptr value through. If null
483 	 *            the ptr value will be discarded.
484 	 * @return the timezone at this location, expressed in minutes.
485 	 * @since 4.1
486 	 */
487 	public static final int parseTimeZoneOffset(final byte[] b, int ptr,
488 			MutableInteger ptrResult) {
489 		final int v = parseBase10(b, ptr, ptrResult);
490 		final int tzMins = v % 100;
491 		final int tzHours = v / 100;
492 		return tzHours * 60 + tzMins;
493 	}
494 
495 	/**
496 	 * Locate the first position after a given character.
497 	 *
498 	 * @param b
499 	 *            buffer to scan.
500 	 * @param ptr
501 	 *            position within buffer to start looking for chrA at.
502 	 * @param chrA
503 	 *            character to find.
504 	 * @return new position just after chrA.
505 	 */
506 	public static final int next(byte[] b, int ptr, char chrA) {
507 		final int sz = b.length;
508 		while (ptr < sz) {
509 			if (b[ptr++] == chrA)
510 				return ptr;
511 		}
512 		return ptr;
513 	}
514 
515 	/**
516 	 * Locate the first position after the next LF.
517 	 * <p>
518 	 * This method stops on the first '\n' it finds.
519 	 *
520 	 * @param b
521 	 *            buffer to scan.
522 	 * @param ptr
523 	 *            position within buffer to start looking for LF at.
524 	 * @return new position just after the first LF found.
525 	 */
526 	public static final int nextLF(byte[] b, int ptr) {
527 		return next(b, ptr, '\n');
528 	}
529 
530 	/**
531 	 * Locate the first position after either the given character or LF.
532 	 * <p>
533 	 * This method stops on the first match it finds from either chrA or '\n'.
534 	 *
535 	 * @param b
536 	 *            buffer to scan.
537 	 * @param ptr
538 	 *            position within buffer to start looking for chrA or LF at.
539 	 * @param chrA
540 	 *            character to find.
541 	 * @return new position just after the first chrA or LF to be found.
542 	 */
543 	public static final int nextLF(byte[] b, int ptr, char chrA) {
544 		final int sz = b.length;
545 		while (ptr < sz) {
546 			final byte c = b[ptr++];
547 			if (c == chrA || c == '\n')
548 				return ptr;
549 		}
550 		return ptr;
551 	}
552 
553 	/**
554 	 * Locate the end of the header.  Note that headers may be
555 	 * more than one line long.
556 	 * @param b
557 	 *            buffer to scan.
558 	 * @param ptr
559 	 *            position within buffer to start looking for the end-of-header.
560 	 * @return new position just after the header.  This is either
561 	 * b.length, or the index of the header's terminating newline.
562 	 * @since 5.1
563 	 */
564 	public static final int headerEnd(final byte[] b, int ptr) {
565 		final int sz = b.length;
566 		while (ptr < sz) {
567 			final byte c = b[ptr++];
568 			if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
569 				return ptr - 1;
570 			}
571 		}
572 		return ptr - 1;
573 	}
574 
575 	/**
576 	 * Find the start of the contents of a given header.
577 	 *
578 	 * @param b
579 	 *            buffer to scan.
580 	 * @param headerName
581 	 *            header to search for
582 	 * @param ptr
583 	 *            position within buffer to start looking for header at.
584 	 * @return new position at the start of the header's contents, -1 for
585 	 *         not found
586 	 * @since 5.1
587 	 */
588 	public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
589 		// Start by advancing to just past a LF or buffer start
590 		if (ptr != 0) {
591 			ptr = nextLF(b, ptr - 1);
592 		}
593 		while (ptr < b.length - (headerName.length + 1)) {
594 			boolean found = true;
595 			for (int i = 0; i < headerName.length; i++) {
596 				if (headerName[i] != b[ptr++]) {
597 					found = false;
598 					break;
599 				}
600 			}
601 			if (found && b[ptr++] == ' ') {
602 				return ptr;
603 			}
604 			ptr = nextLF(b, ptr);
605 		}
606 		return -1;
607 	}
608 
609 	/**
610 	 * Locate the first position before a given character.
611 	 *
612 	 * @param b
613 	 *            buffer to scan.
614 	 * @param ptr
615 	 *            position within buffer to start looking for chrA at.
616 	 * @param chrA
617 	 *            character to find.
618 	 * @return new position just before chrA, -1 for not found
619 	 */
620 	public static final int prev(byte[] b, int ptr, char chrA) {
621 		if (ptr == b.length)
622 			--ptr;
623 		while (ptr >= 0) {
624 			if (b[ptr--] == chrA)
625 				return ptr;
626 		}
627 		return ptr;
628 	}
629 
630 	/**
631 	 * Locate the first position before the previous LF.
632 	 * <p>
633 	 * This method stops on the first '\n' it finds.
634 	 *
635 	 * @param b
636 	 *            buffer to scan.
637 	 * @param ptr
638 	 *            position within buffer to start looking for LF at.
639 	 * @return new position just before the first LF found, -1 for not found
640 	 */
641 	public static final int prevLF(byte[] b, int ptr) {
642 		return prev(b, ptr, '\n');
643 	}
644 
645 	/**
646 	 * Locate the previous position before either the given character or LF.
647 	 * <p>
648 	 * This method stops on the first match it finds from either chrA or '\n'.
649 	 *
650 	 * @param b
651 	 *            buffer to scan.
652 	 * @param ptr
653 	 *            position within buffer to start looking for chrA or LF at.
654 	 * @param chrA
655 	 *            character to find.
656 	 * @return new position just before the first chrA or LF to be found, -1 for
657 	 *         not found
658 	 */
659 	public static final int prevLF(byte[] b, int ptr, char chrA) {
660 		if (ptr == b.length)
661 			--ptr;
662 		while (ptr >= 0) {
663 			final byte c = b[ptr--];
664 			if (c == chrA || c == '\n')
665 				return ptr;
666 		}
667 		return ptr;
668 	}
669 
670 	/**
671 	 * Index the region between <code>[ptr, end)</code> to find line starts.
672 	 * <p>
673 	 * The returned list is 1 indexed. Index 0 contains
674 	 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
675 	 * <p>
676 	 * Using a 1 indexed list means that line numbers can be directly accessed
677 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
678 	 * <code>ptr</code>.
679 	 * <p>
680 	 * The last element (index <code>map.size()-1</code>) always contains
681 	 * <code>end</code>.
682 	 *
683 	 * @param buf
684 	 *            buffer to scan.
685 	 * @param ptr
686 	 *            position within the buffer corresponding to the first byte of
687 	 *            line 1.
688 	 * @param end
689 	 *            1 past the end of the content within <code>buf</code>.
690 	 * @return a line map indicating the starting position of each line.
691 	 */
692 	public static final IntList lineMap(byte[] buf, int ptr, int end) {
693 		IntList map = new IntList((end - ptr) / 36);
694 		map.fillTo(1, Integer.MIN_VALUE);
695 		for (; ptr < end; ptr = nextLF(buf, ptr)) {
696 			map.add(ptr);
697 		}
698 		map.add(end);
699 		return map;
700 	}
701 
702 	/**
703 	 * Like {@link #lineMap(byte[], int, int)} but throw
704 	 * {@link BinaryBlobException} if a NUL byte is encountered.
705 	 *
706 	 * @param buf
707 	 *            buffer to scan.
708 	 * @param ptr
709 	 *            position within the buffer corresponding to the first byte of
710 	 *            line 1.
711 	 * @param end
712 	 *            1 past the end of the content within <code>buf</code>.
713 	 * @return a line map indicating the starting position of each line.
714 	 * @throws BinaryBlobException
715 	 *            if a NUL byte is found.
716 	 * @since 5.0
717 	 */
718 	public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
719 			throws BinaryBlobException {
720 		IntList map = lineMapOrNull(buf, ptr, end);
721 		if (map == null) {
722 			throw new BinaryBlobException();
723 		}
724 		return map;
725 	}
726 
727 	@Nullable
728 	private static IntList lineMapOrNull(byte[] buf, int ptr, int end) {
729 		// Experimentally derived from multiple source repositories
730 		// the average number of bytes/line is 36. Its a rough guess
731 		// to initially size our map close to the target.
732 		IntList map = new IntList((end - ptr) / 36);
733 		map.add(Integer.MIN_VALUE);
734 		boolean foundLF = true;
735 		for (; ptr < end; ptr++) {
736 			if (foundLF) {
737 				map.add(ptr);
738 			}
739 
740 			if (buf[ptr] == '\0') {
741 				return null;
742 			}
743 
744 			foundLF = (buf[ptr] == '\n');
745 		}
746 		map.add(end);
747 		return map;
748 	}
749 
750 	/**
751 	 * Locate the "author " header line data.
752 	 *
753 	 * @param b
754 	 *            buffer to scan.
755 	 * @param ptr
756 	 *            position in buffer to start the scan at. Most callers should
757 	 *            pass 0 to ensure the scan starts from the beginning of the
758 	 *            commit buffer and does not accidentally look at message body.
759 	 * @return position just after the space in "author ", so the first
760 	 *         character of the author's name. If no author header can be
761 	 *         located -1 is returned.
762 	 */
763 	public static final int author(byte[] b, int ptr) {
764 		final int sz = b.length;
765 		if (ptr == 0)
766 			ptr += 46; // skip the "tree ..." line.
767 		while (ptr < sz && b[ptr] == 'p')
768 			ptr += 48; // skip this parent.
769 		return match(b, ptr, author);
770 	}
771 
772 	/**
773 	 * Locate the "committer " header line data.
774 	 *
775 	 * @param b
776 	 *            buffer to scan.
777 	 * @param ptr
778 	 *            position in buffer to start the scan at. Most callers should
779 	 *            pass 0 to ensure the scan starts from the beginning of the
780 	 *            commit buffer and does not accidentally look at message body.
781 	 * @return position just after the space in "committer ", so the first
782 	 *         character of the committer's name. If no committer header can be
783 	 *         located -1 is returned.
784 	 */
785 	public static final int committer(byte[] b, int ptr) {
786 		final int sz = b.length;
787 		if (ptr == 0)
788 			ptr += 46; // skip the "tree ..." line.
789 		while (ptr < sz && b[ptr] == 'p')
790 			ptr += 48; // skip this parent.
791 		if (ptr < sz && b[ptr] == 'a')
792 			ptr = nextLF(b, ptr);
793 		return match(b, ptr, committer);
794 	}
795 
796 	/**
797 	 * Locate the "tagger " header line data.
798 	 *
799 	 * @param b
800 	 *            buffer to scan.
801 	 * @param ptr
802 	 *            position in buffer to start the scan at. Most callers should
803 	 *            pass 0 to ensure the scan starts from the beginning of the tag
804 	 *            buffer and does not accidentally look at message body.
805 	 * @return position just after the space in "tagger ", so the first
806 	 *         character of the tagger's name. If no tagger header can be
807 	 *         located -1 is returned.
808 	 */
809 	public static final int tagger(byte[] b, int ptr) {
810 		final int sz = b.length;
811 		if (ptr == 0)
812 			ptr += 48; // skip the "object ..." line.
813 		while (ptr < sz) {
814 			if (b[ptr] == '\n')
815 				return -1;
816 			final int m = match(b, ptr, tagger);
817 			if (m >= 0)
818 				return m;
819 			ptr = nextLF(b, ptr);
820 		}
821 		return -1;
822 	}
823 
824 	/**
825 	 * Locate the "encoding " header line.
826 	 *
827 	 * @param b
828 	 *            buffer to scan.
829 	 * @param ptr
830 	 *            position in buffer to start the scan at. Most callers should
831 	 *            pass 0 to ensure the scan starts from the beginning of the
832 	 *            buffer and does not accidentally look at the message body.
833 	 * @return position just after the space in "encoding ", so the first
834 	 *         character of the encoding's name. If no encoding header can be
835 	 *         located -1 is returned (and UTF-8 should be assumed).
836 	 */
837 	public static final int encoding(byte[] b, int ptr) {
838 		final int sz = b.length;
839 		while (ptr < sz) {
840 			if (b[ptr] == '\n')
841 				return -1;
842 			if (b[ptr] == 'e')
843 				break;
844 			ptr = nextLF(b, ptr);
845 		}
846 		return match(b, ptr, encoding);
847 	}
848 
849 	/**
850 	 * Parse the "encoding " header as a string.
851 	 * <p>
852 	 * Locates the "encoding " header (if present) and returns its value.
853 	 *
854 	 * @param b
855 	 *            buffer to scan.
856 	 * @return the encoding header as specified in the commit; null if the
857 	 *         header was not present and should be assumed.
858 	 * @since 4.2
859 	 */
860 	@Nullable
861 	public static String parseEncodingName(byte[] b) {
862 		int enc = encoding(b, 0);
863 		if (enc < 0) {
864 			return null;
865 		}
866 		int lf = nextLF(b, enc);
867 		return decode(UTF_8, b, enc, lf - 1);
868 	}
869 
870 	/**
871 	 * Parse the "encoding " header into a character set reference.
872 	 * <p>
873 	 * Locates the "encoding " header (if present) by first calling
874 	 * {@link #encoding(byte[], int)} and then returns the proper character set
875 	 * to apply to this buffer to evaluate its contents as character data.
876 	 * <p>
877 	 * If no encoding header is present {@code UTF-8} is assumed.
878 	 *
879 	 * @param b
880 	 *            buffer to scan.
881 	 * @return the Java character set representation. Never null.
882 	 * @throws IllegalCharsetNameException
883 	 *             if the character set requested by the encoding header is
884 	 *             malformed and unsupportable.
885 	 * @throws UnsupportedCharsetException
886 	 *             if the JRE does not support the character set requested by
887 	 *             the encoding header.
888 	 */
889 	public static Charset parseEncoding(byte[] b) {
890 		String enc = parseEncodingName(b);
891 		if (enc == null) {
892 			return UTF_8;
893 		}
894 
895 		String name = enc.trim();
896 		try {
897 			return Charset.forName(name);
898 		} catch (IllegalCharsetNameException
899 				| UnsupportedCharsetException badName) {
900 			Charset aliased = charsetForAlias(name);
901 			if (aliased != null) {
902 				return aliased;
903 			}
904 			throw badName;
905 		}
906 	}
907 
908 	/**
909 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
910 	 * <p>
911 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
912 	 * parsed name afterwards.
913 	 *
914 	 * @param in
915 	 *            the string to parse a name from.
916 	 * @return the parsed identity or null in case the identity could not be
917 	 *         parsed.
918 	 */
919 	public static PersonIdent parsePersonIdent(String in) {
920 		return parsePersonIdent(Constants.encode(in), 0);
921 	}
922 
923 	/**
924 	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
925 	 * <p>
926 	 * When passing in a value for <code>nameB</code> callers should use the
927 	 * return value of {@link #author(byte[], int)} or
928 	 * {@link #committer(byte[], int)}, as these methods provide the proper
929 	 * position within the buffer.
930 	 *
931 	 * @param raw
932 	 *            the buffer to parse character data from.
933 	 * @param nameB
934 	 *            first position of the identity information. This should be the
935 	 *            first position after the space which delimits the header field
936 	 *            name (e.g. "author" or "committer") from the rest of the
937 	 *            identity line.
938 	 * @return the parsed identity or null in case the identity could not be
939 	 *         parsed.
940 	 */
941 	public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
942 		Charset cs;
943 		try {
944 			cs = parseEncoding(raw);
945 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
946 			// Assume UTF-8 for person identities, usually this is correct.
947 			// If not decode() will fall back to the ISO-8859-1 encoding.
948 			cs = UTF_8;
949 		}
950 
951 		final int emailB = nextLF(raw, nameB, '<');
952 		final int emailE = nextLF(raw, emailB, '>');
953 		if (emailB >= raw.length || raw[emailB] == '\n' ||
954 				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
955 			return null;
956 
957 		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
958 				emailB - 2 : emailB - 1;
959 		final String name = decode(cs, raw, nameB, nameEnd);
960 		final String email = decode(cs, raw, emailB, emailE - 1);
961 
962 		// Start searching from end of line, as after first name-email pair,
963 		// another name-email pair may occur. We will ignore all kinds of
964 		// "junk" following the first email.
965 		//
966 		// We've to use (emailE - 1) for the case that raw[email] is LF,
967 		// otherwise we would run too far. "-2" is necessary to position
968 		// before the LF in case of LF termination resp. the penultimate
969 		// character if there is no trailing LF.
970 		final int tzBegin = lastIndexOfTrim(raw, ' ',
971 				nextLF(raw, emailE - 1) - 2) + 1;
972 		if (tzBegin <= emailE) // No time/zone, still valid
973 			return new PersonIdent(name, email, 0, 0);
974 
975 		final int whenBegin = Math.max(emailE,
976 				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
977 		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
978 			return new PersonIdent(name, email, 0, 0);
979 
980 		final long when = parseLongBase10(raw, whenBegin, null);
981 		final int tz = parseTimeZoneOffset(raw, tzBegin);
982 		return new PersonIdent(name, email, when * 1000L, tz);
983 	}
984 
985 	/**
986 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
987 	 * <p>
988 	 * When passing in a value for <code>nameB</code> callers should use the
989 	 * return value of {@link #author(byte[], int)} or
990 	 * {@link #committer(byte[], int)}, as these methods provide the proper
991 	 * position within the buffer.
992 	 *
993 	 * @param raw
994 	 *            the buffer to parse character data from.
995 	 * @param nameB
996 	 *            first position of the identity information. This should be the
997 	 *            first position after the space which delimits the header field
998 	 *            name (e.g. "author" or "committer") from the rest of the
999 	 *            identity line.
1000 	 * @return the parsed identity. Never null.
1001 	 */
1002 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
1003 			final int nameB) {
1004 		int stop = nextLF(raw, nameB);
1005 		int emailB = nextLF(raw, nameB, '<');
1006 		int emailE = nextLF(raw, emailB, '>');
1007 		final String name;
1008 		final String email;
1009 		if (emailE < stop) {
1010 			email = decode(raw, emailB, emailE - 1);
1011 		} else {
1012 			email = "invalid"; //$NON-NLS-1$
1013 		}
1014 		if (emailB < stop)
1015 			name = decode(raw, nameB, emailB - 2);
1016 		else
1017 			name = decode(raw, nameB, stop);
1018 
1019 		final MutableInteger.html#MutableInteger">MutableInteger ptrout = new MutableInteger();
1020 		long when;
1021 		int tz;
1022 		if (emailE < stop) {
1023 			when = parseLongBase10(raw, emailE + 1, ptrout);
1024 			tz = parseTimeZoneOffset(raw, ptrout.value);
1025 		} else {
1026 			when = 0;
1027 			tz = 0;
1028 		}
1029 		return new PersonIdent(name, email, when * 1000L, tz);
1030 	}
1031 
1032 	/**
1033 	 * Locate the end of a footer line key string.
1034 	 * <p>
1035 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
1036 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
1037 	 * the first ':'.
1038 	 * <p>
1039 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1040 	 * then this method returns -1.
1041 	 *
1042 	 * @param raw
1043 	 *            buffer to scan.
1044 	 * @param ptr
1045 	 *            first position within raw to consider as a footer line key.
1046 	 * @return position of the ':' which terminates the footer line key if this
1047 	 *         is otherwise a valid footer line key; otherwise -1.
1048 	 */
1049 	public static int endOfFooterLineKey(byte[] raw, int ptr) {
1050 		try {
1051 			for (;;) {
1052 				final byte c = raw[ptr];
1053 				if (footerLineKeyChars[c] == 0) {
1054 					if (c == ':')
1055 						return ptr;
1056 					return -1;
1057 				}
1058 				ptr++;
1059 			}
1060 		} catch (ArrayIndexOutOfBoundsException e) {
1061 			return -1;
1062 		}
1063 	}
1064 
1065 	/**
1066 	 * Decode a buffer under UTF-8, if possible.
1067 	 *
1068 	 * If the byte stream cannot be decoded that way, the platform default is tried
1069 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1070 	 *
1071 	 * @param buffer
1072 	 *            buffer to pull raw bytes from.
1073 	 * @return a string representation of the range <code>[start,end)</code>,
1074 	 *         after decoding the region through the specified character set.
1075 	 */
1076 	public static String decode(byte[] buffer) {
1077 		return decode(buffer, 0, buffer.length);
1078 	}
1079 
1080 	/**
1081 	 * Decode a buffer under UTF-8, if possible.
1082 	 *
1083 	 * If the byte stream cannot be decoded that way, the platform default is
1084 	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1085 	 *
1086 	 * @param buffer
1087 	 *            buffer to pull raw bytes from.
1088 	 * @param start
1089 	 *            start position in buffer
1090 	 * @param end
1091 	 *            one position past the last location within the buffer to take
1092 	 *            data from.
1093 	 * @return a string representation of the range <code>[start,end)</code>,
1094 	 *         after decoding the region through the specified character set.
1095 	 */
1096 	public static String decode(final byte[] buffer, final int start,
1097 			final int end) {
1098 		return decode(UTF_8, buffer, start, end);
1099 	}
1100 
1101 	/**
1102 	 * Decode a buffer under the specified character set if possible.
1103 	 *
1104 	 * If the byte stream cannot be decoded that way, the platform default is tried
1105 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1106 	 *
1107 	 * @param cs
1108 	 *            character set to use when decoding the buffer.
1109 	 * @param buffer
1110 	 *            buffer to pull raw bytes from.
1111 	 * @return a string representation of the range <code>[start,end)</code>,
1112 	 *         after decoding the region through the specified character set.
1113 	 */
1114 	public static String decode(Charset cs, byte[] buffer) {
1115 		return decode(cs, buffer, 0, buffer.length);
1116 	}
1117 
1118 	/**
1119 	 * Decode a region of the buffer under the specified character set if possible.
1120 	 *
1121 	 * If the byte stream cannot be decoded that way, the platform default is tried
1122 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1123 	 *
1124 	 * @param cs
1125 	 *            character set to use when decoding the buffer.
1126 	 * @param buffer
1127 	 *            buffer to pull raw bytes from.
1128 	 * @param start
1129 	 *            first position within the buffer to take data from.
1130 	 * @param end
1131 	 *            one position past the last location within the buffer to take
1132 	 *            data from.
1133 	 * @return a string representation of the range <code>[start,end)</code>,
1134 	 *         after decoding the region through the specified character set.
1135 	 */
1136 	public static String decode(final Charset cs, final byte[] buffer,
1137 			final int start, final int end) {
1138 		try {
1139 			return decodeNoFallback(cs, buffer, start, end);
1140 		} catch (CharacterCodingException e) {
1141 			// Fall back to an ISO-8859-1 style encoding. At least all of
1142 			// the bytes will be present in the output.
1143 			//
1144 			return extractBinaryString(buffer, start, end);
1145 		}
1146 	}
1147 
1148 	/**
1149 	 * Decode a region of the buffer under the specified character set if
1150 	 * possible.
1151 	 *
1152 	 * If the byte stream cannot be decoded that way, the platform default is
1153 	 * tried and if that too fails, an exception is thrown.
1154 	 *
1155 	 * @param cs
1156 	 *            character set to use when decoding the buffer.
1157 	 * @param buffer
1158 	 *            buffer to pull raw bytes from.
1159 	 * @param start
1160 	 *            first position within the buffer to take data from.
1161 	 * @param end
1162 	 *            one position past the last location within the buffer to take
1163 	 *            data from.
1164 	 * @return a string representation of the range <code>[start,end)</code>,
1165 	 *         after decoding the region through the specified character set.
1166 	 * @throws java.nio.charset.CharacterCodingException
1167 	 *             the input is not in any of the tested character sets.
1168 	 */
1169 	public static String decodeNoFallback(final Charset cs,
1170 			final byte[] buffer, final int start, final int end)
1171 			throws CharacterCodingException {
1172 		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1173 		b.mark();
1174 
1175 		// Try our built-in favorite. The assumption here is that
1176 		// decoding will fail if the data is not actually encoded
1177 		// using that encoder.
1178 		try {
1179 			return decode(b, UTF_8);
1180 		} catch (CharacterCodingException e) {
1181 			b.reset();
1182 		}
1183 
1184 		if (!cs.equals(UTF_8)) {
1185 			// Try the suggested encoding, it might be right since it was
1186 			// provided by the caller.
1187 			try {
1188 				return decode(b, cs);
1189 			} catch (CharacterCodingException e) {
1190 				b.reset();
1191 			}
1192 		}
1193 
1194 		// Try the default character set. A small group of people
1195 		// might actually use the same (or very similar) locale.
1196 		Charset defcs = Charset.defaultCharset();
1197 		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1198 			try {
1199 				return decode(b, defcs);
1200 			} catch (CharacterCodingException e) {
1201 				b.reset();
1202 			}
1203 		}
1204 
1205 		throw new CharacterCodingException();
1206 	}
1207 
1208 	/**
1209 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
1210 	 *
1211 	 * Each byte is treated as a single character in the 8859-1 character
1212 	 * encoding, performing a raw binary-&gt;char conversion.
1213 	 *
1214 	 * @param buffer
1215 	 *            buffer to pull raw bytes from.
1216 	 * @param start
1217 	 *            first position within the buffer to take data from.
1218 	 * @param end
1219 	 *            one position past the last location within the buffer to take
1220 	 *            data from.
1221 	 * @return a string representation of the range <code>[start,end)</code>.
1222 	 */
1223 	public static String extractBinaryString(final byte[] buffer,
1224 			final int start, final int end) {
1225 		final StringBuilder r = new StringBuilder(end - start);
1226 		for (int i = start; i < end; i++)
1227 			r.append((char) (buffer[i] & 0xff));
1228 		return r.toString();
1229 	}
1230 
1231 	private static String decode(ByteBuffer b, Charset charset)
1232 			throws CharacterCodingException {
1233 		final CharsetDecoder d = charset.newDecoder();
1234 		d.onMalformedInput(CodingErrorAction.REPORT);
1235 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1236 		return d.decode(b).toString();
1237 	}
1238 
1239 	/**
1240 	 * Locate the position of the commit message body.
1241 	 *
1242 	 * @param b
1243 	 *            buffer to scan.
1244 	 * @param ptr
1245 	 *            position in buffer to start the scan at. Most callers should
1246 	 *            pass 0 to ensure the scan starts from the beginning of the
1247 	 *            commit buffer.
1248 	 * @return position of the user's message buffer.
1249 	 */
1250 	public static final int commitMessage(byte[] b, int ptr) {
1251 		final int sz = b.length;
1252 		if (ptr == 0)
1253 			ptr += 46; // skip the "tree ..." line.
1254 		while (ptr < sz && b[ptr] == 'p')
1255 			ptr += 48; // skip this parent.
1256 
1257 		// Skip any remaining header lines, ignoring what their actual
1258 		// header line type is. This is identical to the logic for a tag.
1259 		//
1260 		return tagMessage(b, ptr);
1261 	}
1262 
1263 	/**
1264 	 * Locate the position of the tag message body.
1265 	 *
1266 	 * @param b
1267 	 *            buffer to scan.
1268 	 * @param ptr
1269 	 *            position in buffer to start the scan at. Most callers should
1270 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1271 	 *            buffer.
1272 	 * @return position of the user's message buffer.
1273 	 */
1274 	public static final int tagMessage(byte[] b, int ptr) {
1275 		final int sz = b.length;
1276 		if (ptr == 0)
1277 			ptr += 48; // skip the "object ..." line.
1278 		while (ptr < sz && b[ptr] != '\n')
1279 			ptr = nextLF(b, ptr);
1280 		if (ptr < sz && b[ptr] == '\n')
1281 			return ptr + 1;
1282 		return -1;
1283 	}
1284 
1285 	/**
1286 	 * Locate the end of a paragraph.
1287 	 * <p>
1288 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1289 	 *
1290 	 * @param b
1291 	 *            buffer to scan.
1292 	 * @param start
1293 	 *            position in buffer to start the scan at. Most callers will
1294 	 *            want to pass the first position of the commit message (as
1295 	 *            found by {@link #commitMessage(byte[], int)}.
1296 	 * @return position of the LF at the end of the paragraph;
1297 	 *         <code>b.length</code> if no paragraph end could be located.
1298 	 */
1299 	public static final int endOfParagraph(byte[] b, int start) {
1300 		int ptr = start;
1301 		final int sz = b.length;
1302 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1303 			ptr = nextLF(b, ptr);
1304 		if (ptr > start && b[ptr - 1] == '\n')
1305 			ptr--;
1306 		if (ptr > start && b[ptr - 1] == '\r')
1307 			ptr--;
1308 		return ptr;
1309 	}
1310 
1311 	/**
1312 	 * Get last index of {@code ch} in raw, trimming spaces.
1313 	 *
1314 	 * @param raw
1315 	 *            buffer to scan.
1316 	 * @param ch
1317 	 *            character to find.
1318 	 * @param pos
1319 	 *            starting position.
1320 	 * @return last index of {@code ch} in raw, trimming spaces.
1321 	 * @since 4.1
1322 	 */
1323 	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1324 		while (pos >= 0 && raw[pos] == ' ')
1325 			pos--;
1326 
1327 		while (pos >= 0 && raw[pos] != ch)
1328 			pos--;
1329 
1330 		return pos;
1331 	}
1332 
1333 	private static Charset charsetForAlias(String name) {
1334 		return encodingAliases.get(StringUtils.toLowerCase(name));
1335 	}
1336 
1337 	private RawParseUtils() {
1338 		// Don't create instances of a static only utility.
1339 	}
1340 }