1 /*
2 * Copyright (C) 2008-2009, Google Inc.
3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others
4 *
5 * This program and the accompanying materials are made available under the
6 * terms of the Eclipse Distribution License v. 1.0 which is available at
7 * https://www.eclipse.org/org/documents/edl-v10.php.
8 *
9 * SPDX-License-Identifier: BSD-3-Clause
10 */
11
12 package org.eclipse.jgit.util;
13
14 import static java.nio.charset.StandardCharsets.ISO_8859_1;
15 import static java.nio.charset.StandardCharsets.UTF_8;
16 import static org.eclipse.jgit.lib.ObjectChecker.author;
17 import static org.eclipse.jgit.lib.ObjectChecker.committer;
18 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
19 import static org.eclipse.jgit.lib.ObjectChecker.tagger;
20
21 import java.nio.ByteBuffer;
22 import java.nio.charset.CharacterCodingException;
23 import java.nio.charset.Charset;
24 import java.nio.charset.CharsetDecoder;
25 import java.nio.charset.CodingErrorAction;
26 import java.nio.charset.IllegalCharsetNameException;
27 import java.nio.charset.UnsupportedCharsetException;
28 import java.util.Arrays;
29 import java.util.HashMap;
30 import java.util.Map;
31
32 import org.eclipse.jgit.annotations.Nullable;
33 import org.eclipse.jgit.errors.BinaryBlobException;
34 import org.eclipse.jgit.lib.Constants;
35 import org.eclipse.jgit.lib.PersonIdent;
36
37 /**
38 * Handy utility functions to parse raw object contents.
39 */
40 public final class RawParseUtils {
/**
 * UTF-8 charset constant.
 *
 * @since 2.2
 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
 */
@Deprecated
public static final Charset UTF8_CHARSET = UTF_8;

/**
 * Lookup table indexed by ASCII code: decimal digit value for '0'..'9',
 * -1 for every other index below {@code '9' + 1}.
 */
private static final byte[] digits10;

/**
 * Lookup table indexed by ASCII code: hex digit value for '0'..'9',
 * 'a'..'f' and 'A'..'F', -1 for every other index below {@code 'f' + 1}.
 */
private static final byte[] digits16;

/**
 * Lookup table indexed by ASCII code: 1 for '-', digits and ASCII letters,
 * 0 for every other index below {@code 'z' + 1}.
 */
private static final byte[] footerLineKeyChars;

/** Additional charset names that are mapped onto a JRE charset. */
private static final Map<String, Charset> encodingAliases;

static {
	final Map<String, Charset> aliases = new HashMap<>();
	aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
	aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
	encodingAliases = aliases;

	final byte[] decimal = new byte['9' + 1];
	Arrays.fill(decimal, (byte) -1);
	for (int d = 0; d <= 9; d++) {
		decimal['0' + d] = (byte) d;
	}
	digits10 = decimal;

	final byte[] hex = new byte['f' + 1];
	Arrays.fill(hex, (byte) -1);
	for (int d = 0; d <= 9; d++) {
		hex['0' + d] = (byte) d;
	}
	for (int d = 10; d <= 15; d++) {
		// Lower and upper case letters map to the same value.
		hex['a' + (d - 10)] = (byte) d;
		hex['A' + (d - 10)] = (byte) d;
	}
	digits16 = hex;

	final byte[] keyChars = new byte['z' + 1];
	keyChars['-'] = 1;
	for (char c = '0'; c <= '9'; c++) {
		keyChars[c] = 1;
	}
	for (char c = 'A'; c <= 'Z'; c++) {
		keyChars[c] = 1;
		keyChars[c + ('a' - 'A')] = 1;
	}
	footerLineKeyChars = keyChars;
}
86
87 /**
88 * Determine if b[ptr] matches src.
89 *
90 * @param b
91 * the buffer to scan.
92 * @param ptr
93 * first position within b, this should match src[0].
94 * @param src
95 * the buffer to test for equality with b.
96 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
97 */
98 public static final int match(byte[] b, int ptr, byte[] src) {
99 if (ptr + src.length > b.length)
100 return -1;
101 for (int i = 0; i < src.length; i++, ptr++)
102 if (b[ptr] != src[i])
103 return -1;
104 return ptr;
105 }
106
107 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
108 '6', '7', '8', '9' };
109
110 /**
111 * Format a base 10 numeric into a temporary buffer.
112 * <p>
113 * Formatting is performed backwards. The method starts at offset
114 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
115 * <code>digits</code> is the number of positions necessary to store the
116 * base 10 value.
117 * <p>
118 * The argument and return values from this method make it easy to chain
119 * writing, for example:
120 * </p>
121 *
122 * <pre>
123 * final byte[] tmp = new byte[64];
124 * int ptr = tmp.length;
125 * tmp[--ptr] = '\n';
126 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
127 * tmp[--ptr] = ' ';
128 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
129 * tmp[--ptr] = 0;
130 * final String str = new String(tmp, ptr, tmp.length - ptr);
131 * </pre>
132 *
133 * @param b
134 * buffer to write into.
135 * @param o
136 * one offset past the location where writing will begin; writing
137 * proceeds towards lower index values.
138 * @param value
139 * the value to store.
140 * @return the new offset value <code>o</code>. This is the position of
141 * the last byte written. Additional writing should start at one
142 * position earlier.
143 */
144 public static int formatBase10(final byte[] b, int o, int value) {
145 if (value == 0) {
146 b[--o] = '0';
147 return o;
148 }
149 final boolean isneg = value < 0;
150 if (isneg)
151 value = -value;
152 while (value != 0) {
153 b[--o] = base10byte[value % 10];
154 value /= 10;
155 }
156 if (isneg)
157 b[--o] = '-';
158 return o;
159 }
160
161 /**
162 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
163 * <p>
164 * Digit sequences can begin with an optional run of spaces before the
165 * sequence, and may start with a '+' or a '-' to indicate sign position.
166 * Any other characters will cause the method to stop and return the current
167 * result to the caller.
168 *
169 * @param b
170 * buffer to scan.
171 * @param ptr
172 * position within buffer to start parsing digits at.
173 * @param ptrResult
174 * optional location to return the new ptr value through. If null
175 * the ptr value will be discarded.
176 * @return the value at this location; 0 if the location is not a valid
177 * numeric.
178 */
179 public static final int parseBase10(final byte[] b, int ptr,
180 final MutableInteger ptrResult) {
181 int r = 0;
182 int sign = 0;
183 try {
184 final int sz = b.length;
185 while (ptr < sz && b[ptr] == ' ')
186 ptr++;
187 if (ptr >= sz)
188 return 0;
189
190 switch (b[ptr]) {
191 case '-':
192 sign = -1;
193 ptr++;
194 break;
195 case '+':
196 ptr++;
197 break;
198 }
199
200 while (ptr < sz) {
201 final byte v = digits10[b[ptr]];
202 if (v < 0)
203 break;
204 r = (r * 10) + v;
205 ptr++;
206 }
207 } catch (ArrayIndexOutOfBoundsException e) {
208 // Not a valid digit.
209 }
210 if (ptrResult != null)
211 ptrResult.value = ptr;
212 return sign < 0 ? -r : r;
213 }
214
215 /**
216 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
217 * <p>
218 * Digit sequences can begin with an optional run of spaces before the
219 * sequence, and may start with a '+' or a '-' to indicate sign position.
220 * Any other characters will cause the method to stop and return the current
221 * result to the caller.
222 *
223 * @param b
224 * buffer to scan.
225 * @param ptr
226 * position within buffer to start parsing digits at.
227 * @param ptrResult
228 * optional location to return the new ptr value through. If null
229 * the ptr value will be discarded.
230 * @return the value at this location; 0 if the location is not a valid
231 * numeric.
232 */
233 public static final long parseLongBase10(final byte[] b, int ptr,
234 final MutableInteger ptrResult) {
235 long r = 0;
236 int sign = 0;
237 try {
238 final int sz = b.length;
239 while (ptr < sz && b[ptr] == ' ')
240 ptr++;
241 if (ptr >= sz)
242 return 0;
243
244 switch (b[ptr]) {
245 case '-':
246 sign = -1;
247 ptr++;
248 break;
249 case '+':
250 ptr++;
251 break;
252 }
253
254 while (ptr < sz) {
255 final byte v = digits10[b[ptr]];
256 if (v < 0)
257 break;
258 r = (r * 10) + v;
259 ptr++;
260 }
261 } catch (ArrayIndexOutOfBoundsException e) {
262 // Not a valid digit.
263 }
264 if (ptrResult != null)
265 ptrResult.value = ptr;
266 return sign < 0 ? -r : r;
267 }
268
269 /**
270 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
271 * <p>
272 * The number is read in network byte order, that is, most significant
273 * nybble first.
274 *
275 * @param bs
276 * buffer to parse digits from; positions {@code [p, p+4)} will
277 * be parsed.
278 * @param p
279 * first position within the buffer to parse.
280 * @return the integer value.
281 * @throws java.lang.ArrayIndexOutOfBoundsException
282 * if the string is not hex formatted.
283 */
284 public static final int parseHexInt16(final byte[] bs, final int p) {
285 int r = digits16[bs[p]] << 4;
286
287 r |= digits16[bs[p + 1]];
288 r <<= 4;
289
290 r |= digits16[bs[p + 2]];
291 r <<= 4;
292
293 r |= digits16[bs[p + 3]];
294 if (r < 0)
295 throw new ArrayIndexOutOfBoundsException();
296 return r;
297 }
298
299 /**
300 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
301 * <p>
302 * The number is read in network byte order, that is, most significant
303 * nybble first.
304 *
305 * @param bs
306 * buffer to parse digits from; positions {@code [p, p+8)} will
307 * be parsed.
308 * @param p
309 * first position within the buffer to parse.
310 * @return the integer value.
311 * @throws java.lang.ArrayIndexOutOfBoundsException
312 * if the string is not hex formatted.
313 */
314 public static final int parseHexInt32(final byte[] bs, final int p) {
315 int r = digits16[bs[p]] << 4;
316
317 r |= digits16[bs[p + 1]];
318 r <<= 4;
319
320 r |= digits16[bs[p + 2]];
321 r <<= 4;
322
323 r |= digits16[bs[p + 3]];
324 r <<= 4;
325
326 r |= digits16[bs[p + 4]];
327 r <<= 4;
328
329 r |= digits16[bs[p + 5]];
330 r <<= 4;
331
332 r |= digits16[bs[p + 6]];
333
334 final int last = digits16[bs[p + 7]];
335 if (r < 0 || last < 0)
336 throw new ArrayIndexOutOfBoundsException();
337 return (r << 4) | last;
338 }
339
340 /**
341 * Parse 16 character base 16 (hex) formatted string to unsigned long.
342 * <p>
343 * The number is read in network byte order, that is, most significant
344 * nibble first.
345 *
346 * @param bs
347 * buffer to parse digits from; positions {@code [p, p+16)} will
348 * be parsed.
349 * @param p
350 * first position within the buffer to parse.
351 * @return the integer value.
352 * @throws java.lang.ArrayIndexOutOfBoundsException
353 * if the string is not hex formatted.
354 * @since 4.3
355 */
356 public static final long parseHexInt64(final byte[] bs, final int p) {
357 long r = digits16[bs[p]] << 4;
358
359 r |= digits16[bs[p + 1]];
360 r <<= 4;
361
362 r |= digits16[bs[p + 2]];
363 r <<= 4;
364
365 r |= digits16[bs[p + 3]];
366 r <<= 4;
367
368 r |= digits16[bs[p + 4]];
369 r <<= 4;
370
371 r |= digits16[bs[p + 5]];
372 r <<= 4;
373
374 r |= digits16[bs[p + 6]];
375 r <<= 4;
376
377 r |= digits16[bs[p + 7]];
378 r <<= 4;
379
380 r |= digits16[bs[p + 8]];
381 r <<= 4;
382
383 r |= digits16[bs[p + 9]];
384 r <<= 4;
385
386 r |= digits16[bs[p + 10]];
387 r <<= 4;
388
389 r |= digits16[bs[p + 11]];
390 r <<= 4;
391
392 r |= digits16[bs[p + 12]];
393 r <<= 4;
394
395 r |= digits16[bs[p + 13]];
396 r <<= 4;
397
398 r |= digits16[bs[p + 14]];
399
400 final int last = digits16[bs[p + 15]];
401 if (r < 0 || last < 0)
402 throw new ArrayIndexOutOfBoundsException();
403 return (r << 4) | last;
404 }
405
406 /**
407 * Parse a single hex digit to its numeric value (0-15).
408 *
409 * @param digit
410 * hex character to parse.
411 * @return numeric value, in the range 0-15.
412 * @throws java.lang.ArrayIndexOutOfBoundsException
413 * if the input digit is not a valid hex digit.
414 */
415 public static final int parseHexInt4(final byte digit) {
416 final byte r = digits16[digit];
417 if (r < 0)
418 throw new ArrayIndexOutOfBoundsException();
419 return r;
420 }
421
422 /**
423 * Parse a Git style timezone string.
424 * <p>
425 * The sequence "-0315" will be parsed as the numeric value -195, as the
426 * lower two positions count minutes, not 100ths of an hour.
427 *
428 * @param b
429 * buffer to scan.
430 * @param ptr
431 * position within buffer to start parsing digits at.
432 * @return the timezone at this location, expressed in minutes.
433 */
434 public static final int parseTimeZoneOffset(byte[] b, int ptr) {
435 return parseTimeZoneOffset(b, ptr, null);
436 }
437
438 /**
439 * Parse a Git style timezone string.
440 * <p>
441 * The sequence "-0315" will be parsed as the numeric value -195, as the
442 * lower two positions count minutes, not 100ths of an hour.
443 *
444 * @param b
445 * buffer to scan.
446 * @param ptr
447 * position within buffer to start parsing digits at.
448 * @param ptrResult
449 * optional location to return the new ptr value through. If null
450 * the ptr value will be discarded.
451 * @return the timezone at this location, expressed in minutes.
452 * @since 4.1
453 */
454 public static final int parseTimeZoneOffset(final byte[] b, int ptr,
455 MutableInteger ptrResult) {
456 final int v = parseBase10(b, ptr, ptrResult);
457 final int tzMins = v % 100;
458 final int tzHours = v / 100;
459 return tzHours * 60 + tzMins;
460 }
461
462 /**
463 * Locate the first position after a given character.
464 *
465 * @param b
466 * buffer to scan.
467 * @param ptr
468 * position within buffer to start looking for chrA at.
469 * @param chrA
470 * character to find.
471 * @return new position just after chrA.
472 */
473 public static final int next(byte[] b, int ptr, char chrA) {
474 final int sz = b.length;
475 while (ptr < sz) {
476 if (b[ptr++] == chrA)
477 return ptr;
478 }
479 return ptr;
480 }
481
482 /**
483 * Locate the first position after the next LF.
484 * <p>
485 * This method stops on the first '\n' it finds.
486 *
487 * @param b
488 * buffer to scan.
489 * @param ptr
490 * position within buffer to start looking for LF at.
491 * @return new position just after the first LF found.
492 */
493 public static final int nextLF(byte[] b, int ptr) {
494 return next(b, ptr, '\n');
495 }
496
497 /**
498 * Locate the first position after either the given character or LF.
499 * <p>
500 * This method stops on the first match it finds from either chrA or '\n'.
501 *
502 * @param b
503 * buffer to scan.
504 * @param ptr
505 * position within buffer to start looking for chrA or LF at.
506 * @param chrA
507 * character to find.
508 * @return new position just after the first chrA or LF to be found.
509 */
510 public static final int nextLF(byte[] b, int ptr, char chrA) {
511 final int sz = b.length;
512 while (ptr < sz) {
513 final byte c = b[ptr++];
514 if (c == chrA || c == '\n')
515 return ptr;
516 }
517 return ptr;
518 }
519
520 /**
521 * Locate the end of the header. Note that headers may be
522 * more than one line long.
523 * @param b
524 * buffer to scan.
525 * @param ptr
526 * position within buffer to start looking for the end-of-header.
527 * @return new position just after the header. This is either
528 * b.length, or the index of the header's terminating newline.
529 * @since 5.1
530 */
531 public static final int headerEnd(final byte[] b, int ptr) {
532 final int sz = b.length;
533 while (ptr < sz) {
534 final byte c = b[ptr++];
535 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
536 return ptr - 1;
537 }
538 }
539 return ptr - 1;
540 }
541
542 /**
543 * Find the start of the contents of a given header.
544 *
545 * @param b
546 * buffer to scan.
547 * @param headerName
548 * header to search for
549 * @param ptr
550 * position within buffer to start looking for header at.
551 * @return new position at the start of the header's contents, -1 for
552 * not found
553 * @since 5.1
554 */
555 public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
556 // Start by advancing to just past a LF or buffer start
557 if (ptr != 0) {
558 ptr = nextLF(b, ptr - 1);
559 }
560 while (ptr < b.length - (headerName.length + 1)) {
561 boolean found = true;
562 for (byte element : headerName) {
563 if (element != b[ptr++]) {
564 found = false;
565 break;
566 }
567 }
568 if (found && b[ptr++] == ' ') {
569 return ptr;
570 }
571 ptr = nextLF(b, ptr);
572 }
573 return -1;
574 }
575
576 /**
577 * Locate the first position before a given character.
578 *
579 * @param b
580 * buffer to scan.
581 * @param ptr
582 * position within buffer to start looking for chrA at.
583 * @param chrA
584 * character to find.
585 * @return new position just before chrA, -1 for not found
586 */
587 public static final int prev(byte[] b, int ptr, char chrA) {
588 if (ptr == b.length)
589 --ptr;
590 while (ptr >= 0) {
591 if (b[ptr--] == chrA)
592 return ptr;
593 }
594 return ptr;
595 }
596
597 /**
598 * Locate the first position before the previous LF.
599 * <p>
600 * This method stops on the first '\n' it finds.
601 *
602 * @param b
603 * buffer to scan.
604 * @param ptr
605 * position within buffer to start looking for LF at.
606 * @return new position just before the first LF found, -1 for not found
607 */
608 public static final int prevLF(byte[] b, int ptr) {
609 return prev(b, ptr, '\n');
610 }
611
612 /**
613 * Locate the previous position before either the given character or LF.
614 * <p>
615 * This method stops on the first match it finds from either chrA or '\n'.
616 *
617 * @param b
618 * buffer to scan.
619 * @param ptr
620 * position within buffer to start looking for chrA or LF at.
621 * @param chrA
622 * character to find.
623 * @return new position just before the first chrA or LF to be found, -1 for
624 * not found
625 */
626 public static final int prevLF(byte[] b, int ptr, char chrA) {
627 if (ptr == b.length)
628 --ptr;
629 while (ptr >= 0) {
630 final byte c = b[ptr--];
631 if (c == chrA || c == '\n')
632 return ptr;
633 }
634 return ptr;
635 }
636
637 /**
638 * Index the region between <code>[ptr, end)</code> to find line starts.
639 * <p>
640 * The returned list is 1 indexed. Index 0 contains
641 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
642 * <p>
643 * Using a 1 indexed list means that line numbers can be directly accessed
644 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
645 * <code>ptr</code>.
646 * <p>
647 * The last element (index <code>map.size()-1</code>) always contains
648 * <code>end</code>.
649 *
650 * @param buf
651 * buffer to scan.
652 * @param ptr
653 * position within the buffer corresponding to the first byte of
654 * line 1.
655 * @param end
656 * 1 past the end of the content within <code>buf</code>.
657 * @return a line map indicating the starting position of each line.
658 */
659 public static final IntList lineMap(byte[] buf, int ptr, int end) {
660 IntList map = new IntList((end - ptr) / 36);
661 map.fillTo(1, Integer.MIN_VALUE);
662 for (; ptr < end; ptr = nextLF(buf, ptr)) {
663 map.add(ptr);
664 }
665 map.add(end);
666 return map;
667 }
668
669 /**
670 * Like {@link #lineMap(byte[], int, int)} but throw
671 * {@link BinaryBlobException} if a NUL byte is encountered.
672 *
673 * @param buf
674 * buffer to scan.
675 * @param ptr
676 * position within the buffer corresponding to the first byte of
677 * line 1.
678 * @param end
679 * 1 past the end of the content within <code>buf</code>.
680 * @return a line map indicating the starting position of each line.
681 * @throws BinaryBlobException
682 * if a NUL byte is found.
683 * @since 5.0
684 */
685 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
686 throws BinaryBlobException {
687 IntList map = lineMapOrNull(buf, ptr, end);
688 if (map == null) {
689 throw new BinaryBlobException();
690 }
691 return map;
692 }
693
694 @Nullable
695 private static IntList lineMapOrNull(byte[] buf, int ptr, int end) {
696 // Experimentally derived from multiple source repositories
697 // the average number of bytes/line is 36. Its a rough guess
698 // to initially size our map close to the target.
699 IntList map = new IntList((end - ptr) / 36);
700 map.add(Integer.MIN_VALUE);
701 boolean foundLF = true;
702 for (; ptr < end; ptr++) {
703 if (foundLF) {
704 map.add(ptr);
705 }
706
707 if (buf[ptr] == '\0') {
708 return null;
709 }
710
711 foundLF = (buf[ptr] == '\n');
712 }
713 map.add(end);
714 return map;
715 }
716
717 /**
718 * Locate the "author " header line data.
719 *
720 * @param b
721 * buffer to scan.
722 * @param ptr
723 * position in buffer to start the scan at. Most callers should
724 * pass 0 to ensure the scan starts from the beginning of the
725 * commit buffer and does not accidentally look at message body.
726 * @return position just after the space in "author ", so the first
727 * character of the author's name. If no author header can be
728 * located -1 is returned.
729 */
730 public static final int author(byte[] b, int ptr) {
731 final int sz = b.length;
732 if (ptr == 0)
733 ptr += 46; // skip the "tree ..." line.
734 while (ptr < sz && b[ptr] == 'p')
735 ptr += 48; // skip this parent.
736 return match(b, ptr, author);
737 }
738
739 /**
740 * Locate the "committer " header line data.
741 *
742 * @param b
743 * buffer to scan.
744 * @param ptr
745 * position in buffer to start the scan at. Most callers should
746 * pass 0 to ensure the scan starts from the beginning of the
747 * commit buffer and does not accidentally look at message body.
748 * @return position just after the space in "committer ", so the first
749 * character of the committer's name. If no committer header can be
750 * located -1 is returned.
751 */
752 public static final int committer(byte[] b, int ptr) {
753 final int sz = b.length;
754 if (ptr == 0)
755 ptr += 46; // skip the "tree ..." line.
756 while (ptr < sz && b[ptr] == 'p')
757 ptr += 48; // skip this parent.
758 if (ptr < sz && b[ptr] == 'a')
759 ptr = nextLF(b, ptr);
760 return match(b, ptr, committer);
761 }
762
763 /**
764 * Locate the "tagger " header line data.
765 *
766 * @param b
767 * buffer to scan.
768 * @param ptr
769 * position in buffer to start the scan at. Most callers should
770 * pass 0 to ensure the scan starts from the beginning of the tag
771 * buffer and does not accidentally look at message body.
772 * @return position just after the space in "tagger ", so the first
773 * character of the tagger's name. If no tagger header can be
774 * located -1 is returned.
775 */
776 public static final int tagger(byte[] b, int ptr) {
777 final int sz = b.length;
778 if (ptr == 0)
779 ptr += 48; // skip the "object ..." line.
780 while (ptr < sz) {
781 if (b[ptr] == '\n')
782 return -1;
783 final int m = match(b, ptr, tagger);
784 if (m >= 0)
785 return m;
786 ptr = nextLF(b, ptr);
787 }
788 return -1;
789 }
790
791 /**
792 * Locate the "encoding " header line.
793 *
794 * @param b
795 * buffer to scan.
796 * @param ptr
797 * position in buffer to start the scan at. Most callers should
798 * pass 0 to ensure the scan starts from the beginning of the
799 * buffer and does not accidentally look at the message body.
800 * @return position just after the space in "encoding ", so the first
801 * character of the encoding's name. If no encoding header can be
802 * located -1 is returned (and UTF-8 should be assumed).
803 */
804 public static final int encoding(byte[] b, int ptr) {
805 final int sz = b.length;
806 while (ptr < sz) {
807 if (b[ptr] == '\n')
808 return -1;
809 if (b[ptr] == 'e')
810 break;
811 ptr = nextLF(b, ptr);
812 }
813 return match(b, ptr, encoding);
814 }
815
816 /**
817 * Parse the "encoding " header as a string.
818 * <p>
819 * Locates the "encoding " header (if present) and returns its value.
820 *
821 * @param b
822 * buffer to scan.
823 * @return the encoding header as specified in the commit; null if the
824 * header was not present and should be assumed.
825 * @since 4.2
826 */
827 @Nullable
828 public static String parseEncodingName(byte[] b) {
829 int enc = encoding(b, 0);
830 if (enc < 0) {
831 return null;
832 }
833 int lf = nextLF(b, enc);
834 return decode(UTF_8, b, enc, lf - 1);
835 }
836
837 /**
838 * Parse the "encoding " header into a character set reference.
839 * <p>
840 * Locates the "encoding " header (if present) by first calling
841 * {@link #encoding(byte[], int)} and then returns the proper character set
842 * to apply to this buffer to evaluate its contents as character data.
843 * <p>
844 * If no encoding header is present {@code UTF-8} is assumed.
845 *
846 * @param b
847 * buffer to scan.
848 * @return the Java character set representation. Never null.
849 * @throws IllegalCharsetNameException
850 * if the character set requested by the encoding header is
851 * malformed and unsupportable.
852 * @throws UnsupportedCharsetException
853 * if the JRE does not support the character set requested by
854 * the encoding header.
855 */
856 public static Charset parseEncoding(byte[] b) {
857 String enc = parseEncodingName(b);
858 if (enc == null) {
859 return UTF_8;
860 }
861
862 String name = enc.trim();
863 try {
864 return Charset.forName(name);
865 } catch (IllegalCharsetNameException
866 | UnsupportedCharsetException badName) {
867 Charset aliased = charsetForAlias(name);
868 if (aliased != null) {
869 return aliased;
870 }
871 throw badName;
872 }
873 }
874
875 /**
876 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
877 * <p>
878 * Leading spaces won't be trimmed from the string, i.e. will show up in the
879 * parsed name afterwards.
880 *
881 * @param in
882 * the string to parse a name from.
883 * @return the parsed identity or null in case the identity could not be
884 * parsed.
885 */
886 public static PersonIdent parsePersonIdent(String in) {
887 return parsePersonIdent(Constants.encode(in), 0);
888 }
889
/**
 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 * <p>
 * When passing in a value for <code>nameB</code> callers should use the
 * return value of {@link #author(byte[], int)} or
 * {@link #committer(byte[], int)}, as these methods provide the proper
 * position within the buffer.
 * <p>
 * The character set used for the name and email is taken from
 * {@link #parseEncoding(byte[])}; UTF-8 is used when that method rejects
 * the declared encoding. If no timestamp and time zone can be located after
 * the email, the identity is returned with time 0 and zone offset 0.
 *
 * @param raw
 *            the buffer to parse character data from.
 * @param nameB
 *            first position of the identity information. This should be the
 *            first position after the space which delimits the header field
 *            name (e.g. "author" or "committer") from the rest of the
 *            identity line.
 * @return the parsed identity or null in case the identity could not be
 *         parsed.
 */
public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
	Charset cs;
	try {
		cs = parseEncoding(raw);
	} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
		// Assume UTF-8 for person identities, usually this is correct.
		// If not decode() will fall back to the ISO-8859-1 encoding.
		cs = UTF_8;
	}

	// emailB: position just after '<' (or after an LF if that comes first).
	// emailE: position just after '>' (or after an LF if that comes first).
	final int emailB = nextLF(raw, nameB, '<');
	final int emailE = nextLF(raw, emailB, '>');
	// Give up when the '<' scan ran off the buffer, when an LF directly
	// follows the position it stopped at, or when the '>' scan reached the
	// end region of the buffer without actually stopping on '>'.
	if (emailB >= raw.length || raw[emailB] == '\n' ||
			(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
		return null;

	// The name ends before '<'; a single space directly preceding '<' is
	// not part of the name.
	final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
			emailB - 2 : emailB - 1;
	final String name = decode(cs, raw, nameB, nameEnd);
	final String email = decode(cs, raw, emailB, emailE - 1);

	// Start searching from end of line, as after first name-email pair,
	// another name-email pair may occur. We will ignore all kinds of
	// "junk" following the first email.
	//
	// The LF search starts at (emailE - 1) because the email scan may
	// itself have stopped on an LF; starting at emailE would then run
	// into the following line. "-2" is necessary to position
	// before the LF in case of LF termination resp. the penultimate
	// character if there is no trailing LF.
	// NOTE(review): lastIndexOfTrim is defined outside the visible part of
	// this file; its exact trimming behavior should be confirmed there.
	final int tzBegin = lastIndexOfTrim(raw, ' ',
			nextLF(raw, emailE - 1) - 2) + 1;
	if (tzBegin <= emailE) // No time/zone, still valid
		return new PersonIdent(name, email, 0, 0);

	final int whenBegin = Math.max(emailE,
			lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
	if (whenBegin >= tzBegin - 1) // No time/zone, still valid
		return new PersonIdent(name, email, 0, 0);

	// The parsed timestamp is multiplied by 1000 before it is handed to
	// PersonIdent; the zone offset is passed in minutes.
	final long when = parseLongBase10(raw, whenBegin, null);
	final int tz = parseTimeZoneOffset(raw, tzBegin);
	return new PersonIdent(name, email, when * 1000L, tz);
}
951
952 /**
953 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
954 * <p>
955 * When passing in a value for <code>nameB</code> callers should use the
956 * return value of {@link #author(byte[], int)} or
957 * {@link #committer(byte[], int)}, as these methods provide the proper
958 * position within the buffer.
959 *
960 * @param raw
961 * the buffer to parse character data from.
962 * @param nameB
963 * first position of the identity information. This should be the
964 * first position after the space which delimits the header field
965 * name (e.g. "author" or "committer") from the rest of the
966 * identity line.
967 * @return the parsed identity. Never null.
968 */
969 public static PersonIdent parsePersonIdentOnly(final byte[] raw,
970 final int nameB) {
971 int stop = nextLF(raw, nameB);
972 int emailB = nextLF(raw, nameB, '<');
973 int emailE = nextLF(raw, emailB, '>');
974 final String name;
975 final String email;
976 if (emailE < stop) {
977 email = decode(raw, emailB, emailE - 1);
978 } else {
979 email = "invalid"; //$NON-NLS-1$
980 }
981 if (emailB < stop)
982 name = decode(raw, nameB, emailB - 2);
983 else
984 name = decode(raw, nameB, stop);
985
986 final MutableInteger.html#MutableInteger">MutableInteger ptrout = new MutableInteger();
987 long when;
988 int tz;
989 if (emailE < stop) {
990 when = parseLongBase10(raw, emailE + 1, ptrout);
991 tz = parseTimeZoneOffset(raw, ptrout.value);
992 } else {
993 when = 0;
994 tz = 0;
995 }
996 return new PersonIdent(name, email, when * 1000L, tz);
997 }
998
999 /**
1000 * Locate the end of a footer line key string.
1001 * <p>
1002 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
1003 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
1004 * the first ':'.
1005 * <p>
1006 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1007 * then this method returns -1.
1008 *
1009 * @param raw
1010 * buffer to scan.
1011 * @param ptr
1012 * first position within raw to consider as a footer line key.
1013 * @return position of the ':' which terminates the footer line key if this
1014 * is otherwise a valid footer line key; otherwise -1.
1015 */
1016 public static int endOfFooterLineKey(byte[] raw, int ptr) {
1017 try {
1018 for (;;) {
1019 final byte c = raw[ptr];
1020 if (footerLineKeyChars[c] == 0) {
1021 if (c == ':')
1022 return ptr;
1023 return -1;
1024 }
1025 ptr++;
1026 }
1027 } catch (ArrayIndexOutOfBoundsException e) {
1028 return -1;
1029 }
1030 }
1031
1032 /**
1033 * Decode a buffer under UTF-8, if possible.
1034 *
1035 * If the byte stream cannot be decoded that way, the platform default is tried
1036 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1037 *
1038 * @param buffer
1039 * buffer to pull raw bytes from.
1040 * @return a string representation of the range <code>[start,end)</code>,
1041 * after decoding the region through the specified character set.
1042 */
1043 public static String decode(byte[] buffer) {
1044 return decode(buffer, 0, buffer.length);
1045 }
1046
1047 /**
1048 * Decode a buffer under UTF-8, if possible.
1049 *
1050 * If the byte stream cannot be decoded that way, the platform default is
1051 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1052 *
1053 * @param buffer
1054 * buffer to pull raw bytes from.
1055 * @param start
1056 * start position in buffer
1057 * @param end
1058 * one position past the last location within the buffer to take
1059 * data from.
1060 * @return a string representation of the range <code>[start,end)</code>,
1061 * after decoding the region through the specified character set.
1062 */
1063 public static String decode(final byte[] buffer, final int start,
1064 final int end) {
1065 return decode(UTF_8, buffer, start, end);
1066 }
1067
1068 /**
1069 * Decode a buffer under the specified character set if possible.
1070 *
1071 * If the byte stream cannot be decoded that way, the platform default is tried
1072 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1073 *
1074 * @param cs
1075 * character set to use when decoding the buffer.
1076 * @param buffer
1077 * buffer to pull raw bytes from.
1078 * @return a string representation of the range <code>[start,end)</code>,
1079 * after decoding the region through the specified character set.
1080 */
1081 public static String decode(Charset cs, byte[] buffer) {
1082 return decode(cs, buffer, 0, buffer.length);
1083 }
1084
1085 /**
1086 * Decode a region of the buffer under the specified character set if possible.
1087 *
1088 * If the byte stream cannot be decoded that way, the platform default is tried
1089 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1090 *
1091 * @param cs
1092 * character set to use when decoding the buffer.
1093 * @param buffer
1094 * buffer to pull raw bytes from.
1095 * @param start
1096 * first position within the buffer to take data from.
1097 * @param end
1098 * one position past the last location within the buffer to take
1099 * data from.
1100 * @return a string representation of the range <code>[start,end)</code>,
1101 * after decoding the region through the specified character set.
1102 */
1103 public static String decode(final Charset cs, final byte[] buffer,
1104 final int start, final int end) {
1105 try {
1106 return decodeNoFallback(cs, buffer, start, end);
1107 } catch (CharacterCodingException e) {
1108 // Fall back to an ISO-8859-1 style encoding. At least all of
1109 // the bytes will be present in the output.
1110 //
1111 return extractBinaryString(buffer, start, end);
1112 }
1113 }
1114
1115 /**
1116 * Decode a region of the buffer under the specified character set if
1117 * possible.
1118 *
1119 * If the byte stream cannot be decoded that way, the platform default is
1120 * tried and if that too fails, an exception is thrown.
1121 *
1122 * @param cs
1123 * character set to use when decoding the buffer.
1124 * @param buffer
1125 * buffer to pull raw bytes from.
1126 * @param start
1127 * first position within the buffer to take data from.
1128 * @param end
1129 * one position past the last location within the buffer to take
1130 * data from.
1131 * @return a string representation of the range <code>[start,end)</code>,
1132 * after decoding the region through the specified character set.
1133 * @throws java.nio.charset.CharacterCodingException
1134 * the input is not in any of the tested character sets.
1135 */
1136 public static String decodeNoFallback(final Charset cs,
1137 final byte[] buffer, final int start, final int end)
1138 throws CharacterCodingException {
1139 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1140 b.mark();
1141
1142 // Try our built-in favorite. The assumption here is that
1143 // decoding will fail if the data is not actually encoded
1144 // using that encoder.
1145 try {
1146 return decode(b, UTF_8);
1147 } catch (CharacterCodingException e) {
1148 b.reset();
1149 }
1150
1151 if (!cs.equals(UTF_8)) {
1152 // Try the suggested encoding, it might be right since it was
1153 // provided by the caller.
1154 try {
1155 return decode(b, cs);
1156 } catch (CharacterCodingException e) {
1157 b.reset();
1158 }
1159 }
1160
1161 // Try the default character set. A small group of people
1162 // might actually use the same (or very similar) locale.
1163 Charset defcs = Charset.defaultCharset();
1164 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1165 try {
1166 return decode(b, defcs);
1167 } catch (CharacterCodingException e) {
1168 b.reset();
1169 }
1170 }
1171
1172 throw new CharacterCodingException();
1173 }
1174
1175 /**
1176 * Decode a region of the buffer under the ISO-8859-1 encoding.
1177 *
1178 * Each byte is treated as a single character in the 8859-1 character
1179 * encoding, performing a raw binary->char conversion.
1180 *
1181 * @param buffer
1182 * buffer to pull raw bytes from.
1183 * @param start
1184 * first position within the buffer to take data from.
1185 * @param end
1186 * one position past the last location within the buffer to take
1187 * data from.
1188 * @return a string representation of the range <code>[start,end)</code>.
1189 */
1190 public static String extractBinaryString(final byte[] buffer,
1191 final int start, final int end) {
1192 final StringBuilder r = new StringBuilder(end - start);
1193 for (int i = start; i < end; i++)
1194 r.append((char) (buffer[i] & 0xff));
1195 return r.toString();
1196 }
1197
1198 private static String decode(ByteBuffer b, Charset charset)
1199 throws CharacterCodingException {
1200 final CharsetDecoder d = charset.newDecoder();
1201 d.onMalformedInput(CodingErrorAction.REPORT);
1202 d.onUnmappableCharacter(CodingErrorAction.REPORT);
1203 return d.decode(b).toString();
1204 }
1205
1206 /**
1207 * Locate the position of the commit message body.
1208 *
1209 * @param b
1210 * buffer to scan.
1211 * @param ptr
1212 * position in buffer to start the scan at. Most callers should
1213 * pass 0 to ensure the scan starts from the beginning of the
1214 * commit buffer.
1215 * @return position of the user's message buffer.
1216 */
1217 public static final int commitMessage(byte[] b, int ptr) {
1218 final int sz = b.length;
1219 if (ptr == 0)
1220 ptr += 46; // skip the "tree ..." line.
1221 while (ptr < sz && b[ptr] == 'p')
1222 ptr += 48; // skip this parent.
1223
1224 // Skip any remaining header lines, ignoring what their actual
1225 // header line type is. This is identical to the logic for a tag.
1226 //
1227 return tagMessage(b, ptr);
1228 }
1229
1230 /**
1231 * Locate the position of the tag message body.
1232 *
1233 * @param b
1234 * buffer to scan.
1235 * @param ptr
1236 * position in buffer to start the scan at. Most callers should
1237 * pass 0 to ensure the scan starts from the beginning of the tag
1238 * buffer.
1239 * @return position of the user's message buffer.
1240 */
1241 public static final int tagMessage(byte[] b, int ptr) {
1242 final int sz = b.length;
1243 if (ptr == 0)
1244 ptr += 48; // skip the "object ..." line.
1245 while (ptr < sz && b[ptr] != '\n')
1246 ptr = nextLF(b, ptr);
1247 if (ptr < sz && b[ptr] == '\n')
1248 return ptr + 1;
1249 return -1;
1250 }
1251
1252 /**
1253 * Locate the end of a paragraph.
1254 * <p>
1255 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1256 *
1257 * @param b
1258 * buffer to scan.
1259 * @param start
1260 * position in buffer to start the scan at. Most callers will
1261 * want to pass the first position of the commit message (as
1262 * found by {@link #commitMessage(byte[], int)}.
1263 * @return position of the LF at the end of the paragraph;
1264 * <code>b.length</code> if no paragraph end could be located.
1265 */
1266 public static final int endOfParagraph(byte[] b, int start) {
1267 int ptr = start;
1268 final int sz = b.length;
1269 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1270 ptr = nextLF(b, ptr);
1271 if (ptr > start && b[ptr - 1] == '\n')
1272 ptr--;
1273 if (ptr > start && b[ptr - 1] == '\r')
1274 ptr--;
1275 return ptr;
1276 }
1277
1278 /**
1279 * Get last index of {@code ch} in raw, trimming spaces.
1280 *
1281 * @param raw
1282 * buffer to scan.
1283 * @param ch
1284 * character to find.
1285 * @param pos
1286 * starting position.
1287 * @return last index of {@code ch} in raw, trimming spaces.
1288 * @since 4.1
1289 */
1290 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1291 while (pos >= 0 && raw[pos] == ' ')
1292 pos--;
1293
1294 while (pos >= 0 && raw[pos] != ch)
1295 pos--;
1296
1297 return pos;
1298 }
1299
1300 private static Charset charsetForAlias(String name) {
1301 return encodingAliases.get(StringUtils.toLowerCase(name));
1302 }
1303
/** Not instantiable: this class only offers static helpers. */
private RawParseUtils() {
	// Intentionally empty.
}
1307 }