1 /*
2 * Copyright (C) 2008-2009, Google Inc.
3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others
4 *
5 * This program and the accompanying materials are made available under the
6 * terms of the Eclipse Distribution License v. 1.0 which is available at
7 * https://www.eclipse.org/org/documents/edl-v10.php.
8 *
9 * SPDX-License-Identifier: BSD-3-Clause
10 */
11
12 package org.eclipse.jgit.util;
13
14 import static java.nio.charset.StandardCharsets.ISO_8859_1;
15 import static java.nio.charset.StandardCharsets.UTF_8;
16 import static org.eclipse.jgit.lib.ObjectChecker.author;
17 import static org.eclipse.jgit.lib.ObjectChecker.committer;
18 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
19 import static org.eclipse.jgit.lib.ObjectChecker.tagger;
20
21 import java.nio.ByteBuffer;
22 import java.nio.charset.CharacterCodingException;
23 import java.nio.charset.Charset;
24 import java.nio.charset.CharsetDecoder;
25 import java.nio.charset.CodingErrorAction;
26 import java.nio.charset.IllegalCharsetNameException;
27 import java.nio.charset.UnsupportedCharsetException;
28 import java.util.Arrays;
29 import java.util.HashMap;
30 import java.util.Map;
31
32 import org.eclipse.jgit.annotations.Nullable;
33 import org.eclipse.jgit.diff.RawText;
34 import org.eclipse.jgit.errors.BinaryBlobException;
35 import org.eclipse.jgit.lib.Constants;
36 import org.eclipse.jgit.lib.PersonIdent;
37
38 /**
39 * Handy utility functions to parse raw object contents.
40 */
41 public final class RawParseUtils {
/**
 * UTF-8 charset constant.
 *
 * @since 2.2
 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
 */
@Deprecated
public static final Charset UTF8_CHARSET = UTF_8;

/**
 * Maps the ASCII code of a decimal digit to its numeric value; every other
 * index up to '9' holds -1.
 */
private static final byte[] digits10;

/**
 * Maps the ASCII code of a hex digit (either case) to its numeric value;
 * every other index up to 'f' holds -1.
 */
private static final byte[] digits16;

/**
 * Holds 1 at the ASCII code of each character accepted in a footer line
 * key ('-', digits, letters) and 0 at every other index up to 'z'.
 */
private static final byte[] footerLineKeyChars;

/** Additional encoding names, each mapped to the charset it stands for. */
private static final Map<String, Charset> encodingAliases;

static {
	final Map<String, Charset> aliases = new HashMap<>();
	aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
	aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
	encodingAliases = aliases;

	// Decimal digit table: everything is invalid until proven otherwise.
	final byte[] dec = new byte['9' + 1];
	Arrays.fill(dec, (byte) -1);
	for (int d = 0; d < 10; d++) {
		dec['0' + d] = (byte) d;
	}
	digits10 = dec;

	// Hex digit table: accepts 0-9, a-f and A-F.
	final byte[] hex = new byte['f' + 1];
	Arrays.fill(hex, (byte) -1);
	for (int d = 0; d < 10; d++) {
		hex['0' + d] = (byte) d;
	}
	for (int d = 10; d < 16; d++) {
		hex['a' + (d - 10)] = (byte) d;
		hex['A' + (d - 10)] = (byte) d;
	}
	digits16 = hex;

	// Footer key characters: [A-Za-z0-9-].
	final byte[] keyChars = new byte['z' + 1];
	keyChars['-'] = 1;
	for (int c = '0'; c <= '9'; c++) {
		keyChars[c] = 1;
	}
	for (int c = 'A'; c <= 'Z'; c++) {
		keyChars[c] = 1;
	}
	for (int c = 'a'; c <= 'z'; c++) {
		keyChars[c] = 1;
	}
	footerLineKeyChars = keyChars;
}
87
88 /**
89 * Determine if b[ptr] matches src.
90 *
91 * @param b
92 * the buffer to scan.
93 * @param ptr
94 * first position within b, this should match src[0].
95 * @param src
96 * the buffer to test for equality with b.
97 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
98 */
99 public static final int match(byte[] b, int ptr, byte[] src) {
100 if (ptr + src.length > b.length)
101 return -1;
102 for (int i = 0; i < src.length; i++, ptr++)
103 if (b[ptr] != src[i])
104 return -1;
105 return ptr;
106 }
107
108 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
109 '6', '7', '8', '9' };
110
111 /**
112 * Format a base 10 numeric into a temporary buffer.
113 * <p>
114 * Formatting is performed backwards. The method starts at offset
115 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
116 * <code>digits</code> is the number of positions necessary to store the
117 * base 10 value.
118 * <p>
119 * The argument and return values from this method make it easy to chain
120 * writing, for example:
121 * </p>
122 *
123 * <pre>
124 * final byte[] tmp = new byte[64];
125 * int ptr = tmp.length;
126 * tmp[--ptr] = '\n';
127 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
128 * tmp[--ptr] = ' ';
129 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
130 * tmp[--ptr] = 0;
131 * final String str = new String(tmp, ptr, tmp.length - ptr);
132 * </pre>
133 *
134 * @param b
135 * buffer to write into.
136 * @param o
137 * one offset past the location where writing will begin; writing
138 * proceeds towards lower index values.
139 * @param value
140 * the value to store.
141 * @return the new offset value <code>o</code>. This is the position of
142 * the last byte written. Additional writing should start at one
143 * position earlier.
144 */
145 public static int formatBase10(final byte[] b, int o, int value) {
146 if (value == 0) {
147 b[--o] = '0';
148 return o;
149 }
150 final boolean isneg = value < 0;
151 if (isneg)
152 value = -value;
153 while (value != 0) {
154 b[--o] = base10byte[value % 10];
155 value /= 10;
156 }
157 if (isneg)
158 b[--o] = '-';
159 return o;
160 }
161
162 /**
163 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
164 * <p>
165 * Digit sequences can begin with an optional run of spaces before the
166 * sequence, and may start with a '+' or a '-' to indicate sign position.
167 * Any other characters will cause the method to stop and return the current
168 * result to the caller.
169 *
170 * @param b
171 * buffer to scan.
172 * @param ptr
173 * position within buffer to start parsing digits at.
174 * @param ptrResult
175 * optional location to return the new ptr value through. If null
176 * the ptr value will be discarded.
177 * @return the value at this location; 0 if the location is not a valid
178 * numeric.
179 */
180 public static final int parseBase10(final byte[] b, int ptr,
181 final MutableInteger ptrResult) {
182 int r = 0;
183 int sign = 0;
184 try {
185 final int sz = b.length;
186 while (ptr < sz && b[ptr] == ' ')
187 ptr++;
188 if (ptr >= sz)
189 return 0;
190
191 switch (b[ptr]) {
192 case '-':
193 sign = -1;
194 ptr++;
195 break;
196 case '+':
197 ptr++;
198 break;
199 }
200
201 while (ptr < sz) {
202 final byte v = digits10[b[ptr]];
203 if (v < 0)
204 break;
205 r = (r * 10) + v;
206 ptr++;
207 }
208 } catch (ArrayIndexOutOfBoundsException e) {
209 // Not a valid digit.
210 }
211 if (ptrResult != null)
212 ptrResult.value = ptr;
213 return sign < 0 ? -r : r;
214 }
215
216 /**
217 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
218 * <p>
219 * Digit sequences can begin with an optional run of spaces before the
220 * sequence, and may start with a '+' or a '-' to indicate sign position.
221 * Any other characters will cause the method to stop and return the current
222 * result to the caller.
223 *
224 * @param b
225 * buffer to scan.
226 * @param ptr
227 * position within buffer to start parsing digits at.
228 * @param ptrResult
229 * optional location to return the new ptr value through. If null
230 * the ptr value will be discarded.
231 * @return the value at this location; 0 if the location is not a valid
232 * numeric.
233 */
234 public static final long parseLongBase10(final byte[] b, int ptr,
235 final MutableInteger ptrResult) {
236 long r = 0;
237 int sign = 0;
238 try {
239 final int sz = b.length;
240 while (ptr < sz && b[ptr] == ' ')
241 ptr++;
242 if (ptr >= sz)
243 return 0;
244
245 switch (b[ptr]) {
246 case '-':
247 sign = -1;
248 ptr++;
249 break;
250 case '+':
251 ptr++;
252 break;
253 }
254
255 while (ptr < sz) {
256 final byte v = digits10[b[ptr]];
257 if (v < 0)
258 break;
259 r = (r * 10) + v;
260 ptr++;
261 }
262 } catch (ArrayIndexOutOfBoundsException e) {
263 // Not a valid digit.
264 }
265 if (ptrResult != null)
266 ptrResult.value = ptr;
267 return sign < 0 ? -r : r;
268 }
269
270 /**
271 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
272 * <p>
273 * The number is read in network byte order, that is, most significant
274 * nybble first.
275 *
276 * @param bs
277 * buffer to parse digits from; positions {@code [p, p+4)} will
278 * be parsed.
279 * @param p
280 * first position within the buffer to parse.
281 * @return the integer value.
282 * @throws java.lang.ArrayIndexOutOfBoundsException
283 * if the string is not hex formatted.
284 */
285 public static final int parseHexInt16(final byte[] bs, final int p) {
286 int r = digits16[bs[p]] << 4;
287
288 r |= digits16[bs[p + 1]];
289 r <<= 4;
290
291 r |= digits16[bs[p + 2]];
292 r <<= 4;
293
294 r |= digits16[bs[p + 3]];
295 if (r < 0)
296 throw new ArrayIndexOutOfBoundsException();
297 return r;
298 }
299
300 /**
301 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
302 * <p>
303 * The number is read in network byte order, that is, most significant
304 * nybble first.
305 *
306 * @param bs
307 * buffer to parse digits from; positions {@code [p, p+8)} will
308 * be parsed.
309 * @param p
310 * first position within the buffer to parse.
311 * @return the integer value.
312 * @throws java.lang.ArrayIndexOutOfBoundsException
313 * if the string is not hex formatted.
314 */
315 public static final int parseHexInt32(final byte[] bs, final int p) {
316 int r = digits16[bs[p]] << 4;
317
318 r |= digits16[bs[p + 1]];
319 r <<= 4;
320
321 r |= digits16[bs[p + 2]];
322 r <<= 4;
323
324 r |= digits16[bs[p + 3]];
325 r <<= 4;
326
327 r |= digits16[bs[p + 4]];
328 r <<= 4;
329
330 r |= digits16[bs[p + 5]];
331 r <<= 4;
332
333 r |= digits16[bs[p + 6]];
334
335 final int last = digits16[bs[p + 7]];
336 if (r < 0 || last < 0)
337 throw new ArrayIndexOutOfBoundsException();
338 return (r << 4) | last;
339 }
340
341 /**
342 * Parse 16 character base 16 (hex) formatted string to unsigned long.
343 * <p>
344 * The number is read in network byte order, that is, most significant
345 * nibble first.
346 *
347 * @param bs
348 * buffer to parse digits from; positions {@code [p, p+16)} will
349 * be parsed.
350 * @param p
351 * first position within the buffer to parse.
352 * @return the integer value.
353 * @throws java.lang.ArrayIndexOutOfBoundsException
354 * if the string is not hex formatted.
355 * @since 4.3
356 */
357 public static final long parseHexInt64(final byte[] bs, final int p) {
358 long r = digits16[bs[p]] << 4;
359
360 r |= digits16[bs[p + 1]];
361 r <<= 4;
362
363 r |= digits16[bs[p + 2]];
364 r <<= 4;
365
366 r |= digits16[bs[p + 3]];
367 r <<= 4;
368
369 r |= digits16[bs[p + 4]];
370 r <<= 4;
371
372 r |= digits16[bs[p + 5]];
373 r <<= 4;
374
375 r |= digits16[bs[p + 6]];
376 r <<= 4;
377
378 r |= digits16[bs[p + 7]];
379 r <<= 4;
380
381 r |= digits16[bs[p + 8]];
382 r <<= 4;
383
384 r |= digits16[bs[p + 9]];
385 r <<= 4;
386
387 r |= digits16[bs[p + 10]];
388 r <<= 4;
389
390 r |= digits16[bs[p + 11]];
391 r <<= 4;
392
393 r |= digits16[bs[p + 12]];
394 r <<= 4;
395
396 r |= digits16[bs[p + 13]];
397 r <<= 4;
398
399 r |= digits16[bs[p + 14]];
400
401 final int last = digits16[bs[p + 15]];
402 if (r < 0 || last < 0)
403 throw new ArrayIndexOutOfBoundsException();
404 return (r << 4) | last;
405 }
406
407 /**
408 * Parse a single hex digit to its numeric value (0-15).
409 *
410 * @param digit
411 * hex character to parse.
412 * @return numeric value, in the range 0-15.
413 * @throws java.lang.ArrayIndexOutOfBoundsException
414 * if the input digit is not a valid hex digit.
415 */
416 public static final int parseHexInt4(final byte digit) {
417 final byte r = digits16[digit];
418 if (r < 0)
419 throw new ArrayIndexOutOfBoundsException();
420 return r;
421 }
422
423 /**
424 * Parse a Git style timezone string.
425 * <p>
426 * The sequence "-0315" will be parsed as the numeric value -195, as the
427 * lower two positions count minutes, not 100ths of an hour.
428 *
429 * @param b
430 * buffer to scan.
431 * @param ptr
432 * position within buffer to start parsing digits at.
433 * @return the timezone at this location, expressed in minutes.
434 */
435 public static final int parseTimeZoneOffset(byte[] b, int ptr) {
436 return parseTimeZoneOffset(b, ptr, null);
437 }
438
439 /**
440 * Parse a Git style timezone string.
441 * <p>
442 * The sequence "-0315" will be parsed as the numeric value -195, as the
443 * lower two positions count minutes, not 100ths of an hour.
444 *
445 * @param b
446 * buffer to scan.
447 * @param ptr
448 * position within buffer to start parsing digits at.
449 * @param ptrResult
450 * optional location to return the new ptr value through. If null
451 * the ptr value will be discarded.
452 * @return the timezone at this location, expressed in minutes.
453 * @since 4.1
454 */
455 public static final int parseTimeZoneOffset(final byte[] b, int ptr,
456 MutableInteger ptrResult) {
457 final int v = parseBase10(b, ptr, ptrResult);
458 final int tzMins = v % 100;
459 final int tzHours = v / 100;
460 return tzHours * 60 + tzMins;
461 }
462
463 /**
464 * Locate the first position after a given character.
465 *
466 * @param b
467 * buffer to scan.
468 * @param ptr
469 * position within buffer to start looking for chrA at.
470 * @param chrA
471 * character to find.
472 * @return new position just after chrA.
473 */
474 public static final int next(byte[] b, int ptr, char chrA) {
475 final int sz = b.length;
476 while (ptr < sz) {
477 if (b[ptr++] == chrA)
478 return ptr;
479 }
480 return ptr;
481 }
482
483 /**
484 * Locate the first position after the next LF.
485 * <p>
486 * This method stops on the first '\n' it finds.
487 *
488 * @param b
489 * buffer to scan.
490 * @param ptr
491 * position within buffer to start looking for LF at.
492 * @return new position just after the first LF found.
493 */
494 public static final int nextLF(byte[] b, int ptr) {
495 return next(b, ptr, '\n');
496 }
497
498 /**
499 * Locate the first position after either the given character or LF.
500 * <p>
501 * This method stops on the first match it finds from either chrA or '\n'.
502 *
503 * @param b
504 * buffer to scan.
505 * @param ptr
506 * position within buffer to start looking for chrA or LF at.
507 * @param chrA
508 * character to find.
509 * @return new position just after the first chrA or LF to be found.
510 */
511 public static final int nextLF(byte[] b, int ptr, char chrA) {
512 final int sz = b.length;
513 while (ptr < sz) {
514 final byte c = b[ptr++];
515 if (c == chrA || c == '\n')
516 return ptr;
517 }
518 return ptr;
519 }
520
521 /**
522 * Locate the end of the header. Note that headers may be
523 * more than one line long.
524 * @param b
525 * buffer to scan.
526 * @param ptr
527 * position within buffer to start looking for the end-of-header.
528 * @return new position just after the header. This is either
529 * b.length, or the index of the header's terminating newline.
530 * @since 5.1
531 */
532 public static final int headerEnd(final byte[] b, int ptr) {
533 final int sz = b.length;
534 while (ptr < sz) {
535 final byte c = b[ptr++];
536 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
537 return ptr - 1;
538 }
539 }
540 return ptr - 1;
541 }
542
543 /**
544 * Find the start of the contents of a given header.
545 *
546 * @param b
547 * buffer to scan.
548 * @param headerName
549 * header to search for
550 * @param ptr
551 * position within buffer to start looking for header at.
552 * @return new position at the start of the header's contents, -1 for
553 * not found
554 * @since 5.1
555 */
556 public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
557 // Start by advancing to just past a LF or buffer start
558 if (ptr != 0) {
559 ptr = nextLF(b, ptr - 1);
560 }
561 while (ptr < b.length - (headerName.length + 1)) {
562 boolean found = true;
563 for (byte element : headerName) {
564 if (element != b[ptr++]) {
565 found = false;
566 break;
567 }
568 }
569 if (found && b[ptr++] == ' ') {
570 return ptr;
571 }
572 ptr = nextLF(b, ptr);
573 }
574 return -1;
575 }
576
577 /**
578 * Locate the first position before a given character.
579 *
580 * @param b
581 * buffer to scan.
582 * @param ptr
583 * position within buffer to start looking for chrA at.
584 * @param chrA
585 * character to find.
586 * @return new position just before chrA, -1 for not found
587 */
588 public static final int prev(byte[] b, int ptr, char chrA) {
589 if (ptr == b.length)
590 --ptr;
591 while (ptr >= 0) {
592 if (b[ptr--] == chrA)
593 return ptr;
594 }
595 return ptr;
596 }
597
598 /**
599 * Locate the first position before the previous LF.
600 * <p>
601 * This method stops on the first '\n' it finds.
602 *
603 * @param b
604 * buffer to scan.
605 * @param ptr
606 * position within buffer to start looking for LF at.
607 * @return new position just before the first LF found, -1 for not found
608 */
609 public static final int prevLF(byte[] b, int ptr) {
610 return prev(b, ptr, '\n');
611 }
612
613 /**
614 * Locate the previous position before either the given character or LF.
615 * <p>
616 * This method stops on the first match it finds from either chrA or '\n'.
617 *
618 * @param b
619 * buffer to scan.
620 * @param ptr
621 * position within buffer to start looking for chrA or LF at.
622 * @param chrA
623 * character to find.
624 * @return new position just before the first chrA or LF to be found, -1 for
625 * not found
626 */
627 public static final int prevLF(byte[] b, int ptr, char chrA) {
628 if (ptr == b.length)
629 --ptr;
630 while (ptr >= 0) {
631 final byte c = b[ptr--];
632 if (c == chrA || c == '\n')
633 return ptr;
634 }
635 return ptr;
636 }
637
638 /**
639 * Index the region between <code>[ptr, end)</code> to find line starts.
640 * <p>
641 * The returned list is 1 indexed. Index 0 contains
642 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
643 * <p>
644 * Using a 1 indexed list means that line numbers can be directly accessed
645 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
646 * <code>ptr</code>.
647 * <p>
648 * The last element (index <code>map.size()-1</code>) always contains
649 * <code>end</code>.
650 *
651 * @param buf
652 * buffer to scan.
653 * @param ptr
654 * position within the buffer corresponding to the first byte of
655 * line 1.
656 * @param end
657 * 1 past the end of the content within <code>buf</code>.
658 * @return a line map indicating the starting position of each line.
659 */
660 public static final IntList lineMap(byte[] buf, int ptr, int end) {
661 IntList map = new IntList((end - ptr) / 36);
662 map.fillTo(1, Integer.MIN_VALUE);
663 for (; ptr < end; ptr = nextLF(buf, ptr)) {
664 map.add(ptr);
665 }
666 map.add(end);
667 return map;
668 }
669
670 /**
671 * Like {@link #lineMap(byte[], int, int)} but throw
672 * {@link BinaryBlobException} if a NUL byte is encountered.
673 *
674 * @param buf
675 * buffer to scan.
676 * @param ptr
677 * position within the buffer corresponding to the first byte of
678 * line 1.
679 * @param end
680 * 1 past the end of the content within <code>buf</code>.
681 * @return a line map indicating the starting position of each line.
682 * @throws BinaryBlobException
683 * if a NUL byte or a lone CR is found.
684 * @since 5.0
685 */
686 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
687 throws BinaryBlobException {
688 // Experimentally derived from multiple source repositories
689 // the average number of bytes/line is 36. Its a rough guess
690 // to initially size our map close to the target.
691 IntList map = new IntList((end - ptr) / 36);
692 map.add(Integer.MIN_VALUE);
693 byte last = '\n'; // Must be \n to add the initial ptr
694 for (; ptr < end; ptr++) {
695 if (last == '\n') {
696 map.add(ptr);
697 }
698 byte curr = buf[ptr];
699 if (RawText.isBinary(curr, last)) {
700 throw new BinaryBlobException();
701 }
702 last = curr;
703 }
704 if (last == '\r') {
705 // Counts as binary
706 throw new BinaryBlobException();
707 }
708 map.add(end);
709 return map;
710 }
711
712 /**
713 * Locate the "author " header line data.
714 *
715 * @param b
716 * buffer to scan.
717 * @param ptr
718 * position in buffer to start the scan at. Most callers should
719 * pass 0 to ensure the scan starts from the beginning of the
720 * commit buffer and does not accidentally look at message body.
721 * @return position just after the space in "author ", so the first
722 * character of the author's name. If no author header can be
723 * located -1 is returned.
724 */
725 public static final int author(byte[] b, int ptr) {
726 final int sz = b.length;
727 if (ptr == 0)
728 ptr += 46; // skip the "tree ..." line.
729 while (ptr < sz && b[ptr] == 'p')
730 ptr += 48; // skip this parent.
731 return match(b, ptr, author);
732 }
733
734 /**
735 * Locate the "committer " header line data.
736 *
737 * @param b
738 * buffer to scan.
739 * @param ptr
740 * position in buffer to start the scan at. Most callers should
741 * pass 0 to ensure the scan starts from the beginning of the
742 * commit buffer and does not accidentally look at message body.
743 * @return position just after the space in "committer ", so the first
744 * character of the committer's name. If no committer header can be
745 * located -1 is returned.
746 */
747 public static final int committer(byte[] b, int ptr) {
748 final int sz = b.length;
749 if (ptr == 0)
750 ptr += 46; // skip the "tree ..." line.
751 while (ptr < sz && b[ptr] == 'p')
752 ptr += 48; // skip this parent.
753 if (ptr < sz && b[ptr] == 'a')
754 ptr = nextLF(b, ptr);
755 return match(b, ptr, committer);
756 }
757
758 /**
759 * Locate the "tagger " header line data.
760 *
761 * @param b
762 * buffer to scan.
763 * @param ptr
764 * position in buffer to start the scan at. Most callers should
765 * pass 0 to ensure the scan starts from the beginning of the tag
766 * buffer and does not accidentally look at message body.
767 * @return position just after the space in "tagger ", so the first
768 * character of the tagger's name. If no tagger header can be
769 * located -1 is returned.
770 */
771 public static final int tagger(byte[] b, int ptr) {
772 final int sz = b.length;
773 if (ptr == 0)
774 ptr += 48; // skip the "object ..." line.
775 while (ptr < sz) {
776 if (b[ptr] == '\n')
777 return -1;
778 final int m = match(b, ptr, tagger);
779 if (m >= 0)
780 return m;
781 ptr = nextLF(b, ptr);
782 }
783 return -1;
784 }
785
786 /**
787 * Locate the "encoding " header line.
788 *
789 * @param b
790 * buffer to scan.
791 * @param ptr
792 * position in buffer to start the scan at. Most callers should
793 * pass 0 to ensure the scan starts from the beginning of the
794 * buffer and does not accidentally look at the message body.
795 * @return position just after the space in "encoding ", so the first
796 * character of the encoding's name. If no encoding header can be
797 * located -1 is returned (and UTF-8 should be assumed).
798 */
799 public static final int encoding(byte[] b, int ptr) {
800 final int sz = b.length;
801 while (ptr < sz) {
802 if (b[ptr] == '\n')
803 return -1;
804 if (b[ptr] == 'e')
805 break;
806 ptr = nextLF(b, ptr);
807 }
808 return match(b, ptr, encoding);
809 }
810
811 /**
812 * Parse the "encoding " header as a string.
813 * <p>
814 * Locates the "encoding " header (if present) and returns its value.
815 *
816 * @param b
817 * buffer to scan.
818 * @return the encoding header as specified in the commit; null if the
819 * header was not present and should be assumed.
820 * @since 4.2
821 */
822 @Nullable
823 public static String parseEncodingName(byte[] b) {
824 int enc = encoding(b, 0);
825 if (enc < 0) {
826 return null;
827 }
828 int lf = nextLF(b, enc);
829 return decode(UTF_8, b, enc, lf - 1);
830 }
831
832 /**
833 * Parse the "encoding " header into a character set reference.
834 * <p>
835 * Locates the "encoding " header (if present) by first calling
836 * {@link #encoding(byte[], int)} and then returns the proper character set
837 * to apply to this buffer to evaluate its contents as character data.
838 * <p>
839 * If no encoding header is present {@code UTF-8} is assumed.
840 *
841 * @param b
842 * buffer to scan.
843 * @return the Java character set representation. Never null.
844 * @throws IllegalCharsetNameException
845 * if the character set requested by the encoding header is
846 * malformed and unsupportable.
847 * @throws UnsupportedCharsetException
848 * if the JRE does not support the character set requested by
849 * the encoding header.
850 */
851 public static Charset parseEncoding(byte[] b) {
852 String enc = parseEncodingName(b);
853 if (enc == null) {
854 return UTF_8;
855 }
856
857 String name = enc.trim();
858 try {
859 return Charset.forName(name);
860 } catch (IllegalCharsetNameException
861 | UnsupportedCharsetException badName) {
862 Charset aliased = charsetForAlias(name);
863 if (aliased != null) {
864 return aliased;
865 }
866 throw badName;
867 }
868 }
869
870 /**
871 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
872 * <p>
873 * Leading spaces won't be trimmed from the string, i.e. will show up in the
874 * parsed name afterwards.
875 *
876 * @param in
877 * the string to parse a name from.
878 * @return the parsed identity or null in case the identity could not be
879 * parsed.
880 */
881 public static PersonIdent parsePersonIdent(String in) {
882 return parsePersonIdent(Constants.encode(in), 0);
883 }
884
/**
 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 * <p>
 * When passing in a value for <code>nameB</code> callers should use the
 * return value of {@link #author(byte[], int)} or
 * {@link #committer(byte[], int)}, as these methods provide the proper
 * position within the buffer.
 * <p>
 * The character set is taken from {@link #parseEncoding(byte[])}; if the
 * declared encoding is invalid or unsupported, UTF-8 is used instead. An
 * identity without a usable timestamp and timezone is still returned, with
 * both values set to 0.
 *
 * @param raw
 *            the buffer to parse character data from.
 * @param nameB
 *            first position of the identity information. This should be the
 *            first position after the space which delimits the header field
 *            name (e.g. "author" or "committer") from the rest of the
 *            identity line.
 * @return the parsed identity or null in case the identity could not be
 *         parsed.
 */
public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
	Charset cs;
	try {
		cs = parseEncoding(raw);
	} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
		// Assume UTF-8 for person identities, usually this is correct.
		// If not decode() will fall back to the ISO-8859-1 encoding.
		cs = UTF_8;
	}

	// emailB: just past the first '<' (or LF) at or after nameB.
	// emailE: just past the first '>' (or LF) at or after emailB.
	final int emailB = nextLF(raw, nameB, '<');
	final int emailE = nextLF(raw, emailB, '>');
	// Give up if the scan for '<' ran off the buffer, if the byte at emailB
	// is a LF, or if the scan for '>' reached the end of the buffer without
	// the byte before emailE being '>'.
	if (emailB >= raw.length || raw[emailB] == '\n' ||
			(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
		return null;

	// The name ends before the '<'; a single space directly in front of the
	// '<' is excluded as well, but only if it lies within the name region.
	final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
			emailB - 2 : emailB - 1;
	final String name = decode(cs, raw, nameB, nameEnd);
	final String email = decode(cs, raw, emailB, emailE - 1);

	// Start searching from end of line, as after first name-email pair,
	// another name-email pair may occur. We will ignore all kinds of
	// "junk" following the first email.
	//
	// We have to use (emailE - 1) for the case that raw[emailE - 1] is LF,
	// otherwise we would run too far. "-2" is necessary to position
	// before the LF in case of LF termination resp. the penultimate
	// character if there is no trailing LF.
	//
	// NOTE(review): lastIndexOfTrim is defined outside this view;
	// presumably it returns the index of the last ' ' at or before the given
	// position, ignoring trailing spaces - confirm against its definition.
	final int tzBegin = lastIndexOfTrim(raw, ' ',
			nextLF(raw, emailE - 1) - 2) + 1;
	if (tzBegin <= emailE) // No time/zone, still valid
		return new PersonIdent(name, email, 0, 0);

	// The timestamp is the token before the timezone, but it can never
	// start before the end of the email.
	final int whenBegin = Math.max(emailE,
			lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
	if (whenBegin >= tzBegin - 1) // No time/zone, still valid
		return new PersonIdent(name, email, 0, 0);

	// The parsed timestamp is multiplied by 1000 before it is passed on.
	final long when = parseLongBase10(raw, whenBegin, null);
	final int tz = parseTimeZoneOffset(raw, tzBegin);
	return new PersonIdent(name, email, when * 1000L, tz);
}
946
947 /**
948 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
949 * <p>
950 * When passing in a value for <code>nameB</code> callers should use the
951 * return value of {@link #author(byte[], int)} or
952 * {@link #committer(byte[], int)}, as these methods provide the proper
953 * position within the buffer.
954 *
955 * @param raw
956 * the buffer to parse character data from.
957 * @param nameB
958 * first position of the identity information. This should be the
959 * first position after the space which delimits the header field
960 * name (e.g. "author" or "committer") from the rest of the
961 * identity line.
962 * @return the parsed identity. Never null.
963 */
964 public static PersonIdent parsePersonIdentOnly(final byte[] raw,
965 final int nameB) {
966 int stop = nextLF(raw, nameB);
967 int emailB = nextLF(raw, nameB, '<');
968 int emailE = nextLF(raw, emailB, '>');
969 final String name;
970 final String email;
971 if (emailE < stop) {
972 email = decode(raw, emailB, emailE - 1);
973 } else {
974 email = "invalid"; //$NON-NLS-1$
975 }
976 if (emailB < stop)
977 name = decode(raw, nameB, emailB - 2);
978 else
979 name = decode(raw, nameB, stop);
980
981 final MutableInteger ptrout = new MutableInteger();
982 long when;
983 int tz;
984 if (emailE < stop) {
985 when = parseLongBase10(raw, emailE + 1, ptrout);
986 tz = parseTimeZoneOffset(raw, ptrout.value);
987 } else {
988 when = 0;
989 tz = 0;
990 }
991 return new PersonIdent(name, email, when * 1000L, tz);
992 }
993
994 /**
995 * Locate the end of a footer line key string.
996 * <p>
997 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
998 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
999 * the first ':'.
1000 * <p>
1001 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1002 * then this method returns -1.
1003 *
1004 * @param raw
1005 * buffer to scan.
1006 * @param ptr
1007 * first position within raw to consider as a footer line key.
1008 * @return position of the ':' which terminates the footer line key if this
1009 * is otherwise a valid footer line key; otherwise -1.
1010 */
1011 public static int endOfFooterLineKey(byte[] raw, int ptr) {
1012 try {
1013 for (;;) {
1014 final byte c = raw[ptr];
1015 if (footerLineKeyChars[c] == 0) {
1016 if (c == ':')
1017 return ptr;
1018 return -1;
1019 }
1020 ptr++;
1021 }
1022 } catch (ArrayIndexOutOfBoundsException e) {
1023 return -1;
1024 }
1025 }
1026
1027 /**
1028 * Decode a buffer under UTF-8, if possible.
1029 *
1030 * If the byte stream cannot be decoded that way, the platform default is tried
1031 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1032 *
1033 * @param buffer
1034 * buffer to pull raw bytes from.
1035 * @return a string representation of the range <code>[start,end)</code>,
1036 * after decoding the region through the specified character set.
1037 */
1038 public static String decode(byte[] buffer) {
1039 return decode(buffer, 0, buffer.length);
1040 }
1041
1042 /**
1043 * Decode a buffer under UTF-8, if possible.
1044 *
1045 * If the byte stream cannot be decoded that way, the platform default is
1046 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1047 *
1048 * @param buffer
1049 * buffer to pull raw bytes from.
1050 * @param start
1051 * start position in buffer
1052 * @param end
1053 * one position past the last location within the buffer to take
1054 * data from.
1055 * @return a string representation of the range <code>[start,end)</code>,
1056 * after decoding the region through the specified character set.
1057 */
1058 public static String decode(final byte[] buffer, final int start,
1059 final int end) {
1060 return decode(UTF_8, buffer, start, end);
1061 }
1062
1063 /**
1064 * Decode a buffer under the specified character set if possible.
1065 *
1066 * If the byte stream cannot be decoded that way, the platform default is tried
1067 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1068 *
1069 * @param cs
1070 * character set to use when decoding the buffer.
1071 * @param buffer
1072 * buffer to pull raw bytes from.
1073 * @return a string representation of the range <code>[start,end)</code>,
1074 * after decoding the region through the specified character set.
1075 */
1076 public static String decode(Charset cs, byte[] buffer) {
1077 return decode(cs, buffer, 0, buffer.length);
1078 }
1079
1080 /**
1081 * Decode a region of the buffer under the specified character set if possible.
1082 *
1083 * If the byte stream cannot be decoded that way, the platform default is tried
1084 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1085 *
1086 * @param cs
1087 * character set to use when decoding the buffer.
1088 * @param buffer
1089 * buffer to pull raw bytes from.
1090 * @param start
1091 * first position within the buffer to take data from.
1092 * @param end
1093 * one position past the last location within the buffer to take
1094 * data from.
1095 * @return a string representation of the range <code>[start,end)</code>,
1096 * after decoding the region through the specified character set.
1097 */
1098 public static String decode(final Charset cs, final byte[] buffer,
1099 final int start, final int end) {
1100 try {
1101 return decodeNoFallback(cs, buffer, start, end);
1102 } catch (CharacterCodingException e) {
1103 // Fall back to an ISO-8859-1 style encoding. At least all of
1104 // the bytes will be present in the output.
1105 //
1106 return extractBinaryString(buffer, start, end);
1107 }
1108 }
1109
1110 /**
1111 * Decode a region of the buffer under the specified character set if
1112 * possible.
1113 *
1114 * If the byte stream cannot be decoded that way, the platform default is
1115 * tried and if that too fails, an exception is thrown.
1116 *
1117 * @param cs
1118 * character set to use when decoding the buffer.
1119 * @param buffer
1120 * buffer to pull raw bytes from.
1121 * @param start
1122 * first position within the buffer to take data from.
1123 * @param end
1124 * one position past the last location within the buffer to take
1125 * data from.
1126 * @return a string representation of the range <code>[start,end)</code>,
1127 * after decoding the region through the specified character set.
1128 * @throws java.nio.charset.CharacterCodingException
1129 * the input is not in any of the tested character sets.
1130 */
1131 public static String decodeNoFallback(final Charset cs,
1132 final byte[] buffer, final int start, final int end)
1133 throws CharacterCodingException {
1134 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1135 b.mark();
1136
1137 // Try our built-in favorite. The assumption here is that
1138 // decoding will fail if the data is not actually encoded
1139 // using that encoder.
1140 try {
1141 return decode(b, UTF_8);
1142 } catch (CharacterCodingException e) {
1143 b.reset();
1144 }
1145
1146 if (!cs.equals(UTF_8)) {
1147 // Try the suggested encoding, it might be right since it was
1148 // provided by the caller.
1149 try {
1150 return decode(b, cs);
1151 } catch (CharacterCodingException e) {
1152 b.reset();
1153 }
1154 }
1155
1156 // Try the default character set. A small group of people
1157 // might actually use the same (or very similar) locale.
1158 Charset defcs = SystemReader.getInstance().getDefaultCharset();
1159 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1160 try {
1161 return decode(b, defcs);
1162 } catch (CharacterCodingException e) {
1163 b.reset();
1164 }
1165 }
1166
1167 throw new CharacterCodingException();
1168 }
1169
1170 /**
1171 * Decode a region of the buffer under the ISO-8859-1 encoding.
1172 *
1173 * Each byte is treated as a single character in the 8859-1 character
1174 * encoding, performing a raw binary->char conversion.
1175 *
1176 * @param buffer
1177 * buffer to pull raw bytes from.
1178 * @param start
1179 * first position within the buffer to take data from.
1180 * @param end
1181 * one position past the last location within the buffer to take
1182 * data from.
1183 * @return a string representation of the range <code>[start,end)</code>.
1184 */
1185 public static String extractBinaryString(final byte[] buffer,
1186 final int start, final int end) {
1187 final StringBuilder r = new StringBuilder(end - start);
1188 for (int i = start; i < end; i++)
1189 r.append((char) (buffer[i] & 0xff));
1190 return r.toString();
1191 }
1192
1193 private static String decode(ByteBuffer b, Charset charset)
1194 throws CharacterCodingException {
1195 final CharsetDecoder d = charset.newDecoder();
1196 d.onMalformedInput(CodingErrorAction.REPORT);
1197 d.onUnmappableCharacter(CodingErrorAction.REPORT);
1198 return d.decode(b).toString();
1199 }
1200
1201 /**
1202 * Locate the position of the commit message body.
1203 *
1204 * @param b
1205 * buffer to scan.
1206 * @param ptr
1207 * position in buffer to start the scan at. Most callers should
1208 * pass 0 to ensure the scan starts from the beginning of the
1209 * commit buffer.
1210 * @return position of the user's message buffer.
1211 */
1212 public static final int commitMessage(byte[] b, int ptr) {
1213 final int sz = b.length;
1214 if (ptr == 0)
1215 ptr += 46; // skip the "tree ..." line.
1216 while (ptr < sz && b[ptr] == 'p')
1217 ptr += 48; // skip this parent.
1218
1219 // Skip any remaining header lines, ignoring what their actual
1220 // header line type is. This is identical to the logic for a tag.
1221 //
1222 return tagMessage(b, ptr);
1223 }
1224
1225 /**
1226 * Locate the position of the tag message body.
1227 *
1228 * @param b
1229 * buffer to scan.
1230 * @param ptr
1231 * position in buffer to start the scan at. Most callers should
1232 * pass 0 to ensure the scan starts from the beginning of the tag
1233 * buffer.
1234 * @return position of the user's message buffer.
1235 */
1236 public static final int tagMessage(byte[] b, int ptr) {
1237 final int sz = b.length;
1238 if (ptr == 0)
1239 ptr += 48; // skip the "object ..." line.
1240 while (ptr < sz && b[ptr] != '\n')
1241 ptr = nextLF(b, ptr);
1242 if (ptr < sz && b[ptr] == '\n')
1243 return ptr + 1;
1244 return -1;
1245 }
1246
1247 /**
1248 * Locate the end of a paragraph.
1249 * <p>
1250 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1251 *
1252 * @param b
1253 * buffer to scan.
1254 * @param start
1255 * position in buffer to start the scan at. Most callers will
1256 * want to pass the first position of the commit message (as
1257 * found by {@link #commitMessage(byte[], int)}.
1258 * @return position of the LF at the end of the paragraph;
1259 * <code>b.length</code> if no paragraph end could be located.
1260 */
1261 public static final int endOfParagraph(byte[] b, int start) {
1262 int ptr = start;
1263 final int sz = b.length;
1264 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1265 ptr = nextLF(b, ptr);
1266 if (ptr > start && b[ptr - 1] == '\n')
1267 ptr--;
1268 if (ptr > start && b[ptr - 1] == '\r')
1269 ptr--;
1270 return ptr;
1271 }
1272
1273 /**
1274 * Get last index of {@code ch} in raw, trimming spaces.
1275 *
1276 * @param raw
1277 * buffer to scan.
1278 * @param ch
1279 * character to find.
1280 * @param pos
1281 * starting position.
1282 * @return last index of {@code ch} in raw, trimming spaces.
1283 * @since 4.1
1284 */
1285 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1286 while (pos >= 0 && raw[pos] == ' ')
1287 pos--;
1288
1289 while (pos >= 0 && raw[pos] != ch)
1290 pos--;
1291
1292 return pos;
1293 }
1294
1295 private static Charset charsetForAlias(String name) {
1296 return encodingAliases.get(StringUtils.toLowerCase(name));
1297 }
1298
private RawParseUtils() {
	// Static-only utility class: the private constructor keeps other
	// classes from instantiating it.
}
1302 }