1 /*
2 * Copyright (C) 2008-2009, Google Inc.
3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4 * and other copyright owners as documented in the project's IP log.
5 *
6 * This program and the accompanying materials are made available
7 * under the terms of the Eclipse Distribution License v1.0 which
8 * accompanies this distribution, is reproduced below, and is
9 * available at http://www.eclipse.org/org/documents/edl-v10.php
10 *
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials provided
23 * with the distribution.
24 *
25 * - Neither the name of the Eclipse Foundation, Inc. nor the
26 * names of its contributors may be used to endorse or promote
27 * products derived from this software without specific prior
28 * written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 package org.eclipse.jgit.util;
46
47 import static java.nio.charset.StandardCharsets.ISO_8859_1;
48 import static java.nio.charset.StandardCharsets.UTF_8;
49 import static org.eclipse.jgit.lib.ObjectChecker.author;
50 import static org.eclipse.jgit.lib.ObjectChecker.committer;
51 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52 import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53
54 import java.nio.ByteBuffer;
55 import java.nio.charset.CharacterCodingException;
56 import java.nio.charset.Charset;
57 import java.nio.charset.CharsetDecoder;
58 import java.nio.charset.CodingErrorAction;
59 import java.nio.charset.IllegalCharsetNameException;
60 import java.nio.charset.UnsupportedCharsetException;
61 import java.util.Arrays;
62 import java.util.HashMap;
63 import java.util.Map;
64
65 import org.eclipse.jgit.annotations.Nullable;
66 import org.eclipse.jgit.errors.BinaryBlobException;
67 import org.eclipse.jgit.lib.Constants;
68 import org.eclipse.jgit.lib.PersonIdent;
69
70 /**
71 * Handy utility functions to parse raw object contents.
72 */
73 public final class RawParseUtils {
74 /**
75 * UTF-8 charset constant.
76 *
77 * @since 2.2
78 */
79 public static final Charset UTF8_CHARSET = UTF_8;
80
81 private static final byte[] digits10;
82
83 private static final byte[] digits16;
84
85 private static final byte[] footerLineKeyChars;
86
87 private static final Map<String, Charset> encodingAliases;
88
89 static {
90 encodingAliases = new HashMap<>();
91 encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
92 encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
93
94 digits10 = new byte['9' + 1];
95 Arrays.fill(digits10, (byte) -1);
96 for (char i = '0'; i <= '9'; i++)
97 digits10[i] = (byte) (i - '0');
98
99 digits16 = new byte['f' + 1];
100 Arrays.fill(digits16, (byte) -1);
101 for (char i = '0'; i <= '9'; i++)
102 digits16[i] = (byte) (i - '0');
103 for (char i = 'a'; i <= 'f'; i++)
104 digits16[i] = (byte) ((i - 'a') + 10);
105 for (char i = 'A'; i <= 'F'; i++)
106 digits16[i] = (byte) ((i - 'A') + 10);
107
108 footerLineKeyChars = new byte['z' + 1];
109 footerLineKeyChars['-'] = 1;
110 for (char i = '0'; i <= '9'; i++)
111 footerLineKeyChars[i] = 1;
112 for (char i = 'A'; i <= 'Z'; i++)
113 footerLineKeyChars[i] = 1;
114 for (char i = 'a'; i <= 'z'; i++)
115 footerLineKeyChars[i] = 1;
116 }
117
118 /**
119 * Determine if b[ptr] matches src.
120 *
121 * @param b
122 * the buffer to scan.
123 * @param ptr
124 * first position within b, this should match src[0].
125 * @param src
126 * the buffer to test for equality with b.
127 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
128 */
129 public static final int match(byte[] b, int ptr, byte[] src) {
130 if (ptr + src.length > b.length)
131 return -1;
132 for (int i = 0; i < src.length; i++, ptr++)
133 if (b[ptr] != src[i])
134 return -1;
135 return ptr;
136 }
137
138 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
139 '6', '7', '8', '9' };
140
141 /**
142 * Format a base 10 numeric into a temporary buffer.
143 * <p>
144 * Formatting is performed backwards. The method starts at offset
145 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
146 * <code>digits</code> is the number of positions necessary to store the
147 * base 10 value.
148 * <p>
149 * The argument and return values from this method make it easy to chain
150 * writing, for example:
151 * </p>
152 *
153 * <pre>
154 * final byte[] tmp = new byte[64];
155 * int ptr = tmp.length;
156 * tmp[--ptr] = '\n';
157 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
158 * tmp[--ptr] = ' ';
159 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
160 * tmp[--ptr] = 0;
161 * final String str = new String(tmp, ptr, tmp.length - ptr);
162 * </pre>
163 *
164 * @param b
165 * buffer to write into.
166 * @param o
167 * one offset past the location where writing will begin; writing
168 * proceeds towards lower index values.
169 * @param value
170 * the value to store.
171 * @return the new offset value <code>o</code>. This is the position of
172 * the last byte written. Additional writing should start at one
173 * position earlier.
174 */
175 public static int formatBase10(final byte[] b, int o, int value) {
176 if (value == 0) {
177 b[--o] = '0';
178 return o;
179 }
180 final boolean isneg = value < 0;
181 if (isneg)
182 value = -value;
183 while (value != 0) {
184 b[--o] = base10byte[value % 10];
185 value /= 10;
186 }
187 if (isneg)
188 b[--o] = '-';
189 return o;
190 }
191
192 /**
193 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
194 * <p>
195 * Digit sequences can begin with an optional run of spaces before the
196 * sequence, and may start with a '+' or a '-' to indicate sign position.
197 * Any other characters will cause the method to stop and return the current
198 * result to the caller.
199 *
200 * @param b
201 * buffer to scan.
202 * @param ptr
203 * position within buffer to start parsing digits at.
204 * @param ptrResult
205 * optional location to return the new ptr value through. If null
206 * the ptr value will be discarded.
207 * @return the value at this location; 0 if the location is not a valid
208 * numeric.
209 */
210 public static final int parseBase10(final byte[] b, int ptr,
211 final MutableInteger ptrResult) {
212 int r = 0;
213 int sign = 0;
214 try {
215 final int sz = b.length;
216 while (ptr < sz && b[ptr] == ' ')
217 ptr++;
218 if (ptr >= sz)
219 return 0;
220
221 switch (b[ptr]) {
222 case '-':
223 sign = -1;
224 ptr++;
225 break;
226 case '+':
227 ptr++;
228 break;
229 }
230
231 while (ptr < sz) {
232 final byte v = digits10[b[ptr]];
233 if (v < 0)
234 break;
235 r = (r * 10) + v;
236 ptr++;
237 }
238 } catch (ArrayIndexOutOfBoundsException e) {
239 // Not a valid digit.
240 }
241 if (ptrResult != null)
242 ptrResult.value = ptr;
243 return sign < 0 ? -r : r;
244 }
245
246 /**
247 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
248 * <p>
249 * Digit sequences can begin with an optional run of spaces before the
250 * sequence, and may start with a '+' or a '-' to indicate sign position.
251 * Any other characters will cause the method to stop and return the current
252 * result to the caller.
253 *
254 * @param b
255 * buffer to scan.
256 * @param ptr
257 * position within buffer to start parsing digits at.
258 * @param ptrResult
259 * optional location to return the new ptr value through. If null
260 * the ptr value will be discarded.
261 * @return the value at this location; 0 if the location is not a valid
262 * numeric.
263 */
264 public static final long parseLongBase10(final byte[] b, int ptr,
265 final MutableInteger ptrResult) {
266 long r = 0;
267 int sign = 0;
268 try {
269 final int sz = b.length;
270 while (ptr < sz && b[ptr] == ' ')
271 ptr++;
272 if (ptr >= sz)
273 return 0;
274
275 switch (b[ptr]) {
276 case '-':
277 sign = -1;
278 ptr++;
279 break;
280 case '+':
281 ptr++;
282 break;
283 }
284
285 while (ptr < sz) {
286 final byte v = digits10[b[ptr]];
287 if (v < 0)
288 break;
289 r = (r * 10) + v;
290 ptr++;
291 }
292 } catch (ArrayIndexOutOfBoundsException e) {
293 // Not a valid digit.
294 }
295 if (ptrResult != null)
296 ptrResult.value = ptr;
297 return sign < 0 ? -r : r;
298 }
299
300 /**
301 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
302 * <p>
303 * The number is read in network byte order, that is, most significant
304 * nybble first.
305 *
306 * @param bs
307 * buffer to parse digits from; positions {@code [p, p+4)} will
308 * be parsed.
309 * @param p
310 * first position within the buffer to parse.
311 * @return the integer value.
312 * @throws java.lang.ArrayIndexOutOfBoundsException
313 * if the string is not hex formatted.
314 */
315 public static final int parseHexInt16(final byte[] bs, final int p) {
316 int r = digits16[bs[p]] << 4;
317
318 r |= digits16[bs[p + 1]];
319 r <<= 4;
320
321 r |= digits16[bs[p + 2]];
322 r <<= 4;
323
324 r |= digits16[bs[p + 3]];
325 if (r < 0)
326 throw new ArrayIndexOutOfBoundsException();
327 return r;
328 }
329
330 /**
331 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
332 * <p>
333 * The number is read in network byte order, that is, most significant
334 * nybble first.
335 *
336 * @param bs
337 * buffer to parse digits from; positions {@code [p, p+8)} will
338 * be parsed.
339 * @param p
340 * first position within the buffer to parse.
341 * @return the integer value.
342 * @throws java.lang.ArrayIndexOutOfBoundsException
343 * if the string is not hex formatted.
344 */
345 public static final int parseHexInt32(final byte[] bs, final int p) {
346 int r = digits16[bs[p]] << 4;
347
348 r |= digits16[bs[p + 1]];
349 r <<= 4;
350
351 r |= digits16[bs[p + 2]];
352 r <<= 4;
353
354 r |= digits16[bs[p + 3]];
355 r <<= 4;
356
357 r |= digits16[bs[p + 4]];
358 r <<= 4;
359
360 r |= digits16[bs[p + 5]];
361 r <<= 4;
362
363 r |= digits16[bs[p + 6]];
364
365 final int last = digits16[bs[p + 7]];
366 if (r < 0 || last < 0)
367 throw new ArrayIndexOutOfBoundsException();
368 return (r << 4) | last;
369 }
370
371 /**
372 * Parse 16 character base 16 (hex) formatted string to unsigned long.
373 * <p>
374 * The number is read in network byte order, that is, most significant
375 * nibble first.
376 *
377 * @param bs
378 * buffer to parse digits from; positions {@code [p, p+16)} will
379 * be parsed.
380 * @param p
381 * first position within the buffer to parse.
382 * @return the integer value.
383 * @throws java.lang.ArrayIndexOutOfBoundsException
384 * if the string is not hex formatted.
385 * @since 4.3
386 */
387 public static final long parseHexInt64(final byte[] bs, final int p) {
388 long r = digits16[bs[p]] << 4;
389
390 r |= digits16[bs[p + 1]];
391 r <<= 4;
392
393 r |= digits16[bs[p + 2]];
394 r <<= 4;
395
396 r |= digits16[bs[p + 3]];
397 r <<= 4;
398
399 r |= digits16[bs[p + 4]];
400 r <<= 4;
401
402 r |= digits16[bs[p + 5]];
403 r <<= 4;
404
405 r |= digits16[bs[p + 6]];
406 r <<= 4;
407
408 r |= digits16[bs[p + 7]];
409 r <<= 4;
410
411 r |= digits16[bs[p + 8]];
412 r <<= 4;
413
414 r |= digits16[bs[p + 9]];
415 r <<= 4;
416
417 r |= digits16[bs[p + 10]];
418 r <<= 4;
419
420 r |= digits16[bs[p + 11]];
421 r <<= 4;
422
423 r |= digits16[bs[p + 12]];
424 r <<= 4;
425
426 r |= digits16[bs[p + 13]];
427 r <<= 4;
428
429 r |= digits16[bs[p + 14]];
430
431 final int last = digits16[bs[p + 15]];
432 if (r < 0 || last < 0)
433 throw new ArrayIndexOutOfBoundsException();
434 return (r << 4) | last;
435 }
436
437 /**
438 * Parse a single hex digit to its numeric value (0-15).
439 *
440 * @param digit
441 * hex character to parse.
442 * @return numeric value, in the range 0-15.
443 * @throws java.lang.ArrayIndexOutOfBoundsException
444 * if the input digit is not a valid hex digit.
445 */
446 public static final int parseHexInt4(final byte digit) {
447 final byte r = digits16[digit];
448 if (r < 0)
449 throw new ArrayIndexOutOfBoundsException();
450 return r;
451 }
452
453 /**
454 * Parse a Git style timezone string.
455 * <p>
456 * The sequence "-0315" will be parsed as the numeric value -195, as the
457 * lower two positions count minutes, not 100ths of an hour.
458 *
459 * @param b
460 * buffer to scan.
461 * @param ptr
462 * position within buffer to start parsing digits at.
463 * @return the timezone at this location, expressed in minutes.
464 */
465 public static final int parseTimeZoneOffset(byte[] b, int ptr) {
466 return parseTimeZoneOffset(b, ptr, null);
467 }
468
469 /**
470 * Parse a Git style timezone string.
471 * <p>
472 * The sequence "-0315" will be parsed as the numeric value -195, as the
473 * lower two positions count minutes, not 100ths of an hour.
474 *
475 * @param b
476 * buffer to scan.
477 * @param ptr
478 * position within buffer to start parsing digits at.
479 * @param ptrResult
480 * optional location to return the new ptr value through. If null
481 * the ptr value will be discarded.
482 * @return the timezone at this location, expressed in minutes.
483 * @since 4.1
484 */
485 public static final int parseTimeZoneOffset(final byte[] b, int ptr,
486 MutableInteger ptrResult) {
487 final int v = parseBase10(b, ptr, ptrResult);
488 final int tzMins = v % 100;
489 final int tzHours = v / 100;
490 return tzHours * 60 + tzMins;
491 }
492
493 /**
494 * Locate the first position after a given character.
495 *
496 * @param b
497 * buffer to scan.
498 * @param ptr
499 * position within buffer to start looking for chrA at.
500 * @param chrA
501 * character to find.
502 * @return new position just after chrA.
503 */
504 public static final int next(byte[] b, int ptr, char chrA) {
505 final int sz = b.length;
506 while (ptr < sz) {
507 if (b[ptr++] == chrA)
508 return ptr;
509 }
510 return ptr;
511 }
512
513 /**
514 * Locate the first position after the next LF.
515 * <p>
516 * This method stops on the first '\n' it finds.
517 *
518 * @param b
519 * buffer to scan.
520 * @param ptr
521 * position within buffer to start looking for LF at.
522 * @return new position just after the first LF found.
523 */
524 public static final int nextLF(byte[] b, int ptr) {
525 return next(b, ptr, '\n');
526 }
527
528 /**
529 * Locate the first position after either the given character or LF.
530 * <p>
531 * This method stops on the first match it finds from either chrA or '\n'.
532 *
533 * @param b
534 * buffer to scan.
535 * @param ptr
536 * position within buffer to start looking for chrA or LF at.
537 * @param chrA
538 * character to find.
539 * @return new position just after the first chrA or LF to be found.
540 */
541 public static final int nextLF(byte[] b, int ptr, char chrA) {
542 final int sz = b.length;
543 while (ptr < sz) {
544 final byte c = b[ptr++];
545 if (c == chrA || c == '\n')
546 return ptr;
547 }
548 return ptr;
549 }
550
551 /**
552 * Locate the end of the header. Note that headers may be
553 * more than one line long.
554 * @param b
555 * buffer to scan.
556 * @param ptr
557 * position within buffer to start looking for the end-of-header.
558 * @return new position just after the header. This is either
559 * b.length, or the index of the header's terminating newline.
560 * @since 5.1
561 */
562 public static final int headerEnd(final byte[] b, int ptr) {
563 final int sz = b.length;
564 while (ptr < sz) {
565 final byte c = b[ptr++];
566 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
567 return ptr - 1;
568 }
569 }
570 return ptr - 1;
571 }
572
573 /**
574 * Find the start of the contents of a given header.
575 *
576 * @param b
577 * buffer to scan.
578 * @param headerName
579 * header to search for
580 * @param ptr
581 * position within buffer to start looking for header at.
582 * @return new position at the start of the header's contents, -1 for
583 * not found
584 * @since 5.1
585 */
586 public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
587 // Start by advancing to just past a LF or buffer start
588 if (ptr != 0) {
589 ptr = nextLF(b, ptr - 1);
590 }
591 while (ptr < b.length - (headerName.length + 1)) {
592 boolean found = true;
593 for (int i = 0; i < headerName.length; i++) {
594 if (headerName[i] != b[ptr++]) {
595 found = false;
596 break;
597 }
598 }
599 if (found && b[ptr++] == ' ') {
600 return ptr;
601 }
602 ptr = nextLF(b, ptr);
603 }
604 return -1;
605 }
606
607 /**
608 * Locate the first position before a given character.
609 *
610 * @param b
611 * buffer to scan.
612 * @param ptr
613 * position within buffer to start looking for chrA at.
614 * @param chrA
615 * character to find.
616 * @return new position just before chrA, -1 for not found
617 */
618 public static final int prev(byte[] b, int ptr, char chrA) {
619 if (ptr == b.length)
620 --ptr;
621 while (ptr >= 0) {
622 if (b[ptr--] == chrA)
623 return ptr;
624 }
625 return ptr;
626 }
627
628 /**
629 * Locate the first position before the previous LF.
630 * <p>
631 * This method stops on the first '\n' it finds.
632 *
633 * @param b
634 * buffer to scan.
635 * @param ptr
636 * position within buffer to start looking for LF at.
637 * @return new position just before the first LF found, -1 for not found
638 */
639 public static final int prevLF(byte[] b, int ptr) {
640 return prev(b, ptr, '\n');
641 }
642
643 /**
644 * Locate the previous position before either the given character or LF.
645 * <p>
646 * This method stops on the first match it finds from either chrA or '\n'.
647 *
648 * @param b
649 * buffer to scan.
650 * @param ptr
651 * position within buffer to start looking for chrA or LF at.
652 * @param chrA
653 * character to find.
654 * @return new position just before the first chrA or LF to be found, -1 for
655 * not found
656 */
657 public static final int prevLF(byte[] b, int ptr, char chrA) {
658 if (ptr == b.length)
659 --ptr;
660 while (ptr >= 0) {
661 final byte c = b[ptr--];
662 if (c == chrA || c == '\n')
663 return ptr;
664 }
665 return ptr;
666 }
667
668 /**
669 * Index the region between <code>[ptr, end)</code> to find line starts.
670 * <p>
671 * The returned list is 1 indexed. Index 0 contains
672 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
673 * <p>
674 * Using a 1 indexed list means that line numbers can be directly accessed
675 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
676 * <code>ptr</code>.
677 * <p>
678 * The last element (index <code>map.size()-1</code>) always contains
679 * <code>end</code>.
680 * <p>
681 * If the data contains a '\0' anywhere, the whole region is considered
682 * binary and a LineMap corresponding to a single line is returned.
683 * </p>
684 *
685 * @param buf
686 * buffer to scan.
687 * @param ptr
688 * position within the buffer corresponding to the first byte of
689 * line 1.
690 * @param end
691 * 1 past the end of the content within <code>buf</code>.
692 * @return a line map indicating the starting position of each line, or a
693 * map representing the entire buffer as a single line if
694 * <code>buf</code> contains a NUL byte.
695 */
696 public static final IntList lineMap(byte[] buf, int ptr, int end) {
697 IntList map = lineMapOrNull(buf, ptr, end);
698 if (map == null) {
699 map = new IntList(3);
700 map.add(Integer.MIN_VALUE);
701 map.add(ptr);
702 map.add(end);
703 }
704 return map;
705 }
706
707 /**
708 * Like {@link #lineMap(byte[], int, int)} but throw
709 * {@link BinaryBlobException} if a NUL byte is encountered.
710 *
711 * @param buf
712 * buffer to scan.
713 * @param ptr
714 * position within the buffer corresponding to the first byte of
715 * line 1.
716 * @param end
717 * 1 past the end of the content within <code>buf</code>.
718 * @return a line map indicating the starting position of each line.
719 * @throws BinaryBlobException
720 * if a NUL byte is found.
721 * @since 5.0
722 */
723 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
724 throws BinaryBlobException {
725 IntList map = lineMapOrNull(buf, ptr, end);
726 if (map == null) {
727 throw new BinaryBlobException();
728 }
729 return map;
730 }
731
732 private static @Nullable IntList lineMapOrNull(byte[] buf, int ptr, int end) {
733 // Experimentally derived from multiple source repositories
734 // the average number of bytes/line is 36. Its a rough guess
735 // to initially size our map close to the target.
736 IntList map = new IntList((end - ptr) / 36);
737 map.add(Integer.MIN_VALUE);
738 boolean foundLF = true;
739 for (; ptr < end; ptr++) {
740 if (foundLF) {
741 map.add(ptr);
742 }
743
744 if (buf[ptr] == '\0') {
745 return null;
746 }
747
748 foundLF = (buf[ptr] == '\n');
749 }
750 map.add(end);
751 return map;
752 }
753
754 /**
755 * Locate the "author " header line data.
756 *
757 * @param b
758 * buffer to scan.
759 * @param ptr
760 * position in buffer to start the scan at. Most callers should
761 * pass 0 to ensure the scan starts from the beginning of the
762 * commit buffer and does not accidentally look at message body.
763 * @return position just after the space in "author ", so the first
764 * character of the author's name. If no author header can be
765 * located -1 is returned.
766 */
767 public static final int author(byte[] b, int ptr) {
768 final int sz = b.length;
769 if (ptr == 0)
770 ptr += 46; // skip the "tree ..." line.
771 while (ptr < sz && b[ptr] == 'p')
772 ptr += 48; // skip this parent.
773 return match(b, ptr, author);
774 }
775
776 /**
777 * Locate the "committer " header line data.
778 *
779 * @param b
780 * buffer to scan.
781 * @param ptr
782 * position in buffer to start the scan at. Most callers should
783 * pass 0 to ensure the scan starts from the beginning of the
784 * commit buffer and does not accidentally look at message body.
785 * @return position just after the space in "committer ", so the first
786 * character of the committer's name. If no committer header can be
787 * located -1 is returned.
788 */
789 public static final int committer(byte[] b, int ptr) {
790 final int sz = b.length;
791 if (ptr == 0)
792 ptr += 46; // skip the "tree ..." line.
793 while (ptr < sz && b[ptr] == 'p')
794 ptr += 48; // skip this parent.
795 if (ptr < sz && b[ptr] == 'a')
796 ptr = nextLF(b, ptr);
797 return match(b, ptr, committer);
798 }
799
800 /**
801 * Locate the "tagger " header line data.
802 *
803 * @param b
804 * buffer to scan.
805 * @param ptr
806 * position in buffer to start the scan at. Most callers should
807 * pass 0 to ensure the scan starts from the beginning of the tag
808 * buffer and does not accidentally look at message body.
809 * @return position just after the space in "tagger ", so the first
810 * character of the tagger's name. If no tagger header can be
811 * located -1 is returned.
812 */
813 public static final int tagger(byte[] b, int ptr) {
814 final int sz = b.length;
815 if (ptr == 0)
816 ptr += 48; // skip the "object ..." line.
817 while (ptr < sz) {
818 if (b[ptr] == '\n')
819 return -1;
820 final int m = match(b, ptr, tagger);
821 if (m >= 0)
822 return m;
823 ptr = nextLF(b, ptr);
824 }
825 return -1;
826 }
827
828 /**
829 * Locate the "encoding " header line.
830 *
831 * @param b
832 * buffer to scan.
833 * @param ptr
834 * position in buffer to start the scan at. Most callers should
835 * pass 0 to ensure the scan starts from the beginning of the
836 * buffer and does not accidentally look at the message body.
837 * @return position just after the space in "encoding ", so the first
838 * character of the encoding's name. If no encoding header can be
839 * located -1 is returned (and UTF-8 should be assumed).
840 */
841 public static final int encoding(byte[] b, int ptr) {
842 final int sz = b.length;
843 while (ptr < sz) {
844 if (b[ptr] == '\n')
845 return -1;
846 if (b[ptr] == 'e')
847 break;
848 ptr = nextLF(b, ptr);
849 }
850 return match(b, ptr, encoding);
851 }
852
853 /**
854 * Parse the "encoding " header as a string.
855 * <p>
856 * Locates the "encoding " header (if present) and returns its value.
857 *
858 * @param b
859 * buffer to scan.
860 * @return the encoding header as specified in the commit; null if the
861 * header was not present and should be assumed.
862 * @since 4.2
863 */
864 @Nullable
865 public static String parseEncodingName(byte[] b) {
866 int enc = encoding(b, 0);
867 if (enc < 0) {
868 return null;
869 }
870 int lf = nextLF(b, enc);
871 return decode(UTF_8, b, enc, lf - 1);
872 }
873
874 /**
875 * Parse the "encoding " header into a character set reference.
876 * <p>
877 * Locates the "encoding " header (if present) by first calling
878 * {@link #encoding(byte[], int)} and then returns the proper character set
879 * to apply to this buffer to evaluate its contents as character data.
880 * <p>
881 * If no encoding header is present {@code UTF-8} is assumed.
882 *
883 * @param b
884 * buffer to scan.
885 * @return the Java character set representation. Never null.
886 * @throws IllegalCharsetNameException
887 * if the character set requested by the encoding header is
888 * malformed and unsupportable.
889 * @throws UnsupportedCharsetException
890 * if the JRE does not support the character set requested by
891 * the encoding header.
892 */
893 public static Charset parseEncoding(byte[] b) {
894 String enc = parseEncodingName(b);
895 if (enc == null) {
896 return UTF_8;
897 }
898
899 String name = enc.trim();
900 try {
901 return Charset.forName(name);
902 } catch (IllegalCharsetNameException
903 | UnsupportedCharsetException badName) {
904 Charset aliased = charsetForAlias(name);
905 if (aliased != null) {
906 return aliased;
907 }
908 throw badName;
909 }
910 }
911
912 /**
913 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
914 * <p>
915 * Leading spaces won't be trimmed from the string, i.e. will show up in the
916 * parsed name afterwards.
917 *
918 * @param in
919 * the string to parse a name from.
920 * @return the parsed identity or null in case the identity could not be
921 * parsed.
922 */
923 public static PersonIdent parsePersonIdent(String in) {
924 return parsePersonIdent(Constants.encode(in), 0);
925 }
926
/**
 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 * <p>
 * When passing in a value for <code>nameB</code> callers should use the
 * return value of {@link #author(byte[], int)} or
 * {@link #committer(byte[], int)}, as these methods provide the proper
 * position within the buffer.
 * <p>
 * The character set is taken from the buffer's "encoding " header via
 * {@link #parseEncoding(byte[])}; UTF-8 is used if that header names an
 * illegal or unsupported character set. If no time and time zone can be
 * located after the email address, both are reported as 0.
 *
 * @param raw
 *            the buffer to parse character data from.
 * @param nameB
 *            first position of the identity information. This should be the
 *            first position after the space which delimits the header field
 *            name (e.g. "author" or "committer") from the rest of the
 *            identity line.
 * @return the parsed identity or null in case the identity could not be
 *         parsed.
 */
public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
    Charset cs;
    try {
        cs = parseEncoding(raw);
    } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
        // Assume UTF-8 for person identities, usually this is correct.
        // If not decode() will fall back to the ISO-8859-1 encoding.
        cs = UTF_8;
    }

    // emailB: just past the first '<' (or LF) at or after nameB.
    // emailE: just past the first '>' (or LF) at or after emailB.
    final int emailB = nextLF(raw, nameB, '<');
    final int emailE = nextLF(raw, emailB, '>');
    // Give up if nothing follows the '<' scan, if the email would start on
    // a LF, or if the '>' scan ran to the end of the buffer without the
    // last byte it consumed being '>'.
    if (emailB >= raw.length || raw[emailB] == '\n' ||
            (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
        return null;

    // The name ends before the '<'; a single space directly in front of
    // the '<' is treated as a separator and excluded as well.
    final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
            emailB - 2 : emailB - 1;
    final String name = decode(cs, raw, nameB, nameEnd);
    final String email = decode(cs, raw, emailB, emailE - 1);

    // Start searching from end of line, as after first name-email pair,
    // another name-email pair may occur. We will ignore all kinds of
    // "junk" following the first email.
    //
    // We've to use (emailE - 1) for the case that raw[emailE] is LF,
    // otherwise we would run too far. "-2" is necessary to position
    // before the LF in case of LF termination resp. the penultimate
    // character if there is no trailing LF.
    final int tzBegin = lastIndexOfTrim(raw, ' ',
            nextLF(raw, emailE - 1) - 2) + 1;
    if (tzBegin <= emailE) // No time/zone, still valid
        return new PersonIdent(name, email, 0, 0);

    // The timestamp is the token before the time zone, but never starts
    // before the end of the email address.
    final int whenBegin = Math.max(emailE,
            lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
    if (whenBegin >= tzBegin - 1) // No time/zone, still valid
        return new PersonIdent(name, email, 0, 0);

    // The parsed timestamp is multiplied by 1000 before being handed to
    // PersonIdent; the zone offset is passed on in minutes.
    final long when = parseLongBase10(raw, whenBegin, null);
    final int tz = parseTimeZoneOffset(raw, tzBegin);
    return new PersonIdent(name, email, when * 1000L, tz);
}
988
989 /**
990 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
991 * <p>
992 * When passing in a value for <code>nameB</code> callers should use the
993 * return value of {@link #author(byte[], int)} or
994 * {@link #committer(byte[], int)}, as these methods provide the proper
995 * position within the buffer.
996 *
997 * @param raw
998 * the buffer to parse character data from.
999 * @param nameB
1000 * first position of the identity information. This should be the
1001 * first position after the space which delimits the header field
1002 * name (e.g. "author" or "committer") from the rest of the
1003 * identity line.
1004 * @return the parsed identity. Never null.
1005 */
1006 public static PersonIdent parsePersonIdentOnly(final byte[] raw,
1007 final int nameB) {
1008 int stop = nextLF(raw, nameB);
1009 int emailB = nextLF(raw, nameB, '<');
1010 int emailE = nextLF(raw, emailB, '>');
1011 final String name;
1012 final String email;
1013 if (emailE < stop) {
1014 email = decode(raw, emailB, emailE - 1);
1015 } else {
1016 email = "invalid"; //$NON-NLS-1$
1017 }
1018 if (emailB < stop)
1019 name = decode(raw, nameB, emailB - 2);
1020 else
1021 name = decode(raw, nameB, stop);
1022
1023 final MutableInteger ptrout = new MutableInteger();
1024 long when;
1025 int tz;
1026 if (emailE < stop) {
1027 when = parseLongBase10(raw, emailE + 1, ptrout);
1028 tz = parseTimeZoneOffset(raw, ptrout.value);
1029 } else {
1030 when = 0;
1031 tz = 0;
1032 }
1033 return new PersonIdent(name, email, when * 1000L, tz);
1034 }
1035
1036 /**
1037 * Locate the end of a footer line key string.
1038 * <p>
1039 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
1040 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
1041 * the first ':'.
1042 * <p>
1043 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1044 * then this method returns -1.
1045 *
1046 * @param raw
1047 * buffer to scan.
1048 * @param ptr
1049 * first position within raw to consider as a footer line key.
1050 * @return position of the ':' which terminates the footer line key if this
1051 * is otherwise a valid footer line key; otherwise -1.
1052 */
1053 public static int endOfFooterLineKey(byte[] raw, int ptr) {
1054 try {
1055 for (;;) {
1056 final byte c = raw[ptr];
1057 if (footerLineKeyChars[c] == 0) {
1058 if (c == ':')
1059 return ptr;
1060 return -1;
1061 }
1062 ptr++;
1063 }
1064 } catch (ArrayIndexOutOfBoundsException e) {
1065 return -1;
1066 }
1067 }
1068
1069 /**
1070 * Decode a buffer under UTF-8, if possible.
1071 *
1072 * If the byte stream cannot be decoded that way, the platform default is tried
1073 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1074 *
1075 * @param buffer
1076 * buffer to pull raw bytes from.
1077 * @return a string representation of the range <code>[start,end)</code>,
1078 * after decoding the region through the specified character set.
1079 */
1080 public static String decode(byte[] buffer) {
1081 return decode(buffer, 0, buffer.length);
1082 }
1083
1084 /**
1085 * Decode a buffer under UTF-8, if possible.
1086 *
1087 * If the byte stream cannot be decoded that way, the platform default is
1088 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1089 *
1090 * @param buffer
1091 * buffer to pull raw bytes from.
1092 * @param start
1093 * start position in buffer
1094 * @param end
1095 * one position past the last location within the buffer to take
1096 * data from.
1097 * @return a string representation of the range <code>[start,end)</code>,
1098 * after decoding the region through the specified character set.
1099 */
1100 public static String decode(final byte[] buffer, final int start,
1101 final int end) {
1102 return decode(UTF_8, buffer, start, end);
1103 }
1104
1105 /**
1106 * Decode a buffer under the specified character set if possible.
1107 *
1108 * If the byte stream cannot be decoded that way, the platform default is tried
1109 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1110 *
1111 * @param cs
1112 * character set to use when decoding the buffer.
1113 * @param buffer
1114 * buffer to pull raw bytes from.
1115 * @return a string representation of the range <code>[start,end)</code>,
1116 * after decoding the region through the specified character set.
1117 */
1118 public static String decode(Charset cs, byte[] buffer) {
1119 return decode(cs, buffer, 0, buffer.length);
1120 }
1121
1122 /**
1123 * Decode a region of the buffer under the specified character set if possible.
1124 *
1125 * If the byte stream cannot be decoded that way, the platform default is tried
1126 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1127 *
1128 * @param cs
1129 * character set to use when decoding the buffer.
1130 * @param buffer
1131 * buffer to pull raw bytes from.
1132 * @param start
1133 * first position within the buffer to take data from.
1134 * @param end
1135 * one position past the last location within the buffer to take
1136 * data from.
1137 * @return a string representation of the range <code>[start,end)</code>,
1138 * after decoding the region through the specified character set.
1139 */
1140 public static String decode(final Charset cs, final byte[] buffer,
1141 final int start, final int end) {
1142 try {
1143 return decodeNoFallback(cs, buffer, start, end);
1144 } catch (CharacterCodingException e) {
1145 // Fall back to an ISO-8859-1 style encoding. At least all of
1146 // the bytes will be present in the output.
1147 //
1148 return extractBinaryString(buffer, start, end);
1149 }
1150 }
1151
1152 /**
1153 * Decode a region of the buffer under the specified character set if
1154 * possible.
1155 *
1156 * If the byte stream cannot be decoded that way, the platform default is
1157 * tried and if that too fails, an exception is thrown.
1158 *
1159 * @param cs
1160 * character set to use when decoding the buffer.
1161 * @param buffer
1162 * buffer to pull raw bytes from.
1163 * @param start
1164 * first position within the buffer to take data from.
1165 * @param end
1166 * one position past the last location within the buffer to take
1167 * data from.
1168 * @return a string representation of the range <code>[start,end)</code>,
1169 * after decoding the region through the specified character set.
1170 * @throws java.nio.charset.CharacterCodingException
1171 * the input is not in any of the tested character sets.
1172 */
1173 public static String decodeNoFallback(final Charset cs,
1174 final byte[] buffer, final int start, final int end)
1175 throws CharacterCodingException {
1176 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1177 b.mark();
1178
1179 // Try our built-in favorite. The assumption here is that
1180 // decoding will fail if the data is not actually encoded
1181 // using that encoder.
1182 try {
1183 return decode(b, UTF_8);
1184 } catch (CharacterCodingException e) {
1185 b.reset();
1186 }
1187
1188 if (!cs.equals(UTF_8)) {
1189 // Try the suggested encoding, it might be right since it was
1190 // provided by the caller.
1191 try {
1192 return decode(b, cs);
1193 } catch (CharacterCodingException e) {
1194 b.reset();
1195 }
1196 }
1197
1198 // Try the default character set. A small group of people
1199 // might actually use the same (or very similar) locale.
1200 Charset defcs = Charset.defaultCharset();
1201 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1202 try {
1203 return decode(b, defcs);
1204 } catch (CharacterCodingException e) {
1205 b.reset();
1206 }
1207 }
1208
1209 throw new CharacterCodingException();
1210 }
1211
1212 /**
1213 * Decode a region of the buffer under the ISO-8859-1 encoding.
1214 *
1215 * Each byte is treated as a single character in the 8859-1 character
1216 * encoding, performing a raw binary->char conversion.
1217 *
1218 * @param buffer
1219 * buffer to pull raw bytes from.
1220 * @param start
1221 * first position within the buffer to take data from.
1222 * @param end
1223 * one position past the last location within the buffer to take
1224 * data from.
1225 * @return a string representation of the range <code>[start,end)</code>.
1226 */
1227 public static String extractBinaryString(final byte[] buffer,
1228 final int start, final int end) {
1229 final StringBuilder r = new StringBuilder(end - start);
1230 for (int i = start; i < end; i++)
1231 r.append((char) (buffer[i] & 0xff));
1232 return r.toString();
1233 }
1234
1235 private static String decode(ByteBuffer b, Charset charset)
1236 throws CharacterCodingException {
1237 final CharsetDecoder d = charset.newDecoder();
1238 d.onMalformedInput(CodingErrorAction.REPORT);
1239 d.onUnmappableCharacter(CodingErrorAction.REPORT);
1240 return d.decode(b).toString();
1241 }
1242
1243 /**
1244 * Locate the position of the commit message body.
1245 *
1246 * @param b
1247 * buffer to scan.
1248 * @param ptr
1249 * position in buffer to start the scan at. Most callers should
1250 * pass 0 to ensure the scan starts from the beginning of the
1251 * commit buffer.
1252 * @return position of the user's message buffer.
1253 */
1254 public static final int commitMessage(byte[] b, int ptr) {
1255 final int sz = b.length;
1256 if (ptr == 0)
1257 ptr += 46; // skip the "tree ..." line.
1258 while (ptr < sz && b[ptr] == 'p')
1259 ptr += 48; // skip this parent.
1260
1261 // Skip any remaining header lines, ignoring what their actual
1262 // header line type is. This is identical to the logic for a tag.
1263 //
1264 return tagMessage(b, ptr);
1265 }
1266
1267 /**
1268 * Locate the position of the tag message body.
1269 *
1270 * @param b
1271 * buffer to scan.
1272 * @param ptr
1273 * position in buffer to start the scan at. Most callers should
1274 * pass 0 to ensure the scan starts from the beginning of the tag
1275 * buffer.
1276 * @return position of the user's message buffer.
1277 */
1278 public static final int tagMessage(byte[] b, int ptr) {
1279 final int sz = b.length;
1280 if (ptr == 0)
1281 ptr += 48; // skip the "object ..." line.
1282 while (ptr < sz && b[ptr] != '\n')
1283 ptr = nextLF(b, ptr);
1284 if (ptr < sz && b[ptr] == '\n')
1285 return ptr + 1;
1286 return -1;
1287 }
1288
1289 /**
1290 * Locate the end of a paragraph.
1291 * <p>
1292 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1293 *
1294 * @param b
1295 * buffer to scan.
1296 * @param start
1297 * position in buffer to start the scan at. Most callers will
1298 * want to pass the first position of the commit message (as
1299 * found by {@link #commitMessage(byte[], int)}.
1300 * @return position of the LF at the end of the paragraph;
1301 * <code>b.length</code> if no paragraph end could be located.
1302 */
1303 public static final int endOfParagraph(byte[] b, int start) {
1304 int ptr = start;
1305 final int sz = b.length;
1306 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1307 ptr = nextLF(b, ptr);
1308 if (ptr > start && b[ptr - 1] == '\n')
1309 ptr--;
1310 if (ptr > start && b[ptr - 1] == '\r')
1311 ptr--;
1312 return ptr;
1313 }
1314
1315 /**
1316 * Get last index of {@code ch} in raw, trimming spaces.
1317 *
1318 * @param raw
1319 * buffer to scan.
1320 * @param ch
1321 * character to find.
1322 * @param pos
1323 * starting position.
1324 * @return last index of {@code ch} in raw, trimming spaces.
1325 * @since 4.1
1326 */
1327 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1328 while (pos >= 0 && raw[pos] == ' ')
1329 pos--;
1330
1331 while (pos >= 0 && raw[pos] != ch)
1332 pos--;
1333
1334 return pos;
1335 }
1336
1337 private static Charset charsetForAlias(String name) {
1338 return encodingAliases.get(StringUtils.toLowerCase(name));
1339 }
1340
/** Not instantiable: this class only offers static utility methods. */
private RawParseUtils() {
	// Intentionally empty.
}
1344 }