1 /*
2 * Copyright (C) 2008-2009, Google Inc.
3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4 * and other copyright owners as documented in the project's IP log.
5 *
6 * This program and the accompanying materials are made available
7 * under the terms of the Eclipse Distribution License v1.0 which
8 * accompanies this distribution, is reproduced below, and is
9 * available at http://www.eclipse.org/org/documents/edl-v10.php
10 *
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials provided
23 * with the distribution.
24 *
25 * - Neither the name of the Eclipse Foundation, Inc. nor the
26 * names of its contributors may be used to endorse or promote
27 * products derived from this software without specific prior
28 * written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 package org.eclipse.jgit.util;
46
47 import static java.nio.charset.StandardCharsets.ISO_8859_1;
48 import static java.nio.charset.StandardCharsets.UTF_8;
49 import static org.eclipse.jgit.lib.ObjectChecker.author;
50 import static org.eclipse.jgit.lib.ObjectChecker.committer;
51 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52 import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53
54 import java.nio.ByteBuffer;
55 import java.nio.charset.CharacterCodingException;
56 import java.nio.charset.Charset;
57 import java.nio.charset.CharsetDecoder;
58 import java.nio.charset.CodingErrorAction;
59 import java.nio.charset.IllegalCharsetNameException;
60 import java.nio.charset.UnsupportedCharsetException;
61 import java.util.Arrays;
62 import java.util.HashMap;
63 import java.util.Map;
64
65 import org.eclipse.jgit.annotations.Nullable;
66 import org.eclipse.jgit.lib.Constants;
67 import org.eclipse.jgit.lib.PersonIdent;
68
69 /**
70 * Handy utility functions to parse raw object contents.
71 */
72 public final class RawParseUtils {
73 /**
74 * UTF-8 charset constant.
75 *
76 * @since 2.2
77 */
78 public static final Charset UTF8_CHARSET = UTF_8;
79
// ASCII code -> decimal digit value; -1 marks a code that is not '0'..'9'.
private static final byte[] digits10;

// ASCII code -> hex digit value (both cases accepted); -1 marks a non-hex code.
private static final byte[] digits16;

// ASCII code -> 1 when the code may appear in a footer line key, else 0.
private static final byte[] footerLineKeyChars;

// Encoding names seen in headers that map onto a known Charset.
private static final Map<String, Charset> encodingAliases;

static {
	final Map<String, Charset> aliases = new HashMap<>();
	aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
	aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
	encodingAliases = aliases;

	// Tables are only as long as the highest code they need to describe.
	digits10 = new byte['9' + 1];
	Arrays.fill(digits10, (byte) -1);
	for (int d = 0; d < 10; d++)
		digits10['0' + d] = (byte) d;

	digits16 = new byte['f' + 1];
	Arrays.fill(digits16, (byte) -1);
	for (int d = 0; d < 10; d++)
		digits16['0' + d] = (byte) d;
	for (int d = 10; d < 16; d++) {
		digits16['a' + d - 10] = (byte) d;
		digits16['A' + d - 10] = (byte) d;
	}

	// Key characters are '-', digits and ASCII letters; all else stays 0.
	footerLineKeyChars = new byte['z' + 1];
	footerLineKeyChars['-'] = 1;
	for (int c = '0'; c <= 'z'; c++) {
		final boolean digit = c >= '0' && c <= '9';
		final boolean upper = c >= 'A' && c <= 'Z';
		final boolean lower = c >= 'a' && c <= 'z';
		if (digit || upper || lower)
			footerLineKeyChars[c] = 1;
	}
}
116
117 /**
118 * Determine if b[ptr] matches src.
119 *
120 * @param b
121 * the buffer to scan.
122 * @param ptr
123 * first position within b, this should match src[0].
124 * @param src
125 * the buffer to test for equality with b.
126 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
127 */
128 public static final int match(final byte[] b, int ptr, final byte[] src) {
129 if (ptr + src.length > b.length)
130 return -1;
131 for (int i = 0; i < src.length; i++, ptr++)
132 if (b[ptr] != src[i])
133 return -1;
134 return ptr;
135 }
136
137 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
138 '6', '7', '8', '9' };
139
140 /**
141 * Format a base 10 numeric into a temporary buffer.
142 * <p>
143 * Formatting is performed backwards. The method starts at offset
144 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
145 * <code>digits</code> is the number of positions necessary to store the
146 * base 10 value.
147 * <p>
148 * The argument and return values from this method make it easy to chain
149 * writing, for example:
150 * </p>
151 *
152 * <pre>
153 * final byte[] tmp = new byte[64];
154 * int ptr = tmp.length;
155 * tmp[--ptr] = '\n';
156 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
157 * tmp[--ptr] = ' ';
158 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
159 * tmp[--ptr] = 0;
160 * final String str = new String(tmp, ptr, tmp.length - ptr);
161 * </pre>
162 *
163 * @param b
164 * buffer to write into.
165 * @param o
166 * one offset past the location where writing will begin; writing
167 * proceeds towards lower index values.
168 * @param value
169 * the value to store.
170 * @return the new offset value <code>o</code>. This is the position of
171 * the last byte written. Additional writing should start at one
172 * position earlier.
173 */
174 public static int formatBase10(final byte[] b, int o, int value) {
175 if (value == 0) {
176 b[--o] = '0';
177 return o;
178 }
179 final boolean isneg = value < 0;
180 if (isneg)
181 value = -value;
182 while (value != 0) {
183 b[--o] = base10byte[value % 10];
184 value /= 10;
185 }
186 if (isneg)
187 b[--o] = '-';
188 return o;
189 }
190
191 /**
192 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
193 * <p>
194 * Digit sequences can begin with an optional run of spaces before the
195 * sequence, and may start with a '+' or a '-' to indicate sign position.
196 * Any other characters will cause the method to stop and return the current
197 * result to the caller.
198 *
199 * @param b
200 * buffer to scan.
201 * @param ptr
202 * position within buffer to start parsing digits at.
203 * @param ptrResult
204 * optional location to return the new ptr value through. If null
205 * the ptr value will be discarded.
206 * @return the value at this location; 0 if the location is not a valid
207 * numeric.
208 */
209 public static final int parseBase10(final byte[] b, int ptr,
210 final MutableInteger ptrResult) {
211 int r = 0;
212 int sign = 0;
213 try {
214 final int sz = b.length;
215 while (ptr < sz && b[ptr] == ' ')
216 ptr++;
217 if (ptr >= sz)
218 return 0;
219
220 switch (b[ptr]) {
221 case '-':
222 sign = -1;
223 ptr++;
224 break;
225 case '+':
226 ptr++;
227 break;
228 }
229
230 while (ptr < sz) {
231 final byte v = digits10[b[ptr]];
232 if (v < 0)
233 break;
234 r = (r * 10) + v;
235 ptr++;
236 }
237 } catch (ArrayIndexOutOfBoundsException e) {
238 // Not a valid digit.
239 }
240 if (ptrResult != null)
241 ptrResult.value = ptr;
242 return sign < 0 ? -r : r;
243 }
244
245 /**
246 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
247 * <p>
248 * Digit sequences can begin with an optional run of spaces before the
249 * sequence, and may start with a '+' or a '-' to indicate sign position.
250 * Any other characters will cause the method to stop and return the current
251 * result to the caller.
252 *
253 * @param b
254 * buffer to scan.
255 * @param ptr
256 * position within buffer to start parsing digits at.
257 * @param ptrResult
258 * optional location to return the new ptr value through. If null
259 * the ptr value will be discarded.
260 * @return the value at this location; 0 if the location is not a valid
261 * numeric.
262 */
263 public static final long parseLongBase10(final byte[] b, int ptr,
264 final MutableInteger ptrResult) {
265 long r = 0;
266 int sign = 0;
267 try {
268 final int sz = b.length;
269 while (ptr < sz && b[ptr] == ' ')
270 ptr++;
271 if (ptr >= sz)
272 return 0;
273
274 switch (b[ptr]) {
275 case '-':
276 sign = -1;
277 ptr++;
278 break;
279 case '+':
280 ptr++;
281 break;
282 }
283
284 while (ptr < sz) {
285 final byte v = digits10[b[ptr]];
286 if (v < 0)
287 break;
288 r = (r * 10) + v;
289 ptr++;
290 }
291 } catch (ArrayIndexOutOfBoundsException e) {
292 // Not a valid digit.
293 }
294 if (ptrResult != null)
295 ptrResult.value = ptr;
296 return sign < 0 ? -r : r;
297 }
298
299 /**
300 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
301 * <p>
302 * The number is read in network byte order, that is, most significant
303 * nybble first.
304 *
305 * @param bs
306 * buffer to parse digits from; positions {@code [p, p+4)} will
307 * be parsed.
308 * @param p
309 * first position within the buffer to parse.
310 * @return the integer value.
311 * @throws java.lang.ArrayIndexOutOfBoundsException
312 * if the string is not hex formatted.
313 */
314 public static final int parseHexInt16(final byte[] bs, final int p) {
315 int r = digits16[bs[p]] << 4;
316
317 r |= digits16[bs[p + 1]];
318 r <<= 4;
319
320 r |= digits16[bs[p + 2]];
321 r <<= 4;
322
323 r |= digits16[bs[p + 3]];
324 if (r < 0)
325 throw new ArrayIndexOutOfBoundsException();
326 return r;
327 }
328
329 /**
330 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
331 * <p>
332 * The number is read in network byte order, that is, most significant
333 * nybble first.
334 *
335 * @param bs
336 * buffer to parse digits from; positions {@code [p, p+8)} will
337 * be parsed.
338 * @param p
339 * first position within the buffer to parse.
340 * @return the integer value.
341 * @throws java.lang.ArrayIndexOutOfBoundsException
342 * if the string is not hex formatted.
343 */
344 public static final int parseHexInt32(final byte[] bs, final int p) {
345 int r = digits16[bs[p]] << 4;
346
347 r |= digits16[bs[p + 1]];
348 r <<= 4;
349
350 r |= digits16[bs[p + 2]];
351 r <<= 4;
352
353 r |= digits16[bs[p + 3]];
354 r <<= 4;
355
356 r |= digits16[bs[p + 4]];
357 r <<= 4;
358
359 r |= digits16[bs[p + 5]];
360 r <<= 4;
361
362 r |= digits16[bs[p + 6]];
363
364 final int last = digits16[bs[p + 7]];
365 if (r < 0 || last < 0)
366 throw new ArrayIndexOutOfBoundsException();
367 return (r << 4) | last;
368 }
369
370 /**
371 * Parse 16 character base 16 (hex) formatted string to unsigned long.
372 * <p>
373 * The number is read in network byte order, that is, most significant
374 * nibble first.
375 *
376 * @param bs
377 * buffer to parse digits from; positions {@code [p, p+16)} will
378 * be parsed.
379 * @param p
380 * first position within the buffer to parse.
381 * @return the integer value.
382 * @throws java.lang.ArrayIndexOutOfBoundsException
383 * if the string is not hex formatted.
384 * @since 4.3
385 */
386 public static final long parseHexInt64(final byte[] bs, final int p) {
387 long r = digits16[bs[p]] << 4;
388
389 r |= digits16[bs[p + 1]];
390 r <<= 4;
391
392 r |= digits16[bs[p + 2]];
393 r <<= 4;
394
395 r |= digits16[bs[p + 3]];
396 r <<= 4;
397
398 r |= digits16[bs[p + 4]];
399 r <<= 4;
400
401 r |= digits16[bs[p + 5]];
402 r <<= 4;
403
404 r |= digits16[bs[p + 6]];
405 r <<= 4;
406
407 r |= digits16[bs[p + 7]];
408 r <<= 4;
409
410 r |= digits16[bs[p + 8]];
411 r <<= 4;
412
413 r |= digits16[bs[p + 9]];
414 r <<= 4;
415
416 r |= digits16[bs[p + 10]];
417 r <<= 4;
418
419 r |= digits16[bs[p + 11]];
420 r <<= 4;
421
422 r |= digits16[bs[p + 12]];
423 r <<= 4;
424
425 r |= digits16[bs[p + 13]];
426 r <<= 4;
427
428 r |= digits16[bs[p + 14]];
429
430 final int last = digits16[bs[p + 15]];
431 if (r < 0 || last < 0)
432 throw new ArrayIndexOutOfBoundsException();
433 return (r << 4) | last;
434 }
435
436 /**
437 * Parse a single hex digit to its numeric value (0-15).
438 *
439 * @param digit
440 * hex character to parse.
441 * @return numeric value, in the range 0-15.
442 * @throws java.lang.ArrayIndexOutOfBoundsException
443 * if the input digit is not a valid hex digit.
444 */
445 public static final int parseHexInt4(final byte digit) {
446 final byte r = digits16[digit];
447 if (r < 0)
448 throw new ArrayIndexOutOfBoundsException();
449 return r;
450 }
451
452 /**
453 * Parse a Git style timezone string.
454 * <p>
455 * The sequence "-0315" will be parsed as the numeric value -195, as the
456 * lower two positions count minutes, not 100ths of an hour.
457 *
458 * @param b
459 * buffer to scan.
460 * @param ptr
461 * position within buffer to start parsing digits at.
462 * @return the timezone at this location, expressed in minutes.
463 */
464 public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
465 return parseTimeZoneOffset(b, ptr, null);
466 }
467
468 /**
469 * Parse a Git style timezone string.
470 * <p>
471 * The sequence "-0315" will be parsed as the numeric value -195, as the
472 * lower two positions count minutes, not 100ths of an hour.
473 *
474 * @param b
475 * buffer to scan.
476 * @param ptr
477 * position within buffer to start parsing digits at.
478 * @param ptrResult
479 * optional location to return the new ptr value through. If null
480 * the ptr value will be discarded.
481 * @return the timezone at this location, expressed in minutes.
482 * @since 4.1
483 */
484 public static final int parseTimeZoneOffset(final byte[] b, int ptr,
485 MutableInteger ptrResult) {
486 final int v = parseBase10(b, ptr, ptrResult);
487 final int tzMins = v % 100;
488 final int tzHours = v / 100;
489 return tzHours * 60 + tzMins;
490 }
491
492 /**
493 * Locate the first position after a given character.
494 *
495 * @param b
496 * buffer to scan.
497 * @param ptr
498 * position within buffer to start looking for chrA at.
499 * @param chrA
500 * character to find.
501 * @return new position just after chrA.
502 */
503 public static final int next(final byte[] b, int ptr, final char chrA) {
504 final int sz = b.length;
505 while (ptr < sz) {
506 if (b[ptr++] == chrA)
507 return ptr;
508 }
509 return ptr;
510 }
511
512 /**
513 * Locate the first position after the next LF.
514 * <p>
515 * This method stops on the first '\n' it finds.
516 *
517 * @param b
518 * buffer to scan.
519 * @param ptr
520 * position within buffer to start looking for LF at.
521 * @return new position just after the first LF found.
522 */
523 public static final int nextLF(final byte[] b, int ptr) {
524 return next(b, ptr, '\n');
525 }
526
527 /**
528 * Locate the first position after either the given character or LF.
529 * <p>
530 * This method stops on the first match it finds from either chrA or '\n'.
531 *
532 * @param b
533 * buffer to scan.
534 * @param ptr
535 * position within buffer to start looking for chrA or LF at.
536 * @param chrA
537 * character to find.
538 * @return new position just after the first chrA or LF to be found.
539 */
540 public static final int nextLF(final byte[] b, int ptr, final char chrA) {
541 final int sz = b.length;
542 while (ptr < sz) {
543 final byte c = b[ptr++];
544 if (c == chrA || c == '\n')
545 return ptr;
546 }
547 return ptr;
548 }
549
550 /**
551 * Locate the first position before a given character.
552 *
553 * @param b
554 * buffer to scan.
555 * @param ptr
556 * position within buffer to start looking for chrA at.
557 * @param chrA
558 * character to find.
559 * @return new position just before chrA, -1 for not found
560 */
561 public static final int prev(final byte[] b, int ptr, final char chrA) {
562 if (ptr == b.length)
563 --ptr;
564 while (ptr >= 0) {
565 if (b[ptr--] == chrA)
566 return ptr;
567 }
568 return ptr;
569 }
570
571 /**
572 * Locate the first position before the previous LF.
573 * <p>
574 * This method stops on the first '\n' it finds.
575 *
576 * @param b
577 * buffer to scan.
578 * @param ptr
579 * position within buffer to start looking for LF at.
580 * @return new position just before the first LF found, -1 for not found
581 */
582 public static final int prevLF(final byte[] b, int ptr) {
583 return prev(b, ptr, '\n');
584 }
585
586 /**
587 * Locate the previous position before either the given character or LF.
588 * <p>
589 * This method stops on the first match it finds from either chrA or '\n'.
590 *
591 * @param b
592 * buffer to scan.
593 * @param ptr
594 * position within buffer to start looking for chrA or LF at.
595 * @param chrA
596 * character to find.
597 * @return new position just before the first chrA or LF to be found, -1 for
598 * not found
599 */
600 public static final int prevLF(final byte[] b, int ptr, final char chrA) {
601 if (ptr == b.length)
602 --ptr;
603 while (ptr >= 0) {
604 final byte c = b[ptr--];
605 if (c == chrA || c == '\n')
606 return ptr;
607 }
608 return ptr;
609 }
610
611 /**
612 * Index the region between <code>[ptr, end)</code> to find line starts.
613 * <p>
614 * The returned list is 1 indexed. Index 0 contains
615 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
616 * <p>
617 * Using a 1 indexed list means that line numbers can be directly accessed
618 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
619 * <code>ptr</code>.
620 * <p>
621 * The last element (index <code>map.size()-1</code>) always contains
622 * <code>end</code>.
623 * <p>
624 * If the data contains a '\0' anywhere, the whole region is considered
625 * binary and a LineMap corresponding to a single line is returned.
626 * </p>
627 *
628 * @param buf
629 * buffer to scan.
630 * @param ptr
631 * position within the buffer corresponding to the first byte of
632 * line 1.
633 * @param end
634 * 1 past the end of the content within <code>buf</code>.
635 * @return a line map indexing the start position of each line.
636 */
637 public static final IntList lineMap(final byte[] buf, int ptr, int end) {
638 int start = ptr;
639
640 // Experimentally derived from multiple source repositories
641 // the average number of bytes/line is 36. Its a rough guess
642 // to initially size our map close to the target.
643 IntList map = new IntList((end - ptr) / 36);
644 map.add(Integer.MIN_VALUE);
645 boolean foundLF = true;
646 for (; ptr < end; ptr++) {
647 if (foundLF) {
648 map.add(ptr);
649 }
650
651 if (buf[ptr] == '\0') {
652 // binary data.
653 map = new IntList(3);
654 map.add(Integer.MIN_VALUE);
655 map.add(start);
656 break;
657 }
658
659 foundLF = (buf[ptr] == '\n');
660 }
661 map.add(end);
662 return map;
663 }
664
665 /**
666 * Locate the "author " header line data.
667 *
668 * @param b
669 * buffer to scan.
670 * @param ptr
671 * position in buffer to start the scan at. Most callers should
672 * pass 0 to ensure the scan starts from the beginning of the
673 * commit buffer and does not accidentally look at message body.
674 * @return position just after the space in "author ", so the first
675 * character of the author's name. If no author header can be
676 * located -1 is returned.
677 */
678 public static final int author(final byte[] b, int ptr) {
679 final int sz = b.length;
680 if (ptr == 0)
681 ptr += 46; // skip the "tree ..." line.
682 while (ptr < sz && b[ptr] == 'p')
683 ptr += 48; // skip this parent.
684 return match(b, ptr, author);
685 }
686
687 /**
688 * Locate the "committer " header line data.
689 *
690 * @param b
691 * buffer to scan.
692 * @param ptr
693 * position in buffer to start the scan at. Most callers should
694 * pass 0 to ensure the scan starts from the beginning of the
695 * commit buffer and does not accidentally look at message body.
696 * @return position just after the space in "committer ", so the first
697 * character of the committer's name. If no committer header can be
698 * located -1 is returned.
699 */
700 public static final int committer(final byte[] b, int ptr) {
701 final int sz = b.length;
702 if (ptr == 0)
703 ptr += 46; // skip the "tree ..." line.
704 while (ptr < sz && b[ptr] == 'p')
705 ptr += 48; // skip this parent.
706 if (ptr < sz && b[ptr] == 'a')
707 ptr = nextLF(b, ptr);
708 return match(b, ptr, committer);
709 }
710
711 /**
712 * Locate the "tagger " header line data.
713 *
714 * @param b
715 * buffer to scan.
716 * @param ptr
717 * position in buffer to start the scan at. Most callers should
718 * pass 0 to ensure the scan starts from the beginning of the tag
719 * buffer and does not accidentally look at message body.
720 * @return position just after the space in "tagger ", so the first
721 * character of the tagger's name. If no tagger header can be
722 * located -1 is returned.
723 */
724 public static final int tagger(final byte[] b, int ptr) {
725 final int sz = b.length;
726 if (ptr == 0)
727 ptr += 48; // skip the "object ..." line.
728 while (ptr < sz) {
729 if (b[ptr] == '\n')
730 return -1;
731 final int m = match(b, ptr, tagger);
732 if (m >= 0)
733 return m;
734 ptr = nextLF(b, ptr);
735 }
736 return -1;
737 }
738
739 /**
740 * Locate the "encoding " header line.
741 *
742 * @param b
743 * buffer to scan.
744 * @param ptr
745 * position in buffer to start the scan at. Most callers should
746 * pass 0 to ensure the scan starts from the beginning of the
747 * buffer and does not accidentally look at the message body.
748 * @return position just after the space in "encoding ", so the first
749 * character of the encoding's name. If no encoding header can be
750 * located -1 is returned (and UTF-8 should be assumed).
751 */
752 public static final int encoding(final byte[] b, int ptr) {
753 final int sz = b.length;
754 while (ptr < sz) {
755 if (b[ptr] == '\n')
756 return -1;
757 if (b[ptr] == 'e')
758 break;
759 ptr = nextLF(b, ptr);
760 }
761 return match(b, ptr, encoding);
762 }
763
764 /**
765 * Parse the "encoding " header as a string.
766 * <p>
767 * Locates the "encoding " header (if present) and returns its value.
768 *
769 * @param b
770 * buffer to scan.
771 * @return the encoding header as specified in the commit; null if the
772 * header was not present and should be assumed.
773 * @since 4.2
774 */
775 @Nullable
776 public static String parseEncodingName(final byte[] b) {
777 int enc = encoding(b, 0);
778 if (enc < 0) {
779 return null;
780 }
781 int lf = nextLF(b, enc);
782 return decode(UTF_8, b, enc, lf - 1);
783 }
784
785 /**
786 * Parse the "encoding " header into a character set reference.
787 * <p>
788 * Locates the "encoding " header (if present) by first calling
789 * {@link #encoding(byte[], int)} and then returns the proper character set
790 * to apply to this buffer to evaluate its contents as character data.
791 * <p>
792 * If no encoding header is present {@code UTF-8} is assumed.
793 *
794 * @param b
795 * buffer to scan.
796 * @return the Java character set representation. Never null.
797 * @throws IllegalCharsetNameException
798 * if the character set requested by the encoding header is
799 * malformed and unsupportable.
800 * @throws UnsupportedCharsetException
801 * if the JRE does not support the character set requested by
802 * the encoding header.
803 */
804 public static Charset parseEncoding(final byte[] b) {
805 String enc = parseEncodingName(b);
806 if (enc == null) {
807 return UTF_8;
808 }
809
810 String name = enc.trim();
811 try {
812 return Charset.forName(name);
813 } catch (IllegalCharsetNameException
814 | UnsupportedCharsetException badName) {
815 Charset aliased = charsetForAlias(name);
816 if (aliased != null) {
817 return aliased;
818 }
819 throw badName;
820 }
821 }
822
823 /**
824 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
825 * <p>
826 * Leading spaces won't be trimmed from the string, i.e. will show up in the
827 * parsed name afterwards.
828 *
829 * @param in
830 * the string to parse a name from.
831 * @return the parsed identity or null in case the identity could not be
832 * parsed.
833 */
834 public static PersonIdent parsePersonIdent(final String in) {
835 return parsePersonIdent(Constants.encode(in), 0);
836 }
837
838 /**
839 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
840 * <p>
841 * When passing in a value for <code>nameB</code> callers should use the
842 * return value of {@link #author(byte[], int)} or
843 * {@link #committer(byte[], int)}, as these methods provide the proper
844 * position within the buffer.
845 *
846 * @param raw
847 * the buffer to parse character data from.
848 * @param nameB
849 * first position of the identity information. This should be the
850 * first position after the space which delimits the header field
851 * name (e.g. "author" or "committer") from the rest of the
852 * identity line.
853 * @return the parsed identity or null in case the identity could not be
854 * parsed.
855 */
856 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
857 Charset cs;
858 try {
859 cs = parseEncoding(raw);
860 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
861 // Assume UTF-8 for person identities, usually this is correct.
862 // If not decode() will fall back to the ISO-8859-1 encoding.
863 cs = UTF_8;
864 }
865
866 final int emailB = nextLF(raw, nameB, '<');
867 final int emailE = nextLF(raw, emailB, '>');
868 if (emailB >= raw.length || raw[emailB] == '\n' ||
869 (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
870 return null;
871
872 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
873 emailB - 2 : emailB - 1;
874 final String name = decode(cs, raw, nameB, nameEnd);
875 final String email = decode(cs, raw, emailB, emailE - 1);
876
877 // Start searching from end of line, as after first name-email pair,
878 // another name-email pair may occur. We will ignore all kinds of
879 // "junk" following the first email.
880 //
881 // We've to use (emailE - 1) for the case that raw[email] is LF,
882 // otherwise we would run too far. "-2" is necessary to position
883 // before the LF in case of LF termination resp. the penultimate
884 // character if there is no trailing LF.
885 final int tzBegin = lastIndexOfTrim(raw, ' ',
886 nextLF(raw, emailE - 1) - 2) + 1;
887 if (tzBegin <= emailE) // No time/zone, still valid
888 return new PersonIdent(name, email, 0, 0);
889
890 final int whenBegin = Math.max(emailE,
891 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
892 if (whenBegin >= tzBegin - 1) // No time/zone, still valid
893 return new PersonIdent(name, email, 0, 0);
894
895 final long when = parseLongBase10(raw, whenBegin, null);
896 final int tz = parseTimeZoneOffset(raw, tzBegin);
897 return new PersonIdent(name, email, when * 1000L, tz);
898 }
899
900 /**
901 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
902 * <p>
903 * When passing in a value for <code>nameB</code> callers should use the
904 * return value of {@link #author(byte[], int)} or
905 * {@link #committer(byte[], int)}, as these methods provide the proper
906 * position within the buffer.
907 *
908 * @param raw
909 * the buffer to parse character data from.
910 * @param nameB
911 * first position of the identity information. This should be the
912 * first position after the space which delimits the header field
913 * name (e.g. "author" or "committer") from the rest of the
914 * identity line.
915 * @return the parsed identity. Never null.
916 */
917 public static PersonIdent parsePersonIdentOnly(final byte[] raw,
918 final int nameB) {
919 int stop = nextLF(raw, nameB);
920 int emailB = nextLF(raw, nameB, '<');
921 int emailE = nextLF(raw, emailB, '>');
922 final String name;
923 final String email;
924 if (emailE < stop) {
925 email = decode(raw, emailB, emailE - 1);
926 } else {
927 email = "invalid"; //$NON-NLS-1$
928 }
929 if (emailB < stop)
930 name = decode(raw, nameB, emailB - 2);
931 else
932 name = decode(raw, nameB, stop);
933
934 final MutableInteger ptrout = new MutableInteger();
935 long when;
936 int tz;
937 if (emailE < stop) {
938 when = parseLongBase10(raw, emailE + 1, ptrout);
939 tz = parseTimeZoneOffset(raw, ptrout.value);
940 } else {
941 when = 0;
942 tz = 0;
943 }
944 return new PersonIdent(name, email, when * 1000L, tz);
945 }
946
947 /**
948 * Locate the end of a footer line key string.
949 * <p>
950 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
951 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
952 * the first ':'.
953 * <p>
954 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
955 * then this method returns -1.
956 *
957 * @param raw
958 * buffer to scan.
959 * @param ptr
960 * first position within raw to consider as a footer line key.
961 * @return position of the ':' which terminates the footer line key if this
962 * is otherwise a valid footer line key; otherwise -1.
963 */
964 public static int endOfFooterLineKey(final byte[] raw, int ptr) {
965 try {
966 for (;;) {
967 final byte c = raw[ptr];
968 if (footerLineKeyChars[c] == 0) {
969 if (c == ':')
970 return ptr;
971 return -1;
972 }
973 ptr++;
974 }
975 } catch (ArrayIndexOutOfBoundsException e) {
976 return -1;
977 }
978 }
979
980 /**
981 * Decode a buffer under UTF-8, if possible.
982 *
983 * If the byte stream cannot be decoded that way, the platform default is tried
984 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
985 *
986 * @param buffer
987 * buffer to pull raw bytes from.
988 * @return a string representation of the range <code>[start,end)</code>,
989 * after decoding the region through the specified character set.
990 */
991 public static String decode(final byte[] buffer) {
992 return decode(buffer, 0, buffer.length);
993 }
994
995 /**
996 * Decode a buffer under UTF-8, if possible.
997 *
998 * If the byte stream cannot be decoded that way, the platform default is
999 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1000 *
1001 * @param buffer
1002 * buffer to pull raw bytes from.
1003 * @param start
1004 * start position in buffer
1005 * @param end
1006 * one position past the last location within the buffer to take
1007 * data from.
1008 * @return a string representation of the range <code>[start,end)</code>,
1009 * after decoding the region through the specified character set.
1010 */
1011 public static String decode(final byte[] buffer, final int start,
1012 final int end) {
1013 return decode(UTF_8, buffer, start, end);
1014 }
1015
1016 /**
1017 * Decode a buffer under the specified character set if possible.
1018 *
1019 * If the byte stream cannot be decoded that way, the platform default is tried
1020 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1021 *
1022 * @param cs
1023 * character set to use when decoding the buffer.
1024 * @param buffer
1025 * buffer to pull raw bytes from.
1026 * @return a string representation of the range <code>[start,end)</code>,
1027 * after decoding the region through the specified character set.
1028 */
1029 public static String decode(final Charset cs, final byte[] buffer) {
1030 return decode(cs, buffer, 0, buffer.length);
1031 }
1032
1033 /**
1034 * Decode a region of the buffer under the specified character set if possible.
1035 *
1036 * If the byte stream cannot be decoded that way, the platform default is tried
1037 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1038 *
1039 * @param cs
1040 * character set to use when decoding the buffer.
1041 * @param buffer
1042 * buffer to pull raw bytes from.
1043 * @param start
1044 * first position within the buffer to take data from.
1045 * @param end
1046 * one position past the last location within the buffer to take
1047 * data from.
1048 * @return a string representation of the range <code>[start,end)</code>,
1049 * after decoding the region through the specified character set.
1050 */
1051 public static String decode(final Charset cs, final byte[] buffer,
1052 final int start, final int end) {
1053 try {
1054 return decodeNoFallback(cs, buffer, start, end);
1055 } catch (CharacterCodingException e) {
1056 // Fall back to an ISO-8859-1 style encoding. At least all of
1057 // the bytes will be present in the output.
1058 //
1059 return extractBinaryString(buffer, start, end);
1060 }
1061 }
1062
1063 /**
1064 * Decode a region of the buffer under the specified character set if
1065 * possible.
1066 *
1067 * If the byte stream cannot be decoded that way, the platform default is
1068 * tried and if that too fails, an exception is thrown.
1069 *
1070 * @param cs
1071 * character set to use when decoding the buffer.
1072 * @param buffer
1073 * buffer to pull raw bytes from.
1074 * @param start
1075 * first position within the buffer to take data from.
1076 * @param end
1077 * one position past the last location within the buffer to take
1078 * data from.
1079 * @return a string representation of the range <code>[start,end)</code>,
1080 * after decoding the region through the specified character set.
1081 * @throws java.nio.charset.CharacterCodingException
1082 * the input is not in any of the tested character sets.
1083 */
1084 public static String decodeNoFallback(final Charset cs,
1085 final byte[] buffer, final int start, final int end)
1086 throws CharacterCodingException {
1087 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1088 b.mark();
1089
1090 // Try our built-in favorite. The assumption here is that
1091 // decoding will fail if the data is not actually encoded
1092 // using that encoder.
1093 try {
1094 return decode(b, UTF_8);
1095 } catch (CharacterCodingException e) {
1096 b.reset();
1097 }
1098
1099 if (!cs.equals(UTF_8)) {
1100 // Try the suggested encoding, it might be right since it was
1101 // provided by the caller.
1102 try {
1103 return decode(b, cs);
1104 } catch (CharacterCodingException e) {
1105 b.reset();
1106 }
1107 }
1108
1109 // Try the default character set. A small group of people
1110 // might actually use the same (or very similar) locale.
1111 Charset defcs = Charset.defaultCharset();
1112 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1113 try {
1114 return decode(b, defcs);
1115 } catch (CharacterCodingException e) {
1116 b.reset();
1117 }
1118 }
1119
1120 throw new CharacterCodingException();
1121 }
1122
1123 /**
1124 * Decode a region of the buffer under the ISO-8859-1 encoding.
1125 *
1126 * Each byte is treated as a single character in the 8859-1 character
1127 * encoding, performing a raw binary->char conversion.
1128 *
1129 * @param buffer
1130 * buffer to pull raw bytes from.
1131 * @param start
1132 * first position within the buffer to take data from.
1133 * @param end
1134 * one position past the last location within the buffer to take
1135 * data from.
1136 * @return a string representation of the range <code>[start,end)</code>.
1137 */
1138 public static String extractBinaryString(final byte[] buffer,
1139 final int start, final int end) {
1140 final StringBuilder r = new StringBuilder(end - start);
1141 for (int i = start; i < end; i++)
1142 r.append((char) (buffer[i] & 0xff));
1143 return r.toString();
1144 }
1145
1146 private static String decode(final ByteBuffer b, final Charset charset)
1147 throws CharacterCodingException {
1148 final CharsetDecoder d = charset.newDecoder();
1149 d.onMalformedInput(CodingErrorAction.REPORT);
1150 d.onUnmappableCharacter(CodingErrorAction.REPORT);
1151 return d.decode(b).toString();
1152 }
1153
1154 /**
1155 * Locate the position of the commit message body.
1156 *
1157 * @param b
1158 * buffer to scan.
1159 * @param ptr
1160 * position in buffer to start the scan at. Most callers should
1161 * pass 0 to ensure the scan starts from the beginning of the
1162 * commit buffer.
1163 * @return position of the user's message buffer.
1164 */
1165 public static final int commitMessage(final byte[] b, int ptr) {
1166 final int sz = b.length;
1167 if (ptr == 0)
1168 ptr += 46; // skip the "tree ..." line.
1169 while (ptr < sz && b[ptr] == 'p')
1170 ptr += 48; // skip this parent.
1171
1172 // Skip any remaining header lines, ignoring what their actual
1173 // header line type is. This is identical to the logic for a tag.
1174 //
1175 return tagMessage(b, ptr);
1176 }
1177
1178 /**
1179 * Locate the position of the tag message body.
1180 *
1181 * @param b
1182 * buffer to scan.
1183 * @param ptr
1184 * position in buffer to start the scan at. Most callers should
1185 * pass 0 to ensure the scan starts from the beginning of the tag
1186 * buffer.
1187 * @return position of the user's message buffer.
1188 */
1189 public static final int tagMessage(final byte[] b, int ptr) {
1190 final int sz = b.length;
1191 if (ptr == 0)
1192 ptr += 48; // skip the "object ..." line.
1193 while (ptr < sz && b[ptr] != '\n')
1194 ptr = nextLF(b, ptr);
1195 if (ptr < sz && b[ptr] == '\n')
1196 return ptr + 1;
1197 return -1;
1198 }
1199
1200 /**
1201 * Locate the end of a paragraph.
1202 * <p>
1203 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1204 *
1205 * @param b
1206 * buffer to scan.
1207 * @param start
1208 * position in buffer to start the scan at. Most callers will
1209 * want to pass the first position of the commit message (as
1210 * found by {@link #commitMessage(byte[], int)}.
1211 * @return position of the LF at the end of the paragraph;
1212 * <code>b.length</code> if no paragraph end could be located.
1213 */
1214 public static final int endOfParagraph(final byte[] b, final int start) {
1215 int ptr = start;
1216 final int sz = b.length;
1217 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1218 ptr = nextLF(b, ptr);
1219 if (ptr > start && b[ptr - 1] == '\n')
1220 ptr--;
1221 if (ptr > start && b[ptr - 1] == '\r')
1222 ptr--;
1223 return ptr;
1224 }
1225
1226 /**
1227 * Get last index of {@code ch} in raw, trimming spaces.
1228 *
1229 * @param raw
1230 * buffer to scan.
1231 * @param ch
1232 * character to find.
1233 * @param pos
1234 * starting position.
1235 * @return last index of {@code ch} in raw, trimming spaces.
1236 * @since 4.1
1237 */
1238 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1239 while (pos >= 0 && raw[pos] == ' ')
1240 pos--;
1241
1242 while (pos >= 0 && raw[pos] != ch)
1243 pos--;
1244
1245 return pos;
1246 }
1247
1248 private static Charset charsetForAlias(String name) {
1249 return encodingAliases.get(StringUtils.toLowerCase(name));
1250 }
1251
private RawParseUtils() {
	// Static-only utility class: the private constructor prevents
	// instantiation from outside this class.
}
1255 }