1 /*
2 * Copyright (C) 2008-2009, Google Inc.
3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4 * and other copyright owners as documented in the project's IP log.
5 *
6 * This program and the accompanying materials are made available
7 * under the terms of the Eclipse Distribution License v1.0 which
8 * accompanies this distribution, is reproduced below, and is
9 * available at http://www.eclipse.org/org/documents/edl-v10.php
10 *
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials provided
23 * with the distribution.
24 *
25 * - Neither the name of the Eclipse Foundation, Inc. nor the
26 * names of its contributors may be used to endorse or promote
27 * products derived from this software without specific prior
28 * written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 package org.eclipse.jgit.util;
46
47 import static java.nio.charset.StandardCharsets.ISO_8859_1;
48 import static java.nio.charset.StandardCharsets.UTF_8;
49 import static org.eclipse.jgit.lib.ObjectChecker.author;
50 import static org.eclipse.jgit.lib.ObjectChecker.committer;
51 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52 import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53
54 import java.nio.ByteBuffer;
55 import java.nio.charset.CharacterCodingException;
56 import java.nio.charset.Charset;
57 import java.nio.charset.CharsetDecoder;
58 import java.nio.charset.CodingErrorAction;
59 import java.nio.charset.IllegalCharsetNameException;
60 import java.nio.charset.UnsupportedCharsetException;
61 import java.util.Arrays;
62 import java.util.HashMap;
63 import java.util.Map;
64
65 import org.eclipse.jgit.annotations.Nullable;
66 import org.eclipse.jgit.lib.Constants;
67 import org.eclipse.jgit.lib.PersonIdent;
68
69 /** Handy utility functions to parse raw object contents. */
70 public final class RawParseUtils {
/**
 * Shared reference to the UTF-8 character set.
 *
 * @since 2.2
 */
public static final Charset UTF8_CHARSET = UTF_8;

/** Decimal digit values indexed by ASCII code ('0'..'9'); -1 elsewhere. */
private static final byte[] digits10;

/** Hex digit values indexed by ASCII code (0-9, a-f, A-F); -1 elsewhere. */
private static final byte[] digits16;

/** Holds 1 at every index that is a footer key character, 0 elsewhere. */
private static final byte[] footerLineKeyChars;

/** Additional encoding names mapped to the charset they stand for. */
private static final Map<String, Charset> encodingAliases;

static {
	// Each table is built in a local first and published once complete.
	final Map<String, Charset> aliases = new HashMap<>();
	aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
	aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
	encodingAliases = aliases;

	// The decimal table ends at '9'; larger indexes are out of bounds.
	final byte[] dec = new byte['9' + 1];
	Arrays.fill(dec, (byte) -1);
	for (int n = 0; n < 10; n++)
		dec['0' + n] = (byte) n;
	digits10 = dec;

	// The hex table ends at 'f' and accepts both letter cases.
	final byte[] hex = new byte['f' + 1];
	Arrays.fill(hex, (byte) -1);
	for (int n = 0; n < 10; n++)
		hex['0' + n] = (byte) n;
	for (int n = 0; n < 6; n++) {
		hex['a' + n] = (byte) (10 + n);
		hex['A' + n] = (byte) (10 + n);
	}
	digits16 = hex;

	// Footer keys are made of '-', digits and ASCII letters.
	final byte[] keyChars = new byte['z' + 1];
	keyChars['-'] = 1;
	for (int c = 0; c < keyChars.length; c++) {
		final boolean digit = '0' <= c && c <= '9';
		final boolean upper = 'A' <= c && c <= 'Z';
		final boolean lower = 'a' <= c && c <= 'z';
		if (digit || upper || lower)
			keyChars[c] = 1;
	}
	footerLineKeyChars = keyChars;
}
114
115 /**
116 * Determine if b[ptr] matches src.
117 *
118 * @param b
119 * the buffer to scan.
120 * @param ptr
121 * first position within b, this should match src[0].
122 * @param src
123 * the buffer to test for equality with b.
124 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
125 */
126 public static final int match(final byte[] b, int ptr, final byte[] src) {
127 if (ptr + src.length > b.length)
128 return -1;
129 for (int i = 0; i < src.length; i++, ptr++)
130 if (b[ptr] != src[i])
131 return -1;
132 return ptr;
133 }
134
135 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
136 '6', '7', '8', '9' };
137
138 /**
139 * Format a base 10 numeric into a temporary buffer.
140 * <p>
141 * Formatting is performed backwards. The method starts at offset
142 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
143 * <code>digits</code> is the number of positions necessary to store the
144 * base 10 value.
145 * <p>
146 * The argument and return values from this method make it easy to chain
147 * writing, for example:
148 * </p>
149 *
150 * <pre>
151 * final byte[] tmp = new byte[64];
152 * int ptr = tmp.length;
153 * tmp[--ptr] = '\n';
154 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
155 * tmp[--ptr] = ' ';
156 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
157 * tmp[--ptr] = 0;
158 * final String str = new String(tmp, ptr, tmp.length - ptr);
159 * </pre>
160 *
161 * @param b
162 * buffer to write into.
163 * @param o
164 * one offset past the location where writing will begin; writing
165 * proceeds towards lower index values.
166 * @param value
167 * the value to store.
168 * @return the new offset value <code>o</code>. This is the position of
169 * the last byte written. Additional writing should start at one
170 * position earlier.
171 */
172 public static int formatBase10(final byte[] b, int o, int value) {
173 if (value == 0) {
174 b[--o] = '0';
175 return o;
176 }
177 final boolean isneg = value < 0;
178 if (isneg)
179 value = -value;
180 while (value != 0) {
181 b[--o] = base10byte[value % 10];
182 value /= 10;
183 }
184 if (isneg)
185 b[--o] = '-';
186 return o;
187 }
188
189 /**
190 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
191 * <p>
192 * Digit sequences can begin with an optional run of spaces before the
193 * sequence, and may start with a '+' or a '-' to indicate sign position.
194 * Any other characters will cause the method to stop and return the current
195 * result to the caller.
196 *
197 * @param b
198 * buffer to scan.
199 * @param ptr
200 * position within buffer to start parsing digits at.
201 * @param ptrResult
202 * optional location to return the new ptr value through. If null
203 * the ptr value will be discarded.
204 * @return the value at this location; 0 if the location is not a valid
205 * numeric.
206 */
207 public static final int parseBase10(final byte[] b, int ptr,
208 final MutableInteger ptrResult) {
209 int r = 0;
210 int sign = 0;
211 try {
212 final int sz = b.length;
213 while (ptr < sz && b[ptr] == ' ')
214 ptr++;
215 if (ptr >= sz)
216 return 0;
217
218 switch (b[ptr]) {
219 case '-':
220 sign = -1;
221 ptr++;
222 break;
223 case '+':
224 ptr++;
225 break;
226 }
227
228 while (ptr < sz) {
229 final byte v = digits10[b[ptr]];
230 if (v < 0)
231 break;
232 r = (r * 10) + v;
233 ptr++;
234 }
235 } catch (ArrayIndexOutOfBoundsException e) {
236 // Not a valid digit.
237 }
238 if (ptrResult != null)
239 ptrResult.value = ptr;
240 return sign < 0 ? -r : r;
241 }
242
243 /**
244 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
245 * <p>
246 * Digit sequences can begin with an optional run of spaces before the
247 * sequence, and may start with a '+' or a '-' to indicate sign position.
248 * Any other characters will cause the method to stop and return the current
249 * result to the caller.
250 *
251 * @param b
252 * buffer to scan.
253 * @param ptr
254 * position within buffer to start parsing digits at.
255 * @param ptrResult
256 * optional location to return the new ptr value through. If null
257 * the ptr value will be discarded.
258 * @return the value at this location; 0 if the location is not a valid
259 * numeric.
260 */
261 public static final long parseLongBase10(final byte[] b, int ptr,
262 final MutableInteger ptrResult) {
263 long r = 0;
264 int sign = 0;
265 try {
266 final int sz = b.length;
267 while (ptr < sz && b[ptr] == ' ')
268 ptr++;
269 if (ptr >= sz)
270 return 0;
271
272 switch (b[ptr]) {
273 case '-':
274 sign = -1;
275 ptr++;
276 break;
277 case '+':
278 ptr++;
279 break;
280 }
281
282 while (ptr < sz) {
283 final byte v = digits10[b[ptr]];
284 if (v < 0)
285 break;
286 r = (r * 10) + v;
287 ptr++;
288 }
289 } catch (ArrayIndexOutOfBoundsException e) {
290 // Not a valid digit.
291 }
292 if (ptrResult != null)
293 ptrResult.value = ptr;
294 return sign < 0 ? -r : r;
295 }
296
297 /**
298 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
299 * <p>
300 * The number is read in network byte order, that is, most significant
301 * nybble first.
302 *
303 * @param bs
304 * buffer to parse digits from; positions {@code [p, p+4)} will
305 * be parsed.
306 * @param p
307 * first position within the buffer to parse.
308 * @return the integer value.
309 * @throws ArrayIndexOutOfBoundsException
310 * if the string is not hex formatted.
311 */
312 public static final int parseHexInt16(final byte[] bs, final int p) {
313 int r = digits16[bs[p]] << 4;
314
315 r |= digits16[bs[p + 1]];
316 r <<= 4;
317
318 r |= digits16[bs[p + 2]];
319 r <<= 4;
320
321 r |= digits16[bs[p + 3]];
322 if (r < 0)
323 throw new ArrayIndexOutOfBoundsException();
324 return r;
325 }
326
327 /**
328 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
329 * <p>
330 * The number is read in network byte order, that is, most significant
331 * nybble first.
332 *
333 * @param bs
334 * buffer to parse digits from; positions {@code [p, p+8)} will
335 * be parsed.
336 * @param p
337 * first position within the buffer to parse.
338 * @return the integer value.
339 * @throws ArrayIndexOutOfBoundsException
340 * if the string is not hex formatted.
341 */
342 public static final int parseHexInt32(final byte[] bs, final int p) {
343 int r = digits16[bs[p]] << 4;
344
345 r |= digits16[bs[p + 1]];
346 r <<= 4;
347
348 r |= digits16[bs[p + 2]];
349 r <<= 4;
350
351 r |= digits16[bs[p + 3]];
352 r <<= 4;
353
354 r |= digits16[bs[p + 4]];
355 r <<= 4;
356
357 r |= digits16[bs[p + 5]];
358 r <<= 4;
359
360 r |= digits16[bs[p + 6]];
361
362 final int last = digits16[bs[p + 7]];
363 if (r < 0 || last < 0)
364 throw new ArrayIndexOutOfBoundsException();
365 return (r << 4) | last;
366 }
367
368 /**
369 * Parse 16 character base 16 (hex) formatted string to unsigned long.
370 * <p>
371 * The number is read in network byte order, that is, most significant
372 * nibble first.
373 *
374 * @param bs
375 * buffer to parse digits from; positions {@code [p, p+16)} will
376 * be parsed.
377 * @param p
378 * first position within the buffer to parse.
379 * @return the integer value.
380 * @throws ArrayIndexOutOfBoundsException
381 * if the string is not hex formatted.
382 * @since 4.3
383 */
384 public static final long parseHexInt64(final byte[] bs, final int p) {
385 long r = digits16[bs[p]] << 4;
386
387 r |= digits16[bs[p + 1]];
388 r <<= 4;
389
390 r |= digits16[bs[p + 2]];
391 r <<= 4;
392
393 r |= digits16[bs[p + 3]];
394 r <<= 4;
395
396 r |= digits16[bs[p + 4]];
397 r <<= 4;
398
399 r |= digits16[bs[p + 5]];
400 r <<= 4;
401
402 r |= digits16[bs[p + 6]];
403 r <<= 4;
404
405 r |= digits16[bs[p + 7]];
406 r <<= 4;
407
408 r |= digits16[bs[p + 8]];
409 r <<= 4;
410
411 r |= digits16[bs[p + 9]];
412 r <<= 4;
413
414 r |= digits16[bs[p + 10]];
415 r <<= 4;
416
417 r |= digits16[bs[p + 11]];
418 r <<= 4;
419
420 r |= digits16[bs[p + 12]];
421 r <<= 4;
422
423 r |= digits16[bs[p + 13]];
424 r <<= 4;
425
426 r |= digits16[bs[p + 14]];
427
428 final int last = digits16[bs[p + 15]];
429 if (r < 0 || last < 0)
430 throw new ArrayIndexOutOfBoundsException();
431 return (r << 4) | last;
432 }
433
434 /**
435 * Parse a single hex digit to its numeric value (0-15).
436 *
437 * @param digit
438 * hex character to parse.
439 * @return numeric value, in the range 0-15.
440 * @throws ArrayIndexOutOfBoundsException
441 * if the input digit is not a valid hex digit.
442 */
443 public static final int parseHexInt4(final byte digit) {
444 final byte r = digits16[digit];
445 if (r < 0)
446 throw new ArrayIndexOutOfBoundsException();
447 return r;
448 }
449
450 /**
451 * Parse a Git style timezone string.
452 * <p>
453 * The sequence "-0315" will be parsed as the numeric value -195, as the
454 * lower two positions count minutes, not 100ths of an hour.
455 *
456 * @param b
457 * buffer to scan.
458 * @param ptr
459 * position within buffer to start parsing digits at.
460 * @return the timezone at this location, expressed in minutes.
461 */
462 public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
463 return parseTimeZoneOffset(b, ptr, null);
464 }
465
466 /**
467 * Parse a Git style timezone string.
468 * <p>
469 * The sequence "-0315" will be parsed as the numeric value -195, as the
470 * lower two positions count minutes, not 100ths of an hour.
471 *
472 * @param b
473 * buffer to scan.
474 * @param ptr
475 * position within buffer to start parsing digits at.
476 * @param ptrResult
477 * optional location to return the new ptr value through. If null
478 * the ptr value will be discarded.
479 * @return the timezone at this location, expressed in minutes.
480 * @since 4.1
481 */
482 public static final int parseTimeZoneOffset(final byte[] b, int ptr,
483 MutableInteger ptrResult) {
484 final int v = parseBase10(b, ptr, ptrResult);
485 final int tzMins = v % 100;
486 final int tzHours = v / 100;
487 return tzHours * 60 + tzMins;
488 }
489
490 /**
491 * Locate the first position after a given character.
492 *
493 * @param b
494 * buffer to scan.
495 * @param ptr
496 * position within buffer to start looking for chrA at.
497 * @param chrA
498 * character to find.
499 * @return new position just after chrA.
500 */
501 public static final int next(final byte[] b, int ptr, final char chrA) {
502 final int sz = b.length;
503 while (ptr < sz) {
504 if (b[ptr++] == chrA)
505 return ptr;
506 }
507 return ptr;
508 }
509
510 /**
511 * Locate the first position after the next LF.
512 * <p>
513 * This method stops on the first '\n' it finds.
514 *
515 * @param b
516 * buffer to scan.
517 * @param ptr
518 * position within buffer to start looking for LF at.
519 * @return new position just after the first LF found.
520 */
521 public static final int nextLF(final byte[] b, int ptr) {
522 return next(b, ptr, '\n');
523 }
524
525 /**
526 * Locate the first position after either the given character or LF.
527 * <p>
528 * This method stops on the first match it finds from either chrA or '\n'.
529 *
530 * @param b
531 * buffer to scan.
532 * @param ptr
533 * position within buffer to start looking for chrA or LF at.
534 * @param chrA
535 * character to find.
536 * @return new position just after the first chrA or LF to be found.
537 */
538 public static final int nextLF(final byte[] b, int ptr, final char chrA) {
539 final int sz = b.length;
540 while (ptr < sz) {
541 final byte c = b[ptr++];
542 if (c == chrA || c == '\n')
543 return ptr;
544 }
545 return ptr;
546 }
547
548 /**
549 * Locate the first position before a given character.
550 *
551 * @param b
552 * buffer to scan.
553 * @param ptr
554 * position within buffer to start looking for chrA at.
555 * @param chrA
556 * character to find.
557 * @return new position just before chrA, -1 for not found
558 */
559 public static final int prev(final byte[] b, int ptr, final char chrA) {
560 if (ptr == b.length)
561 --ptr;
562 while (ptr >= 0) {
563 if (b[ptr--] == chrA)
564 return ptr;
565 }
566 return ptr;
567 }
568
569 /**
570 * Locate the first position before the previous LF.
571 * <p>
572 * This method stops on the first '\n' it finds.
573 *
574 * @param b
575 * buffer to scan.
576 * @param ptr
577 * position within buffer to start looking for LF at.
578 * @return new position just before the first LF found, -1 for not found
579 */
580 public static final int prevLF(final byte[] b, int ptr) {
581 return prev(b, ptr, '\n');
582 }
583
584 /**
585 * Locate the previous position before either the given character or LF.
586 * <p>
587 * This method stops on the first match it finds from either chrA or '\n'.
588 *
589 * @param b
590 * buffer to scan.
591 * @param ptr
592 * position within buffer to start looking for chrA or LF at.
593 * @param chrA
594 * character to find.
595 * @return new position just before the first chrA or LF to be found, -1 for
596 * not found
597 */
598 public static final int prevLF(final byte[] b, int ptr, final char chrA) {
599 if (ptr == b.length)
600 --ptr;
601 while (ptr >= 0) {
602 final byte c = b[ptr--];
603 if (c == chrA || c == '\n')
604 return ptr;
605 }
606 return ptr;
607 }
608
609 /**
610 * Index the region between <code>[ptr, end)</code> to find line starts.
611 * <p>
612 * The returned list is 1 indexed. Index 0 contains
613 * {@link Integer#MIN_VALUE} to pad the list out.
614 * <p>
615 * Using a 1 indexed list means that line numbers can be directly accessed
616 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
617 * <code>ptr</code>.
618 * <p>
619 * The last element (index <code>map.size()-1</code>) always contains
620 * <code>end</code>.
621 *
622 * @param buf
623 * buffer to scan.
624 * @param ptr
625 * position within the buffer corresponding to the first byte of
626 * line 1.
627 * @param end
628 * 1 past the end of the content within <code>buf</code>.
629 * @return a line map indexing the start position of each line.
630 */
631 public static final IntList lineMap(final byte[] buf, int ptr, int end) {
632 // Experimentally derived from multiple source repositories
633 // the average number of bytes/line is 36. Its a rough guess
634 // to initially size our map close to the target.
635 //
636 final IntList map = new IntList((end - ptr) / 36);
637 map.fillTo(1, Integer.MIN_VALUE);
638 for (; ptr < end; ptr = nextLF(buf, ptr))
639 map.add(ptr);
640 map.add(end);
641 return map;
642 }
643
644 /**
645 * Locate the "author " header line data.
646 *
647 * @param b
648 * buffer to scan.
649 * @param ptr
650 * position in buffer to start the scan at. Most callers should
651 * pass 0 to ensure the scan starts from the beginning of the
652 * commit buffer and does not accidentally look at message body.
653 * @return position just after the space in "author ", so the first
654 * character of the author's name. If no author header can be
655 * located -1 is returned.
656 */
657 public static final int author(final byte[] b, int ptr) {
658 final int sz = b.length;
659 if (ptr == 0)
660 ptr += 46; // skip the "tree ..." line.
661 while (ptr < sz && b[ptr] == 'p')
662 ptr += 48; // skip this parent.
663 return match(b, ptr, author);
664 }
665
666 /**
667 * Locate the "committer " header line data.
668 *
669 * @param b
670 * buffer to scan.
671 * @param ptr
672 * position in buffer to start the scan at. Most callers should
673 * pass 0 to ensure the scan starts from the beginning of the
674 * commit buffer and does not accidentally look at message body.
675 * @return position just after the space in "committer ", so the first
676 * character of the committer's name. If no committer header can be
677 * located -1 is returned.
678 */
679 public static final int committer(final byte[] b, int ptr) {
680 final int sz = b.length;
681 if (ptr == 0)
682 ptr += 46; // skip the "tree ..." line.
683 while (ptr < sz && b[ptr] == 'p')
684 ptr += 48; // skip this parent.
685 if (ptr < sz && b[ptr] == 'a')
686 ptr = nextLF(b, ptr);
687 return match(b, ptr, committer);
688 }
689
690 /**
691 * Locate the "tagger " header line data.
692 *
693 * @param b
694 * buffer to scan.
695 * @param ptr
696 * position in buffer to start the scan at. Most callers should
697 * pass 0 to ensure the scan starts from the beginning of the tag
698 * buffer and does not accidentally look at message body.
699 * @return position just after the space in "tagger ", so the first
700 * character of the tagger's name. If no tagger header can be
701 * located -1 is returned.
702 */
703 public static final int tagger(final byte[] b, int ptr) {
704 final int sz = b.length;
705 if (ptr == 0)
706 ptr += 48; // skip the "object ..." line.
707 while (ptr < sz) {
708 if (b[ptr] == '\n')
709 return -1;
710 final int m = match(b, ptr, tagger);
711 if (m >= 0)
712 return m;
713 ptr = nextLF(b, ptr);
714 }
715 return -1;
716 }
717
718 /**
719 * Locate the "encoding " header line.
720 *
721 * @param b
722 * buffer to scan.
723 * @param ptr
724 * position in buffer to start the scan at. Most callers should
725 * pass 0 to ensure the scan starts from the beginning of the
726 * buffer and does not accidentally look at the message body.
727 * @return position just after the space in "encoding ", so the first
728 * character of the encoding's name. If no encoding header can be
729 * located -1 is returned (and UTF-8 should be assumed).
730 */
731 public static final int encoding(final byte[] b, int ptr) {
732 final int sz = b.length;
733 while (ptr < sz) {
734 if (b[ptr] == '\n')
735 return -1;
736 if (b[ptr] == 'e')
737 break;
738 ptr = nextLF(b, ptr);
739 }
740 return match(b, ptr, encoding);
741 }
742
743 /**
744 * Parse the "encoding " header as a string.
745 * <p>
746 * Locates the "encoding " header (if present) and returns its value.
747 *
748 * @param b
749 * buffer to scan.
750 * @return the encoding header as specified in the commit; null if the
751 * header was not present and should be assumed.
752 * @since 4.2
753 */
754 @Nullable
755 public static String parseEncodingName(final byte[] b) {
756 int enc = encoding(b, 0);
757 if (enc < 0) {
758 return null;
759 }
760 int lf = nextLF(b, enc);
761 return decode(UTF_8, b, enc, lf - 1);
762 }
763
764 /**
765 * Parse the "encoding " header into a character set reference.
766 * <p>
767 * Locates the "encoding " header (if present) by first calling
768 * {@link #encoding(byte[], int)} and then returns the proper character set
769 * to apply to this buffer to evaluate its contents as character data.
770 * <p>
771 * If no encoding header is present {@code UTF-8} is assumed.
772 *
773 * @param b
774 * buffer to scan.
775 * @return the Java character set representation. Never null.
776 * @throws IllegalCharsetNameException
777 * if the character set requested by the encoding header is
778 * malformed and unsupportable.
779 * @throws UnsupportedCharsetException
780 * if the JRE does not support the character set requested by
781 * the encoding header.
782 */
783 public static Charset parseEncoding(final byte[] b) {
784 String enc = parseEncodingName(b);
785 if (enc == null) {
786 return UTF_8;
787 }
788
789 String name = enc.trim();
790 try {
791 return Charset.forName(name);
792 } catch (IllegalCharsetNameException
793 | UnsupportedCharsetException badName) {
794 Charset aliased = charsetForAlias(name);
795 if (aliased != null) {
796 return aliased;
797 }
798 throw badName;
799 }
800 }
801
802 /**
803 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
804 * <p>
805 * Leading spaces won't be trimmed from the string, i.e. will show up in the
806 * parsed name afterwards.
807 *
808 * @param in
809 * the string to parse a name from.
810 * @return the parsed identity or null in case the identity could not be
811 * parsed.
812 */
813 public static PersonIdent parsePersonIdent(final String in) {
814 return parsePersonIdent(Constants.encode(in), 0);
815 }
816
817 /**
818 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
819 * <p>
820 * When passing in a value for <code>nameB</code> callers should use the
821 * return value of {@link #author(byte[], int)} or
822 * {@link #committer(byte[], int)}, as these methods provide the proper
823 * position within the buffer.
824 *
825 * @param raw
826 * the buffer to parse character data from.
827 * @param nameB
828 * first position of the identity information. This should be the
829 * first position after the space which delimits the header field
830 * name (e.g. "author" or "committer") from the rest of the
831 * identity line.
832 * @return the parsed identity or null in case the identity could not be
833 * parsed.
834 */
835 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
836 Charset cs;
837 try {
838 cs = parseEncoding(raw);
839 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
840 // Assume UTF-8 for person identities, usually this is correct.
841 // If not decode() will fall back to the ISO-8859-1 encoding.
842 cs = UTF_8;
843 }
844
845 final int emailB = nextLF(raw, nameB, '<');
846 final int emailE = nextLF(raw, emailB, '>');
847 if (emailB >= raw.length || raw[emailB] == '\n' ||
848 (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
849 return null;
850
851 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
852 emailB - 2 : emailB - 1;
853 final String name = decode(cs, raw, nameB, nameEnd);
854 final String email = decode(cs, raw, emailB, emailE - 1);
855
856 // Start searching from end of line, as after first name-email pair,
857 // another name-email pair may occur. We will ignore all kinds of
858 // "junk" following the first email.
859 //
860 // We've to use (emailE - 1) for the case that raw[email] is LF,
861 // otherwise we would run too far. "-2" is necessary to position
862 // before the LF in case of LF termination resp. the penultimate
863 // character if there is no trailing LF.
864 final int tzBegin = lastIndexOfTrim(raw, ' ',
865 nextLF(raw, emailE - 1) - 2) + 1;
866 if (tzBegin <= emailE) // No time/zone, still valid
867 return new PersonIdent(name, email, 0, 0);
868
869 final int whenBegin = Math.max(emailE,
870 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
871 if (whenBegin >= tzBegin - 1) // No time/zone, still valid
872 return new PersonIdent(name, email, 0, 0);
873
874 final long when = parseLongBase10(raw, whenBegin, null);
875 final int tz = parseTimeZoneOffset(raw, tzBegin);
876 return new PersonIdent(name, email, when * 1000L, tz);
877 }
878
879 /**
880 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
881 * <p>
882 * When passing in a value for <code>nameB</code> callers should use the
883 * return value of {@link #author(byte[], int)} or
884 * {@link #committer(byte[], int)}, as these methods provide the proper
885 * position within the buffer.
886 *
887 * @param raw
888 * the buffer to parse character data from.
889 * @param nameB
890 * first position of the identity information. This should be the
891 * first position after the space which delimits the header field
892 * name (e.g. "author" or "committer") from the rest of the
893 * identity line.
894 * @return the parsed identity. Never null.
895 */
896 public static PersonIdent parsePersonIdentOnly(final byte[] raw,
897 final int nameB) {
898 int stop = nextLF(raw, nameB);
899 int emailB = nextLF(raw, nameB, '<');
900 int emailE = nextLF(raw, emailB, '>');
901 final String name;
902 final String email;
903 if (emailE < stop) {
904 email = decode(raw, emailB, emailE - 1);
905 } else {
906 email = "invalid"; //$NON-NLS-1$
907 }
908 if (emailB < stop)
909 name = decode(raw, nameB, emailB - 2);
910 else
911 name = decode(raw, nameB, stop);
912
913 final MutableInteger ptrout = new MutableInteger();
914 long when;
915 int tz;
916 if (emailE < stop) {
917 when = parseLongBase10(raw, emailE + 1, ptrout);
918 tz = parseTimeZoneOffset(raw, ptrout.value);
919 } else {
920 when = 0;
921 tz = 0;
922 }
923 return new PersonIdent(name, email, when * 1000L, tz);
924 }
925
926 /**
927 * Locate the end of a footer line key string.
928 * <p>
929 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
930 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
931 * the first ':'.
932 * <p>
933 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
934 * then this method returns -1.
935 *
936 * @param raw
937 * buffer to scan.
938 * @param ptr
939 * first position within raw to consider as a footer line key.
940 * @return position of the ':' which terminates the footer line key if this
941 * is otherwise a valid footer line key; otherwise -1.
942 */
943 public static int endOfFooterLineKey(final byte[] raw, int ptr) {
944 try {
945 for (;;) {
946 final byte c = raw[ptr];
947 if (footerLineKeyChars[c] == 0) {
948 if (c == ':')
949 return ptr;
950 return -1;
951 }
952 ptr++;
953 }
954 } catch (ArrayIndexOutOfBoundsException e) {
955 return -1;
956 }
957 }
958
959 /**
960 * Decode a buffer under UTF-8, if possible.
961 *
962 * If the byte stream cannot be decoded that way, the platform default is tried
963 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
964 *
965 * @param buffer
966 * buffer to pull raw bytes from.
967 * @return a string representation of the range <code>[start,end)</code>,
968 * after decoding the region through the specified character set.
969 */
970 public static String decode(final byte[] buffer) {
971 return decode(buffer, 0, buffer.length);
972 }
973
974 /**
975 * Decode a buffer under UTF-8, if possible.
976 *
977 * If the byte stream cannot be decoded that way, the platform default is
978 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
979 *
980 * @param buffer
981 * buffer to pull raw bytes from.
982 * @param start
983 * start position in buffer
984 * @param end
985 * one position past the last location within the buffer to take
986 * data from.
987 * @return a string representation of the range <code>[start,end)</code>,
988 * after decoding the region through the specified character set.
989 */
990 public static String decode(final byte[] buffer, final int start,
991 final int end) {
992 return decode(UTF_8, buffer, start, end);
993 }
994
995 /**
996 * Decode a buffer under the specified character set if possible.
997 *
998 * If the byte stream cannot be decoded that way, the platform default is tried
999 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1000 *
1001 * @param cs
1002 * character set to use when decoding the buffer.
1003 * @param buffer
1004 * buffer to pull raw bytes from.
1005 * @return a string representation of the range <code>[start,end)</code>,
1006 * after decoding the region through the specified character set.
1007 */
1008 public static String decode(final Charset cs, final byte[] buffer) {
1009 return decode(cs, buffer, 0, buffer.length);
1010 }
1011
1012 /**
1013 * Decode a region of the buffer under the specified character set if possible.
1014 *
1015 * If the byte stream cannot be decoded that way, the platform default is tried
1016 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1017 *
1018 * @param cs
1019 * character set to use when decoding the buffer.
1020 * @param buffer
1021 * buffer to pull raw bytes from.
1022 * @param start
1023 * first position within the buffer to take data from.
1024 * @param end
1025 * one position past the last location within the buffer to take
1026 * data from.
1027 * @return a string representation of the range <code>[start,end)</code>,
1028 * after decoding the region through the specified character set.
1029 */
1030 public static String decode(final Charset cs, final byte[] buffer,
1031 final int start, final int end) {
1032 try {
1033 return decodeNoFallback(cs, buffer, start, end);
1034 } catch (CharacterCodingException e) {
1035 // Fall back to an ISO-8859-1 style encoding. At least all of
1036 // the bytes will be present in the output.
1037 //
1038 return extractBinaryString(buffer, start, end);
1039 }
1040 }
1041
1042 /**
1043 * Decode a region of the buffer under the specified character set if
1044 * possible.
1045 *
1046 * If the byte stream cannot be decoded that way, the platform default is
1047 * tried and if that too fails, an exception is thrown.
1048 *
1049 * @param cs
1050 * character set to use when decoding the buffer.
1051 * @param buffer
1052 * buffer to pull raw bytes from.
1053 * @param start
1054 * first position within the buffer to take data from.
1055 * @param end
1056 * one position past the last location within the buffer to take
1057 * data from.
1058 * @return a string representation of the range <code>[start,end)</code>,
1059 * after decoding the region through the specified character set.
1060 * @throws CharacterCodingException
1061 * the input is not in any of the tested character sets.
1062 */
1063 public static String decodeNoFallback(final Charset cs,
1064 final byte[] buffer, final int start, final int end)
1065 throws CharacterCodingException {
1066 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1067 b.mark();
1068
1069 // Try our built-in favorite. The assumption here is that
1070 // decoding will fail if the data is not actually encoded
1071 // using that encoder.
1072 try {
1073 return decode(b, UTF_8);
1074 } catch (CharacterCodingException e) {
1075 b.reset();
1076 }
1077
1078 if (!cs.equals(UTF_8)) {
1079 // Try the suggested encoding, it might be right since it was
1080 // provided by the caller.
1081 try {
1082 return decode(b, cs);
1083 } catch (CharacterCodingException e) {
1084 b.reset();
1085 }
1086 }
1087
1088 // Try the default character set. A small group of people
1089 // might actually use the same (or very similar) locale.
1090 Charset defcs = Charset.defaultCharset();
1091 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1092 try {
1093 return decode(b, defcs);
1094 } catch (CharacterCodingException e) {
1095 b.reset();
1096 }
1097 }
1098
1099 throw new CharacterCodingException();
1100 }
1101
1102 /**
1103 * Decode a region of the buffer under the ISO-8859-1 encoding.
1104 *
1105 * Each byte is treated as a single character in the 8859-1 character
1106 * encoding, performing a raw binary->char conversion.
1107 *
1108 * @param buffer
1109 * buffer to pull raw bytes from.
1110 * @param start
1111 * first position within the buffer to take data from.
1112 * @param end
1113 * one position past the last location within the buffer to take
1114 * data from.
1115 * @return a string representation of the range <code>[start,end)</code>.
1116 */
1117 public static String extractBinaryString(final byte[] buffer,
1118 final int start, final int end) {
1119 final StringBuilder r = new StringBuilder(end - start);
1120 for (int i = start; i < end; i++)
1121 r.append((char) (buffer[i] & 0xff));
1122 return r.toString();
1123 }
1124
1125 private static String decode(final ByteBuffer b, final Charset charset)
1126 throws CharacterCodingException {
1127 final CharsetDecoder d = charset.newDecoder();
1128 d.onMalformedInput(CodingErrorAction.REPORT);
1129 d.onUnmappableCharacter(CodingErrorAction.REPORT);
1130 return d.decode(b).toString();
1131 }
1132
1133 /**
1134 * Locate the position of the commit message body.
1135 *
1136 * @param b
1137 * buffer to scan.
1138 * @param ptr
1139 * position in buffer to start the scan at. Most callers should
1140 * pass 0 to ensure the scan starts from the beginning of the
1141 * commit buffer.
1142 * @return position of the user's message buffer.
1143 */
1144 public static final int commitMessage(final byte[] b, int ptr) {
1145 final int sz = b.length;
1146 if (ptr == 0)
1147 ptr += 46; // skip the "tree ..." line.
1148 while (ptr < sz && b[ptr] == 'p')
1149 ptr += 48; // skip this parent.
1150
1151 // Skip any remaining header lines, ignoring what their actual
1152 // header line type is. This is identical to the logic for a tag.
1153 //
1154 return tagMessage(b, ptr);
1155 }
1156
1157 /**
1158 * Locate the position of the tag message body.
1159 *
1160 * @param b
1161 * buffer to scan.
1162 * @param ptr
1163 * position in buffer to start the scan at. Most callers should
1164 * pass 0 to ensure the scan starts from the beginning of the tag
1165 * buffer.
1166 * @return position of the user's message buffer.
1167 */
1168 public static final int tagMessage(final byte[] b, int ptr) {
1169 final int sz = b.length;
1170 if (ptr == 0)
1171 ptr += 48; // skip the "object ..." line.
1172 while (ptr < sz && b[ptr] != '\n')
1173 ptr = nextLF(b, ptr);
1174 if (ptr < sz && b[ptr] == '\n')
1175 return ptr + 1;
1176 return -1;
1177 }
1178
1179 /**
1180 * Locate the end of a paragraph.
1181 * <p>
1182 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1183 *
1184 * @param b
1185 * buffer to scan.
1186 * @param start
1187 * position in buffer to start the scan at. Most callers will
1188 * want to pass the first position of the commit message (as
1189 * found by {@link #commitMessage(byte[], int)}.
1190 * @return position of the LF at the end of the paragraph;
1191 * <code>b.length</code> if no paragraph end could be located.
1192 */
1193 public static final int endOfParagraph(final byte[] b, final int start) {
1194 int ptr = start;
1195 final int sz = b.length;
1196 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1197 ptr = nextLF(b, ptr);
1198 if (ptr > start && b[ptr - 1] == '\n')
1199 ptr--;
1200 if (ptr > start && b[ptr - 1] == '\r')
1201 ptr--;
1202 return ptr;
1203 }
1204
1205 /**
1206 * @param raw
1207 * buffer to scan.
1208 * @param ch
1209 * character to find.
1210 * @param pos
1211 * starting position.
1212 * @return last index of ch in raw, trimming spaces.
1213 * @since 4.1
1214 */
1215 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1216 while (pos >= 0 && raw[pos] == ' ')
1217 pos--;
1218
1219 while (pos >= 0 && raw[pos] != ch)
1220 pos--;
1221
1222 return pos;
1223 }
1224
1225 private static Charset charsetForAlias(String name) {
1226 return encodingAliases.get(StringUtils.toLowerCase(name));
1227 }
1228
/** Private so that this static-only utility cannot be instantiated. */
private RawParseUtils() {
	// Intentionally empty: there is no instance state to set up.
}
1232 }