1 /*
2 * Copyright (C) 2008-2009, Google Inc.
3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4 * and other copyright owners as documented in the project's IP log.
5 *
6 * This program and the accompanying materials are made available
7 * under the terms of the Eclipse Distribution License v1.0 which
8 * accompanies this distribution, is reproduced below, and is
9 * available at http://www.eclipse.org/org/documents/edl-v10.php
10 *
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials provided
23 * with the distribution.
24 *
25 * - Neither the name of the Eclipse Foundation, Inc. nor the
26 * names of its contributors may be used to endorse or promote
27 * products derived from this software without specific prior
28 * written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 package org.eclipse.jgit.util;
46
47 import static java.nio.charset.StandardCharsets.ISO_8859_1;
48 import static java.nio.charset.StandardCharsets.UTF_8;
49 import static org.eclipse.jgit.lib.ObjectChecker.author;
50 import static org.eclipse.jgit.lib.ObjectChecker.committer;
51 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52 import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53
54 import java.nio.ByteBuffer;
55 import java.nio.charset.CharacterCodingException;
56 import java.nio.charset.Charset;
57 import java.nio.charset.CharsetDecoder;
58 import java.nio.charset.CodingErrorAction;
59 import java.nio.charset.IllegalCharsetNameException;
60 import java.nio.charset.UnsupportedCharsetException;
61 import java.util.Arrays;
62 import java.util.HashMap;
63 import java.util.Map;
64
65 import org.eclipse.jgit.annotations.Nullable;
66 import org.eclipse.jgit.errors.BinaryBlobException;
67 import org.eclipse.jgit.lib.Constants;
68 import org.eclipse.jgit.lib.PersonIdent;
69
70 /**
71 * Handy utility functions to parse raw object contents.
72 */
73 public final class RawParseUtils {
/**
 * UTF-8 charset constant.
 *
 * @since 2.2
 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
 */
@Deprecated
public static final Charset UTF8_CHARSET = UTF_8;

/** Maps the ASCII code of '0'..'9' to its numeric value; all other slots hold -1. */
private static final byte[] digits10;

/** Maps the ASCII code of a hex digit (either case) to 0..15; all other slots hold -1. */
private static final byte[] digits16;

/** Holds 1 at the ASCII code of every character in [A-Za-z0-9-], and 0 elsewhere. */
private static final byte[] footerLineKeyChars;

/** Alias names mapped to charsets; filled with two spellings of ISO-8859-1. */
private static final Map<String, Charset> encodingAliases;

static {
    final Map<String, Charset> aliases = new HashMap<>();
    aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
    aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
    encodingAliases = aliases;

    // Decimal table: only needs to reach up to '9'.
    final byte[] dec = new byte['9' + 1];
    Arrays.fill(dec, (byte) -1);
    for (int d = 0; d <= 9; d++) {
        dec['0' + d] = (byte) d;
    }
    digits10 = dec;

    // Hex table: 'f' is the highest code point that must be addressable.
    final byte[] hex = new byte['f' + 1];
    Arrays.fill(hex, (byte) -1);
    for (int d = 0; d <= 9; d++) {
        hex['0' + d] = (byte) d;
    }
    for (int d = 0; d < 6; d++) {
        final byte value = (byte) (10 + d);
        hex['a' + d] = value;
        hex['A' + d] = value;
    }
    digits16 = hex;

    // Footer key table: flag every character allowed in a footer key.
    final byte[] keyChars = new byte['z' + 1];
    for (int c = 0; c < keyChars.length; c++) {
        final boolean allowed = c == '-'
                || (c >= '0' && c <= '9')
                || (c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z');
        if (allowed) {
            keyChars[c] = 1;
        }
    }
    footerLineKeyChars = keyChars;
}
119
120 /**
121 * Determine if b[ptr] matches src.
122 *
123 * @param b
124 * the buffer to scan.
125 * @param ptr
126 * first position within b, this should match src[0].
127 * @param src
128 * the buffer to test for equality with b.
129 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
130 */
131 public static final int match(byte[] b, int ptr, byte[] src) {
132 if (ptr + src.length > b.length)
133 return -1;
134 for (int i = 0; i < src.length; i++, ptr++)
135 if (b[ptr] != src[i])
136 return -1;
137 return ptr;
138 }
139
140 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
141 '6', '7', '8', '9' };
142
143 /**
144 * Format a base 10 numeric into a temporary buffer.
145 * <p>
146 * Formatting is performed backwards. The method starts at offset
147 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
148 * <code>digits</code> is the number of positions necessary to store the
149 * base 10 value.
150 * <p>
151 * The argument and return values from this method make it easy to chain
152 * writing, for example:
153 * </p>
154 *
155 * <pre>
156 * final byte[] tmp = new byte[64];
157 * int ptr = tmp.length;
158 * tmp[--ptr] = '\n';
159 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
160 * tmp[--ptr] = ' ';
161 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
162 * tmp[--ptr] = 0;
163 * final String str = new String(tmp, ptr, tmp.length - ptr);
164 * </pre>
165 *
166 * @param b
167 * buffer to write into.
168 * @param o
169 * one offset past the location where writing will begin; writing
170 * proceeds towards lower index values.
171 * @param value
172 * the value to store.
173 * @return the new offset value <code>o</code>. This is the position of
174 * the last byte written. Additional writing should start at one
175 * position earlier.
176 */
177 public static int formatBase10(final byte[] b, int o, int value) {
178 if (value == 0) {
179 b[--o] = '0';
180 return o;
181 }
182 final boolean isneg = value < 0;
183 if (isneg)
184 value = -value;
185 while (value != 0) {
186 b[--o] = base10byte[value % 10];
187 value /= 10;
188 }
189 if (isneg)
190 b[--o] = '-';
191 return o;
192 }
193
194 /**
195 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
196 * <p>
197 * Digit sequences can begin with an optional run of spaces before the
198 * sequence, and may start with a '+' or a '-' to indicate sign position.
199 * Any other characters will cause the method to stop and return the current
200 * result to the caller.
201 *
202 * @param b
203 * buffer to scan.
204 * @param ptr
205 * position within buffer to start parsing digits at.
206 * @param ptrResult
207 * optional location to return the new ptr value through. If null
208 * the ptr value will be discarded.
209 * @return the value at this location; 0 if the location is not a valid
210 * numeric.
211 */
212 public static final int parseBase10(final byte[] b, int ptr,
213 final MutableInteger ptrResult) {
214 int r = 0;
215 int sign = 0;
216 try {
217 final int sz = b.length;
218 while (ptr < sz && b[ptr] == ' ')
219 ptr++;
220 if (ptr >= sz)
221 return 0;
222
223 switch (b[ptr]) {
224 case '-':
225 sign = -1;
226 ptr++;
227 break;
228 case '+':
229 ptr++;
230 break;
231 }
232
233 while (ptr < sz) {
234 final byte v = digits10[b[ptr]];
235 if (v < 0)
236 break;
237 r = (r * 10) + v;
238 ptr++;
239 }
240 } catch (ArrayIndexOutOfBoundsException e) {
241 // Not a valid digit.
242 }
243 if (ptrResult != null)
244 ptrResult.value = ptr;
245 return sign < 0 ? -r : r;
246 }
247
248 /**
249 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
250 * <p>
251 * Digit sequences can begin with an optional run of spaces before the
252 * sequence, and may start with a '+' or a '-' to indicate sign position.
253 * Any other characters will cause the method to stop and return the current
254 * result to the caller.
255 *
256 * @param b
257 * buffer to scan.
258 * @param ptr
259 * position within buffer to start parsing digits at.
260 * @param ptrResult
261 * optional location to return the new ptr value through. If null
262 * the ptr value will be discarded.
263 * @return the value at this location; 0 if the location is not a valid
264 * numeric.
265 */
266 public static final long parseLongBase10(final byte[] b, int ptr,
267 final MutableInteger ptrResult) {
268 long r = 0;
269 int sign = 0;
270 try {
271 final int sz = b.length;
272 while (ptr < sz && b[ptr] == ' ')
273 ptr++;
274 if (ptr >= sz)
275 return 0;
276
277 switch (b[ptr]) {
278 case '-':
279 sign = -1;
280 ptr++;
281 break;
282 case '+':
283 ptr++;
284 break;
285 }
286
287 while (ptr < sz) {
288 final byte v = digits10[b[ptr]];
289 if (v < 0)
290 break;
291 r = (r * 10) + v;
292 ptr++;
293 }
294 } catch (ArrayIndexOutOfBoundsException e) {
295 // Not a valid digit.
296 }
297 if (ptrResult != null)
298 ptrResult.value = ptr;
299 return sign < 0 ? -r : r;
300 }
301
302 /**
303 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
304 * <p>
305 * The number is read in network byte order, that is, most significant
306 * nybble first.
307 *
308 * @param bs
309 * buffer to parse digits from; positions {@code [p, p+4)} will
310 * be parsed.
311 * @param p
312 * first position within the buffer to parse.
313 * @return the integer value.
314 * @throws java.lang.ArrayIndexOutOfBoundsException
315 * if the string is not hex formatted.
316 */
317 public static final int parseHexInt16(final byte[] bs, final int p) {
318 int r = digits16[bs[p]] << 4;
319
320 r |= digits16[bs[p + 1]];
321 r <<= 4;
322
323 r |= digits16[bs[p + 2]];
324 r <<= 4;
325
326 r |= digits16[bs[p + 3]];
327 if (r < 0)
328 throw new ArrayIndexOutOfBoundsException();
329 return r;
330 }
331
332 /**
333 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
334 * <p>
335 * The number is read in network byte order, that is, most significant
336 * nybble first.
337 *
338 * @param bs
339 * buffer to parse digits from; positions {@code [p, p+8)} will
340 * be parsed.
341 * @param p
342 * first position within the buffer to parse.
343 * @return the integer value.
344 * @throws java.lang.ArrayIndexOutOfBoundsException
345 * if the string is not hex formatted.
346 */
347 public static final int parseHexInt32(final byte[] bs, final int p) {
348 int r = digits16[bs[p]] << 4;
349
350 r |= digits16[bs[p + 1]];
351 r <<= 4;
352
353 r |= digits16[bs[p + 2]];
354 r <<= 4;
355
356 r |= digits16[bs[p + 3]];
357 r <<= 4;
358
359 r |= digits16[bs[p + 4]];
360 r <<= 4;
361
362 r |= digits16[bs[p + 5]];
363 r <<= 4;
364
365 r |= digits16[bs[p + 6]];
366
367 final int last = digits16[bs[p + 7]];
368 if (r < 0 || last < 0)
369 throw new ArrayIndexOutOfBoundsException();
370 return (r << 4) | last;
371 }
372
373 /**
374 * Parse 16 character base 16 (hex) formatted string to unsigned long.
375 * <p>
376 * The number is read in network byte order, that is, most significant
377 * nibble first.
378 *
379 * @param bs
380 * buffer to parse digits from; positions {@code [p, p+16)} will
381 * be parsed.
382 * @param p
383 * first position within the buffer to parse.
384 * @return the integer value.
385 * @throws java.lang.ArrayIndexOutOfBoundsException
386 * if the string is not hex formatted.
387 * @since 4.3
388 */
389 public static final long parseHexInt64(final byte[] bs, final int p) {
390 long r = digits16[bs[p]] << 4;
391
392 r |= digits16[bs[p + 1]];
393 r <<= 4;
394
395 r |= digits16[bs[p + 2]];
396 r <<= 4;
397
398 r |= digits16[bs[p + 3]];
399 r <<= 4;
400
401 r |= digits16[bs[p + 4]];
402 r <<= 4;
403
404 r |= digits16[bs[p + 5]];
405 r <<= 4;
406
407 r |= digits16[bs[p + 6]];
408 r <<= 4;
409
410 r |= digits16[bs[p + 7]];
411 r <<= 4;
412
413 r |= digits16[bs[p + 8]];
414 r <<= 4;
415
416 r |= digits16[bs[p + 9]];
417 r <<= 4;
418
419 r |= digits16[bs[p + 10]];
420 r <<= 4;
421
422 r |= digits16[bs[p + 11]];
423 r <<= 4;
424
425 r |= digits16[bs[p + 12]];
426 r <<= 4;
427
428 r |= digits16[bs[p + 13]];
429 r <<= 4;
430
431 r |= digits16[bs[p + 14]];
432
433 final int last = digits16[bs[p + 15]];
434 if (r < 0 || last < 0)
435 throw new ArrayIndexOutOfBoundsException();
436 return (r << 4) | last;
437 }
438
439 /**
440 * Parse a single hex digit to its numeric value (0-15).
441 *
442 * @param digit
443 * hex character to parse.
444 * @return numeric value, in the range 0-15.
445 * @throws java.lang.ArrayIndexOutOfBoundsException
446 * if the input digit is not a valid hex digit.
447 */
448 public static final int parseHexInt4(final byte digit) {
449 final byte r = digits16[digit];
450 if (r < 0)
451 throw new ArrayIndexOutOfBoundsException();
452 return r;
453 }
454
455 /**
456 * Parse a Git style timezone string.
457 * <p>
458 * The sequence "-0315" will be parsed as the numeric value -195, as the
459 * lower two positions count minutes, not 100ths of an hour.
460 *
461 * @param b
462 * buffer to scan.
463 * @param ptr
464 * position within buffer to start parsing digits at.
465 * @return the timezone at this location, expressed in minutes.
466 */
467 public static final int parseTimeZoneOffset(byte[] b, int ptr) {
468 return parseTimeZoneOffset(b, ptr, null);
469 }
470
471 /**
472 * Parse a Git style timezone string.
473 * <p>
474 * The sequence "-0315" will be parsed as the numeric value -195, as the
475 * lower two positions count minutes, not 100ths of an hour.
476 *
477 * @param b
478 * buffer to scan.
479 * @param ptr
480 * position within buffer to start parsing digits at.
481 * @param ptrResult
482 * optional location to return the new ptr value through. If null
483 * the ptr value will be discarded.
484 * @return the timezone at this location, expressed in minutes.
485 * @since 4.1
486 */
487 public static final int parseTimeZoneOffset(final byte[] b, int ptr,
488 MutableInteger ptrResult) {
489 final int v = parseBase10(b, ptr, ptrResult);
490 final int tzMins = v % 100;
491 final int tzHours = v / 100;
492 return tzHours * 60 + tzMins;
493 }
494
495 /**
496 * Locate the first position after a given character.
497 *
498 * @param b
499 * buffer to scan.
500 * @param ptr
501 * position within buffer to start looking for chrA at.
502 * @param chrA
503 * character to find.
504 * @return new position just after chrA.
505 */
506 public static final int next(byte[] b, int ptr, char chrA) {
507 final int sz = b.length;
508 while (ptr < sz) {
509 if (b[ptr++] == chrA)
510 return ptr;
511 }
512 return ptr;
513 }
514
515 /**
516 * Locate the first position after the next LF.
517 * <p>
518 * This method stops on the first '\n' it finds.
519 *
520 * @param b
521 * buffer to scan.
522 * @param ptr
523 * position within buffer to start looking for LF at.
524 * @return new position just after the first LF found.
525 */
526 public static final int nextLF(byte[] b, int ptr) {
527 return next(b, ptr, '\n');
528 }
529
530 /**
531 * Locate the first position after either the given character or LF.
532 * <p>
533 * This method stops on the first match it finds from either chrA or '\n'.
534 *
535 * @param b
536 * buffer to scan.
537 * @param ptr
538 * position within buffer to start looking for chrA or LF at.
539 * @param chrA
540 * character to find.
541 * @return new position just after the first chrA or LF to be found.
542 */
543 public static final int nextLF(byte[] b, int ptr, char chrA) {
544 final int sz = b.length;
545 while (ptr < sz) {
546 final byte c = b[ptr++];
547 if (c == chrA || c == '\n')
548 return ptr;
549 }
550 return ptr;
551 }
552
553 /**
554 * Locate the end of the header. Note that headers may be
555 * more than one line long.
556 * @param b
557 * buffer to scan.
558 * @param ptr
559 * position within buffer to start looking for the end-of-header.
560 * @return new position just after the header. This is either
561 * b.length, or the index of the header's terminating newline.
562 * @since 5.1
563 */
564 public static final int headerEnd(final byte[] b, int ptr) {
565 final int sz = b.length;
566 while (ptr < sz) {
567 final byte c = b[ptr++];
568 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
569 return ptr - 1;
570 }
571 }
572 return ptr - 1;
573 }
574
575 /**
576 * Find the start of the contents of a given header.
577 *
578 * @param b
579 * buffer to scan.
580 * @param headerName
581 * header to search for
582 * @param ptr
583 * position within buffer to start looking for header at.
584 * @return new position at the start of the header's contents, -1 for
585 * not found
586 * @since 5.1
587 */
588 public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
589 // Start by advancing to just past a LF or buffer start
590 if (ptr != 0) {
591 ptr = nextLF(b, ptr - 1);
592 }
593 while (ptr < b.length - (headerName.length + 1)) {
594 boolean found = true;
595 for (int i = 0; i < headerName.length; i++) {
596 if (headerName[i] != b[ptr++]) {
597 found = false;
598 break;
599 }
600 }
601 if (found && b[ptr++] == ' ') {
602 return ptr;
603 }
604 ptr = nextLF(b, ptr);
605 }
606 return -1;
607 }
608
609 /**
610 * Locate the first position before a given character.
611 *
612 * @param b
613 * buffer to scan.
614 * @param ptr
615 * position within buffer to start looking for chrA at.
616 * @param chrA
617 * character to find.
618 * @return new position just before chrA, -1 for not found
619 */
620 public static final int prev(byte[] b, int ptr, char chrA) {
621 if (ptr == b.length)
622 --ptr;
623 while (ptr >= 0) {
624 if (b[ptr--] == chrA)
625 return ptr;
626 }
627 return ptr;
628 }
629
630 /**
631 * Locate the first position before the previous LF.
632 * <p>
633 * This method stops on the first '\n' it finds.
634 *
635 * @param b
636 * buffer to scan.
637 * @param ptr
638 * position within buffer to start looking for LF at.
639 * @return new position just before the first LF found, -1 for not found
640 */
641 public static final int prevLF(byte[] b, int ptr) {
642 return prev(b, ptr, '\n');
643 }
644
645 /**
646 * Locate the previous position before either the given character or LF.
647 * <p>
648 * This method stops on the first match it finds from either chrA or '\n'.
649 *
650 * @param b
651 * buffer to scan.
652 * @param ptr
653 * position within buffer to start looking for chrA or LF at.
654 * @param chrA
655 * character to find.
656 * @return new position just before the first chrA or LF to be found, -1 for
657 * not found
658 */
659 public static final int prevLF(byte[] b, int ptr, char chrA) {
660 if (ptr == b.length)
661 --ptr;
662 while (ptr >= 0) {
663 final byte c = b[ptr--];
664 if (c == chrA || c == '\n')
665 return ptr;
666 }
667 return ptr;
668 }
669
670 /**
671 * Index the region between <code>[ptr, end)</code> to find line starts.
672 * <p>
673 * The returned list is 1 indexed. Index 0 contains
674 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
675 * <p>
676 * Using a 1 indexed list means that line numbers can be directly accessed
677 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
678 * <code>ptr</code>.
679 * <p>
680 * The last element (index <code>map.size()-1</code>) always contains
681 * <code>end</code>.
682 *
683 * @param buf
684 * buffer to scan.
685 * @param ptr
686 * position within the buffer corresponding to the first byte of
687 * line 1.
688 * @param end
689 * 1 past the end of the content within <code>buf</code>.
690 * @return a line map indicating the starting position of each line.
691 */
692 public static final IntList lineMap(byte[] buf, int ptr, int end) {
693 IntList map = new IntList((end - ptr) / 36);
694 map.fillTo(1, Integer.MIN_VALUE);
695 for (; ptr < end; ptr = nextLF(buf, ptr)) {
696 map.add(ptr);
697 }
698 map.add(end);
699 return map;
700 }
701
702 /**
703 * Like {@link #lineMap(byte[], int, int)} but throw
704 * {@link BinaryBlobException} if a NUL byte is encountered.
705 *
706 * @param buf
707 * buffer to scan.
708 * @param ptr
709 * position within the buffer corresponding to the first byte of
710 * line 1.
711 * @param end
712 * 1 past the end of the content within <code>buf</code>.
713 * @return a line map indicating the starting position of each line.
714 * @throws BinaryBlobException
715 * if a NUL byte is found.
716 * @since 5.0
717 */
718 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
719 throws BinaryBlobException {
720 IntList map = lineMapOrNull(buf, ptr, end);
721 if (map == null) {
722 throw new BinaryBlobException();
723 }
724 return map;
725 }
726
727 @Nullable
728 private static IntList lineMapOrNull(byte[] buf, int ptr, int end) {
729 // Experimentally derived from multiple source repositories
730 // the average number of bytes/line is 36. Its a rough guess
731 // to initially size our map close to the target.
732 IntList map = new IntList((end - ptr) / 36);
733 map.add(Integer.MIN_VALUE);
734 boolean foundLF = true;
735 for (; ptr < end; ptr++) {
736 if (foundLF) {
737 map.add(ptr);
738 }
739
740 if (buf[ptr] == '\0') {
741 return null;
742 }
743
744 foundLF = (buf[ptr] == '\n');
745 }
746 map.add(end);
747 return map;
748 }
749
750 /**
751 * Locate the "author " header line data.
752 *
753 * @param b
754 * buffer to scan.
755 * @param ptr
756 * position in buffer to start the scan at. Most callers should
757 * pass 0 to ensure the scan starts from the beginning of the
758 * commit buffer and does not accidentally look at message body.
759 * @return position just after the space in "author ", so the first
760 * character of the author's name. If no author header can be
761 * located -1 is returned.
762 */
763 public static final int author(byte[] b, int ptr) {
764 final int sz = b.length;
765 if (ptr == 0)
766 ptr += 46; // skip the "tree ..." line.
767 while (ptr < sz && b[ptr] == 'p')
768 ptr += 48; // skip this parent.
769 return match(b, ptr, author);
770 }
771
772 /**
773 * Locate the "committer " header line data.
774 *
775 * @param b
776 * buffer to scan.
777 * @param ptr
778 * position in buffer to start the scan at. Most callers should
779 * pass 0 to ensure the scan starts from the beginning of the
780 * commit buffer and does not accidentally look at message body.
781 * @return position just after the space in "committer ", so the first
782 * character of the committer's name. If no committer header can be
783 * located -1 is returned.
784 */
785 public static final int committer(byte[] b, int ptr) {
786 final int sz = b.length;
787 if (ptr == 0)
788 ptr += 46; // skip the "tree ..." line.
789 while (ptr < sz && b[ptr] == 'p')
790 ptr += 48; // skip this parent.
791 if (ptr < sz && b[ptr] == 'a')
792 ptr = nextLF(b, ptr);
793 return match(b, ptr, committer);
794 }
795
796 /**
797 * Locate the "tagger " header line data.
798 *
799 * @param b
800 * buffer to scan.
801 * @param ptr
802 * position in buffer to start the scan at. Most callers should
803 * pass 0 to ensure the scan starts from the beginning of the tag
804 * buffer and does not accidentally look at message body.
805 * @return position just after the space in "tagger ", so the first
806 * character of the tagger's name. If no tagger header can be
807 * located -1 is returned.
808 */
809 public static final int tagger(byte[] b, int ptr) {
810 final int sz = b.length;
811 if (ptr == 0)
812 ptr += 48; // skip the "object ..." line.
813 while (ptr < sz) {
814 if (b[ptr] == '\n')
815 return -1;
816 final int m = match(b, ptr, tagger);
817 if (m >= 0)
818 return m;
819 ptr = nextLF(b, ptr);
820 }
821 return -1;
822 }
823
824 /**
825 * Locate the "encoding " header line.
826 *
827 * @param b
828 * buffer to scan.
829 * @param ptr
830 * position in buffer to start the scan at. Most callers should
831 * pass 0 to ensure the scan starts from the beginning of the
832 * buffer and does not accidentally look at the message body.
833 * @return position just after the space in "encoding ", so the first
834 * character of the encoding's name. If no encoding header can be
835 * located -1 is returned (and UTF-8 should be assumed).
836 */
837 public static final int encoding(byte[] b, int ptr) {
838 final int sz = b.length;
839 while (ptr < sz) {
840 if (b[ptr] == '\n')
841 return -1;
842 if (b[ptr] == 'e')
843 break;
844 ptr = nextLF(b, ptr);
845 }
846 return match(b, ptr, encoding);
847 }
848
849 /**
850 * Parse the "encoding " header as a string.
851 * <p>
852 * Locates the "encoding " header (if present) and returns its value.
853 *
854 * @param b
855 * buffer to scan.
856 * @return the encoding header as specified in the commit; null if the
857 * header was not present and should be assumed.
858 * @since 4.2
859 */
860 @Nullable
861 public static String parseEncodingName(byte[] b) {
862 int enc = encoding(b, 0);
863 if (enc < 0) {
864 return null;
865 }
866 int lf = nextLF(b, enc);
867 return decode(UTF_8, b, enc, lf - 1);
868 }
869
870 /**
871 * Parse the "encoding " header into a character set reference.
872 * <p>
873 * Locates the "encoding " header (if present) by first calling
874 * {@link #encoding(byte[], int)} and then returns the proper character set
875 * to apply to this buffer to evaluate its contents as character data.
876 * <p>
877 * If no encoding header is present {@code UTF-8} is assumed.
878 *
879 * @param b
880 * buffer to scan.
881 * @return the Java character set representation. Never null.
882 * @throws IllegalCharsetNameException
883 * if the character set requested by the encoding header is
884 * malformed and unsupportable.
885 * @throws UnsupportedCharsetException
886 * if the JRE does not support the character set requested by
887 * the encoding header.
888 */
889 public static Charset parseEncoding(byte[] b) {
890 String enc = parseEncodingName(b);
891 if (enc == null) {
892 return UTF_8;
893 }
894
895 String name = enc.trim();
896 try {
897 return Charset.forName(name);
898 } catch (IllegalCharsetNameException
899 | UnsupportedCharsetException badName) {
900 Charset aliased = charsetForAlias(name);
901 if (aliased != null) {
902 return aliased;
903 }
904 throw badName;
905 }
906 }
907
908 /**
909 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
910 * <p>
911 * Leading spaces won't be trimmed from the string, i.e. will show up in the
912 * parsed name afterwards.
913 *
914 * @param in
915 * the string to parse a name from.
916 * @return the parsed identity or null in case the identity could not be
917 * parsed.
918 */
919 public static PersonIdent parsePersonIdent(String in) {
920 return parsePersonIdent(Constants.encode(in), 0);
921 }
922
923 /**
924 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
925 * <p>
926 * When passing in a value for <code>nameB</code> callers should use the
927 * return value of {@link #author(byte[], int)} or
928 * {@link #committer(byte[], int)}, as these methods provide the proper
929 * position within the buffer.
930 *
931 * @param raw
932 * the buffer to parse character data from.
933 * @param nameB
934 * first position of the identity information. This should be the
935 * first position after the space which delimits the header field
936 * name (e.g. "author" or "committer") from the rest of the
937 * identity line.
938 * @return the parsed identity or null in case the identity could not be
939 * parsed.
940 */
941 public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
942 Charset cs;
943 try {
944 cs = parseEncoding(raw);
945 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
946 // Assume UTF-8 for person identities, usually this is correct.
947 // If not decode() will fall back to the ISO-8859-1 encoding.
948 cs = UTF_8;
949 }
950
951 final int emailB = nextLF(raw, nameB, '<');
952 final int emailE = nextLF(raw, emailB, '>');
953 if (emailB >= raw.length || raw[emailB] == '\n' ||
954 (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
955 return null;
956
957 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
958 emailB - 2 : emailB - 1;
959 final String name = decode(cs, raw, nameB, nameEnd);
960 final String email = decode(cs, raw, emailB, emailE - 1);
961
962 // Start searching from end of line, as after first name-email pair,
963 // another name-email pair may occur. We will ignore all kinds of
964 // "junk" following the first email.
965 //
966 // We've to use (emailE - 1) for the case that raw[email] is LF,
967 // otherwise we would run too far. "-2" is necessary to position
968 // before the LF in case of LF termination resp. the penultimate
969 // character if there is no trailing LF.
970 final int tzBegin = lastIndexOfTrim(raw, ' ',
971 nextLF(raw, emailE - 1) - 2) + 1;
972 if (tzBegin <= emailE) // No time/zone, still valid
973 return new PersonIdent(name, email, 0, 0);
974
975 final int whenBegin = Math.max(emailE,
976 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
977 if (whenBegin >= tzBegin - 1) // No time/zone, still valid
978 return new PersonIdent(name, email, 0, 0);
979
980 final long when = parseLongBase10(raw, whenBegin, null);
981 final int tz = parseTimeZoneOffset(raw, tzBegin);
982 return new PersonIdent(name, email, when * 1000L, tz);
983 }
984
985 /**
986 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
987 * <p>
988 * When passing in a value for <code>nameB</code> callers should use the
989 * return value of {@link #author(byte[], int)} or
990 * {@link #committer(byte[], int)}, as these methods provide the proper
991 * position within the buffer.
992 *
993 * @param raw
994 * the buffer to parse character data from.
995 * @param nameB
996 * first position of the identity information. This should be the
997 * first position after the space which delimits the header field
998 * name (e.g. "author" or "committer") from the rest of the
999 * identity line.
1000 * @return the parsed identity. Never null.
1001 */
1002 public static PersonIdent parsePersonIdentOnly(final byte[] raw,
1003 final int nameB) {
1004 int stop = nextLF(raw, nameB);
1005 int emailB = nextLF(raw, nameB, '<');
1006 int emailE = nextLF(raw, emailB, '>');
1007 final String name;
1008 final String email;
1009 if (emailE < stop) {
1010 email = decode(raw, emailB, emailE - 1);
1011 } else {
1012 email = "invalid"; //$NON-NLS-1$
1013 }
1014 if (emailB < stop)
1015 name = decode(raw, nameB, emailB - 2);
1016 else
1017 name = decode(raw, nameB, stop);
1018
1019 final MutableInteger.html#MutableInteger">MutableInteger ptrout = new MutableInteger();
1020 long when;
1021 int tz;
1022 if (emailE < stop) {
1023 when = parseLongBase10(raw, emailE + 1, ptrout);
1024 tz = parseTimeZoneOffset(raw, ptrout.value);
1025 } else {
1026 when = 0;
1027 tz = 0;
1028 }
1029 return new PersonIdent(name, email, when * 1000L, tz);
1030 }
1031
1032 /**
1033 * Locate the end of a footer line key string.
1034 * <p>
1035 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
1036 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
1037 * the first ':'.
1038 * <p>
1039 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1040 * then this method returns -1.
1041 *
1042 * @param raw
1043 * buffer to scan.
1044 * @param ptr
1045 * first position within raw to consider as a footer line key.
1046 * @return position of the ':' which terminates the footer line key if this
1047 * is otherwise a valid footer line key; otherwise -1.
1048 */
1049 public static int endOfFooterLineKey(byte[] raw, int ptr) {
1050 try {
1051 for (;;) {
1052 final byte c = raw[ptr];
1053 if (footerLineKeyChars[c] == 0) {
1054 if (c == ':')
1055 return ptr;
1056 return -1;
1057 }
1058 ptr++;
1059 }
1060 } catch (ArrayIndexOutOfBoundsException e) {
1061 return -1;
1062 }
1063 }
1064
1065 /**
1066 * Decode a buffer under UTF-8, if possible.
1067 *
1068 * If the byte stream cannot be decoded that way, the platform default is tried
1069 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1070 *
1071 * @param buffer
1072 * buffer to pull raw bytes from.
1073 * @return a string representation of the range <code>[start,end)</code>,
1074 * after decoding the region through the specified character set.
1075 */
1076 public static String decode(byte[] buffer) {
1077 return decode(buffer, 0, buffer.length);
1078 }
1079
1080 /**
1081 * Decode a buffer under UTF-8, if possible.
1082 *
1083 * If the byte stream cannot be decoded that way, the platform default is
1084 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1085 *
1086 * @param buffer
1087 * buffer to pull raw bytes from.
1088 * @param start
1089 * start position in buffer
1090 * @param end
1091 * one position past the last location within the buffer to take
1092 * data from.
1093 * @return a string representation of the range <code>[start,end)</code>,
1094 * after decoding the region through the specified character set.
1095 */
1096 public static String decode(final byte[] buffer, final int start,
1097 final int end) {
1098 return decode(UTF_8, buffer, start, end);
1099 }
1100
1101 /**
1102 * Decode a buffer under the specified character set if possible.
1103 *
1104 * If the byte stream cannot be decoded that way, the platform default is tried
1105 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1106 *
1107 * @param cs
1108 * character set to use when decoding the buffer.
1109 * @param buffer
1110 * buffer to pull raw bytes from.
1111 * @return a string representation of the range <code>[start,end)</code>,
1112 * after decoding the region through the specified character set.
1113 */
1114 public static String decode(Charset cs, byte[] buffer) {
1115 return decode(cs, buffer, 0, buffer.length);
1116 }
1117
1118 /**
1119 * Decode a region of the buffer under the specified character set if possible.
1120 *
1121 * If the byte stream cannot be decoded that way, the platform default is tried
1122 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1123 *
1124 * @param cs
1125 * character set to use when decoding the buffer.
1126 * @param buffer
1127 * buffer to pull raw bytes from.
1128 * @param start
1129 * first position within the buffer to take data from.
1130 * @param end
1131 * one position past the last location within the buffer to take
1132 * data from.
1133 * @return a string representation of the range <code>[start,end)</code>,
1134 * after decoding the region through the specified character set.
1135 */
1136 public static String decode(final Charset cs, final byte[] buffer,
1137 final int start, final int end) {
1138 try {
1139 return decodeNoFallback(cs, buffer, start, end);
1140 } catch (CharacterCodingException e) {
1141 // Fall back to an ISO-8859-1 style encoding. At least all of
1142 // the bytes will be present in the output.
1143 //
1144 return extractBinaryString(buffer, start, end);
1145 }
1146 }
1147
1148 /**
1149 * Decode a region of the buffer under the specified character set if
1150 * possible.
1151 *
1152 * If the byte stream cannot be decoded that way, the platform default is
1153 * tried and if that too fails, an exception is thrown.
1154 *
1155 * @param cs
1156 * character set to use when decoding the buffer.
1157 * @param buffer
1158 * buffer to pull raw bytes from.
1159 * @param start
1160 * first position within the buffer to take data from.
1161 * @param end
1162 * one position past the last location within the buffer to take
1163 * data from.
1164 * @return a string representation of the range <code>[start,end)</code>,
1165 * after decoding the region through the specified character set.
1166 * @throws java.nio.charset.CharacterCodingException
1167 * the input is not in any of the tested character sets.
1168 */
1169 public static String decodeNoFallback(final Charset cs,
1170 final byte[] buffer, final int start, final int end)
1171 throws CharacterCodingException {
1172 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1173 b.mark();
1174
1175 // Try our built-in favorite. The assumption here is that
1176 // decoding will fail if the data is not actually encoded
1177 // using that encoder.
1178 try {
1179 return decode(b, UTF_8);
1180 } catch (CharacterCodingException e) {
1181 b.reset();
1182 }
1183
1184 if (!cs.equals(UTF_8)) {
1185 // Try the suggested encoding, it might be right since it was
1186 // provided by the caller.
1187 try {
1188 return decode(b, cs);
1189 } catch (CharacterCodingException e) {
1190 b.reset();
1191 }
1192 }
1193
1194 // Try the default character set. A small group of people
1195 // might actually use the same (or very similar) locale.
1196 Charset defcs = Charset.defaultCharset();
1197 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1198 try {
1199 return decode(b, defcs);
1200 } catch (CharacterCodingException e) {
1201 b.reset();
1202 }
1203 }
1204
1205 throw new CharacterCodingException();
1206 }
1207
1208 /**
1209 * Decode a region of the buffer under the ISO-8859-1 encoding.
1210 *
1211 * Each byte is treated as a single character in the 8859-1 character
1212 * encoding, performing a raw binary->char conversion.
1213 *
1214 * @param buffer
1215 * buffer to pull raw bytes from.
1216 * @param start
1217 * first position within the buffer to take data from.
1218 * @param end
1219 * one position past the last location within the buffer to take
1220 * data from.
1221 * @return a string representation of the range <code>[start,end)</code>.
1222 */
1223 public static String extractBinaryString(final byte[] buffer,
1224 final int start, final int end) {
1225 final StringBuilder r = new StringBuilder(end - start);
1226 for (int i = start; i < end; i++)
1227 r.append((char) (buffer[i] & 0xff));
1228 return r.toString();
1229 }
1230
1231 private static String decode(ByteBuffer b, Charset charset)
1232 throws CharacterCodingException {
1233 final CharsetDecoder d = charset.newDecoder();
1234 d.onMalformedInput(CodingErrorAction.REPORT);
1235 d.onUnmappableCharacter(CodingErrorAction.REPORT);
1236 return d.decode(b).toString();
1237 }
1238
1239 /**
1240 * Locate the position of the commit message body.
1241 *
1242 * @param b
1243 * buffer to scan.
1244 * @param ptr
1245 * position in buffer to start the scan at. Most callers should
1246 * pass 0 to ensure the scan starts from the beginning of the
1247 * commit buffer.
1248 * @return position of the user's message buffer.
1249 */
1250 public static final int commitMessage(byte[] b, int ptr) {
1251 final int sz = b.length;
1252 if (ptr == 0)
1253 ptr += 46; // skip the "tree ..." line.
1254 while (ptr < sz && b[ptr] == 'p')
1255 ptr += 48; // skip this parent.
1256
1257 // Skip any remaining header lines, ignoring what their actual
1258 // header line type is. This is identical to the logic for a tag.
1259 //
1260 return tagMessage(b, ptr);
1261 }
1262
1263 /**
1264 * Locate the position of the tag message body.
1265 *
1266 * @param b
1267 * buffer to scan.
1268 * @param ptr
1269 * position in buffer to start the scan at. Most callers should
1270 * pass 0 to ensure the scan starts from the beginning of the tag
1271 * buffer.
1272 * @return position of the user's message buffer.
1273 */
1274 public static final int tagMessage(byte[] b, int ptr) {
1275 final int sz = b.length;
1276 if (ptr == 0)
1277 ptr += 48; // skip the "object ..." line.
1278 while (ptr < sz && b[ptr] != '\n')
1279 ptr = nextLF(b, ptr);
1280 if (ptr < sz && b[ptr] == '\n')
1281 return ptr + 1;
1282 return -1;
1283 }
1284
1285 /**
1286 * Locate the end of a paragraph.
1287 * <p>
1288 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1289 *
1290 * @param b
1291 * buffer to scan.
1292 * @param start
1293 * position in buffer to start the scan at. Most callers will
1294 * want to pass the first position of the commit message (as
1295 * found by {@link #commitMessage(byte[], int)}.
1296 * @return position of the LF at the end of the paragraph;
1297 * <code>b.length</code> if no paragraph end could be located.
1298 */
1299 public static final int endOfParagraph(byte[] b, int start) {
1300 int ptr = start;
1301 final int sz = b.length;
1302 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1303 ptr = nextLF(b, ptr);
1304 if (ptr > start && b[ptr - 1] == '\n')
1305 ptr--;
1306 if (ptr > start && b[ptr - 1] == '\r')
1307 ptr--;
1308 return ptr;
1309 }
1310
1311 /**
1312 * Get last index of {@code ch} in raw, trimming spaces.
1313 *
1314 * @param raw
1315 * buffer to scan.
1316 * @param ch
1317 * character to find.
1318 * @param pos
1319 * starting position.
1320 * @return last index of {@code ch} in raw, trimming spaces.
1321 * @since 4.1
1322 */
1323 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1324 while (pos >= 0 && raw[pos] == ' ')
1325 pos--;
1326
1327 while (pos >= 0 && raw[pos] != ch)
1328 pos--;
1329
1330 return pos;
1331 }
1332
1333 private static Charset charsetForAlias(String name) {
1334 return encodingAliases.get(StringUtils.toLowerCase(name));
1335 }
1336
/** Private so that this static-only utility is never instantiated. */
private RawParseUtils() {
	// Intentionally empty.
}
1340 }