1 /*
2 * Copyright (C) 2008-2009, Google Inc.
3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4 * and other copyright owners as documented in the project's IP log.
5 *
6 * This program and the accompanying materials are made available
7 * under the terms of the Eclipse Distribution License v1.0 which
8 * accompanies this distribution, is reproduced below, and is
9 * available at http://www.eclipse.org/org/documents/edl-v10.php
10 *
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or
14 * without modification, are permitted provided that the following
15 * conditions are met:
16 *
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials provided
23 * with the distribution.
24 *
25 * - Neither the name of the Eclipse Foundation, Inc. nor the
26 * names of its contributors may be used to endorse or promote
27 * products derived from this software without specific prior
28 * written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 package org.eclipse.jgit.util;
46
47 import static org.eclipse.jgit.lib.ObjectChecker.author;
48 import static org.eclipse.jgit.lib.ObjectChecker.committer;
49 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
50 import static org.eclipse.jgit.lib.ObjectChecker.tagger;
51
52 import java.nio.ByteBuffer;
53 import java.nio.charset.CharacterCodingException;
54 import java.nio.charset.Charset;
55 import java.nio.charset.CharsetDecoder;
56 import java.nio.charset.CodingErrorAction;
57 import java.nio.charset.IllegalCharsetNameException;
58 import java.nio.charset.UnsupportedCharsetException;
59 import java.util.Arrays;
60 import java.util.HashMap;
61 import java.util.Map;
62
63 import org.eclipse.jgit.lib.Constants;
64 import org.eclipse.jgit.lib.PersonIdent;
65
66 /** Handy utility functions to parse raw object contents. */
67 public final class RawParseUtils {
/**
 * UTF-8 charset constant.
 *
 * @since 2.2
 */
public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$

/** Value of each decimal digit, indexed by its ASCII code; -1 elsewhere. */
private static final byte[] digits10;

/** Value of each hex digit (either case), indexed by its ASCII code; -1 elsewhere. */
private static final byte[] digits16;

/** Holds 1 at the ASCII code of every byte allowed in a footer line key. */
private static final byte[] footerLineKeyChars;

/** Alternate encoding names (in lower case) and the charset each stands for. */
private static final Map<String, Charset> encodingAliases;

static {
	final Map<String, Charset> aliases = new HashMap<String, Charset>();
	aliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
	encodingAliases = aliases;

	// Decimal table: only '0'..'9' map to a value.
	final byte[] dec = new byte['9' + 1];
	Arrays.fill(dec, (byte) -1);
	for (int c = '0'; c <= '9'; c++)
		dec[c] = (byte) (c - '0');
	digits10 = dec;

	// Hex table: '0'..'9', 'a'..'f' and 'A'..'F' map to 0..15.
	final byte[] hex = new byte['f' + 1];
	Arrays.fill(hex, (byte) -1);
	for (int c = '0'; c <= '9'; c++)
		hex[c] = (byte) (c - '0');
	for (int n = 0; n < 6; n++) {
		hex['a' + n] = (byte) (10 + n);
		hex['A' + n] = (byte) (10 + n);
	}
	digits16 = hex;

	// Footer keys consist of '-', digits and ASCII letters.
	final byte[] keyChars = new byte['z' + 1];
	keyChars['-'] = 1;
	for (int c = '0'; c <= '9'; c++)
		keyChars[c] = 1;
	for (int n = 0; n < 26; n++) {
		keyChars['A' + n] = 1;
		keyChars['a' + n] = 1;
	}
	footerLineKeyChars = keyChars;
}
110
111 /**
112 * Determine if b[ptr] matches src.
113 *
114 * @param b
115 * the buffer to scan.
116 * @param ptr
117 * first position within b, this should match src[0].
118 * @param src
119 * the buffer to test for equality with b.
120 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
121 */
122 public static final int match(final byte[] b, int ptr, final byte[] src) {
123 if (ptr + src.length > b.length)
124 return -1;
125 for (int i = 0; i < src.length; i++, ptr++)
126 if (b[ptr] != src[i])
127 return -1;
128 return ptr;
129 }
130
131 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
132 '6', '7', '8', '9' };
133
134 /**
135 * Format a base 10 numeric into a temporary buffer.
136 * <p>
137 * Formatting is performed backwards. The method starts at offset
138 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
139 * <code>digits</code> is the number of positions necessary to store the
140 * base 10 value.
141 * <p>
142 * The argument and return values from this method make it easy to chain
143 * writing, for example:
144 * </p>
145 *
146 * <pre>
147 * final byte[] tmp = new byte[64];
148 * int ptr = tmp.length;
149 * tmp[--ptr] = '\n';
150 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
151 * tmp[--ptr] = ' ';
152 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
153 * tmp[--ptr] = 0;
154 * final String str = new String(tmp, ptr, tmp.length - ptr);
155 * </pre>
156 *
157 * @param b
158 * buffer to write into.
159 * @param o
160 * one offset past the location where writing will begin; writing
161 * proceeds towards lower index values.
162 * @param value
163 * the value to store.
164 * @return the new offset value <code>o</code>. This is the position of
165 * the last byte written. Additional writing should start at one
166 * position earlier.
167 */
168 public static int formatBase10(final byte[] b, int o, int value) {
169 if (value == 0) {
170 b[--o] = '0';
171 return o;
172 }
173 final boolean isneg = value < 0;
174 if (isneg)
175 value = -value;
176 while (value != 0) {
177 b[--o] = base10byte[value % 10];
178 value /= 10;
179 }
180 if (isneg)
181 b[--o] = '-';
182 return o;
183 }
184
185 /**
186 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
187 * <p>
188 * Digit sequences can begin with an optional run of spaces before the
189 * sequence, and may start with a '+' or a '-' to indicate sign position.
190 * Any other characters will cause the method to stop and return the current
191 * result to the caller.
192 *
193 * @param b
194 * buffer to scan.
195 * @param ptr
196 * position within buffer to start parsing digits at.
197 * @param ptrResult
198 * optional location to return the new ptr value through. If null
199 * the ptr value will be discarded.
200 * @return the value at this location; 0 if the location is not a valid
201 * numeric.
202 */
203 public static final int parseBase10(final byte[] b, int ptr,
204 final MutableInteger ptrResult) {
205 int r = 0;
206 int sign = 0;
207 try {
208 final int sz = b.length;
209 while (ptr < sz && b[ptr] == ' ')
210 ptr++;
211 if (ptr >= sz)
212 return 0;
213
214 switch (b[ptr]) {
215 case '-':
216 sign = -1;
217 ptr++;
218 break;
219 case '+':
220 ptr++;
221 break;
222 }
223
224 while (ptr < sz) {
225 final byte v = digits10[b[ptr]];
226 if (v < 0)
227 break;
228 r = (r * 10) + v;
229 ptr++;
230 }
231 } catch (ArrayIndexOutOfBoundsException e) {
232 // Not a valid digit.
233 }
234 if (ptrResult != null)
235 ptrResult.value = ptr;
236 return sign < 0 ? -r : r;
237 }
238
239 /**
240 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
241 * <p>
242 * Digit sequences can begin with an optional run of spaces before the
243 * sequence, and may start with a '+' or a '-' to indicate sign position.
244 * Any other characters will cause the method to stop and return the current
245 * result to the caller.
246 *
247 * @param b
248 * buffer to scan.
249 * @param ptr
250 * position within buffer to start parsing digits at.
251 * @param ptrResult
252 * optional location to return the new ptr value through. If null
253 * the ptr value will be discarded.
254 * @return the value at this location; 0 if the location is not a valid
255 * numeric.
256 */
257 public static final long parseLongBase10(final byte[] b, int ptr,
258 final MutableInteger ptrResult) {
259 long r = 0;
260 int sign = 0;
261 try {
262 final int sz = b.length;
263 while (ptr < sz && b[ptr] == ' ')
264 ptr++;
265 if (ptr >= sz)
266 return 0;
267
268 switch (b[ptr]) {
269 case '-':
270 sign = -1;
271 ptr++;
272 break;
273 case '+':
274 ptr++;
275 break;
276 }
277
278 while (ptr < sz) {
279 final byte v = digits10[b[ptr]];
280 if (v < 0)
281 break;
282 r = (r * 10) + v;
283 ptr++;
284 }
285 } catch (ArrayIndexOutOfBoundsException e) {
286 // Not a valid digit.
287 }
288 if (ptrResult != null)
289 ptrResult.value = ptr;
290 return sign < 0 ? -r : r;
291 }
292
293 /**
294 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
295 * <p>
296 * The number is read in network byte order, that is, most significant
297 * nybble first.
298 *
299 * @param bs
300 * buffer to parse digits from; positions {@code [p, p+4)} will
301 * be parsed.
302 * @param p
303 * first position within the buffer to parse.
304 * @return the integer value.
305 * @throws ArrayIndexOutOfBoundsException
306 * if the string is not hex formatted.
307 */
308 public static final int parseHexInt16(final byte[] bs, final int p) {
309 int r = digits16[bs[p]] << 4;
310
311 r |= digits16[bs[p + 1]];
312 r <<= 4;
313
314 r |= digits16[bs[p + 2]];
315 r <<= 4;
316
317 r |= digits16[bs[p + 3]];
318 if (r < 0)
319 throw new ArrayIndexOutOfBoundsException();
320 return r;
321 }
322
323 /**
324 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
325 * <p>
326 * The number is read in network byte order, that is, most significant
327 * nybble first.
328 *
329 * @param bs
330 * buffer to parse digits from; positions {@code [p, p+8)} will
331 * be parsed.
332 * @param p
333 * first position within the buffer to parse.
334 * @return the integer value.
335 * @throws ArrayIndexOutOfBoundsException
336 * if the string is not hex formatted.
337 */
338 public static final int parseHexInt32(final byte[] bs, final int p) {
339 int r = digits16[bs[p]] << 4;
340
341 r |= digits16[bs[p + 1]];
342 r <<= 4;
343
344 r |= digits16[bs[p + 2]];
345 r <<= 4;
346
347 r |= digits16[bs[p + 3]];
348 r <<= 4;
349
350 r |= digits16[bs[p + 4]];
351 r <<= 4;
352
353 r |= digits16[bs[p + 5]];
354 r <<= 4;
355
356 r |= digits16[bs[p + 6]];
357
358 final int last = digits16[bs[p + 7]];
359 if (r < 0 || last < 0)
360 throw new ArrayIndexOutOfBoundsException();
361 return (r << 4) | last;
362 }
363
364 /**
365 * Parse a single hex digit to its numeric value (0-15).
366 *
367 * @param digit
368 * hex character to parse.
369 * @return numeric value, in the range 0-15.
370 * @throws ArrayIndexOutOfBoundsException
371 * if the input digit is not a valid hex digit.
372 */
373 public static final int parseHexInt4(final byte digit) {
374 final byte r = digits16[digit];
375 if (r < 0)
376 throw new ArrayIndexOutOfBoundsException();
377 return r;
378 }
379
380 /**
381 * Parse a Git style timezone string.
382 * <p>
383 * The sequence "-0315" will be parsed as the numeric value -195, as the
384 * lower two positions count minutes, not 100ths of an hour.
385 *
386 * @param b
387 * buffer to scan.
388 * @param ptr
389 * position within buffer to start parsing digits at.
390 * @return the timezone at this location, expressed in minutes.
391 */
392 public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
393 final int v = parseBase10(b, ptr, null);
394 final int tzMins = v % 100;
395 final int tzHours = v / 100;
396 return tzHours * 60 + tzMins;
397 }
398
399 /**
400 * Locate the first position after a given character.
401 *
402 * @param b
403 * buffer to scan.
404 * @param ptr
405 * position within buffer to start looking for chrA at.
406 * @param chrA
407 * character to find.
408 * @return new position just after chrA.
409 */
410 public static final int next(final byte[] b, int ptr, final char chrA) {
411 final int sz = b.length;
412 while (ptr < sz) {
413 if (b[ptr++] == chrA)
414 return ptr;
415 }
416 return ptr;
417 }
418
419 /**
420 * Locate the first position after the next LF.
421 * <p>
422 * This method stops on the first '\n' it finds.
423 *
424 * @param b
425 * buffer to scan.
426 * @param ptr
427 * position within buffer to start looking for LF at.
428 * @return new position just after the first LF found.
429 */
430 public static final int nextLF(final byte[] b, int ptr) {
431 return next(b, ptr, '\n');
432 }
433
434 /**
435 * Locate the first position after either the given character or LF.
436 * <p>
437 * This method stops on the first match it finds from either chrA or '\n'.
438 *
439 * @param b
440 * buffer to scan.
441 * @param ptr
442 * position within buffer to start looking for chrA or LF at.
443 * @param chrA
444 * character to find.
445 * @return new position just after the first chrA or LF to be found.
446 */
447 public static final int nextLF(final byte[] b, int ptr, final char chrA) {
448 final int sz = b.length;
449 while (ptr < sz) {
450 final byte c = b[ptr++];
451 if (c == chrA || c == '\n')
452 return ptr;
453 }
454 return ptr;
455 }
456
457 /**
458 * Locate the first position before a given character.
459 *
460 * @param b
461 * buffer to scan.
462 * @param ptr
463 * position within buffer to start looking for chrA at.
464 * @param chrA
465 * character to find.
466 * @return new position just before chrA, -1 for not found
467 */
468 public static final int prev(final byte[] b, int ptr, final char chrA) {
469 if (ptr == b.length)
470 --ptr;
471 while (ptr >= 0) {
472 if (b[ptr--] == chrA)
473 return ptr;
474 }
475 return ptr;
476 }
477
478 /**
479 * Locate the first position before the previous LF.
480 * <p>
481 * This method stops on the first '\n' it finds.
482 *
483 * @param b
484 * buffer to scan.
485 * @param ptr
486 * position within buffer to start looking for LF at.
487 * @return new position just before the first LF found, -1 for not found
488 */
489 public static final int prevLF(final byte[] b, int ptr) {
490 return prev(b, ptr, '\n');
491 }
492
493 /**
494 * Locate the previous position before either the given character or LF.
495 * <p>
496 * This method stops on the first match it finds from either chrA or '\n'.
497 *
498 * @param b
499 * buffer to scan.
500 * @param ptr
501 * position within buffer to start looking for chrA or LF at.
502 * @param chrA
503 * character to find.
504 * @return new position just before the first chrA or LF to be found, -1 for
505 * not found
506 */
507 public static final int prevLF(final byte[] b, int ptr, final char chrA) {
508 if (ptr == b.length)
509 --ptr;
510 while (ptr >= 0) {
511 final byte c = b[ptr--];
512 if (c == chrA || c == '\n')
513 return ptr;
514 }
515 return ptr;
516 }
517
518 /**
519 * Index the region between <code>[ptr, end)</code> to find line starts.
520 * <p>
521 * The returned list is 1 indexed. Index 0 contains
522 * {@link Integer#MIN_VALUE} to pad the list out.
523 * <p>
524 * Using a 1 indexed list means that line numbers can be directly accessed
525 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
526 * <code>ptr</code>.
527 * <p>
528 * The last element (index <code>map.size()-1</code>) always contains
529 * <code>end</code>.
530 *
531 * @param buf
532 * buffer to scan.
533 * @param ptr
534 * position within the buffer corresponding to the first byte of
535 * line 1.
536 * @param end
537 * 1 past the end of the content within <code>buf</code>.
538 * @return a line map indexing the start position of each line.
539 */
540 public static final IntList lineMap(final byte[] buf, int ptr, int end) {
541 // Experimentally derived from multiple source repositories
542 // the average number of bytes/line is 36. Its a rough guess
543 // to initially size our map close to the target.
544 //
545 final IntList map = new IntList((end - ptr) / 36);
546 map.fillTo(1, Integer.MIN_VALUE);
547 for (; ptr < end; ptr = nextLF(buf, ptr))
548 map.add(ptr);
549 map.add(end);
550 return map;
551 }
552
553 /**
554 * Locate the "author " header line data.
555 *
556 * @param b
557 * buffer to scan.
558 * @param ptr
559 * position in buffer to start the scan at. Most callers should
560 * pass 0 to ensure the scan starts from the beginning of the
561 * commit buffer and does not accidentally look at message body.
562 * @return position just after the space in "author ", so the first
563 * character of the author's name. If no author header can be
564 * located -1 is returned.
565 */
566 public static final int author(final byte[] b, int ptr) {
567 final int sz = b.length;
568 if (ptr == 0)
569 ptr += 46; // skip the "tree ..." line.
570 while (ptr < sz && b[ptr] == 'p')
571 ptr += 48; // skip this parent.
572 return match(b, ptr, author);
573 }
574
575 /**
576 * Locate the "committer " header line data.
577 *
578 * @param b
579 * buffer to scan.
580 * @param ptr
581 * position in buffer to start the scan at. Most callers should
582 * pass 0 to ensure the scan starts from the beginning of the
583 * commit buffer and does not accidentally look at message body.
584 * @return position just after the space in "committer ", so the first
585 * character of the committer's name. If no committer header can be
586 * located -1 is returned.
587 */
588 public static final int committer(final byte[] b, int ptr) {
589 final int sz = b.length;
590 if (ptr == 0)
591 ptr += 46; // skip the "tree ..." line.
592 while (ptr < sz && b[ptr] == 'p')
593 ptr += 48; // skip this parent.
594 if (ptr < sz && b[ptr] == 'a')
595 ptr = nextLF(b, ptr);
596 return match(b, ptr, committer);
597 }
598
599 /**
600 * Locate the "tagger " header line data.
601 *
602 * @param b
603 * buffer to scan.
604 * @param ptr
605 * position in buffer to start the scan at. Most callers should
606 * pass 0 to ensure the scan starts from the beginning of the tag
607 * buffer and does not accidentally look at message body.
608 * @return position just after the space in "tagger ", so the first
609 * character of the tagger's name. If no tagger header can be
610 * located -1 is returned.
611 */
612 public static final int tagger(final byte[] b, int ptr) {
613 final int sz = b.length;
614 if (ptr == 0)
615 ptr += 48; // skip the "object ..." line.
616 while (ptr < sz) {
617 if (b[ptr] == '\n')
618 return -1;
619 final int m = match(b, ptr, tagger);
620 if (m >= 0)
621 return m;
622 ptr = nextLF(b, ptr);
623 }
624 return -1;
625 }
626
627 /**
628 * Locate the "encoding " header line.
629 *
630 * @param b
631 * buffer to scan.
632 * @param ptr
633 * position in buffer to start the scan at. Most callers should
634 * pass 0 to ensure the scan starts from the beginning of the
635 * buffer and does not accidentally look at the message body.
636 * @return position just after the space in "encoding ", so the first
637 * character of the encoding's name. If no encoding header can be
638 * located -1 is returned (and UTF-8 should be assumed).
639 */
640 public static final int encoding(final byte[] b, int ptr) {
641 final int sz = b.length;
642 while (ptr < sz) {
643 if (b[ptr] == '\n')
644 return -1;
645 if (b[ptr] == 'e')
646 break;
647 ptr = nextLF(b, ptr);
648 }
649 return match(b, ptr, encoding);
650 }
651
652 /**
653 * Parse the "encoding " header into a character set reference.
654 * <p>
655 * Locates the "encoding " header (if present) by first calling
656 * {@link #encoding(byte[], int)} and then returns the proper character set
657 * to apply to this buffer to evaluate its contents as character data.
658 * <p>
659 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
660 *
661 * @param b
662 * buffer to scan.
663 * @return the Java character set representation. Never null.
664 */
665 public static Charset parseEncoding(final byte[] b) {
666 final int enc = encoding(b, 0);
667 if (enc < 0)
668 return Constants.CHARSET;
669 final int lf = nextLF(b, enc);
670 String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
671 try {
672 return Charset.forName(decoded);
673 } catch (IllegalCharsetNameException badName) {
674 Charset aliased = charsetForAlias(decoded);
675 if (aliased != null)
676 return aliased;
677 throw badName;
678 } catch (UnsupportedCharsetException badName) {
679 Charset aliased = charsetForAlias(decoded);
680 if (aliased != null)
681 return aliased;
682 throw badName;
683 }
684 }
685
686 /**
687 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
688 * <p>
689 * Leading spaces won't be trimmed from the string, i.e. will show up in the
690 * parsed name afterwards.
691 *
692 * @param in
693 * the string to parse a name from.
694 * @return the parsed identity or null in case the identity could not be
695 * parsed.
696 */
697 public static PersonIdent parsePersonIdent(final String in) {
698 return parsePersonIdent(Constants.encode(in), 0);
699 }
700
/**
 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 * <p>
 * When passing in a value for <code>nameB</code> callers should use the
 * return value of {@link #author(byte[], int)} or
 * {@link #committer(byte[], int)}, as these methods provide the proper
 * position within the buffer.
 * <p>
 * The character set used for the name and email is taken from the
 * buffer through {@link #parseEncoding(byte[])}. If no time and zone
 * can be located after the email, both are reported as 0.
 *
 * @param raw
 *            the buffer to parse character data from.
 * @param nameB
 *            first position of the identity information. This should be the
 *            first position after the space which delimits the header field
 *            name (e.g. "author" or "committer") from the rest of the
 *            identity line.
 * @return the parsed identity or null in case the identity could not be
 *         parsed.
 */
public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
	final Charset cs = parseEncoding(raw);
	// emailB is just after the first '<' (or LF), emailE just after the
	// following '>' (or LF).
	final int emailB = nextLF(raw, nameB, '<');
	final int emailE = nextLF(raw, emailB, '>');
	// Reject the line when nothing follows the position after '<', when
	// an LF sits there, or when the scan for '>' ran to the end of the
	// buffer without the last scanned byte being '>'.
	if (emailB >= raw.length || raw[emailB] == '\n' ||
			(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
		return null;

	// Drop the single space separating the name from '<', if present.
	final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
			emailB - 2 : emailB - 1;
	final String name = decode(cs, raw, nameB, nameEnd);
	final String email = decode(cs, raw, emailB, emailE - 1);

	// Start searching from end of line, as after first name-email pair,
	// another name-email pair may occur. We will ignore all kinds of
	// "junk" following the first email.
	//
	// The end of line is searched from (emailE - 1): if the email scan
	// stopped on an LF, emailE already lies on the next line and a
	// search from emailE would run too far. "-2" positions before the
	// LF in case of LF termination, respectively on the penultimate
	// character if there is no trailing LF.
	final int tzBegin = lastIndexOfTrim(raw, ' ',
			nextLF(raw, emailE - 1) - 2) + 1;
	if (tzBegin <= emailE) // No time/zone, still valid
		return new PersonIdent(name, email, 0, 0);

	// The timestamp is the space delimited token preceding the zone,
	// but never starts before the end of the email.
	final int whenBegin = Math.max(emailE,
			lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
	if (whenBegin >= tzBegin - 1) // No time/zone, still valid
		return new PersonIdent(name, email, 0, 0);

	// The parsed timestamp is scaled by 1000 before being handed to
	// PersonIdent.
	final long when = parseLongBase10(raw, whenBegin, null);
	final int tz = parseTimeZoneOffset(raw, tzBegin);
	return new PersonIdent(name, email, when * 1000L, tz);
}
754
755 /**
756 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
757 * <p>
758 * When passing in a value for <code>nameB</code> callers should use the
759 * return value of {@link #author(byte[], int)} or
760 * {@link #committer(byte[], int)}, as these methods provide the proper
761 * position within the buffer.
762 *
763 * @param raw
764 * the buffer to parse character data from.
765 * @param nameB
766 * first position of the identity information. This should be the
767 * first position after the space which delimits the header field
768 * name (e.g. "author" or "committer") from the rest of the
769 * identity line.
770 * @return the parsed identity. Never null.
771 */
772 public static PersonIdent parsePersonIdentOnly(final byte[] raw,
773 final int nameB) {
774 int stop = nextLF(raw, nameB);
775 int emailB = nextLF(raw, nameB, '<');
776 int emailE = nextLF(raw, emailB, '>');
777 final String name;
778 final String email;
779 if (emailE < stop) {
780 email = decode(raw, emailB, emailE - 1);
781 } else {
782 email = "invalid"; //$NON-NLS-1$
783 }
784 if (emailB < stop)
785 name = decode(raw, nameB, emailB - 2);
786 else
787 name = decode(raw, nameB, stop);
788
789 final MutableInteger ptrout = new MutableInteger();
790 long when;
791 int tz;
792 if (emailE < stop) {
793 when = parseLongBase10(raw, emailE + 1, ptrout);
794 tz = parseTimeZoneOffset(raw, ptrout.value);
795 } else {
796 when = 0;
797 tz = 0;
798 }
799 return new PersonIdent(name, email, when * 1000L, tz);
800 }
801
802 /**
803 * Locate the end of a footer line key string.
804 * <p>
805 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
806 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
807 * the first ':'.
808 * <p>
809 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
810 * then this method returns -1.
811 *
812 * @param raw
813 * buffer to scan.
814 * @param ptr
815 * first position within raw to consider as a footer line key.
816 * @return position of the ':' which terminates the footer line key if this
817 * is otherwise a valid footer line key; otherwise -1.
818 */
819 public static int endOfFooterLineKey(final byte[] raw, int ptr) {
820 try {
821 for (;;) {
822 final byte c = raw[ptr];
823 if (footerLineKeyChars[c] == 0) {
824 if (c == ':')
825 return ptr;
826 return -1;
827 }
828 ptr++;
829 }
830 } catch (ArrayIndexOutOfBoundsException e) {
831 return -1;
832 }
833 }
834
835 /**
836 * Decode a buffer under UTF-8, if possible.
837 *
838 * If the byte stream cannot be decoded that way, the platform default is tried
839 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
840 *
841 * @param buffer
842 * buffer to pull raw bytes from.
843 * @return a string representation of the range <code>[start,end)</code>,
844 * after decoding the region through the specified character set.
845 */
846 public static String decode(final byte[] buffer) {
847 return decode(buffer, 0, buffer.length);
848 }
849
850 /**
851 * Decode a buffer under UTF-8, if possible.
852 *
853 * If the byte stream cannot be decoded that way, the platform default is
854 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
855 *
856 * @param buffer
857 * buffer to pull raw bytes from.
858 * @param start
859 * start position in buffer
860 * @param end
861 * one position past the last location within the buffer to take
862 * data from.
863 * @return a string representation of the range <code>[start,end)</code>,
864 * after decoding the region through the specified character set.
865 */
866 public static String decode(final byte[] buffer, final int start,
867 final int end) {
868 return decode(Constants.CHARSET, buffer, start, end);
869 }
870
871 /**
872 * Decode a buffer under the specified character set if possible.
873 *
874 * If the byte stream cannot be decoded that way, the platform default is tried
875 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
876 *
877 * @param cs
878 * character set to use when decoding the buffer.
879 * @param buffer
880 * buffer to pull raw bytes from.
881 * @return a string representation of the range <code>[start,end)</code>,
882 * after decoding the region through the specified character set.
883 */
884 public static String decode(final Charset cs, final byte[] buffer) {
885 return decode(cs, buffer, 0, buffer.length);
886 }
887
888 /**
889 * Decode a region of the buffer under the specified character set if possible.
890 *
891 * If the byte stream cannot be decoded that way, the platform default is tried
892 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
893 *
894 * @param cs
895 * character set to use when decoding the buffer.
896 * @param buffer
897 * buffer to pull raw bytes from.
898 * @param start
899 * first position within the buffer to take data from.
900 * @param end
901 * one position past the last location within the buffer to take
902 * data from.
903 * @return a string representation of the range <code>[start,end)</code>,
904 * after decoding the region through the specified character set.
905 */
906 public static String decode(final Charset cs, final byte[] buffer,
907 final int start, final int end) {
908 try {
909 return decodeNoFallback(cs, buffer, start, end);
910 } catch (CharacterCodingException e) {
911 // Fall back to an ISO-8859-1 style encoding. At least all of
912 // the bytes will be present in the output.
913 //
914 return extractBinaryString(buffer, start, end);
915 }
916 }
917
918 /**
919 * Decode a region of the buffer under the specified character set if
920 * possible.
921 *
922 * If the byte stream cannot be decoded that way, the platform default is
923 * tried and if that too fails, an exception is thrown.
924 *
925 * @param cs
926 * character set to use when decoding the buffer.
927 * @param buffer
928 * buffer to pull raw bytes from.
929 * @param start
930 * first position within the buffer to take data from.
931 * @param end
932 * one position past the last location within the buffer to take
933 * data from.
934 * @return a string representation of the range <code>[start,end)</code>,
935 * after decoding the region through the specified character set.
936 * @throws CharacterCodingException
937 * the input is not in any of the tested character sets.
938 */
939 public static String decodeNoFallback(final Charset cs,
940 final byte[] buffer, final int start, final int end)
941 throws CharacterCodingException {
942 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
943 b.mark();
944
945 // Try our built-in favorite. The assumption here is that
946 // decoding will fail if the data is not actually encoded
947 // using that encoder.
948 //
949 try {
950 return decode(b, Constants.CHARSET);
951 } catch (CharacterCodingException e) {
952 b.reset();
953 }
954
955 if (!cs.equals(Constants.CHARSET)) {
956 // Try the suggested encoding, it might be right since it was
957 // provided by the caller.
958 //
959 try {
960 return decode(b, cs);
961 } catch (CharacterCodingException e) {
962 b.reset();
963 }
964 }
965
966 // Try the default character set. A small group of people
967 // might actually use the same (or very similar) locale.
968 //
969 final Charset defcs = Charset.defaultCharset();
970 if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
971 try {
972 return decode(b, defcs);
973 } catch (CharacterCodingException e) {
974 b.reset();
975 }
976 }
977
978 throw new CharacterCodingException();
979 }
980
981 /**
982 * Decode a region of the buffer under the ISO-8859-1 encoding.
983 *
984 * Each byte is treated as a single character in the 8859-1 character
985 * encoding, performing a raw binary->char conversion.
986 *
987 * @param buffer
988 * buffer to pull raw bytes from.
989 * @param start
990 * first position within the buffer to take data from.
991 * @param end
992 * one position past the last location within the buffer to take
993 * data from.
994 * @return a string representation of the range <code>[start,end)</code>.
995 */
996 public static String extractBinaryString(final byte[] buffer,
997 final int start, final int end) {
998 final StringBuilder r = new StringBuilder(end - start);
999 for (int i = start; i < end; i++)
1000 r.append((char) (buffer[i] & 0xff));
1001 return r.toString();
1002 }
1003
1004 private static String decode(final ByteBuffer b, final Charset charset)
1005 throws CharacterCodingException {
1006 final CharsetDecoder d = charset.newDecoder();
1007 d.onMalformedInput(CodingErrorAction.REPORT);
1008 d.onUnmappableCharacter(CodingErrorAction.REPORT);
1009 return d.decode(b).toString();
1010 }
1011
1012 /**
1013 * Locate the position of the commit message body.
1014 *
1015 * @param b
1016 * buffer to scan.
1017 * @param ptr
1018 * position in buffer to start the scan at. Most callers should
1019 * pass 0 to ensure the scan starts from the beginning of the
1020 * commit buffer.
1021 * @return position of the user's message buffer.
1022 */
1023 public static final int commitMessage(final byte[] b, int ptr) {
1024 final int sz = b.length;
1025 if (ptr == 0)
1026 ptr += 46; // skip the "tree ..." line.
1027 while (ptr < sz && b[ptr] == 'p')
1028 ptr += 48; // skip this parent.
1029
1030 // Skip any remaining header lines, ignoring what their actual
1031 // header line type is. This is identical to the logic for a tag.
1032 //
1033 return tagMessage(b, ptr);
1034 }
1035
1036 /**
1037 * Locate the position of the tag message body.
1038 *
1039 * @param b
1040 * buffer to scan.
1041 * @param ptr
1042 * position in buffer to start the scan at. Most callers should
1043 * pass 0 to ensure the scan starts from the beginning of the tag
1044 * buffer.
1045 * @return position of the user's message buffer.
1046 */
1047 public static final int tagMessage(final byte[] b, int ptr) {
1048 final int sz = b.length;
1049 if (ptr == 0)
1050 ptr += 48; // skip the "object ..." line.
1051 while (ptr < sz && b[ptr] != '\n')
1052 ptr = nextLF(b, ptr);
1053 if (ptr < sz && b[ptr] == '\n')
1054 return ptr + 1;
1055 return -1;
1056 }
1057
1058 /**
1059 * Locate the end of a paragraph.
1060 * <p>
1061 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1062 *
1063 * @param b
1064 * buffer to scan.
1065 * @param start
1066 * position in buffer to start the scan at. Most callers will
1067 * want to pass the first position of the commit message (as
1068 * found by {@link #commitMessage(byte[], int)}.
1069 * @return position of the LF at the end of the paragraph;
1070 * <code>b.length</code> if no paragraph end could be located.
1071 */
1072 public static final int endOfParagraph(final byte[] b, final int start) {
1073 int ptr = start;
1074 final int sz = b.length;
1075 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1076 ptr = nextLF(b, ptr);
1077 if (ptr > start && b[ptr - 1] == '\n')
1078 ptr--;
1079 if (ptr > start && b[ptr - 1] == '\r')
1080 ptr--;
1081 return ptr;
1082 }
1083
1084 private static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1085 while (pos >= 0 && raw[pos] == ' ')
1086 pos--;
1087
1088 while (pos >= 0 && raw[pos] != ch)
1089 pos--;
1090
1091 return pos;
1092 }
1093
1094 private static Charset charsetForAlias(String name) {
1095 return encodingAliases.get(StringUtils.toLowerCase(name));
1096 }
1097
/** Private so that no instance can be created from outside this class. */
private RawParseUtils() {
	// Don't create instances of a static only utility.
}
1101 }