/*
 * Copyright (C) 2008-2009, Google Inc.
 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.util;

import static org.eclipse.jgit.lib.ObjectChecker.author;
import static org.eclipse.jgit.lib.ObjectChecker.committer;
import static org.eclipse.jgit.lib.ObjectChecker.encoding;
import static org.eclipse.jgit.lib.ObjectChecker.tagger;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.PersonIdent;

/**
 * Handy utility functions to parse raw object contents.
 * <p>
 * All methods are static and operate directly on {@code byte[]} buffers
 * holding the raw (undecoded) content of commits, tags and similar data.
 */
public final class RawParseUtils {
    /**
     * UTF-8 charset constant.
     *
     * @since 2.2
     */
    public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$

    /** Maps an ASCII byte to its decimal digit value; -1 for non-digits. */
    private static final byte[] digits10;

    /** Maps an ASCII byte to its hex digit value; -1 for non-digits. */
    private static final byte[] digits16;

    /** Non-zero at every ASCII position that may appear in a footer key. */
    private static final byte[] footerLineKeyChars;

    /** Lower-cased encoding names that {@link Charset#forName} rejects. */
    private static final Map<String, Charset> encodingAliases;

    static {
        encodingAliases = new HashMap<String, Charset>();
        encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$

        // The lookup tables are only as long as their highest valid
        // character. Bytes beyond the table (or negative bytes) raise
        // ArrayIndexOutOfBoundsException, which the parsers rely upon.
        digits10 = new byte['9' + 1];
        Arrays.fill(digits10, (byte) -1);
        for (char i = '0'; i <= '9'; i++) {
            digits10[i] = (byte) (i - '0');
        }

        digits16 = new byte['f' + 1];
        Arrays.fill(digits16, (byte) -1);
        for (char i = '0'; i <= '9'; i++) {
            digits16[i] = (byte) (i - '0');
        }
        for (char i = 'a'; i <= 'f'; i++) {
            digits16[i] = (byte) ((i - 'a') + 10);
        }
        for (char i = 'A'; i <= 'F'; i++) {
            digits16[i] = (byte) ((i - 'A') + 10);
        }

        footerLineKeyChars = new byte['z' + 1];
        footerLineKeyChars['-'] = 1;
        for (char i = '0'; i <= '9'; i++) {
            footerLineKeyChars[i] = 1;
        }
        for (char i = 'A'; i <= 'Z'; i++) {
            footerLineKeyChars[i] = 1;
        }
        for (char i = 'a'; i <= 'z'; i++) {
            footerLineKeyChars[i] = 1;
        }
    }

    /**
     * Determine if b[ptr] matches src.
     *
     * @param b
     *            the buffer to scan.
     * @param ptr
     *            first position within b, this should match src[0].
     * @param src
     *            the buffer to test for equality with b.
     * @return ptr + src.length if b[ptr..src.length] == src; else -1.
     */
    public static final int match(final byte[] b, int ptr, final byte[] src) {
        if (ptr + src.length > b.length) {
            return -1;
        }
        for (int i = 0; i < src.length; i++, ptr++) {
            if (b[ptr] != src[i]) {
                return -1;
            }
        }
        return ptr;
    }

    private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9' };

    /**
     * Format a base 10 numeric into a temporary buffer.
     * <p>
     * Formatting is performed backwards. The method starts at offset
     * <code>o-1</code> and ends at <code>o-1-digits</code>, where
     * <code>digits</code> is the number of positions necessary to store the
     * base 10 value.
     * <p>
     * The argument and return values from this method make it easy to chain
     * writing, for example:
     * </p>
     *
     * <pre>
     * final byte[] tmp = new byte[64];
     * int ptr = tmp.length;
     * tmp[--ptr] = '\n';
     * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
     * tmp[--ptr] = ' ';
     * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
     * tmp[--ptr] = 0;
     * final String str = new String(tmp, ptr, tmp.length - ptr);
     * </pre>
     *
     * @param b
     *            buffer to write into.
     * @param o
     *            one offset past the location where writing will begin; writing
     *            proceeds towards lower index values.
     * @param value
     *            the value to store. Every int, including
     *            {@link Integer#MIN_VALUE}, is supported.
     * @return the new offset value <code>o</code>. This is the position of
     *         the last byte written. Additional writing should start at one
     *         position earlier.
     */
    public static int formatBase10(final byte[] b, int o, int value) {
        if (value == 0) {
            b[--o] = '0';
            return o;
        }
        final boolean isneg = value < 0;
        // Extract digits from the non-positive magnitude. Negating
        // Integer.MIN_VALUE overflows back to itself, whereas every positive
        // int has a representable negative counterpart.
        int remaining = isneg ? value : -value;
        while (remaining != 0) {
            // remaining % 10 lies in [-9, 0] here; flip it to index the table.
            b[--o] = base10byte[-(remaining % 10)];
            remaining /= 10;
        }
        if (isneg) {
            b[--o] = '-';
        }
        return o;
    }

    /**
     * Parse a base 10 numeric from a sequence of ASCII digits into an int.
     * <p>
     * Digit sequences can begin with an optional run of spaces before the
     * sequence, and may start with a '+' or a '-' to indicate sign position.
     * Any other characters will cause the method to stop and return the current
     * result to the caller. Overflow is not detected; the value silently wraps.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @param ptrResult
     *            optional location to return the new ptr value through. If null
     *            the ptr value will be discarded.
     * @return the value at this location; 0 if the location is not a valid
     *         numeric.
     */
    public static final int parseBase10(final byte[] b, int ptr,
            final MutableInteger ptrResult) {
        // Narrowing the long result keeps exactly the low 32 bits that a
        // pure int accumulation would have produced, including on overflow.
        return (int) parseLongBase10(b, ptr, ptrResult);
    }

    /**
     * Parse a base 10 numeric from a sequence of ASCII digits into a long.
     * <p>
     * Digit sequences can begin with an optional run of spaces before the
     * sequence, and may start with a '+' or a '-' to indicate sign position.
     * Any other characters will cause the method to stop and return the current
     * result to the caller. Overflow is not detected; the value silently wraps.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @param ptrResult
     *            optional location to return the new ptr value through. If null
     *            the ptr value will be discarded. When supplied it is always
     *            updated, even if the buffer ended before any digit was seen.
     * @return the value at this location; 0 if the location is not a valid
     *         numeric.
     */
    public static final long parseLongBase10(final byte[] b, int ptr,
            final MutableInteger ptrResult) {
        long r = 0;
        boolean negative = false;
        try {
            final int sz = b.length;
            while (ptr < sz && b[ptr] == ' ') {
                ptr++;
            }
            if (ptr < sz) {
                switch (b[ptr]) {
                case '-':
                    negative = true;
                    ptr++;
                    break;
                case '+':
                    ptr++;
                    break;
                }

                while (ptr < sz) {
                    final byte v = digits10[b[ptr]];
                    if (v < 0) {
                        break;
                    }
                    r = (r * 10) + v;
                    ptr++;
                }
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            // Not a valid digit: the byte fell outside the lookup table.
            // Parsing simply stops at this position.
        }
        if (ptrResult != null) {
            ptrResult.value = ptr;
        }
        return negative ? -r : r;
    }

    /**
     * Parse 4 character base 16 (hex) formatted string to unsigned integer.
     * <p>
     * The number is read in network byte order, that is, most significant
     * nybble first.
     *
     * @param bs
     *            buffer to parse digits from; positions {@code [p, p+4)} will
     *            be parsed.
     * @param p
     *            first position within the buffer to parse.
     * @return the integer value.
     * @throws ArrayIndexOutOfBoundsException
     *             if the string is not hex formatted.
     */
    public static final int parseHexInt16(final byte[] bs, final int p) {
        int r = digits16[bs[p]] << 4;

        r |= digits16[bs[p + 1]];
        r <<= 4;

        r |= digits16[bs[p + 2]];
        r <<= 4;

        r |= digits16[bs[p + 3]];
        // A -1 table entry anywhere above leaves the sign bit set.
        if (r < 0) {
            throw new ArrayIndexOutOfBoundsException();
        }
        return r;
    }

    /**
     * Parse 8 character base 16 (hex) formatted string to unsigned integer.
     * <p>
     * The number is read in network byte order, that is, most significant
     * nybble first.
     *
     * @param bs
     *            buffer to parse digits from; positions {@code [p, p+8)} will
     *            be parsed.
     * @param p
     *            first position within the buffer to parse.
     * @return the integer value.
     * @throws ArrayIndexOutOfBoundsException
     *             if the string is not hex formatted.
     */
    public static final int parseHexInt32(final byte[] bs, final int p) {
        int r = digits16[bs[p]] << 4;

        r |= digits16[bs[p + 1]];
        r <<= 4;

        r |= digits16[bs[p + 2]];
        r <<= 4;

        r |= digits16[bs[p + 3]];
        r <<= 4;

        r |= digits16[bs[p + 4]];
        r <<= 4;

        r |= digits16[bs[p + 5]];
        r <<= 4;

        r |= digits16[bs[p + 6]];

        // Validate before the final shift: a legitimate 32 bit value may
        // have its top bit set and would otherwise look like an error.
        final int last = digits16[bs[p + 7]];
        if (r < 0 || last < 0) {
            throw new ArrayIndexOutOfBoundsException();
        }
        return (r << 4) | last;
    }

    /**
     * Parse a single hex digit to its numeric value (0-15).
     *
     * @param digit
     *            hex character to parse.
     * @return numeric value, in the range 0-15.
     * @throws ArrayIndexOutOfBoundsException
     *             if the input digit is not a valid hex digit.
     */
    public static final int parseHexInt4(final byte digit) {
        final byte r = digits16[digit];
        if (r < 0) {
            throw new ArrayIndexOutOfBoundsException();
        }
        return r;
    }

    /**
     * Parse a Git style timezone string.
     * <p>
     * The sequence "-0315" will be parsed as the numeric value -195, as the
     * lower two positions count minutes, not 100ths of an hour.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start parsing digits at.
     * @return the timezone at this location, expressed in minutes.
     */
    public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
        final int v = parseBase10(b, ptr, null);
        final int tzMins = v % 100;
        final int tzHours = v / 100;
        return tzHours * 60 + tzMins;
    }

    /**
     * Locate the first position after a given character.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA at.
     * @param chrA
     *            character to find.
     * @return new position just after chrA; the end of the buffer if chrA
     *         does not occur at or after ptr.
     */
    public static final int next(final byte[] b, int ptr, final char chrA) {
        final int sz = b.length;
        while (ptr < sz) {
            if (b[ptr++] == chrA) {
                return ptr;
            }
        }
        return ptr;
    }

    /**
     * Locate the first position after the next LF.
     * <p>
     * This method stops on the first '\n' it finds.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for LF at.
     * @return new position just after the first LF found.
     */
    public static final int nextLF(final byte[] b, int ptr) {
        return next(b, ptr, '\n');
    }

    /**
     * Locate the first position after either the given character or LF.
     * <p>
     * This method stops on the first match it finds from either chrA or '\n'.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA or LF at.
     * @param chrA
     *            character to find.
     * @return new position just after the first chrA or LF to be found.
     */
    public static final int nextLF(final byte[] b, int ptr, final char chrA) {
        final int sz = b.length;
        while (ptr < sz) {
            final byte c = b[ptr++];
            if (c == chrA || c == '\n') {
                return ptr;
            }
        }
        return ptr;
    }

    /**
     * Locate the first position before a given character.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA at. The
     *            buffer length is accepted and treated as the last position.
     * @param chrA
     *            character to find.
     * @return new position just before chrA, -1 for not found
     */
    public static final int prev(final byte[] b, int ptr, final char chrA) {
        if (ptr == b.length) {
            --ptr;
        }
        while (ptr >= 0) {
            if (b[ptr--] == chrA) {
                return ptr;
            }
        }
        return ptr;
    }

    /**
     * Locate the first position before the previous LF.
     * <p>
     * This method stops on the first '\n' it finds.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for LF at.
     * @return new position just before the first LF found, -1 for not found
     */
    public static final int prevLF(final byte[] b, int ptr) {
        return prev(b, ptr, '\n');
    }

    /**
     * Locate the previous position before either the given character or LF.
     * <p>
     * This method stops on the first match it finds from either chrA or '\n'.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position within buffer to start looking for chrA or LF at.
     * @param chrA
     *            character to find.
     * @return new position just before the first chrA or LF to be found, -1 for
     *         not found
     */
    public static final int prevLF(final byte[] b, int ptr, final char chrA) {
        if (ptr == b.length) {
            --ptr;
        }
        while (ptr >= 0) {
            final byte c = b[ptr--];
            if (c == chrA || c == '\n') {
                return ptr;
            }
        }
        return ptr;
    }

    /**
     * Index the region between <code>[ptr, end)</code> to find line starts.
     * <p>
     * The returned list is 1 indexed. Index 0 contains
     * {@link Integer#MIN_VALUE} to pad the list out.
     * <p>
     * Using a 1 indexed list means that line numbers can be directly accessed
     * from the list, so <code>list.get(1)</code> (aka get line 1) returns
     * <code>ptr</code>.
     * <p>
     * The last element (index <code>map.size()-1</code>) always contains
     * <code>end</code>.
     *
     * @param buf
     *            buffer to scan.
     * @param ptr
     *            position within the buffer corresponding to the first byte of
     *            line 1.
     * @param end
     *            1 past the end of the content within <code>buf</code>.
     * @return a line map indexing the start position of each line.
     */
    public static final IntList lineMap(final byte[] buf, int ptr, int end) {
        // Experimentally derived from multiple source repositories
        // the average number of bytes/line is 36. It's a rough guess
        // to initially size our map close to the target.
        //
        final IntList map = new IntList((end - ptr) / 36);
        map.fillTo(1, Integer.MIN_VALUE);
        for (; ptr < end; ptr = nextLF(buf, ptr)) {
            map.add(ptr);
        }
        map.add(end);
        return map;
    }

    /**
     * Locate the "author " header line data.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            commit buffer and does not accidentally look at message body.
     * @return position just after the space in "author ", so the first
     *         character of the author's name. If no author header can be
     *         located -1 is returned.
     */
    public static final int author(final byte[] b, int ptr) {
        ptr = skipTreeAndParents(b, ptr);
        return match(b, ptr, author);
    }

    /**
     * Locate the "committer " header line data.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            commit buffer and does not accidentally look at message body.
     * @return position just after the space in "committer ", so the first
     *         character of the committer's name. If no committer header can be
     *         located -1 is returned.
     */
    public static final int committer(final byte[] b, int ptr) {
        ptr = skipTreeAndParents(b, ptr);
        // Step over the author line, if that is what comes next.
        if (ptr < b.length && b[ptr] == 'a') {
            ptr = nextLF(b, ptr);
        }
        return match(b, ptr, committer);
    }

    /**
     * Locate the "tagger " header line data.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the tag
     *            buffer and does not accidentally look at message body.
     * @return position just after the space in "tagger ", so the first
     *         character of the tagger's name. If no tagger header can be
     *         located -1 is returned.
     */
    public static final int tagger(final byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0) {
            ptr += 48; // skip the "object ..." line.
        }
        while (ptr < sz) {
            if (b[ptr] == '\n') {
                // Blank line: the headers are over, the message follows.
                return -1;
            }
            final int m = match(b, ptr, tagger);
            if (m >= 0) {
                return m;
            }
            ptr = nextLF(b, ptr);
        }
        return -1;
    }

    /**
     * Locate the "encoding " header line.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            buffer and does not accidentally look at the message body.
     * @return position just after the space in "encoding ", so the first
     *         character of the encoding's name. If no encoding header can be
     *         located -1 is returned (and UTF-8 should be assumed).
     */
    public static final int encoding(final byte[] b, int ptr) {
        final int sz = b.length;
        while (ptr < sz) {
            if (b[ptr] == '\n') {
                // Blank line: the headers are over, the message follows.
                return -1;
            }
            if (b[ptr] == 'e') {
                break;
            }
            ptr = nextLF(b, ptr);
        }
        return match(b, ptr, encoding);
    }

    /**
     * Parse the "encoding " header into a character set reference.
     * <p>
     * Locates the "encoding " header (if present) by first calling
     * {@link #encoding(byte[], int)} and then returns the proper character set
     * to apply to this buffer to evaluate its contents as character data.
     * <p>
     * If no encoding header is present, {@link Constants#CHARSET} is assumed.
     *
     * @param b
     *            buffer to scan.
     * @return the Java character set representation. Never null.
     * @throws IllegalCharsetNameException
     *             if the encoding name is malformed and is not a known alias.
     * @throws UnsupportedCharsetException
     *             if the encoding is not supported by the JVM and is not a
     *             known alias.
     */
    public static Charset parseEncoding(final byte[] b) {
        final int enc = encoding(b, 0);
        if (enc < 0) {
            return Constants.CHARSET;
        }
        final int lf = nextLF(b, enc);
        final String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
        try {
            return Charset.forName(decoded);
        } catch (IllegalCharsetNameException badName) {
            return aliasedCharsetOrThrow(decoded, badName);
        } catch (UnsupportedCharsetException badName) {
            return aliasedCharsetOrThrow(decoded, badName);
        }
    }

    /**
     * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
     * <p>
     * Leading spaces won't be trimmed from the string, i.e. will show up in the
     * parsed name afterwards.
     *
     * @param in
     *            the string to parse a name from.
     * @return the parsed identity or null in case the identity could not be
     *         parsed.
     */
    public static PersonIdent parsePersonIdent(final String in) {
        return parsePersonIdent(Constants.encode(in), 0);
    }

    /**
     * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
     * <p>
     * When passing in a value for <code>nameB</code> callers should use the
     * return value of {@link #author(byte[], int)} or
     * {@link #committer(byte[], int)}, as these methods provide the proper
     * position within the buffer.
     *
     * @param raw
     *            the buffer to parse character data from.
     * @param nameB
     *            first position of the identity information. This should be the
     *            first position after the space which delimits the header field
     *            name (e.g. "author" or "committer") from the rest of the
     *            identity line.
     * @return the parsed identity or null in case the identity could not be
     *         parsed.
     */
    public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
        final Charset cs = parseEncoding(raw);
        final int emailB = nextLF(raw, nameB, '<');
        final int emailE = nextLF(raw, emailB, '>');
        if (emailB >= raw.length || raw[emailB] == '\n' ||
                (emailE >= raw.length - 1 && raw[emailE - 1] != '>')) {
            return null;
        }

        final int nameEnd = endOfName(raw, nameB, emailB);
        final String name = decode(cs, raw, nameB, nameEnd);
        final String email = decode(cs, raw, emailB, emailE - 1);

        // Start searching from end of line, as after first name-email pair,
        // another name-email pair may occur. We will ignore all kinds of
        // "junk" following the first email.
        //
        // We have to use (emailE - 1) for the case that raw[emailE] is LF,
        // otherwise we would run too far. "-2" is necessary to position
        // before the LF in case of LF termination, or at the penultimate
        // character if there is no trailing LF.
        final int tzBegin = lastIndexOfTrim(raw, ' ',
                nextLF(raw, emailE - 1) - 2) + 1;
        if (tzBegin <= emailE) { // No time/zone, still valid
            return new PersonIdent(name, email, 0, 0);
        }

        final int whenBegin = Math.max(emailE,
                lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
        if (whenBegin >= tzBegin - 1) { // No time/zone, still valid
            return new PersonIdent(name, email, 0, 0);
        }

        final long when = parseLongBase10(raw, whenBegin, null);
        final int tz = parseTimeZoneOffset(raw, tzBegin);
        return new PersonIdent(name, email, when * 1000L, tz);
    }

    /**
     * Parse a name data (e.g. as within a reflog) into a PersonIdent.
     * <p>
     * When passing in a value for <code>nameB</code> callers should use the
     * return value of {@link #author(byte[], int)} or
     * {@link #committer(byte[], int)}, as these methods provide the proper
     * position within the buffer.
     * <p>
     * Unlike {@link #parsePersonIdent(byte[], int)} this method does not look
     * for an encoding header; text is decoded through
     * {@link #decode(byte[], int, int)}. If no closing '&gt;' is found on the
     * line the email is reported as the literal "invalid" and the time and
     * zone are both 0.
     *
     * @param raw
     *            the buffer to parse character data from.
     * @param nameB
     *            first position of the identity information. This should be the
     *            first position after the space which delimits the header field
     *            name (e.g. "author" or "committer") from the rest of the
     *            identity line.
     * @return the parsed identity. Never null.
     */
    public static PersonIdent parsePersonIdentOnly(final byte[] raw,
            final int nameB) {
        final int stop = nextLF(raw, nameB);
        final int emailB = nextLF(raw, nameB, '<');
        final int emailE = nextLF(raw, emailB, '>');
        final boolean emailTerminated = emailE < stop;

        final String email;
        if (emailTerminated) {
            email = decode(raw, emailB, emailE - 1);
        } else {
            email = "invalid"; //$NON-NLS-1$
        }

        final String name;
        if (emailB < stop) {
            // A '<' was found on this line. Use the same boundary rule as
            // parsePersonIdent so that a name directly followed by '<', or
            // an empty name, neither loses a character nor yields a
            // negative length region.
            name = decode(raw, nameB, endOfName(raw, nameB, emailB));
        } else {
            name = decode(raw, nameB, stop);
        }

        long when = 0;
        int tz = 0;
        if (emailTerminated) {
            final MutableInteger ptrout = new MutableInteger();
            when = parseLongBase10(raw, emailE + 1, ptrout);
            tz = parseTimeZoneOffset(raw, ptrout.value);
        }
        return new PersonIdent(name, email, when * 1000L, tz);
    }

    /**
     * Locate the end of a footer line key string.
     * <p>
     * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
     * "Signed-off-by: A. U. Thor\n") then this method returns the position of
     * the first ':'.
     * <p>
     * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
     * then this method returns -1.
     *
     * @param raw
     *            buffer to scan.
     * @param ptr
     *            first position within raw to consider as a footer line key.
     * @return position of the ':' which terminates the footer line key if this
     *         is otherwise a valid footer line key; otherwise -1.
     */
    public static int endOfFooterLineKey(final byte[] raw, int ptr) {
        try {
            for (;;) {
                final byte c = raw[ptr];
                if (footerLineKeyChars[c] == 0) {
                    return c == ':' ? ptr : -1;
                }
                ptr++;
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            // Either the buffer ended before a ':' was seen, or the byte
            // lies outside the key character table; neither is a valid key.
            return -1;
        }
    }

    /**
     * Decode a buffer under {@link Constants#CHARSET}, if possible.
     * <p>
     * If the byte stream cannot be decoded that way, the platform default is
     * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
     *
     * @param buffer
     *            buffer to pull raw bytes from.
     * @return a string representation of the entire buffer.
     */
    public static String decode(final byte[] buffer) {
        return decode(buffer, 0, buffer.length);
    }

    /**
     * Decode a region of the buffer under {@link Constants#CHARSET}, if
     * possible.
     * <p>
     * If the byte stream cannot be decoded that way, the platform default is
     * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
     *
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            start position in buffer
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>.
     */
    public static String decode(final byte[] buffer, final int start,
            final int end) {
        return decode(Constants.CHARSET, buffer, start, end);
    }

    /**
     * Decode a buffer, preferring the specified character set.
     * <p>
     * The fallback order is the one described by
     * {@link #decode(Charset, byte[], int, int)}.
     *
     * @param cs
     *            character set to use when decoding the buffer.
     * @param buffer
     *            buffer to pull raw bytes from.
     * @return a string representation of the entire buffer.
     */
    public static String decode(final Charset cs, final byte[] buffer) {
        return decode(cs, buffer, 0, buffer.length);
    }

    /**
     * Decode a region of the buffer, preferring the specified character set.
     * <p>
     * Decoding is attempted through
     * {@link #decodeNoFallback(Charset, byte[], int, int)}; if every character
     * set tried there rejects the input, each byte is converted to one
     * character as by {@link #extractBinaryString(byte[], int, int)}, so this
     * method never fails because of the content.
     *
     * @param cs
     *            character set to use when decoding the buffer.
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            first position within the buffer to take data from.
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>.
     */
    public static String decode(final Charset cs, final byte[] buffer,
            final int start, final int end) {
        try {
            return decodeNoFallback(cs, buffer, start, end);
        } catch (CharacterCodingException e) {
            // Fall back to an ISO-8859-1 style encoding. At least all of
            // the bytes will be present in the output.
            //
            return extractBinaryString(buffer, start, end);
        }
    }

    /**
     * Decode a region of the buffer, failing if no character set fits.
     * <p>
     * The character sets are tried strictly, in this order:
     * {@link Constants#CHARSET}, then <code>cs</code> (if different), then
     * the platform default (if different from both). The first one that
     * decodes the region without malformed or unmappable input wins.
     *
     * @param cs
     *            character set to use when decoding the buffer.
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            first position within the buffer to take data from.
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>,
     *         after decoding the region through one of the character sets.
     * @throws CharacterCodingException
     *             the input is not in any of the tested character sets.
     */
    public static String decodeNoFallback(final Charset cs,
            final byte[] buffer, final int start, final int end)
            throws CharacterCodingException {
        final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
        // Remember the start so each failed attempt can rewind the buffer.
        b.mark();

        // Try our built-in favorite. The assumption here is that
        // decoding will fail if the data is not actually encoded
        // using that encoder.
        //
        try {
            return decode(b, Constants.CHARSET);
        } catch (CharacterCodingException e) {
            b.reset();
        }

        if (!cs.equals(Constants.CHARSET)) {
            // Try the suggested encoding, it might be right since it was
            // provided by the caller.
            //
            try {
                return decode(b, cs);
            } catch (CharacterCodingException e) {
                b.reset();
            }
        }

        // Try the default character set. A small group of people
        // might actually use the same (or very similar) locale.
        //
        final Charset defcs = Charset.defaultCharset();
        if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
            try {
                return decode(b, defcs);
            } catch (CharacterCodingException e) {
                b.reset();
            }
        }

        throw new CharacterCodingException();
    }

    /**
     * Decode a region of the buffer under the ISO-8859-1 encoding.
     * <p>
     * Each byte is treated as a single character in the 8859-1 character
     * encoding, performing a raw binary-&gt;char conversion.
     *
     * @param buffer
     *            buffer to pull raw bytes from.
     * @param start
     *            first position within the buffer to take data from.
     * @param end
     *            one position past the last location within the buffer to take
     *            data from.
     * @return a string representation of the range <code>[start,end)</code>.
     */
    public static String extractBinaryString(final byte[] buffer,
            final int start, final int end) {
        final StringBuilder r = new StringBuilder(end - start);
        for (int i = start; i < end; i++) {
            // Mask to avoid sign extension of bytes >= 0x80.
            r.append((char) (buffer[i] & 0xff));
        }
        return r.toString();
    }

    /**
     * Strictly decode the remaining bytes of a buffer.
     *
     * @param b
     *            bytes to decode.
     * @param charset
     *            character set to decode with.
     * @return the decoded text.
     * @throws CharacterCodingException
     *             the bytes contain malformed or unmappable input for the
     *             character set.
     */
    private static String decode(final ByteBuffer b, final Charset charset)
            throws CharacterCodingException {
        final CharsetDecoder d = charset.newDecoder();
        d.onMalformedInput(CodingErrorAction.REPORT);
        d.onUnmappableCharacter(CodingErrorAction.REPORT);
        return d.decode(b).toString();
    }

    /**
     * Locate the position of the commit message body.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            commit buffer.
     * @return position of the user's message buffer; -1 if the blank line
     *         which separates headers from the message cannot be found.
     */
    public static final int commitMessage(final byte[] b, int ptr) {
        ptr = skipTreeAndParents(b, ptr);

        // Skip any remaining header lines, ignoring what their actual
        // header line type is. This is identical to the logic for a tag.
        //
        return tagMessage(b, ptr);
    }

    /**
     * Locate the position of the tag message body.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the tag
     *            buffer.
     * @return position of the user's message buffer; -1 if the blank line
     *         which separates headers from the message cannot be found.
     */
    public static final int tagMessage(final byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0) {
            ptr += 48; // skip the "object ..." line.
        }
        while (ptr < sz && b[ptr] != '\n') {
            ptr = nextLF(b, ptr);
        }
        if (ptr < sz && b[ptr] == '\n') {
            return ptr + 1;
        }
        return -1;
    }

    /**
     * Locate the end of a paragraph.
     * <p>
     * A paragraph is ended by two consecutive LF bytes or CRLF pairs
     *
     * @param b
     *            buffer to scan.
     * @param start
     *            position in buffer to start the scan at. Most callers will
     *            want to pass the first position of the commit message (as
     *            found by {@link #commitMessage(byte[], int)}).
     * @return position of the line terminator at the end of the paragraph. If
     *         no blank line is found the scan stops at <code>b.length</code>,
     *         and a line terminator directly before that position is likewise
     *         excluded from the result.
     */
    public static final int endOfParagraph(final byte[] b, final int start) {
        int ptr = start;
        final int sz = b.length;
        // Advance line by line until a line starts with a terminator.
        while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r')) {
            ptr = nextLF(b, ptr);
        }
        // Back up over the terminator (LF, optionally preceded by CR) of
        // the paragraph's last line.
        if (ptr > start && b[ptr - 1] == '\n') {
            ptr--;
        }
        if (ptr > start && b[ptr - 1] == '\r') {
            ptr--;
        }
        return ptr;
    }

    /**
     * Skip the leading "tree" line and any "parent" lines of a commit.
     *
     * @param b
     *            commit buffer to scan.
     * @param ptr
     *            position to start at; only position 0 is assumed to be the
     *            "tree" line.
     * @return position of the first header line which does not start with
     *         'p'; may be at or past the end of the buffer.
     */
    private static int skipTreeAndParents(final byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0) {
            ptr += 46; // skip the "tree ..." line.
        }
        while (ptr < sz && b[ptr] == 'p') {
            ptr += 48; // skip this parent.
        }
        return ptr;
    }

    /**
     * Compute the exclusive end of the name in an identity line.
     *
     * @param raw
     *            buffer holding the identity line.
     * @param nameB
     *            first position of the name.
     * @param emailB
     *            position just after the '&lt;' which opens the email.
     * @return position of the single space separating name and '&lt;' when
     *         there is one; otherwise the position of the '&lt;' itself.
     */
    private static int endOfName(final byte[] raw, final int nameB,
            final int emailB) {
        if (emailB - 2 >= nameB && raw[emailB - 2] == ' ') {
            return emailB - 2;
        }
        return emailB - 1;
    }

    /**
     * Find the last occurrence of a character, ignoring trailing spaces.
     *
     * @param raw
     *            buffer to scan backwards.
     * @param ch
     *            character to find.
     * @param pos
     *            position to start scanning (towards index 0) from.
     * @return position of ch, or -1 if it does not occur.
     */
    private static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
        while (pos >= 0 && raw[pos] == ' ') {
            pos--;
        }

        while (pos >= 0 && raw[pos] != ch) {
            pos--;
        }

        return pos;
    }

    /**
     * Resolve an encoding name through the alias table, or give up.
     *
     * @param name
     *            encoding name as found in the header.
     * @param failure
     *            the exception {@link Charset#forName(String)} raised for the
     *            name.
     * @return the aliased character set. Never null.
     * @throws IllegalArgumentException
     *             <code>failure</code> itself, if there is no alias.
     */
    private static Charset aliasedCharsetOrThrow(String name,
            IllegalArgumentException failure) {
        final Charset aliased = charsetForAlias(name);
        if (aliased == null) {
            throw failure;
        }
        return aliased;
    }

    private static Charset charsetForAlias(String name) {
        return encodingAliases.get(StringUtils.toLowerCase(name));
    }

    private RawParseUtils() {
        // Don't create instances of a static only utility.
    }
}