1 /* 2 * Copyright (C) 2008-2009, Google Inc. 3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> 4 * and other copyright owners as documented in the project's IP log. 5 * 6 * This program and the accompanying materials are made available 7 * under the terms of the Eclipse Distribution License v1.0 which 8 * accompanies this distribution, is reproduced below, and is 9 * available at http://www.eclipse.org/org/documents/edl-v10.php 10 * 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials provided 23 * with the distribution. 24 * 25 * - Neither the name of the Eclipse Foundation, Inc. nor the 26 * names of its contributors may be used to endorse or promote 27 * products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 */ 44 45 package org.eclipse.jgit.util; 46 47 import static java.nio.charset.StandardCharsets.ISO_8859_1; 48 import static java.nio.charset.StandardCharsets.UTF_8; 49 import static org.eclipse.jgit.lib.ObjectChecker.author; 50 import static org.eclipse.jgit.lib.ObjectChecker.committer; 51 import static org.eclipse.jgit.lib.ObjectChecker.encoding; 52 import static org.eclipse.jgit.lib.ObjectChecker.tagger; 53 54 import java.nio.ByteBuffer; 55 import java.nio.charset.CharacterCodingException; 56 import java.nio.charset.Charset; 57 import java.nio.charset.CharsetDecoder; 58 import java.nio.charset.CodingErrorAction; 59 import java.nio.charset.IllegalCharsetNameException; 60 import java.nio.charset.UnsupportedCharsetException; 61 import java.util.Arrays; 62 import java.util.HashMap; 63 import java.util.Map; 64 65 import org.eclipse.jgit.annotations.Nullable; 66 import org.eclipse.jgit.errors.BinaryBlobException; 67 import org.eclipse.jgit.lib.Constants; 68 import org.eclipse.jgit.lib.PersonIdent; 69 70 /** 71 * Handy utility functions to parse raw object contents. 72 */ 73 public final class RawParseUtils { 74 /** 75 * UTF-8 charset constant. 
76 * 77 * @since 2.2 78 */ 79 public static final Charset UTF8_CHARSET = UTF_8; 80 81 private static final byte[] digits10; 82 83 private static final byte[] digits16; 84 85 private static final byte[] footerLineKeyChars; 86 87 private static final Map<String, Charset> encodingAliases; 88 89 static { 90 encodingAliases = new HashMap<>(); 91 encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ 92 encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ 93 94 digits10 = new byte['9' + 1]; 95 Arrays.fill(digits10, (byte) -1); 96 for (char i = '0'; i <= '9'; i++) 97 digits10[i] = (byte) (i - '0'); 98 99 digits16 = new byte['f' + 1]; 100 Arrays.fill(digits16, (byte) -1); 101 for (char i = '0'; i <= '9'; i++) 102 digits16[i] = (byte) (i - '0'); 103 for (char i = 'a'; i <= 'f'; i++) 104 digits16[i] = (byte) ((i - 'a') + 10); 105 for (char i = 'A'; i <= 'F'; i++) 106 digits16[i] = (byte) ((i - 'A') + 10); 107 108 footerLineKeyChars = new byte['z' + 1]; 109 footerLineKeyChars['-'] = 1; 110 for (char i = '0'; i <= '9'; i++) 111 footerLineKeyChars[i] = 1; 112 for (char i = 'A'; i <= 'Z'; i++) 113 footerLineKeyChars[i] = 1; 114 for (char i = 'a'; i <= 'z'; i++) 115 footerLineKeyChars[i] = 1; 116 } 117 118 /** 119 * Determine if b[ptr] matches src. 120 * 121 * @param b 122 * the buffer to scan. 123 * @param ptr 124 * first position within b, this should match src[0]. 125 * @param src 126 * the buffer to test for equality with b. 127 * @return ptr + src.length if b[ptr..src.length] == src; else -1. 128 */ 129 public static final int match(byte[] b, int ptr, byte[] src) { 130 if (ptr + src.length > b.length) 131 return -1; 132 for (int i = 0; i < src.length; i++, ptr++) 133 if (b[ptr] != src[i]) 134 return -1; 135 return ptr; 136 } 137 138 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', 139 '6', '7', '8', '9' }; 140 141 /** 142 * Format a base 10 numeric into a temporary buffer. 143 * <p> 144 * Formatting is performed backwards. 
The method starts at offset 145 * <code>o-1</code> and ends at <code>o-1-digits</code>, where 146 * <code>digits</code> is the number of positions necessary to store the 147 * base 10 value. 148 * <p> 149 * The argument and return values from this method make it easy to chain 150 * writing, for example: 151 * </p> 152 * 153 * <pre> 154 * final byte[] tmp = new byte[64]; 155 * int ptr = tmp.length; 156 * tmp[--ptr] = '\n'; 157 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); 158 * tmp[--ptr] = ' '; 159 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); 160 * tmp[--ptr] = 0; 161 * final String str = new String(tmp, ptr, tmp.length - ptr); 162 * </pre> 163 * 164 * @param b 165 * buffer to write into. 166 * @param o 167 * one offset past the location where writing will begin; writing 168 * proceeds towards lower index values. 169 * @param value 170 * the value to store. 171 * @return the new offset value <code>o</code>. This is the position of 172 * the last byte written. Additional writing should start at one 173 * position earlier. 174 */ 175 public static int formatBase10(final byte[] b, int o, int value) { 176 if (value == 0) { 177 b[--o] = '0'; 178 return o; 179 } 180 final boolean isneg = value < 0; 181 if (isneg) 182 value = -value; 183 while (value != 0) { 184 b[--o] = base10byte[value % 10]; 185 value /= 10; 186 } 187 if (isneg) 188 b[--o] = '-'; 189 return o; 190 } 191 192 /** 193 * Parse a base 10 numeric from a sequence of ASCII digits into an int. 194 * <p> 195 * Digit sequences can begin with an optional run of spaces before the 196 * sequence, and may start with a '+' or a '-' to indicate sign position. 197 * Any other characters will cause the method to stop and return the current 198 * result to the caller. 199 * 200 * @param b 201 * buffer to scan. 202 * @param ptr 203 * position within buffer to start parsing digits at. 204 * @param ptrResult 205 * optional location to return the new ptr value through. If null 206 * the ptr value will be discarded. 
207 * @return the value at this location; 0 if the location is not a valid 208 * numeric. 209 */ 210 public static final int parseBase10(final byte[] b, int ptr, 211 final MutableInteger ptrResult) { 212 int r = 0; 213 int sign = 0; 214 try { 215 final int sz = b.length; 216 while (ptr < sz && b[ptr] == ' ') 217 ptr++; 218 if (ptr >= sz) 219 return 0; 220 221 switch (b[ptr]) { 222 case '-': 223 sign = -1; 224 ptr++; 225 break; 226 case '+': 227 ptr++; 228 break; 229 } 230 231 while (ptr < sz) { 232 final byte v = digits10[b[ptr]]; 233 if (v < 0) 234 break; 235 r = (r * 10) + v; 236 ptr++; 237 } 238 } catch (ArrayIndexOutOfBoundsException e) { 239 // Not a valid digit. 240 } 241 if (ptrResult != null) 242 ptrResult.value = ptr; 243 return sign < 0 ? -r : r; 244 } 245 246 /** 247 * Parse a base 10 numeric from a sequence of ASCII digits into a long. 248 * <p> 249 * Digit sequences can begin with an optional run of spaces before the 250 * sequence, and may start with a '+' or a '-' to indicate sign position. 251 * Any other characters will cause the method to stop and return the current 252 * result to the caller. 253 * 254 * @param b 255 * buffer to scan. 256 * @param ptr 257 * position within buffer to start parsing digits at. 258 * @param ptrResult 259 * optional location to return the new ptr value through. If null 260 * the ptr value will be discarded. 261 * @return the value at this location; 0 if the location is not a valid 262 * numeric. 
263 */ 264 public static final long parseLongBase10(final byte[] b, int ptr, 265 final MutableInteger ptrResult) { 266 long r = 0; 267 int sign = 0; 268 try { 269 final int sz = b.length; 270 while (ptr < sz && b[ptr] == ' ') 271 ptr++; 272 if (ptr >= sz) 273 return 0; 274 275 switch (b[ptr]) { 276 case '-': 277 sign = -1; 278 ptr++; 279 break; 280 case '+': 281 ptr++; 282 break; 283 } 284 285 while (ptr < sz) { 286 final byte v = digits10[b[ptr]]; 287 if (v < 0) 288 break; 289 r = (r * 10) + v; 290 ptr++; 291 } 292 } catch (ArrayIndexOutOfBoundsException e) { 293 // Not a valid digit. 294 } 295 if (ptrResult != null) 296 ptrResult.value = ptr; 297 return sign < 0 ? -r : r; 298 } 299 300 /** 301 * Parse 4 character base 16 (hex) formatted string to unsigned integer. 302 * <p> 303 * The number is read in network byte order, that is, most significant 304 * nybble first. 305 * 306 * @param bs 307 * buffer to parse digits from; positions {@code [p, p+4)} will 308 * be parsed. 309 * @param p 310 * first position within the buffer to parse. 311 * @return the integer value. 312 * @throws java.lang.ArrayIndexOutOfBoundsException 313 * if the string is not hex formatted. 314 */ 315 public static final int parseHexInt16(final byte[] bs, final int p) { 316 int r = digits16[bs[p]] << 4; 317 318 r |= digits16[bs[p + 1]]; 319 r <<= 4; 320 321 r |= digits16[bs[p + 2]]; 322 r <<= 4; 323 324 r |= digits16[bs[p + 3]]; 325 if (r < 0) 326 throw new ArrayIndexOutOfBoundsException(); 327 return r; 328 } 329 330 /** 331 * Parse 8 character base 16 (hex) formatted string to unsigned integer. 332 * <p> 333 * The number is read in network byte order, that is, most significant 334 * nybble first. 335 * 336 * @param bs 337 * buffer to parse digits from; positions {@code [p, p+8)} will 338 * be parsed. 339 * @param p 340 * first position within the buffer to parse. 341 * @return the integer value. 342 * @throws java.lang.ArrayIndexOutOfBoundsException 343 * if the string is not hex formatted. 
344 */ 345 public static final int parseHexInt32(final byte[] bs, final int p) { 346 int r = digits16[bs[p]] << 4; 347 348 r |= digits16[bs[p + 1]]; 349 r <<= 4; 350 351 r |= digits16[bs[p + 2]]; 352 r <<= 4; 353 354 r |= digits16[bs[p + 3]]; 355 r <<= 4; 356 357 r |= digits16[bs[p + 4]]; 358 r <<= 4; 359 360 r |= digits16[bs[p + 5]]; 361 r <<= 4; 362 363 r |= digits16[bs[p + 6]]; 364 365 final int last = digits16[bs[p + 7]]; 366 if (r < 0 || last < 0) 367 throw new ArrayIndexOutOfBoundsException(); 368 return (r << 4) | last; 369 } 370 371 /** 372 * Parse 16 character base 16 (hex) formatted string to unsigned long. 373 * <p> 374 * The number is read in network byte order, that is, most significant 375 * nibble first. 376 * 377 * @param bs 378 * buffer to parse digits from; positions {@code [p, p+16)} will 379 * be parsed. 380 * @param p 381 * first position within the buffer to parse. 382 * @return the integer value. 383 * @throws java.lang.ArrayIndexOutOfBoundsException 384 * if the string is not hex formatted. 
385 * @since 4.3 386 */ 387 public static final long parseHexInt64(final byte[] bs, final int p) { 388 long r = digits16[bs[p]] << 4; 389 390 r |= digits16[bs[p + 1]]; 391 r <<= 4; 392 393 r |= digits16[bs[p + 2]]; 394 r <<= 4; 395 396 r |= digits16[bs[p + 3]]; 397 r <<= 4; 398 399 r |= digits16[bs[p + 4]]; 400 r <<= 4; 401 402 r |= digits16[bs[p + 5]]; 403 r <<= 4; 404 405 r |= digits16[bs[p + 6]]; 406 r <<= 4; 407 408 r |= digits16[bs[p + 7]]; 409 r <<= 4; 410 411 r |= digits16[bs[p + 8]]; 412 r <<= 4; 413 414 r |= digits16[bs[p + 9]]; 415 r <<= 4; 416 417 r |= digits16[bs[p + 10]]; 418 r <<= 4; 419 420 r |= digits16[bs[p + 11]]; 421 r <<= 4; 422 423 r |= digits16[bs[p + 12]]; 424 r <<= 4; 425 426 r |= digits16[bs[p + 13]]; 427 r <<= 4; 428 429 r |= digits16[bs[p + 14]]; 430 431 final int last = digits16[bs[p + 15]]; 432 if (r < 0 || last < 0) 433 throw new ArrayIndexOutOfBoundsException(); 434 return (r << 4) | last; 435 } 436 437 /** 438 * Parse a single hex digit to its numeric value (0-15). 439 * 440 * @param digit 441 * hex character to parse. 442 * @return numeric value, in the range 0-15. 443 * @throws java.lang.ArrayIndexOutOfBoundsException 444 * if the input digit is not a valid hex digit. 445 */ 446 public static final int parseHexInt4(final byte digit) { 447 final byte r = digits16[digit]; 448 if (r < 0) 449 throw new ArrayIndexOutOfBoundsException(); 450 return r; 451 } 452 453 /** 454 * Parse a Git style timezone string. 455 * <p> 456 * The sequence "-0315" will be parsed as the numeric value -195, as the 457 * lower two positions count minutes, not 100ths of an hour. 458 * 459 * @param b 460 * buffer to scan. 461 * @param ptr 462 * position within buffer to start parsing digits at. 463 * @return the timezone at this location, expressed in minutes. 464 */ 465 public static final int parseTimeZoneOffset(byte[] b, int ptr) { 466 return parseTimeZoneOffset(b, ptr, null); 467 } 468 469 /** 470 * Parse a Git style timezone string. 
471 * <p> 472 * The sequence "-0315" will be parsed as the numeric value -195, as the 473 * lower two positions count minutes, not 100ths of an hour. 474 * 475 * @param b 476 * buffer to scan. 477 * @param ptr 478 * position within buffer to start parsing digits at. 479 * @param ptrResult 480 * optional location to return the new ptr value through. If null 481 * the ptr value will be discarded. 482 * @return the timezone at this location, expressed in minutes. 483 * @since 4.1 484 */ 485 public static final int parseTimeZoneOffset(final byte[] b, int ptr, 486 MutableInteger ptrResult) { 487 final int v = parseBase10(b, ptr, ptrResult); 488 final int tzMins = v % 100; 489 final int tzHours = v / 100; 490 return tzHours * 60 + tzMins; 491 } 492 493 /** 494 * Locate the first position after a given character. 495 * 496 * @param b 497 * buffer to scan. 498 * @param ptr 499 * position within buffer to start looking for chrA at. 500 * @param chrA 501 * character to find. 502 * @return new position just after chrA. 503 */ 504 public static final int next(byte[] b, int ptr, char chrA) { 505 final int sz = b.length; 506 while (ptr < sz) { 507 if (b[ptr++] == chrA) 508 return ptr; 509 } 510 return ptr; 511 } 512 513 /** 514 * Locate the first position after the next LF. 515 * <p> 516 * This method stops on the first '\n' it finds. 517 * 518 * @param b 519 * buffer to scan. 520 * @param ptr 521 * position within buffer to start looking for LF at. 522 * @return new position just after the first LF found. 523 */ 524 public static final int nextLF(byte[] b, int ptr) { 525 return next(b, ptr, '\n'); 526 } 527 528 /** 529 * Locate the first position after either the given character or LF. 530 * <p> 531 * This method stops on the first match it finds from either chrA or '\n'. 532 * 533 * @param b 534 * buffer to scan. 535 * @param ptr 536 * position within buffer to start looking for chrA or LF at. 537 * @param chrA 538 * character to find. 
539 * @return new position just after the first chrA or LF to be found. 540 */ 541 public static final int nextLF(byte[] b, int ptr, char chrA) { 542 final int sz = b.length; 543 while (ptr < sz) { 544 final byte c = b[ptr++]; 545 if (c == chrA || c == '\n') 546 return ptr; 547 } 548 return ptr; 549 } 550 551 /** 552 * Locate the end of the header. Note that headers may be 553 * more than one line long. 554 * @param b 555 * buffer to scan. 556 * @param ptr 557 * position within buffer to start looking for the end-of-header. 558 * @return new position just after the header. This is either 559 * b.length, or the index of the header's terminating newline. 560 * @since 5.1 561 */ 562 public static final int headerEnd(final byte[] b, int ptr) { 563 final int sz = b.length; 564 while (ptr < sz) { 565 final byte c = b[ptr++]; 566 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) { 567 return ptr - 1; 568 } 569 } 570 return ptr - 1; 571 } 572 573 /** 574 * Find the start of the contents of a given header. 575 * 576 * @param b 577 * buffer to scan. 578 * @param headerName 579 * header to search for 580 * @param ptr 581 * position within buffer to start looking for header at. 582 * @return new position at the start of the header's contents, -1 for 583 * not found 584 * @since 5.1 585 */ 586 public static final int headerStart(byte[] headerName, byte[] b, int ptr) { 587 // Start by advancing to just past a LF or buffer start 588 if (ptr != 0) { 589 ptr = nextLF(b, ptr - 1); 590 } 591 while (ptr < b.length - (headerName.length + 1)) { 592 boolean found = true; 593 for (int i = 0; i < headerName.length; i++) { 594 if (headerName[i] != b[ptr++]) { 595 found = false; 596 break; 597 } 598 } 599 if (found && b[ptr++] == ' ') { 600 return ptr; 601 } 602 ptr = nextLF(b, ptr); 603 } 604 return -1; 605 } 606 607 /** 608 * Locate the first position before a given character. 609 * 610 * @param b 611 * buffer to scan. 
612 * @param ptr 613 * position within buffer to start looking for chrA at. 614 * @param chrA 615 * character to find. 616 * @return new position just before chrA, -1 for not found 617 */ 618 public static final int prev(byte[] b, int ptr, char chrA) { 619 if (ptr == b.length) 620 --ptr; 621 while (ptr >= 0) { 622 if (b[ptr--] == chrA) 623 return ptr; 624 } 625 return ptr; 626 } 627 628 /** 629 * Locate the first position before the previous LF. 630 * <p> 631 * This method stops on the first '\n' it finds. 632 * 633 * @param b 634 * buffer to scan. 635 * @param ptr 636 * position within buffer to start looking for LF at. 637 * @return new position just before the first LF found, -1 for not found 638 */ 639 public static final int prevLF(byte[] b, int ptr) { 640 return prev(b, ptr, '\n'); 641 } 642 643 /** 644 * Locate the previous position before either the given character or LF. 645 * <p> 646 * This method stops on the first match it finds from either chrA or '\n'. 647 * 648 * @param b 649 * buffer to scan. 650 * @param ptr 651 * position within buffer to start looking for chrA or LF at. 652 * @param chrA 653 * character to find. 654 * @return new position just before the first chrA or LF to be found, -1 for 655 * not found 656 */ 657 public static final int prevLF(byte[] b, int ptr, char chrA) { 658 if (ptr == b.length) 659 --ptr; 660 while (ptr >= 0) { 661 final byte c = b[ptr--]; 662 if (c == chrA || c == '\n') 663 return ptr; 664 } 665 return ptr; 666 } 667 668 /** 669 * Index the region between <code>[ptr, end)</code> to find line starts. 670 * <p> 671 * The returned list is 1 indexed. Index 0 contains 672 * {@link java.lang.Integer#MIN_VALUE} to pad the list out. 673 * <p> 674 * Using a 1 indexed list means that line numbers can be directly accessed 675 * from the list, so <code>list.get(1)</code> (aka get line 1) returns 676 * <code>ptr</code>. 677 * <p> 678 * The last element (index <code>map.size()-1</code>) always contains 679 * <code>end</code>. 
680 * 681 * @param buf 682 * buffer to scan. 683 * @param ptr 684 * position within the buffer corresponding to the first byte of 685 * line 1. 686 * @param end 687 * 1 past the end of the content within <code>buf</code>. 688 * @return a line map indicating the starting position of each line. 689 */ 690 public static final IntList lineMap(byte[] buf, int ptr, int end) { 691 IntList map = new IntList((end - ptr) / 36); 692 map.fillTo(1, Integer.MIN_VALUE); 693 for (; ptr < end; ptr = nextLF(buf, ptr)) { 694 map.add(ptr); 695 } 696 map.add(end); 697 return map; 698 } 699 700 /** 701 * Like {@link #lineMap(byte[], int, int)} but throw 702 * {@link BinaryBlobException} if a NUL byte is encountered. 703 * 704 * @param buf 705 * buffer to scan. 706 * @param ptr 707 * position within the buffer corresponding to the first byte of 708 * line 1. 709 * @param end 710 * 1 past the end of the content within <code>buf</code>. 711 * @return a line map indicating the starting position of each line. 712 * @throws BinaryBlobException 713 * if a NUL byte is found. 714 * @since 5.0 715 */ 716 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end) 717 throws BinaryBlobException { 718 IntList map = lineMapOrNull(buf, ptr, end); 719 if (map == null) { 720 throw new BinaryBlobException(); 721 } 722 return map; 723 } 724 725 @Nullable 726 private static IntList lineMapOrNull(byte[] buf, int ptr, int end) { 727 // Experimentally derived from multiple source repositories 728 // the average number of bytes/line is 36. Its a rough guess 729 // to initially size our map close to the target. 730 IntList map = new IntList((end - ptr) / 36); 731 map.add(Integer.MIN_VALUE); 732 boolean foundLF = true; 733 for (; ptr < end; ptr++) { 734 if (foundLF) { 735 map.add(ptr); 736 } 737 738 if (buf[ptr] == '\0') { 739 return null; 740 } 741 742 foundLF = (buf[ptr] == '\n'); 743 } 744 map.add(end); 745 return map; 746 } 747 748 /** 749 * Locate the "author " header line data. 
750 * 751 * @param b 752 * buffer to scan. 753 * @param ptr 754 * position in buffer to start the scan at. Most callers should 755 * pass 0 to ensure the scan starts from the beginning of the 756 * commit buffer and does not accidentally look at message body. 757 * @return position just after the space in "author ", so the first 758 * character of the author's name. If no author header can be 759 * located -1 is returned. 760 */ 761 public static final int author(byte[] b, int ptr) { 762 final int sz = b.length; 763 if (ptr == 0) 764 ptr += 46; // skip the "tree ..." line. 765 while (ptr < sz && b[ptr] == 'p') 766 ptr += 48; // skip this parent. 767 return match(b, ptr, author); 768 } 769 770 /** 771 * Locate the "committer " header line data. 772 * 773 * @param b 774 * buffer to scan. 775 * @param ptr 776 * position in buffer to start the scan at. Most callers should 777 * pass 0 to ensure the scan starts from the beginning of the 778 * commit buffer and does not accidentally look at message body. 779 * @return position just after the space in "committer ", so the first 780 * character of the committer's name. If no committer header can be 781 * located -1 is returned. 782 */ 783 public static final int committer(byte[] b, int ptr) { 784 final int sz = b.length; 785 if (ptr == 0) 786 ptr += 46; // skip the "tree ..." line. 787 while (ptr < sz && b[ptr] == 'p') 788 ptr += 48; // skip this parent. 789 if (ptr < sz && b[ptr] == 'a') 790 ptr = nextLF(b, ptr); 791 return match(b, ptr, committer); 792 } 793 794 /** 795 * Locate the "tagger " header line data. 796 * 797 * @param b 798 * buffer to scan. 799 * @param ptr 800 * position in buffer to start the scan at. Most callers should 801 * pass 0 to ensure the scan starts from the beginning of the tag 802 * buffer and does not accidentally look at message body. 803 * @return position just after the space in "tagger ", so the first 804 * character of the tagger's name. 
If no tagger header can be 805 * located -1 is returned. 806 */ 807 public static final int tagger(byte[] b, int ptr) { 808 final int sz = b.length; 809 if (ptr == 0) 810 ptr += 48; // skip the "object ..." line. 811 while (ptr < sz) { 812 if (b[ptr] == '\n') 813 return -1; 814 final int m = match(b, ptr, tagger); 815 if (m >= 0) 816 return m; 817 ptr = nextLF(b, ptr); 818 } 819 return -1; 820 } 821 822 /** 823 * Locate the "encoding " header line. 824 * 825 * @param b 826 * buffer to scan. 827 * @param ptr 828 * position in buffer to start the scan at. Most callers should 829 * pass 0 to ensure the scan starts from the beginning of the 830 * buffer and does not accidentally look at the message body. 831 * @return position just after the space in "encoding ", so the first 832 * character of the encoding's name. If no encoding header can be 833 * located -1 is returned (and UTF-8 should be assumed). 834 */ 835 public static final int encoding(byte[] b, int ptr) { 836 final int sz = b.length; 837 while (ptr < sz) { 838 if (b[ptr] == '\n') 839 return -1; 840 if (b[ptr] == 'e') 841 break; 842 ptr = nextLF(b, ptr); 843 } 844 return match(b, ptr, encoding); 845 } 846 847 /** 848 * Parse the "encoding " header as a string. 849 * <p> 850 * Locates the "encoding " header (if present) and returns its value. 851 * 852 * @param b 853 * buffer to scan. 854 * @return the encoding header as specified in the commit; null if the 855 * header was not present and should be assumed. 856 * @since 4.2 857 */ 858 @Nullable 859 public static String parseEncodingName(byte[] b) { 860 int enc = encoding(b, 0); 861 if (enc < 0) { 862 return null; 863 } 864 int lf = nextLF(b, enc); 865 return decode(UTF_8, b, enc, lf - 1); 866 } 867 868 /** 869 * Parse the "encoding " header into a character set reference. 
870 * <p> 871 * Locates the "encoding " header (if present) by first calling 872 * {@link #encoding(byte[], int)} and then returns the proper character set 873 * to apply to this buffer to evaluate its contents as character data. 874 * <p> 875 * If no encoding header is present {@code UTF-8} is assumed. 876 * 877 * @param b 878 * buffer to scan. 879 * @return the Java character set representation. Never null. 880 * @throws IllegalCharsetNameException 881 * if the character set requested by the encoding header is 882 * malformed and unsupportable. 883 * @throws UnsupportedCharsetException 884 * if the JRE does not support the character set requested by 885 * the encoding header. 886 */ 887 public static Charset parseEncoding(byte[] b) { 888 String enc = parseEncodingName(b); 889 if (enc == null) { 890 return UTF_8; 891 } 892 893 String name = enc.trim(); 894 try { 895 return Charset.forName(name); 896 } catch (IllegalCharsetNameException 897 | UnsupportedCharsetException badName) { 898 Charset aliased = charsetForAlias(name); 899 if (aliased != null) { 900 return aliased; 901 } 902 throw badName; 903 } 904 } 905 906 /** 907 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. 908 * <p> 909 * Leading spaces won't be trimmed from the string, i.e. will show up in the 910 * parsed name afterwards. 911 * 912 * @param in 913 * the string to parse a name from. 914 * @return the parsed identity or null in case the identity could not be 915 * parsed. 916 */ 917 public static PersonIdent parsePersonIdent(String in) { 918 return parsePersonIdent(Constants.encode(in), 0); 919 } 920 921 /** 922 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent. 923 * <p> 924 * When passing in a value for <code>nameB</code> callers should use the 925 * return value of {@link #author(byte[], int)} or 926 * {@link #committer(byte[], int)}, as these methods provide the proper 927 * position within the buffer. 
928 * 929 * @param raw 930 * the buffer to parse character data from. 931 * @param nameB 932 * first position of the identity information. This should be the 933 * first position after the space which delimits the header field 934 * name (e.g. "author" or "committer") from the rest of the 935 * identity line. 936 * @return the parsed identity or null in case the identity could not be 937 * parsed. 938 */ 939 public static PersonIdent parsePersonIdent(byte[] raw, int nameB) { 940 Charset cs; 941 try { 942 cs = parseEncoding(raw); 943 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { 944 // Assume UTF-8 for person identities, usually this is correct. 945 // If not decode() will fall back to the ISO-8859-1 encoding. 946 cs = UTF_8; 947 } 948 949 final int emailB = nextLF(raw, nameB, '<'); 950 final int emailE = nextLF(raw, emailB, '>'); 951 if (emailB >= raw.length || raw[emailB] == '\n' || 952 (emailE >= raw.length - 1 && raw[emailE - 1] != '>')) 953 return null; 954 955 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ? 956 emailB - 2 : emailB - 1; 957 final String name = decode(cs, raw, nameB, nameEnd); 958 final String email = decode(cs, raw, emailB, emailE - 1); 959 960 // Start searching from end of line, as after first name-email pair, 961 // another name-email pair may occur. We will ignore all kinds of 962 // "junk" following the first email. 963 // 964 // We've to use (emailE - 1) for the case that raw[email] is LF, 965 // otherwise we would run too far. "-2" is necessary to position 966 // before the LF in case of LF termination resp. the penultimate 967 // character if there is no trailing LF. 
968 final int tzBegin = lastIndexOfTrim(raw, ' ', 969 nextLF(raw, emailE - 1) - 2) + 1; 970 if (tzBegin <= emailE) // No time/zone, still valid 971 return new PersonIdent(name, email, 0, 0); 972 973 final int whenBegin = Math.max(emailE, 974 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1); 975 if (whenBegin >= tzBegin - 1) // No time/zone, still valid 976 return new PersonIdent(name, email, 0, 0); 977 978 final long when = parseLongBase10(raw, whenBegin, null); 979 final int tz = parseTimeZoneOffset(raw, tzBegin); 980 return new PersonIdent(name, email, when * 1000L, tz); 981 } 982 983 /** 984 * Parse a name data (e.g. as within a reflog) into a PersonIdent. 985 * <p> 986 * When passing in a value for <code>nameB</code> callers should use the 987 * return value of {@link #author(byte[], int)} or 988 * {@link #committer(byte[], int)}, as these methods provide the proper 989 * position within the buffer. 990 * 991 * @param raw 992 * the buffer to parse character data from. 993 * @param nameB 994 * first position of the identity information. This should be the 995 * first position after the space which delimits the header field 996 * name (e.g. "author" or "committer") from the rest of the 997 * identity line. 998 * @return the parsed identity. Never null. 
999 */ 1000 public static PersonIdent parsePersonIdentOnly(final byte[] raw, 1001 final int nameB) { 1002 int stop = nextLF(raw, nameB); 1003 int emailB = nextLF(raw, nameB, '<'); 1004 int emailE = nextLF(raw, emailB, '>'); 1005 final String name; 1006 final String email; 1007 if (emailE < stop) { 1008 email = decode(raw, emailB, emailE - 1); 1009 } else { 1010 email = "invalid"; //$NON-NLS-1$ 1011 } 1012 if (emailB < stop) 1013 name = decode(raw, nameB, emailB - 2); 1014 else 1015 name = decode(raw, nameB, stop); 1016 1017 final MutableInteger.html#MutableInteger">MutableInteger ptrout = new MutableInteger(); 1018 long when; 1019 int tz; 1020 if (emailE < stop) { 1021 when = parseLongBase10(raw, emailE + 1, ptrout); 1022 tz = parseTimeZoneOffset(raw, ptrout.value); 1023 } else { 1024 when = 0; 1025 tz = 0; 1026 } 1027 return new PersonIdent(name, email, when * 1000L, tz); 1028 } 1029 1030 /** 1031 * Locate the end of a footer line key string. 1032 * <p> 1033 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. 1034 * "Signed-off-by: A. U. Thor\n") then this method returns the position of 1035 * the first ':'. 1036 * <p> 1037 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} 1038 * then this method returns -1. 1039 * 1040 * @param raw 1041 * buffer to scan. 1042 * @param ptr 1043 * first position within raw to consider as a footer line key. 1044 * @return position of the ':' which terminates the footer line key if this 1045 * is otherwise a valid footer line key; otherwise -1. 1046 */ 1047 public static int endOfFooterLineKey(byte[] raw, int ptr) { 1048 try { 1049 for (;;) { 1050 final byte c = raw[ptr]; 1051 if (footerLineKeyChars[c] == 0) { 1052 if (c == ':') 1053 return ptr; 1054 return -1; 1055 } 1056 ptr++; 1057 } 1058 } catch (ArrayIndexOutOfBoundsException e) { 1059 return -1; 1060 } 1061 } 1062 1063 /** 1064 * Decode a buffer under UTF-8, if possible. 
1065 * 1066 * If the byte stream cannot be decoded that way, the platform default is tried 1067 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1068 * 1069 * @param buffer 1070 * buffer to pull raw bytes from. 1071 * @return a string representation of the range <code>[start,end)</code>, 1072 * after decoding the region through the specified character set. 1073 */ 1074 public static String decode(byte[] buffer) { 1075 return decode(buffer, 0, buffer.length); 1076 } 1077 1078 /** 1079 * Decode a buffer under UTF-8, if possible. 1080 * 1081 * If the byte stream cannot be decoded that way, the platform default is 1082 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1083 * 1084 * @param buffer 1085 * buffer to pull raw bytes from. 1086 * @param start 1087 * start position in buffer 1088 * @param end 1089 * one position past the last location within the buffer to take 1090 * data from. 1091 * @return a string representation of the range <code>[start,end)</code>, 1092 * after decoding the region through the specified character set. 1093 */ 1094 public static String decode(final byte[] buffer, final int start, 1095 final int end) { 1096 return decode(UTF_8, buffer, start, end); 1097 } 1098 1099 /** 1100 * Decode a buffer under the specified character set if possible. 1101 * 1102 * If the byte stream cannot be decoded that way, the platform default is tried 1103 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1104 * 1105 * @param cs 1106 * character set to use when decoding the buffer. 1107 * @param buffer 1108 * buffer to pull raw bytes from. 1109 * @return a string representation of the range <code>[start,end)</code>, 1110 * after decoding the region through the specified character set. 
1111 */ 1112 public static String decode(Charset cs, byte[] buffer) { 1113 return decode(cs, buffer, 0, buffer.length); 1114 } 1115 1116 /** 1117 * Decode a region of the buffer under the specified character set if possible. 1118 * 1119 * If the byte stream cannot be decoded that way, the platform default is tried 1120 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1121 * 1122 * @param cs 1123 * character set to use when decoding the buffer. 1124 * @param buffer 1125 * buffer to pull raw bytes from. 1126 * @param start 1127 * first position within the buffer to take data from. 1128 * @param end 1129 * one position past the last location within the buffer to take 1130 * data from. 1131 * @return a string representation of the range <code>[start,end)</code>, 1132 * after decoding the region through the specified character set. 1133 */ 1134 public static String decode(final Charset cs, final byte[] buffer, 1135 final int start, final int end) { 1136 try { 1137 return decodeNoFallback(cs, buffer, start, end); 1138 } catch (CharacterCodingException e) { 1139 // Fall back to an ISO-8859-1 style encoding. At least all of 1140 // the bytes will be present in the output. 1141 // 1142 return extractBinaryString(buffer, start, end); 1143 } 1144 } 1145 1146 /** 1147 * Decode a region of the buffer under the specified character set if 1148 * possible. 1149 * 1150 * If the byte stream cannot be decoded that way, the platform default is 1151 * tried and if that too fails, an exception is thrown. 1152 * 1153 * @param cs 1154 * character set to use when decoding the buffer. 1155 * @param buffer 1156 * buffer to pull raw bytes from. 1157 * @param start 1158 * first position within the buffer to take data from. 1159 * @param end 1160 * one position past the last location within the buffer to take 1161 * data from. 1162 * @return a string representation of the range <code>[start,end)</code>, 1163 * after decoding the region through the specified character set. 
1164 * @throws java.nio.charset.CharacterCodingException 1165 * the input is not in any of the tested character sets. 1166 */ 1167 public static String decodeNoFallback(final Charset cs, 1168 final byte[] buffer, final int start, final int end) 1169 throws CharacterCodingException { 1170 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); 1171 b.mark(); 1172 1173 // Try our built-in favorite. The assumption here is that 1174 // decoding will fail if the data is not actually encoded 1175 // using that encoder. 1176 try { 1177 return decode(b, UTF_8); 1178 } catch (CharacterCodingException e) { 1179 b.reset(); 1180 } 1181 1182 if (!cs.equals(UTF_8)) { 1183 // Try the suggested encoding, it might be right since it was 1184 // provided by the caller. 1185 try { 1186 return decode(b, cs); 1187 } catch (CharacterCodingException e) { 1188 b.reset(); 1189 } 1190 } 1191 1192 // Try the default character set. A small group of people 1193 // might actually use the same (or very similar) locale. 1194 Charset defcs = Charset.defaultCharset(); 1195 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { 1196 try { 1197 return decode(b, defcs); 1198 } catch (CharacterCodingException e) { 1199 b.reset(); 1200 } 1201 } 1202 1203 throw new CharacterCodingException(); 1204 } 1205 1206 /** 1207 * Decode a region of the buffer under the ISO-8859-1 encoding. 1208 * 1209 * Each byte is treated as a single character in the 8859-1 character 1210 * encoding, performing a raw binary->char conversion. 1211 * 1212 * @param buffer 1213 * buffer to pull raw bytes from. 1214 * @param start 1215 * first position within the buffer to take data from. 1216 * @param end 1217 * one position past the last location within the buffer to take 1218 * data from. 1219 * @return a string representation of the range <code>[start,end)</code>. 
*/
    public static String extractBinaryString(final byte[] buffer,
            final int start, final int end) {
        // ISO-8859-1 maps every byte value to the character with the same
        // code point, so the standard decoder performs exactly the
        // byte-to-char widening this method promises.
        return new String(buffer, start, end - start, ISO_8859_1);
    }

    /**
     * Strictly decode the remaining bytes of {@code b} with {@code charset}.
     *
     * @param b
     *            bytes to decode.
     * @param charset
     *            character set to decode with.
     * @return the decoded text.
     * @throws CharacterCodingException
     *             the bytes contain malformed or unmappable input for
     *             {@code charset}.
     */
    private static String decode(ByteBuffer b, Charset charset)
            throws CharacterCodingException {
        final CharsetDecoder d = charset.newDecoder();
        // REPORT makes bad input raise an exception instead of being
        // silently replaced, which lets callers try another charset.
        d.onMalformedInput(CodingErrorAction.REPORT);
        d.onUnmappableCharacter(CodingErrorAction.REPORT);
        return d.decode(b).toString();
    }

    /**
     * Locate the position of the commit message body.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the
     *            commit buffer.
     * @return position of the user's message buffer, or -1 if the blank line
     *         that ends the headers was not found.
     */
    public static final int commitMessage(byte[] b, int ptr) {
        final int sz = b.length;
        // NOTE(review): the fixed widths 46 and 48 presumably correspond to
        // "tree " / "parent " plus a 40 character hex id and LF - confirm
        // before using with any other object id length.
        if (ptr == 0)
            ptr += 46; // skip the "tree ..." line.
        while (ptr < sz && b[ptr] == 'p')
            ptr += 48; // skip this parent.

        // Skip any remaining header lines, ignoring what their actual
        // header line type is. This is identical to the logic for a tag.
        //
        return tagMessage(b, ptr);
    }

    /**
     * Locate the position of the tag message body.
     *
     * @param b
     *            buffer to scan.
     * @param ptr
     *            position in buffer to start the scan at. Most callers should
     *            pass 0 to ensure the scan starts from the beginning of the tag
     *            buffer.
     * @return position of the user's message buffer, or -1 if the blank line
     *         that ends the headers was not found.
     */
    public static final int tagMessage(byte[] b, int ptr) {
        final int sz = b.length;
        if (ptr == 0)
            ptr += 48; // skip the "object ..." line.
        // Advance one header line at a time until a line that starts with
        // LF (the empty separator line) or the end of the buffer.
        while (ptr < sz && b[ptr] != '\n')
            ptr = nextLF(b, ptr);
        if (ptr < sz && b[ptr] == '\n')
            return ptr + 1;
        return -1;
    }

    /**
     * Locate the end of a paragraph.
     * <p>
     * A paragraph is ended by two consecutive LF bytes or CRLF pairs.
     *
     * @param b
     *            buffer to scan.
     * @param start
     *            position in buffer to start the scan at. Most callers will
     *            want to pass the first position of the commit message (as
     *            found by {@link #commitMessage(byte[], int)}).
     * @return position of the LF at the end of the paragraph;
     *         <code>b.length</code> if no paragraph end could be located.
     */
    public static final int endOfParagraph(byte[] b, int start) {
        int ptr = start;
        final int sz = b.length;
        // Walk line by line until a line that begins with a line terminator.
        while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
            ptr = nextLF(b, ptr);
        // Step back over the terminator (LF, optionally preceded by CR) of
        // the paragraph's last line.
        if (ptr > start && b[ptr - 1] == '\n')
            ptr--;
        if (ptr > start && b[ptr - 1] == '\r')
            ptr--;
        return ptr;
    }

    /**
     * Get last index of {@code ch} in raw, trimming spaces.
     * <p>
     * Scanning runs backwards from {@code pos}: first any run of spaces is
     * skipped, then the search continues until {@code ch} is found.
     *
     * @param raw
     *            buffer to scan.
     * @param ch
     *            character to find.
     * @param pos
     *            starting position.
     * @return last index of {@code ch} in raw at or before the trimmed
     *         position; -1 if it does not occur.
     * @since 4.1
     */
    public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
        while (pos >= 0 && raw[pos] == ' ')
            pos--;

        while (pos >= 0 && raw[pos] != ch)
            pos--;

        return pos;
    }

    /**
     * Look up a character set by alias, using the lower-cased name as key.
     *
     * @param name
     *            alias to look up.
     * @return the value registered for the alias in {@code encodingAliases}.
     */
    private static Charset charsetForAlias(String name) {
        return encodingAliases.get(StringUtils.toLowerCase(name));
    }

    private RawParseUtils() {
        // Don't create instances of a static only utility.
    }
}