1 /* 2 * Copyright (C) 2008-2009, Google Inc. 3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> 4 * and other copyright owners as documented in the project's IP log. 5 * 6 * This program and the accompanying materials are made available 7 * under the terms of the Eclipse Distribution License v1.0 which 8 * accompanies this distribution, is reproduced below, and is 9 * available at http://www.eclipse.org/org/documents/edl-v10.php 10 * 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials provided 23 * with the distribution. 24 * 25 * - Neither the name of the Eclipse Foundation, Inc. nor the 26 * names of its contributors may be used to endorse or promote 27 * products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 */ 44 45 package org.eclipse.jgit.util; 46 47 import static java.nio.charset.StandardCharsets.ISO_8859_1; 48 import static java.nio.charset.StandardCharsets.UTF_8; 49 import static org.eclipse.jgit.lib.ObjectChecker.author; 50 import static org.eclipse.jgit.lib.ObjectChecker.committer; 51 import static org.eclipse.jgit.lib.ObjectChecker.encoding; 52 import static org.eclipse.jgit.lib.ObjectChecker.tagger; 53 54 import java.nio.ByteBuffer; 55 import java.nio.charset.CharacterCodingException; 56 import java.nio.charset.Charset; 57 import java.nio.charset.CharsetDecoder; 58 import java.nio.charset.CodingErrorAction; 59 import java.nio.charset.IllegalCharsetNameException; 60 import java.nio.charset.UnsupportedCharsetException; 61 import java.util.Arrays; 62 import java.util.HashMap; 63 import java.util.Map; 64 65 import org.eclipse.jgit.annotations.Nullable; 66 import org.eclipse.jgit.errors.BinaryBlobException; 67 import org.eclipse.jgit.lib.Constants; 68 import org.eclipse.jgit.lib.PersonIdent; 69 70 /** 71 * Handy utility functions to parse raw object contents. 72 */ 73 public final class RawParseUtils { 74 /** 75 * UTF-8 charset constant. 
76 * 77 * @since 2.2 78 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead 79 */ 80 @Deprecated 81 public static final Charset UTF8_CHARSET = UTF_8; 82 83 private static final byte[] digits10; 84 85 private static final byte[] digits16; 86 87 private static final byte[] footerLineKeyChars; 88 89 private static final Map<String, Charset> encodingAliases; 90 91 static { 92 encodingAliases = new HashMap<>(); 93 encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ 94 encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ 95 96 digits10 = new byte['9' + 1]; 97 Arrays.fill(digits10, (byte) -1); 98 for (char i = '0'; i <= '9'; i++) 99 digits10[i] = (byte) (i - '0'); 100 101 digits16 = new byte['f' + 1]; 102 Arrays.fill(digits16, (byte) -1); 103 for (char i = '0'; i <= '9'; i++) 104 digits16[i] = (byte) (i - '0'); 105 for (char i = 'a'; i <= 'f'; i++) 106 digits16[i] = (byte) ((i - 'a') + 10); 107 for (char i = 'A'; i <= 'F'; i++) 108 digits16[i] = (byte) ((i - 'A') + 10); 109 110 footerLineKeyChars = new byte['z' + 1]; 111 footerLineKeyChars['-'] = 1; 112 for (char i = '0'; i <= '9'; i++) 113 footerLineKeyChars[i] = 1; 114 for (char i = 'A'; i <= 'Z'; i++) 115 footerLineKeyChars[i] = 1; 116 for (char i = 'a'; i <= 'z'; i++) 117 footerLineKeyChars[i] = 1; 118 } 119 120 /** 121 * Determine if b[ptr] matches src. 122 * 123 * @param b 124 * the buffer to scan. 125 * @param ptr 126 * first position within b, this should match src[0]. 127 * @param src 128 * the buffer to test for equality with b. 129 * @return ptr + src.length if b[ptr..src.length] == src; else -1. 
130 */ 131 public static final int match(byte[] b, int ptr, byte[] src) { 132 if (ptr + src.length > b.length) 133 return -1; 134 for (int i = 0; i < src.length; i++, ptr++) 135 if (b[ptr] != src[i]) 136 return -1; 137 return ptr; 138 } 139 140 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', 141 '6', '7', '8', '9' }; 142 143 /** 144 * Format a base 10 numeric into a temporary buffer. 145 * <p> 146 * Formatting is performed backwards. The method starts at offset 147 * <code>o-1</code> and ends at <code>o-1-digits</code>, where 148 * <code>digits</code> is the number of positions necessary to store the 149 * base 10 value. 150 * <p> 151 * The argument and return values from this method make it easy to chain 152 * writing, for example: 153 * </p> 154 * 155 * <pre> 156 * final byte[] tmp = new byte[64]; 157 * int ptr = tmp.length; 158 * tmp[--ptr] = '\n'; 159 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); 160 * tmp[--ptr] = ' '; 161 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); 162 * tmp[--ptr] = 0; 163 * final String str = new String(tmp, ptr, tmp.length - ptr); 164 * </pre> 165 * 166 * @param b 167 * buffer to write into. 168 * @param o 169 * one offset past the location where writing will begin; writing 170 * proceeds towards lower index values. 171 * @param value 172 * the value to store. 173 * @return the new offset value <code>o</code>. This is the position of 174 * the last byte written. Additional writing should start at one 175 * position earlier. 176 */ 177 public static int formatBase10(final byte[] b, int o, int value) { 178 if (value == 0) { 179 b[--o] = '0'; 180 return o; 181 } 182 final boolean isneg = value < 0; 183 if (isneg) 184 value = -value; 185 while (value != 0) { 186 b[--o] = base10byte[value % 10]; 187 value /= 10; 188 } 189 if (isneg) 190 b[--o] = '-'; 191 return o; 192 } 193 194 /** 195 * Parse a base 10 numeric from a sequence of ASCII digits into an int. 
196 * <p> 197 * Digit sequences can begin with an optional run of spaces before the 198 * sequence, and may start with a '+' or a '-' to indicate sign position. 199 * Any other characters will cause the method to stop and return the current 200 * result to the caller. 201 * 202 * @param b 203 * buffer to scan. 204 * @param ptr 205 * position within buffer to start parsing digits at. 206 * @param ptrResult 207 * optional location to return the new ptr value through. If null 208 * the ptr value will be discarded. 209 * @return the value at this location; 0 if the location is not a valid 210 * numeric. 211 */ 212 public static final int parseBase10(final byte[] b, int ptr, 213 final MutableInteger ptrResult) { 214 int r = 0; 215 int sign = 0; 216 try { 217 final int sz = b.length; 218 while (ptr < sz && b[ptr] == ' ') 219 ptr++; 220 if (ptr >= sz) 221 return 0; 222 223 switch (b[ptr]) { 224 case '-': 225 sign = -1; 226 ptr++; 227 break; 228 case '+': 229 ptr++; 230 break; 231 } 232 233 while (ptr < sz) { 234 final byte v = digits10[b[ptr]]; 235 if (v < 0) 236 break; 237 r = (r * 10) + v; 238 ptr++; 239 } 240 } catch (ArrayIndexOutOfBoundsException e) { 241 // Not a valid digit. 242 } 243 if (ptrResult != null) 244 ptrResult.value = ptr; 245 return sign < 0 ? -r : r; 246 } 247 248 /** 249 * Parse a base 10 numeric from a sequence of ASCII digits into a long. 250 * <p> 251 * Digit sequences can begin with an optional run of spaces before the 252 * sequence, and may start with a '+' or a '-' to indicate sign position. 253 * Any other characters will cause the method to stop and return the current 254 * result to the caller. 255 * 256 * @param b 257 * buffer to scan. 258 * @param ptr 259 * position within buffer to start parsing digits at. 260 * @param ptrResult 261 * optional location to return the new ptr value through. If null 262 * the ptr value will be discarded. 263 * @return the value at this location; 0 if the location is not a valid 264 * numeric. 
265 */ 266 public static final long parseLongBase10(final byte[] b, int ptr, 267 final MutableInteger ptrResult) { 268 long r = 0; 269 int sign = 0; 270 try { 271 final int sz = b.length; 272 while (ptr < sz && b[ptr] == ' ') 273 ptr++; 274 if (ptr >= sz) 275 return 0; 276 277 switch (b[ptr]) { 278 case '-': 279 sign = -1; 280 ptr++; 281 break; 282 case '+': 283 ptr++; 284 break; 285 } 286 287 while (ptr < sz) { 288 final byte v = digits10[b[ptr]]; 289 if (v < 0) 290 break; 291 r = (r * 10) + v; 292 ptr++; 293 } 294 } catch (ArrayIndexOutOfBoundsException e) { 295 // Not a valid digit. 296 } 297 if (ptrResult != null) 298 ptrResult.value = ptr; 299 return sign < 0 ? -r : r; 300 } 301 302 /** 303 * Parse 4 character base 16 (hex) formatted string to unsigned integer. 304 * <p> 305 * The number is read in network byte order, that is, most significant 306 * nybble first. 307 * 308 * @param bs 309 * buffer to parse digits from; positions {@code [p, p+4)} will 310 * be parsed. 311 * @param p 312 * first position within the buffer to parse. 313 * @return the integer value. 314 * @throws java.lang.ArrayIndexOutOfBoundsException 315 * if the string is not hex formatted. 316 */ 317 public static final int parseHexInt16(final byte[] bs, final int p) { 318 int r = digits16[bs[p]] << 4; 319 320 r |= digits16[bs[p + 1]]; 321 r <<= 4; 322 323 r |= digits16[bs[p + 2]]; 324 r <<= 4; 325 326 r |= digits16[bs[p + 3]]; 327 if (r < 0) 328 throw new ArrayIndexOutOfBoundsException(); 329 return r; 330 } 331 332 /** 333 * Parse 8 character base 16 (hex) formatted string to unsigned integer. 334 * <p> 335 * The number is read in network byte order, that is, most significant 336 * nybble first. 337 * 338 * @param bs 339 * buffer to parse digits from; positions {@code [p, p+8)} will 340 * be parsed. 341 * @param p 342 * first position within the buffer to parse. 343 * @return the integer value. 344 * @throws java.lang.ArrayIndexOutOfBoundsException 345 * if the string is not hex formatted. 
346 */ 347 public static final int parseHexInt32(final byte[] bs, final int p) { 348 int r = digits16[bs[p]] << 4; 349 350 r |= digits16[bs[p + 1]]; 351 r <<= 4; 352 353 r |= digits16[bs[p + 2]]; 354 r <<= 4; 355 356 r |= digits16[bs[p + 3]]; 357 r <<= 4; 358 359 r |= digits16[bs[p + 4]]; 360 r <<= 4; 361 362 r |= digits16[bs[p + 5]]; 363 r <<= 4; 364 365 r |= digits16[bs[p + 6]]; 366 367 final int last = digits16[bs[p + 7]]; 368 if (r < 0 || last < 0) 369 throw new ArrayIndexOutOfBoundsException(); 370 return (r << 4) | last; 371 } 372 373 /** 374 * Parse 16 character base 16 (hex) formatted string to unsigned long. 375 * <p> 376 * The number is read in network byte order, that is, most significant 377 * nibble first. 378 * 379 * @param bs 380 * buffer to parse digits from; positions {@code [p, p+16)} will 381 * be parsed. 382 * @param p 383 * first position within the buffer to parse. 384 * @return the integer value. 385 * @throws java.lang.ArrayIndexOutOfBoundsException 386 * if the string is not hex formatted. 
387 * @since 4.3 388 */ 389 public static final long parseHexInt64(final byte[] bs, final int p) { 390 long r = digits16[bs[p]] << 4; 391 392 r |= digits16[bs[p + 1]]; 393 r <<= 4; 394 395 r |= digits16[bs[p + 2]]; 396 r <<= 4; 397 398 r |= digits16[bs[p + 3]]; 399 r <<= 4; 400 401 r |= digits16[bs[p + 4]]; 402 r <<= 4; 403 404 r |= digits16[bs[p + 5]]; 405 r <<= 4; 406 407 r |= digits16[bs[p + 6]]; 408 r <<= 4; 409 410 r |= digits16[bs[p + 7]]; 411 r <<= 4; 412 413 r |= digits16[bs[p + 8]]; 414 r <<= 4; 415 416 r |= digits16[bs[p + 9]]; 417 r <<= 4; 418 419 r |= digits16[bs[p + 10]]; 420 r <<= 4; 421 422 r |= digits16[bs[p + 11]]; 423 r <<= 4; 424 425 r |= digits16[bs[p + 12]]; 426 r <<= 4; 427 428 r |= digits16[bs[p + 13]]; 429 r <<= 4; 430 431 r |= digits16[bs[p + 14]]; 432 433 final int last = digits16[bs[p + 15]]; 434 if (r < 0 || last < 0) 435 throw new ArrayIndexOutOfBoundsException(); 436 return (r << 4) | last; 437 } 438 439 /** 440 * Parse a single hex digit to its numeric value (0-15). 441 * 442 * @param digit 443 * hex character to parse. 444 * @return numeric value, in the range 0-15. 445 * @throws java.lang.ArrayIndexOutOfBoundsException 446 * if the input digit is not a valid hex digit. 447 */ 448 public static final int parseHexInt4(final byte digit) { 449 final byte r = digits16[digit]; 450 if (r < 0) 451 throw new ArrayIndexOutOfBoundsException(); 452 return r; 453 } 454 455 /** 456 * Parse a Git style timezone string. 457 * <p> 458 * The sequence "-0315" will be parsed as the numeric value -195, as the 459 * lower two positions count minutes, not 100ths of an hour. 460 * 461 * @param b 462 * buffer to scan. 463 * @param ptr 464 * position within buffer to start parsing digits at. 465 * @return the timezone at this location, expressed in minutes. 466 */ 467 public static final int parseTimeZoneOffset(byte[] b, int ptr) { 468 return parseTimeZoneOffset(b, ptr, null); 469 } 470 471 /** 472 * Parse a Git style timezone string. 
473 * <p> 474 * The sequence "-0315" will be parsed as the numeric value -195, as the 475 * lower two positions count minutes, not 100ths of an hour. 476 * 477 * @param b 478 * buffer to scan. 479 * @param ptr 480 * position within buffer to start parsing digits at. 481 * @param ptrResult 482 * optional location to return the new ptr value through. If null 483 * the ptr value will be discarded. 484 * @return the timezone at this location, expressed in minutes. 485 * @since 4.1 486 */ 487 public static final int parseTimeZoneOffset(final byte[] b, int ptr, 488 MutableInteger ptrResult) { 489 final int v = parseBase10(b, ptr, ptrResult); 490 final int tzMins = v % 100; 491 final int tzHours = v / 100; 492 return tzHours * 60 + tzMins; 493 } 494 495 /** 496 * Locate the first position after a given character. 497 * 498 * @param b 499 * buffer to scan. 500 * @param ptr 501 * position within buffer to start looking for chrA at. 502 * @param chrA 503 * character to find. 504 * @return new position just after chrA. 505 */ 506 public static final int next(byte[] b, int ptr, char chrA) { 507 final int sz = b.length; 508 while (ptr < sz) { 509 if (b[ptr++] == chrA) 510 return ptr; 511 } 512 return ptr; 513 } 514 515 /** 516 * Locate the first position after the next LF. 517 * <p> 518 * This method stops on the first '\n' it finds. 519 * 520 * @param b 521 * buffer to scan. 522 * @param ptr 523 * position within buffer to start looking for LF at. 524 * @return new position just after the first LF found. 525 */ 526 public static final int nextLF(byte[] b, int ptr) { 527 return next(b, ptr, '\n'); 528 } 529 530 /** 531 * Locate the first position after either the given character or LF. 532 * <p> 533 * This method stops on the first match it finds from either chrA or '\n'. 534 * 535 * @param b 536 * buffer to scan. 537 * @param ptr 538 * position within buffer to start looking for chrA or LF at. 539 * @param chrA 540 * character to find. 
541 * @return new position just after the first chrA or LF to be found. 542 */ 543 public static final int nextLF(byte[] b, int ptr, char chrA) { 544 final int sz = b.length; 545 while (ptr < sz) { 546 final byte c = b[ptr++]; 547 if (c == chrA || c == '\n') 548 return ptr; 549 } 550 return ptr; 551 } 552 553 /** 554 * Locate the end of the header. Note that headers may be 555 * more than one line long. 556 * @param b 557 * buffer to scan. 558 * @param ptr 559 * position within buffer to start looking for the end-of-header. 560 * @return new position just after the header. This is either 561 * b.length, or the index of the header's terminating newline. 562 * @since 5.1 563 */ 564 public static final int headerEnd(final byte[] b, int ptr) { 565 final int sz = b.length; 566 while (ptr < sz) { 567 final byte c = b[ptr++]; 568 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) { 569 return ptr - 1; 570 } 571 } 572 return ptr - 1; 573 } 574 575 /** 576 * Find the start of the contents of a given header. 577 * 578 * @param b 579 * buffer to scan. 580 * @param headerName 581 * header to search for 582 * @param ptr 583 * position within buffer to start looking for header at. 584 * @return new position at the start of the header's contents, -1 for 585 * not found 586 * @since 5.1 587 */ 588 public static final int headerStart(byte[] headerName, byte[] b, int ptr) { 589 // Start by advancing to just past a LF or buffer start 590 if (ptr != 0) { 591 ptr = nextLF(b, ptr - 1); 592 } 593 while (ptr < b.length - (headerName.length + 1)) { 594 boolean found = true; 595 for (int i = 0; i < headerName.length; i++) { 596 if (headerName[i] != b[ptr++]) { 597 found = false; 598 break; 599 } 600 } 601 if (found && b[ptr++] == ' ') { 602 return ptr; 603 } 604 ptr = nextLF(b, ptr); 605 } 606 return -1; 607 } 608 609 /** 610 * Locate the first position before a given character. 611 * 612 * @param b 613 * buffer to scan. 
614 * @param ptr 615 * position within buffer to start looking for chrA at. 616 * @param chrA 617 * character to find. 618 * @return new position just before chrA, -1 for not found 619 */ 620 public static final int prev(byte[] b, int ptr, char chrA) { 621 if (ptr == b.length) 622 --ptr; 623 while (ptr >= 0) { 624 if (b[ptr--] == chrA) 625 return ptr; 626 } 627 return ptr; 628 } 629 630 /** 631 * Locate the first position before the previous LF. 632 * <p> 633 * This method stops on the first '\n' it finds. 634 * 635 * @param b 636 * buffer to scan. 637 * @param ptr 638 * position within buffer to start looking for LF at. 639 * @return new position just before the first LF found, -1 for not found 640 */ 641 public static final int prevLF(byte[] b, int ptr) { 642 return prev(b, ptr, '\n'); 643 } 644 645 /** 646 * Locate the previous position before either the given character or LF. 647 * <p> 648 * This method stops on the first match it finds from either chrA or '\n'. 649 * 650 * @param b 651 * buffer to scan. 652 * @param ptr 653 * position within buffer to start looking for chrA or LF at. 654 * @param chrA 655 * character to find. 656 * @return new position just before the first chrA or LF to be found, -1 for 657 * not found 658 */ 659 public static final int prevLF(byte[] b, int ptr, char chrA) { 660 if (ptr == b.length) 661 --ptr; 662 while (ptr >= 0) { 663 final byte c = b[ptr--]; 664 if (c == chrA || c == '\n') 665 return ptr; 666 } 667 return ptr; 668 } 669 670 /** 671 * Index the region between <code>[ptr, end)</code> to find line starts. 672 * <p> 673 * The returned list is 1 indexed. Index 0 contains 674 * {@link java.lang.Integer#MIN_VALUE} to pad the list out. 675 * <p> 676 * Using a 1 indexed list means that line numbers can be directly accessed 677 * from the list, so <code>list.get(1)</code> (aka get line 1) returns 678 * <code>ptr</code>. 679 * <p> 680 * The last element (index <code>map.size()-1</code>) always contains 681 * <code>end</code>. 
682 * 683 * @param buf 684 * buffer to scan. 685 * @param ptr 686 * position within the buffer corresponding to the first byte of 687 * line 1. 688 * @param end 689 * 1 past the end of the content within <code>buf</code>. 690 * @return a line map indicating the starting position of each line. 691 */ 692 public static final IntList lineMap(byte[] buf, int ptr, int end) { 693 IntList map = new IntList((end - ptr) / 36); 694 map.fillTo(1, Integer.MIN_VALUE); 695 for (; ptr < end; ptr = nextLF(buf, ptr)) { 696 map.add(ptr); 697 } 698 map.add(end); 699 return map; 700 } 701 702 /** 703 * Like {@link #lineMap(byte[], int, int)} but throw 704 * {@link BinaryBlobException} if a NUL byte is encountered. 705 * 706 * @param buf 707 * buffer to scan. 708 * @param ptr 709 * position within the buffer corresponding to the first byte of 710 * line 1. 711 * @param end 712 * 1 past the end of the content within <code>buf</code>. 713 * @return a line map indicating the starting position of each line. 714 * @throws BinaryBlobException 715 * if a NUL byte is found. 716 * @since 5.0 717 */ 718 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end) 719 throws BinaryBlobException { 720 IntList map = lineMapOrNull(buf, ptr, end); 721 if (map == null) { 722 throw new BinaryBlobException(); 723 } 724 return map; 725 } 726 727 @Nullable 728 private static IntList lineMapOrNull(byte[] buf, int ptr, int end) { 729 // Experimentally derived from multiple source repositories 730 // the average number of bytes/line is 36. Its a rough guess 731 // to initially size our map close to the target. 732 IntList map = new IntList((end - ptr) / 36); 733 map.add(Integer.MIN_VALUE); 734 boolean foundLF = true; 735 for (; ptr < end; ptr++) { 736 if (foundLF) { 737 map.add(ptr); 738 } 739 740 if (buf[ptr] == '\0') { 741 return null; 742 } 743 744 foundLF = (buf[ptr] == '\n'); 745 } 746 map.add(end); 747 return map; 748 } 749 750 /** 751 * Locate the "author " header line data. 
752 * 753 * @param b 754 * buffer to scan. 755 * @param ptr 756 * position in buffer to start the scan at. Most callers should 757 * pass 0 to ensure the scan starts from the beginning of the 758 * commit buffer and does not accidentally look at message body. 759 * @return position just after the space in "author ", so the first 760 * character of the author's name. If no author header can be 761 * located -1 is returned. 762 */ 763 public static final int author(byte[] b, int ptr) { 764 final int sz = b.length; 765 if (ptr == 0) 766 ptr += 46; // skip the "tree ..." line. 767 while (ptr < sz && b[ptr] == 'p') 768 ptr += 48; // skip this parent. 769 return match(b, ptr, author); 770 } 771 772 /** 773 * Locate the "committer " header line data. 774 * 775 * @param b 776 * buffer to scan. 777 * @param ptr 778 * position in buffer to start the scan at. Most callers should 779 * pass 0 to ensure the scan starts from the beginning of the 780 * commit buffer and does not accidentally look at message body. 781 * @return position just after the space in "committer ", so the first 782 * character of the committer's name. If no committer header can be 783 * located -1 is returned. 784 */ 785 public static final int committer(byte[] b, int ptr) { 786 final int sz = b.length; 787 if (ptr == 0) 788 ptr += 46; // skip the "tree ..." line. 789 while (ptr < sz && b[ptr] == 'p') 790 ptr += 48; // skip this parent. 791 if (ptr < sz && b[ptr] == 'a') 792 ptr = nextLF(b, ptr); 793 return match(b, ptr, committer); 794 } 795 796 /** 797 * Locate the "tagger " header line data. 798 * 799 * @param b 800 * buffer to scan. 801 * @param ptr 802 * position in buffer to start the scan at. Most callers should 803 * pass 0 to ensure the scan starts from the beginning of the tag 804 * buffer and does not accidentally look at message body. 805 * @return position just after the space in "tagger ", so the first 806 * character of the tagger's name. 
If no tagger header can be 807 * located -1 is returned. 808 */ 809 public static final int tagger(byte[] b, int ptr) { 810 final int sz = b.length; 811 if (ptr == 0) 812 ptr += 48; // skip the "object ..." line. 813 while (ptr < sz) { 814 if (b[ptr] == '\n') 815 return -1; 816 final int m = match(b, ptr, tagger); 817 if (m >= 0) 818 return m; 819 ptr = nextLF(b, ptr); 820 } 821 return -1; 822 } 823 824 /** 825 * Locate the "encoding " header line. 826 * 827 * @param b 828 * buffer to scan. 829 * @param ptr 830 * position in buffer to start the scan at. Most callers should 831 * pass 0 to ensure the scan starts from the beginning of the 832 * buffer and does not accidentally look at the message body. 833 * @return position just after the space in "encoding ", so the first 834 * character of the encoding's name. If no encoding header can be 835 * located -1 is returned (and UTF-8 should be assumed). 836 */ 837 public static final int encoding(byte[] b, int ptr) { 838 final int sz = b.length; 839 while (ptr < sz) { 840 if (b[ptr] == '\n') 841 return -1; 842 if (b[ptr] == 'e') 843 break; 844 ptr = nextLF(b, ptr); 845 } 846 return match(b, ptr, encoding); 847 } 848 849 /** 850 * Parse the "encoding " header as a string. 851 * <p> 852 * Locates the "encoding " header (if present) and returns its value. 853 * 854 * @param b 855 * buffer to scan. 856 * @return the encoding header as specified in the commit; null if the 857 * header was not present and should be assumed. 858 * @since 4.2 859 */ 860 @Nullable 861 public static String parseEncodingName(byte[] b) { 862 int enc = encoding(b, 0); 863 if (enc < 0) { 864 return null; 865 } 866 int lf = nextLF(b, enc); 867 return decode(UTF_8, b, enc, lf - 1); 868 } 869 870 /** 871 * Parse the "encoding " header into a character set reference. 
872 * <p> 873 * Locates the "encoding " header (if present) by first calling 874 * {@link #encoding(byte[], int)} and then returns the proper character set 875 * to apply to this buffer to evaluate its contents as character data. 876 * <p> 877 * If no encoding header is present {@code UTF-8} is assumed. 878 * 879 * @param b 880 * buffer to scan. 881 * @return the Java character set representation. Never null. 882 * @throws IllegalCharsetNameException 883 * if the character set requested by the encoding header is 884 * malformed and unsupportable. 885 * @throws UnsupportedCharsetException 886 * if the JRE does not support the character set requested by 887 * the encoding header. 888 */ 889 public static Charset parseEncoding(byte[] b) { 890 String enc = parseEncodingName(b); 891 if (enc == null) { 892 return UTF_8; 893 } 894 895 String name = enc.trim(); 896 try { 897 return Charset.forName(name); 898 } catch (IllegalCharsetNameException 899 | UnsupportedCharsetException badName) { 900 Charset aliased = charsetForAlias(name); 901 if (aliased != null) { 902 return aliased; 903 } 904 throw badName; 905 } 906 } 907 908 /** 909 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. 910 * <p> 911 * Leading spaces won't be trimmed from the string, i.e. will show up in the 912 * parsed name afterwards. 913 * 914 * @param in 915 * the string to parse a name from. 916 * @return the parsed identity or null in case the identity could not be 917 * parsed. 918 */ 919 public static PersonIdent parsePersonIdent(String in) { 920 return parsePersonIdent(Constants.encode(in), 0); 921 } 922 923 /** 924 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent. 925 * <p> 926 * When passing in a value for <code>nameB</code> callers should use the 927 * return value of {@link #author(byte[], int)} or 928 * {@link #committer(byte[], int)}, as these methods provide the proper 929 * position within the buffer. 
	 *
	 * @param raw
	 *            the buffer to parse character data from.
	 * @param nameB
	 *            first position of the identity information. This should be the
	 *            first position after the space which delimits the header field
	 *            name (e.g. "author" or "committer") from the rest of the
	 *            identity line.
	 * @return the parsed identity or null in case the identity could not be
	 *         parsed.
	 */
	public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
		Charset cs;
		try {
			cs = parseEncoding(raw);
		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
			// Assume UTF-8 for person identities, usually this is correct.
			// If not decode() will fall back to the ISO-8859-1 encoding.
			cs = UTF_8;
		}

		// emailB: position just past the first '<' or LF at/after nameB.
		// emailE: position just past the next '>' or LF at/after emailB.
		final int emailB = nextLF(raw, nameB, '<');
		final int emailE = nextLF(raw, emailB, '>');
		// Unparseable if nothing follows the first scan, if a LF comes
		// directly after it, or if the second scan reached the last byte
		// of the buffer (or beyond) without stopping on a '>'.
		if (emailB >= raw.length || raw[emailB] == '\n' ||
				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
			return null;

		// The name ends before the '<'; a single space directly in front of
		// the '<' is excluded from the name as well.
		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
				emailB - 2 : emailB - 1;
		final String name = decode(cs, raw, nameB, nameEnd);
		final String email = decode(cs, raw, emailB, emailE - 1);

		// Start searching from end of line, as after first name-email pair,
		// another name-email pair may occur. We will ignore all kinds of
		// "junk" following the first email.
		//
		// We've to use (emailE - 1) for the case that raw[email] is LF,
		// otherwise we would run too far. "-2" is necessary to position
		// before the LF in case of LF termination resp. the penultimate
		// character if there is no trailing LF.
		final int tzBegin = lastIndexOfTrim(raw, ' ',
				nextLF(raw, emailE - 1) - 2) + 1;
		if (tzBegin <= emailE) // No time/zone, still valid
			return new PersonIdent(name, email, 0, 0);

		// The timestamp is the space separated token in front of the time
		// zone, but never starts before the end of the email.
		final int whenBegin = Math.max(emailE,
				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
			return new PersonIdent(name, email, 0, 0);

		// The parsed timestamp is multiplied by 1000 before being handed
		// to PersonIdent.
		final long when = parseLongBase10(raw, whenBegin, null);
		final int tz = parseTimeZoneOffset(raw, tzBegin);
		return new PersonIdent(name, email, when * 1000L, tz);
	}

	/**
	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
	 * <p>
	 * When passing in a value for <code>nameB</code> callers should use the
	 * return value of {@link #author(byte[], int)} or
	 * {@link #committer(byte[], int)}, as these methods provide the proper
	 * position within the buffer.
	 *
	 * @param raw
	 *            the buffer to parse character data from.
	 * @param nameB
	 *            first position of the identity information. This should be the
	 *            first position after the space which delimits the header field
	 *            name (e.g. "author" or "committer") from the rest of the
	 *            identity line.
	 * @return the parsed identity. Never null.
1001 */ 1002 public static PersonIdent parsePersonIdentOnly(final byte[] raw, 1003 final int nameB) { 1004 int stop = nextLF(raw, nameB); 1005 int emailB = nextLF(raw, nameB, '<'); 1006 int emailE = nextLF(raw, emailB, '>'); 1007 final String name; 1008 final String email; 1009 if (emailE < stop) { 1010 email = decode(raw, emailB, emailE - 1); 1011 } else { 1012 email = "invalid"; //$NON-NLS-1$ 1013 } 1014 if (emailB < stop) 1015 name = decode(raw, nameB, emailB - 2); 1016 else 1017 name = decode(raw, nameB, stop); 1018 1019 final MutableInteger.html#MutableInteger">MutableInteger ptrout = new MutableInteger(); 1020 long when; 1021 int tz; 1022 if (emailE < stop) { 1023 when = parseLongBase10(raw, emailE + 1, ptrout); 1024 tz = parseTimeZoneOffset(raw, ptrout.value); 1025 } else { 1026 when = 0; 1027 tz = 0; 1028 } 1029 return new PersonIdent(name, email, when * 1000L, tz); 1030 } 1031 1032 /** 1033 * Locate the end of a footer line key string. 1034 * <p> 1035 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. 1036 * "Signed-off-by: A. U. Thor\n") then this method returns the position of 1037 * the first ':'. 1038 * <p> 1039 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} 1040 * then this method returns -1. 1041 * 1042 * @param raw 1043 * buffer to scan. 1044 * @param ptr 1045 * first position within raw to consider as a footer line key. 1046 * @return position of the ':' which terminates the footer line key if this 1047 * is otherwise a valid footer line key; otherwise -1. 1048 */ 1049 public static int endOfFooterLineKey(byte[] raw, int ptr) { 1050 try { 1051 for (;;) { 1052 final byte c = raw[ptr]; 1053 if (footerLineKeyChars[c] == 0) { 1054 if (c == ':') 1055 return ptr; 1056 return -1; 1057 } 1058 ptr++; 1059 } 1060 } catch (ArrayIndexOutOfBoundsException e) { 1061 return -1; 1062 } 1063 } 1064 1065 /** 1066 * Decode a buffer under UTF-8, if possible. 
1067 * 1068 * If the byte stream cannot be decoded that way, the platform default is tried 1069 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1070 * 1071 * @param buffer 1072 * buffer to pull raw bytes from. 1073 * @return a string representation of the range <code>[start,end)</code>, 1074 * after decoding the region through the specified character set. 1075 */ 1076 public static String decode(byte[] buffer) { 1077 return decode(buffer, 0, buffer.length); 1078 } 1079 1080 /** 1081 * Decode a buffer under UTF-8, if possible. 1082 * 1083 * If the byte stream cannot be decoded that way, the platform default is 1084 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1085 * 1086 * @param buffer 1087 * buffer to pull raw bytes from. 1088 * @param start 1089 * start position in buffer 1090 * @param end 1091 * one position past the last location within the buffer to take 1092 * data from. 1093 * @return a string representation of the range <code>[start,end)</code>, 1094 * after decoding the region through the specified character set. 1095 */ 1096 public static String decode(final byte[] buffer, final int start, 1097 final int end) { 1098 return decode(UTF_8, buffer, start, end); 1099 } 1100 1101 /** 1102 * Decode a buffer under the specified character set if possible. 1103 * 1104 * If the byte stream cannot be decoded that way, the platform default is tried 1105 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1106 * 1107 * @param cs 1108 * character set to use when decoding the buffer. 1109 * @param buffer 1110 * buffer to pull raw bytes from. 1111 * @return a string representation of the range <code>[start,end)</code>, 1112 * after decoding the region through the specified character set. 
1113 */ 1114 public static String decode(Charset cs, byte[] buffer) { 1115 return decode(cs, buffer, 0, buffer.length); 1116 } 1117 1118 /** 1119 * Decode a region of the buffer under the specified character set if possible. 1120 * 1121 * If the byte stream cannot be decoded that way, the platform default is tried 1122 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1123 * 1124 * @param cs 1125 * character set to use when decoding the buffer. 1126 * @param buffer 1127 * buffer to pull raw bytes from. 1128 * @param start 1129 * first position within the buffer to take data from. 1130 * @param end 1131 * one position past the last location within the buffer to take 1132 * data from. 1133 * @return a string representation of the range <code>[start,end)</code>, 1134 * after decoding the region through the specified character set. 1135 */ 1136 public static String decode(final Charset cs, final byte[] buffer, 1137 final int start, final int end) { 1138 try { 1139 return decodeNoFallback(cs, buffer, start, end); 1140 } catch (CharacterCodingException e) { 1141 // Fall back to an ISO-8859-1 style encoding. At least all of 1142 // the bytes will be present in the output. 1143 // 1144 return extractBinaryString(buffer, start, end); 1145 } 1146 } 1147 1148 /** 1149 * Decode a region of the buffer under the specified character set if 1150 * possible. 1151 * 1152 * If the byte stream cannot be decoded that way, the platform default is 1153 * tried and if that too fails, an exception is thrown. 1154 * 1155 * @param cs 1156 * character set to use when decoding the buffer. 1157 * @param buffer 1158 * buffer to pull raw bytes from. 1159 * @param start 1160 * first position within the buffer to take data from. 1161 * @param end 1162 * one position past the last location within the buffer to take 1163 * data from. 1164 * @return a string representation of the range <code>[start,end)</code>, 1165 * after decoding the region through the specified character set. 
1166 * @throws java.nio.charset.CharacterCodingException 1167 * the input is not in any of the tested character sets. 1168 */ 1169 public static String decodeNoFallback(final Charset cs, 1170 final byte[] buffer, final int start, final int end) 1171 throws CharacterCodingException { 1172 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); 1173 b.mark(); 1174 1175 // Try our built-in favorite. The assumption here is that 1176 // decoding will fail if the data is not actually encoded 1177 // using that encoder. 1178 try { 1179 return decode(b, UTF_8); 1180 } catch (CharacterCodingException e) { 1181 b.reset(); 1182 } 1183 1184 if (!cs.equals(UTF_8)) { 1185 // Try the suggested encoding, it might be right since it was 1186 // provided by the caller. 1187 try { 1188 return decode(b, cs); 1189 } catch (CharacterCodingException e) { 1190 b.reset(); 1191 } 1192 } 1193 1194 // Try the default character set. A small group of people 1195 // might actually use the same (or very similar) locale. 1196 Charset defcs = Charset.defaultCharset(); 1197 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { 1198 try { 1199 return decode(b, defcs); 1200 } catch (CharacterCodingException e) { 1201 b.reset(); 1202 } 1203 } 1204 1205 throw new CharacterCodingException(); 1206 } 1207 1208 /** 1209 * Decode a region of the buffer under the ISO-8859-1 encoding. 1210 * 1211 * Each byte is treated as a single character in the 8859-1 character 1212 * encoding, performing a raw binary->char conversion. 1213 * 1214 * @param buffer 1215 * buffer to pull raw bytes from. 1216 * @param start 1217 * first position within the buffer to take data from. 1218 * @param end 1219 * one position past the last location within the buffer to take 1220 * data from. 1221 * @return a string representation of the range <code>[start,end)</code>. 
*/
public static String extractBinaryString(final byte[] buffer,
        final int start, final int end) {
    final char[] chars = new char[end - start];
    for (int src = start, dst = 0; src < end; src++, dst++) {
        // Mask to 0..255 so bytes >= 0x80 do not sign-extend.
        chars[dst] = (char) (buffer[src] & 0xff);
    }
    return new String(chars);
}

/**
 * Strictly decode the remaining bytes of a buffer.
 *
 * @param b
 *            bytes to decode; consumed from its current position.
 * @param charset
 *            character set to decode with.
 * @return the decoded text.
 * @throws CharacterCodingException
 *             the bytes contain malformed or unmappable input for the
 *             character set.
 */
private static String decode(ByteBuffer b, Charset charset)
        throws CharacterCodingException {
    // REPORT turns bad input into an exception instead of replacing it.
    return charset.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT)
            .decode(b)
            .toString();
}

/**
 * Locate the position of the commit message body.
 *
 * @param b
 *            buffer to scan.
 * @param ptr
 *            position in buffer to start the scan at. Most callers should
 *            pass 0 to ensure the scan starts from the beginning of the
 *            commit buffer.
 * @return position of the user's message buffer, as computed by
 *         {@link #tagMessage(byte[], int)} on the remaining header lines
 *         (-1 if no message could be located).
 */
public static final int commitMessage(byte[] b, int ptr) {
    final int limit = b.length;
    int pos = ptr;
    if (pos == 0) {
        pos = 46; // skip the "tree ..." line.
    }
    // Each line starting with 'p' is taken as a 48 byte parent line.
    for (; pos < limit && b[pos] == 'p'; pos += 48) {
        // skip this parent.
    }

    // The remaining header lines are skipped whatever their type is,
    // which is exactly what the tag scan does.
    return tagMessage(b, pos);
}

/**
 * Locate the position of the tag message body.
 *
 * @param b
 *            buffer to scan.
 * @param ptr
 *            position in buffer to start the scan at. Most callers should
 *            pass 0 to ensure the scan starts from the beginning of the tag
 *            buffer.
 * @return position of the user's message buffer; -1 if the end of the
 *         buffer is reached before the empty line that ends the headers.
 */
public static final int tagMessage(byte[] b, int ptr) {
    final int limit = b.length;
    int pos = (ptr == 0) ? 48 : ptr; // 48: skip the "object ..." line.
    // Step over header lines until one starts with LF (the empty line).
    while (pos < limit && b[pos] != '\n') {
        pos = nextLF(b, pos);
    }
    if (pos >= limit) {
        return -1;
    }
    // The loop stopped on the LF of the empty line; the body follows it.
    return pos + 1;
}

/**
 * Locate the end of a paragraph.
 * <p>
 * A paragraph is ended by two consecutive LF bytes or CRLF pairs.
 *
 * @param b
 *            buffer to scan.
 * @param start
 *            position in buffer to start the scan at. Most callers will
 *            want to pass the first position of the commit message (as
 *            found by {@link #commitMessage(byte[], int)}).
 * @return position of the LF at the end of the paragraph;
 *         <code>b.length</code> if no paragraph end could be located.
 */
public static final int endOfParagraph(byte[] b, int start) {
    final int limit = b.length;
    int pos = start;
    // Advance line by line until a line begins with LF or CR.
    while (pos < limit) {
        final byte first = b[pos];
        if (first == '\n' || first == '\r') {
            break;
        }
        pos = nextLF(b, pos);
    }
    // Back up over the line terminator (LF, optionally preceded by CR)
    // of the last paragraph line, but never before the start position.
    if (pos > start && b[pos - 1] == '\n') {
        pos--;
    }
    if (pos > start && b[pos - 1] == '\r') {
        pos--;
    }
    return pos;
}

/**
 * Get last index of {@code ch} in raw, trimming spaces.
 * <p>
 * Scanning runs backwards from {@code pos}: any run of spaces is skipped
 * first, then the scan continues down to the nearest {@code ch}.
 *
 * @param raw
 *            buffer to scan.
 * @param ch
 *            character to find.
 * @param pos
 *            starting position.
 * @return last index of {@code ch} in raw, trimming spaces; -1 if the
 *         beginning of the buffer is reached without finding it.
 * @since 4.1
 */
public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
    int idx = pos;
    for (; idx >= 0 && raw[idx] == ' '; idx--) {
        // skip trailing spaces.
    }
    for (; idx >= 0 && raw[idx] != ch; idx--) {
        // walk back to the wanted character.
    }
    return idx;
}

/**
 * Look up a character set in the alias table by its lower-cased name.
 *
 * @param name
 *            alias to look up.
 * @return whatever the alias table holds for the lower-cased name.
 */
private static Charset charsetForAlias(String name) {
    final String key = StringUtils.toLowerCase(name);
    return encodingAliases.get(key);
}

private RawParseUtils() {
    // Static-only utility class; never instantiated.
}
}