1 /* 2 * Copyright (C) 2008-2009, Google Inc. 3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others 4 * 5 * This program and the accompanying materials are made available under the 6 * terms of the Eclipse Distribution License v. 1.0 which is available at 7 * https://www.eclipse.org/org/documents/edl-v10.php. 8 * 9 * SPDX-License-Identifier: BSD-3-Clause 10 */ 11 12 package org.eclipse.jgit.util; 13 14 import static java.nio.charset.StandardCharsets.ISO_8859_1; 15 import static java.nio.charset.StandardCharsets.UTF_8; 16 import static org.eclipse.jgit.lib.ObjectChecker.author; 17 import static org.eclipse.jgit.lib.ObjectChecker.committer; 18 import static org.eclipse.jgit.lib.ObjectChecker.encoding; 19 import static org.eclipse.jgit.lib.ObjectChecker.tagger; 20 21 import java.nio.ByteBuffer; 22 import java.nio.charset.CharacterCodingException; 23 import java.nio.charset.Charset; 24 import java.nio.charset.CharsetDecoder; 25 import java.nio.charset.CodingErrorAction; 26 import java.nio.charset.IllegalCharsetNameException; 27 import java.nio.charset.UnsupportedCharsetException; 28 import java.util.Arrays; 29 import java.util.HashMap; 30 import java.util.Map; 31 32 import org.eclipse.jgit.annotations.Nullable; 33 import org.eclipse.jgit.errors.BinaryBlobException; 34 import org.eclipse.jgit.lib.Constants; 35 import org.eclipse.jgit.lib.PersonIdent; 36 37 /** 38 * Handy utility functions to parse raw object contents. 39 */ 40 public final class RawParseUtils { 41 /** 42 * UTF-8 charset constant. 
43 * 44 * @since 2.2 45 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead 46 */ 47 @Deprecated 48 public static final Charset UTF8_CHARSET = UTF_8; 49 50 private static final byte[] digits10; 51 52 private static final byte[] digits16; 53 54 private static final byte[] footerLineKeyChars; 55 56 private static final Map<String, Charset> encodingAliases; 57 58 static { 59 encodingAliases = new HashMap<>(); 60 encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ 61 encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ 62 63 digits10 = new byte['9' + 1]; 64 Arrays.fill(digits10, (byte) -1); 65 for (char i = '0'; i <= '9'; i++) 66 digits10[i] = (byte) (i - '0'); 67 68 digits16 = new byte['f' + 1]; 69 Arrays.fill(digits16, (byte) -1); 70 for (char i = '0'; i <= '9'; i++) 71 digits16[i] = (byte) (i - '0'); 72 for (char i = 'a'; i <= 'f'; i++) 73 digits16[i] = (byte) ((i - 'a') + 10); 74 for (char i = 'A'; i <= 'F'; i++) 75 digits16[i] = (byte) ((i - 'A') + 10); 76 77 footerLineKeyChars = new byte['z' + 1]; 78 footerLineKeyChars['-'] = 1; 79 for (char i = '0'; i <= '9'; i++) 80 footerLineKeyChars[i] = 1; 81 for (char i = 'A'; i <= 'Z'; i++) 82 footerLineKeyChars[i] = 1; 83 for (char i = 'a'; i <= 'z'; i++) 84 footerLineKeyChars[i] = 1; 85 } 86 87 /** 88 * Determine if b[ptr] matches src. 89 * 90 * @param b 91 * the buffer to scan. 92 * @param ptr 93 * first position within b, this should match src[0]. 94 * @param src 95 * the buffer to test for equality with b. 96 * @return ptr + src.length if b[ptr..src.length] == src; else -1. 97 */ 98 public static final int match(byte[] b, int ptr, byte[] src) { 99 if (ptr + src.length > b.length) 100 return -1; 101 for (int i = 0; i < src.length; i++, ptr++) 102 if (b[ptr] != src[i]) 103 return -1; 104 return ptr; 105 } 106 107 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', 108 '6', '7', '8', '9' }; 109 110 /** 111 * Format a base 10 numeric into a temporary buffer. 
112 * <p> 113 * Formatting is performed backwards. The method starts at offset 114 * <code>o-1</code> and ends at <code>o-1-digits</code>, where 115 * <code>digits</code> is the number of positions necessary to store the 116 * base 10 value. 117 * <p> 118 * The argument and return values from this method make it easy to chain 119 * writing, for example: 120 * </p> 121 * 122 * <pre> 123 * final byte[] tmp = new byte[64]; 124 * int ptr = tmp.length; 125 * tmp[--ptr] = '\n'; 126 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); 127 * tmp[--ptr] = ' '; 128 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); 129 * tmp[--ptr] = 0; 130 * final String str = new String(tmp, ptr, tmp.length - ptr); 131 * </pre> 132 * 133 * @param b 134 * buffer to write into. 135 * @param o 136 * one offset past the location where writing will begin; writing 137 * proceeds towards lower index values. 138 * @param value 139 * the value to store. 140 * @return the new offset value <code>o</code>. This is the position of 141 * the last byte written. Additional writing should start at one 142 * position earlier. 143 */ 144 public static int formatBase10(final byte[] b, int o, int value) { 145 if (value == 0) { 146 b[--o] = '0'; 147 return o; 148 } 149 final boolean isneg = value < 0; 150 if (isneg) 151 value = -value; 152 while (value != 0) { 153 b[--o] = base10byte[value % 10]; 154 value /= 10; 155 } 156 if (isneg) 157 b[--o] = '-'; 158 return o; 159 } 160 161 /** 162 * Parse a base 10 numeric from a sequence of ASCII digits into an int. 163 * <p> 164 * Digit sequences can begin with an optional run of spaces before the 165 * sequence, and may start with a '+' or a '-' to indicate sign position. 166 * Any other characters will cause the method to stop and return the current 167 * result to the caller. 168 * 169 * @param b 170 * buffer to scan. 171 * @param ptr 172 * position within buffer to start parsing digits at. 
173 * @param ptrResult 174 * optional location to return the new ptr value through. If null 175 * the ptr value will be discarded. 176 * @return the value at this location; 0 if the location is not a valid 177 * numeric. 178 */ 179 public static final int parseBase10(final byte[] b, int ptr, 180 final MutableInteger ptrResult) { 181 int r = 0; 182 int sign = 0; 183 try { 184 final int sz = b.length; 185 while (ptr < sz && b[ptr] == ' ') 186 ptr++; 187 if (ptr >= sz) 188 return 0; 189 190 switch (b[ptr]) { 191 case '-': 192 sign = -1; 193 ptr++; 194 break; 195 case '+': 196 ptr++; 197 break; 198 } 199 200 while (ptr < sz) { 201 final byte v = digits10[b[ptr]]; 202 if (v < 0) 203 break; 204 r = (r * 10) + v; 205 ptr++; 206 } 207 } catch (ArrayIndexOutOfBoundsException e) { 208 // Not a valid digit. 209 } 210 if (ptrResult != null) 211 ptrResult.value = ptr; 212 return sign < 0 ? -r : r; 213 } 214 215 /** 216 * Parse a base 10 numeric from a sequence of ASCII digits into a long. 217 * <p> 218 * Digit sequences can begin with an optional run of spaces before the 219 * sequence, and may start with a '+' or a '-' to indicate sign position. 220 * Any other characters will cause the method to stop and return the current 221 * result to the caller. 222 * 223 * @param b 224 * buffer to scan. 225 * @param ptr 226 * position within buffer to start parsing digits at. 227 * @param ptrResult 228 * optional location to return the new ptr value through. If null 229 * the ptr value will be discarded. 230 * @return the value at this location; 0 if the location is not a valid 231 * numeric. 
232 */ 233 public static final long parseLongBase10(final byte[] b, int ptr, 234 final MutableInteger ptrResult) { 235 long r = 0; 236 int sign = 0; 237 try { 238 final int sz = b.length; 239 while (ptr < sz && b[ptr] == ' ') 240 ptr++; 241 if (ptr >= sz) 242 return 0; 243 244 switch (b[ptr]) { 245 case '-': 246 sign = -1; 247 ptr++; 248 break; 249 case '+': 250 ptr++; 251 break; 252 } 253 254 while (ptr < sz) { 255 final byte v = digits10[b[ptr]]; 256 if (v < 0) 257 break; 258 r = (r * 10) + v; 259 ptr++; 260 } 261 } catch (ArrayIndexOutOfBoundsException e) { 262 // Not a valid digit. 263 } 264 if (ptrResult != null) 265 ptrResult.value = ptr; 266 return sign < 0 ? -r : r; 267 } 268 269 /** 270 * Parse 4 character base 16 (hex) formatted string to unsigned integer. 271 * <p> 272 * The number is read in network byte order, that is, most significant 273 * nybble first. 274 * 275 * @param bs 276 * buffer to parse digits from; positions {@code [p, p+4)} will 277 * be parsed. 278 * @param p 279 * first position within the buffer to parse. 280 * @return the integer value. 281 * @throws java.lang.ArrayIndexOutOfBoundsException 282 * if the string is not hex formatted. 283 */ 284 public static final int parseHexInt16(final byte[] bs, final int p) { 285 int r = digits16[bs[p]] << 4; 286 287 r |= digits16[bs[p + 1]]; 288 r <<= 4; 289 290 r |= digits16[bs[p + 2]]; 291 r <<= 4; 292 293 r |= digits16[bs[p + 3]]; 294 if (r < 0) 295 throw new ArrayIndexOutOfBoundsException(); 296 return r; 297 } 298 299 /** 300 * Parse 8 character base 16 (hex) formatted string to unsigned integer. 301 * <p> 302 * The number is read in network byte order, that is, most significant 303 * nybble first. 304 * 305 * @param bs 306 * buffer to parse digits from; positions {@code [p, p+8)} will 307 * be parsed. 308 * @param p 309 * first position within the buffer to parse. 310 * @return the integer value. 311 * @throws java.lang.ArrayIndexOutOfBoundsException 312 * if the string is not hex formatted. 
313 */ 314 public static final int parseHexInt32(final byte[] bs, final int p) { 315 int r = digits16[bs[p]] << 4; 316 317 r |= digits16[bs[p + 1]]; 318 r <<= 4; 319 320 r |= digits16[bs[p + 2]]; 321 r <<= 4; 322 323 r |= digits16[bs[p + 3]]; 324 r <<= 4; 325 326 r |= digits16[bs[p + 4]]; 327 r <<= 4; 328 329 r |= digits16[bs[p + 5]]; 330 r <<= 4; 331 332 r |= digits16[bs[p + 6]]; 333 334 final int last = digits16[bs[p + 7]]; 335 if (r < 0 || last < 0) 336 throw new ArrayIndexOutOfBoundsException(); 337 return (r << 4) | last; 338 } 339 340 /** 341 * Parse 16 character base 16 (hex) formatted string to unsigned long. 342 * <p> 343 * The number is read in network byte order, that is, most significant 344 * nibble first. 345 * 346 * @param bs 347 * buffer to parse digits from; positions {@code [p, p+16)} will 348 * be parsed. 349 * @param p 350 * first position within the buffer to parse. 351 * @return the integer value. 352 * @throws java.lang.ArrayIndexOutOfBoundsException 353 * if the string is not hex formatted. 
354 * @since 4.3 355 */ 356 public static final long parseHexInt64(final byte[] bs, final int p) { 357 long r = digits16[bs[p]] << 4; 358 359 r |= digits16[bs[p + 1]]; 360 r <<= 4; 361 362 r |= digits16[bs[p + 2]]; 363 r <<= 4; 364 365 r |= digits16[bs[p + 3]]; 366 r <<= 4; 367 368 r |= digits16[bs[p + 4]]; 369 r <<= 4; 370 371 r |= digits16[bs[p + 5]]; 372 r <<= 4; 373 374 r |= digits16[bs[p + 6]]; 375 r <<= 4; 376 377 r |= digits16[bs[p + 7]]; 378 r <<= 4; 379 380 r |= digits16[bs[p + 8]]; 381 r <<= 4; 382 383 r |= digits16[bs[p + 9]]; 384 r <<= 4; 385 386 r |= digits16[bs[p + 10]]; 387 r <<= 4; 388 389 r |= digits16[bs[p + 11]]; 390 r <<= 4; 391 392 r |= digits16[bs[p + 12]]; 393 r <<= 4; 394 395 r |= digits16[bs[p + 13]]; 396 r <<= 4; 397 398 r |= digits16[bs[p + 14]]; 399 400 final int last = digits16[bs[p + 15]]; 401 if (r < 0 || last < 0) 402 throw new ArrayIndexOutOfBoundsException(); 403 return (r << 4) | last; 404 } 405 406 /** 407 * Parse a single hex digit to its numeric value (0-15). 408 * 409 * @param digit 410 * hex character to parse. 411 * @return numeric value, in the range 0-15. 412 * @throws java.lang.ArrayIndexOutOfBoundsException 413 * if the input digit is not a valid hex digit. 414 */ 415 public static final int parseHexInt4(final byte digit) { 416 final byte r = digits16[digit]; 417 if (r < 0) 418 throw new ArrayIndexOutOfBoundsException(); 419 return r; 420 } 421 422 /** 423 * Parse a Git style timezone string. 424 * <p> 425 * The sequence "-0315" will be parsed as the numeric value -195, as the 426 * lower two positions count minutes, not 100ths of an hour. 427 * 428 * @param b 429 * buffer to scan. 430 * @param ptr 431 * position within buffer to start parsing digits at. 432 * @return the timezone at this location, expressed in minutes. 433 */ 434 public static final int parseTimeZoneOffset(byte[] b, int ptr) { 435 return parseTimeZoneOffset(b, ptr, null); 436 } 437 438 /** 439 * Parse a Git style timezone string. 
440 * <p> 441 * The sequence "-0315" will be parsed as the numeric value -195, as the 442 * lower two positions count minutes, not 100ths of an hour. 443 * 444 * @param b 445 * buffer to scan. 446 * @param ptr 447 * position within buffer to start parsing digits at. 448 * @param ptrResult 449 * optional location to return the new ptr value through. If null 450 * the ptr value will be discarded. 451 * @return the timezone at this location, expressed in minutes. 452 * @since 4.1 453 */ 454 public static final int parseTimeZoneOffset(final byte[] b, int ptr, 455 MutableInteger ptrResult) { 456 final int v = parseBase10(b, ptr, ptrResult); 457 final int tzMins = v % 100; 458 final int tzHours = v / 100; 459 return tzHours * 60 + tzMins; 460 } 461 462 /** 463 * Locate the first position after a given character. 464 * 465 * @param b 466 * buffer to scan. 467 * @param ptr 468 * position within buffer to start looking for chrA at. 469 * @param chrA 470 * character to find. 471 * @return new position just after chrA. 472 */ 473 public static final int next(byte[] b, int ptr, char chrA) { 474 final int sz = b.length; 475 while (ptr < sz) { 476 if (b[ptr++] == chrA) 477 return ptr; 478 } 479 return ptr; 480 } 481 482 /** 483 * Locate the first position after the next LF. 484 * <p> 485 * This method stops on the first '\n' it finds. 486 * 487 * @param b 488 * buffer to scan. 489 * @param ptr 490 * position within buffer to start looking for LF at. 491 * @return new position just after the first LF found. 492 */ 493 public static final int nextLF(byte[] b, int ptr) { 494 return next(b, ptr, '\n'); 495 } 496 497 /** 498 * Locate the first position after either the given character or LF. 499 * <p> 500 * This method stops on the first match it finds from either chrA or '\n'. 501 * 502 * @param b 503 * buffer to scan. 504 * @param ptr 505 * position within buffer to start looking for chrA or LF at. 506 * @param chrA 507 * character to find. 
508 * @return new position just after the first chrA or LF to be found. 509 */ 510 public static final int nextLF(byte[] b, int ptr, char chrA) { 511 final int sz = b.length; 512 while (ptr < sz) { 513 final byte c = b[ptr++]; 514 if (c == chrA || c == '\n') 515 return ptr; 516 } 517 return ptr; 518 } 519 520 /** 521 * Locate the end of the header. Note that headers may be 522 * more than one line long. 523 * @param b 524 * buffer to scan. 525 * @param ptr 526 * position within buffer to start looking for the end-of-header. 527 * @return new position just after the header. This is either 528 * b.length, or the index of the header's terminating newline. 529 * @since 5.1 530 */ 531 public static final int headerEnd(final byte[] b, int ptr) { 532 final int sz = b.length; 533 while (ptr < sz) { 534 final byte c = b[ptr++]; 535 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) { 536 return ptr - 1; 537 } 538 } 539 return ptr - 1; 540 } 541 542 /** 543 * Find the start of the contents of a given header. 544 * 545 * @param b 546 * buffer to scan. 547 * @param headerName 548 * header to search for 549 * @param ptr 550 * position within buffer to start looking for header at. 551 * @return new position at the start of the header's contents, -1 for 552 * not found 553 * @since 5.1 554 */ 555 public static final int headerStart(byte[] headerName, byte[] b, int ptr) { 556 // Start by advancing to just past a LF or buffer start 557 if (ptr != 0) { 558 ptr = nextLF(b, ptr - 1); 559 } 560 while (ptr < b.length - (headerName.length + 1)) { 561 boolean found = true; 562 for (byte element : headerName) { 563 if (element != b[ptr++]) { 564 found = false; 565 break; 566 } 567 } 568 if (found && b[ptr++] == ' ') { 569 return ptr; 570 } 571 ptr = nextLF(b, ptr); 572 } 573 return -1; 574 } 575 576 /** 577 * Locate the first position before a given character. 578 * 579 * @param b 580 * buffer to scan. 581 * @param ptr 582 * position within buffer to start looking for chrA at. 
583 * @param chrA 584 * character to find. 585 * @return new position just before chrA, -1 for not found 586 */ 587 public static final int prev(byte[] b, int ptr, char chrA) { 588 if (ptr == b.length) 589 --ptr; 590 while (ptr >= 0) { 591 if (b[ptr--] == chrA) 592 return ptr; 593 } 594 return ptr; 595 } 596 597 /** 598 * Locate the first position before the previous LF. 599 * <p> 600 * This method stops on the first '\n' it finds. 601 * 602 * @param b 603 * buffer to scan. 604 * @param ptr 605 * position within buffer to start looking for LF at. 606 * @return new position just before the first LF found, -1 for not found 607 */ 608 public static final int prevLF(byte[] b, int ptr) { 609 return prev(b, ptr, '\n'); 610 } 611 612 /** 613 * Locate the previous position before either the given character or LF. 614 * <p> 615 * This method stops on the first match it finds from either chrA or '\n'. 616 * 617 * @param b 618 * buffer to scan. 619 * @param ptr 620 * position within buffer to start looking for chrA or LF at. 621 * @param chrA 622 * character to find. 623 * @return new position just before the first chrA or LF to be found, -1 for 624 * not found 625 */ 626 public static final int prevLF(byte[] b, int ptr, char chrA) { 627 if (ptr == b.length) 628 --ptr; 629 while (ptr >= 0) { 630 final byte c = b[ptr--]; 631 if (c == chrA || c == '\n') 632 return ptr; 633 } 634 return ptr; 635 } 636 637 /** 638 * Index the region between <code>[ptr, end)</code> to find line starts. 639 * <p> 640 * The returned list is 1 indexed. Index 0 contains 641 * {@link java.lang.Integer#MIN_VALUE} to pad the list out. 642 * <p> 643 * Using a 1 indexed list means that line numbers can be directly accessed 644 * from the list, so <code>list.get(1)</code> (aka get line 1) returns 645 * <code>ptr</code>. 646 * <p> 647 * The last element (index <code>map.size()-1</code>) always contains 648 * <code>end</code>. 649 * 650 * @param buf 651 * buffer to scan. 
652 * @param ptr 653 * position within the buffer corresponding to the first byte of 654 * line 1. 655 * @param end 656 * 1 past the end of the content within <code>buf</code>. 657 * @return a line map indicating the starting position of each line. 658 */ 659 public static final IntList lineMap(byte[] buf, int ptr, int end) { 660 IntList map = new IntList((end - ptr) / 36); 661 map.fillTo(1, Integer.MIN_VALUE); 662 for (; ptr < end; ptr = nextLF(buf, ptr)) { 663 map.add(ptr); 664 } 665 map.add(end); 666 return map; 667 } 668 669 /** 670 * Like {@link #lineMap(byte[], int, int)} but throw 671 * {@link BinaryBlobException} if a NUL byte is encountered. 672 * 673 * @param buf 674 * buffer to scan. 675 * @param ptr 676 * position within the buffer corresponding to the first byte of 677 * line 1. 678 * @param end 679 * 1 past the end of the content within <code>buf</code>. 680 * @return a line map indicating the starting position of each line. 681 * @throws BinaryBlobException 682 * if a NUL byte is found. 683 * @since 5.0 684 */ 685 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end) 686 throws BinaryBlobException { 687 IntList map = lineMapOrNull(buf, ptr, end); 688 if (map == null) { 689 throw new BinaryBlobException(); 690 } 691 return map; 692 } 693 694 @Nullable 695 private static IntList lineMapOrNull(byte[] buf, int ptr, int end) { 696 // Experimentally derived from multiple source repositories 697 // the average number of bytes/line is 36. Its a rough guess 698 // to initially size our map close to the target. 699 IntList map = new IntList((end - ptr) / 36); 700 map.add(Integer.MIN_VALUE); 701 boolean foundLF = true; 702 for (; ptr < end; ptr++) { 703 if (foundLF) { 704 map.add(ptr); 705 } 706 707 if (buf[ptr] == '\0') { 708 return null; 709 } 710 711 foundLF = (buf[ptr] == '\n'); 712 } 713 map.add(end); 714 return map; 715 } 716 717 /** 718 * Locate the "author " header line data. 719 * 720 * @param b 721 * buffer to scan. 
722 * @param ptr 723 * position in buffer to start the scan at. Most callers should 724 * pass 0 to ensure the scan starts from the beginning of the 725 * commit buffer and does not accidentally look at message body. 726 * @return position just after the space in "author ", so the first 727 * character of the author's name. If no author header can be 728 * located -1 is returned. 729 */ 730 public static final int author(byte[] b, int ptr) { 731 final int sz = b.length; 732 if (ptr == 0) 733 ptr += 46; // skip the "tree ..." line. 734 while (ptr < sz && b[ptr] == 'p') 735 ptr += 48; // skip this parent. 736 return match(b, ptr, author); 737 } 738 739 /** 740 * Locate the "committer " header line data. 741 * 742 * @param b 743 * buffer to scan. 744 * @param ptr 745 * position in buffer to start the scan at. Most callers should 746 * pass 0 to ensure the scan starts from the beginning of the 747 * commit buffer and does not accidentally look at message body. 748 * @return position just after the space in "committer ", so the first 749 * character of the committer's name. If no committer header can be 750 * located -1 is returned. 751 */ 752 public static final int committer(byte[] b, int ptr) { 753 final int sz = b.length; 754 if (ptr == 0) 755 ptr += 46; // skip the "tree ..." line. 756 while (ptr < sz && b[ptr] == 'p') 757 ptr += 48; // skip this parent. 758 if (ptr < sz && b[ptr] == 'a') 759 ptr = nextLF(b, ptr); 760 return match(b, ptr, committer); 761 } 762 763 /** 764 * Locate the "tagger " header line data. 765 * 766 * @param b 767 * buffer to scan. 768 * @param ptr 769 * position in buffer to start the scan at. Most callers should 770 * pass 0 to ensure the scan starts from the beginning of the tag 771 * buffer and does not accidentally look at message body. 772 * @return position just after the space in "tagger ", so the first 773 * character of the tagger's name. If no tagger header can be 774 * located -1 is returned. 
775 */ 776 public static final int tagger(byte[] b, int ptr) { 777 final int sz = b.length; 778 if (ptr == 0) 779 ptr += 48; // skip the "object ..." line. 780 while (ptr < sz) { 781 if (b[ptr] == '\n') 782 return -1; 783 final int m = match(b, ptr, tagger); 784 if (m >= 0) 785 return m; 786 ptr = nextLF(b, ptr); 787 } 788 return -1; 789 } 790 791 /** 792 * Locate the "encoding " header line. 793 * 794 * @param b 795 * buffer to scan. 796 * @param ptr 797 * position in buffer to start the scan at. Most callers should 798 * pass 0 to ensure the scan starts from the beginning of the 799 * buffer and does not accidentally look at the message body. 800 * @return position just after the space in "encoding ", so the first 801 * character of the encoding's name. If no encoding header can be 802 * located -1 is returned (and UTF-8 should be assumed). 803 */ 804 public static final int encoding(byte[] b, int ptr) { 805 final int sz = b.length; 806 while (ptr < sz) { 807 if (b[ptr] == '\n') 808 return -1; 809 if (b[ptr] == 'e') 810 break; 811 ptr = nextLF(b, ptr); 812 } 813 return match(b, ptr, encoding); 814 } 815 816 /** 817 * Parse the "encoding " header as a string. 818 * <p> 819 * Locates the "encoding " header (if present) and returns its value. 820 * 821 * @param b 822 * buffer to scan. 823 * @return the encoding header as specified in the commit; null if the 824 * header was not present and should be assumed. 825 * @since 4.2 826 */ 827 @Nullable 828 public static String parseEncodingName(byte[] b) { 829 int enc = encoding(b, 0); 830 if (enc < 0) { 831 return null; 832 } 833 int lf = nextLF(b, enc); 834 return decode(UTF_8, b, enc, lf - 1); 835 } 836 837 /** 838 * Parse the "encoding " header into a character set reference. 839 * <p> 840 * Locates the "encoding " header (if present) by first calling 841 * {@link #encoding(byte[], int)} and then returns the proper character set 842 * to apply to this buffer to evaluate its contents as character data. 
843 * <p> 844 * If no encoding header is present {@code UTF-8} is assumed. 845 * 846 * @param b 847 * buffer to scan. 848 * @return the Java character set representation. Never null. 849 * @throws IllegalCharsetNameException 850 * if the character set requested by the encoding header is 851 * malformed and unsupportable. 852 * @throws UnsupportedCharsetException 853 * if the JRE does not support the character set requested by 854 * the encoding header. 855 */ 856 public static Charset parseEncoding(byte[] b) { 857 String enc = parseEncodingName(b); 858 if (enc == null) { 859 return UTF_8; 860 } 861 862 String name = enc.trim(); 863 try { 864 return Charset.forName(name); 865 } catch (IllegalCharsetNameException 866 | UnsupportedCharsetException badName) { 867 Charset aliased = charsetForAlias(name); 868 if (aliased != null) { 869 return aliased; 870 } 871 throw badName; 872 } 873 } 874 875 /** 876 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. 877 * <p> 878 * Leading spaces won't be trimmed from the string, i.e. will show up in the 879 * parsed name afterwards. 880 * 881 * @param in 882 * the string to parse a name from. 883 * @return the parsed identity or null in case the identity could not be 884 * parsed. 885 */ 886 public static PersonIdent parsePersonIdent(String in) { 887 return parsePersonIdent(Constants.encode(in), 0); 888 } 889 890 /** 891 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent. 892 * <p> 893 * When passing in a value for <code>nameB</code> callers should use the 894 * return value of {@link #author(byte[], int)} or 895 * {@link #committer(byte[], int)}, as these methods provide the proper 896 * position within the buffer. 897 * 898 * @param raw 899 * the buffer to parse character data from. 900 * @param nameB 901 * first position of the identity information. This should be the 902 * first position after the space which delimits the header field 903 * name (e.g. 
"author" or "committer") from the rest of the 904 * identity line. 905 * @return the parsed identity or null in case the identity could not be 906 * parsed. 907 */ 908 public static PersonIdent parsePersonIdent(byte[] raw, int nameB) { 909 Charset cs; 910 try { 911 cs = parseEncoding(raw); 912 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { 913 // Assume UTF-8 for person identities, usually this is correct. 914 // If not decode() will fall back to the ISO-8859-1 encoding. 915 cs = UTF_8; 916 } 917 918 final int emailB = nextLF(raw, nameB, '<'); 919 final int emailE = nextLF(raw, emailB, '>'); 920 if (emailB >= raw.length || raw[emailB] == '\n' || 921 (emailE >= raw.length - 1 && raw[emailE - 1] != '>')) 922 return null; 923 924 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ? 925 emailB - 2 : emailB - 1; 926 final String name = decode(cs, raw, nameB, nameEnd); 927 final String email = decode(cs, raw, emailB, emailE - 1); 928 929 // Start searching from end of line, as after first name-email pair, 930 // another name-email pair may occur. We will ignore all kinds of 931 // "junk" following the first email. 932 // 933 // We've to use (emailE - 1) for the case that raw[email] is LF, 934 // otherwise we would run too far. "-2" is necessary to position 935 // before the LF in case of LF termination resp. the penultimate 936 // character if there is no trailing LF. 
937 final int tzBegin = lastIndexOfTrim(raw, ' ', 938 nextLF(raw, emailE - 1) - 2) + 1; 939 if (tzBegin <= emailE) // No time/zone, still valid 940 return new PersonIdent(name, email, 0, 0); 941 942 final int whenBegin = Math.max(emailE, 943 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1); 944 if (whenBegin >= tzBegin - 1) // No time/zone, still valid 945 return new PersonIdent(name, email, 0, 0); 946 947 final long when = parseLongBase10(raw, whenBegin, null); 948 final int tz = parseTimeZoneOffset(raw, tzBegin); 949 return new PersonIdent(name, email, when * 1000L, tz); 950 } 951 952 /** 953 * Parse a name data (e.g. as within a reflog) into a PersonIdent. 954 * <p> 955 * When passing in a value for <code>nameB</code> callers should use the 956 * return value of {@link #author(byte[], int)} or 957 * {@link #committer(byte[], int)}, as these methods provide the proper 958 * position within the buffer. 959 * 960 * @param raw 961 * the buffer to parse character data from. 962 * @param nameB 963 * first position of the identity information. This should be the 964 * first position after the space which delimits the header field 965 * name (e.g. "author" or "committer") from the rest of the 966 * identity line. 967 * @return the parsed identity. Never null. 
968 */ 969 public static PersonIdent parsePersonIdentOnly(final byte[] raw, 970 final int nameB) { 971 int stop = nextLF(raw, nameB); 972 int emailB = nextLF(raw, nameB, '<'); 973 int emailE = nextLF(raw, emailB, '>'); 974 final String name; 975 final String email; 976 if (emailE < stop) { 977 email = decode(raw, emailB, emailE - 1); 978 } else { 979 email = "invalid"; //$NON-NLS-1$ 980 } 981 if (emailB < stop) 982 name = decode(raw, nameB, emailB - 2); 983 else 984 name = decode(raw, nameB, stop); 985 986 final MutableInteger.html#MutableInteger">MutableInteger ptrout = new MutableInteger(); 987 long when; 988 int tz; 989 if (emailE < stop) { 990 when = parseLongBase10(raw, emailE + 1, ptrout); 991 tz = parseTimeZoneOffset(raw, ptrout.value); 992 } else { 993 when = 0; 994 tz = 0; 995 } 996 return new PersonIdent(name, email, when * 1000L, tz); 997 } 998 999 /** 1000 * Locate the end of a footer line key string. 1001 * <p> 1002 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. 1003 * "Signed-off-by: A. U. Thor\n") then this method returns the position of 1004 * the first ':'. 1005 * <p> 1006 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} 1007 * then this method returns -1. 1008 * 1009 * @param raw 1010 * buffer to scan. 1011 * @param ptr 1012 * first position within raw to consider as a footer line key. 1013 * @return position of the ':' which terminates the footer line key if this 1014 * is otherwise a valid footer line key; otherwise -1. 1015 */ 1016 public static int endOfFooterLineKey(byte[] raw, int ptr) { 1017 try { 1018 for (;;) { 1019 final byte c = raw[ptr]; 1020 if (footerLineKeyChars[c] == 0) { 1021 if (c == ':') 1022 return ptr; 1023 return -1; 1024 } 1025 ptr++; 1026 } 1027 } catch (ArrayIndexOutOfBoundsException e) { 1028 return -1; 1029 } 1030 } 1031 1032 /** 1033 * Decode a buffer under UTF-8, if possible. 
1034 * 1035 * If the byte stream cannot be decoded that way, the platform default is tried 1036 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1037 * 1038 * @param buffer 1039 * buffer to pull raw bytes from. 1040 * @return a string representation of the range <code>[start,end)</code>, 1041 * after decoding the region through the specified character set. 1042 */ 1043 public static String decode(byte[] buffer) { 1044 return decode(buffer, 0, buffer.length); 1045 } 1046 1047 /** 1048 * Decode a buffer under UTF-8, if possible. 1049 * 1050 * If the byte stream cannot be decoded that way, the platform default is 1051 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1052 * 1053 * @param buffer 1054 * buffer to pull raw bytes from. 1055 * @param start 1056 * start position in buffer 1057 * @param end 1058 * one position past the last location within the buffer to take 1059 * data from. 1060 * @return a string representation of the range <code>[start,end)</code>, 1061 * after decoding the region through the specified character set. 1062 */ 1063 public static String decode(final byte[] buffer, final int start, 1064 final int end) { 1065 return decode(UTF_8, buffer, start, end); 1066 } 1067 1068 /** 1069 * Decode a buffer under the specified character set if possible. 1070 * 1071 * If the byte stream cannot be decoded that way, the platform default is tried 1072 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1073 * 1074 * @param cs 1075 * character set to use when decoding the buffer. 1076 * @param buffer 1077 * buffer to pull raw bytes from. 1078 * @return a string representation of the range <code>[start,end)</code>, 1079 * after decoding the region through the specified character set. 
1080 */ 1081 public static String decode(Charset cs, byte[] buffer) { 1082 return decode(cs, buffer, 0, buffer.length); 1083 } 1084 1085 /** 1086 * Decode a region of the buffer under the specified character set if possible. 1087 * 1088 * If the byte stream cannot be decoded that way, the platform default is tried 1089 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1090 * 1091 * @param cs 1092 * character set to use when decoding the buffer. 1093 * @param buffer 1094 * buffer to pull raw bytes from. 1095 * @param start 1096 * first position within the buffer to take data from. 1097 * @param end 1098 * one position past the last location within the buffer to take 1099 * data from. 1100 * @return a string representation of the range <code>[start,end)</code>, 1101 * after decoding the region through the specified character set. 1102 */ 1103 public static String decode(final Charset cs, final byte[] buffer, 1104 final int start, final int end) { 1105 try { 1106 return decodeNoFallback(cs, buffer, start, end); 1107 } catch (CharacterCodingException e) { 1108 // Fall back to an ISO-8859-1 style encoding. At least all of 1109 // the bytes will be present in the output. 1110 // 1111 return extractBinaryString(buffer, start, end); 1112 } 1113 } 1114 1115 /** 1116 * Decode a region of the buffer under the specified character set if 1117 * possible. 1118 * 1119 * If the byte stream cannot be decoded that way, the platform default is 1120 * tried and if that too fails, an exception is thrown. 1121 * 1122 * @param cs 1123 * character set to use when decoding the buffer. 1124 * @param buffer 1125 * buffer to pull raw bytes from. 1126 * @param start 1127 * first position within the buffer to take data from. 1128 * @param end 1129 * one position past the last location within the buffer to take 1130 * data from. 1131 * @return a string representation of the range <code>[start,end)</code>, 1132 * after decoding the region through the specified character set. 
1133 * @throws java.nio.charset.CharacterCodingException 1134 * the input is not in any of the tested character sets. 1135 */ 1136 public static String decodeNoFallback(final Charset cs, 1137 final byte[] buffer, final int start, final int end) 1138 throws CharacterCodingException { 1139 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); 1140 b.mark(); 1141 1142 // Try our built-in favorite. The assumption here is that 1143 // decoding will fail if the data is not actually encoded 1144 // using that encoder. 1145 try { 1146 return decode(b, UTF_8); 1147 } catch (CharacterCodingException e) { 1148 b.reset(); 1149 } 1150 1151 if (!cs.equals(UTF_8)) { 1152 // Try the suggested encoding, it might be right since it was 1153 // provided by the caller. 1154 try { 1155 return decode(b, cs); 1156 } catch (CharacterCodingException e) { 1157 b.reset(); 1158 } 1159 } 1160 1161 // Try the default character set. A small group of people 1162 // might actually use the same (or very similar) locale. 1163 Charset defcs = Charset.defaultCharset(); 1164 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { 1165 try { 1166 return decode(b, defcs); 1167 } catch (CharacterCodingException e) { 1168 b.reset(); 1169 } 1170 } 1171 1172 throw new CharacterCodingException(); 1173 } 1174 1175 /** 1176 * Decode a region of the buffer under the ISO-8859-1 encoding. 1177 * 1178 * Each byte is treated as a single character in the 8859-1 character 1179 * encoding, performing a raw binary->char conversion. 1180 * 1181 * @param buffer 1182 * buffer to pull raw bytes from. 1183 * @param start 1184 * first position within the buffer to take data from. 1185 * @param end 1186 * one position past the last location within the buffer to take 1187 * data from. 1188 * @return a string representation of the range <code>[start,end)</code>. 
*/
	public static String extractBinaryString(final byte[] buffer,
			final int start, final int end) {
		final StringBuilder r = new StringBuilder(end - start);
		for (int i = start; i < end; i++)
			// Mask to 0..255 so bytes >= 0x80 do not sign-extend.
			r.append((char) (buffer[i] & 0xff));
		return r.toString();
	}

	/**
	 * Strictly decode the remaining bytes of a buffer with one character set.
	 *
	 * @param b
	 *            buffer to decode.
	 * @param charset
	 *            character set to decode with.
	 * @return the decoded string.
	 * @throws CharacterCodingException
	 *             the input is malformed or unmappable in {@code charset};
	 *             both conditions are reported rather than replaced.
	 */
	private static String decode(ByteBuffer b, Charset charset)
			throws CharacterCodingException {
		final CharsetDecoder d = charset.newDecoder();
		d.onMalformedInput(CodingErrorAction.REPORT);
		d.onUnmappableCharacter(CodingErrorAction.REPORT);
		return d.decode(b).toString();
	}

	/**
	 * Locate the position of the commit message body.
	 *
	 * @param b
	 *            buffer to scan.
	 * @param ptr
	 *            position in buffer to start the scan at. Most callers should
	 *            pass 0 to ensure the scan starts from the beginning of the
	 *            commit buffer.
	 * @return position of the user's message buffer; -1 if the blank line
	 *         that ends the headers was not found.
	 */
	public static final int commitMessage(byte[] b, int ptr) {
		final int sz = b.length;
		// NOTE(review): the fixed skips of 46 and 48 bytes look like
		// "tree " / "parent " + 40 hex digits + LF, i.e. they assume
		// 40-character object ids; confirm.
		if (ptr == 0)
			ptr += 46; // skip the "tree ..." line.
		while (ptr < sz && b[ptr] == 'p')
			ptr += 48; // skip this parent.

		// Skip any remaining header lines, ignoring what their actual
		// header line type is. This is identical to the logic for a tag.
		//
		return tagMessage(b, ptr);
	}

	/**
	 * Locate the position of the tag message body.
	 *
	 * @param b
	 *            buffer to scan.
	 * @param ptr
	 *            position in buffer to start the scan at. Most callers should
	 *            pass 0 to ensure the scan starts from the beginning of the tag
	 *            buffer.
	 * @return position of the user's message buffer; -1 if the blank line
	 *         that ends the headers was not found.
	 */
	public static final int tagMessage(byte[] b, int ptr) {
		final int sz = b.length;
		if (ptr == 0)
			ptr += 48; // skip the "object ..." line.
		// Advance one header line at a time until a line starts with LF,
		// i.e. the empty line separating headers from the message.
		while (ptr < sz && b[ptr] != '\n')
			ptr = nextLF(b, ptr);
		if (ptr < sz && b[ptr] == '\n')
			return ptr + 1;
		return -1;
	}

	/**
	 * Locate the end of a paragraph.
	 * <p>
	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
	 *
	 * @param b
	 *            buffer to scan.
	 * @param start
	 *            position in buffer to start the scan at. Most callers will
	 *            want to pass the first position of the commit message (as
	 *            found by {@link #commitMessage(byte[], int)}).
	 * @return position of the LF at the end of the paragraph;
	 *         <code>b.length</code> if no paragraph end could be located.
	 */
	public static final int endOfParagraph(byte[] b, int start) {
		int ptr = start;
		final int sz = b.length;
		// Walk line by line until a line begins with LF or CR (an empty
		// line) or the buffer is exhausted.
		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
			ptr = nextLF(b, ptr);
		// Step back over the line terminator (LF, optionally preceded by
		// CR) of the paragraph's last line.
		if (ptr > start && b[ptr - 1] == '\n')
			ptr--;
		if (ptr > start && b[ptr - 1] == '\r')
			ptr--;
		return ptr;
	}

	/**
	 * Get last index of {@code ch} in raw, trimming spaces.
	 * <p>
	 * Spaces at and immediately before {@code pos} are skipped first, then
	 * the buffer is searched backwards for {@code ch}.
	 *
	 * @param raw
	 *            buffer to scan.
	 * @param ch
	 *            character to find.
	 * @param pos
	 *            starting position.
	 * @return last index of {@code ch} in raw, trimming spaces; -1 if
	 *         {@code ch} does not occur at or before that point.
	 * @since 4.1
	 */
	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
		while (pos >= 0 && raw[pos] == ' ')
			pos--;

		while (pos >= 0 && raw[pos] != ch)
			pos--;

		return pos;
	}

	/**
	 * Look up a character set by one of the alias names registered in
	 * {@code encodingAliases}; the name is lower-cased before the lookup.
	 *
	 * @param name
	 *            alias name to resolve.
	 * @return the mapped character set, or {@code null} if the alias is not
	 *         registered.
	 */
	private static Charset charsetForAlias(String name) {
		return encodingAliases.get(StringUtils.toLowerCase(name));
	}

	private RawParseUtils() {
		// Don't create instances of a static only utility.
	}
}