1 /* 2 * Copyright (C) 2008-2009, Google Inc. 3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others 4 * 5 * This program and the accompanying materials are made available under the 6 * terms of the Eclipse Distribution License v. 1.0 which is available at 7 * https://www.eclipse.org/org/documents/edl-v10.php. 8 * 9 * SPDX-License-Identifier: BSD-3-Clause 10 */ 11 12 package org.eclipse.jgit.util; 13 14 import static java.nio.charset.StandardCharsets.ISO_8859_1; 15 import static java.nio.charset.StandardCharsets.UTF_8; 16 import static org.eclipse.jgit.lib.ObjectChecker.author; 17 import static org.eclipse.jgit.lib.ObjectChecker.committer; 18 import static org.eclipse.jgit.lib.ObjectChecker.encoding; 19 import static org.eclipse.jgit.lib.ObjectChecker.tagger; 20 21 import java.nio.ByteBuffer; 22 import java.nio.charset.CharacterCodingException; 23 import java.nio.charset.Charset; 24 import java.nio.charset.CharsetDecoder; 25 import java.nio.charset.CodingErrorAction; 26 import java.nio.charset.IllegalCharsetNameException; 27 import java.nio.charset.UnsupportedCharsetException; 28 import java.util.Arrays; 29 import java.util.HashMap; 30 import java.util.Map; 31 32 import org.eclipse.jgit.annotations.Nullable; 33 import org.eclipse.jgit.diff.RawText; 34 import org.eclipse.jgit.errors.BinaryBlobException; 35 import org.eclipse.jgit.lib.Constants; 36 import org.eclipse.jgit.lib.PersonIdent; 37 38 /** 39 * Handy utility functions to parse raw object contents. 40 */ 41 public final class RawParseUtils { 42 /** 43 * UTF-8 charset constant. 
44 * 45 * @since 2.2 46 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead 47 */ 48 @Deprecated 49 public static final Charset UTF8_CHARSET = UTF_8; 50 51 private static final byte[] digits10; 52 53 private static final byte[] digits16; 54 55 private static final byte[] footerLineKeyChars; 56 57 private static final Map<String, Charset> encodingAliases; 58 59 static { 60 encodingAliases = new HashMap<>(); 61 encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ 62 encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ 63 64 digits10 = new byte['9' + 1]; 65 Arrays.fill(digits10, (byte) -1); 66 for (char i = '0'; i <= '9'; i++) 67 digits10[i] = (byte) (i - '0'); 68 69 digits16 = new byte['f' + 1]; 70 Arrays.fill(digits16, (byte) -1); 71 for (char i = '0'; i <= '9'; i++) 72 digits16[i] = (byte) (i - '0'); 73 for (char i = 'a'; i <= 'f'; i++) 74 digits16[i] = (byte) ((i - 'a') + 10); 75 for (char i = 'A'; i <= 'F'; i++) 76 digits16[i] = (byte) ((i - 'A') + 10); 77 78 footerLineKeyChars = new byte['z' + 1]; 79 footerLineKeyChars['-'] = 1; 80 for (char i = '0'; i <= '9'; i++) 81 footerLineKeyChars[i] = 1; 82 for (char i = 'A'; i <= 'Z'; i++) 83 footerLineKeyChars[i] = 1; 84 for (char i = 'a'; i <= 'z'; i++) 85 footerLineKeyChars[i] = 1; 86 } 87 88 /** 89 * Determine if b[ptr] matches src. 90 * 91 * @param b 92 * the buffer to scan. 93 * @param ptr 94 * first position within b, this should match src[0]. 95 * @param src 96 * the buffer to test for equality with b. 97 * @return ptr + src.length if b[ptr..src.length] == src; else -1. 98 */ 99 public static final int match(byte[] b, int ptr, byte[] src) { 100 if (ptr + src.length > b.length) 101 return -1; 102 for (int i = 0; i < src.length; i++, ptr++) 103 if (b[ptr] != src[i]) 104 return -1; 105 return ptr; 106 } 107 108 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', 109 '6', '7', '8', '9' }; 110 111 /** 112 * Format a base 10 numeric into a temporary buffer. 
113 * <p> 114 * Formatting is performed backwards. The method starts at offset 115 * <code>o-1</code> and ends at <code>o-1-digits</code>, where 116 * <code>digits</code> is the number of positions necessary to store the 117 * base 10 value. 118 * <p> 119 * The argument and return values from this method make it easy to chain 120 * writing, for example: 121 * </p> 122 * 123 * <pre> 124 * final byte[] tmp = new byte[64]; 125 * int ptr = tmp.length; 126 * tmp[--ptr] = '\n'; 127 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); 128 * tmp[--ptr] = ' '; 129 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); 130 * tmp[--ptr] = 0; 131 * final String str = new String(tmp, ptr, tmp.length - ptr); 132 * </pre> 133 * 134 * @param b 135 * buffer to write into. 136 * @param o 137 * one offset past the location where writing will begin; writing 138 * proceeds towards lower index values. 139 * @param value 140 * the value to store. 141 * @return the new offset value <code>o</code>. This is the position of 142 * the last byte written. Additional writing should start at one 143 * position earlier. 144 */ 145 public static int formatBase10(final byte[] b, int o, int value) { 146 if (value == 0) { 147 b[--o] = '0'; 148 return o; 149 } 150 final boolean isneg = value < 0; 151 if (isneg) 152 value = -value; 153 while (value != 0) { 154 b[--o] = base10byte[value % 10]; 155 value /= 10; 156 } 157 if (isneg) 158 b[--o] = '-'; 159 return o; 160 } 161 162 /** 163 * Parse a base 10 numeric from a sequence of ASCII digits into an int. 164 * <p> 165 * Digit sequences can begin with an optional run of spaces before the 166 * sequence, and may start with a '+' or a '-' to indicate sign position. 167 * Any other characters will cause the method to stop and return the current 168 * result to the caller. 169 * 170 * @param b 171 * buffer to scan. 172 * @param ptr 173 * position within buffer to start parsing digits at. 
174 * @param ptrResult 175 * optional location to return the new ptr value through. If null 176 * the ptr value will be discarded. 177 * @return the value at this location; 0 if the location is not a valid 178 * numeric. 179 */ 180 public static final int parseBase10(final byte[] b, int ptr, 181 final MutableInteger ptrResult) { 182 int r = 0; 183 int sign = 0; 184 try { 185 final int sz = b.length; 186 while (ptr < sz && b[ptr] == ' ') 187 ptr++; 188 if (ptr >= sz) 189 return 0; 190 191 switch (b[ptr]) { 192 case '-': 193 sign = -1; 194 ptr++; 195 break; 196 case '+': 197 ptr++; 198 break; 199 } 200 201 while (ptr < sz) { 202 final byte v = digits10[b[ptr]]; 203 if (v < 0) 204 break; 205 r = (r * 10) + v; 206 ptr++; 207 } 208 } catch (ArrayIndexOutOfBoundsException e) { 209 // Not a valid digit. 210 } 211 if (ptrResult != null) 212 ptrResult.value = ptr; 213 return sign < 0 ? -r : r; 214 } 215 216 /** 217 * Parse a base 10 numeric from a sequence of ASCII digits into a long. 218 * <p> 219 * Digit sequences can begin with an optional run of spaces before the 220 * sequence, and may start with a '+' or a '-' to indicate sign position. 221 * Any other characters will cause the method to stop and return the current 222 * result to the caller. 223 * 224 * @param b 225 * buffer to scan. 226 * @param ptr 227 * position within buffer to start parsing digits at. 228 * @param ptrResult 229 * optional location to return the new ptr value through. If null 230 * the ptr value will be discarded. 231 * @return the value at this location; 0 if the location is not a valid 232 * numeric. 
233 */ 234 public static final long parseLongBase10(final byte[] b, int ptr, 235 final MutableInteger ptrResult) { 236 long r = 0; 237 int sign = 0; 238 try { 239 final int sz = b.length; 240 while (ptr < sz && b[ptr] == ' ') 241 ptr++; 242 if (ptr >= sz) 243 return 0; 244 245 switch (b[ptr]) { 246 case '-': 247 sign = -1; 248 ptr++; 249 break; 250 case '+': 251 ptr++; 252 break; 253 } 254 255 while (ptr < sz) { 256 final byte v = digits10[b[ptr]]; 257 if (v < 0) 258 break; 259 r = (r * 10) + v; 260 ptr++; 261 } 262 } catch (ArrayIndexOutOfBoundsException e) { 263 // Not a valid digit. 264 } 265 if (ptrResult != null) 266 ptrResult.value = ptr; 267 return sign < 0 ? -r : r; 268 } 269 270 /** 271 * Parse 4 character base 16 (hex) formatted string to unsigned integer. 272 * <p> 273 * The number is read in network byte order, that is, most significant 274 * nybble first. 275 * 276 * @param bs 277 * buffer to parse digits from; positions {@code [p, p+4)} will 278 * be parsed. 279 * @param p 280 * first position within the buffer to parse. 281 * @return the integer value. 282 * @throws java.lang.ArrayIndexOutOfBoundsException 283 * if the string is not hex formatted. 284 */ 285 public static final int parseHexInt16(final byte[] bs, final int p) { 286 int r = digits16[bs[p]] << 4; 287 288 r |= digits16[bs[p + 1]]; 289 r <<= 4; 290 291 r |= digits16[bs[p + 2]]; 292 r <<= 4; 293 294 r |= digits16[bs[p + 3]]; 295 if (r < 0) 296 throw new ArrayIndexOutOfBoundsException(); 297 return r; 298 } 299 300 /** 301 * Parse 8 character base 16 (hex) formatted string to unsigned integer. 302 * <p> 303 * The number is read in network byte order, that is, most significant 304 * nybble first. 305 * 306 * @param bs 307 * buffer to parse digits from; positions {@code [p, p+8)} will 308 * be parsed. 309 * @param p 310 * first position within the buffer to parse. 311 * @return the integer value. 312 * @throws java.lang.ArrayIndexOutOfBoundsException 313 * if the string is not hex formatted. 
314 */ 315 public static final int parseHexInt32(final byte[] bs, final int p) { 316 int r = digits16[bs[p]] << 4; 317 318 r |= digits16[bs[p + 1]]; 319 r <<= 4; 320 321 r |= digits16[bs[p + 2]]; 322 r <<= 4; 323 324 r |= digits16[bs[p + 3]]; 325 r <<= 4; 326 327 r |= digits16[bs[p + 4]]; 328 r <<= 4; 329 330 r |= digits16[bs[p + 5]]; 331 r <<= 4; 332 333 r |= digits16[bs[p + 6]]; 334 335 final int last = digits16[bs[p + 7]]; 336 if (r < 0 || last < 0) 337 throw new ArrayIndexOutOfBoundsException(); 338 return (r << 4) | last; 339 } 340 341 /** 342 * Parse 16 character base 16 (hex) formatted string to unsigned long. 343 * <p> 344 * The number is read in network byte order, that is, most significant 345 * nibble first. 346 * 347 * @param bs 348 * buffer to parse digits from; positions {@code [p, p+16)} will 349 * be parsed. 350 * @param p 351 * first position within the buffer to parse. 352 * @return the integer value. 353 * @throws java.lang.ArrayIndexOutOfBoundsException 354 * if the string is not hex formatted. 
355 * @since 4.3 356 */ 357 public static final long parseHexInt64(final byte[] bs, final int p) { 358 long r = digits16[bs[p]] << 4; 359 360 r |= digits16[bs[p + 1]]; 361 r <<= 4; 362 363 r |= digits16[bs[p + 2]]; 364 r <<= 4; 365 366 r |= digits16[bs[p + 3]]; 367 r <<= 4; 368 369 r |= digits16[bs[p + 4]]; 370 r <<= 4; 371 372 r |= digits16[bs[p + 5]]; 373 r <<= 4; 374 375 r |= digits16[bs[p + 6]]; 376 r <<= 4; 377 378 r |= digits16[bs[p + 7]]; 379 r <<= 4; 380 381 r |= digits16[bs[p + 8]]; 382 r <<= 4; 383 384 r |= digits16[bs[p + 9]]; 385 r <<= 4; 386 387 r |= digits16[bs[p + 10]]; 388 r <<= 4; 389 390 r |= digits16[bs[p + 11]]; 391 r <<= 4; 392 393 r |= digits16[bs[p + 12]]; 394 r <<= 4; 395 396 r |= digits16[bs[p + 13]]; 397 r <<= 4; 398 399 r |= digits16[bs[p + 14]]; 400 401 final int last = digits16[bs[p + 15]]; 402 if (r < 0 || last < 0) 403 throw new ArrayIndexOutOfBoundsException(); 404 return (r << 4) | last; 405 } 406 407 /** 408 * Parse a single hex digit to its numeric value (0-15). 409 * 410 * @param digit 411 * hex character to parse. 412 * @return numeric value, in the range 0-15. 413 * @throws java.lang.ArrayIndexOutOfBoundsException 414 * if the input digit is not a valid hex digit. 415 */ 416 public static final int parseHexInt4(final byte digit) { 417 final byte r = digits16[digit]; 418 if (r < 0) 419 throw new ArrayIndexOutOfBoundsException(); 420 return r; 421 } 422 423 /** 424 * Parse a Git style timezone string. 425 * <p> 426 * The sequence "-0315" will be parsed as the numeric value -195, as the 427 * lower two positions count minutes, not 100ths of an hour. 428 * 429 * @param b 430 * buffer to scan. 431 * @param ptr 432 * position within buffer to start parsing digits at. 433 * @return the timezone at this location, expressed in minutes. 434 */ 435 public static final int parseTimeZoneOffset(byte[] b, int ptr) { 436 return parseTimeZoneOffset(b, ptr, null); 437 } 438 439 /** 440 * Parse a Git style timezone string. 
441 * <p> 442 * The sequence "-0315" will be parsed as the numeric value -195, as the 443 * lower two positions count minutes, not 100ths of an hour. 444 * 445 * @param b 446 * buffer to scan. 447 * @param ptr 448 * position within buffer to start parsing digits at. 449 * @param ptrResult 450 * optional location to return the new ptr value through. If null 451 * the ptr value will be discarded. 452 * @return the timezone at this location, expressed in minutes. 453 * @since 4.1 454 */ 455 public static final int parseTimeZoneOffset(final byte[] b, int ptr, 456 MutableInteger ptrResult) { 457 final int v = parseBase10(b, ptr, ptrResult); 458 final int tzMins = v % 100; 459 final int tzHours = v / 100; 460 return tzHours * 60 + tzMins; 461 } 462 463 /** 464 * Locate the first position after a given character. 465 * 466 * @param b 467 * buffer to scan. 468 * @param ptr 469 * position within buffer to start looking for chrA at. 470 * @param chrA 471 * character to find. 472 * @return new position just after chrA. 473 */ 474 public static final int next(byte[] b, int ptr, char chrA) { 475 final int sz = b.length; 476 while (ptr < sz) { 477 if (b[ptr++] == chrA) 478 return ptr; 479 } 480 return ptr; 481 } 482 483 /** 484 * Locate the first position after the next LF. 485 * <p> 486 * This method stops on the first '\n' it finds. 487 * 488 * @param b 489 * buffer to scan. 490 * @param ptr 491 * position within buffer to start looking for LF at. 492 * @return new position just after the first LF found. 493 */ 494 public static final int nextLF(byte[] b, int ptr) { 495 return next(b, ptr, '\n'); 496 } 497 498 /** 499 * Locate the first position after either the given character or LF. 500 * <p> 501 * This method stops on the first match it finds from either chrA or '\n'. 502 * 503 * @param b 504 * buffer to scan. 505 * @param ptr 506 * position within buffer to start looking for chrA or LF at. 507 * @param chrA 508 * character to find. 
509 * @return new position just after the first chrA or LF to be found. 510 */ 511 public static final int nextLF(byte[] b, int ptr, char chrA) { 512 final int sz = b.length; 513 while (ptr < sz) { 514 final byte c = b[ptr++]; 515 if (c == chrA || c == '\n') 516 return ptr; 517 } 518 return ptr; 519 } 520 521 /** 522 * Locate the end of the header. Note that headers may be 523 * more than one line long. 524 * @param b 525 * buffer to scan. 526 * @param ptr 527 * position within buffer to start looking for the end-of-header. 528 * @return new position just after the header. This is either 529 * b.length, or the index of the header's terminating newline. 530 * @since 5.1 531 */ 532 public static final int headerEnd(final byte[] b, int ptr) { 533 final int sz = b.length; 534 while (ptr < sz) { 535 final byte c = b[ptr++]; 536 if (c == '\n' && (ptr == sz || b[ptr] != ' ')) { 537 return ptr - 1; 538 } 539 } 540 return ptr - 1; 541 } 542 543 /** 544 * Find the start of the contents of a given header. 545 * 546 * @param b 547 * buffer to scan. 548 * @param headerName 549 * header to search for 550 * @param ptr 551 * position within buffer to start looking for header at. 552 * @return new position at the start of the header's contents, -1 for 553 * not found 554 * @since 5.1 555 */ 556 public static final int headerStart(byte[] headerName, byte[] b, int ptr) { 557 // Start by advancing to just past a LF or buffer start 558 if (ptr != 0) { 559 ptr = nextLF(b, ptr - 1); 560 } 561 while (ptr < b.length - (headerName.length + 1)) { 562 boolean found = true; 563 for (byte element : headerName) { 564 if (element != b[ptr++]) { 565 found = false; 566 break; 567 } 568 } 569 if (found && b[ptr++] == ' ') { 570 return ptr; 571 } 572 ptr = nextLF(b, ptr); 573 } 574 return -1; 575 } 576 577 /** 578 * Locate the first position before a given character. 579 * 580 * @param b 581 * buffer to scan. 582 * @param ptr 583 * position within buffer to start looking for chrA at. 
584 * @param chrA 585 * character to find. 586 * @return new position just before chrA, -1 for not found 587 */ 588 public static final int prev(byte[] b, int ptr, char chrA) { 589 if (ptr == b.length) 590 --ptr; 591 while (ptr >= 0) { 592 if (b[ptr--] == chrA) 593 return ptr; 594 } 595 return ptr; 596 } 597 598 /** 599 * Locate the first position before the previous LF. 600 * <p> 601 * This method stops on the first '\n' it finds. 602 * 603 * @param b 604 * buffer to scan. 605 * @param ptr 606 * position within buffer to start looking for LF at. 607 * @return new position just before the first LF found, -1 for not found 608 */ 609 public static final int prevLF(byte[] b, int ptr) { 610 return prev(b, ptr, '\n'); 611 } 612 613 /** 614 * Locate the previous position before either the given character or LF. 615 * <p> 616 * This method stops on the first match it finds from either chrA or '\n'. 617 * 618 * @param b 619 * buffer to scan. 620 * @param ptr 621 * position within buffer to start looking for chrA or LF at. 622 * @param chrA 623 * character to find. 624 * @return new position just before the first chrA or LF to be found, -1 for 625 * not found 626 */ 627 public static final int prevLF(byte[] b, int ptr, char chrA) { 628 if (ptr == b.length) 629 --ptr; 630 while (ptr >= 0) { 631 final byte c = b[ptr--]; 632 if (c == chrA || c == '\n') 633 return ptr; 634 } 635 return ptr; 636 } 637 638 /** 639 * Index the region between <code>[ptr, end)</code> to find line starts. 640 * <p> 641 * The returned list is 1 indexed. Index 0 contains 642 * {@link java.lang.Integer#MIN_VALUE} to pad the list out. 643 * <p> 644 * Using a 1 indexed list means that line numbers can be directly accessed 645 * from the list, so <code>list.get(1)</code> (aka get line 1) returns 646 * <code>ptr</code>. 647 * <p> 648 * The last element (index <code>map.size()-1</code>) always contains 649 * <code>end</code>. 650 * 651 * @param buf 652 * buffer to scan. 
653 * @param ptr 654 * position within the buffer corresponding to the first byte of 655 * line 1. 656 * @param end 657 * 1 past the end of the content within <code>buf</code>. 658 * @return a line map indicating the starting position of each line. 659 */ 660 public static final IntList lineMap(byte[] buf, int ptr, int end) { 661 IntList map = new IntList((end - ptr) / 36); 662 map.fillTo(1, Integer.MIN_VALUE); 663 for (; ptr < end; ptr = nextLF(buf, ptr)) { 664 map.add(ptr); 665 } 666 map.add(end); 667 return map; 668 } 669 670 /** 671 * Like {@link #lineMap(byte[], int, int)} but throw 672 * {@link BinaryBlobException} if a NUL byte is encountered. 673 * 674 * @param buf 675 * buffer to scan. 676 * @param ptr 677 * position within the buffer corresponding to the first byte of 678 * line 1. 679 * @param end 680 * 1 past the end of the content within <code>buf</code>. 681 * @return a line map indicating the starting position of each line. 682 * @throws BinaryBlobException 683 * if a NUL byte or a lone CR is found. 684 * @since 5.0 685 */ 686 public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end) 687 throws BinaryBlobException { 688 // Experimentally derived from multiple source repositories 689 // the average number of bytes/line is 36. Its a rough guess 690 // to initially size our map close to the target. 691 IntList map = new IntList((end - ptr) / 36); 692 map.add(Integer.MIN_VALUE); 693 byte last = '\n'; // Must be \n to add the initial ptr 694 for (; ptr < end; ptr++) { 695 if (last == '\n') { 696 map.add(ptr); 697 } 698 byte curr = buf[ptr]; 699 if (RawText.isBinary(curr, last)) { 700 throw new BinaryBlobException(); 701 } 702 last = curr; 703 } 704 if (last == '\r') { 705 // Counts as binary 706 throw new BinaryBlobException(); 707 } 708 map.add(end); 709 return map; 710 } 711 712 /** 713 * Locate the "author " header line data. 714 * 715 * @param b 716 * buffer to scan. 717 * @param ptr 718 * position in buffer to start the scan at. 
Most callers should 719 * pass 0 to ensure the scan starts from the beginning of the 720 * commit buffer and does not accidentally look at message body. 721 * @return position just after the space in "author ", so the first 722 * character of the author's name. If no author header can be 723 * located -1 is returned. 724 */ 725 public static final int author(byte[] b, int ptr) { 726 final int sz = b.length; 727 if (ptr == 0) 728 ptr += 46; // skip the "tree ..." line. 729 while (ptr < sz && b[ptr] == 'p') 730 ptr += 48; // skip this parent. 731 return match(b, ptr, author); 732 } 733 734 /** 735 * Locate the "committer " header line data. 736 * 737 * @param b 738 * buffer to scan. 739 * @param ptr 740 * position in buffer to start the scan at. Most callers should 741 * pass 0 to ensure the scan starts from the beginning of the 742 * commit buffer and does not accidentally look at message body. 743 * @return position just after the space in "committer ", so the first 744 * character of the committer's name. If no committer header can be 745 * located -1 is returned. 746 */ 747 public static final int committer(byte[] b, int ptr) { 748 final int sz = b.length; 749 if (ptr == 0) 750 ptr += 46; // skip the "tree ..." line. 751 while (ptr < sz && b[ptr] == 'p') 752 ptr += 48; // skip this parent. 753 if (ptr < sz && b[ptr] == 'a') 754 ptr = nextLF(b, ptr); 755 return match(b, ptr, committer); 756 } 757 758 /** 759 * Locate the "tagger " header line data. 760 * 761 * @param b 762 * buffer to scan. 763 * @param ptr 764 * position in buffer to start the scan at. Most callers should 765 * pass 0 to ensure the scan starts from the beginning of the tag 766 * buffer and does not accidentally look at message body. 767 * @return position just after the space in "tagger ", so the first 768 * character of the tagger's name. If no tagger header can be 769 * located -1 is returned. 
770 */ 771 public static final int tagger(byte[] b, int ptr) { 772 final int sz = b.length; 773 if (ptr == 0) 774 ptr += 48; // skip the "object ..." line. 775 while (ptr < sz) { 776 if (b[ptr] == '\n') 777 return -1; 778 final int m = match(b, ptr, tagger); 779 if (m >= 0) 780 return m; 781 ptr = nextLF(b, ptr); 782 } 783 return -1; 784 } 785 786 /** 787 * Locate the "encoding " header line. 788 * 789 * @param b 790 * buffer to scan. 791 * @param ptr 792 * position in buffer to start the scan at. Most callers should 793 * pass 0 to ensure the scan starts from the beginning of the 794 * buffer and does not accidentally look at the message body. 795 * @return position just after the space in "encoding ", so the first 796 * character of the encoding's name. If no encoding header can be 797 * located -1 is returned (and UTF-8 should be assumed). 798 */ 799 public static final int encoding(byte[] b, int ptr) { 800 final int sz = b.length; 801 while (ptr < sz) { 802 if (b[ptr] == '\n') 803 return -1; 804 if (b[ptr] == 'e') 805 break; 806 ptr = nextLF(b, ptr); 807 } 808 return match(b, ptr, encoding); 809 } 810 811 /** 812 * Parse the "encoding " header as a string. 813 * <p> 814 * Locates the "encoding " header (if present) and returns its value. 815 * 816 * @param b 817 * buffer to scan. 818 * @return the encoding header as specified in the commit; null if the 819 * header was not present and should be assumed. 820 * @since 4.2 821 */ 822 @Nullable 823 public static String parseEncodingName(byte[] b) { 824 int enc = encoding(b, 0); 825 if (enc < 0) { 826 return null; 827 } 828 int lf = nextLF(b, enc); 829 return decode(UTF_8, b, enc, lf - 1); 830 } 831 832 /** 833 * Parse the "encoding " header into a character set reference. 834 * <p> 835 * Locates the "encoding " header (if present) by first calling 836 * {@link #encoding(byte[], int)} and then returns the proper character set 837 * to apply to this buffer to evaluate its contents as character data. 
838 * <p> 839 * If no encoding header is present {@code UTF-8} is assumed. 840 * 841 * @param b 842 * buffer to scan. 843 * @return the Java character set representation. Never null. 844 * @throws IllegalCharsetNameException 845 * if the character set requested by the encoding header is 846 * malformed and unsupportable. 847 * @throws UnsupportedCharsetException 848 * if the JRE does not support the character set requested by 849 * the encoding header. 850 */ 851 public static Charset parseEncoding(byte[] b) { 852 String enc = parseEncodingName(b); 853 if (enc == null) { 854 return UTF_8; 855 } 856 857 String name = enc.trim(); 858 try { 859 return Charset.forName(name); 860 } catch (IllegalCharsetNameException 861 | UnsupportedCharsetException badName) { 862 Charset aliased = charsetForAlias(name); 863 if (aliased != null) { 864 return aliased; 865 } 866 throw badName; 867 } 868 } 869 870 /** 871 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. 872 * <p> 873 * Leading spaces won't be trimmed from the string, i.e. will show up in the 874 * parsed name afterwards. 875 * 876 * @param in 877 * the string to parse a name from. 878 * @return the parsed identity or null in case the identity could not be 879 * parsed. 880 */ 881 public static PersonIdent parsePersonIdent(String in) { 882 return parsePersonIdent(Constants.encode(in), 0); 883 } 884 885 /** 886 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent. 887 * <p> 888 * When passing in a value for <code>nameB</code> callers should use the 889 * return value of {@link #author(byte[], int)} or 890 * {@link #committer(byte[], int)}, as these methods provide the proper 891 * position within the buffer. 892 * 893 * @param raw 894 * the buffer to parse character data from. 895 * @param nameB 896 * first position of the identity information. This should be the 897 * first position after the space which delimits the header field 898 * name (e.g. 
"author" or "committer") from the rest of the 899 * identity line. 900 * @return the parsed identity or null in case the identity could not be 901 * parsed. 902 */ 903 public static PersonIdent parsePersonIdent(byte[] raw, int nameB) { 904 Charset cs; 905 try { 906 cs = parseEncoding(raw); 907 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { 908 // Assume UTF-8 for person identities, usually this is correct. 909 // If not decode() will fall back to the ISO-8859-1 encoding. 910 cs = UTF_8; 911 } 912 913 final int emailB = nextLF(raw, nameB, '<'); 914 final int emailE = nextLF(raw, emailB, '>'); 915 if (emailB >= raw.length || raw[emailB] == '\n' || 916 (emailE >= raw.length - 1 && raw[emailE - 1] != '>')) 917 return null; 918 919 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ? 920 emailB - 2 : emailB - 1; 921 final String name = decode(cs, raw, nameB, nameEnd); 922 final String email = decode(cs, raw, emailB, emailE - 1); 923 924 // Start searching from end of line, as after first name-email pair, 925 // another name-email pair may occur. We will ignore all kinds of 926 // "junk" following the first email. 927 // 928 // We've to use (emailE - 1) for the case that raw[email] is LF, 929 // otherwise we would run too far. "-2" is necessary to position 930 // before the LF in case of LF termination resp. the penultimate 931 // character if there is no trailing LF. 
932 final int tzBegin = lastIndexOfTrim(raw, ' ', 933 nextLF(raw, emailE - 1) - 2) + 1; 934 if (tzBegin <= emailE) // No time/zone, still valid 935 return new PersonIdent(name, email, 0, 0); 936 937 final int whenBegin = Math.max(emailE, 938 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1); 939 if (whenBegin >= tzBegin - 1) // No time/zone, still valid 940 return new PersonIdent(name, email, 0, 0); 941 942 final long when = parseLongBase10(raw, whenBegin, null); 943 final int tz = parseTimeZoneOffset(raw, tzBegin); 944 return new PersonIdent(name, email, when * 1000L, tz); 945 } 946 947 /** 948 * Parse a name data (e.g. as within a reflog) into a PersonIdent. 949 * <p> 950 * When passing in a value for <code>nameB</code> callers should use the 951 * return value of {@link #author(byte[], int)} or 952 * {@link #committer(byte[], int)}, as these methods provide the proper 953 * position within the buffer. 954 * 955 * @param raw 956 * the buffer to parse character data from. 957 * @param nameB 958 * first position of the identity information. This should be the 959 * first position after the space which delimits the header field 960 * name (e.g. "author" or "committer") from the rest of the 961 * identity line. 962 * @return the parsed identity. Never null. 
963 */ 964 public static PersonIdent parsePersonIdentOnly(final byte[] raw, 965 final int nameB) { 966 int stop = nextLF(raw, nameB); 967 int emailB = nextLF(raw, nameB, '<'); 968 int emailE = nextLF(raw, emailB, '>'); 969 final String name; 970 final String email; 971 if (emailE < stop) { 972 email = decode(raw, emailB, emailE - 1); 973 } else { 974 email = "invalid"; //$NON-NLS-1$ 975 } 976 if (emailB < stop) 977 name = decode(raw, nameB, emailB - 2); 978 else 979 name = decode(raw, nameB, stop); 980 981 final MutableInteger ptrout = new MutableInteger(); 982 long when; 983 int tz; 984 if (emailE < stop) { 985 when = parseLongBase10(raw, emailE + 1, ptrout); 986 tz = parseTimeZoneOffset(raw, ptrout.value); 987 } else { 988 when = 0; 989 tz = 0; 990 } 991 return new PersonIdent(name, email, when * 1000L, tz); 992 } 993 994 /** 995 * Locate the end of a footer line key string. 996 * <p> 997 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. 998 * "Signed-off-by: A. U. Thor\n") then this method returns the position of 999 * the first ':'. 1000 * <p> 1001 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} 1002 * then this method returns -1. 1003 * 1004 * @param raw 1005 * buffer to scan. 1006 * @param ptr 1007 * first position within raw to consider as a footer line key. 1008 * @return position of the ':' which terminates the footer line key if this 1009 * is otherwise a valid footer line key; otherwise -1. 1010 */ 1011 public static int endOfFooterLineKey(byte[] raw, int ptr) { 1012 try { 1013 for (;;) { 1014 final byte c = raw[ptr]; 1015 if (footerLineKeyChars[c] == 0) { 1016 if (c == ':') 1017 return ptr; 1018 return -1; 1019 } 1020 ptr++; 1021 } 1022 } catch (ArrayIndexOutOfBoundsException e) { 1023 return -1; 1024 } 1025 } 1026 1027 /** 1028 * Decode a buffer under UTF-8, if possible. 
1029 * 1030 * If the byte stream cannot be decoded that way, the platform default is tried 1031 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1032 * 1033 * @param buffer 1034 * buffer to pull raw bytes from. 1035 * @return a string representation of the range <code>[start,end)</code>, 1036 * after decoding the region through the specified character set. 1037 */ 1038 public static String decode(byte[] buffer) { 1039 return decode(buffer, 0, buffer.length); 1040 } 1041 1042 /** 1043 * Decode a buffer under UTF-8, if possible. 1044 * 1045 * If the byte stream cannot be decoded that way, the platform default is 1046 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1047 * 1048 * @param buffer 1049 * buffer to pull raw bytes from. 1050 * @param start 1051 * start position in buffer 1052 * @param end 1053 * one position past the last location within the buffer to take 1054 * data from. 1055 * @return a string representation of the range <code>[start,end)</code>, 1056 * after decoding the region through the specified character set. 1057 */ 1058 public static String decode(final byte[] buffer, final int start, 1059 final int end) { 1060 return decode(UTF_8, buffer, start, end); 1061 } 1062 1063 /** 1064 * Decode a buffer under the specified character set if possible. 1065 * 1066 * If the byte stream cannot be decoded that way, the platform default is tried 1067 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1068 * 1069 * @param cs 1070 * character set to use when decoding the buffer. 1071 * @param buffer 1072 * buffer to pull raw bytes from. 1073 * @return a string representation of the range <code>[start,end)</code>, 1074 * after decoding the region through the specified character set. 
1075 */ 1076 public static String decode(Charset cs, byte[] buffer) { 1077 return decode(cs, buffer, 0, buffer.length); 1078 } 1079 1080 /** 1081 * Decode a region of the buffer under the specified character set if possible. 1082 * 1083 * If the byte stream cannot be decoded that way, the platform default is tried 1084 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1085 * 1086 * @param cs 1087 * character set to use when decoding the buffer. 1088 * @param buffer 1089 * buffer to pull raw bytes from. 1090 * @param start 1091 * first position within the buffer to take data from. 1092 * @param end 1093 * one position past the last location within the buffer to take 1094 * data from. 1095 * @return a string representation of the range <code>[start,end)</code>, 1096 * after decoding the region through the specified character set. 1097 */ 1098 public static String decode(final Charset cs, final byte[] buffer, 1099 final int start, final int end) { 1100 try { 1101 return decodeNoFallback(cs, buffer, start, end); 1102 } catch (CharacterCodingException e) { 1103 // Fall back to an ISO-8859-1 style encoding. At least all of 1104 // the bytes will be present in the output. 1105 // 1106 return extractBinaryString(buffer, start, end); 1107 } 1108 } 1109 1110 /** 1111 * Decode a region of the buffer under the specified character set if 1112 * possible. 1113 * 1114 * If the byte stream cannot be decoded that way, the platform default is 1115 * tried and if that too fails, an exception is thrown. 1116 * 1117 * @param cs 1118 * character set to use when decoding the buffer. 1119 * @param buffer 1120 * buffer to pull raw bytes from. 1121 * @param start 1122 * first position within the buffer to take data from. 1123 * @param end 1124 * one position past the last location within the buffer to take 1125 * data from. 1126 * @return a string representation of the range <code>[start,end)</code>, 1127 * after decoding the region through the specified character set. 
1128 * @throws java.nio.charset.CharacterCodingException 1129 * the input is not in any of the tested character sets. 1130 */ 1131 public static String decodeNoFallback(final Charset cs, 1132 final byte[] buffer, final int start, final int end) 1133 throws CharacterCodingException { 1134 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); 1135 b.mark(); 1136 1137 // Try our built-in favorite. The assumption here is that 1138 // decoding will fail if the data is not actually encoded 1139 // using that encoder. 1140 try { 1141 return decode(b, UTF_8); 1142 } catch (CharacterCodingException e) { 1143 b.reset(); 1144 } 1145 1146 if (!cs.equals(UTF_8)) { 1147 // Try the suggested encoding, it might be right since it was 1148 // provided by the caller. 1149 try { 1150 return decode(b, cs); 1151 } catch (CharacterCodingException e) { 1152 b.reset(); 1153 } 1154 } 1155 1156 // Try the default character set. A small group of people 1157 // might actually use the same (or very similar) locale. 1158 Charset defcs = SystemReader.getInstance().getDefaultCharset(); 1159 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { 1160 try { 1161 return decode(b, defcs); 1162 } catch (CharacterCodingException e) { 1163 b.reset(); 1164 } 1165 } 1166 1167 throw new CharacterCodingException(); 1168 } 1169 1170 /** 1171 * Decode a region of the buffer under the ISO-8859-1 encoding. 1172 * 1173 * Each byte is treated as a single character in the 8859-1 character 1174 * encoding, performing a raw binary->char conversion. 1175 * 1176 * @param buffer 1177 * buffer to pull raw bytes from. 1178 * @param start 1179 * first position within the buffer to take data from. 1180 * @param end 1181 * one position past the last location within the buffer to take 1182 * data from. 1183 * @return a string representation of the range <code>[start,end)</code>. 
*/
	public static String extractBinaryString(final byte[] buffer,
			final int start, final int end) {
		final char[] chars = new char[end - start];
		for (int i = 0; i < chars.length; i++) {
			// Mask to keep bytes >= 0x80 from sign-extending into the
			// upper half of the char.
			chars[i] = (char) (buffer[start + i] & 0xff);
		}
		return new String(chars);
	}

	/**
	 * Strictly decode the remaining bytes of a buffer.
	 *
	 * Malformed input and unmappable characters are reported as errors
	 * instead of being replaced.
	 *
	 * @param b
	 *            bytes to decode.
	 * @param charset
	 *            character set to decode with.
	 * @return the decoded text.
	 * @throws CharacterCodingException
	 *             the bytes are not valid in {@code charset}.
	 */
	private static String decode(ByteBuffer b, Charset charset)
			throws CharacterCodingException {
		return charset.newDecoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT)
				.decode(b)
				.toString();
	}

	/**
	 * Locate the position of the commit message body.
	 *
	 * @param b
	 *            buffer to scan.
	 * @param ptr
	 *            position in buffer to start the scan at. Most callers should
	 *            pass 0 to ensure the scan starts from the beginning of the
	 *            commit buffer.
	 * @return position of the user's message buffer, or -1 if the blank line
	 *         that ends the headers was not found.
	 */
	public static final int commitMessage(byte[] b, int ptr) {
		final int limit = b.length;
		// At the very start of the buffer, skip the "tree ..." line
		// (presumably "tree " + 40 hex digits + LF = 46 bytes).
		int pos = ptr == 0 ? 46 : ptr;
		// Every line starting with 'p' is taken to be a 48 byte
		// "parent ..." line and skipped.
		while (pos < limit && b[pos] == 'p')
			pos += 48;

		// The remaining header lines are skipped without looking at their
		// type, which is exactly what locating a tag message does.
		return tagMessage(b, pos);
	}

	/**
	 * Locate the position of the tag message body.
	 *
	 * @param b
	 *            buffer to scan.
	 * @param ptr
	 *            position in buffer to start the scan at. Most callers should
	 *            pass 0 to ensure the scan starts from the beginning of the tag
	 *            buffer.
	 * @return position of the user's message buffer, or -1 if the blank line
	 *         that ends the headers was not found.
	 */
	public static final int tagMessage(byte[] b, int ptr) {
		final int limit = b.length;
		// At the very start of the buffer, skip the "object ..." line.
		int pos = ptr == 0 ? 48 : ptr;
		// Advance one header line at a time until a line starts with LF,
		// i.e. until the empty line that separates headers from message.
		while (pos < limit && b[pos] != '\n')
			pos = nextLF(b, pos);
		return pos < limit && b[pos] == '\n' ? pos + 1 : -1;
	}

	/**
	 * Locate the end of a paragraph.
	 * <p>
	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
	 *
	 * @param b
	 *            buffer to scan.
	 * @param start
	 *            position in buffer to start the scan at. Most callers will
	 *            want to pass the first position of the commit message (as
	 *            found by {@link #commitMessage(byte[], int)}).
	 * @return position of the LF at the end of the paragraph;
	 *         <code>b.length</code> if no paragraph end could be located.
	 */
	public static final int endOfParagraph(byte[] b, int start) {
		final int limit = b.length;
		int pos = start;
		// Move from line start to line start until a line begins with a
		// line terminator (an empty line) or the buffer is exhausted.
		while (pos < limit) {
			if (b[pos] == '\n' || b[pos] == '\r')
				break;
			pos = nextLF(b, pos);
		}
		// Step back over the LF, and a CR in front of it, that terminate
		// the paragraph's last line, but never to before the start.
		if (pos > start && b[pos - 1] == '\n')
			pos--;
		if (pos > start && b[pos - 1] == '\r')
			pos--;
		return pos;
	}

	/**
	 * Get last index of {@code ch} in raw, trimming spaces.
	 * <p>
	 * Scanning runs backwards from {@code pos}: first any run of spaces is
	 * skipped, then the nearest {@code ch} at or before that point is
	 * located.
	 *
	 * @param raw
	 *            buffer to scan.
	 * @param ch
	 *            character to find.
	 * @param pos
	 *            starting position.
	 * @return last index of {@code ch} in raw, trimming spaces; -1 if the
	 *         start of the buffer is reached without finding it.
	 * @since 4.1
	 */
	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
		int idx = pos;
		// Trim: step over trailing spaces.
		while (idx >= 0 && raw[idx] == ' ')
			idx--;
		// Search: keep stepping back until ch is under the cursor.
		for (; idx >= 0; idx--) {
			if (raw[idx] == ch)
				break;
		}
		return idx;
	}

	/**
	 * Look up a character set by one of the alias names registered in
	 * {@code encodingAliases}; the lookup uses the lower-cased name.
	 *
	 * @param name
	 *            alias to resolve.
	 * @return the aliased character set, or {@code null} if the name is not
	 *         a registered alias.
	 */
	private static Charset charsetForAlias(String name) {
		final String key = StringUtils.toLowerCase(name);
		return encodingAliases.get(key);
	}

	private RawParseUtils() {
		// Static-only utility class; never instantiated.
	}
}