1 /* 2 * Copyright (C) 2008-2009, Google Inc. 3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> 4 * and other copyright owners as documented in the project's IP log. 5 * 6 * This program and the accompanying materials are made available 7 * under the terms of the Eclipse Distribution License v1.0 which 8 * accompanies this distribution, is reproduced below, and is 9 * available at http://www.eclipse.org/org/documents/edl-v10.php 10 * 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials provided 23 * with the distribution. 24 * 25 * - Neither the name of the Eclipse Foundation, Inc. nor the 26 * names of its contributors may be used to endorse or promote 27 * products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 */ 44 45 package org.eclipse.jgit.util; 46 47 import static java.nio.charset.StandardCharsets.ISO_8859_1; 48 import static java.nio.charset.StandardCharsets.UTF_8; 49 import static org.eclipse.jgit.lib.ObjectChecker.author; 50 import static org.eclipse.jgit.lib.ObjectChecker.committer; 51 import static org.eclipse.jgit.lib.ObjectChecker.encoding; 52 import static org.eclipse.jgit.lib.ObjectChecker.tagger; 53 54 import java.nio.ByteBuffer; 55 import java.nio.charset.CharacterCodingException; 56 import java.nio.charset.Charset; 57 import java.nio.charset.CharsetDecoder; 58 import java.nio.charset.CodingErrorAction; 59 import java.nio.charset.IllegalCharsetNameException; 60 import java.nio.charset.UnsupportedCharsetException; 61 import java.util.Arrays; 62 import java.util.HashMap; 63 import java.util.Map; 64 65 import org.eclipse.jgit.annotations.Nullable; 66 import org.eclipse.jgit.lib.Constants; 67 import org.eclipse.jgit.lib.PersonIdent; 68 69 /** 70 * Handy utility functions to parse raw object contents. 71 */ 72 public final class RawParseUtils { 73 /** 74 * UTF-8 charset constant. 
75 * 76 * @since 2.2 77 */ 78 public static final Charset UTF8_CHARSET = UTF_8; 79 80 private static final byte[] digits10; 81 82 private static final byte[] digits16; 83 84 private static final byte[] footerLineKeyChars; 85 86 private static final Map<String, Charset> encodingAliases; 87 88 static { 89 encodingAliases = new HashMap<>(); 90 encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ 91 encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ 92 93 digits10 = new byte['9' + 1]; 94 Arrays.fill(digits10, (byte) -1); 95 for (char i = '0'; i <= '9'; i++) 96 digits10[i] = (byte) (i - '0'); 97 98 digits16 = new byte['f' + 1]; 99 Arrays.fill(digits16, (byte) -1); 100 for (char i = '0'; i <= '9'; i++) 101 digits16[i] = (byte) (i - '0'); 102 for (char i = 'a'; i <= 'f'; i++) 103 digits16[i] = (byte) ((i - 'a') + 10); 104 for (char i = 'A'; i <= 'F'; i++) 105 digits16[i] = (byte) ((i - 'A') + 10); 106 107 footerLineKeyChars = new byte['z' + 1]; 108 footerLineKeyChars['-'] = 1; 109 for (char i = '0'; i <= '9'; i++) 110 footerLineKeyChars[i] = 1; 111 for (char i = 'A'; i <= 'Z'; i++) 112 footerLineKeyChars[i] = 1; 113 for (char i = 'a'; i <= 'z'; i++) 114 footerLineKeyChars[i] = 1; 115 } 116 117 /** 118 * Determine if b[ptr] matches src. 119 * 120 * @param b 121 * the buffer to scan. 122 * @param ptr 123 * first position within b, this should match src[0]. 124 * @param src 125 * the buffer to test for equality with b. 126 * @return ptr + src.length if b[ptr..src.length] == src; else -1. 127 */ 128 public static final int match(final byte[] b, int ptr, final byte[] src) { 129 if (ptr + src.length > b.length) 130 return -1; 131 for (int i = 0; i < src.length; i++, ptr++) 132 if (b[ptr] != src[i]) 133 return -1; 134 return ptr; 135 } 136 137 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', 138 '6', '7', '8', '9' }; 139 140 /** 141 * Format a base 10 numeric into a temporary buffer. 142 * <p> 143 * Formatting is performed backwards. 
The method starts at offset 144 * <code>o-1</code> and ends at <code>o-1-digits</code>, where 145 * <code>digits</code> is the number of positions necessary to store the 146 * base 10 value. 147 * <p> 148 * The argument and return values from this method make it easy to chain 149 * writing, for example: 150 * </p> 151 * 152 * <pre> 153 * final byte[] tmp = new byte[64]; 154 * int ptr = tmp.length; 155 * tmp[--ptr] = '\n'; 156 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); 157 * tmp[--ptr] = ' '; 158 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); 159 * tmp[--ptr] = 0; 160 * final String str = new String(tmp, ptr, tmp.length - ptr); 161 * </pre> 162 * 163 * @param b 164 * buffer to write into. 165 * @param o 166 * one offset past the location where writing will begin; writing 167 * proceeds towards lower index values. 168 * @param value 169 * the value to store. 170 * @return the new offset value <code>o</code>. This is the position of 171 * the last byte written. Additional writing should start at one 172 * position earlier. 173 */ 174 public static int formatBase10(final byte[] b, int o, int value) { 175 if (value == 0) { 176 b[--o] = '0'; 177 return o; 178 } 179 final boolean isneg = value < 0; 180 if (isneg) 181 value = -value; 182 while (value != 0) { 183 b[--o] = base10byte[value % 10]; 184 value /= 10; 185 } 186 if (isneg) 187 b[--o] = '-'; 188 return o; 189 } 190 191 /** 192 * Parse a base 10 numeric from a sequence of ASCII digits into an int. 193 * <p> 194 * Digit sequences can begin with an optional run of spaces before the 195 * sequence, and may start with a '+' or a '-' to indicate sign position. 196 * Any other characters will cause the method to stop and return the current 197 * result to the caller. 198 * 199 * @param b 200 * buffer to scan. 201 * @param ptr 202 * position within buffer to start parsing digits at. 203 * @param ptrResult 204 * optional location to return the new ptr value through. If null 205 * the ptr value will be discarded. 
206 * @return the value at this location; 0 if the location is not a valid 207 * numeric. 208 */ 209 public static final int parseBase10(final byte[] b, int ptr, 210 final MutableInteger ptrResult) { 211 int r = 0; 212 int sign = 0; 213 try { 214 final int sz = b.length; 215 while (ptr < sz && b[ptr] == ' ') 216 ptr++; 217 if (ptr >= sz) 218 return 0; 219 220 switch (b[ptr]) { 221 case '-': 222 sign = -1; 223 ptr++; 224 break; 225 case '+': 226 ptr++; 227 break; 228 } 229 230 while (ptr < sz) { 231 final byte v = digits10[b[ptr]]; 232 if (v < 0) 233 break; 234 r = (r * 10) + v; 235 ptr++; 236 } 237 } catch (ArrayIndexOutOfBoundsException e) { 238 // Not a valid digit. 239 } 240 if (ptrResult != null) 241 ptrResult.value = ptr; 242 return sign < 0 ? -r : r; 243 } 244 245 /** 246 * Parse a base 10 numeric from a sequence of ASCII digits into a long. 247 * <p> 248 * Digit sequences can begin with an optional run of spaces before the 249 * sequence, and may start with a '+' or a '-' to indicate sign position. 250 * Any other characters will cause the method to stop and return the current 251 * result to the caller. 252 * 253 * @param b 254 * buffer to scan. 255 * @param ptr 256 * position within buffer to start parsing digits at. 257 * @param ptrResult 258 * optional location to return the new ptr value through. If null 259 * the ptr value will be discarded. 260 * @return the value at this location; 0 if the location is not a valid 261 * numeric. 
262 */ 263 public static final long parseLongBase10(final byte[] b, int ptr, 264 final MutableInteger ptrResult) { 265 long r = 0; 266 int sign = 0; 267 try { 268 final int sz = b.length; 269 while (ptr < sz && b[ptr] == ' ') 270 ptr++; 271 if (ptr >= sz) 272 return 0; 273 274 switch (b[ptr]) { 275 case '-': 276 sign = -1; 277 ptr++; 278 break; 279 case '+': 280 ptr++; 281 break; 282 } 283 284 while (ptr < sz) { 285 final byte v = digits10[b[ptr]]; 286 if (v < 0) 287 break; 288 r = (r * 10) + v; 289 ptr++; 290 } 291 } catch (ArrayIndexOutOfBoundsException e) { 292 // Not a valid digit. 293 } 294 if (ptrResult != null) 295 ptrResult.value = ptr; 296 return sign < 0 ? -r : r; 297 } 298 299 /** 300 * Parse 4 character base 16 (hex) formatted string to unsigned integer. 301 * <p> 302 * The number is read in network byte order, that is, most significant 303 * nybble first. 304 * 305 * @param bs 306 * buffer to parse digits from; positions {@code [p, p+4)} will 307 * be parsed. 308 * @param p 309 * first position within the buffer to parse. 310 * @return the integer value. 311 * @throws java.lang.ArrayIndexOutOfBoundsException 312 * if the string is not hex formatted. 313 */ 314 public static final int parseHexInt16(final byte[] bs, final int p) { 315 int r = digits16[bs[p]] << 4; 316 317 r |= digits16[bs[p + 1]]; 318 r <<= 4; 319 320 r |= digits16[bs[p + 2]]; 321 r <<= 4; 322 323 r |= digits16[bs[p + 3]]; 324 if (r < 0) 325 throw new ArrayIndexOutOfBoundsException(); 326 return r; 327 } 328 329 /** 330 * Parse 8 character base 16 (hex) formatted string to unsigned integer. 331 * <p> 332 * The number is read in network byte order, that is, most significant 333 * nybble first. 334 * 335 * @param bs 336 * buffer to parse digits from; positions {@code [p, p+8)} will 337 * be parsed. 338 * @param p 339 * first position within the buffer to parse. 340 * @return the integer value. 341 * @throws java.lang.ArrayIndexOutOfBoundsException 342 * if the string is not hex formatted. 
343 */ 344 public static final int parseHexInt32(final byte[] bs, final int p) { 345 int r = digits16[bs[p]] << 4; 346 347 r |= digits16[bs[p + 1]]; 348 r <<= 4; 349 350 r |= digits16[bs[p + 2]]; 351 r <<= 4; 352 353 r |= digits16[bs[p + 3]]; 354 r <<= 4; 355 356 r |= digits16[bs[p + 4]]; 357 r <<= 4; 358 359 r |= digits16[bs[p + 5]]; 360 r <<= 4; 361 362 r |= digits16[bs[p + 6]]; 363 364 final int last = digits16[bs[p + 7]]; 365 if (r < 0 || last < 0) 366 throw new ArrayIndexOutOfBoundsException(); 367 return (r << 4) | last; 368 } 369 370 /** 371 * Parse 16 character base 16 (hex) formatted string to unsigned long. 372 * <p> 373 * The number is read in network byte order, that is, most significant 374 * nibble first. 375 * 376 * @param bs 377 * buffer to parse digits from; positions {@code [p, p+16)} will 378 * be parsed. 379 * @param p 380 * first position within the buffer to parse. 381 * @return the integer value. 382 * @throws java.lang.ArrayIndexOutOfBoundsException 383 * if the string is not hex formatted. 
384 * @since 4.3 385 */ 386 public static final long parseHexInt64(final byte[] bs, final int p) { 387 long r = digits16[bs[p]] << 4; 388 389 r |= digits16[bs[p + 1]]; 390 r <<= 4; 391 392 r |= digits16[bs[p + 2]]; 393 r <<= 4; 394 395 r |= digits16[bs[p + 3]]; 396 r <<= 4; 397 398 r |= digits16[bs[p + 4]]; 399 r <<= 4; 400 401 r |= digits16[bs[p + 5]]; 402 r <<= 4; 403 404 r |= digits16[bs[p + 6]]; 405 r <<= 4; 406 407 r |= digits16[bs[p + 7]]; 408 r <<= 4; 409 410 r |= digits16[bs[p + 8]]; 411 r <<= 4; 412 413 r |= digits16[bs[p + 9]]; 414 r <<= 4; 415 416 r |= digits16[bs[p + 10]]; 417 r <<= 4; 418 419 r |= digits16[bs[p + 11]]; 420 r <<= 4; 421 422 r |= digits16[bs[p + 12]]; 423 r <<= 4; 424 425 r |= digits16[bs[p + 13]]; 426 r <<= 4; 427 428 r |= digits16[bs[p + 14]]; 429 430 final int last = digits16[bs[p + 15]]; 431 if (r < 0 || last < 0) 432 throw new ArrayIndexOutOfBoundsException(); 433 return (r << 4) | last; 434 } 435 436 /** 437 * Parse a single hex digit to its numeric value (0-15). 438 * 439 * @param digit 440 * hex character to parse. 441 * @return numeric value, in the range 0-15. 442 * @throws java.lang.ArrayIndexOutOfBoundsException 443 * if the input digit is not a valid hex digit. 444 */ 445 public static final int parseHexInt4(final byte digit) { 446 final byte r = digits16[digit]; 447 if (r < 0) 448 throw new ArrayIndexOutOfBoundsException(); 449 return r; 450 } 451 452 /** 453 * Parse a Git style timezone string. 454 * <p> 455 * The sequence "-0315" will be parsed as the numeric value -195, as the 456 * lower two positions count minutes, not 100ths of an hour. 457 * 458 * @param b 459 * buffer to scan. 460 * @param ptr 461 * position within buffer to start parsing digits at. 462 * @return the timezone at this location, expressed in minutes. 463 */ 464 public static final int parseTimeZoneOffset(final byte[] b, int ptr) { 465 return parseTimeZoneOffset(b, ptr, null); 466 } 467 468 /** 469 * Parse a Git style timezone string. 
470 * <p> 471 * The sequence "-0315" will be parsed as the numeric value -195, as the 472 * lower two positions count minutes, not 100ths of an hour. 473 * 474 * @param b 475 * buffer to scan. 476 * @param ptr 477 * position within buffer to start parsing digits at. 478 * @param ptrResult 479 * optional location to return the new ptr value through. If null 480 * the ptr value will be discarded. 481 * @return the timezone at this location, expressed in minutes. 482 * @since 4.1 483 */ 484 public static final int parseTimeZoneOffset(final byte[] b, int ptr, 485 MutableInteger ptrResult) { 486 final int v = parseBase10(b, ptr, ptrResult); 487 final int tzMins = v % 100; 488 final int tzHours = v / 100; 489 return tzHours * 60 + tzMins; 490 } 491 492 /** 493 * Locate the first position after a given character. 494 * 495 * @param b 496 * buffer to scan. 497 * @param ptr 498 * position within buffer to start looking for chrA at. 499 * @param chrA 500 * character to find. 501 * @return new position just after chrA. 502 */ 503 public static final int next(final byte[] b, int ptr, final char chrA) { 504 final int sz = b.length; 505 while (ptr < sz) { 506 if (b[ptr++] == chrA) 507 return ptr; 508 } 509 return ptr; 510 } 511 512 /** 513 * Locate the first position after the next LF. 514 * <p> 515 * This method stops on the first '\n' it finds. 516 * 517 * @param b 518 * buffer to scan. 519 * @param ptr 520 * position within buffer to start looking for LF at. 521 * @return new position just after the first LF found. 522 */ 523 public static final int nextLF(final byte[] b, int ptr) { 524 return next(b, ptr, '\n'); 525 } 526 527 /** 528 * Locate the first position after either the given character or LF. 529 * <p> 530 * This method stops on the first match it finds from either chrA or '\n'. 531 * 532 * @param b 533 * buffer to scan. 534 * @param ptr 535 * position within buffer to start looking for chrA or LF at. 536 * @param chrA 537 * character to find. 
538 * @return new position just after the first chrA or LF to be found. 539 */ 540 public static final int nextLF(final byte[] b, int ptr, final char chrA) { 541 final int sz = b.length; 542 while (ptr < sz) { 543 final byte c = b[ptr++]; 544 if (c == chrA || c == '\n') 545 return ptr; 546 } 547 return ptr; 548 } 549 550 /** 551 * Locate the first position before a given character. 552 * 553 * @param b 554 * buffer to scan. 555 * @param ptr 556 * position within buffer to start looking for chrA at. 557 * @param chrA 558 * character to find. 559 * @return new position just before chrA, -1 for not found 560 */ 561 public static final int prev(final byte[] b, int ptr, final char chrA) { 562 if (ptr == b.length) 563 --ptr; 564 while (ptr >= 0) { 565 if (b[ptr--] == chrA) 566 return ptr; 567 } 568 return ptr; 569 } 570 571 /** 572 * Locate the first position before the previous LF. 573 * <p> 574 * This method stops on the first '\n' it finds. 575 * 576 * @param b 577 * buffer to scan. 578 * @param ptr 579 * position within buffer to start looking for LF at. 580 * @return new position just before the first LF found, -1 for not found 581 */ 582 public static final int prevLF(final byte[] b, int ptr) { 583 return prev(b, ptr, '\n'); 584 } 585 586 /** 587 * Locate the previous position before either the given character or LF. 588 * <p> 589 * This method stops on the first match it finds from either chrA or '\n'. 590 * 591 * @param b 592 * buffer to scan. 593 * @param ptr 594 * position within buffer to start looking for chrA or LF at. 595 * @param chrA 596 * character to find. 
597 * @return new position just before the first chrA or LF to be found, -1 for 598 * not found 599 */ 600 public static final int prevLF(final byte[] b, int ptr, final char chrA) { 601 if (ptr == b.length) 602 --ptr; 603 while (ptr >= 0) { 604 final byte c = b[ptr--]; 605 if (c == chrA || c == '\n') 606 return ptr; 607 } 608 return ptr; 609 } 610 611 /** 612 * Index the region between <code>[ptr, end)</code> to find line starts. 613 * <p> 614 * The returned list is 1 indexed. Index 0 contains 615 * {@link java.lang.Integer#MIN_VALUE} to pad the list out. 616 * <p> 617 * Using a 1 indexed list means that line numbers can be directly accessed 618 * from the list, so <code>list.get(1)</code> (aka get line 1) returns 619 * <code>ptr</code>. 620 * <p> 621 * The last element (index <code>map.size()-1</code>) always contains 622 * <code>end</code>. 623 * <p> 624 * If the data contains a '\0' anywhere, the whole region is considered 625 * binary and a LineMap corresponding to a single line is returned. 626 * </p> 627 * 628 * @param buf 629 * buffer to scan. 630 * @param ptr 631 * position within the buffer corresponding to the first byte of 632 * line 1. 633 * @param end 634 * 1 past the end of the content within <code>buf</code>. 635 * @return a line map indexing the start position of each line. 636 */ 637 public static final IntList lineMap(final byte[] buf, int ptr, int end) { 638 int start = ptr; 639 640 // Experimentally derived from multiple source repositories 641 // the average number of bytes/line is 36. Its a rough guess 642 // to initially size our map close to the target. 643 IntList map = new IntList((end - ptr) / 36); 644 map.add(Integer.MIN_VALUE); 645 boolean foundLF = true; 646 for (; ptr < end; ptr++) { 647 if (foundLF) { 648 map.add(ptr); 649 } 650 651 if (buf[ptr] == '\0') { 652 // binary data. 
653 map = new IntList(3); 654 map.add(Integer.MIN_VALUE); 655 map.add(start); 656 break; 657 } 658 659 foundLF = (buf[ptr] == '\n'); 660 } 661 map.add(end); 662 return map; 663 } 664 665 /** 666 * Locate the "author " header line data. 667 * 668 * @param b 669 * buffer to scan. 670 * @param ptr 671 * position in buffer to start the scan at. Most callers should 672 * pass 0 to ensure the scan starts from the beginning of the 673 * commit buffer and does not accidentally look at message body. 674 * @return position just after the space in "author ", so the first 675 * character of the author's name. If no author header can be 676 * located -1 is returned. 677 */ 678 public static final int author(final byte[] b, int ptr) { 679 final int sz = b.length; 680 if (ptr == 0) 681 ptr += 46; // skip the "tree ..." line. 682 while (ptr < sz && b[ptr] == 'p') 683 ptr += 48; // skip this parent. 684 return match(b, ptr, author); 685 } 686 687 /** 688 * Locate the "committer " header line data. 689 * 690 * @param b 691 * buffer to scan. 692 * @param ptr 693 * position in buffer to start the scan at. Most callers should 694 * pass 0 to ensure the scan starts from the beginning of the 695 * commit buffer and does not accidentally look at message body. 696 * @return position just after the space in "committer ", so the first 697 * character of the committer's name. If no committer header can be 698 * located -1 is returned. 699 */ 700 public static final int committer(final byte[] b, int ptr) { 701 final int sz = b.length; 702 if (ptr == 0) 703 ptr += 46; // skip the "tree ..." line. 704 while (ptr < sz && b[ptr] == 'p') 705 ptr += 48; // skip this parent. 706 if (ptr < sz && b[ptr] == 'a') 707 ptr = nextLF(b, ptr); 708 return match(b, ptr, committer); 709 } 710 711 /** 712 * Locate the "tagger " header line data. 713 * 714 * @param b 715 * buffer to scan. 716 * @param ptr 717 * position in buffer to start the scan at. 
Most callers should 718 * pass 0 to ensure the scan starts from the beginning of the tag 719 * buffer and does not accidentally look at message body. 720 * @return position just after the space in "tagger ", so the first 721 * character of the tagger's name. If no tagger header can be 722 * located -1 is returned. 723 */ 724 public static final int tagger(final byte[] b, int ptr) { 725 final int sz = b.length; 726 if (ptr == 0) 727 ptr += 48; // skip the "object ..." line. 728 while (ptr < sz) { 729 if (b[ptr] == '\n') 730 return -1; 731 final int m = match(b, ptr, tagger); 732 if (m >= 0) 733 return m; 734 ptr = nextLF(b, ptr); 735 } 736 return -1; 737 } 738 739 /** 740 * Locate the "encoding " header line. 741 * 742 * @param b 743 * buffer to scan. 744 * @param ptr 745 * position in buffer to start the scan at. Most callers should 746 * pass 0 to ensure the scan starts from the beginning of the 747 * buffer and does not accidentally look at the message body. 748 * @return position just after the space in "encoding ", so the first 749 * character of the encoding's name. If no encoding header can be 750 * located -1 is returned (and UTF-8 should be assumed). 751 */ 752 public static final int encoding(final byte[] b, int ptr) { 753 final int sz = b.length; 754 while (ptr < sz) { 755 if (b[ptr] == '\n') 756 return -1; 757 if (b[ptr] == 'e') 758 break; 759 ptr = nextLF(b, ptr); 760 } 761 return match(b, ptr, encoding); 762 } 763 764 /** 765 * Parse the "encoding " header as a string. 766 * <p> 767 * Locates the "encoding " header (if present) and returns its value. 768 * 769 * @param b 770 * buffer to scan. 771 * @return the encoding header as specified in the commit; null if the 772 * header was not present and should be assumed. 
773 * @since 4.2 774 */ 775 @Nullable 776 public static String parseEncodingName(final byte[] b) { 777 int enc = encoding(b, 0); 778 if (enc < 0) { 779 return null; 780 } 781 int lf = nextLF(b, enc); 782 return decode(UTF_8, b, enc, lf - 1); 783 } 784 785 /** 786 * Parse the "encoding " header into a character set reference. 787 * <p> 788 * Locates the "encoding " header (if present) by first calling 789 * {@link #encoding(byte[], int)} and then returns the proper character set 790 * to apply to this buffer to evaluate its contents as character data. 791 * <p> 792 * If no encoding header is present {@code UTF-8} is assumed. 793 * 794 * @param b 795 * buffer to scan. 796 * @return the Java character set representation. Never null. 797 * @throws IllegalCharsetNameException 798 * if the character set requested by the encoding header is 799 * malformed and unsupportable. 800 * @throws UnsupportedCharsetException 801 * if the JRE does not support the character set requested by 802 * the encoding header. 803 */ 804 public static Charset parseEncoding(final byte[] b) { 805 String enc = parseEncodingName(b); 806 if (enc == null) { 807 return UTF_8; 808 } 809 810 String name = enc.trim(); 811 try { 812 return Charset.forName(name); 813 } catch (IllegalCharsetNameException 814 | UnsupportedCharsetException badName) { 815 Charset aliased = charsetForAlias(name); 816 if (aliased != null) { 817 return aliased; 818 } 819 throw badName; 820 } 821 } 822 823 /** 824 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. 825 * <p> 826 * Leading spaces won't be trimmed from the string, i.e. will show up in the 827 * parsed name afterwards. 828 * 829 * @param in 830 * the string to parse a name from. 831 * @return the parsed identity or null in case the identity could not be 832 * parsed. 833 */ 834 public static PersonIdent parsePersonIdent(final String in) { 835 return parsePersonIdent(Constants.encode(in), 0); 836 } 837 838 /** 839 * Parse a name line (e.g. 
* author, committer, tagger) into a PersonIdent.
	 * <p>
	 * When passing in a value for <code>nameB</code> callers should use the
	 * return value of {@link #author(byte[], int)} or
	 * {@link #committer(byte[], int)}, as these methods provide the proper
	 * position within the buffer.
	 * <p>
	 * The character set is taken from the buffer's "encoding " header; if that
	 * header names an unusable character set, UTF-8 is used instead. An
	 * identity without a parsable time and zone is still returned, with both
	 * values set to 0.
	 *
	 * @param raw
	 *            the buffer to parse character data from.
	 * @param nameB
	 *            first position of the identity information. This should be the
	 *            first position after the space which delimits the header field
	 *            name (e.g. "author" or "committer") from the rest of the
	 *            identity line.
	 * @return the parsed identity or null in case the identity could not be
	 *         parsed.
	 */
	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
		Charset cs;
		try {
			cs = parseEncoding(raw);
		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
			// Assume UTF-8 for person identities, usually this is correct.
			// If not decode() will fall back to the ISO-8859-1 encoding.
			cs = UTF_8;
		}

		// emailB is one past the first '<' (or LF), emailE one past the
		// following '>' (or LF); both equal raw.length if nothing was found.
		final int emailB = nextLF(raw, nameB, '<');
		final int emailE = nextLF(raw, emailB, '>');
		// Reject lines without an address: nothing after '<', an LF right
		// after it, or a scan that ran to the end without hitting '>'.
		if (emailB >= raw.length || raw[emailB] == '\n' ||
				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
			return null;

		// raw[emailB - 1] is the '<'. Exclude one separating space in front
		// of it, but only if it exists and lies within the name region.
		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
				emailB - 2 : emailB - 1;
		final String name = decode(cs, raw, nameB, nameEnd);
		final String email = decode(cs, raw, emailB, emailE - 1);

		// Start searching from end of line, as after first name-email pair,
		// another name-email pair may occur. We will ignore all kinds of
		// "junk" following the first email.
		//
		// We've to use (emailE - 1) for the case that raw[email] is LF,
		// otherwise we would run too far. "-2" is necessary to position
		// before the LF in case of LF termination resp. the penultimate
		// character if there is no trailing LF.
		// NOTE(review): lastIndexOfTrim is defined outside this view; from
		// its use here it presumably returns the index of the last ' ' at or
		// before the given position, ignoring trailing spaces - confirm.
		final int tzBegin = lastIndexOfTrim(raw, ' ',
				nextLF(raw, emailE - 1) - 2) + 1;
		if (tzBegin <= emailE) // No time/zone, still valid
			return new PersonIdent(name, email, 0, 0);

		// The timestamp is the token in front of the zone, but it can never
		// start before the end of the email address.
		final int whenBegin = Math.max(emailE,
				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
			return new PersonIdent(name, email, 0, 0);

		// The parsed number is multiplied by 1000 before it is handed to
		// PersonIdent.
		final long when = parseLongBase10(raw, whenBegin, null);
		final int tz = parseTimeZoneOffset(raw, tzBegin);
		return new PersonIdent(name, email, when * 1000L, tz);
	}

	/**
	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
	 * <p>
	 * When passing in a value for <code>nameB</code> callers should use the
	 * return value of {@link #author(byte[], int)} or
	 * {@link #committer(byte[], int)}, as these methods provide the proper
	 * position within the buffer.
	 *
	 * @param raw
	 *            the buffer to parse character data from.
	 * @param nameB
	 *            first position of the identity information. This should be the
	 *            first position after the space which delimits the header field
	 *            name (e.g. "author" or "committer") from the rest of the
	 *            identity line.
	 * @return the parsed identity. Never null.
916 */ 917 public static PersonIdent parsePersonIdentOnly(final byte[] raw, 918 final int nameB) { 919 int stop = nextLF(raw, nameB); 920 int emailB = nextLF(raw, nameB, '<'); 921 int emailE = nextLF(raw, emailB, '>'); 922 final String name; 923 final String email; 924 if (emailE < stop) { 925 email = decode(raw, emailB, emailE - 1); 926 } else { 927 email = "invalid"; //$NON-NLS-1$ 928 } 929 if (emailB < stop) 930 name = decode(raw, nameB, emailB - 2); 931 else 932 name = decode(raw, nameB, stop); 933 934 final MutableInteger ptrout = new MutableInteger(); 935 long when; 936 int tz; 937 if (emailE < stop) { 938 when = parseLongBase10(raw, emailE + 1, ptrout); 939 tz = parseTimeZoneOffset(raw, ptrout.value); 940 } else { 941 when = 0; 942 tz = 0; 943 } 944 return new PersonIdent(name, email, when * 1000L, tz); 945 } 946 947 /** 948 * Locate the end of a footer line key string. 949 * <p> 950 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. 951 * "Signed-off-by: A. U. Thor\n") then this method returns the position of 952 * the first ':'. 953 * <p> 954 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} 955 * then this method returns -1. 956 * 957 * @param raw 958 * buffer to scan. 959 * @param ptr 960 * first position within raw to consider as a footer line key. 961 * @return position of the ':' which terminates the footer line key if this 962 * is otherwise a valid footer line key; otherwise -1. 963 */ 964 public static int endOfFooterLineKey(final byte[] raw, int ptr) { 965 try { 966 for (;;) { 967 final byte c = raw[ptr]; 968 if (footerLineKeyChars[c] == 0) { 969 if (c == ':') 970 return ptr; 971 return -1; 972 } 973 ptr++; 974 } 975 } catch (ArrayIndexOutOfBoundsException e) { 976 return -1; 977 } 978 } 979 980 /** 981 * Decode a buffer under UTF-8, if possible. 
*
	 * If the byte stream cannot be decoded that way, the platform default is tried
	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
	 *
	 * @param buffer
	 *            buffer to pull raw bytes from; the whole array is decoded.
	 * @return a string representation of the entire buffer, after decoding
	 *         it as described above.
	 */
	public static String decode(final byte[] buffer) {
		return decode(buffer, 0, buffer.length);
	}

	/**
	 * Decode a buffer under UTF-8, if possible.
	 *
	 * If the byte stream cannot be decoded that way, the platform default is
	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
	 *
	 * @param buffer
	 *            buffer to pull raw bytes from.
	 * @param start
	 *            start position in buffer
	 * @param end
	 *            one position past the last location within the buffer to take
	 *            data from.
	 * @return a string representation of the range <code>[start,end)</code>,
	 *         after decoding the region as described above.
	 */
	public static String decode(final byte[] buffer, final int start,
			final int end) {
		// UTF-8 is passed as the preferred character set.
		return decode(UTF_8, buffer, start, end);
	}

	/**
	 * Decode a buffer under the specified character set if possible.
	 *
	 * If the byte stream cannot be decoded that way, the platform default is tried
	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
	 *
	 * @param cs
	 *            character set to use when decoding the buffer.
	 * @param buffer
	 *            buffer to pull raw bytes from.
	 * @return a string representation of the range <code>[start,end)</code>,
	 *         after decoding the region through the specified character set.
*/
	public static String decode(final Charset cs, final byte[] buffer) {
		// Decode the complete array.
		return decode(cs, buffer, 0, buffer.length);
	}

	/**
	 * Decode a region of the buffer under the specified character set if possible.
	 *
	 * Decoding is delegated to
	 * {@link #decodeNoFallback(Charset, byte[], int, int)}. If that reports
	 * that none of its character sets can decode the bytes, the fail-safe
	 * ISO-8859-1 style conversion of
	 * {@link #extractBinaryString(byte[], int, int)} is used, so this method
	 * never fails because of malformed input.
	 *
	 * @param cs
	 *            character set to use when decoding the buffer.
	 * @param buffer
	 *            buffer to pull raw bytes from.
	 * @param start
	 *            first position within the buffer to take data from.
	 * @param end
	 *            one position past the last location within the buffer to take
	 *            data from.
	 * @return a string representation of the range <code>[start,end)</code>,
	 *         after decoding the region through the specified character set.
	 */
	public static String decode(final Charset cs, final byte[] buffer,
			final int start, final int end) {
		try {
			return decodeNoFallback(cs, buffer, start, end);
		} catch (CharacterCodingException e) {
			// Fall back to an ISO-8859-1 style encoding. At least all of
			// the bytes will be present in the output.
			//
			return extractBinaryString(buffer, start, end);
		}
	}

	/**
	 * Decode a region of the buffer under the specified character set if
	 * possible.
	 *
	 * UTF-8 is always attempted first. If the byte stream cannot be decoded
	 * that way, the specified character set is tried, then the platform
	 * default, and if that too fails, an exception is thrown.
	 *
	 * @param cs
	 *            character set to use when decoding the buffer.
	 * @param buffer
	 *            buffer to pull raw bytes from.
	 * @param start
	 *            first position within the buffer to take data from.
	 * @param end
	 *            one position past the last location within the buffer to take
	 *            data from.
1079 * @return a string representation of the range <code>[start,end)</code>, 1080 * after decoding the region through the specified character set. 1081 * @throws java.nio.charset.CharacterCodingException 1082 * the input is not in any of the tested character sets. 1083 */ 1084 public static String decodeNoFallback(final Charset cs, 1085 final byte[] buffer, final int start, final int end) 1086 throws CharacterCodingException { 1087 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); 1088 b.mark(); 1089 1090 // Try our built-in favorite. The assumption here is that 1091 // decoding will fail if the data is not actually encoded 1092 // using that encoder. 1093 try { 1094 return decode(b, UTF_8); 1095 } catch (CharacterCodingException e) { 1096 b.reset(); 1097 } 1098 1099 if (!cs.equals(UTF_8)) { 1100 // Try the suggested encoding, it might be right since it was 1101 // provided by the caller. 1102 try { 1103 return decode(b, cs); 1104 } catch (CharacterCodingException e) { 1105 b.reset(); 1106 } 1107 } 1108 1109 // Try the default character set. A small group of people 1110 // might actually use the same (or very similar) locale. 1111 Charset defcs = Charset.defaultCharset(); 1112 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { 1113 try { 1114 return decode(b, defcs); 1115 } catch (CharacterCodingException e) { 1116 b.reset(); 1117 } 1118 } 1119 1120 throw new CharacterCodingException(); 1121 } 1122 1123 /** 1124 * Decode a region of the buffer under the ISO-8859-1 encoding. 1125 * 1126 * Each byte is treated as a single character in the 8859-1 character 1127 * encoding, performing a raw binary->char conversion. 1128 * 1129 * @param buffer 1130 * buffer to pull raw bytes from. 1131 * @param start 1132 * first position within the buffer to take data from. 1133 * @param end 1134 * one position past the last location within the buffer to take 1135 * data from. 1136 * @return a string representation of the range <code>[start,end)</code>. 
1137 */ 1138 public static String extractBinaryString(final byte[] buffer, 1139 final int start, final int end) { 1140 final StringBuilder r = new StringBuilder(end - start); 1141 for (int i = start; i < end; i++) 1142 r.append((char) (buffer[i] & 0xff)); 1143 return r.toString(); 1144 } 1145 1146 private static String decode(final ByteBuffer b, final Charset charset) 1147 throws CharacterCodingException { 1148 final CharsetDecoder d = charset.newDecoder(); 1149 d.onMalformedInput(CodingErrorAction.REPORT); 1150 d.onUnmappableCharacter(CodingErrorAction.REPORT); 1151 return d.decode(b).toString(); 1152 } 1153 1154 /** 1155 * Locate the position of the commit message body. 1156 * 1157 * @param b 1158 * buffer to scan. 1159 * @param ptr 1160 * position in buffer to start the scan at. Most callers should 1161 * pass 0 to ensure the scan starts from the beginning of the 1162 * commit buffer. 1163 * @return position of the user's message buffer. 1164 */ 1165 public static final int commitMessage(final byte[] b, int ptr) { 1166 final int sz = b.length; 1167 if (ptr == 0) 1168 ptr += 46; // skip the "tree ..." line. 1169 while (ptr < sz && b[ptr] == 'p') 1170 ptr += 48; // skip this parent. 1171 1172 // Skip any remaining header lines, ignoring what their actual 1173 // header line type is. This is identical to the logic for a tag. 1174 // 1175 return tagMessage(b, ptr); 1176 } 1177 1178 /** 1179 * Locate the position of the tag message body. 1180 * 1181 * @param b 1182 * buffer to scan. 1183 * @param ptr 1184 * position in buffer to start the scan at. Most callers should 1185 * pass 0 to ensure the scan starts from the beginning of the tag 1186 * buffer. 1187 * @return position of the user's message buffer. 1188 */ 1189 public static final int tagMessage(final byte[] b, int ptr) { 1190 final int sz = b.length; 1191 if (ptr == 0) 1192 ptr += 48; // skip the "object ..." line. 
1193 while (ptr < sz && b[ptr] != '\n') 1194 ptr = nextLF(b, ptr); 1195 if (ptr < sz && b[ptr] == '\n') 1196 return ptr + 1; 1197 return -1; 1198 } 1199 1200 /** 1201 * Locate the end of a paragraph. 1202 * <p> 1203 * A paragraph is ended by two consecutive LF bytes or CRLF pairs 1204 * 1205 * @param b 1206 * buffer to scan. 1207 * @param start 1208 * position in buffer to start the scan at. Most callers will 1209 * want to pass the first position of the commit message (as 1210 * found by {@link #commitMessage(byte[], int)}. 1211 * @return position of the LF at the end of the paragraph; 1212 * <code>b.length</code> if no paragraph end could be located. 1213 */ 1214 public static final int endOfParagraph(final byte[] b, final int start) { 1215 int ptr = start; 1216 final int sz = b.length; 1217 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r')) 1218 ptr = nextLF(b, ptr); 1219 if (ptr > start && b[ptr - 1] == '\n') 1220 ptr--; 1221 if (ptr > start && b[ptr - 1] == '\r') 1222 ptr--; 1223 return ptr; 1224 } 1225 1226 /** 1227 * Get last index of {@code ch} in raw, trimming spaces. 1228 * 1229 * @param raw 1230 * buffer to scan. 1231 * @param ch 1232 * character to find. 1233 * @param pos 1234 * starting position. 1235 * @return last index of {@code ch} in raw, trimming spaces. 1236 * @since 4.1 1237 */ 1238 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) { 1239 while (pos >= 0 && raw[pos] == ' ') 1240 pos--; 1241 1242 while (pos >= 0 && raw[pos] != ch) 1243 pos--; 1244 1245 return pos; 1246 } 1247 1248 private static Charset charsetForAlias(String name) { 1249 return encodingAliases.get(StringUtils.toLowerCase(name)); 1250 } 1251 1252 private RawParseUtils() { 1253 // Don't create instances of a static only utility. 1254 } 1255 }