1 /* 2 * Copyright (C) 2008-2009, Google Inc. 3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> 4 * and other copyright owners as documented in the project's IP log. 5 * 6 * This program and the accompanying materials are made available 7 * under the terms of the Eclipse Distribution License v1.0 which 8 * accompanies this distribution, is reproduced below, and is 9 * available at http://www.eclipse.org/org/documents/edl-v10.php 10 * 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials provided 23 * with the distribution. 24 * 25 * - Neither the name of the Eclipse Foundation, Inc. nor the 26 * names of its contributors may be used to endorse or promote 27 * products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 */ 44 45 package org.eclipse.jgit.util; 46 47 import static java.nio.charset.StandardCharsets.ISO_8859_1; 48 import static java.nio.charset.StandardCharsets.UTF_8; 49 import static org.eclipse.jgit.lib.ObjectChecker.author; 50 import static org.eclipse.jgit.lib.ObjectChecker.committer; 51 import static org.eclipse.jgit.lib.ObjectChecker.encoding; 52 import static org.eclipse.jgit.lib.ObjectChecker.tagger; 53 54 import java.nio.ByteBuffer; 55 import java.nio.charset.CharacterCodingException; 56 import java.nio.charset.Charset; 57 import java.nio.charset.CharsetDecoder; 58 import java.nio.charset.CodingErrorAction; 59 import java.nio.charset.IllegalCharsetNameException; 60 import java.nio.charset.UnsupportedCharsetException; 61 import java.util.Arrays; 62 import java.util.HashMap; 63 import java.util.Map; 64 65 import org.eclipse.jgit.annotations.Nullable; 66 import org.eclipse.jgit.lib.Constants; 67 import org.eclipse.jgit.lib.PersonIdent; 68 69 /** Handy utility functions to parse raw object contents. */ 70 public final class RawParseUtils { 71 /** 72 * UTF-8 charset constant. 73 * 74 * @since 2.2 75 */ 76 public static final Charset UTF8_CHARSET = UTF_8; 77 78 private static final byte[] digits10; 79 80 private static final byte[] digits16; 81 82 private static final byte[] footerLineKeyChars; 83 84 private static final Map<String, Charset> encodingAliases; 85 86 static { 87 encodingAliases = new HashMap<>(); 88 encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ 89 encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ 90 91 digits10 = new byte['9' + 1]; 92 Arrays.fill(digits10, (byte) -1); 93 for (char i = '0'; i <= '9'; i++) 94 digits10[i] = (byte) (i - '0'); 95 96 digits16 = new byte['f' + 1]; 97 Arrays.fill(digits16, (byte) -1); 98 for (char i = '0'; i <= '9'; i++) 99 digits16[i] = (byte) (i - '0'); 100 for (char i = 'a'; i <= 'f'; i++) 101 digits16[i] = (byte) ((i - 'a') + 10); 102 for (char i = 'A'; i <= 'F'; i++) 103 digits16[i] = (byte) ((i - 'A') + 10); 104 105 footerLineKeyChars = new byte['z' + 1]; 106 footerLineKeyChars['-'] = 1; 107 for (char i = '0'; i <= '9'; i++) 108 footerLineKeyChars[i] = 1; 109 for (char i = 'A'; i <= 'Z'; i++) 110 footerLineKeyChars[i] = 1; 111 for (char i = 'a'; i <= 'z'; i++) 112 footerLineKeyChars[i] = 1; 113 } 114 115 /** 116 * Determine if b[ptr] matches src. 117 * 118 * @param b 119 * the buffer to scan. 120 * @param ptr 121 * first position within b, this should match src[0]. 122 * @param src 123 * the buffer to test for equality with b. 124 * @return ptr + src.length if b[ptr..src.length] == src; else -1. 125 */ 126 public static final int match(final byte[] b, int ptr, final byte[] src) { 127 if (ptr + src.length > b.length) 128 return -1; 129 for (int i = 0; i < src.length; i++, ptr++) 130 if (b[ptr] != src[i]) 131 return -1; 132 return ptr; 133 } 134 135 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', 136 '6', '7', '8', '9' }; 137 138 /** 139 * Format a base 10 numeric into a temporary buffer. 140 * <p> 141 * Formatting is performed backwards. The method starts at offset 142 * <code>o-1</code> and ends at <code>o-1-digits</code>, where 143 * <code>digits</code> is the number of positions necessary to store the 144 * base 10 value. 145 * <p> 146 * The argument and return values from this method make it easy to chain 147 * writing, for example: 148 * </p> 149 * 150 * <pre> 151 * final byte[] tmp = new byte[64]; 152 * int ptr = tmp.length; 153 * tmp[--ptr] = '\n'; 154 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); 155 * tmp[--ptr] = ' '; 156 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); 157 * tmp[--ptr] = 0; 158 * final String str = new String(tmp, ptr, tmp.length - ptr); 159 * </pre> 160 * 161 * @param b 162 * buffer to write into. 163 * @param o 164 * one offset past the location where writing will begin; writing 165 * proceeds towards lower index values. 166 * @param value 167 * the value to store. 168 * @return the new offset value <code>o</code>. This is the position of 169 * the last byte written. Additional writing should start at one 170 * position earlier. 171 */ 172 public static int formatBase10(final byte[] b, int o, int value) { 173 if (value == 0) { 174 b[--o] = '0'; 175 return o; 176 } 177 final boolean isneg = value < 0; 178 if (isneg) 179 value = -value; 180 while (value != 0) { 181 b[--o] = base10byte[value % 10]; 182 value /= 10; 183 } 184 if (isneg) 185 b[--o] = '-'; 186 return o; 187 } 188 189 /** 190 * Parse a base 10 numeric from a sequence of ASCII digits into an int. 191 * <p> 192 * Digit sequences can begin with an optional run of spaces before the 193 * sequence, and may start with a '+' or a '-' to indicate sign position. 194 * Any other characters will cause the method to stop and return the current 195 * result to the caller. 196 * 197 * @param b 198 * buffer to scan. 199 * @param ptr 200 * position within buffer to start parsing digits at. 201 * @param ptrResult 202 * optional location to return the new ptr value through. If null 203 * the ptr value will be discarded. 204 * @return the value at this location; 0 if the location is not a valid 205 * numeric. 206 */ 207 public static final int parseBase10(final byte[] b, int ptr, 208 final MutableInteger ptrResult) { 209 int r = 0; 210 int sign = 0; 211 try { 212 final int sz = b.length; 213 while (ptr < sz && b[ptr] == ' ') 214 ptr++; 215 if (ptr >= sz) 216 return 0; 217 218 switch (b[ptr]) { 219 case '-': 220 sign = -1; 221 ptr++; 222 break; 223 case '+': 224 ptr++; 225 break; 226 } 227 228 while (ptr < sz) { 229 final byte v = digits10[b[ptr]]; 230 if (v < 0) 231 break; 232 r = (r * 10) + v; 233 ptr++; 234 } 235 } catch (ArrayIndexOutOfBoundsException e) { 236 // Not a valid digit. 237 } 238 if (ptrResult != null) 239 ptrResult.value = ptr; 240 return sign < 0 ? -r : r; 241 } 242 243 /** 244 * Parse a base 10 numeric from a sequence of ASCII digits into a long. 245 * <p> 246 * Digit sequences can begin with an optional run of spaces before the 247 * sequence, and may start with a '+' or a '-' to indicate sign position. 248 * Any other characters will cause the method to stop and return the current 249 * result to the caller. 250 * 251 * @param b 252 * buffer to scan. 253 * @param ptr 254 * position within buffer to start parsing digits at. 255 * @param ptrResult 256 * optional location to return the new ptr value through. If null 257 * the ptr value will be discarded. 258 * @return the value at this location; 0 if the location is not a valid 259 * numeric. 260 */ 261 public static final long parseLongBase10(final byte[] b, int ptr, 262 final MutableInteger ptrResult) { 263 long r = 0; 264 int sign = 0; 265 try { 266 final int sz = b.length; 267 while (ptr < sz && b[ptr] == ' ') 268 ptr++; 269 if (ptr >= sz) 270 return 0; 271 272 switch (b[ptr]) { 273 case '-': 274 sign = -1; 275 ptr++; 276 break; 277 case '+': 278 ptr++; 279 break; 280 } 281 282 while (ptr < sz) { 283 final byte v = digits10[b[ptr]]; 284 if (v < 0) 285 break; 286 r = (r * 10) + v; 287 ptr++; 288 } 289 } catch (ArrayIndexOutOfBoundsException e) { 290 // Not a valid digit. 291 } 292 if (ptrResult != null) 293 ptrResult.value = ptr; 294 return sign < 0 ? -r : r; 295 } 296 297 /** 298 * Parse 4 character base 16 (hex) formatted string to unsigned integer. 299 * <p> 300 * The number is read in network byte order, that is, most significant 301 * nybble first. 302 * 303 * @param bs 304 * buffer to parse digits from; positions {@code [p, p+4)} will 305 * be parsed. 306 * @param p 307 * first position within the buffer to parse. 308 * @return the integer value. 309 * @throws ArrayIndexOutOfBoundsException 310 * if the string is not hex formatted. 311 */ 312 public static final int parseHexInt16(final byte[] bs, final int p) { 313 int r = digits16[bs[p]] << 4; 314 315 r |= digits16[bs[p + 1]]; 316 r <<= 4; 317 318 r |= digits16[bs[p + 2]]; 319 r <<= 4; 320 321 r |= digits16[bs[p + 3]]; 322 if (r < 0) 323 throw new ArrayIndexOutOfBoundsException(); 324 return r; 325 } 326 327 /** 328 * Parse 8 character base 16 (hex) formatted string to unsigned integer. 329 * <p> 330 * The number is read in network byte order, that is, most significant 331 * nybble first. 332 * 333 * @param bs 334 * buffer to parse digits from; positions {@code [p, p+8)} will 335 * be parsed. 336 * @param p 337 * first position within the buffer to parse. 338 * @return the integer value. 339 * @throws ArrayIndexOutOfBoundsException 340 * if the string is not hex formatted. 341 */ 342 public static final int parseHexInt32(final byte[] bs, final int p) { 343 int r = digits16[bs[p]] << 4; 344 345 r |= digits16[bs[p + 1]]; 346 r <<= 4; 347 348 r |= digits16[bs[p + 2]]; 349 r <<= 4; 350 351 r |= digits16[bs[p + 3]]; 352 r <<= 4; 353 354 r |= digits16[bs[p + 4]]; 355 r <<= 4; 356 357 r |= digits16[bs[p + 5]]; 358 r <<= 4; 359 360 r |= digits16[bs[p + 6]]; 361 362 final int last = digits16[bs[p + 7]]; 363 if (r < 0 || last < 0) 364 throw new ArrayIndexOutOfBoundsException(); 365 return (r << 4) | last; 366 } 367 368 /** 369 * Parse 16 character base 16 (hex) formatted string to unsigned long. 370 * <p> 371 * The number is read in network byte order, that is, most significant 372 * nibble first. 373 * 374 * @param bs 375 * buffer to parse digits from; positions {@code [p, p+16)} will 376 * be parsed. 377 * @param p 378 * first position within the buffer to parse. 379 * @return the integer value. 380 * @throws ArrayIndexOutOfBoundsException 381 * if the string is not hex formatted. 382 * @since 4.3 383 */ 384 public static final long parseHexInt64(final byte[] bs, final int p) { 385 long r = digits16[bs[p]] << 4; 386 387 r |= digits16[bs[p + 1]]; 388 r <<= 4; 389 390 r |= digits16[bs[p + 2]]; 391 r <<= 4; 392 393 r |= digits16[bs[p + 3]]; 394 r <<= 4; 395 396 r |= digits16[bs[p + 4]]; 397 r <<= 4; 398 399 r |= digits16[bs[p + 5]]; 400 r <<= 4; 401 402 r |= digits16[bs[p + 6]]; 403 r <<= 4; 404 405 r |= digits16[bs[p + 7]]; 406 r <<= 4; 407 408 r |= digits16[bs[p + 8]]; 409 r <<= 4; 410 411 r |= digits16[bs[p + 9]]; 412 r <<= 4; 413 414 r |= digits16[bs[p + 10]]; 415 r <<= 4; 416 417 r |= digits16[bs[p + 11]]; 418 r <<= 4; 419 420 r |= digits16[bs[p + 12]]; 421 r <<= 4; 422 423 r |= digits16[bs[p + 13]]; 424 r <<= 4; 425 426 r |= digits16[bs[p + 14]]; 427 428 final int last = digits16[bs[p + 15]]; 429 if (r < 0 || last < 0) 430 throw new ArrayIndexOutOfBoundsException(); 431 return (r << 4) | last; 432 } 433 434 /** 435 * Parse a single hex digit to its numeric value (0-15). 436 * 437 * @param digit 438 * hex character to parse. 439 * @return numeric value, in the range 0-15. 440 * @throws ArrayIndexOutOfBoundsException 441 * if the input digit is not a valid hex digit. 442 */ 443 public static final int parseHexInt4(final byte digit) { 444 final byte r = digits16[digit]; 445 if (r < 0) 446 throw new ArrayIndexOutOfBoundsException(); 447 return r; 448 } 449 450 /** 451 * Parse a Git style timezone string. 452 * <p> 453 * The sequence "-0315" will be parsed as the numeric value -195, as the 454 * lower two positions count minutes, not 100ths of an hour. 455 * 456 * @param b 457 * buffer to scan. 458 * @param ptr 459 * position within buffer to start parsing digits at. 460 * @return the timezone at this location, expressed in minutes. 461 */ 462 public static final int parseTimeZoneOffset(final byte[] b, int ptr) { 463 return parseTimeZoneOffset(b, ptr, null); 464 } 465 466 /** 467 * Parse a Git style timezone string. 468 * <p> 469 * The sequence "-0315" will be parsed as the numeric value -195, as the 470 * lower two positions count minutes, not 100ths of an hour. 471 * 472 * @param b 473 * buffer to scan. 474 * @param ptr 475 * position within buffer to start parsing digits at. 476 * @param ptrResult 477 * optional location to return the new ptr value through. If null 478 * the ptr value will be discarded. 479 * @return the timezone at this location, expressed in minutes. 480 * @since 4.1 481 */ 482 public static final int parseTimeZoneOffset(final byte[] b, int ptr, 483 MutableInteger ptrResult) { 484 final int v = parseBase10(b, ptr, ptrResult); 485 final int tzMins = v % 100; 486 final int tzHours = v / 100; 487 return tzHours * 60 + tzMins; 488 } 489 490 /** 491 * Locate the first position after a given character. 492 * 493 * @param b 494 * buffer to scan. 495 * @param ptr 496 * position within buffer to start looking for chrA at. 497 * @param chrA 498 * character to find. 499 * @return new position just after chrA. 500 */ 501 public static final int next(final byte[] b, int ptr, final char chrA) { 502 final int sz = b.length; 503 while (ptr < sz) { 504 if (b[ptr++] == chrA) 505 return ptr; 506 } 507 return ptr; 508 } 509 510 /** 511 * Locate the first position after the next LF. 512 * <p> 513 * This method stops on the first '\n' it finds. 514 * 515 * @param b 516 * buffer to scan. 517 * @param ptr 518 * position within buffer to start looking for LF at. 519 * @return new position just after the first LF found. 520 */ 521 public static final int nextLF(final byte[] b, int ptr) { 522 return next(b, ptr, '\n'); 523 } 524 525 /** 526 * Locate the first position after either the given character or LF. 527 * <p> 528 * This method stops on the first match it finds from either chrA or '\n'. 529 * 530 * @param b 531 * buffer to scan. 532 * @param ptr 533 * position within buffer to start looking for chrA or LF at. 534 * @param chrA 535 * character to find. 536 * @return new position just after the first chrA or LF to be found. 537 */ 538 public static final int nextLF(final byte[] b, int ptr, final char chrA) { 539 final int sz = b.length; 540 while (ptr < sz) { 541 final byte c = b[ptr++]; 542 if (c == chrA || c == '\n') 543 return ptr; 544 } 545 return ptr; 546 } 547 548 /** 549 * Locate the first position before a given character. 550 * 551 * @param b 552 * buffer to scan. 553 * @param ptr 554 * position within buffer to start looking for chrA at. 555 * @param chrA 556 * character to find. 557 * @return new position just before chrA, -1 for not found 558 */ 559 public static final int prev(final byte[] b, int ptr, final char chrA) { 560 if (ptr == b.length) 561 --ptr; 562 while (ptr >= 0) { 563 if (b[ptr--] == chrA) 564 return ptr; 565 } 566 return ptr; 567 } 568 569 /** 570 * Locate the first position before the previous LF. 571 * <p> 572 * This method stops on the first '\n' it finds. 573 * 574 * @param b 575 * buffer to scan. 576 * @param ptr 577 * position within buffer to start looking for LF at. 578 * @return new position just before the first LF found, -1 for not found 579 */ 580 public static final int prevLF(final byte[] b, int ptr) { 581 return prev(b, ptr, '\n'); 582 } 583 584 /** 585 * Locate the previous position before either the given character or LF. 586 * <p> 587 * This method stops on the first match it finds from either chrA or '\n'. 588 * 589 * @param b 590 * buffer to scan. 591 * @param ptr 592 * position within buffer to start looking for chrA or LF at. 593 * @param chrA 594 * character to find. 595 * @return new position just before the first chrA or LF to be found, -1 for 596 * not found 597 */ 598 public static final int prevLF(final byte[] b, int ptr, final char chrA) { 599 if (ptr == b.length) 600 --ptr; 601 while (ptr >= 0) { 602 final byte c = b[ptr--]; 603 if (c == chrA || c == '\n') 604 return ptr; 605 } 606 return ptr; 607 } 608 609 /** 610 * Index the region between <code>[ptr, end)</code> to find line starts. 611 * <p> 612 * The returned list is 1 indexed. Index 0 contains 613 * {@link Integer#MIN_VALUE} to pad the list out. 614 * <p> 615 * Using a 1 indexed list means that line numbers can be directly accessed 616 * from the list, so <code>list.get(1)</code> (aka get line 1) returns 617 * <code>ptr</code>. 618 * <p> 619 * The last element (index <code>map.size()-1</code>) always contains 620 * <code>end</code>. 621 * 622 * @param buf 623 * buffer to scan. 624 * @param ptr 625 * position within the buffer corresponding to the first byte of 626 * line 1. 627 * @param end 628 * 1 past the end of the content within <code>buf</code>. 629 * @return a line map indexing the start position of each line. 630 */ 631 public static final IntList lineMap(final byte[] buf, int ptr, int end) { 632 // Experimentally derived from multiple source repositories 633 // the average number of bytes/line is 36. Its a rough guess 634 // to initially size our map close to the target. 635 // 636 final IntList map = new IntList((end - ptr) / 36); 637 map.fillTo(1, Integer.MIN_VALUE); 638 for (; ptr < end; ptr = nextLF(buf, ptr)) 639 map.add(ptr); 640 map.add(end); 641 return map; 642 } 643 644 /** 645 * Locate the "author " header line data. 646 * 647 * @param b 648 * buffer to scan. 649 * @param ptr 650 * position in buffer to start the scan at. Most callers should 651 * pass 0 to ensure the scan starts from the beginning of the 652 * commit buffer and does not accidentally look at message body. 653 * @return position just after the space in "author ", so the first 654 * character of the author's name. If no author header can be 655 * located -1 is returned. 656 */ 657 public static final int author(final byte[] b, int ptr) { 658 final int sz = b.length; 659 if (ptr == 0) 660 ptr += 46; // skip the "tree ..." line. 661 while (ptr < sz && b[ptr] == 'p') 662 ptr += 48; // skip this parent. 663 return match(b, ptr, author); 664 } 665 666 /** 667 * Locate the "committer " header line data. 668 * 669 * @param b 670 * buffer to scan. 671 * @param ptr 672 * position in buffer to start the scan at. Most callers should 673 * pass 0 to ensure the scan starts from the beginning of the 674 * commit buffer and does not accidentally look at message body. 675 * @return position just after the space in "committer ", so the first 676 * character of the committer's name. If no committer header can be 677 * located -1 is returned. 678 */ 679 public static final int committer(final byte[] b, int ptr) { 680 final int sz = b.length; 681 if (ptr == 0) 682 ptr += 46; // skip the "tree ..." line. 683 while (ptr < sz && b[ptr] == 'p') 684 ptr += 48; // skip this parent. 685 if (ptr < sz && b[ptr] == 'a') 686 ptr = nextLF(b, ptr); 687 return match(b, ptr, committer); 688 } 689 690 /** 691 * Locate the "tagger " header line data. 692 * 693 * @param b 694 * buffer to scan. 695 * @param ptr 696 * position in buffer to start the scan at. Most callers should 697 * pass 0 to ensure the scan starts from the beginning of the tag 698 * buffer and does not accidentally look at message body. 699 * @return position just after the space in "tagger ", so the first 700 * character of the tagger's name. If no tagger header can be 701 * located -1 is returned. 702 */ 703 public static final int tagger(final byte[] b, int ptr) { 704 final int sz = b.length; 705 if (ptr == 0) 706 ptr += 48; // skip the "object ..." line. 707 while (ptr < sz) { 708 if (b[ptr] == '\n') 709 return -1; 710 final int m = match(b, ptr, tagger); 711 if (m >= 0) 712 return m; 713 ptr = nextLF(b, ptr); 714 } 715 return -1; 716 } 717 718 /** 719 * Locate the "encoding " header line. 720 * 721 * @param b 722 * buffer to scan. 723 * @param ptr 724 * position in buffer to start the scan at. Most callers should 725 * pass 0 to ensure the scan starts from the beginning of the 726 * buffer and does not accidentally look at the message body. 727 * @return position just after the space in "encoding ", so the first 728 * character of the encoding's name. If no encoding header can be 729 * located -1 is returned (and UTF-8 should be assumed). 730 */ 731 public static final int encoding(final byte[] b, int ptr) { 732 final int sz = b.length; 733 while (ptr < sz) { 734 if (b[ptr] == '\n') 735 return -1; 736 if (b[ptr] == 'e') 737 break; 738 ptr = nextLF(b, ptr); 739 } 740 return match(b, ptr, encoding); 741 } 742 743 /** 744 * Parse the "encoding " header as a string. 745 * <p> 746 * Locates the "encoding " header (if present) and returns its value. 747 * 748 * @param b 749 * buffer to scan. 750 * @return the encoding header as specified in the commit; null if the 751 * header was not present and should be assumed. 752 * @since 4.2 753 */ 754 @Nullable 755 public static String parseEncodingName(final byte[] b) { 756 int enc = encoding(b, 0); 757 if (enc < 0) { 758 return null; 759 } 760 int lf = nextLF(b, enc); 761 return decode(UTF_8, b, enc, lf - 1); 762 } 763 764 /** 765 * Parse the "encoding " header into a character set reference. 766 * <p> 767 * Locates the "encoding " header (if present) by first calling 768 * {@link #encoding(byte[], int)} and then returns the proper character set 769 * to apply to this buffer to evaluate its contents as character data. 770 * <p> 771 * If no encoding header is present {@code UTF-8} is assumed. 772 * 773 * @param b 774 * buffer to scan. 775 * @return the Java character set representation. Never null. 776 * @throws IllegalCharsetNameException 777 * if the character set requested by the encoding header is 778 * malformed and unsupportable. 779 * @throws UnsupportedCharsetException 780 * if the JRE does not support the character set requested by 781 * the encoding header. 782 */ 783 public static Charset parseEncoding(final byte[] b) { 784 String enc = parseEncodingName(b); 785 if (enc == null) { 786 return UTF_8; 787 } 788 789 String name = enc.trim(); 790 try { 791 return Charset.forName(name); 792 } catch (IllegalCharsetNameException 793 | UnsupportedCharsetException badName) { 794 Charset aliased = charsetForAlias(name); 795 if (aliased != null) { 796 return aliased; 797 } 798 throw badName; 799 } 800 } 801 802 /** 803 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. 804 * <p> 805 * Leading spaces won't be trimmed from the string, i.e. will show up in the 806 * parsed name afterwards. 807 * 808 * @param in 809 * the string to parse a name from. 810 * @return the parsed identity or null in case the identity could not be 811 * parsed. 812 */ 813 public static PersonIdent parsePersonIdent(final String in) { 814 return parsePersonIdent(Constants.encode(in), 0); 815 } 816 817 /** 818 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent. 819 * <p> 820 * When passing in a value for <code>nameB</code> callers should use the 821 * return value of {@link #author(byte[], int)} or 822 * {@link #committer(byte[], int)}, as these methods provide the proper 823 * position within the buffer. 824 * 825 * @param raw 826 * the buffer to parse character data from. 827 * @param nameB 828 * first position of the identity information. This should be the 829 * first position after the space which delimits the header field 830 * name (e.g. "author" or "committer") from the rest of the 831 * identity line. 832 * @return the parsed identity or null in case the identity could not be 833 * parsed. 834 */ 835 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) { 836 Charset cs; 837 try { 838 cs = parseEncoding(raw); 839 } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { 840 // Assume UTF-8 for person identities, usually this is correct. 841 // If not decode() will fall back to the ISO-8859-1 encoding. 842 cs = UTF_8; 843 } 844 845 final int emailB = nextLF(raw, nameB, '<'); 846 final int emailE = nextLF(raw, emailB, '>'); 847 if (emailB >= raw.length || raw[emailB] == '\n' || 848 (emailE >= raw.length - 1 && raw[emailE - 1] != '>')) 849 return null; 850 851 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ? 852 emailB - 2 : emailB - 1; 853 final String name = decode(cs, raw, nameB, nameEnd); 854 final String email = decode(cs, raw, emailB, emailE - 1); 855 856 // Start searching from end of line, as after first name-email pair, 857 // another name-email pair may occur. We will ignore all kinds of 858 // "junk" following the first email. 859 // 860 // We've to use (emailE - 1) for the case that raw[email] is LF, 861 // otherwise we would run too far. "-2" is necessary to position 862 // before the LF in case of LF termination resp. the penultimate 863 // character if there is no trailing LF. 864 final int tzBegin = lastIndexOfTrim(raw, ' ', 865 nextLF(raw, emailE - 1) - 2) + 1; 866 if (tzBegin <= emailE) // No time/zone, still valid 867 return new PersonIdent(name, email, 0, 0); 868 869 final int whenBegin = Math.max(emailE, 870 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1); 871 if (whenBegin >= tzBegin - 1) // No time/zone, still valid 872 return new PersonIdent(name, email, 0, 0); 873 874 final long when = parseLongBase10(raw, whenBegin, null); 875 final int tz = parseTimeZoneOffset(raw, tzBegin); 876 return new PersonIdent(name, email, when * 1000L, tz); 877 } 878 879 /** 880 * Parse a name data (e.g. as within a reflog) into a PersonIdent. 881 * <p> 882 * When passing in a value for <code>nameB</code> callers should use the 883 * return value of {@link #author(byte[], int)} or 884 * {@link #committer(byte[], int)}, as these methods provide the proper 885 * position within the buffer. 886 * 887 * @param raw 888 * the buffer to parse character data from. 889 * @param nameB 890 * first position of the identity information. This should be the 891 * first position after the space which delimits the header field 892 * name (e.g. "author" or "committer") from the rest of the 893 * identity line. 894 * @return the parsed identity. Never null. 895 */ 896 public static PersonIdent parsePersonIdentOnly(final byte[] raw, 897 final int nameB) { 898 int stop = nextLF(raw, nameB); 899 int emailB = nextLF(raw, nameB, '<'); 900 int emailE = nextLF(raw, emailB, '>'); 901 final String name; 902 final String email; 903 if (emailE < stop) { 904 email = decode(raw, emailB, emailE - 1); 905 } else { 906 email = "invalid"; //$NON-NLS-1$ 907 } 908 if (emailB < stop) 909 name = decode(raw, nameB, emailB - 2); 910 else 911 name = decode(raw, nameB, stop); 912 913 final MutableInteger ptrout = new MutableInteger(); 914 long when; 915 int tz; 916 if (emailE < stop) { 917 when = parseLongBase10(raw, emailE + 1, ptrout); 918 tz = parseTimeZoneOffset(raw, ptrout.value); 919 } else { 920 when = 0; 921 tz = 0; 922 } 923 return new PersonIdent(name, email, when * 1000L, tz); 924 } 925 926 /** 927 * Locate the end of a footer line key string. 928 * <p> 929 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. 930 * "Signed-off-by: A. U. Thor\n") then this method returns the position of 931 * the first ':'. 932 * <p> 933 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} 934 * then this method returns -1. 935 * 936 * @param raw 937 * buffer to scan. 938 * @param ptr 939 * first position within raw to consider as a footer line key. 940 * @return position of the ':' which terminates the footer line key if this 941 * is otherwise a valid footer line key; otherwise -1. 942 */ 943 public static int endOfFooterLineKey(final byte[] raw, int ptr) { 944 try { 945 for (;;) { 946 final byte c = raw[ptr]; 947 if (footerLineKeyChars[c] == 0) { 948 if (c == ':') 949 return ptr; 950 return -1; 951 } 952 ptr++; 953 } 954 } catch (ArrayIndexOutOfBoundsException e) { 955 return -1; 956 } 957 } 958 959 /** 960 * Decode a buffer under UTF-8, if possible. 961 * 962 * If the byte stream cannot be decoded that way, the platform default is tried 963 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 964 * 965 * @param buffer 966 * buffer to pull raw bytes from. 967 * @return a string representation of the range <code>[start,end)</code>, 968 * after decoding the region through the specified character set. 969 */ 970 public static String decode(final byte[] buffer) { 971 return decode(buffer, 0, buffer.length); 972 } 973 974 /** 975 * Decode a buffer under UTF-8, if possible. 976 * 977 * If the byte stream cannot be decoded that way, the platform default is 978 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 979 * 980 * @param buffer 981 * buffer to pull raw bytes from. 982 * @param start 983 * start position in buffer 984 * @param end 985 * one position past the last location within the buffer to take 986 * data from. 987 * @return a string representation of the range <code>[start,end)</code>, 988 * after decoding the region through the specified character set. 989 */ 990 public static String decode(final byte[] buffer, final int start, 991 final int end) { 992 return decode(UTF_8, buffer, start, end); 993 } 994 995 /** 996 * Decode a buffer under the specified character set if possible. 997 * 998 * If the byte stream cannot be decoded that way, the platform default is tried 999 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1000 * 1001 * @param cs 1002 * character set to use when decoding the buffer. 1003 * @param buffer 1004 * buffer to pull raw bytes from. 1005 * @return a string representation of the range <code>[start,end)</code>, 1006 * after decoding the region through the specified character set. 1007 */ 1008 public static String decode(final Charset cs, final byte[] buffer) { 1009 return decode(cs, buffer, 0, buffer.length); 1010 } 1011 1012 /** 1013 * Decode a region of the buffer under the specified character set if possible. 1014 * 1015 * If the byte stream cannot be decoded that way, the platform default is tried 1016 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 1017 * 1018 * @param cs 1019 * character set to use when decoding the buffer. 1020 * @param buffer 1021 * buffer to pull raw bytes from. 1022 * @param start 1023 * first position within the buffer to take data from. 1024 * @param end 1025 * one position past the last location within the buffer to take 1026 * data from. 1027 * @return a string representation of the range <code>[start,end)</code>, 1028 * after decoding the region through the specified character set. 1029 */ 1030 public static String decode(final Charset cs, final byte[] buffer, 1031 final int start, final int end) { 1032 try { 1033 return decodeNoFallback(cs, buffer, start, end); 1034 } catch (CharacterCodingException e) { 1035 // Fall back to an ISO-8859-1 style encoding. At least all of 1036 // the bytes will be present in the output. 1037 // 1038 return extractBinaryString(buffer, start, end); 1039 } 1040 } 1041 1042 /** 1043 * Decode a region of the buffer under the specified character set if 1044 * possible. 1045 * 1046 * If the byte stream cannot be decoded that way, the platform default is 1047 * tried and if that too fails, an exception is thrown. 1048 * 1049 * @param cs 1050 * character set to use when decoding the buffer. 1051 * @param buffer 1052 * buffer to pull raw bytes from. 1053 * @param start 1054 * first position within the buffer to take data from. 1055 * @param end 1056 * one position past the last location within the buffer to take 1057 * data from. 1058 * @return a string representation of the range <code>[start,end)</code>, 1059 * after decoding the region through the specified character set. 1060 * @throws CharacterCodingException 1061 * the input is not in any of the tested character sets. 1062 */ 1063 public static String decodeNoFallback(final Charset cs, 1064 final byte[] buffer, final int start, final int end) 1065 throws CharacterCodingException { 1066 ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); 1067 b.mark(); 1068 1069 // Try our built-in favorite. The assumption here is that 1070 // decoding will fail if the data is not actually encoded 1071 // using that encoder. 1072 try { 1073 return decode(b, UTF_8); 1074 } catch (CharacterCodingException e) { 1075 b.reset(); 1076 } 1077 1078 if (!cs.equals(UTF_8)) { 1079 // Try the suggested encoding, it might be right since it was 1080 // provided by the caller. 1081 try { 1082 return decode(b, cs); 1083 } catch (CharacterCodingException e) { 1084 b.reset(); 1085 } 1086 } 1087 1088 // Try the default character set. A small group of people 1089 // might actually use the same (or very similar) locale. 1090 Charset defcs = Charset.defaultCharset(); 1091 if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { 1092 try { 1093 return decode(b, defcs); 1094 } catch (CharacterCodingException e) { 1095 b.reset(); 1096 } 1097 } 1098 1099 throw new CharacterCodingException(); 1100 } 1101 1102 /** 1103 * Decode a region of the buffer under the ISO-8859-1 encoding. 1104 * 1105 * Each byte is treated as a single character in the 8859-1 character 1106 * encoding, performing a raw binary->char conversion. 1107 * 1108 * @param buffer 1109 * buffer to pull raw bytes from. 1110 * @param start 1111 * first position within the buffer to take data from. 1112 * @param end 1113 * one position past the last location within the buffer to take 1114 * data from. 1115 * @return a string representation of the range <code>[start,end)</code>. 1116 */ 1117 public static String extractBinaryString(final byte[] buffer, 1118 final int start, final int end) { 1119 final StringBuilder r = new StringBuilder(end - start); 1120 for (int i = start; i < end; i++) 1121 r.append((char) (buffer[i] & 0xff)); 1122 return r.toString(); 1123 } 1124 1125 private static String decode(final ByteBuffer b, final Charset charset) 1126 throws CharacterCodingException { 1127 final CharsetDecoder d = charset.newDecoder(); 1128 d.onMalformedInput(CodingErrorAction.REPORT); 1129 d.onUnmappableCharacter(CodingErrorAction.REPORT); 1130 return d.decode(b).toString(); 1131 } 1132 1133 /** 1134 * Locate the position of the commit message body. 1135 * 1136 * @param b 1137 * buffer to scan. 1138 * @param ptr 1139 * position in buffer to start the scan at. Most callers should 1140 * pass 0 to ensure the scan starts from the beginning of the 1141 * commit buffer. 1142 * @return position of the user's message buffer. 1143 */ 1144 public static final int commitMessage(final byte[] b, int ptr) { 1145 final int sz = b.length; 1146 if (ptr == 0) 1147 ptr += 46; // skip the "tree ..." line. 1148 while (ptr < sz && b[ptr] == 'p') 1149 ptr += 48; // skip this parent. 1150 1151 // Skip any remaining header lines, ignoring what their actual 1152 // header line type is. This is identical to the logic for a tag. 1153 // 1154 return tagMessage(b, ptr); 1155 } 1156 1157 /** 1158 * Locate the position of the tag message body. 1159 * 1160 * @param b 1161 * buffer to scan. 1162 * @param ptr 1163 * position in buffer to start the scan at. Most callers should 1164 * pass 0 to ensure the scan starts from the beginning of the tag 1165 * buffer. 1166 * @return position of the user's message buffer. 1167 */ 1168 public static final int tagMessage(final byte[] b, int ptr) { 1169 final int sz = b.length; 1170 if (ptr == 0) 1171 ptr += 48; // skip the "object ..." line. 1172 while (ptr < sz && b[ptr] != '\n') 1173 ptr = nextLF(b, ptr); 1174 if (ptr < sz && b[ptr] == '\n') 1175 return ptr + 1; 1176 return -1; 1177 } 1178 1179 /** 1180 * Locate the end of a paragraph. 1181 * <p> 1182 * A paragraph is ended by two consecutive LF bytes or CRLF pairs 1183 * 1184 * @param b 1185 * buffer to scan. 1186 * @param start 1187 * position in buffer to start the scan at. Most callers will 1188 * want to pass the first position of the commit message (as 1189 * found by {@link #commitMessage(byte[], int)}. 1190 * @return position of the LF at the end of the paragraph; 1191 * <code>b.length</code> if no paragraph end could be located. 1192 */ 1193 public static final int endOfParagraph(final byte[] b, final int start) { 1194 int ptr = start; 1195 final int sz = b.length; 1196 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r')) 1197 ptr = nextLF(b, ptr); 1198 if (ptr > start && b[ptr - 1] == '\n') 1199 ptr--; 1200 if (ptr > start && b[ptr - 1] == '\r') 1201 ptr--; 1202 return ptr; 1203 } 1204 1205 /** 1206 * @param raw 1207 * buffer to scan. 1208 * @param ch 1209 * character to find. 1210 * @param pos 1211 * starting position. 1212 * @return last index of ch in raw, trimming spaces. 1213 * @since 4.1 1214 */ 1215 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) { 1216 while (pos >= 0 && raw[pos] == ' ') 1217 pos--; 1218 1219 while (pos >= 0 && raw[pos] != ch) 1220 pos--; 1221 1222 return pos; 1223 } 1224 1225 private static Charset charsetForAlias(String name) { 1226 return encodingAliases.get(StringUtils.toLowerCase(name)); 1227 } 1228 1229 private RawParseUtils() { 1230 // Don't create instances of a static only utility. 1231 } 1232 }