1 /* 2 * Copyright (C) 2009, Google Inc. 3 * Copyright (C) 2008-2021, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others 4 * 5 * This program and the accompanying materials are made available under the 6 * terms of the Eclipse Distribution License v. 1.0 which is available at 7 * https://www.eclipse.org/org/documents/edl-v10.php. 8 * 9 * SPDX-License-Identifier: BSD-3-Clause 10 */ 11 12 package org.eclipse.jgit.diff; 13 14 import java.io.EOFException; 15 import java.io.File; 16 import java.io.IOException; 17 import java.io.InputStream; 18 import java.io.OutputStream; 19 import java.nio.ByteBuffer; 20 import java.util.concurrent.atomic.AtomicInteger; 21 22 import org.eclipse.jgit.errors.BinaryBlobException; 23 import org.eclipse.jgit.errors.LargeObjectException; 24 import org.eclipse.jgit.lib.ObjectLoader; 25 import org.eclipse.jgit.util.IO; 26 import org.eclipse.jgit.util.IntList; 27 import org.eclipse.jgit.util.RawParseUtils; 28 29 /** 30 * A Sequence supporting UNIX formatted text in byte[] format. 31 * <p> 32 * Elements of the sequence are the lines of the file, as delimited by the UNIX 33 * newline character ('\n'). The file content is treated as 8 bit binary text, 34 * with no assumptions or requirements on character encoding. 35 * <p> 36 * Note that the first line of the file is element 0, as defined by the Sequence 37 * interface API. Traditionally in a text editor a patch file the first line is 38 * line number 1. Callers may need to subtract 1 prior to invoking methods if 39 * they are converting from "line number" to "element index". 40 */ 41 public class RawText extends Sequence { 42 43 /** A RawText of length 0 */ 44 public static final RawText EMPTY_TEXT = new RawText(new byte[0]); 45 46 /** 47 * Default and minimum for {@link #BUFFER_SIZE}. 48 */ 49 private static final int FIRST_FEW_BYTES = 8 * 1024; 50 51 /** 52 * Number of bytes to check for heuristics in {@link #isBinary(byte[])}. 53 */ 54 private static final AtomicInteger BUFFER_SIZE = new AtomicInteger( 55 FIRST_FEW_BYTES); 56 57 /** The file content for this sequence. */ 58 protected final byte[] content; 59 60 /** Map of line number to starting position within {@link #content}. */ 61 protected final IntList lines; 62 63 /** 64 * Create a new sequence from an existing content byte array. 65 * <p> 66 * The entire array (indexes 0 through length-1) is used as the content. 67 * 68 * @param input 69 * the content array. The object retains a reference to this 70 * array, so it should be immutable. 71 */ 72 public RawText(byte[] input) { 73 this(input, RawParseUtils.lineMap(input, 0, input.length)); 74 } 75 76 /** 77 * Create a new sequence from the existing content byte array and the line 78 * map indicating line boundaries. 79 * 80 * @param input 81 * the content array. The object retains a reference to this 82 * array, so it should be immutable. 83 * @param lineMap 84 * an array with 1-based offsets for the start of each line. 85 * The first and last entries should be {@link Integer#MIN_VALUE} 86 * and an offset one past the end of the last line, respectively. 87 * @since 5.0 88 */ 89 public RawText(byte[] input, IntList lineMap) { 90 content = input; 91 lines = lineMap; 92 } 93 94 /** 95 * Create a new sequence from a file. 96 * <p> 97 * The entire file contents are used. 98 * 99 * @param file 100 * the text file. 101 * @throws java.io.IOException 102 * if Exceptions occur while reading the file 103 */ 104 public RawText(File file) throws IOException { 105 this(IO.readFully(file)); 106 } 107 108 /** 109 * @return the raw, unprocessed content read. 110 * @since 4.11 111 */ 112 public byte[] getRawContent() { 113 return content; 114 } 115 116 /** @return total number of items in the sequence. */ 117 /** {@inheritDoc} */ 118 @Override 119 public int size() { 120 // The line map is always 2 entries larger than the number of lines in 121 // the file. Index 0 is padded out/unused. The last index is the total 122 // length of the buffer, and acts as a sentinel. 123 // 124 return lines.size() - 2; 125 } 126 127 /** 128 * Write a specific line to the output stream, without its trailing LF. 129 * <p> 130 * The specified line is copied as-is, with no character encoding 131 * translation performed. 132 * <p> 133 * If the specified line ends with an LF ('\n'), the LF is <b>not</b> 134 * copied. It is up to the caller to write the LF, if desired, between 135 * output lines. 136 * 137 * @param out 138 * stream to copy the line data onto. 139 * @param i 140 * index of the line to extract. Note this is 0-based, so line 141 * number 1 is actually index 0. 142 * @throws java.io.IOException 143 * the stream write operation failed. 144 */ 145 public void writeLine(OutputStream out, int i) 146 throws IOException { 147 int start = getStart(i); 148 int end = getEnd(i); 149 if (content[end - 1] == '\n') 150 end--; 151 out.write(content, start, end - start); 152 } 153 154 /** 155 * Determine if the file ends with a LF ('\n'). 156 * 157 * @return true if the last line has an LF; false otherwise. 158 */ 159 public boolean isMissingNewlineAtEnd() { 160 final int end = lines.get(lines.size() - 1); 161 if (end == 0) 162 return true; 163 return content[end - 1] != '\n'; 164 } 165 166 /** 167 * Get the text for a single line. 168 * 169 * @param i 170 * index of the line to extract. Note this is 0-based, so line 171 * number 1 is actually index 0. 172 * @return the text for the line, without a trailing LF. 173 */ 174 public String getString(int i) { 175 return getString(i, i + 1, true); 176 } 177 178 /** 179 * Get the raw text for a single line. 180 * 181 * @param i 182 * index of the line to extract. Note this is 0-based, so line 183 * number 1 is actually index 0. 184 * @return the text for the line, without a trailing LF, as a 185 * {@link ByteBuffer} that is backed by a slice of the 186 * {@link #getRawContent() raw content}, with the buffer's position 187 * on the start of the line and the limit at the end. 188 * @since 5.12 189 */ 190 public ByteBuffer getRawString(int i) { 191 int s = getStart(i); 192 int e = getEnd(i); 193 if (e > 0 && content[e - 1] == '\n') { 194 e--; 195 } 196 return ByteBuffer.wrap(content, s, e - s); 197 } 198 199 /** 200 * Get the text for a region of lines. 201 * 202 * @param begin 203 * index of the first line to extract. Note this is 0-based, so 204 * line number 1 is actually index 0. 205 * @param end 206 * index of one past the last line to extract. 207 * @param dropLF 208 * if true the trailing LF ('\n') of the last returned line is 209 * dropped, if present. 210 * @return the text for lines {@code [begin, end)}. 211 */ 212 public String getString(int begin, int end, boolean dropLF) { 213 if (begin == end) 214 return ""; //$NON-NLS-1$ 215 216 int s = getStart(begin); 217 int e = getEnd(end - 1); 218 if (dropLF && content[e - 1] == '\n') 219 e--; 220 return decode(s, e); 221 } 222 223 /** 224 * Decode a region of the text into a String. 225 * 226 * The default implementation of this method tries to guess the character 227 * set by considering UTF-8, the platform default, and falling back on 228 * ISO-8859-1 if neither of those can correctly decode the region given. 229 * 230 * @param start 231 * first byte of the content to decode. 232 * @param end 233 * one past the last byte of the content to decode. 234 * @return the region {@code [start, end)} decoded as a String. 235 */ 236 protected String decode(int start, int end) { 237 return RawParseUtils.decode(content, start, end); 238 } 239 240 private int getStart(int i) { 241 return lines.get(i + 1); 242 } 243 244 private int getEnd(int i) { 245 return lines.get(i + 2); 246 } 247 248 /** 249 * Obtains the buffer size to use for analyzing whether certain content is 250 * text or binary, or what line endings are used if it's text. 251 * 252 * @return the buffer size, by default {@link #FIRST_FEW_BYTES} bytes 253 * @since 6.0 254 */ 255 public static int getBufferSize() { 256 return BUFFER_SIZE.get(); 257 } 258 259 /** 260 * Sets the buffer size to use for analyzing whether certain content is text 261 * or binary, or what line endings are used if it's text. If the given 262 * {@code bufferSize} is smaller than {@link #FIRST_FEW_BYTES} set the 263 * buffer size to {@link #FIRST_FEW_BYTES}. 264 * 265 * @param bufferSize 266 * Size to set 267 * @return the size actually set 268 * @since 6.0 269 */ 270 public static int setBufferSize(int bufferSize) { 271 int newSize = Math.max(FIRST_FEW_BYTES, bufferSize); 272 return BUFFER_SIZE.updateAndGet(curr -> newSize); 273 } 274 275 /** 276 * Determine heuristically whether the bytes contained in a stream 277 * represents binary (as opposed to text) content. 278 * 279 * Note: Do not further use this stream after having called this method! The 280 * stream may not be fully read and will be left at an unknown position 281 * after consuming an unknown number of bytes. The caller is responsible for 282 * closing the stream. 283 * 284 * @param raw 285 * input stream containing the raw file content. 286 * @return true if raw is likely to be a binary file, false otherwise 287 * @throws java.io.IOException 288 * if input stream could not be read 289 */ 290 public static boolean isBinary(InputStream raw) throws IOException { 291 final byte[] buffer = new byte[getBufferSize()]; 292 int cnt = 0; 293 while (cnt < buffer.length) { 294 final int n = raw.read(buffer, cnt, buffer.length - cnt); 295 if (n == -1) 296 break; 297 cnt += n; 298 } 299 return isBinary(buffer, cnt, cnt < buffer.length); 300 } 301 302 /** 303 * Determine heuristically whether a byte array represents binary (as 304 * opposed to text) content. 305 * 306 * @param raw 307 * the raw file content. 308 * @return true if raw is likely to be a binary file, false otherwise 309 */ 310 public static boolean isBinary(byte[] raw) { 311 return isBinary(raw, raw.length); 312 } 313 314 /** 315 * Determine heuristically whether a byte array represents binary (as 316 * opposed to text) content. 317 * 318 * @param raw 319 * the raw file content. 320 * @param length 321 * number of bytes in {@code raw} to evaluate. This should be 322 * {@code raw.length} unless {@code raw} was over-allocated by 323 * the caller. 324 * @return true if raw is likely to be a binary file, false otherwise 325 */ 326 public static boolean isBinary(byte[] raw, int length) { 327 return isBinary(raw, length, false); 328 } 329 330 /** 331 * Determine heuristically whether a byte array represents binary (as 332 * opposed to text) content. 333 * 334 * @param raw 335 * the raw file content. 336 * @param length 337 * number of bytes in {@code raw} to evaluate. This should be 338 * {@code raw.length} unless {@code raw} was over-allocated by 339 * the caller. 340 * @param complete 341 * whether {@code raw} contains the whole data 342 * @return true if raw is likely to be a binary file, false otherwise 343 * @since 6.0 344 */ 345 public static boolean isBinary(byte[] raw, int length, boolean complete) { 346 // Similar heuristic as C Git. Differences: 347 // - limited buffer size; may be only the beginning of a large blob 348 // - no counting of printable vs. non-printable bytes < 0x20 and 0x7F 349 int maxLength = getBufferSize(); 350 if (length > maxLength) { 351 length = maxLength; 352 } 353 byte last = 'x'; // Just something inconspicuous. 354 for (int ptr = 0; ptr < length; ptr++) { 355 byte curr = raw[ptr]; 356 if (isBinary(curr, last)) { 357 return true; 358 } 359 last = curr; 360 } 361 if (complete) { 362 // Buffer contains everything... 363 return last == '\r'; // ... so this must be a lone CR 364 } 365 return false; 366 } 367 368 /** 369 * Determines from the last two bytes read from a source if it looks like 370 * binary content. 371 * 372 * @param curr 373 * the last byte, read after {@code prev} 374 * @param prev 375 * the previous byte, read before {@code last} 376 * @return {@code true} if either byte is NUL, or if prev is CR and curr is 377 * not LF, {@code false} otherwise 378 * @since 6.0 379 */ 380 public static boolean isBinary(byte curr, byte prev) { 381 return curr == '\0' || (curr != '\n' && prev == '\r') || prev == '\0'; 382 } 383 384 /** 385 * Determine heuristically whether a byte array represents text content 386 * using CR-LF as line separator. 387 * 388 * @param raw 389 * the raw file content. 390 * @return {@code true} if raw is likely to be CR-LF delimited text, 391 * {@code false} otherwise 392 * @since 5.3 393 */ 394 public static boolean isCrLfText(byte[] raw) { 395 return isCrLfText(raw, raw.length); 396 } 397 398 /** 399 * Determine heuristically whether the bytes contained in a stream represent 400 * text content using CR-LF as line separator. 401 * 402 * Note: Do not further use this stream after having called this method! The 403 * stream may not be fully read and will be left at an unknown position 404 * after consuming an unknown number of bytes. The caller is responsible for 405 * closing the stream. 406 * 407 * @param raw 408 * input stream containing the raw file content. 409 * @return {@code true} if raw is likely to be CR-LF delimited text, 410 * {@code false} otherwise 411 * @throws java.io.IOException 412 * if input stream could not be read 413 * @since 5.3 414 */ 415 public static boolean isCrLfText(InputStream raw) throws IOException { 416 byte[] buffer = new byte[getBufferSize()]; 417 int cnt = 0; 418 while (cnt < buffer.length) { 419 int n = raw.read(buffer, cnt, buffer.length - cnt); 420 if (n == -1) { 421 break; 422 } 423 cnt += n; 424 } 425 return isCrLfText(buffer, cnt); 426 } 427 428 /** 429 * Determine heuristically whether a byte array represents text content 430 * using CR-LF as line separator. 431 * 432 * @param raw 433 * the raw file content. 434 * @param length 435 * number of bytes in {@code raw} to evaluate. 436 * @return {@code true} if raw is likely to be CR-LF delimited text, 437 * {@code false} otherwise 438 * @since 5.3 439 */ 440 public static boolean isCrLfText(byte[] raw, int length) { 441 return isCrLfText(raw, length, false); 442 } 443 444 /** 445 * Determine heuristically whether a byte array represents text content 446 * using CR-LF as line separator. 447 * 448 * @param raw 449 * the raw file content. 450 * @param length 451 * number of bytes in {@code raw} to evaluate. 452 * @return {@code true} if raw is likely to be CR-LF delimited text, 453 * {@code false} otherwise 454 * @param complete 455 * whether {@code raw} contains the whole data 456 * @since 6.0 457 */ 458 public static boolean isCrLfText(byte[] raw, int length, boolean complete) { 459 boolean has_crlf = false; 460 byte last = 'x'; // Just something inconspicuous 461 for (int ptr = 0; ptr < length; ptr++) { 462 byte curr = raw[ptr]; 463 if (isBinary(curr, last)) { 464 return false; 465 } 466 if (curr == '\n' && last == '\r') { 467 has_crlf = true; 468 } 469 last = curr; 470 } 471 if (last == '\r') { 472 if (complete) { 473 // Lone CR: it's binary after all. 474 return false; 475 } 476 // Tough call. If the next byte, which we don't have, would be a 477 // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide 478 // based on what we already scanned; it wasn't binary until now. 479 } 480 return has_crlf; 481 } 482 483 /** 484 * Get the line delimiter for the first line. 485 * 486 * @since 2.0 487 * @return the line delimiter or <code>null</code> 488 */ 489 public String getLineDelimiter() { 490 if (size() == 0) { 491 return null; 492 } 493 int e = getEnd(0); 494 if (content[e - 1] != '\n') { 495 return null; 496 } 497 if (content.length > 1 && e > 1 && content[e - 2] == '\r') { 498 return "\r\n"; //$NON-NLS-1$ 499 } 500 return "\n"; //$NON-NLS-1$ 501 } 502 503 /** 504 * Read a blob object into RawText, or throw BinaryBlobException if the blob 505 * is binary. 506 * 507 * @param ldr 508 * the ObjectLoader for the blob 509 * @param threshold 510 * if the blob is larger than this size, it is always assumed to 511 * be binary. 512 * @since 4.10 513 * @return the RawText representing the blob. 514 * @throws org.eclipse.jgit.errors.BinaryBlobException 515 * if the blob contains binary data. 516 * @throws java.io.IOException 517 * if the input could not be read. 518 */ 519 public static RawText load(ObjectLoader ldr, int threshold) 520 throws IOException, BinaryBlobException { 521 long sz = ldr.getSize(); 522 523 if (sz > threshold) { 524 throw new BinaryBlobException(); 525 } 526 527 int bufferSize = getBufferSize(); 528 if (sz <= bufferSize) { 529 byte[] data = ldr.getCachedBytes(bufferSize); 530 if (isBinary(data, data.length, true)) { 531 throw new BinaryBlobException(); 532 } 533 return new RawText(data); 534 } 535 536 byte[] head = new byte[bufferSize]; 537 try (InputStream stream = ldr.openStream()) { 538 int off = 0; 539 int left = head.length; 540 byte last = 'x'; // Just something inconspicuous 541 while (left > 0) { 542 int n = stream.read(head, off, left); 543 if (n < 0) { 544 throw new EOFException(); 545 } 546 left -= n; 547 548 while (n > 0) { 549 byte curr = head[off]; 550 if (isBinary(curr, last)) { 551 throw new BinaryBlobException(); 552 } 553 last = curr; 554 off++; 555 n--; 556 } 557 } 558 559 byte[] data; 560 try { 561 data = new byte[(int)sz]; 562 } catch (OutOfMemoryError e) { 563 throw new LargeObjectException.OutOfMemory(e); 564 } 565 566 System.arraycopy(head, 0, data, 0, head.length); 567 IO.readFully(stream, data, off, (int) (sz-off)); 568 return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz)); 569 } 570 } 571 }