1 /* 2 * Copyright (C) 2009, Google Inc. 3 * Copyright (C) 2008-2021, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others 4 * 5 * This program and the accompanying materials are made available under the 6 * terms of the Eclipse Distribution License v. 1.0 which is available at 7 * https://www.eclipse.org/org/documents/edl-v10.php. 8 * 9 * SPDX-License-Identifier: BSD-3-Clause 10 */ 11 12 package org.eclipse.jgit.diff; 13 14 import java.io.EOFException; 15 import java.io.File; 16 import java.io.IOException; 17 import java.io.InputStream; 18 import java.io.OutputStream; 19 import java.nio.ByteBuffer; 20 21 import org.eclipse.jgit.errors.BinaryBlobException; 22 import org.eclipse.jgit.errors.LargeObjectException; 23 import org.eclipse.jgit.lib.ObjectLoader; 24 import org.eclipse.jgit.util.IO; 25 import org.eclipse.jgit.util.IntList; 26 import org.eclipse.jgit.util.RawParseUtils; 27 28 /** 29 * A Sequence supporting UNIX formatted text in byte[] format. 30 * <p> 31 * Elements of the sequence are the lines of the file, as delimited by the UNIX 32 * newline character ('\n'). The file content is treated as 8 bit binary text, 33 * with no assumptions or requirements on character encoding. 34 * <p> 35 * Note that the first line of the file is element 0, as defined by the Sequence 36 * interface API. Traditionally in a text editor a patch file the first line is 37 * line number 1. Callers may need to subtract 1 prior to invoking methods if 38 * they are converting from "line number" to "element index". 39 */ 40 public class RawText extends Sequence { 41 /** A RawText of length 0 */ 42 public static final RawText EMPTY_TEXT = new RawText(new byte[0]); 43 44 /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */ 45 static final int FIRST_FEW_BYTES = 8000; 46 47 /** The file content for this sequence. */ 48 protected final byte[] content; 49 50 /** Map of line number to starting position within {@link #content}. */ 51 protected final IntList lines; 52 53 /** 54 * Create a new sequence from an existing content byte array. 55 * <p> 56 * The entire array (indexes 0 through length-1) is used as the content. 57 * 58 * @param input 59 * the content array. The object retains a reference to this 60 * array, so it should be immutable. 61 */ 62 public RawText(byte[] input) { 63 this(input, RawParseUtils.lineMap(input, 0, input.length)); 64 } 65 66 /** 67 * Create a new sequence from the existing content byte array and the line 68 * map indicating line boundaries. 69 * 70 * @param input 71 * the content array. The object retains a reference to this 72 * array, so it should be immutable. 73 * @param lineMap 74 * an array with 1-based offsets for the start of each line. 75 * The first and last entries should be {@link Integer#MIN_VALUE} 76 * and an offset one past the end of the last line, respectively. 77 * @since 5.0 78 */ 79 public RawText(byte[] input, IntList lineMap) { 80 content = input; 81 lines = lineMap; 82 } 83 84 /** 85 * Create a new sequence from a file. 86 * <p> 87 * The entire file contents are used. 88 * 89 * @param file 90 * the text file. 91 * @throws java.io.IOException 92 * if Exceptions occur while reading the file 93 */ 94 public RawText(File file) throws IOException { 95 this(IO.readFully(file)); 96 } 97 98 /** 99 * @return the raw, unprocessed content read. 100 * @since 4.11 101 */ 102 public byte[] getRawContent() { 103 return content; 104 } 105 106 /** @return total number of items in the sequence. */ 107 /** {@inheritDoc} */ 108 @Override 109 public int size() { 110 // The line map is always 2 entries larger than the number of lines in 111 // the file. Index 0 is padded out/unused. The last index is the total 112 // length of the buffer, and acts as a sentinel. 113 // 114 return lines.size() - 2; 115 } 116 117 /** 118 * Write a specific line to the output stream, without its trailing LF. 119 * <p> 120 * The specified line is copied as-is, with no character encoding 121 * translation performed. 122 * <p> 123 * If the specified line ends with an LF ('\n'), the LF is <b>not</b> 124 * copied. It is up to the caller to write the LF, if desired, between 125 * output lines. 126 * 127 * @param out 128 * stream to copy the line data onto. 129 * @param i 130 * index of the line to extract. Note this is 0-based, so line 131 * number 1 is actually index 0. 132 * @throws java.io.IOException 133 * the stream write operation failed. 134 */ 135 public void writeLine(OutputStream out, int i) 136 throws IOException { 137 int start = getStart(i); 138 int end = getEnd(i); 139 if (content[end - 1] == '\n') 140 end--; 141 out.write(content, start, end - start); 142 } 143 144 /** 145 * Determine if the file ends with a LF ('\n'). 146 * 147 * @return true if the last line has an LF; false otherwise. 148 */ 149 public boolean isMissingNewlineAtEnd() { 150 final int end = lines.get(lines.size() - 1); 151 if (end == 0) 152 return true; 153 return content[end - 1] != '\n'; 154 } 155 156 /** 157 * Get the text for a single line. 158 * 159 * @param i 160 * index of the line to extract. Note this is 0-based, so line 161 * number 1 is actually index 0. 162 * @return the text for the line, without a trailing LF. 163 */ 164 public String getString(int i) { 165 return getString(i, i + 1, true); 166 } 167 168 /** 169 * Get the raw text for a single line. 170 * 171 * @param i 172 * index of the line to extract. Note this is 0-based, so line 173 * number 1 is actually index 0. 174 * @return the text for the line, without a trailing LF, as a 175 * {@link ByteBuffer} that is backed by a slice of the 176 * {@link #getRawContent() raw content}, with the buffer's position 177 * on the start of the line and the limit at the end. 178 * @since 5.12 179 */ 180 public ByteBuffer getRawString(int i) { 181 int s = getStart(i); 182 int e = getEnd(i); 183 if (e > 0 && content[e - 1] == '\n') { 184 e--; 185 } 186 return ByteBuffer.wrap(content, s, e - s); 187 } 188 189 /** 190 * Get the text for a region of lines. 191 * 192 * @param begin 193 * index of the first line to extract. Note this is 0-based, so 194 * line number 1 is actually index 0. 195 * @param end 196 * index of one past the last line to extract. 197 * @param dropLF 198 * if true the trailing LF ('\n') of the last returned line is 199 * dropped, if present. 200 * @return the text for lines {@code [begin, end)}. 201 */ 202 public String getString(int begin, int end, boolean dropLF) { 203 if (begin == end) 204 return ""; //$NON-NLS-1$ 205 206 int s = getStart(begin); 207 int e = getEnd(end - 1); 208 if (dropLF && content[e - 1] == '\n') 209 e--; 210 return decode(s, e); 211 } 212 213 /** 214 * Decode a region of the text into a String. 215 * 216 * The default implementation of this method tries to guess the character 217 * set by considering UTF-8, the platform default, and falling back on 218 * ISO-8859-1 if neither of those can correctly decode the region given. 219 * 220 * @param start 221 * first byte of the content to decode. 222 * @param end 223 * one past the last byte of the content to decode. 224 * @return the region {@code [start, end)} decoded as a String. 225 */ 226 protected String decode(int start, int end) { 227 return RawParseUtils.decode(content, start, end); 228 } 229 230 private int getStart(int i) { 231 return lines.get(i + 1); 232 } 233 234 private int getEnd(int i) { 235 return lines.get(i + 2); 236 } 237 238 /** 239 * Determine heuristically whether a byte array represents binary (as 240 * opposed to text) content. 241 * 242 * @param raw 243 * the raw file content. 244 * @return true if raw is likely to be a binary file, false otherwise 245 */ 246 public static boolean isBinary(byte[] raw) { 247 return isBinary(raw, raw.length); 248 } 249 250 /** 251 * Determine heuristically whether the bytes contained in a stream 252 * represents binary (as opposed to text) content. 253 * 254 * Note: Do not further use this stream after having called this method! The 255 * stream may not be fully read and will be left at an unknown position 256 * after consuming an unknown number of bytes. The caller is responsible for 257 * closing the stream. 258 * 259 * @param raw 260 * input stream containing the raw file content. 261 * @return true if raw is likely to be a binary file, false otherwise 262 * @throws java.io.IOException 263 * if input stream could not be read 264 */ 265 public static boolean isBinary(InputStream raw) throws IOException { 266 final byte[] buffer = new byte[FIRST_FEW_BYTES]; 267 int cnt = 0; 268 while (cnt < buffer.length) { 269 final int n = raw.read(buffer, cnt, buffer.length - cnt); 270 if (n == -1) 271 break; 272 cnt += n; 273 } 274 return isBinary(buffer, cnt); 275 } 276 277 /** 278 * Determine heuristically whether a byte array represents binary (as 279 * opposed to text) content. 280 * 281 * @param raw 282 * the raw file content. 283 * @param length 284 * number of bytes in {@code raw} to evaluate. This should be 285 * {@code raw.length} unless {@code raw} was over-allocated by 286 * the caller. 287 * @return true if raw is likely to be a binary file, false otherwise 288 */ 289 public static boolean isBinary(byte[] raw, int length) { 290 // Same heuristic as C Git 291 if (length > FIRST_FEW_BYTES) 292 length = FIRST_FEW_BYTES; 293 for (int ptr = 0; ptr < length; ptr++) 294 if (raw[ptr] == '\0') 295 return true; 296 297 return false; 298 } 299 300 /** 301 * Determine heuristically whether a byte array represents text content 302 * using CR-LF as line separator. 303 * 304 * @param raw 305 * the raw file content. 306 * @return {@code true} if raw is likely to be CR-LF delimited text, 307 * {@code false} otherwise 308 * @since 5.3 309 */ 310 public static boolean isCrLfText(byte[] raw) { 311 return isCrLfText(raw, raw.length); 312 } 313 314 /** 315 * Determine heuristically whether the bytes contained in a stream represent 316 * text content using CR-LF as line separator. 317 * 318 * Note: Do not further use this stream after having called this method! The 319 * stream may not be fully read and will be left at an unknown position 320 * after consuming an unknown number of bytes. The caller is responsible for 321 * closing the stream. 322 * 323 * @param raw 324 * input stream containing the raw file content. 325 * @return {@code true} if raw is likely to be CR-LF delimited text, 326 * {@code false} otherwise 327 * @throws java.io.IOException 328 * if input stream could not be read 329 * @since 5.3 330 */ 331 public static boolean isCrLfText(InputStream raw) throws IOException { 332 byte[] buffer = new byte[FIRST_FEW_BYTES]; 333 int cnt = 0; 334 while (cnt < buffer.length) { 335 int n = raw.read(buffer, cnt, buffer.length - cnt); 336 if (n == -1) { 337 break; 338 } 339 cnt += n; 340 } 341 return isCrLfText(buffer, cnt); 342 } 343 344 /** 345 * Determine heuristically whether a byte array represents text content 346 * using CR-LF as line separator. 347 * 348 * @param raw 349 * the raw file content. 350 * @param length 351 * number of bytes in {@code raw} to evaluate. 352 * @return {@code true} if raw is likely to be CR-LF delimited text, 353 * {@code false} otherwise 354 * @since 5.3 355 */ 356 public static boolean isCrLfText(byte[] raw, int length) { 357 boolean has_crlf = false; 358 for (int ptr = 0; ptr < length - 1; ptr++) { 359 if (raw[ptr] == '\0') { 360 return false; // binary 361 } else if (raw[ptr] == '\r' && raw[ptr + 1] == '\n') { 362 has_crlf = true; 363 } 364 } 365 return has_crlf; 366 } 367 368 /** 369 * Get the line delimiter for the first line. 370 * 371 * @since 2.0 372 * @return the line delimiter or <code>null</code> 373 */ 374 public String getLineDelimiter() { 375 if (size() == 0) { 376 return null; 377 } 378 int e = getEnd(0); 379 if (content[e - 1] != '\n') { 380 return null; 381 } 382 if (content.length > 1 && e > 1 && content[e - 2] == '\r') { 383 return "\r\n"; //$NON-NLS-1$ 384 } 385 return "\n"; //$NON-NLS-1$ 386 } 387 388 /** 389 * Read a blob object into RawText, or throw BinaryBlobException if the blob 390 * is binary. 391 * 392 * @param ldr 393 * the ObjectLoader for the blob 394 * @param threshold 395 * if the blob is larger than this size, it is always assumed to 396 * be binary. 397 * @since 4.10 398 * @return the RawText representing the blob. 399 * @throws org.eclipse.jgit.errors.BinaryBlobException 400 * if the blob contains binary data. 401 * @throws java.io.IOException 402 * if the input could not be read. 403 */ 404 public static RawText load(ObjectLoader ldr, int threshold) 405 throws IOException, BinaryBlobException { 406 long sz = ldr.getSize(); 407 408 if (sz > threshold) { 409 throw new BinaryBlobException(); 410 } 411 412 if (sz <= FIRST_FEW_BYTES) { 413 byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES); 414 if (isBinary(data)) { 415 throw new BinaryBlobException(); 416 } 417 return new RawText(data); 418 } 419 420 byte[] head = new byte[FIRST_FEW_BYTES]; 421 try (InputStream stream = ldr.openStream()) { 422 int off = 0; 423 int left = head.length; 424 while (left > 0) { 425 int n = stream.read(head, off, left); 426 if (n < 0) { 427 throw new EOFException(); 428 } 429 left -= n; 430 431 while (n > 0) { 432 if (head[off] == '\0') { 433 throw new BinaryBlobException(); 434 } 435 off++; 436 n--; 437 } 438 } 439 440 byte[] data; 441 try { 442 data = new byte[(int)sz]; 443 } catch (OutOfMemoryError e) { 444 throw new LargeObjectException.OutOfMemory(e); 445 } 446 447 System.arraycopy(head, 0, data, 0, head.length); 448 IO.readFully(stream, data, off, (int) (sz-off)); 449 return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz)); 450 } 451 } 452 }