/*
 * Copyright (C) 2009, Google Inc.
 * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.diff;

import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.eclipse.jgit.errors.BinaryBlobException;
import org.eclipse.jgit.errors.LargeObjectException;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.util.IO;
import org.eclipse.jgit.util.IntList;
import org.eclipse.jgit.util.RawParseUtils;

/**
 * A Sequence supporting UNIX formatted text in byte[] format.
 * <p>
 * Elements of the sequence are the lines of the file, as delimited by the UNIX
 * newline character ('\n'). The file content is treated as 8 bit binary text,
 * with no assumptions or requirements on character encoding.
 * <p>
 * Note that the first line of the file is element 0, as defined by the Sequence
 * interface API. Traditionally in a text editor or a patch file the first line
 * is line number 1. Callers may need to subtract 1 prior to invoking methods if
 * they are converting from "line number" to "element index".
 */
public class RawText extends Sequence {
	/** A RawText of length 0 */
	public static final RawText EMPTY_TEXT = new RawText(new byte[0]);

	/** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
	static final int FIRST_FEW_BYTES = 8000;

	/** The file content for this sequence. */
	protected final byte[] content;

	/** Map of line number to starting position within {@link #content}. */
	protected final IntList lines;

	/**
	 * Create a new sequence from an existing content byte array.
	 * <p>
	 * The entire array (indexes 0 through length-1) is used as the content.
	 *
	 * @param input
	 *            the content array. The object retains a reference to this
	 *            array, so it should be immutable.
	 */
	public RawText(byte[] input) {
		this(input, RawParseUtils.lineMap(input, 0, input.length));
	}

	/**
	 * Create a new sequence from the existing content byte array and the line
	 * map indicating line boundaries.
	 *
	 * @param input
	 *            the content array. The object retains a reference to this
	 *            array, so it should be immutable.
	 * @param lineMap
	 *            an array with 1-based offsets for the start of each line.
	 *            The first and last entries should be {@link Integer#MIN_VALUE}
	 *            and an offset one past the end of the last line, respectively.
	 * @since 5.0
	 */
	public RawText(byte[] input, IntList lineMap) {
		content = input;
		lines = lineMap;
	}

	/**
	 * Create a new sequence from a file.
	 * <p>
	 * The entire file contents are used.
	 *
	 * @param file
	 *            the text file.
	 * @throws java.io.IOException
	 *             if Exceptions occur while reading the file
	 */
	public RawText(File file) throws IOException {
		this(IO.readFully(file));
	}

	/**
	 * Get the backing byte array.
	 *
	 * @return the raw, unprocessed content read. This is the internal array
	 *         itself, not a copy.
	 * @since 4.11
	 */
	public byte[] getRawContent() {
		return content;
	}

	/** {@inheritDoc} */
	@Override
	public int size() {
		// The line map is always 2 entries larger than the number of lines in
		// the file. Index 0 is padded out/unused. The last index is the total
		// length of the buffer, and acts as a sentinel.
		//
		return lines.size() - 2;
	}

	/**
	 * Write a specific line to the output stream, without its trailing LF.
	 * <p>
	 * The specified line is copied as-is, with no character encoding
	 * translation performed.
	 * <p>
	 * If the specified line ends with an LF ('\n'), the LF is <b>not</b>
	 * copied. It is up to the caller to write the LF, if desired, between
	 * output lines.
	 *
	 * @param out
	 *            stream to copy the line data onto.
	 * @param i
	 *            index of the line to extract. Note this is 0-based, so line
	 *            number 1 is actually index 0.
	 * @throws java.io.IOException
	 *             the stream write operation failed.
	 */
	public void writeLine(OutputStream out, int i)
			throws IOException {
		int start = getStart(i);
		int end = getEnd(i);
		if (content[end - 1] == '\n') {
			end--;
		}
		out.write(content, start, end - start);
	}

	/**
	 * Determine if the file is missing a trailing LF ('\n').
	 *
	 * @return true if the content is empty or its last byte is not an LF;
	 *         false if the content ends with an LF.
	 */
	public boolean isMissingNewlineAtEnd() {
		// The last line map entry is the sentinel: one past the final byte.
		final int end = lines.get(lines.size() - 1);
		if (end == 0) {
			return true;
		}
		return content[end - 1] != '\n';
	}

	/**
	 * Get the text for a single line.
	 *
	 * @param i
	 *            index of the line to extract. Note this is 0-based, so line
	 *            number 1 is actually index 0.
	 * @return the text for the line, without a trailing LF.
	 */
	public String getString(int i) {
		return getString(i, i + 1, true);
	}

	/**
	 * Get the text for a region of lines.
	 *
	 * @param begin
	 *            index of the first line to extract. Note this is 0-based, so
	 *            line number 1 is actually index 0.
	 * @param end
	 *            index of one past the last line to extract.
	 * @param dropLF
	 *            if true the trailing LF ('\n') of the last returned line is
	 *            dropped, if present.
	 * @return the text for lines {@code [begin, end)}.
	 */
	public String getString(int begin, int end, boolean dropLF) {
		if (begin == end) {
			return ""; //$NON-NLS-1$
		}

		int s = getStart(begin);
		int e = getEnd(end - 1);
		if (dropLF && content[e - 1] == '\n') {
			e--;
		}
		return decode(s, e);
	}

	/**
	 * Decode a region of the text into a String.
	 *
	 * The default implementation of this method tries to guess the character
	 * set by considering UTF-8, the platform default, and falling back on
	 * ISO-8859-1 if neither of those can correctly decode the region given.
	 *
	 * @param start
	 *            first byte of the content to decode.
	 * @param end
	 *            one past the last byte of the content to decode.
	 * @return the region {@code [start, end)} decoded as a String.
	 */
	protected String decode(int start, int end) {
		return RawParseUtils.decode(content, start, end);
	}

	/** Offset of the first byte of line {@code i}; skips the padding entry. */
	private int getStart(int i) {
		return lines.get(i + 1);
	}

	/** Offset one past the last byte of line {@code i}. */
	private int getEnd(int i) {
		return lines.get(i + 2);
	}

	/**
	 * Fill {@code buffer} from {@code raw} until it is full or the stream is
	 * exhausted.
	 *
	 * @param raw
	 *            stream to read from; it is not closed.
	 * @param buffer
	 *            destination, filled from index 0.
	 * @return number of bytes actually stored in {@code buffer}.
	 * @throws IOException
	 *             if the stream could not be read.
	 */
	private static int readFirstBytes(InputStream raw, byte[] buffer)
			throws IOException {
		int cnt = 0;
		while (cnt < buffer.length) {
			final int n = raw.read(buffer, cnt, buffer.length - cnt);
			if (n == -1) {
				break;
			}
			cnt += n;
		}
		return cnt;
	}

	/**
	 * Determine heuristically whether a byte array represents binary (as
	 * opposed to text) content.
	 *
	 * @param raw
	 *            the raw file content.
	 * @return true if raw is likely to be a binary file, false otherwise
	 */
	public static boolean isBinary(byte[] raw) {
		return isBinary(raw, raw.length);
	}

	/**
	 * Determine heuristically whether the bytes contained in a stream
	 * represents binary (as opposed to text) content.
	 *
	 * Note: Do not further use this stream after having called this method! The
	 * stream may not be fully read and will be left at an unknown position
	 * after consuming an unknown number of bytes. The caller is responsible for
	 * closing the stream.
	 *
	 * @param raw
	 *            input stream containing the raw file content.
	 * @return true if raw is likely to be a binary file, false otherwise
	 * @throws java.io.IOException
	 *             if input stream could not be read
	 */
	public static boolean isBinary(InputStream raw) throws IOException {
		final byte[] buffer = new byte[FIRST_FEW_BYTES];
		final int cnt = readFirstBytes(raw, buffer);
		return isBinary(buffer, cnt);
	}

	/**
	 * Determine heuristically whether a byte array represents binary (as
	 * opposed to text) content.
	 *
	 * @param raw
	 *            the raw file content.
	 * @param length
	 *            number of bytes in {@code raw} to evaluate. This should be
	 *            {@code raw.length} unless {@code raw} was over-allocated by
	 *            the caller.
	 * @return true if raw is likely to be a binary file, false otherwise
	 */
	public static boolean isBinary(byte[] raw, int length) {
		// Same heuristic as C Git: a NUL within the first few bytes means
		// binary.
		if (length > FIRST_FEW_BYTES) {
			length = FIRST_FEW_BYTES;
		}
		for (int ptr = 0; ptr < length; ptr++) {
			if (raw[ptr] == '\0') {
				return true;
			}
		}
		return false;
	}

	/**
	 * Determine heuristically whether a byte array represents text content
	 * using CR-LF as line separator.
	 *
	 * @param raw
	 *            the raw file content.
	 * @return {@code true} if raw is likely to be CR-LF delimited text,
	 *         {@code false} otherwise
	 * @since 5.3
	 */
	public static boolean isCrLfText(byte[] raw) {
		return isCrLfText(raw, raw.length);
	}

	/**
	 * Determine heuristically whether the bytes contained in a stream represent
	 * text content using CR-LF as line separator.
	 *
	 * Note: Do not further use this stream after having called this method! The
	 * stream may not be fully read and will be left at an unknown position
	 * after consuming an unknown number of bytes. The caller is responsible for
	 * closing the stream.
	 *
	 * @param raw
	 *            input stream containing the raw file content.
	 * @return {@code true} if raw is likely to be CR-LF delimited text,
	 *         {@code false} otherwise
	 * @throws java.io.IOException
	 *             if input stream could not be read
	 * @since 5.3
	 */
	public static boolean isCrLfText(InputStream raw) throws IOException {
		final byte[] buffer = new byte[FIRST_FEW_BYTES];
		final int cnt = readFirstBytes(raw, buffer);
		return isCrLfText(buffer, cnt);
	}

	/**
	 * Determine heuristically whether a byte array represents text content
	 * using CR-LF as line separator.
	 * <p>
	 * Every one of the first {@code length} bytes is inspected; a NUL byte
	 * anywhere in that range, including the very last position, marks the
	 * content as binary.
	 *
	 * @param raw
	 *            the raw file content.
	 * @param length
	 *            number of bytes in {@code raw} to evaluate.
	 * @return {@code true} if raw is likely to be CR-LF delimited text,
	 *         {@code false} otherwise
	 * @since 5.3
	 */
	public static boolean isCrLfText(byte[] raw, int length) {
		boolean hasCrLf = false;
		for (int ptr = 0; ptr < length; ptr++) {
			final byte b = raw[ptr];
			if (b == '\0') {
				return false; // binary
			}
			// Only look ahead while the next byte is still inside the
			// evaluated range.
			if (b == '\r' && ptr + 1 < length && raw[ptr + 1] == '\n') {
				hasCrLf = true;
			}
		}
		return hasCrLf;
	}

	/**
	 * Get the line delimiter for the first line.
	 *
	 * @since 2.0
	 * @return the line delimiter or <code>null</code> if there are no lines
	 *         or the first line does not end with an LF
	 */
	public String getLineDelimiter() {
		if (size() == 0) {
			return null;
		}
		int e = getEnd(0);
		if (content[e - 1] != '\n') {
			return null;
		}
		// e > 1 guarantees there is a byte before the LF to inspect.
		if (e > 1 && content[e - 2] == '\r') {
			return "\r\n"; //$NON-NLS-1$
		}
		return "\n"; //$NON-NLS-1$
	}

	/**
	 * Read a blob object into RawText, or throw BinaryBlobException if the blob
	 * is binary.
	 *
	 * @param ldr
	 *            the ObjectLoader for the blob
	 * @param threshold
	 *            if the blob is larger than this size, it is always assumed to
	 *            be binary.
	 * @since 4.10
	 * @return the RawText representing the blob.
	 * @throws org.eclipse.jgit.errors.BinaryBlobException
	 *             if the blob contains binary data.
	 * @throws java.io.IOException
	 *             if the input could not be read.
	 */
	public static RawText load(ObjectLoader ldr, int threshold)
			throws IOException, BinaryBlobException {
		long sz = ldr.getSize();

		if (sz > threshold) {
			throw new BinaryBlobException();
		}

		// Small blobs: take the whole content at once and run the heuristic
		// over it.
		if (sz <= FIRST_FEW_BYTES) {
			byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES);
			if (isBinary(data)) {
				throw new BinaryBlobException();
			}
			return new RawText(data);
		}

		// Larger blobs: stream the first FIRST_FEW_BYTES and scan them for
		// NUL before allocating the full-size array.
		byte[] head = new byte[FIRST_FEW_BYTES];
		try (InputStream stream = ldr.openStream()) {
			int off = 0;
			int left = head.length;
			while (left > 0) {
				int n = stream.read(head, off, left);
				if (n < 0) {
					// sz > FIRST_FEW_BYTES here, so the stream ended early.
					throw new EOFException();
				}
				left -= n;

				// Check just the bytes delivered by this read.
				while (n > 0) {
					if (head[off] == '\0') {
						throw new BinaryBlobException();
					}
					off++;
					n--;
				}
			}

			byte[] data;
			try {
				// sz <= threshold, which is an int, so the cast cannot
				// truncate.
				data = new byte[(int) sz];
			} catch (OutOfMemoryError e) {
				throw new LargeObjectException.OutOfMemory(e);
			}

			System.arraycopy(head, 0, data, 0, head.length);
			IO.readFully(stream, data, off, (int) (sz - off));
			return new RawText(data,
					RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
		}
	}
}