1 /* 2 * Copyright (C) 2009, Google Inc. 3 * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de> 4 * and other copyright owners as documented in the project's IP log. 5 * 6 * This program and the accompanying materials are made available 7 * under the terms of the Eclipse Distribution License v1.0 which 8 * accompanies this distribution, is reproduced below, and is 9 * available at http://www.eclipse.org/org/documents/edl-v10.php 10 * 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials provided 23 * with the distribution. 24 * 25 * - Neither the name of the Eclipse Foundation, Inc. nor the 26 * names of its contributors may be used to endorse or promote 27 * products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 */ 44 45 package org.eclipse.jgit.diff; 46 47 import java.io.EOFException; 48 import java.io.File; 49 import java.io.IOException; 50 import java.io.InputStream; 51 import java.io.OutputStream; 52 53 import org.eclipse.jgit.errors.BinaryBlobException; 54 import org.eclipse.jgit.errors.LargeObjectException; 55 import org.eclipse.jgit.lib.ObjectLoader; 56 import org.eclipse.jgit.util.IO; 57 import org.eclipse.jgit.util.IntList; 58 import org.eclipse.jgit.util.RawParseUtils; 59 60 /** 61 * A Sequence supporting UNIX formatted text in byte[] format. 62 * <p> 63 * Elements of the sequence are the lines of the file, as delimited by the UNIX 64 * newline character ('\n'). The file content is treated as 8 bit binary text, 65 * with no assumptions or requirements on character encoding. 66 * <p> 67 * Note that the first line of the file is element 0, as defined by the Sequence 68 * interface API. Traditionally in a text editor a patch file the first line is 69 * line number 1. Callers may need to subtract 1 prior to invoking methods if 70 * they are converting from "line number" to "element index". 71 */ 72 public class RawText extends Sequence { 73 /** A RawText of length 0 */ 74 public static final RawTextl#RawText">RawText EMPTY_TEXT = new RawText(new byte[0]); 75 76 /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */ 77 static final int FIRST_FEW_BYTES = 8000; 78 79 /** The file content for this sequence. */ 80 protected final byte[] content; 81 82 /** Map of line number to starting position within {@link #content}. */ 83 protected final IntList lines; 84 85 /** 86 * Create a new sequence from an existing content byte array. 87 * <p> 88 * The entire array (indexes 0 through length-1) is used as the content. 89 * 90 * @param input 91 * the content array. The object retains a reference to this 92 * array, so it should be immutable. 93 */ 94 public RawText(byte[] input) { 95 this(input, RawParseUtils.lineMap(input, 0, input.length)); 96 } 97 98 /** 99 * Create a new sequence from the existing content byte array and the line 100 * map indicating line boundaries. 101 * 102 * @param input 103 * the content array. The object retains a reference to this 104 * array, so it should be immutable. 105 * @param lineMap 106 * an array with 1-based offsets for the start of each line. 107 * The first and last entries should be {@link Integer#MIN_VALUE} 108 * and an offset one past the end of the last line, respectively. 109 * @since 5.0 110 */ 111 public RawText(byte[] input, IntList lineMap) { 112 content = input; 113 lines = lineMap; 114 } 115 116 /** 117 * Create a new sequence from a file. 118 * <p> 119 * The entire file contents are used. 120 * 121 * @param file 122 * the text file. 123 * @throws java.io.IOException 124 * if Exceptions occur while reading the file 125 */ 126 public RawText(File file) throws IOException { 127 this(IO.readFully(file)); 128 } 129 130 /** 131 * @return the raw, unprocessed content read. 132 * @since 4.11 133 */ 134 public byte[] getRawContent() { 135 return content; 136 } 137 138 /** @return total number of items in the sequence. */ 139 /** {@inheritDoc} */ 140 @Override 141 public int size() { 142 // The line map is always 2 entries larger than the number of lines in 143 // the file. Index 0 is padded out/unused. The last index is the total 144 // length of the buffer, and acts as a sentinel. 145 // 146 return lines.size() - 2; 147 } 148 149 /** 150 * Write a specific line to the output stream, without its trailing LF. 151 * <p> 152 * The specified line is copied as-is, with no character encoding 153 * translation performed. 154 * <p> 155 * If the specified line ends with an LF ('\n'), the LF is <b>not</b> 156 * copied. It is up to the caller to write the LF, if desired, between 157 * output lines. 158 * 159 * @param out 160 * stream to copy the line data onto. 161 * @param i 162 * index of the line to extract. Note this is 0-based, so line 163 * number 1 is actually index 0. 164 * @throws java.io.IOException 165 * the stream write operation failed. 166 */ 167 public void writeLine(OutputStream out, int i) 168 throws IOException { 169 int start = getStart(i); 170 int end = getEnd(i); 171 if (content[end - 1] == '\n') 172 end--; 173 out.write(content, start, end - start); 174 } 175 176 /** 177 * Determine if the file ends with a LF ('\n'). 178 * 179 * @return true if the last line has an LF; false otherwise. 180 */ 181 public boolean isMissingNewlineAtEnd() { 182 final int end = lines.get(lines.size() - 1); 183 if (end == 0) 184 return true; 185 return content[end - 1] != '\n'; 186 } 187 188 /** 189 * Get the text for a single line. 190 * 191 * @param i 192 * index of the line to extract. Note this is 0-based, so line 193 * number 1 is actually index 0. 194 * @return the text for the line, without a trailing LF. 195 */ 196 public String getString(int i) { 197 return getString(i, i + 1, true); 198 } 199 200 /** 201 * Get the text for a region of lines. 202 * 203 * @param begin 204 * index of the first line to extract. Note this is 0-based, so 205 * line number 1 is actually index 0. 206 * @param end 207 * index of one past the last line to extract. 208 * @param dropLF 209 * if true the trailing LF ('\n') of the last returned line is 210 * dropped, if present. 211 * @return the text for lines {@code [begin, end)}. 212 */ 213 public String getString(int begin, int end, boolean dropLF) { 214 if (begin == end) 215 return ""; //$NON-NLS-1$ 216 217 int s = getStart(begin); 218 int e = getEnd(end - 1); 219 if (dropLF && content[e - 1] == '\n') 220 e--; 221 return decode(s, e); 222 } 223 224 /** 225 * Decode a region of the text into a String. 226 * 227 * The default implementation of this method tries to guess the character 228 * set by considering UTF-8, the platform default, and falling back on 229 * ISO-8859-1 if neither of those can correctly decode the region given. 230 * 231 * @param start 232 * first byte of the content to decode. 233 * @param end 234 * one past the last byte of the content to decode. 235 * @return the region {@code [start, end)} decoded as a String. 236 */ 237 protected String decode(int start, int end) { 238 return RawParseUtils.decode(content, start, end); 239 } 240 241 private int getStart(int i) { 242 return lines.get(i + 1); 243 } 244 245 private int getEnd(int i) { 246 return lines.get(i + 2); 247 } 248 249 /** 250 * Determine heuristically whether a byte array represents binary (as 251 * opposed to text) content. 252 * 253 * @param raw 254 * the raw file content. 255 * @return true if raw is likely to be a binary file, false otherwise 256 */ 257 public static boolean isBinary(byte[] raw) { 258 return isBinary(raw, raw.length); 259 } 260 261 /** 262 * Determine heuristically whether the bytes contained in a stream 263 * represents binary (as opposed to text) content. 264 * 265 * Note: Do not further use this stream after having called this method! The 266 * stream may not be fully read and will be left at an unknown position 267 * after consuming an unknown number of bytes. The caller is responsible for 268 * closing the stream. 269 * 270 * @param raw 271 * input stream containing the raw file content. 272 * @return true if raw is likely to be a binary file, false otherwise 273 * @throws java.io.IOException 274 * if input stream could not be read 275 */ 276 public static boolean isBinary(InputStream raw) throws IOException { 277 final byte[] buffer = new byte[FIRST_FEW_BYTES]; 278 int cnt = 0; 279 while (cnt < buffer.length) { 280 final int n = raw.read(buffer, cnt, buffer.length - cnt); 281 if (n == -1) 282 break; 283 cnt += n; 284 } 285 return isBinary(buffer, cnt); 286 } 287 288 /** 289 * Determine heuristically whether a byte array represents binary (as 290 * opposed to text) content. 291 * 292 * @param raw 293 * the raw file content. 294 * @param length 295 * number of bytes in {@code raw} to evaluate. This should be 296 * {@code raw.length} unless {@code raw} was over-allocated by 297 * the caller. 298 * @return true if raw is likely to be a binary file, false otherwise 299 */ 300 public static boolean isBinary(byte[] raw, int length) { 301 // Same heuristic as C Git 302 if (length > FIRST_FEW_BYTES) 303 length = FIRST_FEW_BYTES; 304 for (int ptr = 0; ptr < length; ptr++) 305 if (raw[ptr] == '\0') 306 return true; 307 308 return false; 309 } 310 311 /** 312 * Determine heuristically whether a byte array represents text content 313 * using CR-LF as line separator. 314 * 315 * @param raw 316 * the raw file content. 317 * @return {@code true} if raw is likely to be CR-LF delimited text, 318 * {@code false} otherwise 319 * @since 5.3 320 */ 321 public static boolean isCrLfText(byte[] raw) { 322 return isCrLfText(raw, raw.length); 323 } 324 325 /** 326 * Determine heuristically whether the bytes contained in a stream represent 327 * text content using CR-LF as line separator. 328 * 329 * Note: Do not further use this stream after having called this method! The 330 * stream may not be fully read and will be left at an unknown position 331 * after consuming an unknown number of bytes. The caller is responsible for 332 * closing the stream. 333 * 334 * @param raw 335 * input stream containing the raw file content. 336 * @return {@code true} if raw is likely to be CR-LF delimited text, 337 * {@code false} otherwise 338 * @throws java.io.IOException 339 * if input stream could not be read 340 * @since 5.3 341 */ 342 public static boolean isCrLfText(InputStream raw) throws IOException { 343 byte[] buffer = new byte[FIRST_FEW_BYTES]; 344 int cnt = 0; 345 while (cnt < buffer.length) { 346 int n = raw.read(buffer, cnt, buffer.length - cnt); 347 if (n == -1) { 348 break; 349 } 350 cnt += n; 351 } 352 return isCrLfText(buffer, cnt); 353 } 354 355 /** 356 * Determine heuristically whether a byte array represents text content 357 * using CR-LF as line separator. 358 * 359 * @param raw 360 * the raw file content. 361 * @param length 362 * number of bytes in {@code raw} to evaluate. 363 * @return {@code true} if raw is likely to be CR-LF delimited text, 364 * {@code false} otherwise 365 * @since 5.3 366 */ 367 public static boolean isCrLfText(byte[] raw, int length) { 368 boolean has_crlf = false; 369 for (int ptr = 0; ptr < length - 1; ptr++) { 370 if (raw[ptr] == '\0') { 371 return false; // binary 372 } else if (raw[ptr] == '\r' && raw[ptr + 1] == '\n') { 373 has_crlf = true; 374 } 375 } 376 return has_crlf; 377 } 378 379 /** 380 * Get the line delimiter for the first line. 381 * 382 * @since 2.0 383 * @return the line delimiter or <code>null</code> 384 */ 385 public String getLineDelimiter() { 386 if (size() == 0) 387 return null; 388 int e = getEnd(0); 389 if (content[e - 1] != '\n') 390 return null; 391 if (content.length > 1 && e > 1 && content[e - 2] == '\r') 392 return "\r\n"; //$NON-NLS-1$ 393 else 394 return "\n"; //$NON-NLS-1$ 395 } 396 397 /** 398 * Read a blob object into RawText, or throw BinaryBlobException if the blob 399 * is binary. 400 * 401 * @param ldr 402 * the ObjectLoader for the blob 403 * @param threshold 404 * if the blob is larger than this size, it is always assumed to 405 * be binary. 406 * @since 4.10 407 * @return the RawText representing the blob. 408 * @throws org.eclipse.jgit.errors.BinaryBlobException 409 * if the blob contains binary data. 410 * @throws java.io.IOException 411 * if the input could not be read. 412 */ 413 public static RawText load(ObjectLoader ldr, int threshold) 414 throws IOException, BinaryBlobException { 415 long sz = ldr.getSize(); 416 417 if (sz > threshold) { 418 throw new BinaryBlobException(); 419 } 420 421 if (sz <= FIRST_FEW_BYTES) { 422 byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES); 423 if (isBinary(data)) { 424 throw new BinaryBlobException(); 425 } 426 return new RawText(data); 427 } 428 429 byte[] head = new byte[FIRST_FEW_BYTES]; 430 try (InputStream stream = ldr.openStream()) { 431 int off = 0; 432 int left = head.length; 433 while (left > 0) { 434 int n = stream.read(head, off, left); 435 if (n < 0) { 436 throw new EOFException(); 437 } 438 left -= n; 439 440 while (n > 0) { 441 if (head[off] == '\0') { 442 throw new BinaryBlobException(); 443 } 444 off++; 445 n--; 446 } 447 } 448 449 byte data[]; 450 try { 451 data = new byte[(int)sz]; 452 } catch (OutOfMemoryError e) { 453 throw new LargeObjectException.OutOfMemory(e); 454 } 455 456 System.arraycopy(head, 0, data, 0, head.length); 457 IO.readFully(stream, data, off, (int) (sz-off)); 458 return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz)); 459 } 460 } 461 }