/*
 * Copyright (C) 2009, Google Inc.
 * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.diff;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.eclipse.jgit.util.IO;
import org.eclipse.jgit.util.IntList;
import org.eclipse.jgit.util.RawParseUtils;

/**
 * A Sequence over UNIX formatted text held in a byte[].
 * <p>
 * Each element of the sequence is one line of the file, where lines are
 * delimited by the UNIX newline character ('\n'). The content is handled as 8
 * bit binary text; no character encoding is assumed or required.
 * <p>
 * The first line of the file is element 0, as defined by the Sequence
 * interface API. Text editors and patch files traditionally call the first
 * line "line number 1", so callers converting from a "line number" to an
 * "element index" may need to subtract 1 before invoking methods here.
 */
public class RawText extends Sequence {
    /** A RawText of length 0 */
    public static final RawText EMPTY_TEXT = new RawText(new byte[0]);

    /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
    private static final int FIRST_FEW_BYTES = 8000;

    /** The file content for this sequence. */
    protected final byte[] content;

    /** Map of line number to starting position within {@link #content}. */
    protected final IntList lines;

    /**
     * Create a new sequence backed by an existing content byte array.
     * <p>
     * All of the array (indexes 0 through length-1) is used as the content.
     *
     * @param input
     *            the content array. The array is never modified by this
     *            class, so passing through cached arrays is safe.
     */
    public RawText(final byte[] input) {
        this.content = input;
        this.lines = RawParseUtils.lineMap(input, 0, input.length);
    }

    /**
     * Create a new sequence from the complete contents of a file.
     *
     * @param file
     *            the text file.
     * @throws IOException
     *             if Exceptions occur while reading the file
     */
    public RawText(File file) throws IOException {
        this(IO.readFully(file));
    }

    /** @return total number of items in the sequence. */
    public int size() {
        // Two entries of the line map do not describe a line: index 0 is
        // padding (unused), and the final entry holds the total length of
        // the buffer, acting as a sentinel.
        return lines.size() - 2;
    }

    /**
     * Write a specific line to the output stream, without its trailing LF.
     * <p>
     * The bytes of the line are copied as-is; no character encoding
     * translation is performed.
     * <p>
     * If the line ends with an LF ('\n'), that LF is <b>not</b> copied. The
     * caller must write an LF between output lines itself, if desired.
     *
     * @param out
     *            stream to copy the line data onto.
     * @param i
     *            index of the line to extract. Note this is 0-based, so line
     *            number 1 is actually index 0.
     * @throws IOException
     *             the stream write operation failed.
     */
    public void writeLine(final OutputStream out, final int i)
            throws IOException {
        final int from = getStart(i);
        final int to = getEnd(i);
        // Exclude the terminating LF, when the line has one.
        final int stop = content[to - 1] == '\n' ? to - 1 : to;
        out.write(content, from, stop - from);
    }

    /**
     * Determine if the file is missing a LF ('\n') at its very end.
     *
     * @return true if the content is empty or its last byte is not an LF;
     *         false if the content ends with an LF.
     */
    public boolean isMissingNewlineAtEnd() {
        // The last line map entry is the sentinel: the total content length.
        final int total = lines.get(lines.size() - 1);
        return total == 0 || content[total - 1] != '\n';
    }

    /**
     * Get the text for a single line.
     *
     * @param i
     *            index of the line to extract. Note this is 0-based, so line
     *            number 1 is actually index 0.
     * @return the text for the line, without a trailing LF.
     */
    public String getString(int i) {
        return getString(i, i + 1, true);
    }

    /**
     * Get the text for a region of lines.
     *
     * @param begin
     *            index of the first line to extract. Note this is 0-based, so
     *            line number 1 is actually index 0.
     * @param end
     *            index of one past the last line to extract.
     * @param dropLF
     *            if true the trailing LF ('\n') of the last returned line is
     *            dropped, if present.
     * @return the text for lines {@code [begin, end)}; the empty string when
     *         {@code begin == end}.
     */
    public String getString(int begin, int end, boolean dropLF) {
        if (begin == end) {
            return ""; //$NON-NLS-1$
        }

        final int first = getStart(begin);
        final int limit = getEnd(end - 1);
        final boolean trimLF = dropLF && content[limit - 1] == '\n';
        return decode(first, trimLF ? limit - 1 : limit);
    }

    /**
     * Decode a region of the text into a String.
     *
     * The default implementation of this method tries to guess the character
     * set by considering UTF-8, the platform default, and falling back on
     * ISO-8859-1 if neither of those can correctly decode the region given.
     *
     * @param start
     *            first byte of the content to decode.
     * @param end
     *            one past the last byte of the content to decode.
     * @return the region {@code [start, end)} decoded as a String.
     */
    protected String decode(int start, int end) {
        return RawParseUtils.decode(content, start, end);
    }

    /** Offset of the first byte of line {@code i}; index 0 of the map is padding. */
    private int getStart(final int i) {
        return lines.get(i + 1);
    }

    /** Offset one past the last byte of line {@code i} (the next line's start). */
    private int getEnd(final int i) {
        return lines.get(i + 2);
    }

    /**
     * Determine heuristically whether a byte array represents binary (as
     * opposed to text) content.
     *
     * @param raw
     *            the raw file content.
     * @return true if raw is likely to be a binary file, false otherwise
     */
    public static boolean isBinary(byte[] raw) {
        return isBinary(raw, raw.length);
    }

    /**
     * Determine heuristically whether the bytes contained in a stream
     * represent binary (as opposed to text) content.
     *
     * Note: Do not further use this stream after having called this method!
     * The stream may not be fully read and will be left at an unknown
     * position after consuming an unknown number of bytes. The caller is
     * responsible for closing the stream.
     *
     * @param raw
     *            input stream containing the raw file content.
     * @return true if raw is likely to be a binary file, false otherwise
     * @throws IOException
     *             if input stream could not be read
     */
    public static boolean isBinary(InputStream raw) throws IOException {
        final byte[] head = new byte[FIRST_FEW_BYTES];
        int filled = 0;
        // Keep reading until the sample buffer is full or the stream ends.
        for (int got; filled < head.length; filled += got) {
            got = raw.read(head, filled, head.length - filled);
            if (got == -1) {
                break;
            }
        }
        return isBinary(head, filled);
    }

    /**
     * Determine heuristically whether a byte array represents binary (as
     * opposed to text) content.
     *
     * @param raw
     *            the raw file content.
     * @param length
     *            number of bytes in {@code raw} to evaluate. This should be
     *            {@code raw.length} unless {@code raw} was over-allocated by
     *            the caller.
     * @return true if raw is likely to be a binary file, false otherwise
     */
    public static boolean isBinary(byte[] raw, int length) {
        // Same heuristic as C Git: a NUL byte within the first few bytes
        // marks the content as binary.
        final int limit = Math.min(length, FIRST_FEW_BYTES);
        int ptr = 0;
        while (ptr < limit) {
            if (raw[ptr] == '\0') {
                return true;
            }
            ptr++;
        }
        return false;
    }

    /**
     * Get the line delimiter for the first line.
     *
     * @since 2.0
     * @return the line delimiter, or <code>null</code> if there are no lines
     *         or the first line does not end with an LF
     */
    public String getLineDelimiter() {
        if (size() == 0) {
            return null;
        }
        final int firstLineEnd = getEnd(0);
        if (content[firstLineEnd - 1] != '\n') {
            return null;
        }
        // A CR directly before the LF means the line uses CRLF.
        final boolean crlf = content.length > 1 && firstLineEnd > 1
                && content[firstLineEnd - 2] == '\r';
        return crlf ? "\r\n" : "\n"; //$NON-NLS-1$ //$NON-NLS-2$
    }
}