1 /*
2 * Copyright (C) 2009, Google Inc.
3 * Copyright (C) 2008-2021, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
4 *
5 * This program and the accompanying materials are made available under the
6 * terms of the Eclipse Distribution License v. 1.0 which is available at
7 * https://www.eclipse.org/org/documents/edl-v10.php.
8 *
9 * SPDX-License-Identifier: BSD-3-Clause
10 */
11
12 package org.eclipse.jgit.diff;
13
14 import java.io.EOFException;
15 import java.io.File;
16 import java.io.IOException;
17 import java.io.InputStream;
18 import java.io.OutputStream;
19 import java.nio.ByteBuffer;
20
21 import org.eclipse.jgit.errors.BinaryBlobException;
22 import org.eclipse.jgit.errors.LargeObjectException;
23 import org.eclipse.jgit.lib.ObjectLoader;
24 import org.eclipse.jgit.util.IO;
25 import org.eclipse.jgit.util.IntList;
26 import org.eclipse.jgit.util.RawParseUtils;
27
28 /**
29 * A Sequence supporting UNIX formatted text in byte[] format.
30 * <p>
31 * Elements of the sequence are the lines of the file, as delimited by the UNIX
32 * newline character ('\n'). The file content is treated as 8 bit binary text,
33 * with no assumptions or requirements on character encoding.
34 * <p>
 * Note that the first line of the file is element 0, as defined by the Sequence
 * interface API. Traditionally, in a text editor or a patch file, the first
 * line is line number 1. Callers may need to subtract 1 prior to invoking
 * methods if they are converting from "line number" to "element index".
39 */
40 public class RawText extends Sequence {
/** A RawText of length 0, backed by an empty byte array. */
public static final RawText EMPTY_TEXT = new RawText(new byte[0]);

/**
 * Number of bytes to check for heuristics in {@link #isBinary(byte[])}.
 * <p>
 * Also the size of the buffer filled by the stream based heuristics, and
 * the blob size at or below which {@link #load(ObjectLoader, int)} obtains
 * the content through {@code ObjectLoader.getCachedBytes} instead of
 * streaming it.
 */
static final int FIRST_FEW_BYTES = 8000;

/** The file content for this sequence. */
protected final byte[] content;

/**
 * Map of line number to starting position within {@link #content}.
 * <p>
 * The map holds two entries more than there are lines: index 0 is unused
 * padding, and the last entry is the total length of the buffer, acting as
 * a sentinel.
 */
protected final IntList lines;
52
/**
 * Create a new sequence from an existing content byte array.
 * <p>
 * The entire array (indexes 0 through length-1) is used as the content.
 * The line map is computed with {@code RawParseUtils.lineMap} over the
 * whole array.
 *
 * @param input
 *            the content array. The object retains a reference to this
 *            array, so it should be immutable.
 */
public RawText(byte[] input) {
    // Delegate to the two-argument constructor with a freshly built line map.
    this(input, RawParseUtils.lineMap(input, 0, input.length));
}
65
/**
 * Create a new sequence from a content byte array and a precomputed line
 * map describing where each line starts.
 * <p>
 * Neither argument is copied or validated; both references are stored
 * as given.
 *
 * @param input
 *            the content array. The object retains a reference to this
 *            array, so it should be immutable.
 * @param lineMap
 *            the line boundaries. Entry {@code i + 1} is the offset in
 *            {@code input} at which line {@code i} starts. The first
 *            entry is padding and should be {@link Integer#MIN_VALUE};
 *            the last entry should be the offset one past the end of
 *            the last line.
 * @since 5.0
 */
public RawText(byte[] input, IntList lineMap) {
    this.lines = lineMap;
    this.content = input;
}
83
/**
 * Create a new sequence from a file.
 * <p>
 * The entire file contents are used: the file is read completely into
 * memory with {@code IO.readFully} and then handled like a byte array.
 *
 * @param file
 *            the text file.
 * @throws java.io.IOException
 *             if Exceptions occur while reading the file
 */
public RawText(File file) throws IOException {
    // Delegate to the byte[] constructor, which also builds the line map.
    this(IO.readFully(file));
}
97
98 /**
99 * @return the raw, unprocessed content read.
100 * @since 4.11
101 */
102 public byte[] getRawContent() {
103 return content;
104 }
105
106 /** @return total number of items in the sequence. */
107 /** {@inheritDoc} */
108 @Override
109 public int size() {
110 // The line map is always 2 entries larger than the number of lines in
111 // the file. Index 0 is padded out/unused. The last index is the total
112 // length of the buffer, and acts as a sentinel.
113 //
114 return lines.size() - 2;
115 }
116
117 /**
118 * Write a specific line to the output stream, without its trailing LF.
119 * <p>
120 * The specified line is copied as-is, with no character encoding
121 * translation performed.
122 * <p>
123 * If the specified line ends with an LF ('\n'), the LF is <b>not</b>
124 * copied. It is up to the caller to write the LF, if desired, between
125 * output lines.
126 *
127 * @param out
128 * stream to copy the line data onto.
129 * @param i
130 * index of the line to extract. Note this is 0-based, so line
131 * number 1 is actually index 0.
132 * @throws java.io.IOException
133 * the stream write operation failed.
134 */
135 public void writeLine(OutputStream out, int i)
136 throws IOException {
137 int start = getStart(i);
138 int end = getEnd(i);
139 if (content[end - 1] == '\n')
140 end--;
141 out.write(content, start, end - start);
142 }
143
144 /**
145 * Determine if the file ends with a LF ('\n').
146 *
147 * @return true if the last line has an LF; false otherwise.
148 */
149 public boolean isMissingNewlineAtEnd() {
150 final int end = lines.get(lines.size() - 1);
151 if (end == 0)
152 return true;
153 return content[end - 1] != '\n';
154 }
155
156 /**
157 * Get the text for a single line.
158 *
159 * @param i
160 * index of the line to extract. Note this is 0-based, so line
161 * number 1 is actually index 0.
162 * @return the text for the line, without a trailing LF.
163 */
164 public String getString(int i) {
165 return getString(i, i + 1, true);
166 }
167
168 /**
169 * Get the raw text for a single line.
170 *
171 * @param i
172 * index of the line to extract. Note this is 0-based, so line
173 * number 1 is actually index 0.
174 * @return the text for the line, without a trailing LF, as a
175 * {@link ByteBuffer} that is backed by a slice of the
176 * {@link #getRawContent() raw content}, with the buffer's position
177 * on the start of the line and the limit at the end.
178 * @since 5.12
179 */
180 public ByteBuffer getRawString(int i) {
181 int s = getStart(i);
182 int e = getEnd(i);
183 if (e > 0 && content[e - 1] == '\n') {
184 e--;
185 }
186 return ByteBuffer.wrap(content, s, e - s);
187 }
188
189 /**
190 * Get the text for a region of lines.
191 *
192 * @param begin
193 * index of the first line to extract. Note this is 0-based, so
194 * line number 1 is actually index 0.
195 * @param end
196 * index of one past the last line to extract.
197 * @param dropLF
198 * if true the trailing LF ('\n') of the last returned line is
199 * dropped, if present.
200 * @return the text for lines {@code [begin, end)}.
201 */
202 public String getString(int begin, int end, boolean dropLF) {
203 if (begin == end)
204 return ""; //$NON-NLS-1$
205
206 int s = getStart(begin);
207 int e = getEnd(end - 1);
208 if (dropLF && content[e - 1] == '\n')
209 e--;
210 return decode(s, e);
211 }
212
213 /**
214 * Decode a region of the text into a String.
215 *
216 * The default implementation of this method tries to guess the character
217 * set by considering UTF-8, the platform default, and falling back on
218 * ISO-8859-1 if neither of those can correctly decode the region given.
219 *
220 * @param start
221 * first byte of the content to decode.
222 * @param end
223 * one past the last byte of the content to decode.
224 * @return the region {@code [start, end)} decoded as a String.
225 */
226 protected String decode(int start, int end) {
227 return RawParseUtils.decode(content, start, end);
228 }
229
230 private int getStart(int i) {
231 return lines.get(i + 1);
232 }
233
234 private int getEnd(int i) {
235 return lines.get(i + 2);
236 }
237
238 /**
239 * Determine heuristically whether a byte array represents binary (as
240 * opposed to text) content.
241 *
242 * @param raw
243 * the raw file content.
244 * @return true if raw is likely to be a binary file, false otherwise
245 */
246 public static boolean isBinary(byte[] raw) {
247 return isBinary(raw, raw.length);
248 }
249
250 /**
251 * Determine heuristically whether the bytes contained in a stream
252 * represents binary (as opposed to text) content.
253 *
254 * Note: Do not further use this stream after having called this method! The
255 * stream may not be fully read and will be left at an unknown position
256 * after consuming an unknown number of bytes. The caller is responsible for
257 * closing the stream.
258 *
259 * @param raw
260 * input stream containing the raw file content.
261 * @return true if raw is likely to be a binary file, false otherwise
262 * @throws java.io.IOException
263 * if input stream could not be read
264 */
265 public static boolean isBinary(InputStream raw) throws IOException {
266 final byte[] buffer = new byte[FIRST_FEW_BYTES];
267 int cnt = 0;
268 while (cnt < buffer.length) {
269 final int n = raw.read(buffer, cnt, buffer.length - cnt);
270 if (n == -1)
271 break;
272 cnt += n;
273 }
274 return isBinary(buffer, cnt);
275 }
276
277 /**
278 * Determine heuristically whether a byte array represents binary (as
279 * opposed to text) content.
280 *
281 * @param raw
282 * the raw file content.
283 * @param length
284 * number of bytes in {@code raw} to evaluate. This should be
285 * {@code raw.length} unless {@code raw} was over-allocated by
286 * the caller.
287 * @return true if raw is likely to be a binary file, false otherwise
288 */
289 public static boolean isBinary(byte[] raw, int length) {
290 // Same heuristic as C Git
291 if (length > FIRST_FEW_BYTES)
292 length = FIRST_FEW_BYTES;
293 for (int ptr = 0; ptr < length; ptr++)
294 if (raw[ptr] == '\0')
295 return true;
296
297 return false;
298 }
299
300 /**
301 * Determine heuristically whether a byte array represents text content
302 * using CR-LF as line separator.
303 *
304 * @param raw
305 * the raw file content.
306 * @return {@code true} if raw is likely to be CR-LF delimited text,
307 * {@code false} otherwise
308 * @since 5.3
309 */
310 public static boolean isCrLfText(byte[] raw) {
311 return isCrLfText(raw, raw.length);
312 }
313
314 /**
315 * Determine heuristically whether the bytes contained in a stream represent
316 * text content using CR-LF as line separator.
317 *
318 * Note: Do not further use this stream after having called this method! The
319 * stream may not be fully read and will be left at an unknown position
320 * after consuming an unknown number of bytes. The caller is responsible for
321 * closing the stream.
322 *
323 * @param raw
324 * input stream containing the raw file content.
325 * @return {@code true} if raw is likely to be CR-LF delimited text,
326 * {@code false} otherwise
327 * @throws java.io.IOException
328 * if input stream could not be read
329 * @since 5.3
330 */
331 public static boolean isCrLfText(InputStream raw) throws IOException {
332 byte[] buffer = new byte[FIRST_FEW_BYTES];
333 int cnt = 0;
334 while (cnt < buffer.length) {
335 int n = raw.read(buffer, cnt, buffer.length - cnt);
336 if (n == -1) {
337 break;
338 }
339 cnt += n;
340 }
341 return isCrLfText(buffer, cnt);
342 }
343
344 /**
345 * Determine heuristically whether a byte array represents text content
346 * using CR-LF as line separator.
347 *
348 * @param raw
349 * the raw file content.
350 * @param length
351 * number of bytes in {@code raw} to evaluate.
352 * @return {@code true} if raw is likely to be CR-LF delimited text,
353 * {@code false} otherwise
354 * @since 5.3
355 */
356 public static boolean isCrLfText(byte[] raw, int length) {
357 boolean has_crlf = false;
358 for (int ptr = 0; ptr < length - 1; ptr++) {
359 if (raw[ptr] == '\0') {
360 return false; // binary
361 } else if (raw[ptr] == '\r' && raw[ptr + 1] == '\n') {
362 has_crlf = true;
363 }
364 }
365 return has_crlf;
366 }
367
368 /**
369 * Get the line delimiter for the first line.
370 *
371 * @since 2.0
372 * @return the line delimiter or <code>null</code>
373 */
374 public String getLineDelimiter() {
375 if (size() == 0) {
376 return null;
377 }
378 int e = getEnd(0);
379 if (content[e - 1] != '\n') {
380 return null;
381 }
382 if (content.length > 1 && e > 1 && content[e - 2] == '\r') {
383 return "\r\n"; //$NON-NLS-1$
384 }
385 return "\n"; //$NON-NLS-1$
386 }
387
388 /**
389 * Read a blob object into RawText, or throw BinaryBlobException if the blob
390 * is binary.
391 *
392 * @param ldr
393 * the ObjectLoader for the blob
394 * @param threshold
395 * if the blob is larger than this size, it is always assumed to
396 * be binary.
397 * @since 4.10
398 * @return the RawText representing the blob.
399 * @throws org.eclipse.jgit.errors.BinaryBlobException
400 * if the blob contains binary data.
401 * @throws java.io.IOException
402 * if the input could not be read.
403 */
404 public static RawText load(ObjectLoader ldr, int threshold)
405 throws IOException, BinaryBlobException {
406 long sz = ldr.getSize();
407
408 if (sz > threshold) {
409 throw new BinaryBlobException();
410 }
411
412 if (sz <= FIRST_FEW_BYTES) {
413 byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES);
414 if (isBinary(data)) {
415 throw new BinaryBlobException();
416 }
417 return new RawText(data);
418 }
419
420 byte[] head = new byte[FIRST_FEW_BYTES];
421 try (InputStream stream = ldr.openStream()) {
422 int off = 0;
423 int left = head.length;
424 while (left > 0) {
425 int n = stream.read(head, off, left);
426 if (n < 0) {
427 throw new EOFException();
428 }
429 left -= n;
430
431 while (n > 0) {
432 if (head[off] == '\0') {
433 throw new BinaryBlobException();
434 }
435 off++;
436 n--;
437 }
438 }
439
440 byte[] data;
441 try {
442 data = new byte[(int)sz];
443 } catch (OutOfMemoryError e) {
444 throw new LargeObjectException.OutOfMemory(e);
445 }
446
447 System.arraycopy(head, 0, data, 0, head.length);
448 IO.readFully(stream, data, off, (int) (sz-off));
449 return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
450 }
451 }
452 }