1 /*
2 * Copyright (C) 2009, Google Inc.
3 * Copyright (C) 2008-2021, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
4 *
5 * This program and the accompanying materials are made available under the
6 * terms of the Eclipse Distribution License v. 1.0 which is available at
7 * https://www.eclipse.org/org/documents/edl-v10.php.
8 *
9 * SPDX-License-Identifier: BSD-3-Clause
10 */
11
12 package org.eclipse.jgit.diff;
13
14 import java.io.EOFException;
15 import java.io.File;
16 import java.io.IOException;
17 import java.io.InputStream;
18 import java.io.OutputStream;
19 import java.nio.ByteBuffer;
20 import java.util.concurrent.atomic.AtomicInteger;
21
22 import org.eclipse.jgit.errors.BinaryBlobException;
23 import org.eclipse.jgit.errors.LargeObjectException;
24 import org.eclipse.jgit.lib.ObjectLoader;
25 import org.eclipse.jgit.util.IO;
26 import org.eclipse.jgit.util.IntList;
27 import org.eclipse.jgit.util.RawParseUtils;
28
29 /**
30 * A Sequence supporting UNIX formatted text in byte[] format.
31 * <p>
32 * Elements of the sequence are the lines of the file, as delimited by the UNIX
33 * newline character ('\n'). The file content is treated as 8 bit binary text,
34 * with no assumptions or requirements on character encoding.
35 * <p>
36 * Note that the first line of the file is element 0, as defined by the Sequence
37 * interface API. Traditionally in a text editor a patch file the first line is
38 * line number 1. Callers may need to subtract 1 prior to invoking methods if
39 * they are converting from "line number" to "element index".
40 */
41 public class RawText extends Sequence {
42
43 /** A RawText of length 0 */
44 public static final RawText EMPTY_TEXT = new RawText(new byte[0]);
45
46 /**
47 * Default and minimum for {@link #BUFFER_SIZE}.
48 */
49 private static final int FIRST_FEW_BYTES = 8 * 1024;
50
51 /**
52 * Number of bytes to check for heuristics in {@link #isBinary(byte[])}.
53 */
54 private static final AtomicInteger BUFFER_SIZE = new AtomicInteger(
55 FIRST_FEW_BYTES);
56
57 /** The file content for this sequence. */
58 protected final byte[] content;
59
60 /** Map of line number to starting position within {@link #content}. */
61 protected final IntList lines;
62
63 /**
64 * Create a new sequence from an existing content byte array.
65 * <p>
66 * The entire array (indexes 0 through length-1) is used as the content.
67 *
68 * @param input
69 * the content array. The object retains a reference to this
70 * array, so it should be immutable.
71 */
72 public RawText(byte[] input) {
73 this(input, RawParseUtils.lineMap(input, 0, input.length));
74 }
75
76 /**
77 * Create a new sequence from the existing content byte array and the line
78 * map indicating line boundaries.
79 *
80 * @param input
81 * the content array. The object retains a reference to this
82 * array, so it should be immutable.
83 * @param lineMap
84 * an array with 1-based offsets for the start of each line.
85 * The first and last entries should be {@link Integer#MIN_VALUE}
86 * and an offset one past the end of the last line, respectively.
87 * @since 5.0
88 */
89 public RawText(byte[] input, IntList lineMap) {
90 content = input;
91 lines = lineMap;
92 }
93
94 /**
95 * Create a new sequence from a file.
96 * <p>
97 * The entire file contents are used.
98 *
99 * @param file
100 * the text file.
101 * @throws java.io.IOException
102 * if Exceptions occur while reading the file
103 */
104 public RawText(File file) throws IOException {
105 this(IO.readFully(file));
106 }
107
108 /**
109 * @return the raw, unprocessed content read.
110 * @since 4.11
111 */
112 public byte[] getRawContent() {
113 return content;
114 }
115
116 /** @return total number of items in the sequence. */
117 /** {@inheritDoc} */
118 @Override
119 public int size() {
120 // The line map is always 2 entries larger than the number of lines in
121 // the file. Index 0 is padded out/unused. The last index is the total
122 // length of the buffer, and acts as a sentinel.
123 //
124 return lines.size() - 2;
125 }
126
127 /**
128 * Write a specific line to the output stream, without its trailing LF.
129 * <p>
130 * The specified line is copied as-is, with no character encoding
131 * translation performed.
132 * <p>
133 * If the specified line ends with an LF ('\n'), the LF is <b>not</b>
134 * copied. It is up to the caller to write the LF, if desired, between
135 * output lines.
136 *
137 * @param out
138 * stream to copy the line data onto.
139 * @param i
140 * index of the line to extract. Note this is 0-based, so line
141 * number 1 is actually index 0.
142 * @throws java.io.IOException
143 * the stream write operation failed.
144 */
145 public void writeLine(OutputStream out, int i)
146 throws IOException {
147 int start = getStart(i);
148 int end = getEnd(i);
149 if (content[end - 1] == '\n')
150 end--;
151 out.write(content, start, end - start);
152 }
153
154 /**
155 * Determine if the file ends with a LF ('\n').
156 *
157 * @return true if the last line has an LF; false otherwise.
158 */
159 public boolean isMissingNewlineAtEnd() {
160 final int end = lines.get(lines.size() - 1);
161 if (end == 0)
162 return true;
163 return content[end - 1] != '\n';
164 }
165
166 /**
167 * Get the text for a single line.
168 *
169 * @param i
170 * index of the line to extract. Note this is 0-based, so line
171 * number 1 is actually index 0.
172 * @return the text for the line, without a trailing LF.
173 */
174 public String getString(int i) {
175 return getString(i, i + 1, true);
176 }
177
178 /**
179 * Get the raw text for a single line.
180 *
181 * @param i
182 * index of the line to extract. Note this is 0-based, so line
183 * number 1 is actually index 0.
184 * @return the text for the line, without a trailing LF, as a
185 * {@link ByteBuffer} that is backed by a slice of the
186 * {@link #getRawContent() raw content}, with the buffer's position
187 * on the start of the line and the limit at the end.
188 * @since 5.12
189 */
190 public ByteBuffer getRawString(int i) {
191 int s = getStart(i);
192 int e = getEnd(i);
193 if (e > 0 && content[e - 1] == '\n') {
194 e--;
195 }
196 return ByteBuffer.wrap(content, s, e - s);
197 }
198
199 /**
200 * Get the text for a region of lines.
201 *
202 * @param begin
203 * index of the first line to extract. Note this is 0-based, so
204 * line number 1 is actually index 0.
205 * @param end
206 * index of one past the last line to extract.
207 * @param dropLF
208 * if true the trailing LF ('\n') of the last returned line is
209 * dropped, if present.
210 * @return the text for lines {@code [begin, end)}.
211 */
212 public String getString(int begin, int end, boolean dropLF) {
213 if (begin == end)
214 return ""; //$NON-NLS-1$
215
216 int s = getStart(begin);
217 int e = getEnd(end - 1);
218 if (dropLF && content[e - 1] == '\n')
219 e--;
220 return decode(s, e);
221 }
222
223 /**
224 * Decode a region of the text into a String.
225 *
226 * The default implementation of this method tries to guess the character
227 * set by considering UTF-8, the platform default, and falling back on
228 * ISO-8859-1 if neither of those can correctly decode the region given.
229 *
230 * @param start
231 * first byte of the content to decode.
232 * @param end
233 * one past the last byte of the content to decode.
234 * @return the region {@code [start, end)} decoded as a String.
235 */
236 protected String decode(int start, int end) {
237 return RawParseUtils.decode(content, start, end);
238 }
239
240 private int getStart(int i) {
241 return lines.get(i + 1);
242 }
243
244 private int getEnd(int i) {
245 return lines.get(i + 2);
246 }
247
248 /**
249 * Obtains the buffer size to use for analyzing whether certain content is
250 * text or binary, or what line endings are used if it's text.
251 *
252 * @return the buffer size, by default {@link #FIRST_FEW_BYTES} bytes
253 * @since 6.0
254 */
255 public static int getBufferSize() {
256 return BUFFER_SIZE.get();
257 }
258
259 /**
260 * Sets the buffer size to use for analyzing whether certain content is text
261 * or binary, or what line endings are used if it's text. If the given
262 * {@code bufferSize} is smaller than {@link #FIRST_FEW_BYTES} set the
263 * buffer size to {@link #FIRST_FEW_BYTES}.
264 *
265 * @param bufferSize
266 * Size to set
267 * @return the size actually set
268 * @since 6.0
269 */
270 public static int setBufferSize(int bufferSize) {
271 int newSize = Math.max(FIRST_FEW_BYTES, bufferSize);
272 return BUFFER_SIZE.updateAndGet(curr -> newSize);
273 }
274
275 /**
276 * Determine heuristically whether the bytes contained in a stream
277 * represents binary (as opposed to text) content.
278 *
279 * Note: Do not further use this stream after having called this method! The
280 * stream may not be fully read and will be left at an unknown position
281 * after consuming an unknown number of bytes. The caller is responsible for
282 * closing the stream.
283 *
284 * @param raw
285 * input stream containing the raw file content.
286 * @return true if raw is likely to be a binary file, false otherwise
287 * @throws java.io.IOException
288 * if input stream could not be read
289 */
290 public static boolean isBinary(InputStream raw) throws IOException {
291 final byte[] buffer = new byte[getBufferSize()];
292 int cnt = 0;
293 while (cnt < buffer.length) {
294 final int n = raw.read(buffer, cnt, buffer.length - cnt);
295 if (n == -1)
296 break;
297 cnt += n;
298 }
299 return isBinary(buffer, cnt, cnt < buffer.length);
300 }
301
302 /**
303 * Determine heuristically whether a byte array represents binary (as
304 * opposed to text) content.
305 *
306 * @param raw
307 * the raw file content.
308 * @return true if raw is likely to be a binary file, false otherwise
309 */
310 public static boolean isBinary(byte[] raw) {
311 return isBinary(raw, raw.length);
312 }
313
314 /**
315 * Determine heuristically whether a byte array represents binary (as
316 * opposed to text) content.
317 *
318 * @param raw
319 * the raw file content.
320 * @param length
321 * number of bytes in {@code raw} to evaluate. This should be
322 * {@code raw.length} unless {@code raw} was over-allocated by
323 * the caller.
324 * @return true if raw is likely to be a binary file, false otherwise
325 */
326 public static boolean isBinary(byte[] raw, int length) {
327 return isBinary(raw, length, false);
328 }
329
330 /**
331 * Determine heuristically whether a byte array represents binary (as
332 * opposed to text) content.
333 *
334 * @param raw
335 * the raw file content.
336 * @param length
337 * number of bytes in {@code raw} to evaluate. This should be
338 * {@code raw.length} unless {@code raw} was over-allocated by
339 * the caller.
340 * @param complete
341 * whether {@code raw} contains the whole data
342 * @return true if raw is likely to be a binary file, false otherwise
343 * @since 6.0
344 */
345 public static boolean isBinary(byte[] raw, int length, boolean complete) {
346 // Similar heuristic as C Git. Differences:
347 // - limited buffer size; may be only the beginning of a large blob
348 // - no counting of printable vs. non-printable bytes < 0x20 and 0x7F
349 int maxLength = getBufferSize();
350 if (length > maxLength) {
351 length = maxLength;
352 }
353 byte last = 'x'; // Just something inconspicuous.
354 for (int ptr = 0; ptr < length; ptr++) {
355 byte curr = raw[ptr];
356 if (isBinary(curr, last)) {
357 return true;
358 }
359 last = curr;
360 }
361 if (complete) {
362 // Buffer contains everything...
363 return last == '\r'; // ... so this must be a lone CR
364 }
365 return false;
366 }
367
368 /**
369 * Determines from the last two bytes read from a source if it looks like
370 * binary content.
371 *
372 * @param curr
373 * the last byte, read after {@code prev}
374 * @param prev
375 * the previous byte, read before {@code last}
376 * @return {@code true} if either byte is NUL, or if prev is CR and curr is
377 * not LF, {@code false} otherwise
378 * @since 6.0
379 */
380 public static boolean isBinary(byte curr, byte prev) {
381 return curr == '\0' || (curr != '\n' && prev == '\r') || prev == '\0';
382 }
383
384 /**
385 * Determine heuristically whether a byte array represents text content
386 * using CR-LF as line separator.
387 *
388 * @param raw
389 * the raw file content.
390 * @return {@code true} if raw is likely to be CR-LF delimited text,
391 * {@code false} otherwise
392 * @since 5.3
393 */
394 public static boolean isCrLfText(byte[] raw) {
395 return isCrLfText(raw, raw.length);
396 }
397
398 /**
399 * Determine heuristically whether the bytes contained in a stream represent
400 * text content using CR-LF as line separator.
401 *
402 * Note: Do not further use this stream after having called this method! The
403 * stream may not be fully read and will be left at an unknown position
404 * after consuming an unknown number of bytes. The caller is responsible for
405 * closing the stream.
406 *
407 * @param raw
408 * input stream containing the raw file content.
409 * @return {@code true} if raw is likely to be CR-LF delimited text,
410 * {@code false} otherwise
411 * @throws java.io.IOException
412 * if input stream could not be read
413 * @since 5.3
414 */
415 public static boolean isCrLfText(InputStream raw) throws IOException {
416 byte[] buffer = new byte[getBufferSize()];
417 int cnt = 0;
418 while (cnt < buffer.length) {
419 int n = raw.read(buffer, cnt, buffer.length - cnt);
420 if (n == -1) {
421 break;
422 }
423 cnt += n;
424 }
425 return isCrLfText(buffer, cnt);
426 }
427
428 /**
429 * Determine heuristically whether a byte array represents text content
430 * using CR-LF as line separator.
431 *
432 * @param raw
433 * the raw file content.
434 * @param length
435 * number of bytes in {@code raw} to evaluate.
436 * @return {@code true} if raw is likely to be CR-LF delimited text,
437 * {@code false} otherwise
438 * @since 5.3
439 */
440 public static boolean isCrLfText(byte[] raw, int length) {
441 return isCrLfText(raw, length, false);
442 }
443
444 /**
445 * Determine heuristically whether a byte array represents text content
446 * using CR-LF as line separator.
447 *
448 * @param raw
449 * the raw file content.
450 * @param length
451 * number of bytes in {@code raw} to evaluate.
452 * @return {@code true} if raw is likely to be CR-LF delimited text,
453 * {@code false} otherwise
454 * @param complete
455 * whether {@code raw} contains the whole data
456 * @since 6.0
457 */
458 public static boolean isCrLfText(byte[] raw, int length, boolean complete) {
459 boolean has_crlf = false;
460 byte last = 'x'; // Just something inconspicuous
461 for (int ptr = 0; ptr < length; ptr++) {
462 byte curr = raw[ptr];
463 if (isBinary(curr, last)) {
464 return false;
465 }
466 if (curr == '\n' && last == '\r') {
467 has_crlf = true;
468 }
469 last = curr;
470 }
471 if (last == '\r') {
472 if (complete) {
473 // Lone CR: it's binary after all.
474 return false;
475 }
476 // Tough call. If the next byte, which we don't have, would be a
477 // '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
478 // based on what we already scanned; it wasn't binary until now.
479 }
480 return has_crlf;
481 }
482
483 /**
484 * Get the line delimiter for the first line.
485 *
486 * @since 2.0
487 * @return the line delimiter or <code>null</code>
488 */
489 public String getLineDelimiter() {
490 if (size() == 0) {
491 return null;
492 }
493 int e = getEnd(0);
494 if (content[e - 1] != '\n') {
495 return null;
496 }
497 if (content.length > 1 && e > 1 && content[e - 2] == '\r') {
498 return "\r\n"; //$NON-NLS-1$
499 }
500 return "\n"; //$NON-NLS-1$
501 }
502
503 /**
504 * Read a blob object into RawText, or throw BinaryBlobException if the blob
505 * is binary.
506 *
507 * @param ldr
508 * the ObjectLoader for the blob
509 * @param threshold
510 * if the blob is larger than this size, it is always assumed to
511 * be binary.
512 * @since 4.10
513 * @return the RawText representing the blob.
514 * @throws org.eclipse.jgit.errors.BinaryBlobException
515 * if the blob contains binary data.
516 * @throws java.io.IOException
517 * if the input could not be read.
518 */
519 public static RawText load(ObjectLoader ldr, int threshold)
520 throws IOException, BinaryBlobException {
521 long sz = ldr.getSize();
522
523 if (sz > threshold) {
524 throw new BinaryBlobException();
525 }
526
527 int bufferSize = getBufferSize();
528 if (sz <= bufferSize) {
529 byte[] data = ldr.getCachedBytes(bufferSize);
530 if (isBinary(data, data.length, true)) {
531 throw new BinaryBlobException();
532 }
533 return new RawText(data);
534 }
535
536 byte[] head = new byte[bufferSize];
537 try (InputStream stream = ldr.openStream()) {
538 int off = 0;
539 int left = head.length;
540 byte last = 'x'; // Just something inconspicuous
541 while (left > 0) {
542 int n = stream.read(head, off, left);
543 if (n < 0) {
544 throw new EOFException();
545 }
546 left -= n;
547
548 while (n > 0) {
549 byte curr = head[off];
550 if (isBinary(curr, last)) {
551 throw new BinaryBlobException();
552 }
553 last = curr;
554 off++;
555 n--;
556 }
557 }
558
559 byte[] data;
560 try {
561 data = new byte[(int)sz];
562 } catch (OutOfMemoryError e) {
563 throw new LargeObjectException.OutOfMemory(e);
564 }
565
566 System.arraycopy(head, 0, data, 0, head.length);
567 IO.readFully(stream, data, off, (int) (sz-off));
568 return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
569 }
570 }
571 }