RawText.java

  1. /*
  2.  * Copyright (C) 2009, Google Inc.
  3.  * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
  4.  *
  5.  * This program and the accompanying materials are made available under the
  6.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  7.  * https://www.eclipse.org/org/documents/edl-v10.php.
  8.  *
  9.  * SPDX-License-Identifier: BSD-3-Clause
  10.  */

  11. package org.eclipse.jgit.diff;

  12. import java.io.EOFException;
  13. import java.io.File;
  14. import java.io.IOException;
  15. import java.io.InputStream;
  16. import java.io.OutputStream;

  17. import org.eclipse.jgit.errors.BinaryBlobException;
  18. import org.eclipse.jgit.errors.LargeObjectException;
  19. import org.eclipse.jgit.lib.ObjectLoader;
  20. import org.eclipse.jgit.util.IO;
  21. import org.eclipse.jgit.util.IntList;
  22. import org.eclipse.jgit.util.RawParseUtils;

  23. /**
  24.  * A Sequence supporting UNIX formatted text in byte[] format.
  25.  * <p>
  26.  * Elements of the sequence are the lines of the file, as delimited by the UNIX
  27.  * newline character ('\n'). The file content is treated as 8 bit binary text,
  28.  * with no assumptions or requirements on character encoding.
  29.  * <p>
  30.  * Note that the first line of the file is element 0, as defined by the Sequence
  31.  * interface API. Traditionally in a text editor a patch file the first line is
  32.  * line number 1. Callers may need to subtract 1 prior to invoking methods if
  33.  * they are converting from "line number" to "element index".
  34.  */
  35. public class RawText extends Sequence {
  36.     /** A RawText of length 0 */
  37.     public static final RawText EMPTY_TEXT = new RawText(new byte[0]);

  38.     /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
  39.     static final int FIRST_FEW_BYTES = 8000;

  40.     /** The file content for this sequence. */
  41.     protected final byte[] content;

  42.     /** Map of line number to starting position within {@link #content}. */
  43.     protected final IntList lines;

  44.     /**
  45.      * Create a new sequence from an existing content byte array.
  46.      * <p>
  47.      * The entire array (indexes 0 through length-1) is used as the content.
  48.      *
  49.      * @param input
  50.      *            the content array. The object retains a reference to this
  51.      *            array, so it should be immutable.
  52.      */
  53.     public RawText(byte[] input) {
  54.         this(input, RawParseUtils.lineMap(input, 0, input.length));
  55.     }

  56.     /**
  57.      * Create a new sequence from the existing content byte array and the line
  58.      * map indicating line boundaries.
  59.      *
  60.      * @param input
  61.      *            the content array. The object retains a reference to this
  62.      *            array, so it should be immutable.
  63.      * @param lineMap
  64.      *            an array with 1-based offsets for the start of each line.
  65.      *            The first and last entries should be {@link Integer#MIN_VALUE}
  66.      *            and an offset one past the end of the last line, respectively.
  67.      * @since 5.0
  68.      */
  69.     public RawText(byte[] input, IntList lineMap) {
  70.         content = input;
  71.         lines = lineMap;
  72.     }

  73.     /**
  74.      * Create a new sequence from a file.
  75.      * <p>
  76.      * The entire file contents are used.
  77.      *
  78.      * @param file
  79.      *            the text file.
  80.      * @throws java.io.IOException
  81.      *             if Exceptions occur while reading the file
  82.      */
  83.     public RawText(File file) throws IOException {
  84.         this(IO.readFully(file));
  85.     }

  86.     /**
  87.      * @return the raw, unprocessed content read.
  88.      * @since 4.11
  89.      */
  90.     public byte[] getRawContent() {
  91.         return content;
  92.     }

  93.     /** @return total number of items in the sequence. */
  94.     /** {@inheritDoc} */
  95.     @Override
  96.     public int size() {
  97.         // The line map is always 2 entries larger than the number of lines in
  98.         // the file. Index 0 is padded out/unused. The last index is the total
  99.         // length of the buffer, and acts as a sentinel.
  100.         //
  101.         return lines.size() - 2;
  102.     }

  103.     /**
  104.      * Write a specific line to the output stream, without its trailing LF.
  105.      * <p>
  106.      * The specified line is copied as-is, with no character encoding
  107.      * translation performed.
  108.      * <p>
  109.      * If the specified line ends with an LF ('\n'), the LF is <b>not</b>
  110.      * copied. It is up to the caller to write the LF, if desired, between
  111.      * output lines.
  112.      *
  113.      * @param out
  114.      *            stream to copy the line data onto.
  115.      * @param i
  116.      *            index of the line to extract. Note this is 0-based, so line
  117.      *            number 1 is actually index 0.
  118.      * @throws java.io.IOException
  119.      *             the stream write operation failed.
  120.      */
  121.     public void writeLine(OutputStream out, int i)
  122.             throws IOException {
  123.         int start = getStart(i);
  124.         int end = getEnd(i);
  125.         if (content[end - 1] == '\n')
  126.             end--;
  127.         out.write(content, start, end - start);
  128.     }

  129.     /**
  130.      * Determine if the file ends with a LF ('\n').
  131.      *
  132.      * @return true if the last line has an LF; false otherwise.
  133.      */
  134.     public boolean isMissingNewlineAtEnd() {
  135.         final int end = lines.get(lines.size() - 1);
  136.         if (end == 0)
  137.             return true;
  138.         return content[end - 1] != '\n';
  139.     }

  140.     /**
  141.      * Get the text for a single line.
  142.      *
  143.      * @param i
  144.      *            index of the line to extract. Note this is 0-based, so line
  145.      *            number 1 is actually index 0.
  146.      * @return the text for the line, without a trailing LF.
  147.      */
  148.     public String getString(int i) {
  149.         return getString(i, i + 1, true);
  150.     }

  151.     /**
  152.      * Get the text for a region of lines.
  153.      *
  154.      * @param begin
  155.      *            index of the first line to extract. Note this is 0-based, so
  156.      *            line number 1 is actually index 0.
  157.      * @param end
  158.      *            index of one past the last line to extract.
  159.      * @param dropLF
  160.      *            if true the trailing LF ('\n') of the last returned line is
  161.      *            dropped, if present.
  162.      * @return the text for lines {@code [begin, end)}.
  163.      */
  164.     public String getString(int begin, int end, boolean dropLF) {
  165.         if (begin == end)
  166.             return ""; //$NON-NLS-1$

  167.         int s = getStart(begin);
  168.         int e = getEnd(end - 1);
  169.         if (dropLF && content[e - 1] == '\n')
  170.             e--;
  171.         return decode(s, e);
  172.     }

  173.     /**
  174.      * Decode a region of the text into a String.
  175.      *
  176.      * The default implementation of this method tries to guess the character
  177.      * set by considering UTF-8, the platform default, and falling back on
  178.      * ISO-8859-1 if neither of those can correctly decode the region given.
  179.      *
  180.      * @param start
  181.      *            first byte of the content to decode.
  182.      * @param end
  183.      *            one past the last byte of the content to decode.
  184.      * @return the region {@code [start, end)} decoded as a String.
  185.      */
  186.     protected String decode(int start, int end) {
  187.         return RawParseUtils.decode(content, start, end);
  188.     }

  189.     private int getStart(int i) {
  190.         return lines.get(i + 1);
  191.     }

  192.     private int getEnd(int i) {
  193.         return lines.get(i + 2);
  194.     }

  195.     /**
  196.      * Determine heuristically whether a byte array represents binary (as
  197.      * opposed to text) content.
  198.      *
  199.      * @param raw
  200.      *            the raw file content.
  201.      * @return true if raw is likely to be a binary file, false otherwise
  202.      */
  203.     public static boolean isBinary(byte[] raw) {
  204.         return isBinary(raw, raw.length);
  205.     }

  206.     /**
  207.      * Determine heuristically whether the bytes contained in a stream
  208.      * represents binary (as opposed to text) content.
  209.      *
  210.      * Note: Do not further use this stream after having called this method! The
  211.      * stream may not be fully read and will be left at an unknown position
  212.      * after consuming an unknown number of bytes. The caller is responsible for
  213.      * closing the stream.
  214.      *
  215.      * @param raw
  216.      *            input stream containing the raw file content.
  217.      * @return true if raw is likely to be a binary file, false otherwise
  218.      * @throws java.io.IOException
  219.      *             if input stream could not be read
  220.      */
  221.     public static boolean isBinary(InputStream raw) throws IOException {
  222.         final byte[] buffer = new byte[FIRST_FEW_BYTES];
  223.         int cnt = 0;
  224.         while (cnt < buffer.length) {
  225.             final int n = raw.read(buffer, cnt, buffer.length - cnt);
  226.             if (n == -1)
  227.                 break;
  228.             cnt += n;
  229.         }
  230.         return isBinary(buffer, cnt);
  231.     }

  232.     /**
  233.      * Determine heuristically whether a byte array represents binary (as
  234.      * opposed to text) content.
  235.      *
  236.      * @param raw
  237.      *            the raw file content.
  238.      * @param length
  239.      *            number of bytes in {@code raw} to evaluate. This should be
  240.      *            {@code raw.length} unless {@code raw} was over-allocated by
  241.      *            the caller.
  242.      * @return true if raw is likely to be a binary file, false otherwise
  243.      */
  244.     public static boolean isBinary(byte[] raw, int length) {
  245.         // Same heuristic as C Git
  246.         if (length > FIRST_FEW_BYTES)
  247.             length = FIRST_FEW_BYTES;
  248.         for (int ptr = 0; ptr < length; ptr++)
  249.             if (raw[ptr] == '\0')
  250.                 return true;

  251.         return false;
  252.     }

  253.     /**
  254.      * Determine heuristically whether a byte array represents text content
  255.      * using CR-LF as line separator.
  256.      *
  257.      * @param raw
  258.      *            the raw file content.
  259.      * @return {@code true} if raw is likely to be CR-LF delimited text,
  260.      *         {@code false} otherwise
  261.      * @since 5.3
  262.      */
  263.     public static boolean isCrLfText(byte[] raw) {
  264.         return isCrLfText(raw, raw.length);
  265.     }

  266.     /**
  267.      * Determine heuristically whether the bytes contained in a stream represent
  268.      * text content using CR-LF as line separator.
  269.      *
  270.      * Note: Do not further use this stream after having called this method! The
  271.      * stream may not be fully read and will be left at an unknown position
  272.      * after consuming an unknown number of bytes. The caller is responsible for
  273.      * closing the stream.
  274.      *
  275.      * @param raw
  276.      *            input stream containing the raw file content.
  277.      * @return {@code true} if raw is likely to be CR-LF delimited text,
  278.      *         {@code false} otherwise
  279.      * @throws java.io.IOException
  280.      *             if input stream could not be read
  281.      * @since 5.3
  282.      */
  283.     public static boolean isCrLfText(InputStream raw) throws IOException {
  284.         byte[] buffer = new byte[FIRST_FEW_BYTES];
  285.         int cnt = 0;
  286.         while (cnt < buffer.length) {
  287.             int n = raw.read(buffer, cnt, buffer.length - cnt);
  288.             if (n == -1) {
  289.                 break;
  290.             }
  291.             cnt += n;
  292.         }
  293.         return isCrLfText(buffer, cnt);
  294.     }

  295.     /**
  296.      * Determine heuristically whether a byte array represents text content
  297.      * using CR-LF as line separator.
  298.      *
  299.      * @param raw
  300.      *            the raw file content.
  301.      * @param length
  302.      *            number of bytes in {@code raw} to evaluate.
  303.      * @return {@code true} if raw is likely to be CR-LF delimited text,
  304.      *         {@code false} otherwise
  305.      * @since 5.3
  306.      */
  307.     public static boolean isCrLfText(byte[] raw, int length) {
  308.         boolean has_crlf = false;
  309.         for (int ptr = 0; ptr < length - 1; ptr++) {
  310.             if (raw[ptr] == '\0') {
  311.                 return false; // binary
  312.             } else if (raw[ptr] == '\r' && raw[ptr + 1] == '\n') {
  313.                 has_crlf = true;
  314.             }
  315.         }
  316.         return has_crlf;
  317.     }

  318.     /**
  319.      * Get the line delimiter for the first line.
  320.      *
  321.      * @since 2.0
  322.      * @return the line delimiter or <code>null</code>
  323.      */
  324.     public String getLineDelimiter() {
  325.         if (size() == 0) {
  326.             return null;
  327.         }
  328.         int e = getEnd(0);
  329.         if (content[e - 1] != '\n') {
  330.             return null;
  331.         }
  332.         if (content.length > 1 && e > 1 && content[e - 2] == '\r') {
  333.             return "\r\n"; //$NON-NLS-1$
  334.         }
  335.         return "\n"; //$NON-NLS-1$
  336.     }

  337.     /**
  338.      * Read a blob object into RawText, or throw BinaryBlobException if the blob
  339.      * is binary.
  340.      *
  341.      * @param ldr
  342.      *            the ObjectLoader for the blob
  343.      * @param threshold
  344.      *            if the blob is larger than this size, it is always assumed to
  345.      *            be binary.
  346.      * @since 4.10
  347.      * @return the RawText representing the blob.
  348.      * @throws org.eclipse.jgit.errors.BinaryBlobException
  349.      *             if the blob contains binary data.
  350.      * @throws java.io.IOException
  351.      *             if the input could not be read.
  352.      */
  353.     public static RawText load(ObjectLoader ldr, int threshold)
  354.             throws IOException, BinaryBlobException {
  355.         long sz = ldr.getSize();

  356.         if (sz > threshold) {
  357.             throw new BinaryBlobException();
  358.         }

  359.         if (sz <= FIRST_FEW_BYTES) {
  360.             byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES);
  361.             if (isBinary(data)) {
  362.                 throw new BinaryBlobException();
  363.             }
  364.             return new RawText(data);
  365.         }

  366.         byte[] head = new byte[FIRST_FEW_BYTES];
  367.         try (InputStream stream = ldr.openStream()) {
  368.             int off = 0;
  369.             int left = head.length;
  370.             while (left > 0) {
  371.                 int n = stream.read(head, off, left);
  372.                 if (n < 0) {
  373.                     throw new EOFException();
  374.                 }
  375.                 left -= n;

  376.                 while (n > 0) {
  377.                     if (head[off] == '\0') {
  378.                         throw new BinaryBlobException();
  379.                     }
  380.                     off++;
  381.                     n--;
  382.                 }
  383.             }

  384.             byte[] data;
  385.             try {
  386.                 data = new byte[(int)sz];
  387.             } catch (OutOfMemoryError e) {
  388.                 throw new LargeObjectException.OutOfMemory(e);
  389.             }

  390.             System.arraycopy(head, 0, data, 0, head.length);
  391.             IO.readFully(stream, data, off, (int) (sz-off));
  392.             return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
  393.         }
  394.     }
  395. }