View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc. and others
3    *
4    * This program and the accompanying materials are made available under the
5    * terms of the Eclipse Distribution License v. 1.0 which is available at
6    * https://www.eclipse.org/org/documents/edl-v10.php.
7    *
8    * SPDX-License-Identifier: BSD-3-Clause
9    */
10  
11  package org.eclipse.jgit.patch;
12  
13  import static java.nio.charset.StandardCharsets.UTF_8;
14  import static org.eclipse.jgit.lib.Constants.encodeASCII;
15  import static org.eclipse.jgit.util.RawParseUtils.decode;
16  import static org.eclipse.jgit.util.RawParseUtils.decodeNoFallback;
17  import static org.eclipse.jgit.util.RawParseUtils.extractBinaryString;
18  import static org.eclipse.jgit.util.RawParseUtils.match;
19  import static org.eclipse.jgit.util.RawParseUtils.nextLF;
20  import static org.eclipse.jgit.util.RawParseUtils.parseBase10;
21  
22  import java.io.IOException;
23  import java.nio.charset.CharacterCodingException;
24  import java.nio.charset.Charset;
25  import java.text.MessageFormat;
26  import java.util.ArrayList;
27  import java.util.Collections;
28  import java.util.List;
29  
30  import org.eclipse.jgit.diff.DiffEntry;
31  import org.eclipse.jgit.diff.EditList;
32  import org.eclipse.jgit.internal.JGitText;
33  import org.eclipse.jgit.lib.AbbreviatedObjectId;
34  import org.eclipse.jgit.lib.FileMode;
35  import org.eclipse.jgit.util.QuotedString;
36  import org.eclipse.jgit.util.RawParseUtils;
37  import org.eclipse.jgit.util.TemporaryBuffer;
38  
39  /**
40   * Patch header describing an action for a single file path.
41   */
42  public class FileHeader extends DiffEntry {
43  	private static final byte[] OLD_MODE = encodeASCII("old mode "); //$NON-NLS-1$
44  
45  	private static final byte[] NEW_MODE = encodeASCII("new mode "); //$NON-NLS-1$
46  
47  	static final byte[] DELETED_FILE_MODE = encodeASCII("deleted file mode "); //$NON-NLS-1$
48  
49  	static final byte[] NEW_FILE_MODE = encodeASCII("new file mode "); //$NON-NLS-1$
50  
51  	private static final byte[] COPY_FROM = encodeASCII("copy from "); //$NON-NLS-1$
52  
53  	private static final byte[] COPY_TO = encodeASCII("copy to "); //$NON-NLS-1$
54  
55  	private static final byte[] RENAME_OLD = encodeASCII("rename old "); //$NON-NLS-1$
56  
57  	private static final byte[] RENAME_NEW = encodeASCII("rename new "); //$NON-NLS-1$
58  
59  	private static final byte[] RENAME_FROM = encodeASCII("rename from "); //$NON-NLS-1$
60  
61  	private static final byte[] RENAME_TO = encodeASCII("rename to "); //$NON-NLS-1$
62  
63  	private static final byte[] SIMILARITY_INDEX = encodeASCII("similarity index "); //$NON-NLS-1$
64  
65  	private static final byte[] DISSIMILARITY_INDEX = encodeASCII("dissimilarity index "); //$NON-NLS-1$
66  
67  	static final byte[] INDEX = encodeASCII("index "); //$NON-NLS-1$
68  
69  	static final byte[] OLD_NAME = encodeASCII("--- "); //$NON-NLS-1$
70  
71  	static final byte[] NEW_NAME = encodeASCII("+++ "); //$NON-NLS-1$
72  
73  	/** Type of patch used by this file. */
74  	public enum PatchType {
75  		/** A traditional unified diff style patch of a text file. */
76  		UNIFIED,
77  
78  		/** An empty patch with a message "Binary files ... differ" */
79  		BINARY,
80  
81  		/** A Git binary patch, holding pre and post image deltas */
82  		GIT_BINARY;
83  	}
84  
85  	/** Buffer holding the patch data for this file. */
86  	final byte[] buf;
87  
88  	/** Offset within {@link #buf} to the "diff ..." line. */
89  	final int startOffset;
90  
91  	/** Position 1 past the end of this file within {@link #buf}. */
92  	int endOffset;
93  
94  	/** Type of patch used to modify this file */
95  	PatchType patchType;
96  
97  	/** The hunks of this file */
98  	private List<HunkHeader> hunks;
99  
100 	/** If {@link #patchType} is {@link PatchType#GIT_BINARY}, the new image */
101 	BinaryHunk forwardBinaryHunk;
102 
103 	/** If {@link #patchType} is {@link PatchType#GIT_BINARY}, the old image */
104 	BinaryHunk reverseBinaryHunk;
105 
106 	/**
107 	 * Constructs a new FileHeader
108 	 *
109 	 * @param headerLines
110 	 *            buffer holding the diff header for this file
111 	 * @param edits
112 	 *            the edits for this file
113 	 * @param type
114 	 *            the type of patch used to modify this file
115 	 */
116 	public FileHeader(byte[] headerLines, EditList edits, PatchType type) {
117 		this(headerLines, 0);
118 		endOffset = headerLines.length;
119 		int ptr = parseGitFileName(Patch.DIFF_GIT.length, headerLines.length);
120 		parseGitHeaders(ptr, headerLines.length);
121 		this.patchType = type;
122 		addHunk(new HunkHeader(this, edits));
123 	}
124 
125 	FileHeader(byte[] b, int offset) {
126 		buf = b;
127 		startOffset = offset;
128 		changeType = ChangeType.MODIFY; // unless otherwise designated
129 		patchType = PatchType.UNIFIED;
130 	}
131 
132 	int getParentCount() {
133 		return 1;
134 	}
135 
136 	/**
137 	 * Get the byte array holding this file's patch script.
138 	 *
139 	 * @return the byte array holding this file's patch script.
140 	 */
141 	public byte[] getBuffer() {
142 		return buf;
143 	}
144 
145 	/**
146 	 * Get offset of the start of this file's script in {@link #getBuffer()}.
147 	 *
148 	 * @return offset of the start of this file's script in
149 	 *         {@link #getBuffer()}.
150 	 */
151 	public int getStartOffset() {
152 		return startOffset;
153 	}
154 
155 	/**
156 	 * Get offset one past the end of the file script.
157 	 *
158 	 * @return offset one past the end of the file script.
159 	 */
160 	public int getEndOffset() {
161 		return endOffset;
162 	}
163 
164 	/**
165 	 * Convert the patch script for this file into a string.
166 	 * <p>
167 	 * The default character encoding
168 	 * ({@link java.nio.charset.StandardCharsets#UTF_8}) is assumed for both the
169 	 * old and new files.
170 	 *
171 	 * @return the patch script, as a Unicode string.
172 	 */
173 	public String getScriptText() {
174 		return getScriptText(null, null);
175 	}
176 
177 	/**
178 	 * Convert the patch script for this file into a string.
179 	 *
180 	 * @param oldCharset
181 	 *            hint character set to decode the old lines with.
182 	 * @param newCharset
183 	 *            hint character set to decode the new lines with.
184 	 * @return the patch script, as a Unicode string.
185 	 */
186 	public String getScriptText(Charset oldCharset, Charset newCharset) {
187 		return getScriptText(new Charset[] { oldCharset, newCharset });
188 	}
189 
190 	String getScriptText(Charset[] charsetGuess) {
191 		if (getHunks().isEmpty()) {
192 			// If we have no hunks then we can safely assume the entire
193 			// patch is a binary style patch, or a meta-data only style
194 			// patch. Either way the encoding of the headers should be
195 			// strictly 7-bit US-ASCII and the body is either 7-bit ASCII
196 			// (due to the base 85 encoding used for a BinaryHunk) or is
197 			// arbitrary noise we have chosen to ignore and not understand
198 			// (e.g. the message "Binary files ... differ").
199 			//
200 			return extractBinaryString(buf, startOffset, endOffset);
201 		}
202 
203 		if (charsetGuess != null && charsetGuess.length != getParentCount() + 1)
204 			throw new IllegalArgumentException(MessageFormat.format(
205 					JGitText.get().expectedCharacterEncodingGuesses,
206 					Integer.valueOf(getParentCount() + 1)));
207 
208 		if (trySimpleConversion(charsetGuess)) {
209 			Charset cs = charsetGuess != null ? charsetGuess[0] : null;
210 			if (cs == null) {
211 				cs = UTF_8;
212 			}
213 			try {
214 				return decodeNoFallback(cs, buf, startOffset, endOffset);
215 			} catch (CharacterCodingException cee) {
216 				// Try the much slower, more-memory intensive version which
217 				// can handle a character set conversion patch.
218 			}
219 		}
220 
221 		final StringBuilder r = new StringBuilder(endOffset - startOffset);
222 
223 		// Always treat the headers as US-ASCII; Git file names are encoded
224 		// in a C style escape if any character has the high-bit set.
225 		//
226 		final int hdrEnd = getHunks().get(0).getStartOffset();
227 		for (int ptr = startOffset; ptr < hdrEnd;) {
228 			final int eol = Math.min(hdrEnd, nextLF(buf, ptr));
229 			r.append(extractBinaryString(buf, ptr, eol));
230 			ptr = eol;
231 		}
232 
233 		final String[] files = extractFileLines(charsetGuess);
234 		final int[] offsets = new int[files.length];
235 		for (HunkHeader h : getHunks())
236 			h.extractFileLines(r, files, offsets);
237 		return r.toString();
238 	}
239 
240 	private static boolean trySimpleConversion(Charset[] charsetGuess) {
241 		if (charsetGuess == null)
242 			return true;
243 		for (int i = 1; i < charsetGuess.length; i++) {
244 			if (charsetGuess[i] != charsetGuess[0])
245 				return false;
246 		}
247 		return true;
248 	}
249 
250 	private String[] extractFileLines(Charset[] csGuess) {
251 		final TemporaryBuffer[] tmp = new TemporaryBuffer[getParentCount() + 1];
252 		try {
253 			for (int i = 0; i < tmp.length; i++)
254 				tmp[i] = new TemporaryBuffer.Heap(Integer.MAX_VALUE);
255 			for (HunkHeader h : getHunks())
256 				h.extractFileLines(tmp);
257 
258 			final String[] r = new String[tmp.length];
259 			for (int i = 0; i < tmp.length; i++) {
260 				Charset cs = csGuess != null ? csGuess[i] : null;
261 				if (cs == null) {
262 					cs = UTF_8;
263 				}
264 				r[i] = RawParseUtils.decode(cs, tmp[i].toByteArray());
265 			}
266 			return r;
267 		} catch (IOException ioe) {
268 			throw new RuntimeException(JGitText.get().cannotConvertScriptToText, ioe);
269 		}
270 	}
271 
272 	/**
273 	 * Get style of patch used to modify this file.
274 	 *
275 	 * @return style of patch used to modify this file.
276 	 */
277 	public PatchType getPatchType() {
278 		return patchType;
279 	}
280 
281 	/**
282 	 * Whether this patch modifies metadata about a file
283 	 *
284 	 * @return {@code true} if this patch modifies metadata about a file .
285 	 */
286 	public boolean hasMetaDataChanges() {
287 		return changeType != ChangeType.MODIFY || newMode != oldMode;
288 	}
289 
290 	/**
291 	 * Get hunks altering this file; in order of appearance in patch
292 	 *
293 	 * @return hunks altering this file; in order of appearance in patch.
294 	 */
295 	public List<? extends HunkHeader> getHunks() {
296 		if (hunks == null)
297 			return Collections.emptyList();
298 		return hunks;
299 	}
300 
301 	void addHunk(HunkHeader h) {
302 		if (h.getFileHeader() != this)
303 			throw new IllegalArgumentException(JGitText.get().hunkBelongsToAnotherFile);
304 		if (hunks == null)
305 			hunks = new ArrayList<>();
306 		hunks.add(h);
307 	}
308 
309 	HunkHeader newHunkHeader(int offset) {
310 		return new HunkHeader(this, offset);
311 	}
312 
313 	/**
314 	 * Get the new-image delta/literal if this is a
315 	 * {@link PatchType#GIT_BINARY}.
316 	 *
317 	 * @return the new-image delta/literal if this is a
318 	 *         {@link PatchType#GIT_BINARY}.
319 	 */
320 	public BinaryHunk getForwardBinaryHunk() {
321 		return forwardBinaryHunk;
322 	}
323 
324 	/**
325 	 * Get the old-image delta/literal if this is a
326 	 * {@link PatchType#GIT_BINARY}.
327 	 *
328 	 * @return the old-image delta/literal if this is a
329 	 *         {@link PatchType#GIT_BINARY}.
330 	 */
331 	public BinaryHunk getReverseBinaryHunk() {
332 		return reverseBinaryHunk;
333 	}
334 
335 	/**
336 	 * Convert to a list describing the content edits performed on this file.
337 	 *
338 	 * @return a list describing the content edits performed on this file.
339 	 */
340 	public EditList toEditList() {
341 		final EditList r = new EditList();
342 		for (HunkHeader hunk : hunks)
343 			r.addAll(hunk.toEditList());
344 		return r;
345 	}
346 
347 	/**
348 	 * Parse a "diff --git" or "diff --cc" line.
349 	 *
350 	 * @param ptr
351 	 *            first character after the "diff --git " or "diff --cc " part.
352 	 * @param end
353 	 *            one past the last position to parse.
354 	 * @return first character after the LF at the end of the line; -1 on error.
355 	 */
356 	int parseGitFileName(int ptr, int end) {
357 		final int eol = nextLF(buf, ptr);
358 		final int bol = ptr;
359 		if (eol >= end) {
360 			return -1;
361 		}
362 
363 		// buffer[ptr..eol] looks like "a/foo b/foo\n". After the first
364 		// A regex to match this is "^[^/]+/(.*?) [^/+]+/\1\n$". There
365 		// is only one way to split the line such that text to the left
366 		// of the space matches the text to the right, excluding the part
367 		// before the first slash.
368 		//
369 
370 		final int aStart = nextLF(buf, ptr, '/');
371 		if (aStart >= eol)
372 			return eol;
373 
374 		while (ptr < eol) {
375 			final int sp = nextLF(buf, ptr, ' ');
376 			if (sp >= eol) {
377 				// We can't split the header, it isn't valid.
378 				// This may be OK if this is a rename patch.
379 				//
380 				return eol;
381 			}
382 			final int bStart = nextLF(buf, sp, '/');
383 			if (bStart >= eol)
384 				return eol;
385 
386 			// If buffer[aStart..sp - 1] = buffer[bStart..eol - 1]
387 			// we have a valid split.
388 			//
389 			if (eq(aStart, sp - 1, bStart, eol - 1)) {
390 				if (buf[bol] == '"') {
391 					// We're a double quoted name. The region better end
392 					// in a double quote too, and we need to decode the
393 					// characters before reading the name.
394 					//
395 					if (buf[sp - 2] != '"') {
396 						return eol;
397 					}
398 					oldPath = QuotedString.GIT_PATH.dequote(buf, bol, sp - 1);
399 					oldPath = p1(oldPath);
400 				} else {
401 					oldPath = decode(UTF_8, buf, aStart, sp - 1);
402 				}
403 				newPath = oldPath;
404 				return eol;
405 			}
406 
407 			// This split wasn't correct. Move past the space and try
408 			// another split as the space must be part of the file name.
409 			//
410 			ptr = sp;
411 		}
412 
413 		return eol;
414 	}
415 
416 	int parseGitHeaders(int ptr, int end) {
417 		while (ptr < end) {
418 			final int eol = nextLF(buf, ptr);
419 			if (isHunkHdr(buf, ptr, eol) >= 1) {
420 				// First hunk header; break out and parse them later.
421 				break;
422 
423 			} else if (match(buf, ptr, OLD_NAME) >= 0) {
424 				parseOldName(ptr, eol);
425 
426 			} else if (match(buf, ptr, NEW_NAME) >= 0) {
427 				parseNewName(ptr, eol);
428 
429 			} else if (match(buf, ptr, OLD_MODE) >= 0) {
430 				oldMode = parseFileMode(ptr + OLD_MODE.length, eol);
431 
432 			} else if (match(buf, ptr, NEW_MODE) >= 0) {
433 				newMode = parseFileMode(ptr + NEW_MODE.length, eol);
434 
435 			} else if (match(buf, ptr, DELETED_FILE_MODE) >= 0) {
436 				oldMode = parseFileMode(ptr + DELETED_FILE_MODE.length, eol);
437 				newMode = FileMode.MISSING;
438 				changeType = ChangeType.DELETE;
439 
440 			} else if (match(buf, ptr, NEW_FILE_MODE) >= 0) {
441 				parseNewFileMode(ptr, eol);
442 
443 			} else if (match(buf, ptr, COPY_FROM) >= 0) {
444 				oldPath = parseName(oldPath, ptr + COPY_FROM.length, eol);
445 				changeType = ChangeType.COPY;
446 
447 			} else if (match(buf, ptr, COPY_TO) >= 0) {
448 				newPath = parseName(newPath, ptr + COPY_TO.length, eol);
449 				changeType = ChangeType.COPY;
450 
451 			} else if (match(buf, ptr, RENAME_OLD) >= 0) {
452 				oldPath = parseName(oldPath, ptr + RENAME_OLD.length, eol);
453 				changeType = ChangeType.RENAME;
454 
455 			} else if (match(buf, ptr, RENAME_NEW) >= 0) {
456 				newPath = parseName(newPath, ptr + RENAME_NEW.length, eol);
457 				changeType = ChangeType.RENAME;
458 
459 			} else if (match(buf, ptr, RENAME_FROM) >= 0) {
460 				oldPath = parseName(oldPath, ptr + RENAME_FROM.length, eol);
461 				changeType = ChangeType.RENAME;
462 
463 			} else if (match(buf, ptr, RENAME_TO) >= 0) {
464 				newPath = parseName(newPath, ptr + RENAME_TO.length, eol);
465 				changeType = ChangeType.RENAME;
466 
467 			} else if (match(buf, ptr, SIMILARITY_INDEX) >= 0) {
468 				score = parseBase10(buf, ptr + SIMILARITY_INDEX.length, null);
469 
470 			} else if (match(buf, ptr, DISSIMILARITY_INDEX) >= 0) {
471 				score = parseBase10(buf, ptr + DISSIMILARITY_INDEX.length, null);
472 
473 			} else if (match(buf, ptr, INDEX) >= 0) {
474 				parseIndexLine(ptr + INDEX.length, eol);
475 
476 			} else {
477 				// Probably an empty patch (stat dirty).
478 				break;
479 			}
480 
481 			ptr = eol;
482 		}
483 		return ptr;
484 	}
485 
486 	void parseOldName(int ptr, int eol) {
487 		oldPath = p1(parseName(oldPath, ptr + OLD_NAME.length, eol));
488 		if (oldPath == DEV_NULL)
489 			changeType = ChangeType.ADD;
490 	}
491 
492 	void parseNewName(int ptr, int eol) {
493 		newPath = p1(parseName(newPath, ptr + NEW_NAME.length, eol));
494 		if (newPath == DEV_NULL)
495 			changeType = ChangeType.DELETE;
496 	}
497 
498 	void parseNewFileMode(int ptr, int eol) {
499 		oldMode = FileMode.MISSING;
500 		newMode = parseFileMode(ptr + NEW_FILE_MODE.length, eol);
501 		changeType = ChangeType.ADD;
502 	}
503 
504 	int parseTraditionalHeaders(int ptr, int end) {
505 		while (ptr < end) {
506 			final int eol = nextLF(buf, ptr);
507 			if (isHunkHdr(buf, ptr, eol) >= 1) {
508 				// First hunk header; break out and parse them later.
509 				break;
510 
511 			} else if (match(buf, ptr, OLD_NAME) >= 0) {
512 				parseOldName(ptr, eol);
513 
514 			} else if (match(buf, ptr, NEW_NAME) >= 0) {
515 				parseNewName(ptr, eol);
516 
517 			} else {
518 				// Possibly an empty patch.
519 				break;
520 			}
521 
522 			ptr = eol;
523 		}
524 		return ptr;
525 	}
526 
527 	private String parseName(String expect, int ptr, int end) {
528 		if (ptr == end)
529 			return expect;
530 
531 		String r;
532 		if (buf[ptr] == '"') {
533 			// New style GNU diff format
534 			//
535 			r = QuotedString.GIT_PATH.dequote(buf, ptr, end - 1);
536 		} else {
537 			// Older style GNU diff format, an optional tab ends the name.
538 			//
539 			int tab = end;
540 			while (ptr < tab && buf[tab - 1] != '\t')
541 				tab--;
542 			if (ptr == tab)
543 				tab = end;
544 			r = decode(UTF_8, buf, ptr, tab - 1);
545 		}
546 
547 		if (r.equals(DEV_NULL))
548 			r = DEV_NULL;
549 		return r;
550 	}
551 
552 	private static String p1(final String r) {
553 		final int s = r.indexOf('/');
554 		return s > 0 ? r.substring(s + 1) : r;
555 	}
556 
557 	FileMode parseFileMode(int ptr, int end) {
558 		int tmp = 0;
559 		while (ptr < end - 1) {
560 			tmp <<= 3;
561 			tmp += buf[ptr++] - '0';
562 		}
563 		return FileMode.fromBits(tmp);
564 	}
565 
566 	void parseIndexLine(int ptr, int end) {
567 		// "index $asha1..$bsha1[ $mode]" where $asha1 and $bsha1
568 		// can be unique abbreviations
569 		//
570 		final int dot2 = nextLF(buf, ptr, '.');
571 		final int mode = nextLF(buf, dot2, ' ');
572 
573 		oldId = AbbreviatedObjectId.fromString(buf, ptr, dot2 - 1);
574 		newId = AbbreviatedObjectId.fromString(buf, dot2 + 1, mode - 1);
575 
576 		if (mode < end)
577 			newMode = oldMode = parseFileMode(mode, end);
578 	}
579 
580 	private boolean eq(int aPtr, int aEnd, int bPtr, int bEnd) {
581 		if (aEnd - aPtr != bEnd - bPtr) {
582 			return false;
583 		}
584 		while (aPtr < aEnd) {
585 			if (buf[aPtr++] != buf[bPtr++])
586 				return false;
587 		}
588 		return true;
589 	}
590 
591 	/**
592 	 * Determine if this is a patch hunk header.
593 	 *
594 	 * @param buf
595 	 *            the buffer to scan
596 	 * @param start
597 	 *            first position in the buffer to evaluate
598 	 * @param end
599 	 *            last position to consider; usually the end of the buffer (
600 	 *            <code>buf.length</code>) or the first position on the next
601 	 *            line. This is only used to avoid very long runs of '@' from
602 	 *            killing the scan loop.
603 	 * @return the number of "ancestor revisions" in the hunk header. A
604 	 *         traditional two-way diff ("@@ -...") returns 1; a combined diff
605 	 *         for a 3 way-merge returns 3. If this is not a hunk header, 0 is
606 	 *         returned instead.
607 	 */
608 	static int isHunkHdr(byte[] buf, int start, int end) {
609 		int ptr = start;
610 		while (ptr < end && buf[ptr] == '@')
611 			ptr++;
612 		if (ptr - start < 2)
613 			return 0;
614 		if (ptr == end || buf[ptr++] != ' ')
615 			return 0;
616 		if (ptr == end || buf[ptr++] != '-')
617 			return 0;
618 		return (ptr - 3) - start;
619 	}
620 }