View Javadoc
1   /*
2    * Copyright (C) 2014, Andrey Loskutov <loskutov@gmx.de>
3    * and other copyright owners as documented in the project's IP log.
4    *
5    * This program and the accompanying materials are made available
6    * under the terms of the Eclipse Distribution License v1.0 which
7    * accompanies this distribution, is reproduced below, and is
8    * available at http://www.eclipse.org/org/documents/edl-v10.php
9    *
10   * All rights reserved.
11   *
12   * Redistribution and use in source and binary forms, with or
13   * without modification, are permitted provided that the following
14   * conditions are met:
15   *
16   * - Redistributions of source code must retain the above copyright
17   *   notice, this list of conditions and the following disclaimer.
18   *
19   * - Redistributions in binary form must reproduce the above
20   *   copyright notice, this list of conditions and the following
21   *   disclaimer in the documentation and/or other materials provided
22   *   with the distribution.
23   *
24   * - Neither the name of the Eclipse Foundation, Inc. nor the
25   *   names of its contributors may be used to endorse or promote
26   *   products derived from this software without specific prior
27   *   written permission.
28   *
29   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42   */
43  package org.eclipse.jgit.ignore.internal;
44  
45  import static java.lang.Character.isLetter;
46  
47  import java.text.MessageFormat;
48  import java.util.ArrayList;
49  import java.util.Arrays;
50  import java.util.List;
51  import java.util.regex.Pattern;
52  import java.util.regex.PatternSyntaxException;
53  
54  import org.eclipse.jgit.errors.InvalidPatternException;
55  import org.eclipse.jgit.ignore.FastIgnoreRule;
56  import org.eclipse.jgit.internal.JGitText;
57  
58  /**
59   * Various {@link String} related utility methods, written mostly to avoid
60   * generation of new String objects (e.g. via splitting Strings etc).
61   *
62   * @since 3.6
63   */
64  public class Strings {
65  
66  	static char getPathSeparator(Character pathSeparator) {
67  		return pathSeparator == null ? FastIgnoreRule.PATH_SEPARATOR
68  				: pathSeparator.charValue();
69  	}
70  
71  	/**
72  	 * @param pattern
73  	 *            non null
74  	 * @param c
75  	 *            character to remove
76  	 * @return new string with all trailing characters removed
77  	 */
78  	public static String stripTrailing(String pattern, char c) {
79  		while (pattern.length() > 0
80  				&& pattern.charAt(pattern.length() - 1) == c)
81  			pattern = pattern.substring(0, pattern.length() - 1);
82  		return pattern;
83  	}
84  
85  	static int count(String s, char c, boolean ignoreFirstLast) {
86  		int start = 0;
87  		int count = 0;
88  		while (true) {
89  			start = s.indexOf(c, start);
90  			if (start == -1)
91  				break;
92  			if (!ignoreFirstLast || (start != 0 && start != s.length()))
93  				count++;
94  			start++;
95  		}
96  		return count;
97  	}
98  
99  	/**
100 	 * Splits given string to substrings by given separator
101 	 *
102 	 * @param pattern
103 	 *            non null
104 	 * @param slash
105 	 *            separator char
106 	 * @return list of substrings
107 	 */
108 	public static List<String> split(String pattern, char slash) {
109 		int count = count(pattern, slash, true);
110 		if (count < 1)
111 			throw new IllegalStateException(
112 					"Pattern must have at least two segments: " + pattern); //$NON-NLS-1$
113 		List<String> segments = new ArrayList<String>(count);
114 		int right = 0;
115 		while (true) {
116 			int left = right;
117 			right = pattern.indexOf(slash, right);
118 			if (right == -1) {
119 				if (left < pattern.length())
120 					segments.add(pattern.substring(left));
121 				break;
122 			}
123 			if (right - left > 0)
124 				if (left == 1)
125 					// leading slash should remain by the first pattern
126 					segments.add(pattern.substring(left - 1, right));
127 				else if (right == pattern.length() - 1)
128 					// trailing slash should remain too
129 					segments.add(pattern.substring(left, right + 1));
130 				else
131 					segments.add(pattern.substring(left, right));
132 			right++;
133 		}
134 		return segments;
135 	}
136 
137 	static boolean isWildCard(String pattern) {
138 		return pattern.indexOf('*') != -1 || isComplexWildcard(pattern);
139 	}
140 
141 	private static boolean isComplexWildcard(String pattern) {
142 		int idx1 = pattern.indexOf('[');
143 		if (idx1 != -1) {
144 			int idx2 = pattern.indexOf(']', idx1);
145 			if (idx2 > idx1)
146 				return true;
147 		}
148 		if (pattern.indexOf('?') != -1) {
149 			return true;
150 		} else {
151 			// check if the backslash escapes one of the glob special characters
152 			// if not, backslash is not part of a regex and treated literally
153 			int backSlash = pattern.indexOf('\\');
154 			if (backSlash >= 0) {
155 				int nextIdx = backSlash + 1;
156 				if (pattern.length() == nextIdx) {
157 					return false;
158 				}
159 				char nextChar = pattern.charAt(nextIdx);
160 				if (escapedByBackslash(nextChar)) {
161 					return true;
162 				} else {
163 					return false;
164 				}
165 			}
166 		}
167 		return false;
168 	}
169 
170 	private static boolean escapedByBackslash(char nextChar) {
171 		return nextChar == '?' || nextChar == '*' || nextChar == '[';
172 	}
173 
174 	static PatternState checkWildCards(String pattern) {
175 		if (isComplexWildcard(pattern))
176 			return PatternState.COMPLEX;
177 		int startIdx = pattern.indexOf('*');
178 		if (startIdx < 0)
179 			return PatternState.NONE;
180 
181 		if (startIdx == pattern.length() - 1)
182 			return PatternState.TRAILING_ASTERISK_ONLY;
183 		if (pattern.lastIndexOf('*') == 0)
184 			return PatternState.LEADING_ASTERISK_ONLY;
185 
186 		return PatternState.COMPLEX;
187 	}
188 
189 	static enum PatternState {
190 		LEADING_ASTERISK_ONLY, TRAILING_ASTERISK_ONLY, COMPLEX, NONE
191 	}
192 
193 	final static List<String> POSIX_CHAR_CLASSES = Arrays.asList(
194 			"alnum", "alpha", "blank", "cntrl", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
195 			// [:alnum:] [:alpha:] [:blank:] [:cntrl:]
196 			"digit", "graph", "lower", "print", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
197 			// [:digit:] [:graph:] [:lower:] [:print:]
198 			"punct", "space", "upper", "xdigit", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
199 			// [:punct:] [:space:] [:upper:] [:xdigit:]
200 			"word" //$NON-NLS-1$
201 	// [:word:] XXX I don't see it in
202 	// http://man7.org/linux/man-pages/man7/glob.7.html
203 	// but this was in org.eclipse.jgit.fnmatch.GroupHead.java ???
204 			);
205 
206 	private static final String DL = "\\p{javaDigit}\\p{javaLetter}"; //$NON-NLS-1$
207 
208 	final static List<String> JAVA_CHAR_CLASSES = Arrays
209 			.asList("\\p{Alnum}", "\\p{javaLetter}", "\\p{Blank}", "\\p{Cntrl}", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
210 					// [:alnum:] [:alpha:] [:blank:] [:cntrl:]
211 					"\\p{javaDigit}", "[\\p{Graph}" + DL + "]", "\\p{Ll}", "[\\p{Print}" + DL + "]", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
212 					// [:digit:] [:graph:] [:lower:] [:print:]
213 					"\\p{Punct}", "\\p{Space}", "\\p{Lu}", "\\p{XDigit}", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
214 					// [:punct:] [:space:] [:upper:] [:xdigit:]
215 					"[" + DL + "_]" //$NON-NLS-1$ //$NON-NLS-2$
216 							// [:word:]
217 			);
218 
219 	// Collating symbols [[.a.]] or equivalence class expressions [[=a=]] are
220 	// not supported by CLI git (at least not by 1.9.1)
221 	final static Pattern UNSUPPORTED = Pattern
222 			.compile("\\[\\[[.=]\\w+[.=]\\]\\]"); //$NON-NLS-1$
223 
224 	/**
225 	 * Conversion from glob to Java regex following two sources: <li>
226 	 * http://man7.org/linux/man-pages/man7/glob.7.html <li>
227 	 * org.eclipse.jgit.fnmatch.FileNameMatcher.java Seems that there are
228 	 * various ways to define what "glob" can be.
229 	 *
230 	 * @param pattern
231 	 *            non null pattern
232 	 *
233 	 * @return Java regex pattern corresponding to given glob pattern
234 	 * @throws InvalidPatternException
235 	 */
236 	static Pattern convertGlob(String pattern) throws InvalidPatternException {
237 		if (UNSUPPORTED.matcher(pattern).find())
238 			throw new InvalidPatternException(
239 					"Collating symbols [[.a.]] or equivalence class expressions [[=a=]] are not supported", //$NON-NLS-1$
240 					pattern);
241 
242 		StringBuilder sb = new StringBuilder(pattern.length());
243 
244 		int in_brackets = 0;
245 		boolean seenEscape = false;
246 		boolean ignoreLastBracket = false;
247 		boolean in_char_class = false;
248 		// 6 is the length of the longest posix char class "xdigit"
249 		char[] charClass = new char[6];
250 
251 		for (int i = 0; i < pattern.length(); i++) {
252 			final char c = pattern.charAt(i);
253 			switch (c) {
254 
255 			case '*':
256 				if (seenEscape || in_brackets > 0)
257 					sb.append(c);
258 				else
259 					sb.append('.').append(c);
260 				break;
261 
262 			case '(': // fall-through
263 			case ')': // fall-through
264 			case '{': // fall-through
265 			case '}': // fall-through
266 			case '+': // fall-through
267 			case '$': // fall-through
268 			case '^': // fall-through
269 			case '|':
270 				if (seenEscape || in_brackets > 0)
271 					sb.append(c);
272 				else
273 					sb.append('\\').append(c);
274 				break;
275 
276 			case '.':
277 				if (seenEscape)
278 					sb.append(c);
279 				else
280 					sb.append('\\').append('.');
281 				break;
282 
283 			case '?':
284 				if (seenEscape || in_brackets > 0)
285 					sb.append(c);
286 				else
287 					sb.append('.');
288 				break;
289 
290 			case ':':
291 				if (in_brackets > 0)
292 					if (lookBehind(sb) == '['
293 							&& isLetter(lookAhead(pattern, i)))
294 						in_char_class = true;
295 				sb.append(':');
296 				break;
297 
298 			case '-':
299 				if (in_brackets > 0) {
300 					if (lookAhead(pattern, i) == ']')
301 						sb.append('\\').append(c);
302 					else
303 						sb.append(c);
304 				} else
305 					sb.append('-');
306 				break;
307 
308 			case '\\':
309 				if (in_brackets > 0) {
310 					char lookAhead = lookAhead(pattern, i);
311 					if (lookAhead == ']' || lookAhead == '[')
312 						ignoreLastBracket = true;
313 				} else {
314 					//
315 					char lookAhead = lookAhead(pattern, i);
316 					if (lookAhead != '\\' && lookAhead != '['
317 							&& lookAhead != '?' && lookAhead != '*'
318 							&& lookAhead != ' ' && lookBehind(sb) != '\\') {
319 						break;
320 					}
321 				}
322 				sb.append(c);
323 				break;
324 
325 			case '[':
326 				if (in_brackets > 0) {
327 					sb.append('\\').append('[');
328 					ignoreLastBracket = true;
329 				} else {
330 					if (!seenEscape) {
331 						in_brackets++;
332 						ignoreLastBracket = false;
333 					}
334 					sb.append('[');
335 				}
336 				break;
337 
338 			case ']':
339 				if (seenEscape) {
340 					sb.append(']');
341 					ignoreLastBracket = true;
342 					break;
343 				}
344 				if (in_brackets <= 0) {
345 					sb.append('\\').append(']');
346 					ignoreLastBracket = true;
347 					break;
348 				}
349 				char lookBehind = lookBehind(sb);
350 				if ((lookBehind == '[' && !ignoreLastBracket)
351 						|| lookBehind == '^') {
352 					sb.append('\\');
353 					sb.append(']');
354 					ignoreLastBracket = true;
355 				} else {
356 					ignoreLastBracket = false;
357 					if (!in_char_class) {
358 						in_brackets--;
359 						sb.append(']');
360 					} else {
361 						in_char_class = false;
362 						String charCl = checkPosixCharClass(charClass);
363 						// delete last \[:: chars and set the pattern
364 						if (charCl != null) {
365 							sb.setLength(sb.length() - 4);
366 							sb.append(charCl);
367 						}
368 						reset(charClass);
369 					}
370 				}
371 				break;
372 
373 			case '!':
374 				if (in_brackets > 0) {
375 					if (lookBehind(sb) == '[')
376 						sb.append('^');
377 					else
378 						sb.append(c);
379 				} else
380 					sb.append(c);
381 				break;
382 
383 			default:
384 				if (in_char_class)
385 					setNext(charClass, c);
386 				else
387 					sb.append(c);
388 				break;
389 			} // end switch
390 
391 			seenEscape = c == '\\';
392 
393 		} // end for
394 
395 		if (in_brackets > 0)
396 			throw new InvalidPatternException("Not closed bracket?", pattern); //$NON-NLS-1$
397 		try {
398 			return Pattern.compile(sb.toString());
399 		} catch (PatternSyntaxException e) {
400 			InvalidPatternException patternException = new InvalidPatternException(
401 					MessageFormat.format(JGitText.get().invalidIgnoreRule,
402 							pattern),
403 					pattern);
404 			patternException.initCause(e);
405 			throw patternException;
406 		}
407 	}
408 
409 	/**
410 	 * @param buffer
411 	 * @return zero of the buffer is empty, otherwise the last character from
412 	 *         buffer
413 	 */
414 	private static char lookBehind(StringBuilder buffer) {
415 		return buffer.length() > 0 ? buffer.charAt(buffer.length() - 1) : 0;
416 	}
417 
418 	/**
419 	 * @param pattern
420 	 * @param i
421 	 *            current pointer in the pattern
422 	 * @return zero of the index is out of range, otherwise the next character
423 	 *         from given position
424 	 */
425 	private static char lookAhead(String pattern, int i) {
426 		int idx = i + 1;
427 		return idx >= pattern.length() ? 0 : pattern.charAt(idx);
428 	}
429 
430 	private static void setNext(char[] buffer, char c) {
431 		for (int i = 0; i < buffer.length; i++)
432 			if (buffer[i] == 0) {
433 				buffer[i] = c;
434 				break;
435 			}
436 	}
437 
438 	private static void reset(char[] buffer) {
439 		for (int i = 0; i < buffer.length; i++)
440 			buffer[i] = 0;
441 	}
442 
443 	private static String checkPosixCharClass(char[] buffer) {
444 		for (int i = 0; i < POSIX_CHAR_CLASSES.size(); i++) {
445 			String clazz = POSIX_CHAR_CLASSES.get(i);
446 			boolean match = true;
447 			for (int j = 0; j < clazz.length(); j++)
448 				if (buffer[j] != clazz.charAt(j)) {
449 					match = false;
450 					break;
451 				}
452 			if (match)
453 				return JAVA_CHAR_CLASSES.get(i);
454 		}
455 		return null;
456 	}
457 
458 	static String deleteBackslash(String s) {
459 		if (s.indexOf('\\') < 0) {
460 			return s;
461 		}
462 		StringBuilder sb = new StringBuilder(s.length());
463 		for (int i = 0; i < s.length(); i++) {
464 			char ch = s.charAt(i);
465 			if (ch == '\\') {
466 				if (i + 1 == s.length()) {
467 					continue;
468 				}
469 				char next = s.charAt(i + 1);
470 				if (next == '\\') {
471 					sb.append(ch);
472 					i++;
473 					continue;
474 				}
475 				if (!escapedByBackslash(next)) {
476 					continue;
477 				}
478 			}
479 			sb.append(ch);
480 		}
481 		return sb.toString();
482 	}
483 
484 }