View Javadoc

1   //
2   //  ========================================================================
3   //  Copyright (c) 1995-2016 Mort Bay Consulting Pty. Ltd.
4   //  ------------------------------------------------------------------------
5   //  All rights reserved. This program and the accompanying materials
6   //  are made available under the terms of the Eclipse Public License v1.0
7   //  and Apache License v2.0 which accompanies this distribution.
8   //
9   //      The Eclipse Public License is available at
10  //      http://www.eclipse.org/legal/epl-v10.html
11  //
12  //      The Apache License v2.0 is available at
13  //      http://www.opensource.org/licenses/apache2.0.php
14  //
15  //  You may elect to redistribute this code under either of these licenses.
16  //  ========================================================================
17  //
18  
19  package org.eclipse.jetty.util;
20  
21  import java.io.IOException;
22  import java.nio.ByteBuffer;
23  
24  import org.eclipse.jetty.util.log.Log;
25  import org.eclipse.jetty.util.log.Logger;
26  
27  /* ------------------------------------------------------------ */
28  /**
29   * Utf8 Appendable abstract base class
30   *
31   * This abstract class wraps a standard {@link java.lang.Appendable} and provides methods to append UTF-8 encoded bytes, that are converted into characters.
32   *
33   * This class is stateful: up to 4 calls to {@link #append(byte)} may be needed before a character is appended to the wrapped {@link java.lang.Appendable}.
34   *
35   * The UTF-8 decoding is done by this class and no additional buffers or Readers are used. The UTF-8 code was inspired by
36   * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
37   *
38   * License information for Bjoern Hoehrmann's code:
39   *
40   * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
41   * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal
42   * in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43   * copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
44   *
45   * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
46   *
47   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
48   * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
49   * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
50   **/
51  public abstract class Utf8Appendable
52  {
53      protected static final Logger LOG = Log.getLogger(Utf8Appendable.class);
54      public static final char REPLACEMENT = '\ufffd';
55      public static final byte[] REPLACEMENT_UTF8 = new byte[] {(byte)0xEF,(byte)0xBF,(byte)0xBD };
56      private static final int UTF8_ACCEPT = 0;
57      private static final int UTF8_REJECT = 12;
58  
59      protected final Appendable _appendable;
60      protected int _state = UTF8_ACCEPT;
61  
62      private static final byte[] BYTE_TABLE =
63      {
64          // The first part of the table maps bytes to character classes that
65          // to reduce the size of the transition table and create bitmasks.
66           0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67           0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68           0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
69           0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
70           1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
71           7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
72           8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
73          10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
74      };
75  
76      private static final byte[] TRANS_TABLE =
77      {
78          // The second part is a transition table that maps a combination
79          // of a state of the automaton and a character class to a state.
80           0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
81          12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
82          12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
83          12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
84          12,36,12,12,12,12,12,12,12,12,12,12
85      };
86  
87      private int _codep;
88  
89      public Utf8Appendable(Appendable appendable)
90      {
91          _appendable = appendable;
92      }
93  
94      public abstract int length();
95  
96      protected void reset()
97      {
98          _state = UTF8_ACCEPT;
99      }
100 
101     
102     private void checkCharAppend() throws IOException
103     {
104         if (_state != UTF8_ACCEPT)
105         {
106             _appendable.append(REPLACEMENT);
107             int state=_state;
108             _state=UTF8_ACCEPT;
109             throw new NotUtf8Exception("char appended in state "+state);
110         }
111     }
112     
113     public void append(char c)
114     {
115         try
116         {
117             checkCharAppend();
118             _appendable.append(c); 
119         }
120         catch (IOException e)
121         {
122             throw new RuntimeException(e);
123         }
124     }
125 
126     public void append(String s)
127     {
128         try
129         {
130             checkCharAppend();
131             _appendable.append(s); 
132         }
133         catch (IOException e)
134         {
135             throw new RuntimeException(e);
136         }
137     }
138     
139     public void append(String s,int offset,int length)
140     {
141         try
142         {
143             checkCharAppend();
144             _appendable.append(s,offset,offset+length); 
145         }
146         catch (IOException e)
147         {
148             throw new RuntimeException(e);
149         }
150     }
151     
152     
153     public void append(byte b)
154     {
155         try
156         {
157             appendByte(b);
158         }
159         catch (IOException e)
160         {
161             throw new RuntimeException(e);
162         }
163     }
164     
165     public void append(ByteBuffer buf)
166     {
167         try
168         {
169             while (buf.remaining() > 0)
170             {
171                 appendByte(buf.get());
172             }
173         }
174         catch (IOException e)
175         {
176             throw new RuntimeException(e);
177         }
178     }
179 
180     public void append(byte[] b, int offset, int length)
181     {
182         try
183         {
184             int end = offset + length;
185             for (int i = offset; i < end; i++)
186                 appendByte(b[i]);
187         }
188         catch (IOException e)
189         {
190             throw new RuntimeException(e);
191         }
192     }
193 
194     public boolean append(byte[] b, int offset, int length, int maxChars)
195     {
196         try
197         {
198             int end = offset + length;
199             for (int i = offset; i < end; i++)
200             {
201                 if (length() > maxChars)
202                     return false;
203                 appendByte(b[i]);
204             }
205             return true;
206         }
207         catch (IOException e)
208         {
209             throw new RuntimeException(e);
210         }
211     }
212 
213     protected void appendByte(byte b) throws IOException
214     {
215 
216         if (b > 0 && _state == UTF8_ACCEPT)
217         {
218             _appendable.append((char)(b & 0xFF));
219         }
220         else
221         {
222             int i = b & 0xFF;
223             int type = BYTE_TABLE[i];
224             _codep = _state == UTF8_ACCEPT ? (0xFF >> type) & i : (i & 0x3F) | (_codep << 6);
225             int next = TRANS_TABLE[_state + type];
226 
227             switch(next)
228             {
229                 case UTF8_ACCEPT:
230                     _state=next;
231                     if (_codep < Character.MIN_HIGH_SURROGATE)
232                     {
233                         _appendable.append((char)_codep);
234                     }
235                     else
236                     {
237                         for (char c : Character.toChars(_codep))
238                             _appendable.append(c);
239                     }
240                     break;
241                     
242                 case UTF8_REJECT:
243                     String reason = "byte "+TypeUtil.toHexString(b)+" in state "+(_state/12);
244                     _codep=0;
245                     _state = UTF8_ACCEPT;
246                     _appendable.append(REPLACEMENT);
247                     throw new NotUtf8Exception(reason);
248                     
249                 default:
250                     _state=next;
251                     
252             }
253         }
254     }
255 
256     public boolean isUtf8SequenceComplete()
257     {
258         return _state == UTF8_ACCEPT;
259     }
260 
261     @SuppressWarnings("serial")
262     public static class NotUtf8Exception extends IllegalArgumentException
263     {
264         public NotUtf8Exception(String reason)
265         {
266             super("Not valid UTF8! "+reason);
267         }
268     }
269 
270     protected void checkState()
271     {
272         if (!isUtf8SequenceComplete())
273         {
274             _codep=0;
275             _state = UTF8_ACCEPT;
276             try
277             {
278                 _appendable.append(REPLACEMENT);
279             }
280             catch(IOException e)
281             {
282                 throw new RuntimeException(e);
283             }
284             throw new NotUtf8Exception("incomplete UTF8 sequence");
285         }
286     }
287     
288     public String toReplacedString()
289     {
290         if (!isUtf8SequenceComplete())
291         {
292             _codep=0;
293             _state = UTF8_ACCEPT;
294             try
295             {
296                 _appendable.append(REPLACEMENT);
297             }
298             catch(IOException e)
299             {
300                 throw new RuntimeException(e);
301             }
302             Throwable th= new NotUtf8Exception("incomplete UTF8 sequence");
303             LOG.warn(th.toString());
304             LOG.debug(th);
305         }
306         return _appendable.toString();
307     }
308 }