|
| 1 | +/* |
| 2 | + * Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. |
| 3 | + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | + * |
| 5 | + * This code is free software; you can redistribute it and/or modify it |
| 6 | + * under the terms of the GNU General Public License version 2 only, as |
| 7 | + * published by the Free Software Foundation. Oracle designates this |
| 8 | + * particular file as subject to the "Classpath" exception as provided |
| 9 | + * by Oracle in the LICENSE file that accompanied this code. |
| 10 | + * |
| 11 | + * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | + * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | + * accompanied this code). |
| 16 | + * |
| 17 | + * You should have received a copy of the GNU General Public License version |
| 18 | + * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | + * |
| 21 | + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 22 | + * or visit www.oracle.com if you need additional information or have any |
| 23 | + * questions. |
| 24 | + */ |
| 25 | + |
| 26 | +package java.net; |
| 27 | + |
| 28 | +import java.io.ByteArrayOutputStream; |
| 29 | +import java.io.BufferedWriter; |
| 30 | +import java.io.OutputStreamWriter; |
| 31 | +import java.io.IOException; |
| 32 | +import java.io.UnsupportedEncodingException; |
| 33 | +import java.io.CharArrayWriter; |
| 34 | +import java.nio.charset.Charset; |
| 35 | +import java.nio.charset.IllegalCharsetNameException; |
| 36 | +import java.nio.charset.UnsupportedCharsetException ; |
| 37 | +import java.util.BitSet; |
| 38 | +import java.security.AccessController; |
| 39 | +import java.security.PrivilegedAction; |
| 40 | +import sun.security.action.GetBooleanAction; |
| 41 | +import sun.security.action.GetPropertyAction; |
| 42 | + |
/**
 * Utility class for HTML form encoding. This class contains static methods
 * for converting a String to the {@code application/x-www-form-urlencoded}
 * MIME format. For more information about HTML form encoding, consult the
 * HTML <a href="http://www.w3.org/TR/html4/">specification</a>.
 *
 * <p>
 * When encoding a String, the following rules apply:
 *
 * <ul>
 * <li>The alphanumeric characters "{@code a}" through "{@code z}",
 *     "{@code A}" through "{@code Z}" and "{@code 0}" through "{@code 9}"
 *     remain the same.
 * <li>The special characters "{@code .}", "{@code -}", "{@code *}", and
 *     "{@code _}" remain the same.
 * <li>The space character " " is converted into a plus sign "{@code +}".
 * <li>All other characters are unsafe and are first converted into one or
 *     more bytes using some encoding scheme. Then each byte is represented
 *     by the 3-character string "<i>{@code %xy}</i>", where <i>xy</i> is
 *     the two-digit (uppercase) hexadecimal representation of the byte.
 *     The recommended encoding scheme to use is UTF-8. However, for
 *     compatibility reasons, if an encoding is not specified, then the
 *     default encoding of the platform is used.
 * </ul>
 *
 * <p>
 * For example, using UTF-8 as the encoding scheme the string
 * "The string &uuml;@foo-bar" would get converted to
 * "The+string+%C3%BC%40foo-bar" because in UTF-8 the character &uuml; is
 * encoded as two bytes C3 (hex) and BC (hex), and the character @ is
 * encoded as one byte 40 (hex).
 *
 * @author  Herb Jellinek
 * @since   JDK1.0
 */
public class URLEncoder {
    /** Characters that are copied (or, for space, mapped to '+') unescaped. */
    static BitSet dontNeedEncoding;
    /** Distance between a lowercase ASCII letter and its uppercase form. */
    static final int caseDiff = ('a' - 'A');
    /** Name of the platform default encoding, read from "file.encoding". */
    static String dfltEncName = null;

    static {

        /* The list of characters that are not encoded has been
         * determined as follows:
         *
         * RFC 2396 states:
         * -----
         * Data characters that are allowed in a URI but do not have a
         * reserved purpose are called unreserved.  These include upper
         * and lower case letters, decimal digits, and a limited set of
         * punctuation marks and symbols.
         *
         * unreserved  = alphanum | mark
         *
         * mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
         *
         * Unreserved characters can be escaped without changing the
         * semantics of the URI, but this should not be done unless the
         * URI is being used in a context that does not allow the
         * unescaped character to appear.
         * -----
         *
         * It appears that both Netscape and Internet Explorer escape
         * all special characters from this list with the exception
         * of "-", "_", ".", "*". While it is not clear why they are
         * escaping the other characters, perhaps it is safest to
         * assume that there might be contexts in which the others
         * are unsafe if not escaped. Therefore, we will use the same
         * list. It is also noteworthy that this is consistent with
         * O'Reilly's "HTML: The Definitive Guide" (page 164).
         *
         * As a last note, Internet Explorer does not encode the "@"
         * character which is clearly not unreserved according to the
         * RFC. We are being consistent with the RFC in this matter,
         * as is Netscape.
         */

        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++) {
            dontNeedEncoding.set(i);
        }
        // Space is in the set only so that it is not percent-escaped;
        // encode() turns it into '+'.
        dontNeedEncoding.set(' ');
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');

        // Read the property inside doPrivileged so the lookup runs with
        // this class's own privileges rather than the caller's.
        dfltEncName = AccessController.doPrivileged(
            new PrivilegedAction<String>() {
                @Override
                public String run() {
                    return System.getProperty("file.encoding");
                }
            });
    }

    /**
     * You can't call the constructor.
     */
    private URLEncoder() { }

    /**
     * Translates a string into {@code x-www-form-urlencoded}
     * format. This method uses the platform's default encoding
     * as the encoding scheme to obtain the bytes for unsafe characters.
     *
     * @param   s   {@code String} to be translated.
     * @deprecated The resulting string may vary depending on the platform's
     *          default encoding. Instead, use the encode(String,String)
     *          method to specify the encoding.
     * @return  the translated {@code String}, or {@code null} if the
     *          platform default encoding is reported as unsupported.
     */
    @Deprecated
    public static String encode(String s) {

        String str = null;

        try {
            str = encode(s, dfltEncName);
        } catch (UnsupportedEncodingException ignored) {
            // Deliberately best-effort: the system should always support
            // the platform default encoding, so this is not expected to
            // happen. If it does, null is returned.
        }

        return str;
    }

    /**
     * Translates a string into {@code application/x-www-form-urlencoded}
     * format using a specific encoding scheme. This method uses the
     * supplied encoding scheme to obtain the bytes for unsafe
     * characters.
     * <p>
     * <em><strong>Note:</strong> The <a href=
     * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
     * World Wide Web Consortium Recommendation</a> states that
     * UTF-8 should be used. Not doing so may introduce
     * incompatibilities.</em>
     * <p>
     * If nothing in {@code s} has to be changed, {@code s} itself is
     * returned.
     *
     * @param   s   {@code String} to be translated.
     * @param   enc   The name of a supported
     *    <a href="../lang/package-summary.html#charenc">character
     *    encoding</a>.
     * @return  the translated {@code String}.
     * @exception  UnsupportedEncodingException
     *             If the named encoding is not supported or its name is
     *             not a legal charset name; the underlying charset
     *             exception is attached as the cause.
     * @exception  NullPointerException
     *             If {@code s} or {@code enc} is {@code null}.
     * @see URLDecoder#decode(java.lang.String, java.lang.String)
     * @since 1.4
     */
    public static String encode(String s, String enc)
        throws UnsupportedEncodingException {

        // Method-local accumulator, so the unsynchronized builder suffices.
        StringBuilder out = new StringBuilder(s.length());

        if (enc == null) {
            throw new NullPointerException("charsetName");
        }

        Charset charset = lookupCharset(enc);
        boolean needToChange = false;
        int length = s.length();

        int i = 0;
        while (i < length) {
            char c = s.charAt(i);
            if (dontNeedEncoding.get(c)) {
                if (c == ' ') {
                    c = '+';
                    needToChange = true;
                }
                out.append(c);
                i++;
            } else {
                /*
                 * Collect the maximal run of unsafe characters and convert
                 * it to the external encoding in one step. Neither half of
                 * a Unicode surrogate pair is a safe character, so a legal
                 * pair always ends up in the same run and is encoded as a
                 * single code point. A surrogate that occurs outside of a
                 * legal pair is treated like any other character and is
                 * handed to the charset encoder as is.
                 */
                int runStart = i;
                do {
                    i++;
                } while (i < length && !dontNeedEncoding.get(s.charAt(i)));

                byte[] ba = s.substring(runStart, i).getBytes(charset);
                for (int j = 0; j < ba.length; j++) {
                    out.append('%');
                    out.append(upperHexDigit(ba[j] >> 4));
                    out.append(upperHexDigit(ba[j]));
                }
                needToChange = true;
            }
        }

        return (needToChange ? out.toString() : s);
    }

    /**
     * Resolves a charset name, reporting illegal or unsupported names as
     * an UnsupportedEncodingException that carries the original exception
     * as its cause.
     */
    private static Charset lookupCharset(String enc)
        throws UnsupportedEncodingException {
        try {
            return Charset.forName(enc);
        } catch (IllegalCharsetNameException e) {
            throw unsupportedEncoding(enc, e);
        } catch (UnsupportedCharsetException e) {
            throw unsupportedEncoding(enc, e);
        }
    }

    /**
     * Builds the UnsupportedEncodingException for {@code enc}; the class
     * has no cause-taking constructor, so the cause is set via initCause.
     */
    private static UnsupportedEncodingException unsupportedEncoding(
            String enc, Throwable cause) {
        UnsupportedEncodingException uee = new UnsupportedEncodingException(enc);
        uee.initCause(cause);
        return uee;
    }

    /**
     * Returns the uppercase hexadecimal digit for the low four bits of
     * {@code nibble}.
     */
    private static char upperHexDigit(int nibble) {
        char ch = Character.forDigit(nibble & 0xF, 16);
        // forDigit yields lowercase letters; shift a-f up to A-F.
        if (Character.isLetter(ch)) {
            ch -= caseDiff;
        }
        return ch;
    }
}
0 commit comments