From 54efa9f0c2f153239e1c48099d1289c353a7f5ca Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 12 May 2026 22:50:27 +0200 Subject: [PATCH 1/6] NUTCH-3176 URLUtil and urlnormalizer-basic: add support for IDNA2008 - URLUtil: - make IDNA2008 the default for the methods toASCII and toUNICODE - provide methods to convert host names both for IDNA2003 and IDNA2008 - urlnormalizer-basic: - convert host names using IDNA2008 if the property urlnormalizer.basic.host.idna2008 is true - refactor to share methods between URLUtil and urlnormalizer-basic --- conf/nutch-default.xml | 11 ++ ivy/ivy.xml | 1 + src/java/org/apache/nutch/util/URLUtil.java | 148 +++++++++++++++--- .../basic/BasicURLNormalizer.java | 46 +++--- .../basic/TestBasicURLNormalizer.java | 40 +++++ .../org/apache/nutch/util/TestURLUtil.java | 58 ++++++- 6 files changed, 256 insertions(+), 48 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index cc0e8d4388..f26ad419c9 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1556,6 +1556,17 @@ + + urlnormalizer.basic.host.idna2008 + false + If true, let urlnormalizer-basic + normalize Internationalized Domain Names (IDNs) using the + standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490). + Note that urlnormalizer.basic.host.idn must be set, otherwise + this property has no effect. 
+ + + urlnormalizer.basic.host.trim-trailing-dot false diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 70763adc02..944a329a01 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -118,6 +118,7 @@ + diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 44c6309d2a..ec5aefbe46 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.util; +import java.lang.invoke.MethodHandles; import java.net.IDN; import java.net.MalformedURLException; import java.net.URI; @@ -23,11 +24,22 @@ import java.util.Locale; import java.util.regex.Pattern; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.ibm.icu.text.IDNA; + import crawlercommons.domains.EffectiveTldFinder; /** Utility class for URL analysis */ public class URLUtil { + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private static final IDNA idna = IDNA.getUTS46Instance( + IDNA.NONTRANSITIONAL_TO_ASCII | IDNA.NONTRANSITIONAL_TO_UNICODE); + /** * Resolve relative URL-s and fix a java.net.URL error in handling of URLs * with pure query targets. 
@@ -520,17 +532,29 @@ public static String getProtocol(URL url) { return url.getProtocol(); } + public static boolean isAscii(String str) { + char[] chars = str.toCharArray(); + for (char c : chars) { + if (c > 127) { + return false; + } + } + return true; + } + public static String toASCII(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty()) { - // no host name => no punycoded domain name - // also do not add additional slashes for file: URLs (NUTCH-1880) + if (host == null || host.isEmpty() || isAscii(host)) { + // - no host name => no punycoded domain name + // - also do not add additional slashes for file: URLs (NUTCH-1880) + // - do nothing if host is already ASCII-only return url; } - URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host), - u.getPort(), u.getPath(), u.getQuery(), u.getRef()); + URI p = new URI(u.getProtocol(), u.getUserInfo(), + convertIDNA2008(u.getHost(), true), u.getPort(), u.getPath(), + u.getQuery(), u.getRef()); return p.toString(); } catch (Exception e) { @@ -542,9 +566,10 @@ public static String toUNICODE(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty()) { - // no host name => no punycoded domain name - // also do not add additional slashes for file: URLs (NUTCH-1880) + if (host == null || host.isEmpty() || !host.contains("xn--")) { + // - no host name => no punycoded domain name + // - also do not add additional slashes for file: URLs (NUTCH-1880) + // - ??? 
return url; } StringBuilder sb = new StringBuilder(); @@ -554,7 +579,7 @@ public static String toUNICODE(String url) { sb.append(u.getUserInfo()); sb.append('@'); } - sb.append(IDN.toUnicode(host)); + sb.append(convertIDNA2008(u.getHost(), false)); if (u.getPort() != -1) { sb.append(':'); sb.append(u.getPort()); @@ -572,22 +597,83 @@ public static String toUNICODE(String url) { } /** - * For testing - * @param args print with no args to get help + * Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN} + * class. + * + * The conversion supports only IDNA2003, it does not support IDNA2008. + * However, unless the parameter strictIDNA2003 is true, the + * methods {@link IDN#toASCII(String, int)} resp. + * {@link IDN#toUnicode(String, int)} are called passing the flag + * {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on + * characters not in the repertoire of Unicode 3.2. + * + * @param host + * host name to be converted + * @param toAscii + * if true convert to ASCII, otherwise to Unicode + * @param strictIDNA2003 + * if true, do not pass {@link IDN#ALLOW_UNASSIGNED} (strict IDNA2003) + * @return converted host name + * @throws MalformedURLException + * if the conversion fails */ - public static void main(String[] args) { - - if (args.length != 1) { - System.err.println("Usage : URLUtil "); - return; + public static String convertIDNA2003(String host, boolean toAscii, + boolean strictIDNA2003) throws MalformedURLException { + try { + if (toAscii) { + return IDN.toASCII(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED); + } else { + return IDN.toUnicode(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED); + } + } catch (IllegalArgumentException | IndexOutOfBoundsException e) { + // IllegalArgumentException: thrown if the input string contains + // non-convertible Unicode codepoints + // IndexOutOfBoundsException: thrown (undocumented) if one "label" + // (non-ASCII dot-separated segment) is longer than 256 characters, + // cf. 
https://bugs.openjdk.java.net/browse/JDK-6806873 + LOG.debug("Failed to convert IDN host {}: ", host, e); + throw (MalformedURLException) new MalformedURLException( + "Invalid IDN " + host + ": " + e.getMessage()).initCause(e); } + } - String url = args[0]; - try { - System.out.println(URLUtil.getDomainName(new URL(url))); - } catch (MalformedURLException ex) { - ex.printStackTrace(); + /** + * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class. + * + * The conversion supports IDNA2008 names. + * + * @param host + * host name to be converted + * @param toAscii + * if true convert to ASCII, otherwise to Unicode + * @return converted host name + * @throws MalformedURLException + * if the conversion fails + */ + public static String convertIDNA2008(String host, boolean toAscii) + throws MalformedURLException { + final IDNA.Info idnaInfo = new IDNA.Info(); + final StringBuilder hostConverted = new StringBuilder(); + if (toAscii) { + idna.nameToASCII(host, hostConverted, idnaInfo); + } else { + idna.nameToUnicode(host, hostConverted, idnaInfo); + } + if (idnaInfo.hasErrors()) { + StringBuilder msg = new StringBuilder(); + for (IDNA.Error error : idnaInfo.getErrors()) { + if (msg.length() == 0) { + msg.append("Invalid IDNA2008 host").append(host).append(": "); + } else { + msg.append(", "); + } + msg.append(error.name()); + } + String errorMsg = msg.toString(); + LOG.debug("Failed to convert IDN host {}: {}", host, errorMsg); + throw new MalformedURLException(errorMsg); } + return hostConverted.toString(); } /** @@ -610,4 +696,24 @@ public static boolean isHomePageOf(URL url, String hostName) { && url.getRef() == null // && url.getUserInfo() == null; } + + /** + * For testing + * @param args print with no args to get help + */ + public static void main(String[] args) { + + if (args.length != 1) { + System.err.println("Usage : URLUtil "); + System.err.println("\nExtract and print pay-level domain names for the input URL"); + return; + } + + String url = 
args[0]; + try { + System.out.println(URLUtil.getDomainName(new URL(url))); + } catch (MalformedURLException ex) { + ex.printStackTrace(); + } + } } diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 2123d8fa9f..165ee548c5 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -16,12 +16,11 @@ */ package org.apache.nutch.net.urlnormalizer.basic; -import java.lang.invoke.MethodHandles; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; -import java.net.IDN; +import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; @@ -36,6 +35,7 @@ import org.apache.nutch.net.URLNormalizer; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,6 +47,12 @@ *
  • normalize * percent-encoding in URL paths
  • + *
  • normalize the host name if it is an Internationalized Domain Name (IDN) + * to ASCII or Unicode, depending on the configuration properties + * urlnormalizer.basic.host.idn and + * urlnormalizer.basic.host.idna2008
  • + *
  • remove a trailing dot in the host name (if the property + * urlnormalizer.basic.host.trim-trailing-dot is true)
  • * */ public class BasicURLNormalizer implements URLNormalizer { @@ -54,6 +60,7 @@ public class BasicURLNormalizer implements URLNormalizer { .getLogger(MethodHandles.lookup().lookupClass()); public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn"; + public final static String NORM_HOST_IDNA_2008 = "urlnormalizer.basic.host.idna2008"; public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot"; /** @@ -132,20 +139,11 @@ private static boolean isHexCharacter(int c) { || (0x30 <= c && c <= 0x39); } - private static boolean isAscii(String str) { - char[] chars = str.toCharArray(); - for (char c : chars) { - if (c > 127) { - return false; - } - } - return true; - } - private Configuration conf; private boolean hostIDNtoASCII; private boolean hostASCIItoIDN; + private boolean hostIDNA2008; private boolean hostTrimTrailingDot; @Override @@ -162,6 +160,7 @@ public void setConf(Configuration conf) { } else if (normIdn.equalsIgnoreCase("toUnicode")) { hostASCIItoIDN = true; } + hostIDNA2008 = conf.getBoolean(NORM_HOST_IDNA_2008, false); hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false); } @@ -429,21 +428,18 @@ private String normalizeHostName(String host) throws MalformedURLException { // 3. if configured: convert between Unicode and ASCII forms // for Internationalized Domain Names (IDNs) - if (hostIDNtoASCII && !isAscii(host)) { - try { - host = IDN.toASCII(host); - } catch (IllegalArgumentException | IndexOutOfBoundsException e) { - // IllegalArgumentException: thrown if the input string contains - // non-convertible Unicode codepoints - // IndexOutOfBoundsException: thrown (undocumented) if one "label" - // (non-ASCII dot-separated segment) is longer than 256 characters, - // cf. 
https://bugs.openjdk.java.net/browse/JDK-6806873 - LOG.debug("Failed to convert IDN host {}: ", host, e); - throw (MalformedURLException) new MalformedURLException( - "Invalid IDN " + host + ": " + e.getMessage()).initCause(e); + if (hostIDNtoASCII && !URLUtil.isAscii(host)) { + if (hostIDNA2008) { + host = URLUtil.convertIDNA2008(host, true); + } else { + host = URLUtil.convertIDNA2003(host, true, false); } } else if (hostASCIItoIDN && host.contains("xn--")) { - host = IDN.toUnicode(host); + if (hostIDNA2008) { + host = URLUtil.convertIDNA2008(host, false); + } else { + host = URLUtil.convertIDNA2003(host, false, false); + } } // 4. optionally trim a trailing dot diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index a6bad41f2e..1d3813ac86 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -274,6 +274,46 @@ public void testHostName() throws Exception { "https://www.example.org/"); } + /** + * Test for IDNA2008 and IDNA2003 compatibility. 
+ */ + @Test + public void testHostNameIDNA2008() throws Exception { + // IDNA2008 (https://www.rfc-editor.org/rfc/rfc5890.html#section-1.1) + BasicURLNormalizer norm = new BasicURLNormalizer(); + conf = NutchConfiguration.create(); + conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii"); + norm.setConf(conf); + + // IDNA2003 / RFC 3490 + // Note: IDNA2008 and IDNA2003 deviate for this example + normalizeTest(norm, "https://straße.de/", "https://strasse.de/"); + + // Verify that characters not in Unicode 3.2 do not fail the normalization + normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/"); + + // IDNA2008 / RFC 5890 + conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, true); + norm.setConf(conf); + // Note: this is different from IDNA2003 + normalizeTest(norm, "https://straße.de/", "https://xn--strae-oqa.de/"); + + // Verify that characters not in Unicode 3.2 do not fail the normalization + normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/"); + + // test conversion to Unicode (IDNA2008) + conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); + norm.setConf(conf); + normalizeTest(norm, "https://xn--strae-oqa.de/", "https://straße.de/"); + normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + + // test conversion to Unicode (IDNA2003) + conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, false); + norm.setConf(conf); + normalizeTest(norm, "https://xn--strae-oqa.de/", "https://xn--strae-oqa.de/"); + normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + } + /** * Test that normalizer throws MalformedURLException for invalid URLs */ diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index 092edb9c18..b0f036a1c9 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -16,11 +16,16 @@ */ package org.apache.nutch.util; +import 
java.net.MalformedURLException; import java.net.URL; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; /** Test class for URLUtil */ public class TestURLUtil { @@ -312,7 +317,12 @@ public void testToUNICODE() throws Exception { assertEquals("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1", URLUtil .toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1")); - + // do not fail on characters not in Unicode 3.2 + assertEquals("https://example.ᬩᬮᬶ.id/", + URLUtil.toUNICODE("https://example.xn--9tfky.id/")); + // IDNA2008 + assertEquals("http://straße.de/", + URLUtil.toUNICODE("http://xn--strae-oqa.de/")); } @Test @@ -324,6 +334,50 @@ public void testToASCII() throws Exception { assertEquals("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1", URLUtil .toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1")); + // IDNA2003 + // assertEquals("http://strasse.de/", + // URLUtil.toASCII("http://straße.de/")); + // do not fail on characters not in Unicode 3.2 + assertEquals("https://example.xn--9tfky.id/", + URLUtil.toASCII("https://example.ᬩᬮᬶ.id/")); + // IDNA2008 + assertEquals("http://xn--strae-oqa.de/", + URLUtil.toASCII("http://straße.de/")); + } + + @ParameterizedTest + @CsvSource({ // + "www.xn--evir-zoa.com,www.çevir.com,IDNA2003,true", // + "xn--uni-tbingen-xhb.de,uni-tübingen.de,IDNA2003,true", // + "example.xn--9tfky.id,example.ᬩᬮᬶ.id,IDNA2008,true", // + "xn--53h.example,☕.example,IDNA2008,true", // + "xn--0ca.xn--ssa73l,à.א̈,IDNA2008,true", // + // Note: IDNA2008 and IDNA2003 deviate for the following example + "xn--strae-oqa.de,straße.de,IDNA2008,true", // + 
"strasse.de,straße.de,IDNA2003,false", // + "strasse.de,strasse.de,IDNA2003,true", // + }) + public final void testConvertHost(String ascii, String unicode, String type, + boolean roundTrip) throws Exception { + if ("IDNA2008".equals(type)) { + assertEquals(ascii, URLUtil.convertIDNA2008(unicode, true)); + assertEquals(unicode, URLUtil.convertIDNA2008(ascii, false)); + try { + assertNotNull(URLUtil.convertIDNA2003(unicode, true, false)); + } catch (MalformedURLException e) { + /* + * Ok. A IDNA2008 input may raise an exception when using the IDNA2003 + * method + */ + } + } else if ("IDNA2003".equals(type)) { + assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, true)); + assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, false)); + if (roundTrip) { + assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, true)); + assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, false)); + } + } } @Test From 66f55e1e427e400de5ec1d84f7332b241cfc3c61 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 13 May 2026 11:02:47 +0200 Subject: [PATCH 2/6] NUTCH-3176 URLUtil and urlnormalizer-basic: add support for IDNA2008 URLUtil: also convert host if not already lowercased --- src/java/org/apache/nutch/util/URLUtil.java | 35 ++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index ec5aefbe46..454def1309 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -542,18 +542,28 @@ public static boolean isAscii(String str) { return true; } + /** + * Convert URL with IDN host/domain name into the ASCII representation. + * + * @param url + * URL string to convert + * @return URL string with ASCII host/domain name or null if conversion fails. 
+ */ public static String toASCII(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty() || isAscii(host)) { + String hostLowerCase = host.toLowerCase(Locale.ROOT); + if (host == null || host.isEmpty() + || (isAscii(host) && host.equals(hostLowerCase))) { // - no host name => no punycoded domain name // - also do not add additional slashes for file: URLs (NUTCH-1880) // - do nothing if host is already ASCII-only + // - not already in lowercase => conversion also lowercases host name return url; } URI p = new URI(u.getProtocol(), u.getUserInfo(), - convertIDNA2008(u.getHost(), true), u.getPort(), u.getPath(), + convertIDNA2008(hostLowerCase, true), u.getPort(), u.getPath(), u.getQuery(), u.getRef()); return p.toString(); @@ -562,14 +572,25 @@ public static String toASCII(String url) { } } + /** + * Convert URL with IDN host/domain name to the Unicode representation. + * + * @param url + * URL string to convert + * @return URL string with Unicode host/domain name or null if conversion + * fails. + */ public static String toUNICODE(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty() || !host.contains("xn--")) { + String hostLowerCase = host.toLowerCase(Locale.ROOT); + if (host == null || host.isEmpty() + || (!hostLowerCase.contains("xn--") && host.equals(hostLowerCase))) { // - no host name => no punycoded domain name // - also do not add additional slashes for file: URLs (NUTCH-1880) - // - ??? 
+ // - contains 'xn--' => needs conversion + // - not already in lowercase => conversion also lowercases host name return url; } StringBuilder sb = new StringBuilder(); @@ -579,7 +600,7 @@ public static String toUNICODE(String url) { sb.append(u.getUserInfo()); sb.append('@'); } - sb.append(convertIDNA2008(u.getHost(), false)); + sb.append(convertIDNA2008(hostLowerCase, false)); if (u.getPort() != -1) { sb.append(':'); sb.append(u.getPort()); @@ -608,7 +629,7 @@ public static String toUNICODE(String url) { * characters not in the repertoire of Unicode 3.2. * * @param host - * host name to be converted + * host name to be converted (lowercase expected) * @param toAscii * if true convert to ASCII, otherwise to Unicode * @param strictIDNA2003 @@ -643,7 +664,7 @@ public static String convertIDNA2003(String host, boolean toAscii, * The conversion supports IDNA2008 names. * * @param host - * host name to be converted + * host name to be converted (lowercase expected) * @param toAscii * if true convert to ASCII, otherwise to Unicode * @return converted host name From 36cc2302917ac59998297d4ae9ac3c9c9662b0ca Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 13 May 2026 11:04:48 +0200 Subject: [PATCH 3/6] Refactor calls of URLDecoder and pass Charset instead of String (since Java 10) --- .../apache/nutch/crawl/DeduplicationJob.java | 9 ++++----- .../nutch/plugin/PluginManifestParser.java | 7 ++----- .../nutch/protocol/file/FileResponse.java | 17 ++++++----------- .../apache/nutch/protocol/ftp/FtpResponse.java | 3 ++- .../urlnormalizer/ajax/AjaxURLNormalizer.java | 17 +++++++---------- .../urlnormalizer/basic/BasicURLNormalizer.java | 11 +++++------ 6 files changed, 26 insertions(+), 38 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index 50aa4cd7bd..52bf422308 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ 
b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -17,9 +17,9 @@ package org.apache.nutch.crawl; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; import java.net.URLDecoder; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -34,7 +34,6 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Counter; -import org.apache.hadoop.mapreduce.CounterGroup; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -70,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool { protected final static Text urlKey = new Text("_URLTEMPKEY_"); protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode"; protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order"; - protected final static String UTF_8 = StandardCharsets.UTF_8.toString(); + protected final static Charset UTF_8 = StandardCharsets.UTF_8; public static class DBFilter extends Mapper { @@ -224,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) { String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); try { urlExisting = URLDecoder.decode(urlExisting, UTF_8); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + } catch (IllegalArgumentException e) { LOG.error("Error decoding: {}", urlExisting, e); // use the encoded URL } try { urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + } catch (IllegalArgumentException e) { LOG.error("Error decoding: {}", urlnewDoc, e); // use the encoded URL } diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java index 
10ce4fdb7b..95208fa433 100644 --- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java +++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java @@ -18,11 +18,11 @@ import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -124,10 +124,7 @@ public File getPluginFolder(String name) { String path = url.getPath(); if (WINDOWS && path.startsWith("/")) // patch a windows bug path = path.substring(1); - try { - path = URLDecoder.decode(path, "UTF-8"); // decode the url path - } catch (UnsupportedEncodingException e) { - } + path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path directory = new File(path); } else if (!directory.exists()) { LOG.warn("Plugins: directory not found: {}", name); diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java index c50988c2dd..64186b9035 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java @@ -16,20 +16,18 @@ */ package org.apache.nutch.protocol.file; -import java.net.URL; import java.io.IOException; -import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.protocol.Content; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; - +import org.apache.nutch.protocol.Content; import org.apache.tika.Tika; -import 
org.apache.hadoop.conf.Configuration; - /** * FileResponse.java mimics file replies as http response. It tries its best to * follow http's way for headers, response codes as well as exceptions. @@ -125,11 +123,8 @@ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) String path = url.getPath().isEmpty() ? "/" : url.getPath(); - try { - // specify the encoding via the config later? - path = java.net.URLDecoder.decode(path, "UTF-8"); - } catch (UnsupportedEncodingException ex) { - } + // specify the encoding via the config later? + path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8); try { diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java index 8796cfc0b3..0d7ad1b289 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java @@ -30,6 +30,7 @@ import java.net.InetAddress; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.LinkedList; import java.io.ByteArrayOutputStream; @@ -245,7 +246,7 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) this.content = null; - path = java.net.URLDecoder.decode(path, "UTF-8"); + path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8); if (path.endsWith("/")) { getDirAsHttpResponse(path, datum.getModifiedTime()); diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java index 2342ced68f..5518e39544 100644 --- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java +++ 
b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java @@ -16,19 +16,18 @@ */ package org.apache.nutch.net.urlnormalizer.ajax; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; -import java.net.MalformedURLException; -import java.nio.charset.Charset; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLNormalizer; import org.apache.nutch.net.URLNormalizers; -import org.apache.hadoop.conf.Configuration; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * URLNormalizer capable of dealing with AJAX URL's. * @@ -43,13 +42,11 @@ public class AjaxURLNormalizer implements URLNormalizer { public static String ESCAPED_URL_PART = "_escaped_fragment_="; private Configuration conf; - private Charset utf8; /** * Default constructor. 
*/ public AjaxURLNormalizer() { - utf8 = Charset.forName("UTF-8"); } /** @@ -195,7 +192,7 @@ protected String escape(String fragmentPart) { String hex = null; StringBuilder sb = new StringBuilder(fragmentPart.length()); - for (byte b : fragmentPart.getBytes(utf8)) { + for (byte b : fragmentPart.getBytes(UTF_8)) { if (b < 33) { sb.append('%'); diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 165ee548c5..2a6fd64000 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -19,7 +19,6 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URISyntaxException; @@ -77,7 +76,7 @@ public class BasicURLNormalizer implements URLNormalizer { .compile("%([0-9A-Fa-f]{2})"); // charset used for encoding URLs before escaping - private final static Charset utf8 = StandardCharsets.UTF_8; + private final static Charset UTF_8 = StandardCharsets.UTF_8; /** look-up table for characters which should not be escaped in URL paths */ private final static boolean[] unescapedCharacters = new boolean[128]; @@ -363,7 +362,7 @@ private String escapePath(String path) { StringBuilder sb = new StringBuilder(path.length()); // Traverse over all bytes in this URL - byte[] bytes = path.getBytes(utf8); + byte[] bytes = path.getBytes(UTF_8); for (int i = 0; i < bytes.length; i++) { byte b = bytes[i]; // Is this a control character? @@ -414,8 +413,8 @@ private String normalizeHostName(String host) throws MalformedURLException { // 1. 
unescape percent-encoded characters in host name if (host.indexOf('%') != -1) { try { - host = URLDecoder.decode(host, StandardCharsets.UTF_8.toString()); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + host = URLDecoder.decode(host, UTF_8); + } catch (IllegalArgumentException e) { LOG.debug("Failed to convert percent-encoded host name {}: ", host, e); throw (MalformedURLException) new MalformedURLException( "Invalid percent-encoded host name " + host + ": " + e.getMessage()) @@ -462,7 +461,7 @@ public static void main(String args[]) throws IOException { } String line, normUrl; BufferedReader in = new BufferedReader( - new InputStreamReader(System.in, utf8)); + new InputStreamReader(System.in, UTF_8)); while ((line = in.readLine()) != null) { try { normUrl = normalizer.normalize(line, scope); From 947cd288f5b301a55ec6d502080d5fc28dc82a62 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 13 May 2026 11:16:15 +0200 Subject: [PATCH 4/6] NUTCH-3176 URLUtil and urlnormalizer-basic: add support for IDNA2008 urlnormalizer-basic: properly set direction of conversion (to ASCII or to Unicode) if setConf(conf) is called repeatedly --- .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 2a6fd64000..4ff9fc64a6 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -156,7 +156,9 @@ public void setConf(Configuration conf) { String normIdn = conf.get(NORM_HOST_IDN, ""); if (normIdn.equalsIgnoreCase("toAscii")) { hostIDNtoASCII = true; + hostASCIItoIDN = false; } else if 
(normIdn.equalsIgnoreCase("toUnicode")) { + hostIDNtoASCII = false; hostASCIItoIDN = true; } hostIDNA2008 = conf.getBoolean(NORM_HOST_IDNA_2008, false); From 84699a62c21ab8df6b78429ac325cd84c985a01d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 13 May 2026 11:19:28 +0200 Subject: [PATCH 5/6] NUTCH-3176 URLUtil and urlnormalizer-basic: add support for IDNA2008 Extend unit tests. --- .../basic/TestBasicURLNormalizer.java | 26 +++++++ .../org/apache/nutch/util/TestURLUtil.java | 76 +++++++++++++++++-- 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 1d3813ac86..090c25f2da 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -258,15 +258,24 @@ public void testHostName() throws Exception { // test Internationalized Domain Names BasicURLNormalizer norm = new BasicURLNormalizer(); conf = NutchConfiguration.create(); + + // to ASCII normalization conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii"); norm.setConf(conf); normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/"); // verify escaping of percent-encoded characters in IDNs (NUTCH-2824) normalizeTest(norm, "https://www.0251-sachverst%c3%a4ndiger.de/", "https://www.xn--0251-sachverstndiger-ozb.de/"); + // verify that host names with uppercase characters are normalized + normalizeTest(norm, "https://нЭб.РФ/", "https://xn--90ax2c.xn--p1ai/"); + + // to Unicode normalization conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); norm.setConf(conf); normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/"); + // verify that host names with uppercase characters 
are normalized + normalizeTest(norm, "https://Xn--90Ax2c.xN--P1ai/", "https://нэб.рф/"); + // test removal of trailing dot conf.setBoolean(BasicURLNormalizer.NORM_HOST_TRIM_TRAILING_DOT, true); norm.setConf(conf); @@ -301,17 +310,34 @@ public void testHostNameIDNA2008() throws Exception { // Verify that characters not in Unicode 3.2 do not fail the normalization normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/"); + // mixed encodings (Unicode, Punycode, percent encoding) + normalizeTest(norm, "https://xn--p1ai.%D1%80%D1%84/", + "https://xn--p1ai.xn--p1ai/"); + normalizeTest(norm, "https://xn--p1ai.рф/", "https://xn--p1ai.xn--p1ai/"); + // test conversion to Unicode (IDNA2008) conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); norm.setConf(conf); normalizeTest(norm, "https://xn--strae-oqa.de/", "https://straße.de/"); normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + // mixed encodings (Unicode, Punycode, percent encoding), mixed case + normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/"); + normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/"); + normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/"); + normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/"); + // test conversion to Unicode (IDNA2003) conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, false); norm.setConf(conf); normalizeTest(norm, "https://xn--strae-oqa.de/", "https://xn--strae-oqa.de/"); normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + + // mixed encodings (Unicode, Punycode, percent encoding), mixed case + normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/"); + normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/"); + normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/"); + normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", 
"https://гоголь.рф.рф/"); } /** diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index b0f036a1c9..200ea59a06 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -16,6 +16,12 @@ */ package org.apache.nutch.util; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + import java.net.MalformedURLException; import java.net.URL; @@ -23,10 +29,6 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; - /** Test class for URLUtil */ public class TestURLUtil { @@ -323,6 +325,14 @@ public void testToUNICODE() throws Exception { // IDNA2008 assertEquals("http://straße.de/", URLUtil.toUNICODE("http://xn--strae-oqa.de/")); + // host names with uppercase characters + assertEquals("https://googie.com/", + URLUtil.toUNICODE("https://googIe.com/")); + assertEquals("https://googie.com/", URLUtil.toASCII("https://googIe.com/")); + assertEquals("https://xn--90ax2c.xn--p1ai/", + URLUtil.toASCII("https://нЭб.РФ/")); + assertEquals("https://нэб.рф/", + URLUtil.toUNICODE("https://Xn--90Ax2c.xN--P1ai/")); } @Test @@ -350,15 +360,39 @@ public void testToASCII() throws Exception { "www.xn--evir-zoa.com,www.çevir.com,IDNA2003,true", // "xn--uni-tbingen-xhb.de,uni-tübingen.de,IDNA2003,true", // "example.xn--9tfky.id,example.ᬩᬮᬶ.id,IDNA2008,true", // + // Test examples from whatwg-url "xn--53h.example,☕.example,IDNA2008,true", // "xn--0ca.xn--ssa73l,à.א̈,IDNA2008,true", // - // 
Note: IDNA2008 and IDNA2003 deviate for the following example + "xn--mgba3gch31f060k.com,\u0646\u0627\u0645\u0647\u200c\u0627\u06cc.com,IDNA2008,true", // + /* Note: IDNA2008 and IDNA2003 deviate for the following examples, + * cf. https://www.unicode.org/reports/tr46/#IDNA2003-Section */ "xn--strae-oqa.de,straße.de,IDNA2008,true", // "strasse.de,straße.de,IDNA2003,false", // "strasse.de,strasse.de,IDNA2003,true", // + "xn--fa-hia.de,faß.de,IDNA2008,true", // + "fass.de,faß.de,IDNA2003,false", // + "fass.de,fass.de,IDNA2003,true", // + "xn--nxasmm1c.com,βόλος.com,IDNA2008,true", // + "xn--nxasmq6b.com,βόλος.com,IDNA2003,false", // + "xn--nxasmq6b.com,βόλοσ.com,IDNA2003,true", // + "xn--10cl1a0b660p.com,ශ්‍රී.com,IDNA2008,true", // + "xn--10cl1a0b.com,ශ්‍රී.com,IDNA2003,false", // + "xn--10cl1a0b.com,ශ්රී.com,IDNA2003,true", // + "xn--mgba3gch31f060k.com,نامه‌ای.com,IDNA2008,true", // + "xn--mgba3gch31f.com,نامه‌ای.com,IDNA2003,false", // + "xn--mgba3gch31f.com,نامهای.com,IDNA2003,true", // + // mixed lowercase/uppercase: no round trip conversion + "xn--bb-eka.at,ÖBB.at,IDNA2003,false", // + "xn--bb-eka.at,öbb.at,IDNA2003,true", // + // mixed encoding (Punycode and Unicode) + "xn--p1ai.xn--p1ai,рф.xn--p1ai,IDNA2003,false", // + "xn--p1ai.xn--p1ai,xn--p1ai.рф,IDNA2003,false", // + // percent-encoding is not supported + // "xn--p1ai.xn--p1ai,xn--p1ai.%D1%80%D1%84,IDNA2003,false", // }) public final void testConvertHost(String ascii, String unicode, String type, boolean roundTrip) throws Exception { + System.out.println(ascii + " <> " + unicode); if ("IDNA2008".equals(type)) { assertEquals(ascii, URLUtil.convertIDNA2008(unicode, true)); assertEquals(unicode, URLUtil.convertIDNA2008(ascii, false)); @@ -380,6 +414,38 @@ public final void testConvertHost(String ascii, String unicode, String type, } } + @Test + public final void testConvertHostInvalid() { + // broken Punycode + assertDoesNotThrow(() -> assertEquals("xn--xn--bss-7z6ccid.com", + 
URLUtil.convertIDNA2003("xn--xn--bss-7z6ccid.com", false, true))); + + // invalid Punycode + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--0.pt", false)); + + // IDNA2003 not allowing characters not in Unicode 3.2 + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2003("☕.example", true, true)); + assertDoesNotThrow(() -> assertEquals("xn--53h.example", + URLUtil.convertIDNA2003("xn--53h.example", false, true))); + + // IDNA2008 invalid, + // cf. https://www.unicode.org/reports/tr46/#Implementation_Notes + // cf. https://www.unicode.org/Public/17.0.0/idna/IdnaTestV2.txt + // disallowed character: ⒈ (U+2488 - DIGIT ONE FULL STOP) + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("\u2488com", true)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--acom-0w1b", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--xn--a--gua.pt", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--a-ä.pt", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--a-ä.pt", true)); + } + @Test public void testFileProtocol() throws Exception { // keep one single slash NUTCH-1483 From afb943960138d7f81017ab4bbf558072e4f89db0 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 13 May 2026 22:26:42 +0200 Subject: [PATCH 6/6] Format source code --- src/java/org/apache/nutch/util/URLUtil.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 454def1309..fd036480a6 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -544,7 +544,7 @@ public static boolean isAscii(String str) { /** * Convert URL with IDN host/domain name into the ASCII representation. 
- * + * * @param url * URL string to convert * @return URL string with ASCII host/domain name or null if conversion fails. @@ -574,7 +574,7 @@ public static String toASCII(String url) { /** * Convert URL with IDN host/domain name to the Unicode representation. - * + * * @param url * URL string to convert * @return URL string with Unicode host/domain name or null if conversion @@ -620,14 +620,14 @@ public static String toUNICODE(String url) { /** * Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN} * class. - * + * * The conversion supports only IDNA2003, it does not support IDNA2008. * However, unless the parameter strictIDNA2003 is true, the * methods {@link IDN#toASCII(String, int)} resp. * {@link IDN#toUnicode(String, int)} are called passing the flag * {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on * characters not in the repertoire of Unicode 3.2. - * + * * @param host * host name to be converted (lowercase expected) * @param toAscii @@ -660,9 +660,9 @@ public static String convertIDNA2003(String host, boolean toAscii, /** * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class. - * + * * The conversion supports IDNA2008 names. - * + * * @param host * host name to be converted (lowercase expected) * @param toAscii