diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index cc0e8d4388..f26ad419c9 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1556,6 +1556,17 @@
+
+ urlnormalizer.basic.host.idna2008
+ false
+ If true, let urlnormalizer-basic
+ normalize Internationalized Domain Names (IDNs) using the
+ standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490).
+ Note that urlnormalizer.basic.host.idn must be set, otherwise
+ this property has no effect.
+
+
+
urlnormalizer.basic.host.trim-trailing-dot
false
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 70763adc02..944a329a01 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -118,6 +118,7 @@
+
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 50aa4cd7bd..52bf422308 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -17,9 +17,9 @@
package org.apache.nutch.crawl;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
+import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
@@ -34,7 +34,6 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -70,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
protected final static Text urlKey = new Text("_URLTEMPKEY_");
protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
- protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
+ protected final static Charset UTF_8 = StandardCharsets.UTF_8;
public static class DBFilter extends
Mapper {
@@ -224,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
try {
urlExisting = URLDecoder.decode(urlExisting, UTF_8);
- } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ } catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlExisting, e);
// use the encoded URL
}
try {
urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
- } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ } catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlnewDoc, e);
// use the encoded URL
}
diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
index 10ce4fdb7b..95208fa433 100644
--- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java
+++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -18,11 +18,11 @@
import java.io.File;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
@@ -124,10 +124,7 @@ public File getPluginFolder(String name) {
String path = url.getPath();
if (WINDOWS && path.startsWith("/")) // patch a windows bug
path = path.substring(1);
- try {
- path = URLDecoder.decode(path, "UTF-8"); // decode the url path
- } catch (UnsupportedEncodingException e) {
- }
+ path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path
directory = new File(path);
} else if (!directory.exists()) {
LOG.warn("Plugins: directory not found: {}", name);
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 44c6309d2a..fd036480a6 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.util;
+import java.lang.invoke.MethodHandles;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URI;
@@ -23,11 +24,22 @@
import java.util.Locale;
import java.util.regex.Pattern;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.IDNA;
+
import crawlercommons.domains.EffectiveTldFinder;
/** Utility class for URL analysis */
public class URLUtil {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ private static final IDNA idna = IDNA.getUTS46Instance(
+ IDNA.NONTRANSITIONAL_TO_ASCII | IDNA.NONTRANSITIONAL_TO_UNICODE);
+
/**
* Resolve relative URL-s and fix a java.net.URL error in handling of URLs
* with pure query targets.
@@ -520,17 +532,39 @@ public static String getProtocol(URL url) {
return url.getProtocol();
}
+  /**
+   * Check whether a string consists only of ASCII characters.
+   *
+   * @param str
+   *          string to check
+   * @return true if no character of the string has a code above 127 (this
+   *         includes the empty string), false otherwise
+   */
+  public static boolean isAscii(String str) {
+    return str.chars().noneMatch(ch -> ch > 127);
+  }
+
+ /**
+ * Convert URL with IDN host/domain name into the ASCII representation.
+ *
+ * @param url
+ * URL string to convert
+ * @return URL string with ASCII host/domain name or null if conversion fails.
+ */
public static String toASCII(String url) {
try {
URL u = new URL(url);
String host = u.getHost();
- if (host == null || host.isEmpty()) {
- // no host name => no punycoded domain name
- // also do not add additional slashes for file: URLs (NUTCH-1880)
+ String hostLowerCase = host.toLowerCase(Locale.ROOT);
+ if (host == null || host.isEmpty()
+ || (isAscii(host) && host.equals(hostLowerCase))) {
+ // - no host name => no punycoded domain name
+ // - also do not add additional slashes for file: URLs (NUTCH-1880)
+ // - do nothing if host is already ASCII-only
+ // - not already in lowercase => conversion also lowercases host name
return url;
}
- URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host),
- u.getPort(), u.getPath(), u.getQuery(), u.getRef());
+ URI p = new URI(u.getProtocol(), u.getUserInfo(),
+ convertIDNA2008(hostLowerCase, true), u.getPort(), u.getPath(),
+ u.getQuery(), u.getRef());
return p.toString();
} catch (Exception e) {
@@ -538,13 +572,25 @@ public static String toASCII(String url) {
}
}
+ /**
+ * Convert URL with IDN host/domain name to the Unicode representation.
+ *
+ * @param url
+ * URL string to convert
+ * @return URL string with Unicode host/domain name or null if conversion
+ * fails.
+ */
public static String toUNICODE(String url) {
try {
URL u = new URL(url);
String host = u.getHost();
- if (host == null || host.isEmpty()) {
- // no host name => no punycoded domain name
- // also do not add additional slashes for file: URLs (NUTCH-1880)
+ String hostLowerCase = host.toLowerCase(Locale.ROOT);
+ if (host == null || host.isEmpty()
+ || (!hostLowerCase.contains("xn--") && host.equals(hostLowerCase))) {
+ // - no host name => no punycoded domain name
+ // - also do not add additional slashes for file: URLs (NUTCH-1880)
+ // - contains 'xn--' => needs conversion
+ // - not already in lowercase => conversion also lowercases host name
return url;
}
StringBuilder sb = new StringBuilder();
@@ -554,7 +600,7 @@ public static String toUNICODE(String url) {
sb.append(u.getUserInfo());
sb.append('@');
}
- sb.append(IDN.toUnicode(host));
+ sb.append(convertIDNA2008(hostLowerCase, false));
if (u.getPort() != -1) {
sb.append(':');
sb.append(u.getPort());
@@ -572,22 +618,83 @@ public static String toUNICODE(String url) {
}
/**
- * For testing
- * @param args print with no args to get help
+ * Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN}
+ * class.
+ *
+ * The conversion supports only IDNA2003, it does not support IDNA2008.
+ * However, unless the parameter strictIDNA2003 is true, the
+ * methods {@link IDN#toASCII(String, int)} resp.
+ * {@link IDN#toUnicode(String, int)} are called passing the flag
+ * {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on
+ * characters not in the repertoire of Unicode 3.2.
+ *
+ * @param host
+ * host name to be converted (lowercase expected)
+ * @param toAscii
+ * if true convert to ASCII, otherwise to Unicode
+ * @param strictIDNA2003
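+ * if true, do not pass the flag {@link IDN#ALLOW_UNASSIGNED}, i.e.
+ * let the conversion fail on characters not in the repertoire of
+ * Unicode 3.2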
+ * @return converted host name
+ * @throws MalformedURLException
+ * if the conversion fails
*/
- public static void main(String[] args) {
-
- if (args.length != 1) {
- System.err.println("Usage : URLUtil ");
- return;
+  public static String convertIDNA2003(String host, boolean toAscii,
+      boolean strictIDNA2003) throws MalformedURLException {
+    // unless strict: pass ALLOW_UNASSIGNED to avoid that the conversion fails
+    // on characters not in the repertoire of Unicode 3.2
+    final int idnFlags = strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED;
+    try {
+      return toAscii ? IDN.toASCII(host, idnFlags)
+          : IDN.toUnicode(host, idnFlags);
+    } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
+      // IllegalArgumentException: the input string contains Unicode
+      // codepoints which cannot be converted
+      // IndexOutOfBoundsException: (undocumented) one "label" (non-ASCII
+      // dot-separated segment) is longer than 256 characters,
+      // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
+      LOG.debug("Failed to convert IDN host {}: ", host, e);
+      final MalformedURLException malformed = new MalformedURLException(
+          "Invalid IDN " + host + ": " + e.getMessage());
+      // MalformedURLException has no constructor taking a cause
+      malformed.initCause(e);
+      throw malformed;
     }
+  }
- String url = args[0];
- try {
- System.out.println(URLUtil.getDomainName(new URL(url)));
- } catch (MalformedURLException ex) {
- ex.printStackTrace();
+  /**
+   * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class.
+   *
+   * The conversion supports IDNA2008 names.
+   *
+   * @param host
+   *          host name to be converted (lowercase expected)
+   * @param toAscii
+   *          if true convert to ASCII, otherwise to Unicode
+   * @return converted host name
+   * @throws MalformedURLException
+   *           if the conversion reports one or more errors; the exception
+   *           message contains the host name and the names of all reported
+   *           errors
+   */
+  public static String convertIDNA2008(String host, boolean toAscii)
+      throws MalformedURLException {
+    final IDNA.Info idnaInfo = new IDNA.Info();
+    final StringBuilder hostConverted = new StringBuilder();
+    if (toAscii) {
+      idna.nameToASCII(host, hostConverted, idnaInfo);
+    } else {
+      idna.nameToUnicode(host, hostConverted, idnaInfo);
     }
+    if (idnaInfo.hasErrors()) {
+      // conversion errors are collected in the Info object: join their names
+      // into a single message and fail with a MalformedURLException
+      StringBuilder msg = new StringBuilder();
+      for (IDNA.Error error : idnaInfo.getErrors()) {
+        if (msg.length() == 0) {
+          // the blank after "host" keeps the host name apart from the text
+          msg.append("Invalid IDNA2008 host ").append(host).append(": ");
+        } else {
+          msg.append(", ");
+        }
+        msg.append(error.name());
+      }
+      String errorMsg = msg.toString();
+      LOG.debug("Failed to convert IDN host {}: {}", host, errorMsg);
+      throw new MalformedURLException(errorMsg);
+    }
+    return hostConverted.toString();
   }
/**
@@ -610,4 +717,24 @@ public static boolean isHomePageOf(URL url, String hostName) {
&& url.getRef() == null //
&& url.getUserInfo() == null;
}
+
+  /**
+   * Command-line entry point for testing: print the domain name of a URL as
+   * returned by {@link #getDomainName(URL)}.
+   *
+   * @param args
+   *          exactly one argument, the URL; with any other number of
+   *          arguments a usage message is printed to stderr
+   */
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      System.err.println("Usage : URLUtil ");
+      System.err.println("\nExtract and print pay-level domain names for the input URL");
+      return;
+    }
+
+    try {
+      final URL parsedUrl = new URL(args[0]);
+      System.out.println(URLUtil.getDomainName(parsedUrl));
+    } catch (MalformedURLException ex) {
+      ex.printStackTrace();
+    }
+  }
}
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
index c50988c2dd..64186b9035 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -16,20 +16,18 @@
*/
package org.apache.nutch.protocol.file;
-import java.net.URL;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
-
+import org.apache.nutch.protocol.Content;
import org.apache.tika.Tika;
-import org.apache.hadoop.conf.Configuration;
-
/**
* FileResponse.java mimics file replies as http response. It tries its best to
* follow http's way for headers, response codes as well as exceptions.
@@ -125,11 +123,8 @@ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
String path = url.getPath().isEmpty() ? "/" : url.getPath();
- try {
- // specify the encoding via the config later?
- path = java.net.URLDecoder.decode(path, "UTF-8");
- } catch (UnsupportedEncodingException ex) {
- }
+ // specify the encoding via the config later?
+ path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);
try {
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
index 8796cfc0b3..0d7ad1b289 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -30,6 +30,7 @@
import java.net.InetAddress;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.LinkedList;
import java.io.ByteArrayOutputStream;
@@ -245,7 +246,7 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
this.content = null;
- path = java.net.URLDecoder.decode(path, "UTF-8");
+ path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);
if (path.endsWith("/")) {
getDirAsHttpResponse(path, datum.getModifiedTime());
diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
index 2342ced68f..5518e39544 100644
--- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
+++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
@@ -16,19 +16,18 @@
*/
package org.apache.nutch.net.urlnormalizer.ajax;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
-import java.net.MalformedURLException;
-import java.nio.charset.Charset;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
-import org.apache.hadoop.conf.Configuration;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* URLNormalizer capable of dealing with AJAX URL's.
*
@@ -43,13 +42,11 @@ public class AjaxURLNormalizer implements URLNormalizer {
public static String ESCAPED_URL_PART = "_escaped_fragment_=";
private Configuration conf;
- private Charset utf8;
/**
* Default constructor.
*/
public AjaxURLNormalizer() {
- utf8 = Charset.forName("UTF-8");
}
/**
@@ -195,7 +192,7 @@ protected String escape(String fragmentPart) {
String hex = null;
StringBuilder sb = new StringBuilder(fragmentPart.length());
- for (byte b : fragmentPart.getBytes(utf8)) {
+ for (byte b : fragmentPart.getBytes(UTF_8)) {
if (b < 33) {
sb.append('%');
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 2123d8fa9f..4ff9fc64a6 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -16,12 +16,10 @@
*/
package org.apache.nutch.net.urlnormalizer.basic;
-import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.io.UnsupportedEncodingException;
-import java.net.IDN;
+import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
@@ -36,6 +34,7 @@
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -47,6 +46,12 @@
* normalize
* percent-encoding in URL paths
+ * normalize the host name if it is an Internationalized Domain Name (IDN)
+ * to ASCII or Unicode, depending on the configuration properties
+ * urlnormalizer.basic.host.idn and
+ * urlnormalizer.basic.host.idna2008
+ * remove a trailing dot in the host name (if the property
+ * urlnormalizer.basic.host.trim-trailing-dot is true)
*
*/
public class BasicURLNormalizer implements URLNormalizer {
@@ -54,6 +59,7 @@ public class BasicURLNormalizer implements URLNormalizer {
.getLogger(MethodHandles.lookup().lookupClass());
public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn";
+ public final static String NORM_HOST_IDNA_2008 = "urlnormalizer.basic.host.idna2008";
public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot";
/**
@@ -70,7 +76,7 @@ public class BasicURLNormalizer implements URLNormalizer {
.compile("%([0-9A-Fa-f]{2})");
// charset used for encoding URLs before escaping
- private final static Charset utf8 = StandardCharsets.UTF_8;
+ private final static Charset UTF_8 = StandardCharsets.UTF_8;
/** look-up table for characters which should not be escaped in URL paths */
private final static boolean[] unescapedCharacters = new boolean[128];
@@ -132,20 +138,11 @@ private static boolean isHexCharacter(int c) {
|| (0x30 <= c && c <= 0x39);
}
- private static boolean isAscii(String str) {
- char[] chars = str.toCharArray();
- for (char c : chars) {
- if (c > 127) {
- return false;
- }
- }
- return true;
- }
-
private Configuration conf;
private boolean hostIDNtoASCII;
private boolean hostASCIItoIDN;
+ private boolean hostIDNA2008;
private boolean hostTrimTrailingDot;
@Override
@@ -159,9 +156,12 @@ public void setConf(Configuration conf) {
String normIdn = conf.get(NORM_HOST_IDN, "");
if (normIdn.equalsIgnoreCase("toAscii")) {
hostIDNtoASCII = true;
+ hostASCIItoIDN = false;
} else if (normIdn.equalsIgnoreCase("toUnicode")) {
+ hostIDNtoASCII = false;
hostASCIItoIDN = true;
}
+ hostIDNA2008 = conf.getBoolean(NORM_HOST_IDNA_2008, false);
hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false);
}
@@ -364,7 +364,7 @@ private String escapePath(String path) {
StringBuilder sb = new StringBuilder(path.length());
// Traverse over all bytes in this URL
- byte[] bytes = path.getBytes(utf8);
+ byte[] bytes = path.getBytes(UTF_8);
for (int i = 0; i < bytes.length; i++) {
byte b = bytes[i];
// Is this a control character?
@@ -415,8 +415,8 @@ private String normalizeHostName(String host) throws MalformedURLException {
// 1. unescape percent-encoded characters in host name
if (host.indexOf('%') != -1) {
try {
- host = URLDecoder.decode(host, StandardCharsets.UTF_8.toString());
- } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ host = URLDecoder.decode(host, UTF_8);
+ } catch (IllegalArgumentException e) {
LOG.debug("Failed to convert percent-encoded host name {}: ", host, e);
throw (MalformedURLException) new MalformedURLException(
"Invalid percent-encoded host name " + host + ": " + e.getMessage())
@@ -429,21 +429,18 @@ private String normalizeHostName(String host) throws MalformedURLException {
// 3. if configured: convert between Unicode and ASCII forms
// for Internationalized Domain Names (IDNs)
- if (hostIDNtoASCII && !isAscii(host)) {
- try {
- host = IDN.toASCII(host);
- } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
- // IllegalArgumentException: thrown if the input string contains
- // non-convertible Unicode codepoints
- // IndexOutOfBoundsException: thrown (undocumented) if one "label"
- // (non-ASCII dot-separated segment) is longer than 256 characters,
- // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
- LOG.debug("Failed to convert IDN host {}: ", host, e);
- throw (MalformedURLException) new MalformedURLException(
- "Invalid IDN " + host + ": " + e.getMessage()).initCause(e);
+ if (hostIDNtoASCII && !URLUtil.isAscii(host)) {
+ if (hostIDNA2008) {
+ host = URLUtil.convertIDNA2008(host, true);
+ } else {
+ host = URLUtil.convertIDNA2003(host, true, false);
}
} else if (hostASCIItoIDN && host.contains("xn--")) {
- host = IDN.toUnicode(host);
+ if (hostIDNA2008) {
+ host = URLUtil.convertIDNA2008(host, false);
+ } else {
+ host = URLUtil.convertIDNA2003(host, false, false);
+ }
}
// 4. optionally trim a trailing dot
@@ -466,7 +463,7 @@ public static void main(String args[]) throws IOException {
}
String line, normUrl;
BufferedReader in = new BufferedReader(
- new InputStreamReader(System.in, utf8));
+ new InputStreamReader(System.in, UTF_8));
while ((line = in.readLine()) != null) {
try {
normUrl = normalizer.normalize(line, scope);
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index a6bad41f2e..090c25f2da 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -258,15 +258,24 @@ public void testHostName() throws Exception {
// test Internationalized Domain Names
BasicURLNormalizer norm = new BasicURLNormalizer();
conf = NutchConfiguration.create();
+
+ // to ASCII normalization
conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
norm.setConf(conf);
normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/");
// verify escaping of percent-encoded characters in IDNs (NUTCH-2824)
normalizeTest(norm, "https://www.0251-sachverst%c3%a4ndiger.de/",
"https://www.xn--0251-sachverstndiger-ozb.de/");
+ // verify that host names with uppercase characters are normalized
+ normalizeTest(norm, "https://нЭб.РФ/", "https://xn--90ax2c.xn--p1ai/");
+
+ // to Unicode normalization
conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode");
norm.setConf(conf);
normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/");
+ // verify that host names with uppercase characters are normalized
+ normalizeTest(norm, "https://Xn--90Ax2c.xN--P1ai/", "https://нэб.рф/");
+
// test removal of trailing dot
conf.setBoolean(BasicURLNormalizer.NORM_HOST_TRIM_TRAILING_DOT, true);
norm.setConf(conf);
@@ -274,6 +283,63 @@ public void testHostName() throws Exception {
"https://www.example.org/");
}
+ /**
+ * Test the normalization of Internationalized Domain Names using IDNA2003
+ * and IDNA2008, in both directions (toAscii and toUnicode).
+ */
+ @Test
+ public void testHostNameIDNA2008() throws Exception {
+ // normalize to ASCII; for IDNA2008 see https://www.rfc-editor.org/rfc/rfc5890.html#section-1.1
+ BasicURLNormalizer norm = new BasicURLNormalizer();
+ conf = NutchConfiguration.create();
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
+ norm.setConf(conf);
+
+ // IDNA2003 / RFC 3490 (urlnormalizer.basic.host.idna2008 defaults to false)
+ // Note: IDNA2008 and IDNA2003 deviate for this example
+ normalizeTest(norm, "https://straße.de/", "https://strasse.de/");
+
+ // Verify that characters not in Unicode 3.2 do not fail the normalization
+ normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/");
+
+ // IDNA2008 / RFC 5890
+ conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, true);
+ norm.setConf(conf);
+ // Note: this is different from IDNA2003
+ normalizeTest(norm, "https://straße.de/", "https://xn--strae-oqa.de/");
+
+ // Verify that characters not in Unicode 3.2 do not fail the normalization
+ normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/");
+
+ // mixed encodings (Unicode, Punycode, percent encoding)
+ normalizeTest(norm, "https://xn--p1ai.%D1%80%D1%84/",
+ "https://xn--p1ai.xn--p1ai/");
+ normalizeTest(norm, "https://xn--p1ai.рф/", "https://xn--p1ai.xn--p1ai/");
+
+ // test conversion to Unicode (IDNA2008)
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode");
+ norm.setConf(conf);
+ normalizeTest(norm, "https://xn--strae-oqa.de/", "https://straße.de/");
+ normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/");
+
+ // mixed encodings (Unicode, Punycode, percent encoding), mixed case
+ normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/");
+ normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/");
+ normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/");
+ normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/");
+
+ // test conversion to Unicode (IDNA2003): the Punycode form of straße is expected to stay unchanged
+ conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, false);
+ norm.setConf(conf);
+ normalizeTest(norm, "https://xn--strae-oqa.de/", "https://xn--strae-oqa.de/");
+ normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/");
+
+ // mixed encodings (Unicode, Punycode, percent encoding), mixed case
+ normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/");
+ normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/");
+ normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/");
+ normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/");
+ }
+
/**
* Test that normalizer throws MalformedURLException for invalid URLs
*/
diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index 092edb9c18..200ea59a06 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -16,11 +16,18 @@
*/
package org.apache.nutch.util;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.net.MalformedURLException;
import java.net.URL;
import org.junit.jupiter.api.Test;
-
-import static org.junit.jupiter.api.Assertions.*;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
/** Test class for URLUtil */
public class TestURLUtil {
@@ -312,7 +319,20 @@ public void testToUNICODE() throws Exception {
assertEquals("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
URLUtil
.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
-
+ // do not fail on characters not in Unicode 3.2
+ assertEquals("https://example.ᬩᬮᬶ.id/",
+ URLUtil.toUNICODE("https://example.xn--9tfky.id/"));
+ // IDNA2008
+ assertEquals("http://straße.de/",
+ URLUtil.toUNICODE("http://xn--strae-oqa.de/"));
+ // host names with uppercase characters
+ assertEquals("https://googie.com/",
+ URLUtil.toUNICODE("https://googIe.com/"));
+ assertEquals("https://googie.com/", URLUtil.toASCII("https://googIe.com/"));
+ assertEquals("https://xn--90ax2c.xn--p1ai/",
+ URLUtil.toASCII("https://нЭб.РФ/"));
+ assertEquals("https://нэб.рф/",
+ URLUtil.toUNICODE("https://Xn--90Ax2c.xN--P1ai/"));
}
@Test
@@ -324,6 +344,106 @@ public void testToASCII() throws Exception {
assertEquals("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
URLUtil
.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
+ // IDNA2003
+ // assertEquals("http://strasse.de/",
+ // URLUtil.toASCII("http://straße.de/"));
+ // do not fail on characters not in Unicode 3.2
+ assertEquals("https://example.xn--9tfky.id/",
+ URLUtil.toASCII("https://example.ᬩᬮᬶ.id/"));
+ // IDNA2008
+ assertEquals("http://xn--strae-oqa.de/",
+ URLUtil.toASCII("http://straße.de/"));
+ }
+
+ @ParameterizedTest
+ @CsvSource({ //
+ "www.xn--evir-zoa.com,www.çevir.com,IDNA2003,true", //
+ "xn--uni-tbingen-xhb.de,uni-tübingen.de,IDNA2003,true", //
+ "example.xn--9tfky.id,example.ᬩᬮᬶ.id,IDNA2008,true", //
+ // Test examples from whatwg-url
+ "xn--53h.example,☕.example,IDNA2008,true", //
+ "xn--0ca.xn--ssa73l,à.א̈,IDNA2008,true", //
+ "xn--mgba3gch31f060k.com,\u0646\u0627\u0645\u0647\u200c\u0627\u06cc.com,IDNA2008,true", //
+ /* Note: IDNA2008 and IDNA2003 deviate for the following examples,
+ * cf. https://www.unicode.org/reports/tr46/#IDNA2003-Section */
+ "xn--strae-oqa.de,straße.de,IDNA2008,true", //
+ "strasse.de,straße.de,IDNA2003,false", //
+ "strasse.de,strasse.de,IDNA2003,true", //
+ "xn--fa-hia.de,faß.de,IDNA2008,true", //
+ "fass.de,faß.de,IDNA2003,false", //
+ "fass.de,fass.de,IDNA2003,true", //
+ "xn--nxasmm1c.com,βόλος.com,IDNA2008,true", //
+ "xn--nxasmq6b.com,βόλος.com,IDNA2003,false", //
+ "xn--nxasmq6b.com,βόλοσ.com,IDNA2003,true", //
+ "xn--10cl1a0b660p.com,ශ්රී.com,IDNA2008,true", //
+ "xn--10cl1a0b.com,ශ්රී.com,IDNA2003,false", //
+ "xn--10cl1a0b.com,ශ්රී.com,IDNA2003,true", //
+ "xn--mgba3gch31f060k.com,نامهای.com,IDNA2008,true", //
+ "xn--mgba3gch31f.com,نامهای.com,IDNA2003,false", //
+ "xn--mgba3gch31f.com,نامهای.com,IDNA2003,true", //
+ // mixed lowercase/uppercase: no round trip conversion
+ "xn--bb-eka.at,ÖBB.at,IDNA2003,false", //
+ "xn--bb-eka.at,öbb.at,IDNA2003,true", //
+ // mixed encoding (Punycode and Unicode)
+ "xn--p1ai.xn--p1ai,рф.xn--p1ai,IDNA2003,false", //
+ "xn--p1ai.xn--p1ai,xn--p1ai.рф,IDNA2003,false", //
+ // percent-encoding is not supported
+ // "xn--p1ai.xn--p1ai,xn--p1ai.%D1%80%D1%84,IDNA2003,false", //
+ })
+ public final void testConvertHost(String ascii, String unicode, String type,
+ boolean roundTrip) throws Exception {
+ System.out.println(ascii + " <> " + unicode);
+ if ("IDNA2008".equals(type)) {
+ assertEquals(ascii, URLUtil.convertIDNA2008(unicode, true));
+ assertEquals(unicode, URLUtil.convertIDNA2008(ascii, false));
+ try {
+ assertNotNull(URLUtil.convertIDNA2003(unicode, true, false));
+ } catch (MalformedURLException e) {
+ /*
+ * Ok. An IDNA2008 input may raise an exception when using the IDNA2003
+ * method.
+ */
+ }
+ } else if ("IDNA2003".equals(type)) {
+ assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, true));
+ assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, false));
+ if (roundTrip) {
+ assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, true));
+ assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, false));
+ }
+ }
+ }
+
+ @Test
+ public final void testConvertHostInvalid() {
+ // broken Punycode: strict IDNA2003 conversion to Unicode must not throw and is expected to return the input unchanged
+ assertDoesNotThrow(() -> assertEquals("xn--xn--bss-7z6ccid.com",
+ URLUtil.convertIDNA2003("xn--xn--bss-7z6ccid.com", false, true)));
+
+ // invalid Punycode: the IDNA2008 conversion is expected to fail
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--0.pt", false));
+
+ // strict IDNA2003 rejects characters not in Unicode 3.2 (to ASCII), the Punycode form is expected to pass unchanged (to Unicode)
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2003("☕.example", true, true));
+ assertDoesNotThrow(() -> assertEquals("xn--53h.example",
+ URLUtil.convertIDNA2003("xn--53h.example", false, true)));
+
+ // IDNA2008 invalid, all conversions below are expected to fail,
+ // cf. https://www.unicode.org/reports/tr46/#Implementation_Notes
+ // cf. https://www.unicode.org/Public/17.0.0/idna/IdnaTestV2.txt
+ // disallowed character: ⒈ (U+2488 - DIGIT ONE FULL STOP)
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("\u2488com", true));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--acom-0w1b", false));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--xn--a--gua.pt", false));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--a-ä.pt", false));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--a-ä.pt", true));
}
@Test