apache · sebastian-nagel · May 12, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
@@ -1556,6 +1556,17 @@
   </description>
 </property>
 
+<property>
+  <name>urlnormalizer.basic.host.idna2008</name>
+  <value>false</value>
+  <description>If true, let urlnormalizer-basic
+  normalize Internationalized Domain Names (IDNs) using the
+  standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490).
+  Note that urlnormalizer.basic.host.idn must be set, otherwise
+  this property has no effect.
+  </description>
+</property>
+
 <property>
   <name>urlnormalizer.basic.host.trim-trailing-dot</name>
   <value>false</value>

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
@@ -118,6 +118,7 @@
     <!-- Required for JUnit 6 (Jupiter) test execution -->
     <dependency org="org.junit.jupiter" name="junit-jupiter-engine" rev="6.0.3" conf="test->default"/>
     <dependency org="org.junit.jupiter" name="junit-jupiter-api" rev="6.0.3" conf="test->default"/>
+    <dependency org="org.junit.jupiter" name="junit-jupiter-params" rev="6.0.3" conf="test->default"/>
     <!-- Mockito for mocking in tests -->
     <dependency org="org.mockito" name="mockito-core" rev="5.18.0" conf="test->default"/>
     <dependency org="org.mockito" name="mockito-junit-jupiter" rev="5.18.0" conf="test->default"/>

diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -17,9 +17,9 @@
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.lang.invoke.MethodHandles;
 import java.net.URLDecoder;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.Map;
@@ -34,7 +34,6 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.CounterGroup;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
@@ -70,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
   protected final static Text urlKey = new Text("_URLTEMPKEY_");
   protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
   protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
-  protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
+  protected final static Charset UTF_8 = StandardCharsets.UTF_8;
 
   public static class DBFilter extends
       Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
@@ -224,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
           String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
           try {
             urlExisting = URLDecoder.decode(urlExisting, UTF_8);
-          } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+          } catch (IllegalArgumentException e) {
             LOG.error("Error decoding: {}", urlExisting, e);
             // use the encoded URL
           }
           try {
             urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
-          } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+          } catch (IllegalArgumentException e) {
             LOG.error("Error decoding: {}", urlnewDoc, e);
             // use the encoded URL
           }

diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -18,11 +18,11 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.lang.invoke.MethodHandles;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -124,10 +124,7 @@ public File getPluginFolder(String name) {
       String path = url.getPath();
       if (WINDOWS && path.startsWith("/")) // patch a windows bug
         path = path.substring(1);
-      try {
-        path = URLDecoder.decode(path, "UTF-8"); // decode the url path
-      } catch (UnsupportedEncodingException e) {
-      }
+      path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path
       directory = new File(path);
     } else if (!directory.exists()) {
       LOG.warn("Plugins: directory not found: {}", name);

diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
@@ -16,18 +16,30 @@
  */
 package org.apache.nutch.util;
 
+import java.lang.invoke.MethodHandles;
 import java.net.IDN;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
 import java.util.Locale;
 import java.util.regex.Pattern;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.IDNA;
+
 import crawlercommons.domains.EffectiveTldFinder;
 
 /** Utility class for URL analysis */
 public class URLUtil {
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private static final IDNA idna = IDNA.getUTS46Instance(
+      IDNA.NONTRANSITIONAL_TO_ASCII | IDNA.NONTRANSITIONAL_TO_UNICODE);
+
   /**
    * Resolve relative URL-s and fix a java.net.URL error in handling of URLs
    * with pure query targets.
@@ -520,31 +532,65 @@ public static String getProtocol(URL url) {
     return url.getProtocol();
   }
 
+  public static boolean isAscii(String str) {
+    // every UTF-16 code unit must be within the 7-bit ASCII range (0..127)
+    for (int i = 0; i < str.length(); i++) {
+      if (str.charAt(i) > 127) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Convert URL with IDN host/domain name into the ASCII representation.
+   *
+   * @param url URL string to convert
+   * @return URL string with ASCII (punycoded, lowercased) host name, the
+   *         unchanged URL if nothing needs conversion, or null on failure
+   */
   public static String toASCII(String url) {
     try {
       URL u = new URL(url);
       String host = u.getHost();
-      if (host == null || host.isEmpty()) {
-        // no host name => no punycoded domain name
-        // also do not add additional slashes for file: URLs (NUTCH-1880)
+      // guard against a null host before lowercasing it
+      String hostLowerCase = host == null ? "" : host.toLowerCase(Locale.ROOT);
+      if (host == null || host.isEmpty()
+          || (isAscii(host) && host.equals(hostLowerCase))) {
+        // - no host name => no punycoded domain name
+        // - also do not add additional slashes for file: URLs (NUTCH-1880)
+        // - ASCII-only and already lowercase => nothing to convert
         return url;
       }
-      URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host),
-          u.getPort(), u.getPath(), u.getQuery(), u.getRef());
+      URI p = new URI(u.getProtocol(), u.getUserInfo(),
+          convertIDNA2008(hostLowerCase, true), u.getPort(), u.getPath(),
+          u.getQuery(), u.getRef());
 
       return p.toString();
     } catch (Exception e) {
       return null;
     }
   }
 
+  /**
+   * Convert URL with IDN host/domain name to the Unicode representation.
+   *
+   * @param url
+   *          URL string to convert
+   * @return URL string with Unicode host/domain name or null if conversion
+   *         fails.
+   */
   public static String toUNICODE(String url) {
     try {
       URL u = new URL(url);
       String host = u.getHost();
-      if (host == null || host.isEmpty()) {
-        // no host name => no punycoded domain name
-        // also do not add additional slashes for file: URLs (NUTCH-1880)
+      String hostLowerCase = host.toLowerCase(Locale.ROOT);
+      if (host == null || host.isEmpty()
+          || (!hostLowerCase.contains("xn--") && host.equals(hostLowerCase))) {
+        // - no host name => no punycoded domain name
+        // - also do not add additional slashes for file: URLs (NUTCH-1880)
+        // - contains 'xn--' => needs conversion
+        // - not already in lowercase => conversion also lowercases host name
         return url;
       }
       StringBuilder sb = new StringBuilder();
@@ -554,7 +600,7 @@ public static String toUNICODE(String url) {
         sb.append(u.getUserInfo());
         sb.append('@');
       }
-      sb.append(IDN.toUnicode(host));
+      sb.append(convertIDNA2008(hostLowerCase, false));
       if (u.getPort() != -1) {
         sb.append(':');
         sb.append(u.getPort());
@@ -572,22 +618,83 @@ public static String toUNICODE(String url) {
   }
 
   /**
-   * For testing
-   * @param args print with no args to get help
+   * Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN}
+   * class.
+   *
+   * The conversion supports only IDNA2003, it does not support IDNA2008.
+   * However, unless the parameter <code>strictIDNA2003</code> is true, the
+   * methods {@link IDN#toASCII(String, int)} resp.
+   * {@link IDN#toUnicode(String, int)} are called passing the flag
+   * {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on
+   * characters not in the repertoire of Unicode 3.2.
+   *
+   * @param host
+   *          host name to be converted (lowercase expected)
+   * @param toAscii
+   *          if true convert to ASCII, otherwise to Unicode
+   * @param strictIDNA2003
+   *          if true, do not allow code points unassigned in Unicode 3.2
+   * @return converted host name
+   * @throws MalformedURLException
+   *           if the conversion fails
    */
-  public static void main(String[] args) {
-
-    if (args.length != 1) {
-      System.err.println("Usage : URLUtil <url>");
-      return;
+  public static String convertIDNA2003(String host, boolean toAscii,
+      boolean strictIDNA2003) throws MalformedURLException {
+    // flag 0 = strict IDNA2003, otherwise unassigned code points are allowed
+    final int flags = strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED;
+    try {
+      return toAscii ? IDN.toASCII(host, flags) : IDN.toUnicode(host, flags);
+    } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
+      // IllegalArgumentException: the input string contains Unicode
+      // code points which cannot be converted
+      // IndexOutOfBoundsException: thrown (undocumented) if one "label"
+      // (non-ASCII dot-separated segment) is longer than 256 characters,
+      // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
+      LOG.debug("Failed to convert IDN host {}: ", host, e);
+      final MalformedURLException failure = new MalformedURLException(
+          "Invalid IDN " + host + ": " + e.getMessage());
+      failure.initCause(e);
+      throw failure;
     }
+  }
 
-    String url = args[0];
-    try {
-      System.out.println(URLUtil.getDomainName(new URL(url)));
-    } catch (MalformedURLException ex) {
-      ex.printStackTrace();
+  /**
+   * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class
+   * (UTS #46 instance, nontransitional processing) which supports IDNA2008.
+   *
+   * @param host
+   *          host name to be converted (lowercase expected)
+   * @param toAscii
+   *          if true convert to ASCII, otherwise to Unicode
+   * @return converted host name
+   * @throws MalformedURLException
+   *           if ICU reports conversion errors, the message lists the names
+   *           of all reported errors
+   */
+  public static String convertIDNA2008(String host, boolean toAscii)
+      throws MalformedURLException {
+    final IDNA.Info idnaInfo = new IDNA.Info();
+    final StringBuilder hostConverted = new StringBuilder();
+    if (toAscii) {
+      idna.nameToASCII(host, hostConverted, idnaInfo);
+    } else {
+      idna.nameToUnicode(host, hostConverted, idnaInfo);
     }
+    if (idnaInfo.hasErrors()) {
+      StringBuilder msg = new StringBuilder();
+      for (IDNA.Error error : idnaInfo.getErrors()) {
+        if (msg.length() == 0) {
+          msg.append("Invalid IDNA2008 host ").append(host).append(": ");
+        } else {
+          msg.append(", ");
+        }
+        msg.append(error.name());
+      }
+      String errorMsg = msg.toString();
+      LOG.debug("Failed to convert IDN host {}: {}", host, errorMsg);
+      throw new MalformedURLException(errorMsg);
+    }
+    return hostConverted.toString();
   }
 
   /**
@@ -610,4 +717,24 @@ public static boolean isHomePageOf(URL url, String hostName) {
         && url.getRef() == null //
         && url.getUserInfo() == null;
   }
+
+  /**
+   * Command-line entry point for testing: prints the domain name of a URL.
+   * @param args exactly one argument, the URL; otherwise the usage is shown
+   */
+  public static void main(String[] args) {
+    if (args.length == 1) {
+      try {
+        final URL parsed = new URL(args[0]);
+        System.out.println(URLUtil.getDomainName(parsed));
+      } catch (MalformedURLException malformed) {
+        // URL could not be parsed: dump the stack trace to stderr
+        malformed.printStackTrace();
+      }
+      return;
+    }
+    // wrong number of arguments => explain the usage
+    System.err.println("Usage : URLUtil <url>");
+    System.err.println("\nExtract and print pay-level domain names for the input URL");
+  }
 }
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -16,20 +16,18 @@
  */
 package org.apache.nutch.protocol.file;
 
-import java.net.URL;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Content;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
-
+import org.apache.nutch.protocol.Content;
 import org.apache.tika.Tika;
 
-import org.apache.hadoop.conf.Configuration;
-
 /**
  * FileResponse.java mimics file replies as http response. It tries its best to
  * follow http's way for headers, response codes as well as exceptions.
@@ -125,11 +123,8 @@ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
 
     String path = url.getPath().isEmpty() ? "/" : url.getPath();
 
-    try {
-      // specify the encoding via the config later?
-      path = java.net.URLDecoder.decode(path, "UTF-8");
-    } catch (UnsupportedEncodingException ex) {
-    }
+    // specify the encoding via the config later?
+    path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);
 
     try {
 

diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -30,6 +30,7 @@
 
 import java.net.InetAddress;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.LinkedList;
 import java.io.ByteArrayOutputStream;
@@ -245,7 +246,7 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
 
       this.content = null;
 
-      path = java.net.URLDecoder.decode(path, "UTF-8");
+      path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);
 
       if (path.endsWith("/")) {
         getDirAsHttpResponse(path, datum.getModifiedTime());