Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1556,6 +1556,17 @@
</description>
</property>

<property>
<name>urlnormalizer.basic.host.idna2008</name>
<value>false</value>
<description>If true, let urlnormalizer-basic
normalize Internationalized Domain Names (IDNs) using the
standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490).
Note that urlnormalizer.basic.host.idn must be set, otherwise
this property has no effect.
</description>
</property>

<property>
<name>urlnormalizer.basic.host.trim-trailing-dot</name>
<value>false</value>
Expand Down
1 change: 1 addition & 0 deletions ivy/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
<!-- Required for JUnit 6 (Jupiter) test execution -->
<dependency org="org.junit.jupiter" name="junit-jupiter-engine" rev="6.0.3" conf="test->default"/>
<dependency org="org.junit.jupiter" name="junit-jupiter-api" rev="6.0.3" conf="test->default"/>
<dependency org="org.junit.jupiter" name="junit-jupiter-params" rev="6.0.3" conf="test->default"/>
<!-- Mockito for mocking in tests -->
<dependency org="org.mockito" name="mockito-core" rev="5.18.0" conf="test->default"/>
<dependency org="org.mockito" name="mockito-junit-jupiter" rev="5.18.0" conf="test->default"/>
Expand Down
9 changes: 4 additions & 5 deletions src/java/org/apache/nutch/crawl/DeduplicationJob.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
package org.apache.nutch.crawl;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
Expand All @@ -34,7 +34,6 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
Expand Down Expand Up @@ -70,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
protected final static Text urlKey = new Text("_URLTEMPKEY_");
protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
protected final static Charset UTF_8 = StandardCharsets.UTF_8;

public static class DBFilter extends
Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
Expand Down Expand Up @@ -224,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
try {
urlExisting = URLDecoder.decode(urlExisting, UTF_8);
} catch (UnsupportedEncodingException | IllegalArgumentException e) {
} catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlExisting, e);
// use the encoded URL
}
try {
urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
} catch (UnsupportedEncodingException | IllegalArgumentException e) {
} catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlnewDoc, e);
// use the encoded URL
}
Expand Down
7 changes: 2 additions & 5 deletions src/java/org/apache/nutch/plugin/PluginManifestParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

Expand Down Expand Up @@ -124,10 +124,7 @@ public File getPluginFolder(String name) {
String path = url.getPath();
if (WINDOWS && path.startsWith("/")) // patch a windows bug
path = path.substring(1);
try {
path = URLDecoder.decode(path, "UTF-8"); // decode the url path
} catch (UnsupportedEncodingException e) {
}
path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path
directory = new File(path);
} else if (!directory.exists()) {
LOG.warn("Plugins: directory not found: {}", name);
Expand Down
169 changes: 148 additions & 21 deletions src/java/org/apache/nutch/util/URLUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,30 @@
*/
package org.apache.nutch.util;

import java.lang.invoke.MethodHandles;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.Locale;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ibm.icu.text.IDNA;

import crawlercommons.domains.EffectiveTldFinder;

/** Utility class for URL analysis */
public class URLUtil {

// Logger for this class; the class is resolved via MethodHandles.lookup()
// instead of being named explicitly.
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());

// ICU UTS #46 IDNA instance used by convertIDNA2008(), created with the
// non-transitional flags for both the ToASCII and the ToUnicode direction.
private static final IDNA idna = IDNA.getUTS46Instance(
IDNA.NONTRANSITIONAL_TO_ASCII | IDNA.NONTRANSITIONAL_TO_UNICODE);

/**
* Resolve relative URL-s and fix a java.net.URL error in handling of URLs
* with pure query targets.
Expand Down Expand Up @@ -520,31 +532,65 @@ public static String getProtocol(URL url) {
return url.getProtocol();
}

/**
 * Check whether a string contains only ASCII characters, i.e. no char with
 * a value above 127.
 *
 * @param str
 *          string to check (must not be null)
 * @return true if no char of the string is above 127 (also true for the
 *         empty string), false otherwise
 */
public static boolean isAscii(String str) {
  // chars() yields the UTF-16 code units as ints, the same values the
  // char-by-char comparison would see
  return str.chars().noneMatch(codeUnit -> codeUnit > 127);
}

/**
* Convert URL with IDN host/domain name into the ASCII representation.
*
* @param url
* URL string to convert
* @return URL string with ASCII host/domain name or null if conversion fails.
*/
public static String toASCII(String url) {
try {
URL u = new URL(url);
String host = u.getHost();
if (host == null || host.isEmpty()) {
// no host name => no punycoded domain name
// also do not add additional slashes for file: URLs (NUTCH-1880)
String hostLowerCase = host.toLowerCase(Locale.ROOT);
if (host == null || host.isEmpty()
|| (isAscii(host) && host.equals(hostLowerCase))) {
// - no host name => no punycoded domain name
// - also do not add additional slashes for file: URLs (NUTCH-1880)
// - do nothing if host is already ASCII-only
// - not already in lowercase => conversion also lowercases host name
return url;
}
URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host),
u.getPort(), u.getPath(), u.getQuery(), u.getRef());
URI p = new URI(u.getProtocol(), u.getUserInfo(),
convertIDNA2008(hostLowerCase, true), u.getPort(), u.getPath(),
u.getQuery(), u.getRef());

return p.toString();
} catch (Exception e) {
return null;
}
}

/**
* Convert URL with IDN host/domain name to the Unicode representation.
*
* @param url
* URL string to convert
* @return URL string with Unicode host/domain name or null if conversion
* fails.
*/
public static String toUNICODE(String url) {
try {
URL u = new URL(url);
String host = u.getHost();
if (host == null || host.isEmpty()) {
// no host name => no punycoded domain name
// also do not add additional slashes for file: URLs (NUTCH-1880)
String hostLowerCase = host.toLowerCase(Locale.ROOT);
if (host == null || host.isEmpty()
|| (!hostLowerCase.contains("xn--") && host.equals(hostLowerCase))) {
// - no host name => no punycoded domain name
// - also do not add additional slashes for file: URLs (NUTCH-1880)
// - contains 'xn--' => needs conversion
// - not already in lowercase => conversion also lowercases host name
return url;
}
StringBuilder sb = new StringBuilder();
Expand All @@ -554,7 +600,7 @@ public static String toUNICODE(String url) {
sb.append(u.getUserInfo());
sb.append('@');
}
sb.append(IDN.toUnicode(host));
sb.append(convertIDNA2008(hostLowerCase, false));
if (u.getPort() != -1) {
sb.append(':');
sb.append(u.getPort());
Expand All @@ -572,22 +618,83 @@ public static String toUNICODE(String url) {
}

/**
* For testing
* @param args print with no args to get help
* Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN}
* class.
*
* The conversion supports only IDNA2003, it does not support IDNA2008.
* However, unless the parameter <code>strictIDNA2003</code> is true, the
* methods {@link IDN#toASCII(String, int)} resp.
* {@link IDN#toUnicode(String, int)} are called passing the flag
* {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on
* characters not in the repertoire of Unicode 3.2.
*
* @param host
* host name to be converted (lowercase expected)
* @param toAscii
* if true convert to ASCII, otherwise to Unicode
* @param strictIDNA2003
* if true, do not pass the flag {@link IDN#ALLOW_UNASSIGNED} to the
* conversion; if false, the flag is passed
* @return converted host name
* @throws MalformedURLException
* if the conversion fails
*/
public static void main(String[] args) {

if (args.length != 1) {
System.err.println("Usage : URLUtil <url>");
return;
public static String convertIDNA2003(String host, boolean toAscii,
    boolean strictIDNA2003) throws MalformedURLException {
  // strict mode passes no flag, otherwise IDN.ALLOW_UNASSIGNED is passed
  final int idnFlags = strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED;
  try {
    return toAscii ? IDN.toASCII(host, idnFlags)
        : IDN.toUnicode(host, idnFlags);
  } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
    // IllegalArgumentException: the input string contains Unicode
    // codepoints which cannot be converted
    // IndexOutOfBoundsException: undocumented, thrown if one "label"
    // (non-ASCII dot-separated segment) is longer than 256 characters,
    // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
    LOG.debug("Failed to convert IDN host {}: ", host, e);
    // MalformedURLException has no constructor taking a cause, so the
    // cause is attached separately before throwing
    final MalformedURLException invalidIdn = new MalformedURLException(
        "Invalid IDN " + host + ": " + e.getMessage());
    invalidIdn.initCause(e);
    throw invalidIdn;
  }
}

String url = args[0];
try {
System.out.println(URLUtil.getDomainName(new URL(url)));
} catch (MalformedURLException ex) {
ex.printStackTrace();
/**
 * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class.
 *
 * The conversion supports IDNA2008 names. Conversion errors are collected
 * in an {@link IDNA.Info} object; if any error was recorded, the names of
 * all errors are joined into the message of the thrown exception.
 *
 * @param host
 *          host name to be converted (lowercase expected)
 * @param toAscii
 *          if true convert to ASCII, otherwise to Unicode
 * @return converted host name
 * @throws MalformedURLException
 *           if the conversion fails
 */
public static String convertIDNA2008(String host, boolean toAscii)
    throws MalformedURLException {
  final IDNA.Info idnaInfo = new IDNA.Info();
  final StringBuilder hostConverted = new StringBuilder();
  if (toAscii) {
    idna.nameToASCII(host, hostConverted, idnaInfo);
  } else {
    idna.nameToUnicode(host, hostConverted, idnaInfo);
  }
  if (idnaInfo.hasErrors()) {
    // note the space before the host name: without it the host is glued
    // to the word "host" in the message
    final StringBuilder msg = new StringBuilder("Invalid IDNA2008 host ")
        .append(host).append(": ");
    String separator = "";
    for (IDNA.Error error : idnaInfo.getErrors()) {
      msg.append(separator).append(error.name());
      separator = ", ";
    }
    final String errorMsg = msg.toString();
    LOG.debug("Failed to convert IDN host {}: {}", host, errorMsg);
    throw new MalformedURLException(errorMsg);
  }
  return hostConverted.toString();
}

/**
Expand All @@ -610,4 +717,24 @@ public static boolean isHomePageOf(URL url, String hostName) {
&& url.getRef() == null //
&& url.getUserInfo() == null;
}

/**
 * For testing: print the result of {@link #getDomainName(URL)} for the URL
 * passed as the only command-line argument.
 *
 * @param args
 *          exactly one argument, the URL; with any other number of
 *          arguments a usage message is printed to stderr
 */
public static void main(String[] args) {

  if (args.length == 1) {
    try {
      final URL parsed = new URL(args[0]);
      System.out.println(URLUtil.getDomainName(parsed));
    } catch (MalformedURLException ex) {
      ex.printStackTrace();
    }
    return;
  }

  // wrong number of arguments: show help
  System.err.println("Usage : URLUtil <url>");
  System.err.println("\nExtract and print pay-level domain names for the input URL");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,18 @@
*/
package org.apache.nutch.protocol.file;

import java.net.URL;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;

import org.apache.nutch.protocol.Content;
import org.apache.tika.Tika;

import org.apache.hadoop.conf.Configuration;

/**
* FileResponse.java mimics file replies as http response. It tries its best to
* follow http's way for headers, response codes as well as exceptions.
Expand Down Expand Up @@ -125,11 +123,8 @@ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)

String path = url.getPath().isEmpty() ? "/" : url.getPath();

try {
// specify the encoding via the config later?
path = java.net.URLDecoder.decode(path, "UTF-8");
} catch (UnsupportedEncodingException ex) {
}
// specify the encoding via the config later?
path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);

try {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import java.net.InetAddress;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.LinkedList;
import java.io.ByteArrayOutputStream;
Expand Down Expand Up @@ -245,7 +246,7 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)

this.content = null;

path = java.net.URLDecoder.decode(path, "UTF-8");
path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);

if (path.endsWith("/")) {
getDirAsHttpResponse(path, datum.getModifiedTime());
Expand Down
Loading
Loading