Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions src/java/org/apache/nutch/crawl/CrawlDbFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metrics.ErrorTracker;
import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;

Expand Down Expand Up @@ -56,6 +59,8 @@ public class CrawlDbFilter extends
// Cached Hadoop counter references. Presumably assigned by
// initCounters(context) during setup() -- confirm against the full file.
private Counter orphanRecordsRemovedCounter;
private Counter urlsFilteredCounter;

// Receives exceptions caught while normalizing or filtering URLs in map()
// (via incrementCounters(e)). Created in setup() for the
// NutchMetrics.GROUP_CRAWLDB_FILTER group, so it is unset before setup() runs.
private ErrorTracker errorTracker;

// Class-wide logger; MethodHandles.lookup().lookupClass() resolves to the
// enclosing class, avoiding a hard-coded class literal.
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());

Expand All @@ -77,6 +82,9 @@ public void setup(Mapper<Text, CrawlDatum, Text, CrawlDatum>.Context context) {

// Initialize cached counter references
initCounters(context);

// Initialize error tracker with cached counters (NUTCH-3164)
errorTracker = new ErrorTracker(NutchMetrics.GROUP_CRAWLDB_FILTER, context);
}

/**
Expand Down Expand Up @@ -114,17 +122,32 @@ public void map(Text key, CrawlDatum value,
if (url != null && urlNormalizers) {
try {
url = normalizers.normalize(url, scope); // normalize the url
} catch (Exception e) {
LOG.warn("Skipping {}: ", url, e);
url = null;
} catch (MalformedURLException e) {
// NUTCH-3164: malformed URL is a legitimate reason to drop; tracked via
// ErrorTracker, not urlsFilteredCounter (which conflates filtering with
// malformed input).
LOG.error("Skipping malformed URL {}: {}", url, e.getMessage());
errorTracker.incrementCounters(e);
return;
} catch (RuntimeException e) {
// NUTCH-3164: a normalizer plugin bug must not silently delete URLs.
LOG.error("Unexpected exception normalizing {}, keeping URL: ", url, e);
errorTracker.incrementCounters(e);
}
}
if (url != null && urlFiltering) {
try {
url = filters.filter(url); // filter the url
} catch (Exception e) {
LOG.warn("Skipping {}: ", url, e);
url = null;
} catch (URLFilterException e) {
// NUTCH-3164: URLFilterException signals an internal filter failure,
// not URL rejection (rejection is communicated by returning null).
// Track via ErrorTracker; do not drop the URL.
LOG.error("Filter error for {}, keeping URL: {}", url, e.getMessage());
errorTracker.incrementCounters(e);
} catch (RuntimeException e) {
// NUTCH-3164: a filter plugin bug must not silently delete URLs.
LOG.error("Unexpected exception filtering {}, keeping URL: ", url, e);
errorTracker.incrementCounters(e);
}
}
if (url == null) {
Expand Down