mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


2 Commits

Author SHA1 Message Date
Viktor Lofgren
e50d09cc01 (crawler) Remove illegal requests when denied via robots.txt
The commit removes attempts at probing the root document, feed URLs, and the favicon if robots.txt does not permit us to fetch them (see the sketch after the changed-files summary below).
2025-06-22 17:10:44 +02:00
Viktor Lofgren
bce3892ce0 (ndp) Simplify code 2025-06-22 16:08:55 +02:00
6 changed files with 202 additions and 156 deletions
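
The gate that commit e50d09cc01 adds boils down to asking the parsed robots.txt rules for permission before each optional probe. A minimal standalone sketch, assuming the crawler-commons library that the diff's `SimpleRobotRules` type comes from; the class and helper names here are illustrative, not part of the patch:

```java
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

// Illustrative only: shows the allowance check the patch applies before
// probing the root document, guessed feed URLs, and the favicon.
class RobotsGateSketch {
    private final SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

    boolean mayFetch(byte[] robotsTxtBody, String candidateUrl, String userAgent) {
        SimpleRobotRules rules = parser.parseContent(
                "https://example.com/robots.txt", // URL the rules were fetched from
                robotsTxtBody,
                "text/plain",
                userAgent);

        return rules.isAllowed(candidateUrl);
    }
}
```

The change is about politeness as much as correctness: previously these auxiliary probes were attempted even when robots.txt disallowed them.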

View File

@@ -115,9 +115,13 @@ public class CrawlerRetreiver implements AutoCloseable {
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
+        if (!robotsRules.isAllowed(probedUrl.toString())) {
+            warcRecorder.flagAsRobotsTxtError(probedUrl);
+            yield 1; // Nothing we can do here, we aren't allowed to fetch the root URL
+        }
+
         delayTimer.waitFetchDelay(0); // initial delay after robots.txt
 
-        DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
+        DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, robotsRules, delayTimer);
         domainStateDb.save(summaryRecord);
 
         if (Thread.interrupted()) {
@@ -270,11 +274,11 @@ public class CrawlerRetreiver implements AutoCloseable {
-    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
+    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, SimpleRobotRules robotsRules, CrawlDelayTimer timer) {
         Optional<String> feedLink = Optional.empty();
 
         try {
-            var url = rootUrl.withPathAndParam("/", null);
+            EdgeUrl url = rootUrl.withPathAndParam("/", null);
 
             HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
             timer.waitFetchDelay(0);
@@ -331,7 +335,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             if (feedLink.isEmpty()) {
-                feedLink = guessFeedUrl(timer);
+                feedLink = guessFeedUrl(timer, robotsRules);
             }
 
             // Download the sitemap if available
@@ -339,14 +343,18 @@ public class CrawlerRetreiver implements AutoCloseable {
             // Grab the favicon if it exists
-            if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
-                String contentType = iconResult.header("Content-Type");
-                byte[] iconData = iconResult.getBodyBytes();
+            if (robotsRules.isAllowed(faviconUrl.toString())) {
+                if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED)
+                        instanceof HttpFetchResult.ResultOk iconResult)
+                {
+                    String contentType = iconResult.header("Content-Type");
+                    byte[] iconData = iconResult.getBodyBytes();
 
-                domainStateDb.saveIcon(
-                        domain,
-                        new DomainStateDb.FaviconRecord(contentType, iconData)
-                );
+                    domainStateDb.saveIcon(
+                            domain,
+                            new DomainStateDb.FaviconRecord(contentType, iconData)
+                    );
+                }
             }
 
             timer.waitFetchDelay(0);
@@ -383,7 +391,7 @@ public class CrawlerRetreiver implements AutoCloseable {
"blog/rss"
);
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
private Optional<String> guessFeedUrl(CrawlDelayTimer timer, SimpleRobotRules robotsRules) throws InterruptedException {
var oldDomainStateRecord = domainStateDb.getSummary(domain);
// If we are already aware of an old feed URL, then we can just revalidate it
@@ -396,6 +404,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         for (String endpoint : likelyFeedEndpoints) {
             String url = "https://" + domain + "/" + endpoint;
+
+            if (!robotsRules.isAllowed(url)) {
+                continue;
+            }
+
             if (validateFeedUrl(url, timer)) {
                 return Optional.of(url);
             }

View File

@@ -2,40 +2,41 @@ package nu.marginalia.ndp;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.coordination.DomainCoordinator;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.ndp.io.HttpClientProvider;
-import nu.marginalia.ndp.model.DomainToTest;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.core5.http.ClassicHttpResponse;
 import org.apache.hc.core5.http.io.entity.EntityUtils;
 import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 
-import java.net.URI;
-import java.net.URISyntaxException;
+import java.io.InputStream;
 import java.security.KeyManagementException;
 import java.security.NoSuchAlgorithmException;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
 
+/** Evaluates a domain to determine if it is worth indexing.
+ * This class fetches the root document, checks the response code, content type,
+ * and parses the HTML to ensure it smells alright.
+ */
 @Singleton
 public class DomainEvaluator {
     private final HttpClient client;
     private final String userAgentString = WmsaHome.getUserAgent().uaString();
     private final LinkParser linkParser = new LinkParser();
     private final DomainCoordinator domainCoordinator;
 
-    sealed interface FetchResult permits FetchSuccess, FetchFailure {}
-    record FetchSuccess(Document content) implements FetchResult {}
-    record FetchFailure(String reason) implements FetchResult {}
 
     @Inject
     public DomainEvaluator(DomainCoordinator domainCoordinator) throws NoSuchAlgorithmException, KeyManagementException {
@@ -43,100 +44,103 @@ public class DomainEvaluator {
         client = HttpClientProvider.createClient();
     }
 
-    public boolean evaluateDomain(DomainToTest domain) throws Exception {
-        var edgeDomain = new EdgeDomain(domain.domainName());
+    public boolean evaluateDomain(String domainName) {
+        var edgeDomain = new EdgeDomain(domainName);
 
         // Grab a lock on the domain to prevent concurrent evaluations between processes
         try (var lock = domainCoordinator.lockDomain(edgeDomain)) {
-            var result = fetch(domain.domainName());
+            var rootUrl = edgeDomain.toRootUrlHttps();
             Instant start = Instant.now();
 
-            var ret = switch(result) {
-                case FetchSuccess(Document content) -> validateHtml(content, edgeDomain);
-                case FetchFailure failure -> false;
-            };
-
-            // Sleep for up to 1 second before we yield the lock to respect rate limits reasonably well
-            Instant end = Instant.now();
-            Duration sleepDuration = Duration.ofSeconds(1).minus(Duration.between(start, end));
-
-            if (sleepDuration.isPositive()) {
-                TimeUnit.MILLISECONDS.sleep(sleepDuration.toMillis());
-            }
-
-            return ret;
+            var request = ClassicRequestBuilder.get(rootUrl.asURI())
+                    .addHeader("User-Agent", userAgentString)
+                    .addHeader("Accept-Encoding", "gzip")
+                    .addHeader("Accept", "text/html,application/xhtml+xml;q=0.9")
+                    .build();
+
+            return client.execute(request, (rsp) -> {
+                if (rsp.getEntity() == null)
+                    return false;
+
+                try {
+                    // Check if the response code indicates a successful fetch
+                    if (200 != rsp.getCode()) {
+                        return false;
+                    }
+
+                    byte[] content;
+                    // Read the content from the response entity
+                    try (InputStream contentStream = rsp.getEntity().getContent()) {
+                        content = contentStream.readNBytes(8192);
+                    }
+
+                    // Parse the content (if it's valid)
+                    ContentType contentType = ContentType.parse(rsp.getEntity().getContentType());
+
+                    // Validate the content type
+                    if (!contentType.contentType().startsWith("text/html") && !contentType.contentType().startsWith("application/xhtml+xml"))
+                        return false;
+
+                    // Parse the document body to a Jsoup Document
+                    final Document document = Jsoup.parse(DocumentBodyToString.getStringData(contentType, content));
+                    final String text = document.body().text();
+
+                    if (text.length() < 100)
+                        return false;
+                    if (text.contains("404 Not Found") || text.contains("Page not found"))
+                        return false;
+                    if (hasMetaRefresh(document))
+                        return false; // This almost always indicates a parked domain
+                    if (!hasInternalLink(document, edgeDomain, rootUrl))
+                        return false; // No internal links means it's not worth indexing
+
+                    return true;
+                }
+                catch (Exception e) {
+                    return false;
+                }
+                finally {
+                    // May or may not be necessary, but let's ensure we clean up the response entity
+                    // to avoid resource leaks
+                    EntityUtils.consumeQuietly(rsp.getEntity());
+
+                    // Sleep for a while before yielding the lock, to avoid immediately hammering the domain
+                    // from another process
+                    sleepQuietly(Duration.ofSeconds(1));
+                }
+            });
         }
+        catch (Exception ex) {
+            return false; // If we fail to fetch or parse the domain, we consider it invalid
+        }
     }
-    private boolean validateHtml(Document content, EdgeDomain domain) {
-        var rootUrl = domain.toRootUrlHttps();
-        var text = content.body().text();
-
-        if (text.length() < 100) {
-            return false; // Too short to be a valid page
-        }
-        if (text.contains("404 Not Found") || text.contains("Page not found")) {
-            return false; // Common indicators of a 404 page
-        }
-        for (var metaTag : content.select("meta")) {
-            if ("refresh".equalsIgnoreCase(metaTag.attr("http-equiv"))) {
-                return false; // Page has a refresh tag, very likely a parked domain
-            }
-        }
-
-        boolean hasInternalLink = false;
-        for (var atag : content.select("a")) {
-            var link = linkParser.parseLink(rootUrl, atag);
-            if (link.isEmpty()) {
-                continue; // Skip invalid links
-            }
-            var edgeUrl = link.get();
-            if (Objects.equals(domain, edgeUrl.getDomain())) {
-                hasInternalLink = true;
-            }
-        }
-        return hasInternalLink;
+    private boolean hasInternalLink(Document document, EdgeDomain currentDomain, EdgeUrl rootUrl) {
+        for (Element atag : document.select("a")) {
+            Optional<EdgeDomain> destDomain = linkParser
+                    .parseLink(rootUrl, atag)
+                    .map(EdgeUrl::getDomain);
+
+            if (destDomain.isPresent() && Objects.equals(currentDomain, destDomain.get()))
+                return true;
+        }
+        return false;
     }
 
-    private FetchResult fetch(String domain) throws URISyntaxException {
-        var uri = new URI("https://" + domain + "/");
-
-        var request = ClassicRequestBuilder.get(uri)
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept", "text/html,application/xhtml+xml;q=0.9")
-                .build();
-
-        try {
-            return client.execute(request, (rsp) -> responseHandler(rsp, domain));
-        } catch (Exception e) {
-            return new FetchFailure("Failed to fetch domain: " + e.getMessage());
-        }
-    }
-
-    private FetchResult responseHandler(ClassicHttpResponse rsp, String domain) {
-        if (rsp.getEntity() == null)
-            return new FetchFailure("No content returned from " + domain);
-
-        try {
-            int code = rsp.getCode();
-            byte[] content = rsp.getEntity().getContent().readAllBytes();
-
-            if (code >= 300) {
-                return new FetchFailure("Received HTTP " + code + " from " + domain);
-            }
-
-            ContentType contentType = ContentType.parse(rsp.getEntity().getContentType());
-            var html = DocumentBodyToString.getStringData(contentType, content);
-            return new FetchSuccess(Jsoup.parse(html));
-        }
-        catch (Exception e) {
-            EntityUtils.consumeQuietly(rsp.getEntity());
-            return new FetchFailure("Failed to read content from " + domain + ": " + e.getMessage());
-        }
-    }
+    private boolean hasMetaRefresh(Document document) {
+        for (Element metaTag : document.select("meta")) {
+            if ("refresh".equalsIgnoreCase(metaTag.attr("http-equiv")))
+                return true;
+        }
+        return false;
+    }
+
+    private void sleepQuietly(Duration duration) {
+        try {
+            TimeUnit.MILLISECONDS.sleep(duration.toMillis());
+        } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+        }
+    }
 }
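
The rewritten evaluateDomain funnels everything through HttpClient 5's response-handler callback and caps the body read at 8 KiB rather than buffering the whole entity. A minimal sketch of that bounded-read pattern in isolation (illustrative names, not the patch's code):

```java
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.entity.EntityUtils;

import java.io.InputStream;

class BoundedFetchSketch {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            boolean ok = client.execute(new HttpGet("https://www.marginalia.nu/"), rsp -> {
                try {
                    if (rsp.getCode() != 200 || rsp.getEntity() == null)
                        return false;

                    byte[] head;
                    // Read at most 8 KiB -- enough to sniff a page without buffering it all
                    try (InputStream in = rsp.getEntity().getContent()) {
                        head = in.readNBytes(8192);
                    }
                    return head.length > 0;
                }
                finally {
                    // Drain the entity so the pooled connection can be reused
                    EntityUtils.consumeQuietly(rsp.getEntity());
                }
            });
            System.out.println("fetch ok: " + ok);
        }
    }
}
```

Doing all the work inside the handler is what lets the finally block reliably release the connection and apply the post-fetch sleep before the domain lock is yielded.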

View File

@@ -16,6 +16,9 @@ public class DomainNodeAllocator {
     private final NodeConfigurationService nodeConfigurationService;
     private final HikariDataSource dataSource;
 
+    private final PriorityQueue<NodeCount> countPerNode = new PriorityQueue<>();
+    private volatile boolean initialized = false;
 
     private record NodeCount(int nodeId, int count)
             implements Comparable<NodeCount>
@@ -30,8 +33,6 @@ public class DomainNodeAllocator {
         }
     }
 
-    private final PriorityQueue<NodeCount> countPerNode = new PriorityQueue<>();
-    volatile boolean initialized = false;
 
     @Inject
     public DomainNodeAllocator(NodeConfigurationService nodeConfigurationService, HikariDataSource dataSource) {
@@ -43,6 +44,43 @@ public class DomainNodeAllocator {
                 .start(this::initialize);
     }
 
+    public synchronized int totalCount() {
+        ensureInitialized();
+        return countPerNode.stream().mapToInt(NodeCount::count).sum();
+    }
+
+    /** Returns the next node ID to assign a domain to.
+     * This method is synchronized to ensure thread safety when multiple threads are allocating domains.
+     * The node ID returned is guaranteed to be one of the viable nodes configured in the system.
+     */
+    public synchronized int nextNodeId() {
+        ensureInitialized();
+
+        // Synchronized is fine here as this is not a hot path
+        // (and PriorityBlockingQueue won't help since we're re-adding the same element with a new count all the time)
+        NodeCount allocation = countPerNode.remove();
+        countPerNode.add(allocation.incrementCount());
+        return allocation.nodeId();
+    }
+
+    private void ensureInitialized() {
+        if (initialized) return;
+
+        synchronized (this) {
+            while (!initialized) {
+                try {
+                    // Wait until the initialization is complete
+                    this.wait(1000);
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    throw new RuntimeException("DomainAllocator initialization interrupted", e);
+                }
+            }
+        }
+    }
+
     public void initialize() {
         if (initialized) return;
@@ -89,39 +127,5 @@ public class DomainNodeAllocator {
         initialized = true;
     }
 
-    private void ensureInitialized() {
-        if (initialized) return;
-
-        synchronized (this) {
-            while (!initialized) {
-                try {
-                    // Wait until the initialization is complete
-                    this.wait(1000);
-                } catch (InterruptedException e) {
-                    Thread.currentThread().interrupt();
-                    throw new RuntimeException("DomainAllocator initialization interrupted", e);
-                }
-            }
-        }
-    }
-
-    public synchronized int totalCount() {
-        ensureInitialized();
-        return countPerNode.stream().mapToInt(NodeCount::count).sum();
-    }
-
-    /** Returns the next node ID to assign a domain to.
-     * This method is synchronized to ensure thread safety when multiple threads are allocating domains.
-     * The node ID returned is guaranteed to be one of the viable nodes configured in the system.
-     */
-    public synchronized int nextNodeId() {
-        ensureInitialized();
-
-        // Synchronized is fine here as this is not a hot path
-        // (and PriorityBlockingQueue won't help since we're re-adding the same element with a new count all the time)
-        NodeCount allocation = countPerNode.remove();
-        countPerNode.add(allocation.incrementCount());
-        return allocation.nodeId();
-    }
 }
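
The allocation scheme is a straightforward least-loaded pick: a min-heap ordered by per-node domain count, from which the cheapest node is popped, bumped, and reinserted. A self-contained sketch, assuming (as the diff's usage implies) that incrementCount() returns a copy with the count increased by one:

```java
import java.util.PriorityQueue;

class LeastLoadedSketch {
    // Min-heap ordering by count ensures remove() always yields the least-loaded node
    private record NodeCount(int nodeId, int count) implements Comparable<NodeCount> {
        NodeCount incrementCount() { return new NodeCount(nodeId, count + 1); }
        public int compareTo(NodeCount other) { return Integer.compare(count, other.count); }
    }

    private final PriorityQueue<NodeCount> countPerNode = new PriorityQueue<>();

    LeastLoadedSketch(int... nodeIds) {
        for (int id : nodeIds) countPerNode.add(new NodeCount(id, 0));
    }

    synchronized int nextNodeId() {
        NodeCount allocation = countPerNode.remove();   // least-loaded node
        countPerNode.add(allocation.incrementCount());  // re-add with bumped count
        return allocation.nodeId();
    }

    public static void main(String[] args) {
        var alloc = new LeastLoadedSketch(1, 2);
        for (int i = 0; i < 4; i++)
            System.out.println(alloc.nextNodeId()); // alternates between the two nodes
    }
}
```

As the inline comment in the diff notes, a PriorityBlockingQueue would not remove the need for synchronization here, since the remove-and-reinsert pair must be atomic.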

View File

@@ -13,7 +13,9 @@ import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 
 public class DomainTestingQueue {
-    private final ArrayBlockingQueue<DomainToTest> queue = new ArrayBlockingQueue<>(1000);
+    private static Logger logger = LoggerFactory.getLogger(DomainTestingQueue.class);
+
+    private final ArrayBlockingQueue<DomainToTest> queue = new ArrayBlockingQueue<>(2);
 
     // This will grow quite large, but should be manageable in memory, as theoretical maximum is around 100M domains,
     // order of 2 GB in memory.
@@ -21,7 +23,6 @@ public class DomainTestingQueue {
     private final HikariDataSource dataSource;
 
-    private static Logger logger = LoggerFactory.getLogger(DomainTestingQueue.class);
 
     @Inject
     public DomainTestingQueue(HikariDataSource dataSource) {
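
Shrinking the queue from 1000 slots to 2 turns it into a backpressure valve: ArrayBlockingQueue.put() blocks while the queue is full, so the feeder only pulls domains from the database as fast as the evaluators drain them. A toy demonstration (names made up):

```java
import java.util.concurrent.ArrayBlockingQueue;

class BackpressureSketch {
    public static void main(String[] args) throws InterruptedException {
        var queue = new ArrayBlockingQueue<String>(2);

        Thread consumer = new Thread(() -> {
            try {
                while (true) {
                    String domain = queue.take(); // blocks when empty
                    Thread.sleep(100);            // simulate slow evaluation
                    System.out.println("evaluated " + domain);
                }
            } catch (InterruptedException ignored) { /* shutdown */ }
        });
        consumer.start();

        for (int i = 0; i < 10; i++) {
            queue.put("domain-" + i); // blocks once 2 items are already waiting
            System.out.println("enqueued domain-" + i);
        }

        Thread.sleep(500); // let the consumer drain the tail
        consumer.interrupt();
    }
}
```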

View File

@@ -84,8 +84,23 @@ public class NdpMain extends ProcessMainClass {
                 hb.progress("Discovery Process", cnt, toInsertCount);
             }
 
-            var nextDomain = domainTestingQueue.next();
-            threadPool.submit(() -> evaluateDomain(nextDomain));
+            final DomainToTest nextDomain = domainTestingQueue.next();
+            threadPool.submit(() -> {
+                try {
+                    if (domainEvaluator.evaluateDomain(nextDomain.domainName())) {
+                        logger.info("Accepting: {}", nextDomain.domainName());
+                        domainCount.incrementAndGet();
+                        domainTestingQueue.accept(nextDomain, domainNodeAllocator.nextNodeId());
+                    } else {
+                        logger.info("Rejecting: {}", nextDomain.domainName());
+                        domainTestingQueue.reject(nextDomain);
+                    }
+                }
+                catch (Exception e) {
+                    domainTestingQueue.reject(nextDomain);
+                    logger.error("Error evaluating domain: " + nextDomain.domainId(), e);
+                }
+            });
         }
     }
@@ -97,24 +112,6 @@ public class NdpMain extends ProcessMainClass {
     }
 
-    private void evaluateDomain(DomainToTest nextDomain) {
-        try {
-            if (domainEvaluator.evaluateDomain(nextDomain)) {
-                logger.info("Accepting: {}", nextDomain.domainName());
-                domainCount.incrementAndGet();
-                domainTestingQueue.accept(nextDomain, domainNodeAllocator.nextNodeId());
-            } else {
-                logger.info("Rejecting: {}", nextDomain.domainName());
-                domainTestingQueue.reject(nextDomain);
-            }
-        }
-        catch (Exception e) {
-            domainTestingQueue.reject(nextDomain);
-            logger.error("Error evaluating domain: " + nextDomain.domainId(), e);
-        }
-    }
 
     public static void main(String[] args) throws Exception {
         // Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
         Security.setProperty("networkaddress.cache.ttl" , "3600");

View File

@@ -0,0 +1,29 @@
+package nu.marginalia.ndp;
+
+import nu.marginalia.coordination.LocalDomainCoordinator;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.security.KeyManagementException;
+import java.security.NoSuchAlgorithmException;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class DomainEvaluatorTest {
+    @Tag("flaky") // Exclude from CI runs due to potential network issues
+    @Test
+    public void testSunnyDay() throws NoSuchAlgorithmException, KeyManagementException {
+        DomainEvaluator evaluator = new DomainEvaluator(new LocalDomainCoordinator());
+
+        // Should be a valid domain
+        assertTrue(evaluator.evaluateDomain("www.marginalia.nu"));
+
+        // Should be a redirect to www.marginalia.nu
+        assertFalse(evaluator.evaluateDomain("memex.marginalia.nu"));
+
+        // Should fail on Anubis
+        assertFalse(evaluator.evaluateDomain("marginalia-search.com"));
+    }
+}
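
The @Tag("flaky") annotation only takes effect if the test runner filters on it; one typical way to exclude such tests from CI is via the JUnit Platform launcher API (illustrative — not necessarily how Marginalia's build wires it up):

```java
import org.junit.platform.launcher.LauncherDiscoveryRequest;
import org.junit.platform.launcher.TagFilter;
import org.junit.platform.launcher.core.LauncherDiscoveryRequestBuilder;

import static org.junit.platform.engine.discovery.DiscoverySelectors.selectPackage;

class ExcludeFlakySketch {
    // Builds a discovery request that skips anything tagged "flaky"
    static LauncherDiscoveryRequest ciRequest() {
        return LauncherDiscoveryRequestBuilder.request()
                .selectors(selectPackage("nu.marginalia.ndp"))
                .filters(TagFilter.excludeTags("flaky"))
                .build();
    }
}
```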
}