mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits


9 Commits

Author SHA1 Message Date
Viktor Lofgren
1c2426a052 (crawler) Further rearrange crawl order
Limit the crawl order preference to .edu domains, to avoid hitting sites like Medium and WordPress with shotgun requests.
2025-03-27 11:19:20 +01:00
Viktor Lofgren
34df7441ac (crawler) Add some jitter to crawl delay to avoid accidentally synchronized requests 2025-03-27 11:15:16 +01:00
Viktor Lofgren
5387e2bd80 (crawler) Adjust crawl order to get a better mixture of domains 2025-03-27 11:12:48 +01:00
Viktor Lofgren
0f3b24d0f8 (crawler) Evaluate virtual threads for the crawler
The change also alters SimpleBlockingThreadPool to add the option to use virtual threads instead of platform threads.
2025-03-27 11:02:21 +01:00
Viktor Lofgren
a732095d2a (crawler) Improve crawl task ordering
Further improve the ordering of the crawl tasks in order to ensure that potentially blocking tasks are enqueued as soon as possible.
2025-03-26 16:51:37 +01:00
Viktor Lofgren
6607f0112f (crawler) Improve how the crawler deals with interruptions
In some cases, threads would previously fail to terminate when interrupted.
2025-03-26 16:19:57 +01:00
Viktor Lofgren
4913730de9 (jdk) Upgrade to Java 24 2025-03-26 13:26:06 +01:00
Viktor Lofgren
1db64f9d56 (chore) Fix zookeeper test by upgrading zk image version.
Test suddenly broke due to the increasing entropy of the universe.
2025-03-26 11:47:14 +01:00
Viktor Lofgren
4dcff14498 (search) Improve contrast with light mode 2025-03-25 13:15:31 +01:00
10 changed files with 75 additions and 42 deletions

View File

@@ -43,12 +43,11 @@ subprojects.forEach {it ->
 }

 ext {
-    jvmVersion=23
-    dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
+    jvmVersion = 24
+    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
     dockerImageTag='latest'
     dockerImageRegistry='marginalia'
     jibVersion = '3.4.4'
 }

 idea {

View File

@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
     @Nonnull
     public final String topDomain;

-    public EdgeDomain(String host) {
+    public EdgeDomain(@Nonnull String host) {
         Objects.requireNonNull(host, "domain name must not be null");

         host = host.toLowerCase();
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
         this.topDomain = topDomain;
     }

+    public static String getTopDomain(String host) {
+        return new EdgeDomain(host).topDomain;
+    }
+
     private boolean looksLikeGovTld(String host) {
         if (host.length() < 8)
             return false;
@@ -116,24 +120,6 @@ public class EdgeDomain implements Serializable {
         return topDomain.substring(0, cutPoint).toLowerCase();
     }

-    public String getLongDomainKey() {
-        StringBuilder ret = new StringBuilder();
-
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            ret.append(topDomain);
-        }
-        else {
-            ret.append(topDomain, 0, cutPoint);
-        }
-
-        if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
-            ret.append(":");
-            ret.append(subDomain);
-        }
-
-        return ret.toString().toLowerCase();
-    }
-
     /** If possible, try to provide an alias domain,
      * i.e. a domain name that is very likely to link to this one
      * */
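
For context, the new static getTopDomain helper replaces a two-step construct-then-read with a one-liner. A minimal usage sketch (the host name and result are illustrative):

    // Illustrative: resolve the registerable top domain of a host name
    String top = EdgeDomain.getTopDomain("search.marginalia.nu"); // e.g. "marginalia.nu"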

View File

@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
 class ZkServiceRegistryTest {
     private static final int ZOOKEEPER_PORT = 2181;
     private static final GenericContainer<?> zookeeper =
-            new GenericContainer<>("zookeeper:3.8.0")
+            new GenericContainer<>("zookeeper:3.8")
                     .withExposedPorts(ZOOKEEPER_PORT);

     List<ZkServiceRegistry> registries = new ArrayList<>();

View File

@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
     private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);

     public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
+        this(name, poolSize, queueSize, ThreadType.PLATFORM);
+    }
+
+    public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
         tasks = new ArrayBlockingQueue<>(queueSize);

         for (int i = 0; i < poolSize; i++) {
-            Thread worker = new Thread(this::worker, name + "[" + i + "]");
-            worker.setDaemon(true);
-            worker.start();
+            Thread.Builder threadBuilder = switch (threadType) {
+                case VIRTUAL -> Thread.ofVirtual();
+                case PLATFORM -> Thread.ofPlatform().daemon(true);
+            };
+
+            Thread worker = threadBuilder
+                    .name(name + "[" + i + "]")
+                    .start(this::worker);
+
             workers.add(worker);
         }
     }

+    public enum ThreadType {
+        VIRTUAL,
+        PLATFORM
+    }
+
     public void submit(Task task) throws InterruptedException {
         tasks.put(task);
     }
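
With the new overload delegating to the old signature, existing call sites keep platform threads while individual pools can opt into virtual threads. A minimal usage sketch; the pool names and sizes here are illustrative:

    // Default: platform threads, exactly as before
    var ioPool = new SimpleBlockingThreadPool("IoPool", 32, 4);

    // Opt-in: virtual threads, as the crawler now does
    var crawlPool = new SimpleBlockingThreadPool("CrawlerPool", 256, 1,
            SimpleBlockingThreadPool.ThreadType.VIRTUAL);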

View File

@@ -41,10 +41,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -108,7 +105,8 @@ public class CrawlerMain extends ProcessMainClass {
         pool = new SimpleBlockingThreadPool("CrawlerPool",
                 Integer.getInteger("crawler.poolSize", 256),
-                1);
+                1,
+                SimpleBlockingThreadPool.ThreadType.VIRTUAL);

         // Wait for the blacklist to be loaded before starting the crawl
@@ -224,10 +222,7 @@ public class CrawlerMain extends ProcessMainClass {
             logger.info("Loaded {} domains", crawlSpecRecords.size());

-            // Shuffle the domains to ensure we get a good mix of domains in each crawl,
-            // so that e.g. the big domains don't get all crawled at once, or we end up
-            // crawling the same server in parallel from different subdomains...
-            Collections.shuffle(crawlSpecRecords);
+            crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));

             // First a validation run to ensure the file is all good to parse
             if (crawlSpecRecords.isEmpty()) {
@@ -306,6 +301,28 @@ public class CrawlerMain extends ProcessMainClass {
         }
     }

+    /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl;
+     * we want to enqueue domains that tend to be large and have common top domains first,
+     * but otherwise have a random order.
+     * <p></p>
+     * Note, we can't use hash codes for randomization as it is not desirable to have the same order
+     * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
+     * hashcode based on the fields).
+     * */
+    private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
+        Random r = new Random();
+        Map<String, Integer> randomOrder = new HashMap<>(records.size());
+
+        for (var spec : records) {
+            randomOrder.put(spec.domain, r.nextInt());
+        }
+
+        return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
+                .reversed()
+                .thenComparing(spec -> randomOrder.get(spec.domain))
+                .thenComparing(Record::hashCode); // non-deterministic tie-breaker
+    }
+
     /** Submit a task for execution if it can be run, returns true if it was submitted
      * or if it can be discarded */
     private boolean trySubmitDeferredTask(CrawlTask task) {
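
The comparator's net effect: .edu domains sort to the front, everything else follows, and within each group the order is random and differs between runs. A standalone sketch of the same idea, using a hypothetical Spec record rather than the project's CrawlSpecRecord:

    record Spec(String domain) {}

    List<Spec> specs = new ArrayList<>(List.of(
            new Spec("example.com"), new Spec("cs.cmu.edu"), new Spec("blog.example.org")));

    Random r = new Random();
    Map<String, Integer> order = new HashMap<>();
    specs.forEach(s -> order.put(s.domain(), r.nextInt()));

    specs.sort(Comparator.comparing((Spec s) -> s.domain().contains(".edu"))
            .reversed()                                  // true (edu) sorts first
            .thenComparing(s -> order.get(s.domain()))); // random within each group
    // -> cs.cmu.edu first; the remaining order changes on every run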

View File

@@ -61,7 +61,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .cookieHandler(cookies)
                 .followRedirects(HttpClient.Redirect.NORMAL)
                 .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newCachedThreadPool())
+                .executor(Executors.newVirtualThreadPerTaskExecutor())
                 .build();
     }
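
Swapping newCachedThreadPool for newVirtualThreadPerTaskExecutor gives each in-flight request a cheap virtual thread instead of holding a pooled platform thread across blocking I/O. The change in isolation, as a sketch (assumes Java 21+, where virtual threads are a stable API):

    HttpClient client = HttpClient.newBuilder()
            .executor(Executors.newVirtualThreadPerTaskExecutor()) // one virtual thread per task
            .build();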

View File

@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;

 import java.time.Duration;
+import java.util.concurrent.ThreadLocalRandom;

 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
     public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;

+        long jitter = ThreadLocalRandom.current().nextLong(0, 150);
         try {
             if (sleepTime >= 1) {
                 if (spentTime > sleepTime)
                     return;
-                Thread.sleep(min(sleepTime - spentTime, 5000));
+                Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
             } else {
                 // When no crawl delay is specified, lean toward twice the fetch+process time,
                 // within sane limits. This means slower servers get slower crawling, and faster
@@ -71,17 +73,17 @@ public class CrawlDelayTimer {
                 if (spentTime > sleepTime)
                     return;

-                Thread.sleep(sleepTime - spentTime);
+                Thread.sleep(sleepTime - spentTime + jitter);
             }

             if (slowDown) {
                 // Additional delay when the server is signalling it wants slower requests
-                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
+                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
             }
         }
         catch (InterruptedException e) {
             Thread.currentThread().interrupt();
-            throw new RuntimeException();
+            throw new RuntimeException("Interrupted", e);
         }
     }
 }
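
The jitter is drawn once per waitFetchDelay call, so every sleep in the method is stretched by the same 0-149 ms offset. The idea in isolation, as a sketch (baseDelayMs is illustrative):

    // A small per-call random offset de-synchronizes workers that would
    // otherwise wake up and issue requests in lockstep
    long jitter = ThreadLocalRandom.current().nextLong(0, 150);
    Thread.sleep(baseDelayMs + jitter);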

View File

@@ -108,6 +108,11 @@ public class CrawlerRetreiver implements AutoCloseable {
             DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
             domainStateDb.save(summaryRecord);

+            if (Thread.interrupted()) {
+                // There's a small chance we're interrupted during the sniffing portion
+                throw new InterruptedException();
+            }
+
             // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
             if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
                 // If we have reference data, we will always grow the crawl depth a bit
@@ -140,7 +145,6 @@ public class CrawlerRetreiver implements AutoCloseable {
                                   CrawlDelayTimer delayTimer,
                                   DomainLinks domainLinks) {

         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
@@ -289,6 +293,10 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
+            if (Thread.interrupted()) {
+                Thread.currentThread().interrupt();
+                return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
+            }
         }
         finally {
             crawlFrontier.addVisited(rootUrl);
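
Worth noting: Thread.interrupted() clears the interrupt flag as it reads it, which is why the second hunk immediately re-raises the flag with Thread.currentThread().interrupt() before returning an error record. The pattern in isolation:

    if (Thread.interrupted()) {             // true at most once; clears the flag
        Thread.currentThread().interrupt(); // restore it so callers can still observe it
        return;                             // or: throw new InterruptedException();
    }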

View File

@@ -46,6 +46,10 @@ public class CrawlerRevisitor {
                     break;
                 }

+                if (Thread.interrupted()) {
+                    throw new InterruptedException();
+                }
+
                 var urlMaybe = EdgeUrl.parse(doc.url);
                 if (urlMaybe.isEmpty())
                     continue;

View File

@@ -38,7 +38,7 @@
                 <div class="space-y-2">
                     @for (SearchFilters.SearchOption option : filters.searchOptions())
                         <label class="flex items-center">
-                            <button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
+                            <button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
                                 @if (option.isSet())
                                     <input type="checkbox" checked class="sr-only" aria-checked="true" />
                                 @else