1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

2 Commits

Author SHA1 Message Date
Viktor Lofgren
1c2426a052 (crawler) Further rearrange crawl order
Limit crawl order preference to edu domains, to avoid hitting stuff like medium and wordpress with shotgun requests.
2025-03-27 11:19:20 +01:00
Viktor Lofgren
34df7441ac (crawler) Add some jitter to crawl delay to avoid accidentally synchronized requests 2025-03-27 11:15:16 +01:00
2 changed files with 8 additions and 8 deletions

View File

@@ -302,8 +302,8 @@ public class CrawlerMain extends ProcessMainClass {
} }
/** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl, /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
* we want to enqueue domains that have common top domains first, but otherwise have a random * we want to enqueue domains that tend to be large and have common top domains first,
* order. * but otherwise have a random order.
* <p></p> * <p></p>
* Note, we can't use hash codes for randomization as it is not desirable to have the same order * Note, we can't use hash codes for randomization as it is not desirable to have the same order
* every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
@@ -311,15 +311,13 @@ public class CrawlerMain extends ProcessMainClass {
* */ * */
private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) { private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
Random r = new Random(); Random r = new Random();
Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
Map<String, Integer> randomOrder = new HashMap<>(records.size()); Map<String, Integer> randomOrder = new HashMap<>(records.size());
for (var spec : records) { for (var spec : records) {
topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
randomOrder.put(spec.domain, r.nextInt()); randomOrder.put(spec.domain, r.nextInt());
} }
return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8) return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
.reversed() .reversed()
.thenComparing(spec -> randomOrder.get(spec.domain)) .thenComparing(spec -> randomOrder.get(spec.domain))
.thenComparing(Record::hashCode); // non-deterministic tie-breaker to .thenComparing(Record::hashCode); // non-deterministic tie-breaker to

View File

@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import java.time.Duration; import java.time.Duration;
import java.util.concurrent.ThreadLocalRandom;
import static java.lang.Math.max; import static java.lang.Math.max;
import static java.lang.Math.min; import static java.lang.Math.min;
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
public void waitFetchDelay(long spentTime) { public void waitFetchDelay(long spentTime) {
long sleepTime = delayTime; long sleepTime = delayTime;
long jitter = ThreadLocalRandom.current().nextLong(0, 150);
try { try {
if (sleepTime >= 1) { if (sleepTime >= 1) {
if (spentTime > sleepTime) if (spentTime > sleepTime)
return; return;
Thread.sleep(min(sleepTime - spentTime, 5000)); Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
} else { } else {
// When no crawl delay is specified, lean toward twice the fetch+process time, // When no crawl delay is specified, lean toward twice the fetch+process time,
// within sane limits. This means slower servers get slower crawling, and faster // within sane limits. This means slower servers get slower crawling, and faster
@@ -71,12 +73,12 @@ public class CrawlDelayTimer {
if (spentTime > sleepTime) if (spentTime > sleepTime)
return; return;
Thread.sleep(sleepTime - spentTime); Thread.sleep(sleepTime - spentTime + jitter);
} }
if (slowDown) { if (slowDown) {
// Additional delay when the server is signalling it wants slower requests // Additional delay when the server is signalling it wants slower requests
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS); Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
} }
} }
catch (InterruptedException e) { catch (InterruptedException e) {