(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready

Change to a bounded queue and add a sleep to reduce the number of threads that are effectively busy-looping.
2025-10-06 07:32:38 +02:00 · 2025-04-21 00:39:26 +02:00 · 2025-04-21 00:36:48 +02:00
1 changed file with 9 additions and 3 deletions
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -42,8 +42,8 @@ import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.util.*;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -66,7 +66,7 @@ public class CrawlerMain extends ProcessMainClass {
    private final DomainLocks domainLocks = new DomainLocks();

    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
-    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
+    private final ArrayBlockingQueue<CrawlTask> retryQueue = new ArrayBlockingQueue<>(64);

    private final AtomicInteger tasksDone = new AtomicInteger(0);
    private final HttpFetcherImpl fetcher;
@@ -445,7 +445,13 @@ public class CrawlerMain extends ProcessMainClass {
            // We don't have a lock, so we can't run this task
            // we return to avoid blocking the pool for too long
            if (lock.isEmpty()) {
-                retryQueue.add(this);
+                if (retryQueue.remainingCapacity() > 0) {
+                    // Sleep a moment to avoid busy looping via the retry queue
+                    // in the case when few tasks remain
+                    Thread.sleep(10);
+                }
+
+                retryQueue.put(this);
                return;
            }
            DomainLocks.DomainLock domainLock = lock.get();
Author	SHA1	Message	Date
Viktor Lofgren	e84d5c497a	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.	2025-04-21 00:39:26 +02:00
Viktor Lofgren	2d2d3e2466	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.	2025-04-21 00:36:48 +02:00