(crawler) Improve deferred task behavior

(search) Change icon for small web filter
2025-10-06 07:32:38 +02:00 · 2025-03-18 12:54:18 +01:00 · 2025-03-18 12:25:22 +01:00 · 2025-03-17 12:07:34 +01:00 · 2025-03-17 12:04:34 +01:00 · 2025-03-17 11:39:19 +01:00
6 changed files with 68 additions and 30 deletions
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -41,7 +41,10 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -248,44 +251,35 @@ public class CrawlerMain extends ProcessMainClass {
            // List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
            // merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
            // this will more aggressively attempt to schedule the jobs to avoid blocking
-            List<CrawlTask> deferredTasks = new LinkedList<>();
+            List<CrawlTask> taskList = new ArrayList<>();

-            // Create crawl tasks and submit them to the pool for execution
+            // Create crawl tasks
            for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
-                if (workLog.isJobFinished(crawlSpec.domain()))
+                if (workLog.isJobFinished(crawlSpec.domain))
                    continue;

-                // Add to the end of the deferral list
-                deferredTasks.addLast(new CrawlTask(
+                var task = new CrawlTask(
                        crawlSpec,
                        anchorTagsSource,
                        outputDir,
                        warcArchiver,
                        domainStateDb,
-                        workLog));
+                        workLog);

-                // Start every task we currently can from the deferral list
-                deferredTasks.removeIf(task -> {
-                    if (task.canRun()) {
-                        if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
-                            return true; // task has already run, duplicate in crawl specs
-                        }
-
-                        // This blocks the caller when the pool is full
-                        pool.submitQuietly(task);
-                        return true;
-                    }
-
-                    return false;
-                });
+                // Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
+                if (!trySubmitDeferredTask(task)) {
+                    // Otherwise add to the taskList for deferred execution
+                    taskList.add(task);
+                }
            }

-            // Schedule any lingering tasks for immediate execution
-            for (var task : deferredTasks) {
-                if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
-                    continue;
+             // Schedule viable tasks for execution until list is empty
+            while (!taskList.isEmpty()) {
+                taskList.removeIf(this::trySubmitDeferredTask);

-                pool.submitQuietly(task);
+                // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                // we might have no new viable tasks to run for hours on end
+                TimeUnit.MILLISECONDS.sleep(50);
            }

            logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -312,6 +306,28 @@ public class CrawlerMain extends ProcessMainClass {
        }
    }

+    /** Submit a task for execution if it can be run.
+     * <p>
+     * Returns true when the caller should drop the task from its deferral list: either it was
+     * handed to the pool, or a task for the same domain is already registered in
+     * pendingCrawlTasks (a duplicate in the crawl specs).
+     * <p>
+     * Returns false when the task should be kept and retried later: either it can not run yet,
+     * or submitting it to the pool failed.
+     */
+    private boolean trySubmitDeferredTask(CrawlTask task) {
+        if (!task.canRun()) {
+            return false;
+        }
+
+        if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
+            return true; // a task for this domain is already registered, duplicate in crawl specs
+        }
+
+        try {
+            // This blocks the caller when the pool is full
+            pool.submitQuietly(task);
+            return true;
+        }
+        catch (RuntimeException ex) {
+            // Undo the registration made above; otherwise the retry would trip the putIfAbsent
+            // check, be mistaken for a duplicate, and get discarded without ever having run.
+            // The two-argument remove only clears the entry if it is still this exact task.
+            pendingCrawlTasks.remove(task.domain, task);
+
+            logger.error("Failed to submit task " + task.domain, ex);
+            return false;
+        }
+    }
+
    public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
        runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
    }
@@ -377,6 +393,11 @@ public class CrawlerMain extends ProcessMainClass {
        @Override
        public void run() throws Exception {

+            if (workLog.isJobFinished(domain)) { // No-Op
+                logger.info("Omitting task {}, as it is already run", domain);
+                return;
+            }
+
            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
            Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
@@ -431,7 +452,7 @@ public class CrawlerMain extends ProcessMainClass {
                logger.error("Error fetching domain " + domain, e);
            }
            finally {
-                // We don't need to double-count these; it's also kept int he workLog
+                // We don't need to double-count these; it's also kept in the workLog
                pendingCrawlTasks.remove(domain);
                Thread.currentThread().setName("[idle]");

@@ -522,7 +543,7 @@ public class CrawlerMain extends ProcessMainClass {
    //
    // This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
    private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
-        if (!inputPath.endsWith(".parquet")) {
+        if (!inputPath.toString().endsWith(".parquet")) {
            return inputPath;
        }

--- a/code/services-application/search-service/java/nu/marginalia/search/model/SearchFilters.java
+++ b/code/services-application/search-service/java/nu/marginalia/search/model/SearchFilters.java
@@ -81,6 +81,7 @@ public class SearchFilters {
                            ),
                            List.of(
                                    new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
+                                    new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
                                    new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
                                    new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
                            ),
--- a/code/services-application/search-service/resources/jte/part/testenvHead.jte
+++ b/code/services-application/search-service/resources/jte/part/testenvHead.jte
@@ -9,6 +9,14 @@
                    nicotine: '#f8f8ee',
                    margeblue: '#3e5f6f',
                    liteblue: '#0066cc',
+                },
+                screens: {
+                    'coarsepointer': {
+                        'raw': '(pointer: coarse)'
+                    },
+                    'finepointer': {
+                        'raw': '(pointer: fine)'
+                    },
                }
            },
            screens: {
--- a/code/services-application/search-service/resources/jte/serp/main.jte
+++ b/code/services-application/search-service/resources/jte/serp/main.jte
@@ -23,7 +23,7 @@
                @template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
            </div>
            <div class="grow"></div>
-            <button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
+            <button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
                <i class="fas fa-filter mr-3"></i>
                Filters
            </button>
--- a/code/services-application/search-service/resources/jte/serp/part/sidebar.jte
+++ b/code/services-application/search-service/resources/jte/serp/part/sidebar.jte
@@ -3,7 +3,7 @@

@param SearchFilters filters

-<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
+<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
    <div class="space-y-6 sticky top-4">
        <div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
            <h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">
--- a/code/services-application/search-service/tailwind/tailwind.config.js
+++ b/code/services-application/search-service/tailwind/tailwind.config.js
@@ -9,6 +9,14 @@ module.exports = {
        nicotine: '#f8f8ee',
        margeblue: '#3e5f6f',
        liteblue: '#0066cc',
+      },
+      screens: {
+        'coarsepointer': {
+          'raw': '(pointer: coarse)'
+        },
+        'finepointer': {
+          'raw': '(pointer: fine)'
+        },
      }
    },
    screens: {
Author	SHA1	Message	Date
Viktor Lofgren	9f18ced73d	(crawler) Improve deferred task behavior	2025-03-18 12:54:18 +01:00
Viktor Lofgren	18e91269ab	(crawler) Improve deferred task behavior	2025-03-18 12:25:22 +01:00
Viktor Lofgren	e315ca5758	(search) Change icon for small web filter The previous icon was of an irregular size and shifted the layout in an unaesthetic way.	2025-03-17 12:07:34 +01:00
Viktor Lofgren	3ceea17c1d	(search) Adjustments to device detection in CSS Use pointer:fine media query to better distinguish between mobile devices and PCs with a window in portrait orientation. With this, we never show mobile filtering functionality on PCs; and never show the touch-inaccessible minimized sidebar on mobile.	2025-03-17 12:04:34 +01:00
Viktor Lofgren	b34527c1a3	(search) Add small web filter for new UI	2025-03-17 11:39:19 +01:00
Viktor Lofgren	185bf28fca	(crawler) Correct issue leading to parquet files not being correctly preconverted Path.endsWith("str") != String.endsWith(".str")	2025-03-10 13:48:12 +01:00