Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-10-06 07:32:38 +02:00

Compare commits: deploy-009 ... deploy-010 (12 commits)
Commits (SHA1):

- 9f18ced73d
- 18e91269ab
- e315ca5758
- 3ceea17c1d
- b34527c1a3
- 185bf28fca
- 78cc25584a
- 62ba30bacf
- 3bb84eb206
- be7d13ccce
- 8c088a7c0b
- ea9a642b9b

@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {
         while (nets.hasMoreElements()) {
             NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
             if (!netif.isUp() || netif.isLoopback()) {
                 continue;
             }

@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
             Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
             while (inetAddresses.hasMoreElements()) {
                 InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                 if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                     return addr.getHostAddress();
                 }

@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;
 
 public class MetricsServer {
 
-    private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
 
     @Inject
     public MetricsServer(ServiceConfiguration configuration) {

@@ -30,6 +30,8 @@ public class MetricsServer {
 
             context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
 
+            logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
+
             server.start();
         }
         catch (Exception|NoSuchMethodError ex) {

@@ -248,9 +248,14 @@ public class CrawlerMain extends ProcessMainClass {
             // (this happens when the process is restarted after a crash or a shutdown)
             tasksDone.set(workLog.countFinishedJobs());
 
-            // Create crawl tasks and submit them to the pool for execution
+            // List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
+            // merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semphore,
+            // this will more aggressively attempt to schedule the jobs to avoid blocking
+            List<CrawlTask> taskList = new ArrayList<>();
+
+            // Create crawl tasks
             for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
-                if (workLog.isJobFinished(crawlSpec.domain()))
+                if (workLog.isJobFinished(crawlSpec.domain))
                     continue;
 
                 var task = new CrawlTask(

@@ -261,11 +266,22 @@ public class CrawlerMain extends ProcessMainClass {
                         domainStateDb,
                         workLog);
 
-                if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
-                    pool.submitQuietly(task);
+                // Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
+                if (!trySubmitDeferredTask(task)) {
+                    // Otherwise add to the taskList for deferred execution
+                    taskList.add(task);
                 }
             }
 
+            // Schedule viable tasks for execution until list is empty
+            while (!taskList.isEmpty()) {
+                taskList.removeIf(this::trySubmitDeferredTask);
+
+                // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                // we might have no new viable tasks to run for hours on end
+                TimeUnit.MILLISECONDS.sleep(50);
+            }
+
             logger.info("Shutting down the pool, waiting for tasks to complete...");
 
             pool.shutDown();

@@ -290,6 +306,28 @@ public class CrawlerMain extends ProcessMainClass {
             }
         }
 
+    /** Submit a task for execution if it can be run, returns true if it was submitted
+     * or if it can be discarded */
+    private boolean trySubmitDeferredTask(CrawlTask task) {
+        if (!task.canRun()) {
+            return false;
+        }
+
+        if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
+            return true; // task has already run, duplicate in crawl specs
+        }
+
+        try {
+            // This blocks the caller when the pool is full
+            pool.submitQuietly(task);
+            return true;
+        }
+        catch (RuntimeException ex) {
+            logger.error("Failed to submit task " + task.domain, ex);
+            return false;
+        }
+    }
+
     public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
         runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
     }

@@ -346,9 +384,20 @@ public class CrawlerMain extends ProcessMainClass {
             this.id = Integer.toHexString(domain.hashCode());
         }
 
+        /** Best effort indicator whether we could start this now without getting stuck in
+         * DomainLocks purgatory */
+        public boolean canRun() {
+            return domainLocks.canLock(new EdgeDomain(domain));
+        }
+
         @Override
         public void run() throws Exception {
 
+            if (workLog.isJobFinished(domain)) { // No-Op
+                logger.info("Omitting task {}, as it is already run", domain);
+                return;
+            }
+
             Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
             Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
             Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);

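Taken together, the CrawlerMain changes above implement a defer-and-retry scheduling scheme: a task whose domain lock looks free is submitted right away, everything else is parked in taskList and retried until the list drains. The following is a minimal, self-contained sketch of that pattern under simplified assumptions (a plain ExecutorService instead of the project's pool and submitQuietly, one shared Semaphore standing in for DomainLocks, no pendingCrawlTasks de-duplication); it is not the actual CrawlerMain code.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

// Minimal model of the defer-and-retry scheduling shown above; names are simplified assumptions.
class DeferredSchedulingSketch {

    record Task(String domain, Semaphore domainLock) {
        // Mirrors CrawlTask.canRun(): best-effort check that the domain's lock looks free
        boolean canRun() {
            return domainLock.availablePermits() > 0;
        }
    }

    static boolean trySubmit(Task task, ExecutorService pool) {
        if (!task.canRun())
            return false; // defer: the domain lock looks contended right now

        pool.submit(() -> {
            try {
                task.domainLock().acquire(); // the real DomainLock still blocks here if needed
                System.out.println("Crawling " + task.domain());
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } finally {
                task.domainLock().release();
            }
        });
        return true;
    }

    public static void main(String[] args) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        Semaphore exampleLock = new Semaphore(2); // DomainLocks hands out two permits per top domain

        List<Task> specs = List.of(new Task("a.example.com", exampleLock),
                                   new Task("b.example.com", exampleLock),
                                   new Task("c.example.com", exampleLock));
        List<Task> deferred = new ArrayList<>();

        // First pass: submit what can run now, defer the rest
        for (Task task : specs) {
            if (!trySubmit(task, pool))
                deferred.add(task);
        }

        // Drain pass: keep retrying deferred tasks, pausing briefly to avoid busy-looping
        while (!deferred.isEmpty()) {
            deferred.removeIf(t -> trySubmit(t, pool));
            TimeUnit.MILLISECONDS.sleep(50);
        }

        pool.shutdown();
    }
}
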
@@ -403,7 +452,7 @@ public class CrawlerMain extends ProcessMainClass {
                 logger.error("Error fetching domain " + domain, e);
             }
             finally {
-                // We don't need to double-count these; it's also kept int he workLog
+                // We don't need to double-count these; it's also kept in the workLog
                 pendingCrawlTasks.remove(domain);
                 Thread.currentThread().setName("[idle]");
 

@@ -494,7 +543,7 @@ public class CrawlerMain extends ProcessMainClass {
         //
         // This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
         private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
-            if (!inputPath.endsWith(".parquet")) {
+            if (!inputPath.toString().endsWith(".parquet")) {
                 return inputPath;
             }
 

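Worth noting for the fix above: java.nio.file.Path.endsWith() compares whole path name elements rather than string suffixes, so inputPath.endsWith(".parquet") never matches a file such as crawl-data.parquet. Converting the path to a String first makes the extension check behave as intended.
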
@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         return new SitemapRetriever();
     }
 
+    /** Recursively fetch sitemaps */
     @Override
     public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
         try {

@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
             while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
                 var head = sitemapQueue.removeFirst();
 
-                switch (fetchSitemap(head)) {
+                switch (fetchSingleSitemap(head)) {
                     case SitemapResult.SitemapUrls(List<String> urls) -> {
 
                         for (var url : urls) {

@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     }
 
 
-    private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
         HttpRequest getRequest = HttpRequest.newBuilder()
                 .GET()
                 .uri(sitemapUrl.asURI())

@@ -44,6 +44,14 @@ public class DomainLocks {
         return new Semaphore(2);
     }
 
+    public boolean canLock(EdgeDomain domain) {
+        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
+        if (null == sem)
+            return true;
+        else
+            return sem.availablePermits() > 0;
+    }
+
     public static class DomainLock implements AutoCloseable {
         private final String domainName;
         private final Semaphore semaphore;

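As the new CrawlTask.canRun() javadoc puts it, this is only a best-effort indicator: availablePermits() can change between the check and the moment the crawl actually acquires its DomainLock, so a positive answer is treated as a scheduling hint rather than a guarantee.
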
@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
     {
 
         String fileName = fullPath.getFileName().toString();
-        if (fileName.endsWith(".parquet")) {
+
+        if (fileName.endsWith(".slop.zip")) {
             try {
-                return new ParquetSerializableCrawlDataStream(fullPath);
+                return new SlopSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();
             }
         }
 
-        if (fileName.endsWith(".slop.zip")) {
+        else if (fileName.endsWith(".parquet")) {
+            logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
             try {
-                return new SlopSerializableCrawlDataStream(fullPath);
+                return new ParquetSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();

@@ -81,6 +81,7 @@ public class SearchFilters {
                 ),
                 List.of(
                         new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
+                        new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
                         new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
                         new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
                 ),

@@ -9,6 +9,14 @@
         nicotine: '#f8f8ee',
         margeblue: '#3e5f6f',
         liteblue: '#0066cc',
+      },
+      screens: {
+        'coarsepointer': {
+          'raw': '(pointer: coarse)'
+        },
+        'finepointer': {
+          'raw': '(pointer: fine)'
+        },
       }
     },
     screens: {

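The raw-media-query screens added here (and in the second Tailwind config further down) let the templates key visibility off pointer capability rather than viewport width alone; the markup changes below use them, e.g. finepointer:hidden on the mobile filter button and finepointer:block on the filter sidebar.
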
@@ -23,7 +23,7 @@
         @template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
     </div>
     <div class="grow"></div>
-    <button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
+    <button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
         <i class="fas fa-filter mr-3"></i>
         Filters
     </button>

@@ -3,7 +3,7 @@
 
 @param SearchFilters filters
 
-<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
+<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
     <div class="space-y-6 sticky top-4">
         <div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
             <h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">

@@ -9,6 +9,14 @@ module.exports = {
         nicotine: '#f8f8ee',
         margeblue: '#3e5f6f',
         liteblue: '#0066cc',
+      },
+      screens: {
+        'coarsepointer': {
+          'raw': '(pointer: coarse)'
+        },
+        'finepointer': {
+          'raw': '(pointer: fine)'
+        },
       }
     },
     screens: {